
Commit 6d8dd58 (1 parent: 5d7ca04)

Adds multi-shmem region and affinity shmem support

22 files changed: +677 −582 lines changed

doc/src/development/Testing.md

Lines changed: 10 additions & 0 deletions
@@ -105,6 +105,16 @@ Benchmarks are named as such:
 * ```s10_*```: User-space applications benchmarks
 * ```s11_*```: Rackscale (distributed) benchmarks

+The ```s11_*``` benchmarks may be configured with two features:
+* ```baseline```: Runs NrOS configured similarly to rackscale, for comparison
+* ```affinity-shmem```: Runs the ```ivshmem-server``` using shmem with NUMA affinity.
+  This option requires preconfiguring hugetlbfs with
+  ```sudo hugeadm --create-global-mounts```,
+  having a kernel with 2 MB huge pages enabled, and then adding 1024 2 MB pages per
+  node with a command like:
+  ```echo <page-num> | sudo numactl -m <node-num> tee -a /proc/sys/vm/nr_hugepages_mempolicy```
+  The number of huge pages per node may be verified with ```numastat -m```.
+
 ## Network

 nrk has support for three network interfaces at the moment: virtio, e1000 and
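
As a concrete illustration of the ```affinity-shmem``` prerequisites above, the following sketch configures huge pages on a hypothetical two-node machine (the node IDs and page counts are placeholders; adjust them to your topology):

```bash
# Sketch only: node IDs and page counts are illustrative, not mandated here.
sudo hugeadm --create-global-mounts

# Reserve 1024 2 MB pages on each NUMA node.
for node in 0 1; do
    echo 1024 | sudo numactl -m ${node} tee -a /proc/sys/vm/nr_hugepages_mempolicy
done

# Verify the per-node huge page counts.
numastat -m | grep -i huge
```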

doc/src/environment/Environment.md

Lines changed: 10 additions & 32 deletions
@@ -15,47 +15,25 @@ already uncommented. Then, run the following commands:

 ```bash
 sudo apt update
-sudo apt install build-essential libpmem-dev libdaxctl-dev ninja-build
+sudo apt install build-essential libpmem-dev libdaxctl-dev ninja-build flex bison
 apt source qemu
 sudo apt build-dep qemu
+
+For non-rackscale mode, execute the following:
+```
 wget https://download.qemu.org/qemu-6.0.0.tar.xz
 tar xvJf qemu-6.0.0.tar.xz
 cd qemu-6.0.0
 ```

-If you are planning on running the rackscale NrOS build, you'll need to modify
-the ivshmem server code. Open ```contrib/ivshmem-server/ivshmem-server.c```.
-Go to the function ```ivshmem_server_ftruncate```. Replace it with:
-```c
-static int
-ivshmem_server_ftruncate(int fd, uint64_t shmsize)
-{
-    int ret;
-    struct stat mapstat;
-
-    /* align shmsize to next power of 2 */
-    shmsize = pow2ceil(shmsize);
-
-    if (fstat(fd, &mapstat) != -1 && mapstat.st_size == shmsize) {
-        return 0;
-    }
-
-    /*
-     * This is a do-while loop in case
-     * shmsize > IVSHMEM_SERVER_MAX_HUGEPAGE_SIZE
-     */
-    do {
-        ret = ftruncate64(fd, shmsize);
-        if (ret == 0) {
-            return ret;
-        }
-        shmsize *= 2;
-    } while (shmsize <= IVSHMEM_SERVER_MAX_HUGEPAGE_SIZE);
-
-    return -1;
-}
+For rackscale mode, instead run:
+```
+git clone https://github.com/hunhoffe/qemu.git qemu-6.0.0.tar.xz
+cd qemu-6.0.0
+git checkout --track origin/dev/ivshmem-numa
 ```

+For either option, build and install with:
 ```bash
 ./configure --enable-rdma --enable-libpmem
 make -j 28
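
For the rackscale configuration, the benchmarks also rely on the ```ivshmem-server``` binary built under ```contrib/ivshmem-server```. A minimal sketch of launching it, assuming the stock upstream flags (```-F``` foreground, ```-S``` socket path, ```-l``` region size, ```-n``` MSI-X vector count); the path, size, and vector count below are illustrative placeholders, not values from this commit:

```bash
# Sketch only: socket path, region size, and vector count are illustrative.
# run.py attaches each VM with vectors=3, hence -n 3 here.
./build/contrib/ivshmem-server/ivshmem-server \
    -F -S /tmp/ivshmem_socket -l 2M -n 3
```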

kernel/run.py

Lines changed: 14 additions & 9 deletions
@@ -38,7 +38,7 @@ def exception_handler(exception_type, exception, traceback):
          "-Z", "build-std-features=compiler-builtins-mem"]
 ARCH = "x86_64"

-IVSHMEM_DEVICE_ADDR = 6
+IVSHMEM_DEVICE_ADDR = 10

 def get_network_config(workers):
     """
@@ -137,10 +137,10 @@ def get_network_config(workers):
 parser.add_argument('--kgdb', action="store_true",
                     help="Use the GDB remote debugger to connect to the kernel")
 parser.add_argument('--qemu-ivshmem',
-                    type=int,
+                    type=str,
                     help="Enable the ivshmem device with the size in MiB.",
                     required=False,
-                    default=0)
+                    default="")
 parser.add_argument('--qemu-shmem-path',
                     type=str,
                     help="Provide shared memory file path.",
@@ -407,12 +407,17 @@ def run_qemu(args):
                          'isa-debug-exit,iobase=0xf4,iosize=0x04']

     if args.qemu_ivshmem:
-        # If you change the the device addr, you must change it in vibrio::rumprt::dev
-        qemu_default_args += ['-device', 'ivshmem-doorbell,vectors=3,chardev=id,addr={}'.format(IVSHMEM_DEVICE_ADDR)]
-        qemu_default_args += [
-            '-chardev',
-            'socket,path={},id=id'.format(args.qemu_shmem_path)
-        ]
+        sizes = [int(s.strip()) for s in args.qemu_ivshmem.split(",")]
+        names = [s.strip() for s in args.qemu_shmem_path.split(",")]
+        assert len(sizes) == len(names)
+
+        for i in range(len(sizes)):
+            # TODO: Only device IVSHMEM_DEVICE_ADDR will handle interrupts?
+            qemu_default_args += ['-device', 'ivshmem-doorbell,vectors=3,chardev=id{},addr={}'.format(i, IVSHMEM_DEVICE_ADDR + i)]
+            qemu_default_args += [
+                '-chardev',
+                'socket,path={},id=id{}'.format(names[i], i)
+            ]

     # Enable networking:
     mac, tap = NETWORK_CONFIG[args.tap]['mac'], args.tap
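
With this change, ```--qemu-ivshmem``` takes a comma-separated list of sizes (in MiB) and ```--qemu-shmem-path``` a matching comma-separated list of socket paths, one pair per ivshmem device. A hypothetical invocation (the sizes, paths, and any other flags are placeholders, not from this commit):

```bash
# Two ivshmem devices: the first lands at PCI addr 10 (IVSHMEM_DEVICE_ADDR),
# the second at addr 11; each size pairs with the socket path at the same index.
python3 kernel/run.py \
    --qemu-ivshmem "2,2" \
    --qemu-shmem-path "/tmp/ivshmem_socket0,/tmp/ivshmem_socket1"
```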

kernel/src/arch/x86_64/mod.rs

Lines changed: 6 additions & 3 deletions
@@ -307,6 +307,9 @@ fn _start(argc: isize, _argv: *const *const u8) -> isize {
     // form by klogger::init above, but now we do it for more ports)
     debug::init();

+    use crate::transport::shmem::SHMEM_INITIALIZED;
+    lazy_static::initialize(&SHMEM_INITIALIZED);
+
     // Parse memory map provided by UEFI, create an initial emergency memory
     // manager with a little bit of memory so we can do some early allocations.
     let (emanager, memory_regions) = memory::process_uefi_memory_regions();
@@ -396,8 +399,8 @@ fn _start(argc: isize, _argv: *const *const u8) -> isize {
     #[cfg(feature = "rackscale")]
     {
         {
-            use crate::transport::shmem::SHMEM_DEVICE;
-            lazy_static::initialize(&SHMEM_DEVICE);
+            use crate::transport::shmem::SHMEM;
+            lazy_static::initialize(&SHMEM);

             if crate::CMDLINE
                 .get()
@@ -412,7 +415,7 @@ fn _start(argc: isize, _argv: *const *const u8) -> isize {
             };

             // Setup to receive interrupts
-            SHMEM_DEVICE.enable_msix_vector(
+            SHMEM.devices[0].enable_msix_vector(
                 REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR as usize,
                 0,
                 REMOTE_TLB_WORK_PENDING_VECTOR,
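
The call sites in this commit (```SHMEM.devices[0]```, ```device.region```, ```device.id```, and ```SHMEM.get_interrupt_device()``` in the integration tests below) imply roughly the following shape for the new ```SHMEM``` state. This is a sketch inferred from usage, not the commit's actual definition in ```kernel/src/transport/shmem.rs```:

```rust
// Sketch inferred from call sites in this diff; the real definitions live in
// kernel/src/transport/shmem.rs and may differ in detail.
pub struct ShmemRegion {
    pub base: u64, // base address of the shared-memory region
    pub size: u64, // region size in bytes
}

pub struct ShmemDevice {
    pub id: u16,             // peer ID assigned by the ivshmem server
    pub region: ShmemRegion, // this device's shared-memory region
}

impl ShmemDevice {
    /// Route MSI-X table entry `msix_idx` to `irq_vector` on `core`.
    pub fn enable_msix_vector(&self, msix_idx: usize, core: usize, irq_vector: u8) {
        let _ = (msix_idx, core, irq_vector);
        unimplemented!("programs one MSI-X table entry")
    }
    /// Ring `vector` on the peer with ID `peer_id`.
    pub fn set_doorbell(&self, vector: u16, peer_id: u16) {
        let _ = (vector, peer_id);
        unimplemented!("writes the device's doorbell register")
    }
}

pub struct Shmem {
    pub devices: Vec<ShmemDevice>, // one entry per ivshmem PCI device
}

impl Shmem {
    /// Per the TODO in run.py, only the device at the first PCI address
    /// handles interrupts, hence `devices[0]` at the call sites.
    pub fn get_interrupt_device(&self) -> Option<&ShmemDevice> {
        self.devices.first()
    }
}
```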

kernel/src/arch/x86_64/process.rs

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ lazy_static! {
         let pcm = per_core_mem();
         pcm.set_mem_affinity(SHARED_AFFINITY).expect("Can't change affinity");
     } else {
-        // Get location of the logs from the controller, who will created them in shared memory
+        // Get location of the logs from the controller, who will have created them in shared memory
         use crate::arch::rackscale::get_shmem_structure::{rpc_get_shmem_structure, ShmemStructure};

         let mut log_ptrs = [0u64; MAX_PROCESSES];

kernel/src/arch/x86_64/rackscale/get_shmem_frames.rs

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ use crate::error::{KError, KResult};
 use crate::memory::backends::PhysicalPageProvider;
 use crate::memory::Frame;
 use crate::process::Pid;
-use crate::transport::shmem::{ShmemRegion, SHMEM_DEVICE};
+use crate::transport::shmem::ShmemRegion;

 use crate::memory::backends::AllocatorStatistics;

kernel/src/arch/x86_64/rackscale/processops/allocate_physical.rs

Lines changed: 1 addition & 8 deletions
@@ -20,7 +20,7 @@ use crate::memory::backends::PhysicalPageProvider;
 use crate::memory::{Frame, PAddr, BASE_PAGE_SIZE};
 use crate::nrproc::NrProcess;
 use crate::process::Pid;
-use crate::transport::shmem::{is_shmem_frame, ShmemRegion};
+use crate::transport::shmem::ShmemRegion;

 #[derive(Debug)]
 pub(crate) struct AllocatePhysicalReq {
@@ -65,10 +65,6 @@ pub(crate) fn rpc_allocate_physical(pid: Pid, size: u64, affinity: u64) -> KResu
         size,
     };
     let frame = shmem_region.get_frame(0);
-
-    // TODO(rackscale performance): should be debug assert
-    assert!(is_shmem_frame(frame, false, false));
-
     let fid = NrProcess::<Ring3Process>::allocate_frame_to_process(pid, frame)?;

     // Add frame mapping to client map
@@ -128,9 +124,6 @@ pub(crate) fn handle_allocate_physical(
             .expect("DCM should ensure we have a frame to allocate here.")
     };

-    // TODO(rackscale performance): should be debug assert
-    assert!(is_shmem_frame(frame, false, false));
-
     construct_ret(hdr, payload, Ok((dcm_node_id, frame.base.as_u64())));
     Ok(state)
 }

kernel/src/arch/x86_64/rackscale/syscalls.rs

Lines changed: 0 additions & 15 deletions
@@ -16,7 +16,6 @@ use crate::process::{KernArcBuffer, UserSlice};
 use crate::syscalls::{
     FsDispatch, ProcessDispatch, SystemCallDispatch, SystemDispatch, VSpaceDispatch,
 };
-use crate::transport::shmem::is_shmem_frame;

 use super::super::syscall::{Arch86SystemCall, Arch86SystemDispatch, Arch86VSpaceDispatch};
 use super::fileops::close::rpc_close;
@@ -81,9 +80,6 @@ impl VSpaceDispatch<u64> for Arch86LwkSystemCall {
                 .allocate_base_page()
                 .expect("We ensure there is capabity in the FrameCacheBase above");

-            // TODO(rackscale performance): should be debug assert
-            assert!(is_shmem_frame(frame, false, false));
-
             initial_base_frames
                 .try_push(frame)
                 .expect("Can't fail see `try_with_capacity`");
@@ -102,8 +98,6 @@ impl VSpaceDispatch<u64> for Arch86LwkSystemCall {
         let mut allocated_frames = rpc_get_shmem_frames(Some(pid), total_needed_large_pages)?;

         for i in 0..lp {
-            // TODO(rackscale performance): should be debug assert
-            assert!(is_shmem_frame(allocated_frames[i], false, false));
             total_len += allocated_frames[i].size;
             unsafe { allocated_frames[i].zero() };
             frames
@@ -116,18 +110,12 @@ impl VSpaceDispatch<u64> for Arch86LwkSystemCall {

         // Grow base pages
         if total_needed_base_pages > 0 {
-            // TODO(rackscale performance): should be debug assert
-            assert!(is_shmem_frame(allocated_frames[lp], false, false));
-
             let mut base_page_iter = allocated_frames[lp].into_iter();
             for _i in 0..total_needed_base_pages {
                 let mut frame = base_page_iter
                     .next()
                     .expect("needed base frames should all fit within one large frame");

-                // TODO(rackscale performance): should be debug assert
-                assert!(is_shmem_frame(frame, false, false));
-
                 total_len += frame.size;
                 unsafe { frame.zero() };
                 if paddr.is_none() {
@@ -151,9 +139,6 @@ impl VSpaceDispatch<u64> for Arch86LwkSystemCall {
                     .next()
                     .expect("needed base frames should all fit within one large frame");

-                // TODO(rackscale performance): should be debug assert
-                assert!(is_shmem_frame(frame, false, false));
-
                 per_process_bp_cache
                     .grow_base_pages(&[frame])
                     .expect("We ensure not to overfill the FrameCacheBase above.");

kernel/src/arch/x86_64/tlb.rs

Lines changed: 3 additions & 2 deletions
@@ -419,7 +419,7 @@ pub(crate) fn remote_shootdown(handles: Vec<TlbFlushHandle>) {
     use crate::arch::irq::REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR;
     use crate::arch::kcb::per_core_mem;
     use crate::memory::SHARED_AFFINITY;
-    use crate::transport::shmem::SHMEM_DEVICE;
+    use crate::transport::shmem::SHMEM;

     let my_mtid = kpi::system::mtid_from_gtid(*crate::environment::CORE_ID);
     let my_mid = kpi::system::mid_from_gtid(*crate::environment::CORE_ID);
@@ -468,7 +468,8 @@ pub(crate) fn remote_shootdown(handles: Vec<TlbFlushHandle>) {
                 i,
                 handles[i].core_map.is_empty()
             );
-            SHMEM_DEVICE.set_doorbell(REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR, i.try_into().unwrap());
+            SHMEM.devices[0]
+                .set_doorbell(REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR, i.try_into().unwrap());
         }
     }

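
For context on the doorbell write above: under the upstream QEMU ivshmem-doorbell specification (the device contract from ```docs/specs/ivshmem-spec.txt```, not code from this commit), ringing a peer means writing ```(peer_id << 16) | vector``` to the Doorbell register at byte offset 12 of BAR0, which raises that MSI-X vector on the peer. A minimal sketch:

```rust
// Doorbell write per the upstream ivshmem spec; a sketch, not this commit's
// implementation. BAR0 byte offset 12 is the write-only Doorbell register.
const DOORBELL_OFFSET: usize = 12;

/// `bar0` is the mapped base of the ivshmem register BAR. The high 16 bits
/// of the written value select the destination peer, the low 16 the vector.
unsafe fn ring_doorbell(bar0: *mut u8, peer_id: u16, vector: u16) {
    let doorbell = bar0.add(DOORBELL_OFFSET) as *mut u32;
    core::ptr::write_volatile(doorbell, ((peer_id as u32) << 16) | vector as u32);
}
```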

kernel/src/integration_tests.rs

Lines changed: 29 additions & 22 deletions
@@ -765,37 +765,41 @@ fn vmxnet_smoltcp() {
     shutdown(ExitReason::Ok);
 }

-/// Write and test the content on a shared-mem device.
-pub(crate) const BUFFER_CONTENT: u8 = 0xb;
-
 /// Test cxl device in the kernel.
 #[cfg(all(feature = "integration-test", target_arch = "x86_64"))]
 pub(crate) fn cxl_write() {
     use crate::memory::KERNEL_BASE;
-    use crate::transport::shmem::SHMEM_DEVICE;
+    use crate::transport::shmem::SHMEM;

-    lazy_static::initialize(&SHMEM_DEVICE);
+    lazy_static::initialize(&SHMEM);

-    for i in 0..SHMEM_DEVICE.region.size {
-        let region = (SHMEM_DEVICE.region.base + KERNEL_BASE + i as u64) as *mut u8;
-        unsafe { core::ptr::write(region, BUFFER_CONTENT) };
+    let mut buffer_content: u8 = 0x0;
+    for device in SHMEM.devices.iter() {
+        buffer_content += 1;
+        for i in 0..(device.region.size / 1024) {
+            let region = (device.region.base + KERNEL_BASE + (i * 1024) as u64) as *mut u8;
+            unsafe { core::ptr::write(region, buffer_content) };
+        }
     }
-
     shutdown(ExitReason::Ok);
 }

 /// Test cxl device in the kernel.
 #[cfg(all(feature = "integration-test", target_arch = "x86_64"))]
 pub(crate) fn cxl_read() {
     use crate::memory::KERNEL_BASE;
-    use crate::transport::shmem::SHMEM_DEVICE;
+    use crate::transport::shmem::SHMEM;

-    lazy_static::initialize(&SHMEM_DEVICE);
+    lazy_static::initialize(&SHMEM);

-    for i in 0..SHMEM_DEVICE.region.size {
-        let region = (SHMEM_DEVICE.region.base + KERNEL_BASE + i as u64) as *mut u8;
-        let read = unsafe { core::ptr::read(region) };
-        assert_eq!(read, BUFFER_CONTENT);
+    let mut buffer_content: u8 = 0x0;
+    for device in SHMEM.devices.iter() {
+        buffer_content += 1;
+        for i in 0..(device.region.size / 1024) {
+            let region = (device.region.base + KERNEL_BASE + (i * 1024) as u64) as *mut u8;
+            let read = unsafe { core::ptr::read(region) };
+            assert_eq!(read, buffer_content);
+        }
     }

     shutdown(ExitReason::Ok);
@@ -805,18 +809,21 @@ pub(crate) fn cxl_read() {
 #[cfg(all(feature = "integration-test", target_arch = "x86_64"))]
 pub(crate) fn shmem_interruptor() {
     use crate::arch::irq::REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR;
-    use crate::transport::shmem::SHMEM_DEVICE;
+    use crate::transport::shmem::SHMEM;

-    lazy_static::initialize(&SHMEM_DEVICE);
+    lazy_static::initialize(&SHMEM);
     {
         // The ivshmem server allocates IDs consecutively, so we'll assume interruptee is
         // current_id - 1
         log::info!(
             "Sending shmem interrupt to: {:?} on vector {:?}",
-            SHMEM_DEVICE.id - 1,
+            SHMEM.devices[0].id - 1,
             1
         );
-        SHMEM_DEVICE.set_doorbell(REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR, SHMEM_DEVICE.id - 1);
+        SHMEM.devices[0].set_doorbell(
+            REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR,
+            SHMEM.devices[0].id - 1,
+        );
     }
     shutdown(ExitReason::Ok);
 }
@@ -828,11 +835,11 @@ pub(crate) fn shmem_interruptee() {
     use core::time::Duration;

     use crate::arch::irq::{REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR, REMOTE_TLB_WORK_PENDING_VECTOR};
-    use crate::transport::shmem::SHMEM_DEVICE;
+    use crate::transport::shmem::SHMEM;

-    lazy_static::initialize(&SHMEM_DEVICE);
+    lazy_static::initialize(&SHMEM);

-    SHMEM_DEVICE.enable_msix_vector(
+    SHMEM.get_interrupt_device().unwrap().enable_msix_vector(
         REMOTE_TLB_WORK_PENDING_SHMEM_VECTOR as usize,
         0,
         REMOTE_TLB_WORK_PENDING_VECTOR,
