diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml index e1e76c9d..a5910cf4 100644 --- a/.github/workflows/ci_windows.yml +++ b/.github/workflows/ci_windows.yml @@ -67,6 +67,11 @@ jobs: - name: Add rustup components run: rustup component add rustfmt clippy + - name: Update PATH to expose CUDA codegen backend + shell: pwsh + run: | + echo "$env:CUDA_PATH\nvvm\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Verify CUDA, Rust installation run: | nvcc --version diff --git a/Cargo.lock b/Cargo.lock index 7cd5a8d8..1af88d55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -156,6 +156,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" +[[package]] +name = "async_api" +version = "0.1.0" +dependencies = [ + "cuda_builder", + "cust", + "nanorand", +] + +[[package]] +name = "async_api-kernels" +version = "0.1.0" +dependencies = [ + "cuda_std", +] + [[package]] name = "atty" version = "0.2.14" diff --git a/Cargo.toml b/Cargo.toml index 18931ef0..309132ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,9 @@ members = [ "examples/optix/*", "tests/compiletests", "tests/compiletests/deps-helper", + + "samples/introduction/async_api", + "samples/introduction/async_api/kernels", ] exclude = [ diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 00000000..73b39156 --- /dev/null +++ b/samples/README.md @@ -0,0 +1,5 @@ +# Rust-Cuda Samples + +These are the Rust-Cuda port of the samples from Nvidia's [cuda-samples](https://github.com/NVIDIA/cuda-samples/tree/master/Samples) repository. + +1. 
Chapter 0: [Introduction](https://github.com/Rust-GPU/rust-cuda/tree/main/samples/introduction) \ No newline at end of file diff --git a/samples/introduction/README.md b/samples/introduction/README.md new file mode 100644 index 00000000..3c980a66 --- /dev/null +++ b/samples/introduction/README.md @@ -0,0 +1,8 @@ +# Chapter 0: Introduction + +## [asyncAPI](https://github.com/Rust-GPU/rust-cuda/tree/main/samples/introduction/async_api) +This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations. + +1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations. +2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device). +3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU. 
diff --git a/samples/introduction/async_api/Cargo.toml b/samples/introduction/async_api/Cargo.toml new file mode 100644 index 00000000..b5a8cdfa --- /dev/null +++ b/samples/introduction/async_api/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "async_api" +version = "0.1.0" +edition = "2024" + +[dependencies] +cust = { path = "../../../crates/cust" } +nanorand = "0.7" + +[build-dependencies] +cuda_builder = { workspace = true, default-features = false } diff --git a/samples/introduction/async_api/build.rs b/samples/introduction/async_api/build.rs new file mode 100644 index 00000000..7f23bac1 --- /dev/null +++ b/samples/introduction/async_api/build.rs @@ -0,0 +1,17 @@ +use std::env; +use std::path; + +use cuda_builder::CudaBuilder; + +fn main() { + println!("cargo::rerun-if-changed=build.rs"); + println!("cargo::rerun-if-changed=kernels"); + + let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()); + let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + + CudaBuilder::new(manifest_dir.join("kernels")) + .copy_to(out_path.join("kernels.ptx")) + .build() + .unwrap(); +} diff --git a/samples/introduction/async_api/kernels/Cargo.toml b/samples/introduction/async_api/kernels/Cargo.toml new file mode 100644 index 00000000..7ad69be6 --- /dev/null +++ b/samples/introduction/async_api/kernels/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "async_api-kernels" +version = "0.1.0" +edition = "2024" + +[dependencies] +cuda_std = { path = "../../../../crates/cuda_std" } + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/samples/introduction/async_api/kernels/src/lib.rs b/samples/introduction/async_api/kernels/src/lib.rs new file mode 100644 index 00000000..ffcc07d3 --- /dev/null +++ b/samples/introduction/async_api/kernels/src/lib.rs @@ -0,0 +1,17 @@ +use cuda_std::prelude::*; + +#[kernel] +/// # Safety +/// +/// The user must ensure that the number of (threads * blocks * grids) +/// must not be greater than the number of elements in 
`g_data`. +pub unsafe fn increment(g_data: *mut u32, inc_value: u32) { + // This can also be obtained directly as + // + // let idx: usize = cuda_std::thread::index() as usize; + let idx: usize = (cuda_std::thread::block_dim().x * cuda_std::thread::block_idx().x + + cuda_std::thread::thread_idx().x) as usize; + + let elem: &mut u32 = unsafe { &mut *g_data.add(idx) }; + *elem += inc_value; +} diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs new file mode 100644 index 00000000..063efbd3 --- /dev/null +++ b/samples/introduction/async_api/src/main.rs @@ -0,0 +1,122 @@ +use cust::device::Device; +use cust::event::{Event, EventFlags}; +use cust::function::{BlockSize, GridSize}; +use cust::launch; +use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer}; +use cust::module::Module; +use cust::prelude::EventStatus; +use cust::stream::{Stream, StreamFlags}; +use std::time::Instant; + +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); + +fn correct_output(data: &[u32], x: u32) -> bool { + let not_matching_element = data.iter().enumerate().find(|&(_, &elem)| elem != x); + + match not_matching_element { + Some((index, elem)) => println!("Error! data[{index}] = {elem}, ref = {x}"), + None => println!("All elements of the array match the value!"), + } + + not_matching_element.is_none() +} + +fn main() -> Result<(), cust::error::CudaError> { + // Set up the context, load the module, and create a stream to run kernels in. 
+    let _ctx = cust::quick_init()?; +    let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!"); +    println!("Device Name: {}", device.name().unwrap()); + +    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!"); +    let increment = module +        .get_function("increment") +        .expect("Kernel function not found!"); +    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!"); + +    const N: usize = 16 * 1024 * 1024; +    let value = 26; + +    let blocks = BlockSize::xy(512, 1); +    let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1); + +    let start_event = Event::new(EventFlags::DEFAULT)?; +    let stop_event = Event::new(EventFlags::DEFAULT)?; + +    // Create buffers for data on host-side +    // Ideally should be page-locked for efficiency +    let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!"); +    let mut device_a = +        DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!"); + +    start_event +        .record(&stream) +        .expect("Failed to record start_event in the CUDA stream!"); +    let start = Instant::now(); + +    // # Safety +    // +    // Until the stop_event is triggered: +    // 1. `host_a` is not being modified +    // 2. Both `device_a` and `host_a` are not deallocated +    // 3. Until `stop_event.query()` yields `EventStatus::Ready`, `device_a` is not involved in any other operation +    // other than those of the operations in the stream. +    unsafe { +        device_a +            .async_copy_from(&host_a, &stream) +            .expect("Could not copy from host to device!"); +    } + +    // # Safety +    // +    // Number of threads * number of blocks = total number of elements. +    // Hence there will not be any out-of-bounds issues. +    unsafe { +        let result = launch!(increment<<<grids, blocks, 0, stream>>>( +            device_a.as_device_ptr(), +            value +        )); +        result.expect("Result of `increment` kernel did not process!"); +    } + +    // # Safety +    // +    // Until the stop_event is triggered: +    // 1. 
`device_a` is not being modified + // 2. Both `device_a` and `host_a` are not deallocated + // 3. At this point, until `stop_query` yields `EventStatus::Ready`, + // `host_a` is not involved in any other operation. + unsafe { + device_a + .async_copy_to(&mut host_a, &stream) + .expect("Could not copy from device to host!"); + } + + stop_event + .record(&stream) + .expect("Failed to record stop_event in the CUDA stream!"); + let cpu_time: u128 = start.elapsed().as_micros(); + + let mut counter: u64 = 0; + while stop_event.query() != Ok(EventStatus::Ready) { + counter += 1 + } + + let gpu_time: u128 = stop_event + .elapsed(&start_event) + .expect("Failed to calculate duration of GPU operations!") + .as_micros(); + + println!("Time spent executing by the GPU: {gpu_time} microseconds"); + println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds"); + println!("CPU executed {counter} iterations while waiting for GPU to finish."); + + assert!(correct_output(host_a.as_slice(), value)); + + // Stream is synchronized as a safety measure + stream.synchronize().expect("Stream couldn't synchronize!"); + + println!("test PASSED"); + Ok(()) + + // The events and the memory buffers are automatically dropped here. +}