Skip to content

Commit d609576

Browse files
madhav-madhusoodananLegNeato
authored andcommitted
feat: ported the AsyncAPI sample of CUDA examples
1 parent f53708a commit d609576

File tree

7 files changed

+202
-0
lines changed

7 files changed

+202
-0
lines changed

Cargo.lock

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ members = [
88

99
"xtask",
1010

11+
"examples/cuda/async_api",
12+
"examples/cuda/async_api/kernels",
1113
"examples/cuda/vecadd",
1214
"examples/cuda/vecadd/kernels",
1315
"examples/cuda/gemm",

examples/cuda/async_api/Cargo.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[package]
2+
name = "async_api"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cust = { path = "../../../crates/cust" }
8+
nanorand = "0.7"
9+
10+
[build-dependencies]
11+
cuda_builder = { workspace = true, default-features = false }

examples/cuda/async_api/build.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
use std::env;
2+
use std::path;
3+
4+
use cuda_builder::CudaBuilder;
5+
6+
fn main() {
7+
println!("cargo::rerun-if-changed=build.rs");
8+
println!("cargo::rerun-if-changed=kernels");
9+
10+
let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11+
let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
12+
13+
CudaBuilder::new(manifest_dir.join("kernels"))
14+
.copy_to(out_path.join("kernels.ptx"))
15+
.build()
16+
.unwrap();
17+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "async_api-kernels"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cuda_std = { path = "../../../../crates/cuda_std" }
8+
9+
[lib]
10+
crate-type = ["cdylib", "rlib"]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
use cuda_std::prelude::*;
2+
3+
#[kernel]
4+
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
5+
pub unsafe fn increment(g_data: *mut u32, inc_value: u32) {
6+
// This can also be obtained directly as
7+
//
8+
// let idx: usize = cuda_std::thread::index() as usize;
9+
let idx: usize = (
10+
cuda_std::thread::block_dim().x
11+
* cuda_std::thread::block_idx().x
12+
+ cuda_std::thread::thread_idx().x
13+
) as usize;
14+
15+
let elem: &mut u32 = unsafe { &mut *g_data.add(idx) };
16+
*elem = *elem + inc_value;
17+
}
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
2+
use cust::memory::{DeviceBuffer, LockedBuffer, AsyncCopyDestination};
3+
use cust::event::{Event, EventFlags};
4+
use cust::prelude::EventStatus;
5+
use cust::stream::{Stream, StreamFlags};
6+
use cust::module::Module;
7+
use cust::context::Context;
8+
use cust::{launch, CudaFlags};
9+
use cust::device::Device;
10+
use cust::function::{GridSize, BlockSize};
11+
use std::time::Instant;
12+
13+
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
14+
15+
fn correct_output(data: &[u32], x: u32) -> bool {
16+
let not_matching_element = data
17+
.iter()
18+
.enumerate()
19+
.find(|&(_, &elem)| elem != x);
20+
21+
match not_matching_element {
22+
Some((index, elem)) => println!("Error! data[{index}] = {elem}, ref = {x}"),
23+
None => println!("All elements of the array match the value!")
24+
}
25+
26+
not_matching_element.is_none()
27+
}
28+
29+
fn main() -> Result<(), cust::error::CudaError> {
30+
cust::init(CudaFlags::empty()).expect("Couldn't initialize CUDA environment!");
31+
32+
let device = Device::get_device(0)
33+
.expect("Couldn't find Cuda supported devices!");
34+
35+
println!("Device Name: {}", device.name().unwrap());
36+
37+
// Set up the context, load the module, and create a stream to run kernels in.
38+
let _ctx = Context::new(device);
39+
let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
40+
let increment = module.get_function("increment").expect("Kernel function not found!");
41+
let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
42+
43+
const N: usize = 16 * 1024 * 1024;
44+
const N_BYTES: usize = N * (i32::BITS as usize);
45+
let value = 26;
46+
47+
let blocks = BlockSize::xy(512, 1);
48+
let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1);
49+
50+
let start_event = Event::new(EventFlags::DEFAULT)?;
51+
let stop_event = Event::new(EventFlags::DEFAULT)?;
52+
53+
// Create buffers for data on host-side
54+
// Ideally must be page-locked for efficiency
55+
let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!");
56+
let mut device_a = DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!");
57+
58+
start_event.record(&stream).expect("Failed to record start_event in the CUDA stream!");
59+
let start = Instant::now();
60+
61+
// SAFETY: until the stop_event being triggered:
62+
// 1. `host_a` is not being modified
63+
// 2. Both `device_a` and `host_a` are not deallocated
64+
// 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation
65+
// other than those of the operations in the stream.
66+
unsafe {
67+
device_a.async_copy_from(&host_a, &stream).expect("Could not copy from host to device!");
68+
}
69+
70+
// SAFETY: number of threads * number of blocks = total number of elements.
71+
// Hence there will not be any out-of-bounds issues.
72+
unsafe {
73+
let result = launch!(increment<<<grids, blocks, 0, stream>>>(
74+
device_a.as_device_ptr(),
75+
value
76+
));
77+
result.expect("Result of `increment` kernel did not process!");
78+
}
79+
80+
// SAFETY: until the stop_event being triggered:
81+
// 1. `device_a` is not being modified
82+
// 2. Both `device_a` and `host_a` are not deallocated
83+
// 3. At this point, until `stop_query` yields `EventStatus::Ready`,
84+
// `host_a` is not involved in any other operation.
85+
unsafe {
86+
device_a.async_copy_to(&mut host_a, &stream).expect("Could not copy from device to host!");
87+
}
88+
89+
stop_event.record(&stream).expect("Failed to record stop_event in the CUDA stream!");
90+
let cpu_time: u128 = start.elapsed().as_micros();
91+
92+
let mut counter: u64 = 0;
93+
while stop_event.query() != Ok(EventStatus::Ready) { counter += 1 }
94+
95+
let gpu_time: u128 = stop_event
96+
.elapsed(&start_event)
97+
.expect("Failed to calculate duration of GPU operations!")
98+
.as_micros();
99+
100+
println!("Time spent executing by the GPU: {gpu_time} microseconds");
101+
println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds");
102+
println!("CPU executed {counter} iterations while waiting for GPU to finish.");
103+
104+
assert!(correct_output(host_a.as_slice(), value));
105+
106+
// Stream is synchronized as a safety measure
107+
stream.synchronize().expect("Stream couldn't synchronize!");
108+
109+
// Events and buffers can be safely dropped now
110+
match Event::drop(start_event) {
111+
Ok(()) => println!("Successfully destroyed start_event"),
112+
Err((cuda_error, _event)) => {
113+
println!("Failed to destroy start_event: {:?}", cuda_error);
114+
},
115+
}
116+
117+
match Event::drop(stop_event) {
118+
Ok(()) => println!("Successfully destroyed stop_event"),
119+
Err((cuda_error, _event)) => {
120+
println!("Failed to destroy stop_event: {:?}", cuda_error);
121+
},
122+
}
123+
124+
DeviceBuffer::drop(device_a).expect("Couldn't drop device array!");
125+
LockedBuffer::drop(host_a).expect("Couldn't drop host array!");
126+
127+
println!("test PASSED");
128+
Ok(())
129+
}

0 commit comments

Comments
 (0)