From ee96fb3c80b47567d2d9073e35063662bc6fcf04 Mon Sep 17 00:00:00 2001
From: Brad Larson
Date: Sun, 16 Nov 2025 15:50:01 -0600
Subject: [PATCH] Make Int <-> UInt conversions explicit.

Co-authored-by: raju
---
 problems/p03/p03.mojo                  |  4 +--
 problems/p04/p04.mojo                  |  4 +--
 problems/p04/p04_layout_tensor.mojo    |  4 +--
 problems/p05/p05.mojo                  |  4 +--
 problems/p05/p05_layout_tensor.mojo    |  4 +--
 problems/p06/p06.mojo                  |  4 +--
 problems/p07/p07.mojo                  |  4 +--
 problems/p07/p07_layout_tensor.mojo    |  4 +--
 problems/p08/p08.mojo                  |  4 +--
 problems/p08/p08_layout_tensor.mojo    |  4 +--
 problems/p10/p10.mojo                  |  8 +++---
 problems/p11/p11.mojo                  |  4 +--
 problems/p11/p11_layout_tensor.mojo    |  4 +--
 problems/p12/p12.mojo                  |  4 +--
 problems/p12/p12_layout_tensor.mojo    |  4 +--
 problems/p13/p13.mojo                  |  6 ++--
 problems/p14/p14.mojo                  | 12 ++++----
 problems/p15/p15.mojo                  |  4 +--
 problems/p16/p16.mojo                  | 12 ++++----
 problems/p17/op/conv1d.mojo            |  4 +--
 problems/p19/op/attention.mojo         | 16 +++++------
 problems/p21/op/embedding.mojo         |  6 ++--
 problems/p22/op/layernorm_linear.mojo  | 38 +++++++++++++-------------
 problems/p24/p24.mojo                  |  6 ++--
 problems/p25/p25.mojo                  | 20 +++++++-------
 problems/p26/p26.mojo                  | 10 +++----
 problems/p27/p27.mojo                  | 18 ++++++------
 problems/p29/p29.mojo                  |  8 +++---
 problems/p33/p33.mojo                  | 20 ++++++++------
 problems/p34/p34.mojo                  | 10 +++----
 solutions/p03/p03.mojo                 |  4 +--
 solutions/p04/p04.mojo                 |  4 +--
 solutions/p04/p04_layout_tensor.mojo   |  4 +--
 solutions/p05/p05.mojo                 |  4 +--
 solutions/p05/p05_layout_tensor.mojo   |  4 +--
 solutions/p06/p06.mojo                 |  4 +--
 solutions/p07/p07.mojo                 |  4 +--
 solutions/p07/p07_layout_tensor.mojo   |  4 +--
 solutions/p08/p08.mojo                 |  4 +--
 solutions/p08/p08_layout_tensor.mojo   |  4 +--
 solutions/p10/p10.mojo                 |  8 +++---
 solutions/p11/p11.mojo                 |  6 ++--
 solutions/p11/p11_layout_tensor.mojo   |  6 ++--
 solutions/p12/p12.mojo                 |  6 ++--
 solutions/p12/p12_layout_tensor.mojo   |  6 ++--
 solutions/p13/p13.mojo                 |  6 ++--
 solutions/p14/p14.mojo                 | 16 +++++------
 solutions/p15/p15.mojo                 |  6 ++--
 solutions/p16/p16.mojo                 | 24 ++++++++--------
 solutions/p17/op/conv1d.mojo           |  4 +--
 solutions/p18/op/softmax.mojo          |  2 +-
 solutions/p19/op/attention.mojo        | 28 +++++++++----------
 solutions/p21/op/embedding.mojo        |  6 ++--
 solutions/p22/op/layernorm_linear.mojo | 38 +++++++++++++-------------
 solutions/p24/p24.mojo                 |  6 ++--
 solutions/p25/p25.mojo                 | 26 +++++++++---------
 solutions/p26/p26.mojo                 | 10 +++----
 solutions/p27/p27.mojo                 | 12 ++++----
 solutions/p28/p28.mojo                 |  6 ++--
 solutions/p29/p29.mojo                 |  8 +++---
 solutions/p33/p33.mojo                 | 20 ++++++++------
 solutions/p34/p34.mojo                 | 10 +++----
 62 files changed, 279 insertions(+), 275 deletions(-)

diff --git a/problems/p03/p03.mojo b/problems/p03/p03.mojo
index f388faf4..0c99fddf 100644
--- a/problems/p03/p03.mojo
+++ b/problems/p03/p03.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_guard(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     i = thread_idx.x
     # FILL ME IN (roughly 2 lines)
@@ -35,7 +35,7 @@ def main():
         ctx.enqueue_function_checked[add_10_guard, add_10_guard](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p04/p04.mojo b/problems/p04/p04.mojo
index 6e11fe1c..156868d6 100644
--- a/problems/p04/p04.mojo
+++ b/problems/p04/p04.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_2d(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -41,7 +41,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p04/p04_layout_tensor.mojo b/problems/p04/p04_layout_tensor.mojo
index da393f0e..c5698daa 100644
--- a/problems/p04/p04_layout_tensor.mojo
+++ b/problems/p04/p04_layout_tensor.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn add_10_2d(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -46,7 +46,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p05/p05.mojo b/problems/p05/p05.mojo
index 2d9079d4..70dd0204 100644
--- a/problems/p05/p05.mojo
+++ b/problems/p05/p05.mojo
@@ -14,7 +14,7 @@ fn broadcast_add(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     b: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -45,7 +45,7 @@ def main():
             out,
             a,
             b,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p05/p05_layout_tensor.mojo b/problems/p05/p05_layout_tensor.mojo
index a0996217..9da93208 100644
--- a/problems/p05/p05_layout_tensor.mojo
+++ b/problems/p05/p05_layout_tensor.mojo
@@ -21,7 +21,7 @@ fn broadcast_add[
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, b_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -63,7 +63,7 @@ def main():
             out_tensor,
             a_tensor,
             b_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p06/p06.mojo b/problems/p06/p06.mojo
index 6992f420..68ee774d 100644
--- a/problems/p06/p06.mojo
+++ b/problems/p06/p06.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     i = block_dim.x * block_idx.x + thread_idx.x
     # FILL ME IN (roughly 2 lines)
@@ -35,7 +35,7 @@ def main():
         ctx.enqueue_function_checked[add_10_blocks, add_10_blocks](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p07/p07.mojo b/problems/p07/p07.mojo
index 225dcace..71174c38 100644
--- a/problems/p07/p07.mojo
+++ b/problems/p07/p07.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks_2d(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -42,7 +42,7 @@ def main():
         ctx.enqueue_function_checked[add_10_blocks_2d, add_10_blocks_2d](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p07/p07_layout_tensor.mojo b/problems/p07/p07_layout_tensor.mojo
index 4c7baf41..973ebb92 100644
--- a/problems/p07/p07_layout_tensor.mojo
+++ b/problems/p07/p07_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn add_10_blocks_2d[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -53,7 +53,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p08/p08.mojo b/problems/p08/p08.mojo
index e7bf09b3..935d2c8b 100644
--- a/problems/p08/p08.mojo
+++ b/problems/p08/p08.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn add_10_shared(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -48,7 +48,7 @@ def main():
         ctx.enqueue_function_checked[add_10_shared, add_10_shared](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p08/p08_layout_tensor.mojo b/problems/p08/p08_layout_tensor.mojo
index 498ac8e2..af416c97 100644
--- a/problems/p08/p08_layout_tensor.mojo
+++ b/problems/p08/p08_layout_tensor.mojo
@@ -19,7 +19,7 @@ fn add_10_shared_layout_tensor[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using LayoutTensor with explicit address_space
     shared = LayoutTensor[
@@ -57,7 +57,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p10/p10.mojo b/problems/p10/p10.mojo
index 6109c2ba..bd456efc 100644
--- a/problems/p10/p10.mojo
+++ b/problems/p10/p10.mojo
@@ -17,7 +17,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn shared_memory_race(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -45,7 +45,7 @@ fn shared_memory_race(
 fn add_10_2d(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -94,7 +94,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
@@ -127,7 +127,7 @@ def main():
         ](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p11/p11.mojo b/problems/p11/p11.mojo
index ebc94c6e..22fc7157 100644
--- a/problems/p11/p11.mojo
+++ b/problems/p11/p11.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn pooling(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -44,7 +44,7 @@ def main():
         ctx.enqueue_function_checked[pooling, pooling](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p11/p11_layout_tensor.mojo b/problems/p11/p11_layout_tensor.mojo
index c2cfe3f1..de87752b 100644
--- a/problems/p11/p11_layout_tensor.mojo
+++ b/problems/p11/p11_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn pooling[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
@@ -53,7 +53,7 @@ def main():
         ctx.enqueue_function_checked[pooling[layout], pooling[layout]](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p12/p12.mojo b/problems/p12/p12.mojo
index 9c6a1afe..65b3c9fe 100644
--- a/problems/p12/p12.mojo
+++ b/problems/p12/p12.mojo
@@ -17,7 +17,7 @@ fn dot_product(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     b: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
@@ -43,7 +43,7 @@ def main():
             out,
             a,
             b,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p12/p12_layout_tensor.mojo b/problems/p12/p12_layout_tensor.mojo
index e5ad9bd3..f8ddea21 100644
--- a/problems/p12/p12_layout_tensor.mojo
+++ b/problems/p12/p12_layout_tensor.mojo
@@ -22,7 +22,7 @@ fn dot_product[
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
@@ -54,7 +54,7 @@ def main():
             out_tensor,
             a_tensor,
             b_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p13/p13.mojo b/problems/p13/p13.mojo
index bc541cee..9b227865 100644
--- a/problems/p13/p13.mojo
+++ b/problems/p13/p13.mojo
@@ -25,7 +25,7 @@ fn conv_1d_simple[
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    local_i = Int(thread_idx.x)
     # FILL ME IN (roughly 14 lines)


@@ -48,8 +48,8 @@ fn conv_1d_block_boundary[
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
     # FILL ME IN (roughly 18 lines)


diff --git a/problems/p14/p14.mojo b/problems/p14/p14.mojo
index d1e4b9cc..7b78e849 100644
--- a/problems/p14/p14.mojo
+++ b/problems/p14/p14.mojo
@@ -20,7 +20,7 @@ fn prefix_sum_simple[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -44,7 +44,7 @@ fn prefix_sum_local_phase[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -54,7 +54,7 @@ fn prefix_sum_local_phase[
 # Kernel 2: Add block sums to their respective blocks
 fn prefix_sum_block_sum_phase[
     layout: Layout
-](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: Int):
+](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: UInt):
     global_i = block_dim.x * block_idx.x + thread_idx.x

     # FILL ME IN (roughly 3 lines)
@@ -98,7 +98,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            size,
+            UInt(size),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
@@ -114,7 +114,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            size,
+            UInt(size),
             grid_dim=BLOCKS_PER_GRID_2,
             block_dim=THREADS_PER_BLOCK_2,
         )
@@ -128,7 +128,7 @@ def main():
         alias kernel2 = prefix_sum_block_sum_phase[extended_layout]
         ctx.enqueue_function_checked[kernel2, kernel2](
             out_tensor,
-            size,
+            UInt(size),
             grid_dim=BLOCKS_PER_GRID_2,
             block_dim=THREADS_PER_BLOCK_2,
         )
diff --git a/problems/p15/p15.mojo b/problems/p15/p15.mojo
index fad73211..1e4fb6f2 100644
--- a/problems/p15/p15.mojo
+++ b/problems/p15/p15.mojo
@@ -23,7 +23,7 @@ fn axis_sum[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -52,7 +52,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             inp_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p16/p16.mojo b/problems/p16/p16.mojo
index d04c577c..8af616e5 100644
--- a/problems/p16/p16.mojo
+++ b/problems/p16/p16.mojo
@@ -17,7 +17,7 @@ alias layout = Layout.row_major(SIZE, SIZE)


 fn naive_matmul[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
@@ -33,7 +33,7 @@

 # ANCHOR: single_block_matmul
 fn single_block_matmul[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
@@ -56,7 +56,7 @@ alias layout_tiled = Layout.row_major(SIZE_TILED, SIZE_TILED)


 fn matmul_tiled[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[dtype, layout_tiled, MutAnyOrigin],
     a: LayoutTensor[dtype, layout_tiled, ImmutAnyOrigin],
@@ -114,7 +114,7 @@ def main():
         b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp2)

         if argv()[1] == "--naive":
-            alias kernel = naive_matmul[layout, SIZE]
+            alias kernel = naive_matmul[layout, UInt(SIZE)]
             ctx.enqueue_function_checked[kernel, kernel](
                 out_tensor,
                 a_tensor,
@@ -123,7 +123,7 @@ def main():
                 block_dim=THREADS_PER_BLOCK,
             )
         elif argv()[1] == "--single-block":
-            alias kernel = single_block_matmul[layout, SIZE]
+            alias kernel = single_block_matmul[layout, UInt(SIZE)]
             ctx.enqueue_function_checked[kernel, kernel](
                 out_tensor,
                 a_tensor,
@@ -143,7 +143,7 @@ def main():
                 inp2
             )

-            alias kernel = matmul_tiled[layout_tiled, SIZE_TILED]
+            alias kernel = matmul_tiled[layout_tiled, UInt(SIZE_TILED)]
             ctx.enqueue_function_checked[kernel, kernel](
                 out_tensor_tiled,
                 a_tensor_tiled,
diff --git a/problems/p17/op/conv1d.mojo b/problems/p17/op/conv1d.mojo
index 05d4d248..c0ca5ad3 100644
--- a/problems/p17/op/conv1d.mojo
+++ b/problems/p17/op/conv1d.mojo
@@ -20,8 +20,8 @@ fn conv1d_kernel[
     input: LayoutTensor[dtype, in_layout, MutAnyOrigin],
     kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
     # first: need to account for padding
     shared_a = LayoutTensor[
         dtype,
diff --git a/problems/p19/op/attention.mojo b/problems/p19/op/attention.mojo
index eddb45d1..1299d253 100644
--- a/problems/p19/op/attention.mojo
+++ b/problems/p19/op/attention.mojo
@@ -42,14 +42,14 @@ fn matmul_idiomatic_tiled[
     b: LayoutTensor[mut=False, dtype, b_layout, MutAnyOrigin],
 ):
     """Updated idiomatic tiled matrix multiplication from p16."""
-    local_row = thread_idx.y
-    local_col = thread_idx.x
-    tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row
-    tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col
+    local_row = Int(thread_idx.y)
+    local_col = Int(thread_idx.x)
+    tiled_row = Int(block_idx.y) * MATMUL_BLOCK_DIM_XY + local_row
+    tiled_col = Int(block_idx.x) * MATMUL_BLOCK_DIM_XY + local_col

     # Get the tile of the output matrix that this thread block is responsible for
     out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-        block_idx.y, block_idx.x
+        Int(block_idx.y), Int(block_idx.x)
     )
     a_shared = LayoutTensor[
         dtype,
@@ -76,10 +76,10 @@ fn matmul_idiomatic_tiled[
     for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY):
         # Get tiles from A and B matrices
         a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            block_idx.y, idx
+            Int(block_idx.y), idx
         )
         b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            idx, block_idx.x
+            idx, Int(block_idx.x)
         )

         # Asynchronously copy tiles to shared memory with consistent orientation
@@ -155,7 +155,7 @@ fn softmax_gpu_kernel[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = thread_idx.x
+    global_i = Int(thread_idx.x)

     # Initialize out-of-bounds (shared_max[local_i], global_i >= input_size) shared memory addresses to the minimum
     # finite value for dtype, ensuring that if these elements are accessed in the parallel max reduction below they
diff --git a/problems/p21/op/embedding.mojo b/problems/p21/op/embedding.mojo
index 18f205e4..c25249ae 100644
--- a/problems/p21/op/embedding.mojo
+++ b/problems/p21/op/embedding.mojo
@@ -33,7 +33,7 @@ fn embedding_kernel_coalesced[
     """

     # Simple 1D indexing - each thread = one output element
-    global_idx = block_idx.x * block_dim.x + thread_idx.x
+    global_idx = Int(block_idx.x * block_dim.x + thread_idx.x)
     total_elements = batch_size * seq_len * embed_dim

     if global_idx >= total_elements:
@@ -77,8 +77,8 @@ fn embedding_kernel_2d[
     """

     # 2D grid indexing
-    batch_seq_idx = block_idx.x * block_dim.x + thread_idx.x
-    embed_idx = block_idx.y * block_dim.y + thread_idx.y
+    batch_seq_idx = Int(block_idx.x * block_dim.x + thread_idx.x)
+    embed_idx = Int(block_idx.y * block_dim.y + thread_idx.y)
     total_positions = batch_size * seq_len

     if batch_seq_idx >= total_positions or embed_idx >= embed_dim:
diff --git a/problems/p22/op/layernorm_linear.mojo b/problems/p22/op/layernorm_linear.mojo
index 2f3b4019..7792b60b 100644
--- a/problems/p22/op/layernorm_linear.mojo
+++ b/problems/p22/op/layernorm_linear.mojo
@@ -35,12 +35,12 @@ fn matmul_idiomatic_tiled[
     """Idiomatic tiled matrix multiplication from p19."""
     local_row = thread_idx.y
     local_col = thread_idx.x
-    tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row
-    tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col
+    tiled_row = Int(block_idx.y * MATMUL_BLOCK_DIM_XY + local_row)
+    tiled_col = Int(block_idx.x * MATMUL_BLOCK_DIM_XY + local_col)

     # Get the tile of the output matrix that this thread block is responsible for
     out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-        block_idx.y, block_idx.x
+        Int(block_idx.y), Int(block_idx.x)
     )
     a_shared = LayoutTensor[
         dtype,
@@ -67,10 +67,10 @@ fn matmul_idiomatic_tiled[
     for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY):
         # Get tiles from A and B matrices
         a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            block_idx.y, idx
+            Int(block_idx.y), idx
         )
         b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            idx, block_idx.x
+            idx, Int(block_idx.x)
         )

         # Asynchronously copy tiles to shared memory with consistent orientation
@@ -124,9 +124,9 @@ fn layernorm_kernel[
     ln_weight: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin],
     ln_bias: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin],
 ):
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
-    hidden_idx = thread_idx.x
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)
+    hidden_idx = Int(thread_idx.x)

     if (
         batch_idx >= batch_size
@@ -149,8 +149,8 @@ fn layernorm_kernel[
 fn transpose_kernel[
     layout_in: Layout,
     layout_out: Layout,
-    rows: Int,
-    cols: Int,
+    rows: UInt,
+    cols: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[dtype, layout_out, MutAnyOrigin],
@@ -203,9 +203,9 @@ fn add_bias_kernel[
     bias: LayoutTensor[dtype, bias_layout, ImmutAnyOrigin],
 ):
     """Simple bias addition."""
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
-    out_idx = thread_idx.x
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)
+    out_idx = Int(thread_idx.x)

     if batch_idx >= batch_size or seq_idx >= seq_len or out_idx >= output_dim:
         return
@@ -241,8 +241,8 @@ fn minimal_fused_kernel[
     """
     # Grid: (batch_size, seq_len) - one thread block per sequence position
     # Block: (1,) - single thread per sequence position to avoid redundant computation
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)

     if batch_idx >= batch_size or seq_idx >= seq_len:
         return
@@ -290,8 +290,8 @@ fn minimal_fused_kernel_backward[
     """
     # Grid: (batch_size, seq_len) - one thread per sequence position
     # Block: (1,) - single thread per sequence position
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)

     if batch_idx >= batch_size or seq_idx >= seq_len:
         return
@@ -464,8 +464,8 @@ struct LayerNormLinearCustomOp:
         alias kernel2 = transpose_kernel[
             weight_layout,
             transposed_weight_tensor.layout,
-            output_dim,
-            hidden_dim,
+            UInt(output_dim),
+            UInt(hidden_dim),
         ]
         gpu_ctx.enqueue_function_checked[kernel2, kernel2](
             transposed_weight_tensor,
diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index c48c6b51..2339aa2b 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -47,8 +47,8 @@ fn traditional_dot_product_p12_style[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     if global_i < size:
         shared[local_i] = (a[global_i] * b[global_i]).reduce_add()
@@ -79,7 +79,7 @@ fn simple_warp_dot_product[
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL IN (6 lines at most)

diff --git a/problems/p25/p25.mojo b/problems/p25/p25.mojo
index 110e408a..76c62ad9 100644
--- a/problems/p25/p25.mojo
+++ b/problems/p25/p25.mojo
@@ -24,8 +24,8 @@ fn neighbor_difference[
     Uses shuffle_down(val, 1) to get the next neighbor's value.
     Works across multiple blocks, each processing one warp worth of data.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     # FILL IN (roughly 7 lines)

@@ -50,8 +50,8 @@ fn moving_average_3[
     Uses shuffle_down with offsets 1 and 2 to access neighbors.
     Works within warp boundaries across multiple blocks.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     # FILL IN (roughly 10 lines)

@@ -71,8 +71,8 @@ fn broadcast_shuffle_coordination[
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
     Each lane uses shuffle_down() for neighbor access and applies broadcast factor.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     if global_i < size:
         var scale_factor: output.element_type = 0.0
@@ -93,8 +93,8 @@ fn basic_broadcast[
     Basic broadcast: Lane 0 computes a block-local value, broadcasts it to all lanes.
     Each lane then uses this broadcast value in its own computation.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     if global_i < size:
         var broadcast_value: output.element_type = 0.0
@@ -115,8 +115,8 @@ fn conditional_broadcast[
     Conditional broadcast: Lane 0 makes a decision based on block-local data, broadcasts it to all lanes.
     All lanes apply different logic based on the broadcast decision.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     if global_i < size:
         var decision_value: output.element_type = 0.0
diff --git a/problems/p26/p26.mojo b/problems/p26/p26.mojo
index 87c9f408..6eacef9d 100644
--- a/problems/p26/p26.mojo
+++ b/problems/p26/p26.mojo
@@ -25,7 +25,7 @@ fn butterfly_pair_swap[
     Uses shuffle_xor(val, 1) to swap values within each pair.
     This is the foundation of butterfly network communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL ME IN (4 lines)

@@ -47,7 +47,7 @@ fn butterfly_parallel_max[
     This implements an efficient O(log n) parallel reduction algorithm
     that works for any WARP_SIZE (32, 64, etc.).
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL ME IN (roughly 7 lines)

@@ -73,7 +73,7 @@ fn butterfly_conditional_max[
     in even-numbered lanes. Odd-numbered lanes store the minimum value seen.
     Demonstrates conditional logic combined with butterfly communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     lane = lane_id()

     if global_i < size:
@@ -113,7 +113,7 @@ fn warp_inclusive_prefix_sum[
     NOTE: This implementation only works correctly within a single warp (WARP_SIZE threads).
     For multi-warp scenarios, additional coordination would be needed.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL ME IN (roughly 4 lines)

@@ -145,7 +145,7 @@ fn warp_partition[
     Input:  [3, 7, 1, 8, 2, 9, 4, 6]
     Result: [3, 1, 2, 4, 7, 8, 9, 6]  (< pivot | >= pivot).
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     if global_i < size:
         current_val = input[global_i]
diff --git a/problems/p27/p27.mojo b/problems/p27/p27.mojo
index e3eb8a93..88c30fba 100644
--- a/problems/p27/p27.mojo
+++ b/problems/p27/p27.mojo
@@ -28,8 +28,8 @@ fn traditional_dot_product[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Each thread computes partial product
     if global_i < size:
@@ -74,7 +74,7 @@ fn block_sum_dot_product[
     """Dot product using block.sum() - convenience function like warp.sum()!

    Replaces manual shared memory + barriers + tree reduction with one line."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x

     # FILL IN (roughly 6 lines)
@@ -104,8 +104,8 @@ fn block_histogram_bin_extract[
     3. Extract and pack only elements belonging to target_bin
     """

-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Step 1: Each thread determines its bin and element value

@@ -152,7 +152,7 @@ fn block_normalize_vector[
     4. Each thread normalizes: output[i] = input[i] / mean
     """

-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x

     # Step 1: Each thread loads its element
@@ -335,7 +335,7 @@ def main():
             # Execute histogram kernel for this specific bin
             alias kernel = block_histogram_bin_extract[
                 in_layout, bin_layout, out_layout, TPB
-            ],
+            ]
             ctx.enqueue_function_checked[kernel, kernel](
                 input_tensor,
                 bin_tensor,
@@ -409,9 +409,7 @@ def main():
             )

             # Execute vector normalization kernel
-            alias kernel = block_normalize_vector[
-                in_layout, vector_layout, TPB
-            ],
+            alias kernel = block_normalize_vector[in_layout, vector_layout, TPB]
             ctx.enqueue_function_checked[kernel, kernel](
                 input_tensor,
                 output_tensor,
diff --git a/problems/p29/p29.mojo b/problems/p29/p29.mojo
index 64a82d61..52ef8acf 100644
--- a/problems/p29/p29.mojo
+++ b/problems/p29/p29.mojo
@@ -57,8 +57,8 @@ fn multi_stage_image_blur_pipeline[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()

-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Stage 1: Load and preprocess (threads 0-127)

@@ -135,8 +135,8 @@ fn double_buffered_stencil_computation[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()

-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Initialize barriers (only thread 0)
     if local_i == 0:
diff --git a/problems/p33/p33.mojo b/problems/p33/p33.mojo
index b5e5d491..148249d1 100644
--- a/problems/p33/p33.mojo
+++ b/problems/p33/p33.mojo
@@ -35,11 +35,13 @@ fn matmul_idiomatic_tiled[
     local_row = thread_idx.y
     local_col = thread_idx.x

-    tiled_row = block_idx.y * tile_size_y + local_row
-    tiled_col = block_idx.x * tile_size_x + local_col
+    tiled_row = Int(block_idx.y * tile_size_y + local_row)
+    tiled_col = Int(block_idx.x * tile_size_x + local_col)

     # Get the tile of the output matrix that this thread block is responsible for
-    out_tile = output.tile[TILE_SIZE, TILE_SIZE](block_idx.y, block_idx.x)
+    out_tile = output.tile[TILE_SIZE, TILE_SIZE](
+        Int(block_idx.y), Int(block_idx.x)
+    )
     a_shared = LayoutTensor[
         dtype,
         Layout.row_major(TILE_SIZE, TILE_SIZE),
@@ -62,8 +64,8 @@ fn matmul_idiomatic_tiled[

     for idx in range(size // TILE_SIZE):  # Iterate over K tiles
         # Get tiles from A and B matrices
-        a_tile = a.tile[TILE_SIZE, TILE_SIZE](block_idx.y, idx)
-        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, block_idx.x)
+        a_tile = a.tile[TILE_SIZE, TILE_SIZE](Int(block_idx.y), idx)
+        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, Int(block_idx.x))

         # Asynchronously copy tiles to shared memory with consistent orientation
         copy_dram_to_sram_async[
@@ -141,7 +143,7 @@ fn tensor_core_matrix_multiplication[
     alias N = C.shape[1]()
     alias K = A.shape[1]()

-    warp_id = thread_idx.x // WARP_SIZE
+    warp_id = Int(thread_idx.x) // WARP_SIZE
     warps_in_n = BN // WN
     warps_in_m = BM // WM
     warp_y = warp_id // warps_in_n
@@ -149,7 +151,7 @@ fn tensor_core_matrix_multiplication[

     warp_is_active = warp_y < warps_in_m

-    C_block_tile = C.tile[BM, BN](block_idx.y, block_idx.x)
+    C_block_tile = C.tile[BM, BN](Int(block_idx.y), Int(block_idx.x))
     C_warp_tile = C_block_tile.tile[WM, WN](warp_y, warp_x)

     mma_op = TensorCore[A.dtype, C.dtype, Index(MMA_M, MMA_N, MMA_K)]()
@@ -190,8 +192,8 @@ fn tensor_core_matrix_multiplication[
     for k_i in range(K // BK):
         barrier()

-        A_dram_tile = A.tile[BM, BK](block_idx.y, k_i)
-        B_dram_tile = B.tile[BK, BN](k_i, block_idx.x)
+        A_dram_tile = A.tile[BM, BK](Int(block_idx.y), k_i)
+        B_dram_tile = B.tile[BK, BN](k_i, Int(block_idx.x))

         copy_dram_to_sram_async[
             thread_layout = Layout.row_major(4, 8),
diff --git a/problems/p34/p34.mojo b/problems/p34/p34.mojo
index d0678593..c1d41a96 100644
--- a/problems/p34/p34.mojo
+++ b/problems/p34/p34.mojo
@@ -29,7 +29,7 @@ fn cluster_coordination_basics[
     size: Int,
 ):
     """Real cluster coordination using SM90+ cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x

     # Check what's happening with cluster ranks
@@ -87,8 +87,8 @@ fn cluster_collective_operations[
     size: Int,
 ):
     """Cluster-wide collective operations using real cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # FILL IN (roughly 24 lines)

@@ -106,8 +106,8 @@ fn advanced_cluster_patterns[
 ):
     """Advanced cluster programming using cluster masks and relaxed synchronization.
""" - global_i = block_dim.x * block_idx.x + thread_idx.x - local_i = thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) + local_i = Int(thread_idx.x) # FILL IN (roughly 26 lines) diff --git a/solutions/p03/p03.mojo b/solutions/p03/p03.mojo index 2f90fa95..54cbc6ab 100644 --- a/solutions/p03/p03.mojo +++ b/solutions/p03/p03.mojo @@ -13,7 +13,7 @@ alias dtype = DType.float32 fn add_10_guard( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): i = thread_idx.x if i < size: @@ -36,7 +36,7 @@ def main(): ctx.enqueue_function_checked[add_10_guard, add_10_guard]( out, a, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p04/p04.mojo b/solutions/p04/p04.mojo index b0ae3aec..4c81df0a 100644 --- a/solutions/p04/p04.mojo +++ b/solutions/p04/p04.mojo @@ -13,7 +13,7 @@ alias dtype = DType.float32 fn add_10_2d( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -43,7 +43,7 @@ def main(): ctx.enqueue_function_checked[add_10_2d, add_10_2d]( out, a, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p04/p04_layout_tensor.mojo b/solutions/p04/p04_layout_tensor.mojo index a911f431..5df8a7ac 100644 --- a/solutions/p04/p04_layout_tensor.mojo +++ b/solutions/p04/p04_layout_tensor.mojo @@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE, SIZE) fn add_10_2d( output: LayoutTensor[dtype, layout, MutAnyOrigin], a: LayoutTensor[dtype, layout, MutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -51,7 +51,7 @@ def main(): ctx.enqueue_function_checked[add_10_2d, add_10_2d]( out_tensor, a_tensor, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p05/p05.mojo b/solutions/p05/p05.mojo index f5fb34db..0b1acd43 100644 --- a/solutions/p05/p05.mojo +++ b/solutions/p05/p05.mojo @@ -14,7 +14,7 @@ fn broadcast_add( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], b: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -48,7 +48,7 @@ def main(): out, a, b, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p05/p05_layout_tensor.mojo b/solutions/p05/p05_layout_tensor.mojo index d4963013..6701662f 100644 --- a/solutions/p05/p05_layout_tensor.mojo +++ b/solutions/p05/p05_layout_tensor.mojo @@ -21,7 +21,7 @@ fn broadcast_add[ output: LayoutTensor[dtype, out_layout, MutAnyOrigin], a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin], b: LayoutTensor[dtype, b_layout, ImmutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -66,7 +66,7 @@ def main(): out_tensor, a_tensor, b_tensor, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p06/p06.mojo b/solutions/p06/p06.mojo index 870dc570..e5db902a 100644 --- a/solutions/p06/p06.mojo +++ b/solutions/p06/p06.mojo @@ -13,7 +13,7 @@ alias dtype = DType.float32 fn add_10_blocks( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): i = block_dim.x * block_idx.x + thread_idx.x if i < size: @@ -36,7 +36,7 @@ def main(): ctx.enqueue_function_checked[add_10_blocks, add_10_blocks]( out, a, 
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p07/p07.mojo b/solutions/p07/p07.mojo
index e54f2b00..84c58629 100644
--- a/solutions/p07/p07.mojo
+++ b/solutions/p07/p07.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks_2d(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -43,7 +43,7 @@ def main():
         ctx.enqueue_function_checked[add_10_blocks_2d, add_10_blocks_2d](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p07/p07_layout_tensor.mojo b/solutions/p07/p07_layout_tensor.mojo
index e4a80f31..658b1bc8 100644
--- a/solutions/p07/p07_layout_tensor.mojo
+++ b/solutions/p07/p07_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn add_10_blocks_2d[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -54,7 +54,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p08/p08.mojo b/solutions/p08/p08.mojo
index 80fe6938..544e3409 100644
--- a/solutions/p08/p08.mojo
+++ b/solutions/p08/p08.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn add_10_shared(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -54,7 +54,7 @@ def main():
         ctx.enqueue_function_checked[add_10_shared, add_10_shared](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p08/p08_layout_tensor.mojo b/solutions/p08/p08_layout_tensor.mojo
index 49c2c66c..0fe2f0d7 100644
--- a/solutions/p08/p08_layout_tensor.mojo
+++ b/solutions/p08/p08_layout_tensor.mojo
@@ -19,7 +19,7 @@ fn add_10_shared_layout_tensor[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
@@ -62,7 +62,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p10/p10.mojo b/solutions/p10/p10.mojo
index c7b04f0d..1014fd72 100644
--- a/solutions/p10/p10.mojo
+++ b/solutions/p10/p10.mojo
@@ -18,7 +18,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn shared_memory_race(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     """Fixed: sequential access with barriers eliminates race conditions."""
     row = thread_idx.y
@@ -55,7 +55,7 @@ fn shared_memory_race(
 fn add_10_2d(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -105,7 +105,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
@@ -139,7 +139,7 @@ def main():
         ](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p11/p11.mojo b/solutions/p11/p11.mojo
index cef5ac59..c07ab3b7 100644
--- a/solutions/p11/p11.mojo
+++ b/solutions/p11/p11.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn pooling(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -34,7 +34,7 @@ fn pooling(
         output[0] = shared[0]
     elif global_i == 1:
         output[1] = shared[0] + shared[1]
-    elif 1 < global_i < size:
+    elif UInt(1) < global_i < size:
         output[global_i] = (
             shared[local_i - 2] + shared[local_i - 1] + shared[local_i]
         )
@@ -56,7 +56,7 @@ def main():
         ctx.enqueue_function_checked[pooling, pooling](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p11/p11_layout_tensor.mojo b/solutions/p11/p11_layout_tensor.mojo
index f1712756..c4607412 100644
--- a/solutions/p11/p11_layout_tensor.mojo
+++ b/solutions/p11/p11_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn pooling[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
@@ -44,7 +44,7 @@ fn pooling[
     elif global_i == 1:
         output[1] = shared[0] + shared[1]
     # Handle general case
-    elif 1 < global_i < size:
+    elif UInt(1) < global_i < size:
         output[global_i] = (
             shared[local_i - 2] + shared[local_i - 1] + shared[local_i]
         )
@@ -70,7 +70,7 @@ def main():
         ctx.enqueue_function_checked[pooling[layout], pooling[layout]](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p12/p12.mojo b/solutions/p12/p12.mojo
index f38f2d8c..793071fc 100644
--- a/solutions/p12/p12.mojo
+++ b/solutions/p12/p12.mojo
@@ -17,7 +17,7 @@ fn dot_product(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     b: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -41,7 +41,7 @@ fn dot_product(
     # and warps can be scheduled independently.
     # However, shared memory does not have such issues as long as we use `barrier()`
     # correctly when we're in the same thread block.
-    stride = TPB // 2
+    stride = UInt(TPB // 2)
     while stride > 0:
         if local_i < stride:
             shared[local_i] += shared[local_i + stride]
@@ -74,7 +74,7 @@ def main():
             out,
             a,
             b,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p12/p12_layout_tensor.mojo b/solutions/p12/p12_layout_tensor.mojo
index fe2303e9..05f38dc6 100644
--- a/solutions/p12/p12_layout_tensor.mojo
+++ b/solutions/p12/p12_layout_tensor.mojo
@@ -20,7 +20,7 @@ fn dot_product[
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = LayoutTensor[
         dtype,
@@ -39,7 +39,7 @@ fn dot_product[
     barrier()

     # Parallel reduction in shared memory
-    stride = TPB // 2
+    stride = UInt(TPB // 2)
     while stride > 0:
         if local_i < stride:
             shared[local_i] += shared[local_i + stride]
@@ -78,7 +78,7 @@ def main():
             out_tensor,
             a_tensor,
             b_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p13/p13.mojo b/solutions/p13/p13.mojo
index c24d2207..25946f5a 100644
--- a/solutions/p13/p13.mojo
+++ b/solutions/p13/p13.mojo
@@ -25,7 +25,7 @@ fn conv_1d_simple[
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    local_i = Int(thread_idx.x)
     shared_a = LayoutTensor[
         dtype,
         Layout.row_major(SIZE),
@@ -91,8 +91,8 @@ fn conv_1d_block_boundary[
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
     # first: need to account for padding
     shared_a = LayoutTensor[
         dtype,
diff --git a/solutions/p14/p14.mojo b/solutions/p14/p14.mojo
index e5082782..57c21475 100644
--- a/solutions/p14/p14.mojo
+++ b/solutions/p14/p14.mojo
@@ -20,7 +20,7 @@ fn prefix_sum_simple[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -35,7 +35,7 @@ fn prefix_sum_simple[

     barrier()

-    offset = 1
+    offset = UInt(1)
     for i in range(Int(log2(Scalar[dtype](TPB)))):
         var current_val: output.element_type = 0
         if local_i >= offset and local_i < size:
@@ -71,7 +71,7 @@ fn prefix_sum_local_phase[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -102,7 +102,7 @@ fn prefix_sum_local_phase[
     # Iteration 3 (offset=4):
     #   Block 0: [0,1,3,6,10+0,14+1,18+3,22+6] = [0,1,3,6,10,15,21,28]
     #   Block 1 follows same pattern to get [8,17,27,38,50,63,77,???]
- offset = 1 + offset = UInt(1) for i in range(Int(log2(Scalar[dtype](TPB)))): var current_val: output.element_type = 0 if local_i >= offset and local_i < TPB: @@ -134,7 +134,7 @@ fn prefix_sum_local_phase[ # Kernel 2: Add block sums to their respective blocks fn prefix_sum_block_sum_phase[ layout: Layout -](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: Int): +](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: UInt): global_i = block_dim.x * block_idx.x + thread_idx.x # Second pass: add previous block's sum to each element @@ -179,7 +179,7 @@ def main(): ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, - size, + UInt(size), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) @@ -195,7 +195,7 @@ def main(): ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, - size, + UInt(size), grid_dim=BLOCKS_PER_GRID_2, block_dim=THREADS_PER_BLOCK_2, ) @@ -204,7 +204,7 @@ def main(): alias kernel2 = prefix_sum_block_sum_phase[extended_layout] ctx.enqueue_function_checked[kernel2, kernel2]( out_tensor, - size, + UInt(size), grid_dim=BLOCKS_PER_GRID_2, block_dim=THREADS_PER_BLOCK_2, ) diff --git a/solutions/p15/p15.mojo b/solutions/p15/p15.mojo index d6ff9144..4c8d4050 100644 --- a/solutions/p15/p15.mojo +++ b/solutions/p15/p15.mojo @@ -21,7 +21,7 @@ fn axis_sum[ ]( output: LayoutTensor[dtype, out_layout, MutAnyOrigin], a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin], - size: Int, + size: UInt, ): global_i = block_dim.x * block_idx.x + thread_idx.x local_i = thread_idx.x @@ -50,7 +50,7 @@ fn axis_sum[ barrier() # do reduction sum per each block - stride = TPB // 2 + stride = UInt(TPB // 2) while stride > 0: # Read phase: all threads read the values they need first to avoid race conditions var temp_val: output.element_type = 0 @@ -92,7 +92,7 @@ def main(): ctx.enqueue_function_checked[kernel, kernel]( out_tensor, inp_tensor, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p16/p16.mojo b/solutions/p16/p16.mojo index e129735b..4ede3870 100644 --- a/solutions/p16/p16.mojo +++ b/solutions/p16/p16.mojo @@ -15,7 +15,7 @@ alias layout = Layout.row_major(SIZE, SIZE) # ANCHOR: naive_matmul_solution fn naive_matmul[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout, MutAnyOrigin], a: LayoutTensor[dtype, layout, ImmutAnyOrigin], @@ -39,7 +39,7 @@ fn naive_matmul[ # ANCHOR: single_block_matmul_solution fn single_block_matmul[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout, MutAnyOrigin], a: LayoutTensor[dtype, layout, ImmutAnyOrigin], @@ -90,7 +90,7 @@ alias layout_tiled = Layout.row_major(SIZE_TILED, SIZE_TILED) # ANCHOR: matmul_tiled_solution fn matmul_tiled[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout_tiled, MutAnyOrigin], a: LayoutTensor[dtype, layout_tiled, ImmutAnyOrigin], @@ -158,7 +158,7 @@ alias BLOCK_DIM_COUNT = 2 fn matmul_idiomatic_tiled[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout_tiled, MutAnyOrigin], a: LayoutTensor[dtype, layout_tiled, ImmutAnyOrigin], @@ -170,7 +170,7 @@ fn matmul_idiomatic_tiled[ tiled_col = block_idx.x * TPB + local_col # Get the tile of the output matrix that this thread block is responsible for - out_tile = output.tile[TPB, TPB](block_idx.y, block_idx.x) + out_tile = output.tile[TPB, TPB](Int(block_idx.y), Int(block_idx.x)) a_shared = LayoutTensor[ dtype, Layout.row_major(TPB, 
TPB), @@ -194,8 +194,8 @@ fn matmul_idiomatic_tiled[ @parameter for idx in range(size // TPB): # Perfect division: 9 // 3 = 3 tiles # Get tiles from A and B matrices - a_tile = a.tile[TPB, TPB](block_idx.y, idx) - b_tile = b.tile[TPB, TPB](idx, block_idx.x) + a_tile = a.tile[TPB, TPB](Int(block_idx.y), Int(idx)) + b_tile = b.tile[TPB, TPB](Int(idx), Int(block_idx.x)) # Asynchronously copy tiles to shared memory with consistent orientation copy_dram_to_sram_async[ @@ -264,7 +264,7 @@ def main(): b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp2) if argv()[1] == "--naive": - alias kernel = naive_matmul[layout, SIZE] + alias kernel = naive_matmul[layout, UInt(SIZE)] ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, @@ -273,7 +273,7 @@ def main(): block_dim=THREADS_PER_BLOCK, ) elif argv()[1] == "--single-block": - alias kernel = single_block_matmul[layout, SIZE] + alias kernel = single_block_matmul[layout, UInt(SIZE)] ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, @@ -293,7 +293,7 @@ def main(): inp2 ) - alias kernel = matmul_tiled[layout_tiled, SIZE_TILED] + alias kernel = matmul_tiled[layout_tiled, UInt(SIZE_TILED)] ctx.enqueue_function_checked[kernel, kernel]( out_tensor_tiled, a_tensor_tiled, @@ -312,7 +312,9 @@ def main(): inp2 ) - alias kernel = matmul_idiomatic_tiled[layout_tiled, SIZE_TILED] + alias kernel = matmul_idiomatic_tiled[ + layout_tiled, UInt(SIZE_TILED) + ] ctx.enqueue_function_checked[kernel, kernel]( out_tensor_tiled, a_tensor_tiled, diff --git a/solutions/p17/op/conv1d.mojo b/solutions/p17/op/conv1d.mojo index 6f746c26..516f7de4 100644 --- a/solutions/p17/op/conv1d.mojo +++ b/solutions/p17/op/conv1d.mojo @@ -21,8 +21,8 @@ fn conv1d_kernel[ input: LayoutTensor[dtype, in_layout, MutAnyOrigin], kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin], ): - global_i = block_dim.x * block_idx.x + thread_idx.x - local_i = thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) + local_i = Int(thread_idx.x) # first: need to account for padding shared_a = LayoutTensor[ dtype, diff --git a/solutions/p18/op/softmax.mojo b/solutions/p18/op/softmax.mojo index 16308532..c0280d6d 100644 --- a/solutions/p18/op/softmax.mojo +++ b/solutions/p18/op/softmax.mojo @@ -36,7 +36,7 @@ fn softmax_gpu_kernel[ MutAnyOrigin, address_space = AddressSpace.SHARED, ].stack_allocation() - global_i = thread_idx.x + global_i = Int(thread_idx.x) # Initialize out-of-bounds (shared_max[local_i], global_i >= input_size) shared memory addresses to the minimum # finite value for dtype, ensuring that if these elements are accessed in the parallel max reduction below they diff --git a/solutions/p19/op/attention.mojo b/solutions/p19/op/attention.mojo index ce6ab1c2..21f6ca37 100644 --- a/solutions/p19/op/attention.mojo +++ b/solutions/p19/op/attention.mojo @@ -44,14 +44,14 @@ fn matmul_idiomatic_tiled[ b: LayoutTensor[dtype, b_layout, MutAnyOrigin], ): """Updated idiomatic tiled matrix multiplication from p16.""" - local_row = thread_idx.y - local_col = thread_idx.x - tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row - tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col + local_row = Int(thread_idx.y) + local_col = Int(thread_idx.x) + tiled_row = Int(block_idx.y) * MATMUL_BLOCK_DIM_XY + local_row + tiled_col = Int(block_idx.x) * MATMUL_BLOCK_DIM_XY + local_col # Get the tile of the output matrix that this thread block is responsible for out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, block_idx.x + 
Int(block_idx.y), Int(block_idx.x) ) a_shared = LayoutTensor[ dtype, @@ -78,10 +78,10 @@ fn matmul_idiomatic_tiled[ for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY): # Get tiles from A and B matrices a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, idx + Int(block_idx.y), idx ) b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - idx, block_idx.x + idx, Int(block_idx.x) ) # Asynchronously copy tiles to shared memory with consistent orientation @@ -137,19 +137,19 @@ fn transpose_kernel[ address_space = AddressSpace.SHARED, ].stack_allocation() - local_row = thread_idx.y - local_col = thread_idx.x + local_row = Int(thread_idx.y) + local_col = Int(thread_idx.x) - global_row = block_idx.y * TRANSPOSE_BLOCK_DIM_XY + local_row - global_col = block_idx.x * TRANSPOSE_BLOCK_DIM_XY + local_col + global_row = Int(block_idx.y) * TRANSPOSE_BLOCK_DIM_XY + local_row + global_col = Int(block_idx.x) * TRANSPOSE_BLOCK_DIM_XY + local_col if global_row < rows and global_col < cols: shared_tile[local_row, local_col] = inp[global_row, global_col] barrier() - out_row = block_idx.x * TRANSPOSE_BLOCK_DIM_XY + local_row - out_col = block_idx.y * TRANSPOSE_BLOCK_DIM_XY + local_col + out_row = Int(block_idx.x) * TRANSPOSE_BLOCK_DIM_XY + local_row + out_col = Int(block_idx.y) * TRANSPOSE_BLOCK_DIM_XY + local_col # Store data from shared memory to global memory (coalesced write) # Note: we transpose the shared memory access pattern @@ -181,7 +181,7 @@ fn softmax_gpu_kernel[ MutAnyOrigin, address_space = AddressSpace.SHARED, ].stack_allocation() - global_i = thread_idx.x + global_i = Int(thread_idx.x) # Initialize out-of-bounds (shared_max[local_i], global_i >= input_size) shared memory addresses to the minimum # finite value for dtype, ensuring that if these elements are accessed in the parallel max reduction below they diff --git a/solutions/p21/op/embedding.mojo b/solutions/p21/op/embedding.mojo index 4ce564a2..07365220 100644 --- a/solutions/p21/op/embedding.mojo +++ b/solutions/p21/op/embedding.mojo @@ -33,7 +33,7 @@ fn embedding_kernel_coalesced[ """ # Simple 1D indexing - each thread = one output element - global_idx = block_idx.x * block_dim.x + thread_idx.x + global_idx = Int(block_idx.x * block_dim.x + thread_idx.x) total_elements = batch_size * seq_len * embed_dim if global_idx >= total_elements: @@ -85,8 +85,8 @@ fn embedding_kernel_2d[ """ # 2D grid indexing - batch_seq_idx = block_idx.x * block_dim.x + thread_idx.x - embed_idx = block_idx.y * block_dim.y + thread_idx.y + batch_seq_idx = Int(block_idx.x * block_dim.x + thread_idx.x) + embed_idx = Int(block_idx.y * block_dim.y + thread_idx.y) total_positions = batch_size * seq_len diff --git a/solutions/p22/op/layernorm_linear.mojo b/solutions/p22/op/layernorm_linear.mojo index 8760c183..2eaf8e51 100644 --- a/solutions/p22/op/layernorm_linear.mojo +++ b/solutions/p22/op/layernorm_linear.mojo @@ -33,12 +33,12 @@ fn matmul_idiomatic_tiled[ """Idiomatic tiled matrix multiplication from p19.""" local_row = thread_idx.y local_col = thread_idx.x - tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row - tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col + tiled_row = Int(block_idx.y * MATMUL_BLOCK_DIM_XY + local_row) + tiled_col = Int(block_idx.x * MATMUL_BLOCK_DIM_XY + local_col) # Get the tile of the output matrix that this thread block is responsible for out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, block_idx.x + Int(block_idx.y), Int(block_idx.x) ) a_shared 
= LayoutTensor[ dtype, @@ -65,10 +65,10 @@ fn matmul_idiomatic_tiled[ for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY): # Get tiles from A and B matrices a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, idx + Int(block_idx.y), idx ) b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - idx, block_idx.x + idx, Int(block_idx.x) ) # Asynchronously copy tiles to shared memory with consistent orientation @@ -123,9 +123,9 @@ fn layernorm_kernel[ ln_weight: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin], ln_bias: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin], ): - batch_idx = block_idx.x - seq_idx = block_idx.y - hidden_idx = thread_idx.x + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) + hidden_idx = Int(thread_idx.x) if ( batch_idx >= batch_size @@ -163,8 +163,8 @@ fn layernorm_kernel[ fn transpose_kernel[ layout_in: Layout, layout_out: Layout, - rows: Int, - cols: Int, + rows: UInt, + cols: UInt, dtype: DType = DType.float32, ]( output: LayoutTensor[dtype, layout_out, MutAnyOrigin], @@ -218,9 +218,9 @@ fn add_bias_kernel[ bias: LayoutTensor[dtype, bias_layout, ImmutAnyOrigin], ): """Simple bias addition.""" - batch_idx = block_idx.x - seq_idx = block_idx.y - out_idx = thread_idx.x + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) + out_idx = Int(thread_idx.x) if batch_idx >= batch_size or seq_idx >= seq_len or out_idx >= output_dim: return @@ -257,8 +257,8 @@ fn minimal_fused_kernel[ """ # Grid: (batch_size, seq_len) - one thread block per sequence position # Block: (1,) - single thread per sequence position to avoid redundant computation - batch_idx = block_idx.x - seq_idx = block_idx.y + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) if batch_idx >= batch_size or seq_idx >= seq_len: return @@ -330,8 +330,8 @@ fn minimal_fused_kernel_backward[ """ # Grid: (batch_size, seq_len) - one thread per sequence position # Block: (1,) - single thread per sequence position - batch_idx = block_idx.x - seq_idx = block_idx.y + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) if batch_idx >= batch_size or seq_idx >= seq_len: return @@ -601,8 +601,8 @@ struct LayerNormLinearCustomOp: alias kernel2 = transpose_kernel[ weight_layout, transposed_weight_tensor.layout, - output_dim, - hidden_dim, + UInt(output_dim), + UInt(hidden_dim), ] gpu_ctx.enqueue_function_checked[kernel2, kernel2]( transposed_weight_tensor, diff --git a/solutions/p24/p24.mojo b/solutions/p24/p24.mojo index 83d54757..6c922182 100644 --- a/solutions/p24/p24.mojo +++ b/solutions/p24/p24.mojo @@ -47,8 +47,8 @@ fn traditional_dot_product_p12_style[ MutAnyOrigin, address_space = AddressSpace.SHARED, ].stack_allocation() - global_i = block_dim.x * block_idx.x + thread_idx.x - local_i = thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) + local_i = Int(thread_idx.x) if global_i < size: shared[local_i] = (a[global_i] * b[global_i]).reduce_add() @@ -79,7 +79,7 @@ fn simple_warp_dot_product[ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin], b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin], ): - global_i = block_dim.x * block_idx.x + thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) # Each thread computes one partial product using vectorized approach as values in Mojo are SIMD based var partial_product: Scalar[dtype] = 0 diff --git a/solutions/p25/p25.mojo b/solutions/p25/p25.mojo index 660adbe9..efc012d8 100644 --- a/solutions/p25/p25.mojo +++ b/solutions/p25/p25.mojo @@ -25,8 +25,8 @@ fn 
diff --git a/solutions/p25/p25.mojo b/solutions/p25/p25.mojo
index 660adbe9..efc012d8 100644
--- a/solutions/p25/p25.mojo
+++ b/solutions/p25/p25.mojo
@@ -25,8 +25,8 @@ fn neighbor_difference[
     Uses shuffle_down(val, 1) to get the next neighbor's value.
     Works across multiple blocks, each processing one warp worth of data.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Get current value
@@ -67,8 +67,8 @@ fn moving_average_3[
     Uses shuffle_down with offsets 1 and 2 to access neighbors.
     Works within warp boundaries across multiple blocks.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Get current, next, and next+1 values
@@ -102,15 +102,15 @@ fn broadcast_shuffle_coordination[
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
     Each lane uses shuffle_down() for neighbor access and applies broadcast factor.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Step 1: Lane 0 computes block-local scaling factor
         var scale_factor: output.element_type = 0.0
         if lane == 0:
             # Compute average of first 4 elements in this block's data
-            block_start = block_idx.x * block_dim.x
+            block_start = Int(block_idx.x * block_dim.x)
             var sum: output.element_type = 0.0
             for i in range(4):
                 if block_start + i < size:
@@ -147,14 +147,14 @@ fn basic_broadcast[
     Basic broadcast: Lane 0 computes a block-local value, broadcasts it to all lanes.
     Each lane then uses this broadcast value in its own computation.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Step 1: Lane 0 computes special value (sum of first 4 elements in this block)
         var broadcast_value: output.element_type = 0.0
         if lane == 0:
-            block_start = block_idx.x * block_dim.x
+            block_start = Int(block_idx.x * block_dim.x)
             var sum: output.element_type = 0.0
             for i in range(4):
                 if block_start + i < size:
@@ -182,14 +182,14 @@ fn conditional_broadcast[
     Conditional broadcast: Lane 0 makes a decision based on block-local data, broadcasts it to all lanes.
     All lanes apply different logic based on the broadcast decision.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Step 1: Lane 0 analyzes block-local data and makes decision (find max of first 8 in block)
         var decision_value: output.element_type = 0.0
         if lane == 0:
-            block_start = block_idx.x * block_dim.x
+            block_start = Int(block_idx.x * block_dim.x)
             decision_value = input[block_start] if block_start < size else 0.0
             for i in range(1, min(8, min(WARP_SIZE, size - block_start))):
                 if block_start + i < size:
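The p25 kernels all bind lane = Int(lane_id()) once at the top instead of sprinkling conversions through the shuffle logic, since lane_id() is unsigned too. A compressed sketch of the neighbor-difference shape (hypothetical kernel; the import paths are assumed to match those already used by these puzzle files):

    from gpu import WARP_SIZE, block_dim, block_idx, lane_id, thread_idx
    from gpu.warp import shuffle_down
    from memory import UnsafePointer

    fn neighbor_diff_sketch(
        output: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        input: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        size: Int,
    ):
        global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
        lane = Int(lane_id())  # UInt -> Int once, reused below
        if global_i < size:
            current = input[global_i]
            next_val = shuffle_down(current, 1)  # value held by lane + 1
            # The last lane of a warp has no right neighbor: keep its value.
            if lane < WARP_SIZE - 1 and global_i < size - 1:
                output[global_i] = next_val - current
            else:
                output[global_i] = current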
diff --git a/solutions/p26/p26.mojo b/solutions/p26/p26.mojo
index 3967a90f..32755c9d 100644
--- a/solutions/p26/p26.mojo
+++ b/solutions/p26/p26.mojo
@@ -26,7 +26,7 @@ fn butterfly_pair_swap[
     Uses shuffle_xor(val, 1) to swap values within each pair.
     This is the foundation of butterfly network communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         current_val = input[global_i]
@@ -56,7 +56,7 @@ fn butterfly_parallel_max[
     Each step reduces the active range by half until all threads have the maximum value.
     This implements an efficient O(log n) parallel reduction algorithm.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         max_val = input[global_i]
@@ -93,7 +93,7 @@ fn butterfly_conditional_max[
     in even-numbered lanes. Odd-numbered lanes store the minimum value seen.
     Demonstrates conditional logic combined with butterfly communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     lane = lane_id()
 
     if global_i < size:
@@ -147,7 +147,7 @@ fn warp_inclusive_prefix_sum[
     NOTE: This implementation only works correctly within a single warp (WARP_SIZE threads).
     For multi-warp scenarios, additional coordination would be needed.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         current_val = input[global_i]
@@ -188,7 +188,7 @@ fn warp_partition[
     Input:  [3, 7, 1, 8, 2, 9, 4, 6]
     Result: [3, 1, 2, 4, 7, 8, 9, 6]  (< pivot | >= pivot).
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         current_val = input[global_i]
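In the butterfly hunks only the global index needs converting; the XOR shuffle itself takes small literal offsets. A minimal sketch of the pair-swap primitive the first p26 docstring describes (hypothetical kernel name; shuffle_xor assumed imported from the warp module as in these files):

    from gpu import block_dim, block_idx, thread_idx
    from gpu.warp import shuffle_xor
    from memory import UnsafePointer

    fn pair_swap_sketch(
        output: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        input: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        size: Int,
    ):
        global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
        if global_i < size:
            current_val = input[global_i]
            # XOR with 1 pairs lanes (0,1), (2,3), ...: each lane receives
            # its partner's value, swapping values within every pair.
            output[global_i] = shuffle_xor(current_val, 1)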
diff --git a/solutions/p27/p27.mojo b/solutions/p27/p27.mojo
index 797c9d9e..eb141f83 100644
--- a/solutions/p27/p27.mojo
+++ b/solutions/p27/p27.mojo
@@ -29,7 +29,7 @@ fn block_sum_dot_product[
     """Dot product using block.sum() - convenience function like warp.sum()!
     Replaces manual shared memory + barriers + tree reduction with one line."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x
 
     # Each thread computes partial product
@@ -70,8 +70,8 @@ fn traditional_dot_product[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Each thread computes partial product
     if global_i < size:
@@ -118,8 +118,8 @@ fn block_histogram_bin_extract[
     3. Extract and pack only elements belonging to target_bin
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Step 1: Each thread determines its bin and element value
     var my_value: Scalar[dtype] = 0.0
@@ -180,7 +180,7 @@ fn block_normalize_vector[
     4. Each thread normalizes: output[i] = input[i] / mean
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x
 
     # Step 1: Each thread loads its element
diff --git a/solutions/p28/p28.mojo b/solutions/p28/p28.mojo
index f3d80d5a..f5018f26 100644
--- a/solutions/p28/p28.mojo
+++ b/solutions/p28/p28.mojo
@@ -48,11 +48,11 @@ fn async_copy_overlap_convolution[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
 
-    local_i = thread_idx.x
+    local_i = Int(thread_idx.x)
 
     # Phase 1: Launch async copy for input tile
     # Note: tile() does NOT perform bounds checking - ensure valid tile bounds
-    input_tile = input.tile[CONV_TILE_SIZE](block_idx.x)
+    input_tile = input.tile[CONV_TILE_SIZE](Int(block_idx.x))
 
     # Use async copy with thread layout matching p14 pattern
     alias load_layout = Layout.row_major(THREADS_PER_BLOCK_ASYNC)
@@ -67,7 +67,7 @@ fn async_copy_overlap_convolution[
     barrier()  # Sync all threads
 
     # Phase 4: Compute convolution
-    global_i = block_idx.x * CONV_TILE_SIZE + local_i
+    global_i = Int(block_idx.x) * CONV_TILE_SIZE + local_i
 
     if local_i < CONV_TILE_SIZE and global_i < output.shape[0]():
         var result: output.element_type = 0
diff --git a/solutions/p29/p29.mojo b/solutions/p29/p29.mojo
index 0233fa19..75ce6894 100644
--- a/solutions/p29/p29.mojo
+++ b/solutions/p29/p29.mojo
@@ -53,8 +53,8 @@ fn multi_stage_image_blur_pipeline[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
 
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Stage 1: Load and preprocess (threads 0-127)
     if local_i < STAGE1_THREADS:
@@ -180,8 +180,8 @@ fn double_buffered_stencil_computation[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
 
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Initialize barriers (only thread 0)
     if local_i == 0:
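The p28 hunk shows the other placement of the conversion: CONV_TILE_SIZE is an Int alias, so only the unsigned block index gets wrapped before the multiply, and the rest of the arithmetic stays signed. A small sketch of that mixed form (hypothetical names; TILE stands in for an Int alias like CONV_TILE_SIZE):

    from gpu import block_idx, thread_idx
    from memory import UnsafePointer

    alias TILE = 256

    fn tile_index_sketch(
        output: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        size: Int,
    ):
        local_i = Int(thread_idx.x)
        # Convert just the unsigned factor, then mix freely with Int values.
        global_i = Int(block_idx.x) * TILE + local_i
        if local_i < TILE and global_i < size:
            output[global_i] = 1.0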
diff --git a/solutions/p33/p33.mojo b/solutions/p33/p33.mojo
index fba87415..b442f97f 100644
--- a/solutions/p33/p33.mojo
+++ b/solutions/p33/p33.mojo
@@ -35,11 +35,13 @@ fn matmul_idiomatic_tiled[
     local_row = thread_idx.y
     local_col = thread_idx.x
 
-    tiled_row = block_idx.y * tile_size_y + local_row
-    tiled_col = block_idx.x * tile_size_x + local_col
+    tiled_row = Int(block_idx.y * tile_size_y + local_row)
+    tiled_col = Int(block_idx.x * tile_size_x + local_col)
 
     # Get the tile of the output matrix that this thread block is responsible for
-    out_tile = output.tile[TILE_SIZE, TILE_SIZE](block_idx.y, block_idx.x)
+    out_tile = output.tile[TILE_SIZE, TILE_SIZE](
+        Int(block_idx.y), Int(block_idx.x)
+    )
     a_shared = LayoutTensor[
         dtype,
         Layout.row_major(TILE_SIZE, TILE_SIZE),
@@ -62,8 +64,8 @@ fn matmul_idiomatic_tiled[
 
     for idx in range(size // TILE_SIZE):  # Iterate over K tiles
         # Get tiles from A and B matrices
-        a_tile = a.tile[TILE_SIZE, TILE_SIZE](block_idx.y, idx)
-        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, block_idx.x)
+        a_tile = a.tile[TILE_SIZE, TILE_SIZE](Int(block_idx.y), idx)
+        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, Int(block_idx.x))
 
         # Asynchronously copy tiles to shared memory with consistent orientation
         copy_dram_to_sram_async[
@@ -141,7 +143,7 @@ fn tensor_core_matrix_multiplication[
     alias N = C.shape[1]()
     alias K = A.shape[1]()
 
-    warp_id = thread_idx.x // WARP_SIZE
+    warp_id = Int(thread_idx.x) // WARP_SIZE
     warps_in_n = BN // WN
     warps_in_m = BM // WM
 
     warp_y = warp_id // warps_in_n
@@ -149,7 +151,7 @@ fn tensor_core_matrix_multiplication[
 
     warp_is_active = warp_y < warps_in_m
 
-    C_block_tile = C.tile[BM, BN](block_idx.y, block_idx.x)
+    C_block_tile = C.tile[BM, BN](Int(block_idx.y), Int(block_idx.x))
     C_warp_tile = C_block_tile.tile[WM, WN](warp_y, warp_x)
 
     mma_op = TensorCore[A.dtype, C.dtype, Index(MMA_M, MMA_N, MMA_K)]()
@@ -192,8 +194,8 @@ fn tensor_core_matrix_multiplication[
     for k_i in range(K // BK):
         barrier()
 
-        A_dram_tile = A.tile[BM, BK](block_idx.y, k_i)
-        B_dram_tile = B.tile[BK, BN](k_i, block_idx.x)
+        A_dram_tile = A.tile[BM, BK](Int(block_idx.y), k_i)
+        B_dram_tile = B.tile[BK, BN](k_i, Int(block_idx.x))
 
         copy_dram_to_sram_async[
             thread_layout = Layout.row_major(4, 8),
diff --git a/solutions/p34/p34.mojo b/solutions/p34/p34.mojo
index a87cd73b..1e852488 100644
--- a/solutions/p34/p34.mojo
+++ b/solutions/p34/p34.mojo
@@ -29,7 +29,7 @@ fn cluster_coordination_basics[
     size: Int,
 ):
     """Real cluster coordination using SM90+ cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x
 
     # Check what's happening with cluster ranks
@@ -87,8 +87,8 @@ fn cluster_collective_operations[
     size: Int,
 ):
     """Cluster-wide collective operations using real cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     my_block_rank = Int(block_rank_in_cluster())
     block_id = Int(block_idx.x)
@@ -143,8 +143,8 @@ fn advanced_cluster_patterns[
 ):
     """Advanced cluster programming using cluster masks and relaxed synchronization.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     my_block_rank = Int(block_rank_in_cluster())
     block_id = Int(block_idx.x)
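The cluster kernels close the patch with the same idiom applied to API return values: block_rank_in_cluster() yields an unsigned rank, so it is converted once into an Int local and every later comparison stays signed. A final sketch (hypothetical kernel; assumes the cluster import p34 already uses):

    from gpu import WARP_SIZE, block_idx, thread_idx
    from gpu.cluster import block_rank_in_cluster

    fn cluster_rank_sketch():
        # Convert unsigned intrinsics once, at the top of the kernel.
        my_block_rank = Int(block_rank_in_cluster())
        warp_id = Int(thread_idx.x) // WARP_SIZE
        block_id = Int(block_idx.x)
        if my_block_rank == 0 and block_id == 0 and warp_id == 0:
            # Leader block / leader warp work would go here.
            pass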