From ee96fb3c80b47567d2d9073e35063662bc6fcf04 Mon Sep 17 00:00:00 2001
From: Brad Larson
Date: Sun, 16 Nov 2025 15:50:01 -0600
Subject: [PATCH] Make Int <-> UInt conversions explicit.

Co-authored-by: raju
---
 problems/p03/p03.mojo                  |  4 +--
 problems/p04/p04.mojo                  |  4 +--
 problems/p04/p04_layout_tensor.mojo    |  4 +--
 problems/p05/p05.mojo                  |  4 +--
 problems/p05/p05_layout_tensor.mojo    |  4 +--
 problems/p06/p06.mojo                  |  4 +--
 problems/p07/p07.mojo                  |  4 +--
 problems/p07/p07_layout_tensor.mojo    |  4 +--
 problems/p08/p08.mojo                  |  4 +--
 problems/p08/p08_layout_tensor.mojo    |  4 +--
 problems/p10/p10.mojo                  |  8 +++---
 problems/p11/p11.mojo                  |  4 +--
 problems/p11/p11_layout_tensor.mojo    |  4 +--
 problems/p12/p12.mojo                  |  4 +--
 problems/p12/p12_layout_tensor.mojo    |  4 +--
 problems/p13/p13.mojo                  |  6 ++--
 problems/p14/p14.mojo                  | 12 ++++----
 problems/p15/p15.mojo                  |  4 +--
 problems/p16/p16.mojo                  | 12 ++++----
 problems/p17/op/conv1d.mojo            |  4 +--
 problems/p19/op/attention.mojo         | 16 +++++------
 problems/p21/op/embedding.mojo         |  6 ++--
 problems/p22/op/layernorm_linear.mojo  | 38 +++++++++++++-------------
 problems/p24/p24.mojo                  |  6 ++--
 problems/p25/p25.mojo                  | 20 +++++++-------
 problems/p26/p26.mojo                  | 10 +++----
 problems/p27/p27.mojo                  | 18 ++++++------
 problems/p29/p29.mojo                  |  8 +++---
 problems/p33/p33.mojo                  | 20 ++++++++------
 problems/p34/p34.mojo                  | 10 +++----
 solutions/p03/p03.mojo                 |  4 +--
 solutions/p04/p04.mojo                 |  4 +--
 solutions/p04/p04_layout_tensor.mojo   |  4 +--
 solutions/p05/p05.mojo                 |  4 +--
 solutions/p05/p05_layout_tensor.mojo   |  4 +--
 solutions/p06/p06.mojo                 |  4 +--
 solutions/p07/p07.mojo                 |  4 +--
 solutions/p07/p07_layout_tensor.mojo   |  4 +--
 solutions/p08/p08.mojo                 |  4 +--
 solutions/p08/p08_layout_tensor.mojo   |  4 +--
 solutions/p10/p10.mojo                 |  8 +++---
 solutions/p11/p11.mojo                 |  6 ++--
 solutions/p11/p11_layout_tensor.mojo   |  6 ++--
 solutions/p12/p12.mojo                 |  6 ++--
 solutions/p12/p12_layout_tensor.mojo   |  6 ++--
 solutions/p13/p13.mojo                 |  6 ++--
 solutions/p14/p14.mojo                 | 16 +++++------
 solutions/p15/p15.mojo                 |  6 ++--
 solutions/p16/p16.mojo                 | 24 ++++++++--------
 solutions/p17/op/conv1d.mojo           |  4 +--
 solutions/p18/op/softmax.mojo          |  2 +-
 solutions/p19/op/attention.mojo        | 28 +++++++++----------
 solutions/p21/op/embedding.mojo        |  6 ++--
 solutions/p22/op/layernorm_linear.mojo | 38 +++++++++++++-------------
 solutions/p24/p24.mojo                 |  6 ++--
 solutions/p25/p25.mojo                 | 26 +++++++++---------
 solutions/p26/p26.mojo                 | 10 +++----
 solutions/p27/p27.mojo                 | 12 ++++----
 solutions/p28/p28.mojo                 |  6 ++--
 solutions/p29/p29.mojo                 |  8 +++---
 solutions/p33/p33.mojo                 | 20 ++++++++------
 solutions/p34/p34.mojo                 | 10 +++----
 62 files changed, 279 insertions(+), 275 deletions(-)

diff --git a/problems/p03/p03.mojo b/problems/p03/p03.mojo
index f388faf4..0c99fddf 100644
--- a/problems/p03/p03.mojo
+++ b/problems/p03/p03.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_guard(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     i = thread_idx.x
     # FILL ME IN (roughly 2 lines)
@@ -35,7 +35,7 @@ def main():
         ctx.enqueue_function_checked[add_10_guard, add_10_guard](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p04/p04.mojo b/problems/p04/p04.mojo
index 6e11fe1c..156868d6 100644
--- a/problems/p04/p04.mojo
+++ b/problems/p04/p04.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_2d(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -41,7 +41,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p04/p04_layout_tensor.mojo b/problems/p04/p04_layout_tensor.mojo
index da393f0e..c5698daa 100644
--- a/problems/p04/p04_layout_tensor.mojo
+++ b/problems/p04/p04_layout_tensor.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn add_10_2d(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -46,7 +46,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p05/p05.mojo b/problems/p05/p05.mojo
index 2d9079d4..70dd0204 100644
--- a/problems/p05/p05.mojo
+++ b/problems/p05/p05.mojo
@@ -14,7 +14,7 @@ fn broadcast_add(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     b: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -45,7 +45,7 @@ def main():
             out,
             a,
             b,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p05/p05_layout_tensor.mojo b/problems/p05/p05_layout_tensor.mojo
index a0996217..9da93208 100644
--- a/problems/p05/p05_layout_tensor.mojo
+++ b/problems/p05/p05_layout_tensor.mojo
@@ -21,7 +21,7 @@ fn broadcast_add[
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, b_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -63,7 +63,7 @@ def main():
             out_tensor,
             a_tensor,
             b_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p06/p06.mojo b/problems/p06/p06.mojo
index 6992f420..68ee774d 100644
--- a/problems/p06/p06.mojo
+++ b/problems/p06/p06.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     i = block_dim.x * block_idx.x + thread_idx.x
     # FILL ME IN (roughly 2 lines)
@@ -35,7 +35,7 @@ def main():
         ctx.enqueue_function_checked[add_10_blocks, add_10_blocks](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p07/p07.mojo b/problems/p07/p07.mojo
index 225dcace..71174c38 100644
--- a/problems/p07/p07.mojo
+++ b/problems/p07/p07.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks_2d(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -42,7 +42,7 @@ def main():
         ctx.enqueue_function_checked[add_10_blocks_2d, add_10_blocks_2d](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p07/p07_layout_tensor.mojo b/problems/p07/p07_layout_tensor.mojo
index 4c7baf41..973ebb92 100644
--- a/problems/p07/p07_layout_tensor.mojo
+++ b/problems/p07/p07_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn add_10_blocks_2d[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -53,7 +53,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p08/p08.mojo b/problems/p08/p08.mojo
index e7bf09b3..935d2c8b 100644
--- a/problems/p08/p08.mojo
+++ b/problems/p08/p08.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn add_10_shared(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -48,7 +48,7 @@ def main():
         ctx.enqueue_function_checked[add_10_shared, add_10_shared](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p08/p08_layout_tensor.mojo b/problems/p08/p08_layout_tensor.mojo
index 498ac8e2..af416c97 100644
--- a/problems/p08/p08_layout_tensor.mojo
+++ b/problems/p08/p08_layout_tensor.mojo
@@ -19,7 +19,7 @@ fn add_10_shared_layout_tensor[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using LayoutTensor with explicit address_space
     shared = LayoutTensor[
@@ -57,7 +57,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p10/p10.mojo b/problems/p10/p10.mojo
index 6109c2ba..bd456efc 100644
--- a/problems/p10/p10.mojo
+++ b/problems/p10/p10.mojo
@@ -17,7 +17,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn shared_memory_race(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -45,7 +45,7 @@ fn shared_memory_race(
 fn add_10_2d(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -94,7 +94,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
@@ -127,7 +127,7 @@ def main():
         ](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p11/p11.mojo b/problems/p11/p11.mojo
index ebc94c6e..22fc7157 100644
--- a/problems/p11/p11.mojo
+++ b/problems/p11/p11.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn pooling(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -44,7 +44,7 @@ def main():
         ctx.enqueue_function_checked[pooling, pooling](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p11/p11_layout_tensor.mojo b/problems/p11/p11_layout_tensor.mojo
index c2cfe3f1..de87752b 100644
--- a/problems/p11/p11_layout_tensor.mojo
+++ b/problems/p11/p11_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn pooling[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
@@ -53,7 +53,7 @@ def main():
         ctx.enqueue_function_checked[pooling[layout], pooling[layout]](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p12/p12.mojo b/problems/p12/p12.mojo
index 9c6a1afe..65b3c9fe 100644
--- a/problems/p12/p12.mojo
+++ b/problems/p12/p12.mojo
@@ -17,7 +17,7 @@ fn dot_product(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     b: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
@@ -43,7 +43,7 @@ def main():
             out,
             a,
             b,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p12/p12_layout_tensor.mojo b/problems/p12/p12_layout_tensor.mojo
index e5ad9bd3..f8ddea21 100644
--- a/problems/p12/p12_layout_tensor.mojo
+++ b/problems/p12/p12_layout_tensor.mojo
@@ -22,7 +22,7 @@ fn dot_product[
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
@@ -54,7 +54,7 @@ def main():
             out_tensor,
             a_tensor,
             b_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p13/p13.mojo b/problems/p13/p13.mojo
index bc541cee..9b227865 100644
--- a/problems/p13/p13.mojo
+++ b/problems/p13/p13.mojo
@@ -25,7 +25,7 @@ fn conv_1d_simple[
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    local_i = Int(thread_idx.x)
     # FILL ME IN (roughly 14 lines)


@@ -48,8 +48,8 @@ fn conv_1d_block_boundary[
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
     # FILL ME IN (roughly 18 lines)


diff --git a/problems/p14/p14.mojo b/problems/p14/p14.mojo
index d1e4b9cc..7b78e849 100644
--- a/problems/p14/p14.mojo
+++ b/problems/p14/p14.mojo
@@ -20,7 +20,7 @@ fn prefix_sum_simple[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -44,7 +44,7 @@ fn prefix_sum_local_phase[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -54,7 +54,7 @@ fn prefix_sum_local_phase[
 # Kernel 2: Add block sums to their respective blocks
 fn prefix_sum_block_sum_phase[
     layout: Layout
-](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: Int):
+](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: UInt):
     global_i = block_dim.x * block_idx.x + thread_idx.x

     # FILL ME IN (roughly 3 lines)
@@ -98,7 +98,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            size,
+            UInt(size),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
@@ -114,7 +114,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            size,
+            UInt(size),
             grid_dim=BLOCKS_PER_GRID_2,
             block_dim=THREADS_PER_BLOCK_2,
         )
@@ -128,7 +128,7 @@ def main():
         alias kernel2 = prefix_sum_block_sum_phase[extended_layout]
         ctx.enqueue_function_checked[kernel2, kernel2](
             out_tensor,
-            size,
+            UInt(size),
             grid_dim=BLOCKS_PER_GRID_2,
             block_dim=THREADS_PER_BLOCK_2,
         )
diff --git a/problems/p15/p15.mojo b/problems/p15/p15.mojo
index fad73211..1e4fb6f2 100644
--- a/problems/p15/p15.mojo
+++ b/problems/p15/p15.mojo
@@ -23,7 +23,7 @@ fn axis_sum[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -52,7 +52,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             inp_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/problems/p16/p16.mojo b/problems/p16/p16.mojo
index d04c577c..8af616e5 100644
--- a/problems/p16/p16.mojo
+++ b/problems/p16/p16.mojo
@@ -17,7 +17,7 @@ alias layout = Layout.row_major(SIZE, SIZE)


 fn naive_matmul[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
@@ -33,7 +33,7 @@

 # ANCHOR: single_block_matmul
 fn single_block_matmul[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
@@ -56,7 +56,7 @@ alias layout_tiled = Layout.row_major(SIZE_TILED, SIZE_TILED)


 fn matmul_tiled[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[dtype, layout_tiled, MutAnyOrigin],
     a: LayoutTensor[dtype, layout_tiled, ImmutAnyOrigin],
@@ -114,7 +114,7 @@ def main():
         b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp2)

         if argv()[1] == "--naive":
-            alias kernel = naive_matmul[layout, SIZE]
+            alias kernel = naive_matmul[layout, UInt(SIZE)]
             ctx.enqueue_function_checked[kernel, kernel](
                 out_tensor,
                 a_tensor,
@@ -123,7 +123,7 @@ def main():
                 block_dim=THREADS_PER_BLOCK,
             )
         elif argv()[1] == "--single-block":
-            alias kernel = single_block_matmul[layout, SIZE]
+            alias kernel = single_block_matmul[layout, UInt(SIZE)]
             ctx.enqueue_function_checked[kernel, kernel](
                 out_tensor,
                 a_tensor,
@@ -143,7 +143,7 @@ def main():
                 inp2
             )

-            alias kernel = matmul_tiled[layout_tiled, SIZE_TILED]
+            alias kernel = matmul_tiled[layout_tiled, UInt(SIZE_TILED)]
             ctx.enqueue_function_checked[kernel, kernel](
                 out_tensor_tiled,
                 a_tensor_tiled,
diff --git a/problems/p17/op/conv1d.mojo b/problems/p17/op/conv1d.mojo
index 05d4d248..c0ca5ad3 100644
--- a/problems/p17/op/conv1d.mojo
+++ b/problems/p17/op/conv1d.mojo
@@ -20,8 +20,8 @@ fn conv1d_kernel[
     input: LayoutTensor[dtype, in_layout, MutAnyOrigin],
     kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
     # first: need to account for padding
     shared_a = LayoutTensor[
         dtype,
diff --git a/problems/p19/op/attention.mojo b/problems/p19/op/attention.mojo
index eddb45d1..1299d253 100644
--- a/problems/p19/op/attention.mojo
+++ b/problems/p19/op/attention.mojo
@@ -42,14 +42,14 @@ fn matmul_idiomatic_tiled[
     b: LayoutTensor[mut=False, dtype, b_layout, MutAnyOrigin],
 ):
     """Updated idiomatic tiled matrix multiplication from p16."""
-    local_row = thread_idx.y
-    local_col = thread_idx.x
-    tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row
-    tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col
+    local_row = Int(thread_idx.y)
+    local_col = Int(thread_idx.x)
+    tiled_row = Int(block_idx.y) * MATMUL_BLOCK_DIM_XY + local_row
+    tiled_col = Int(block_idx.x) * MATMUL_BLOCK_DIM_XY + local_col

     # Get the tile of the output matrix that this thread block is responsible for
     out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-        block_idx.y, block_idx.x
+        Int(block_idx.y), Int(block_idx.x)
     )
     a_shared = LayoutTensor[
         dtype,
@@ -76,10 +76,10 @@ fn matmul_idiomatic_tiled[
     for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY):
         # Get tiles from A and B matrices
         a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            block_idx.y, idx
+            Int(block_idx.y), idx
         )
         b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            idx, block_idx.x
+            idx, Int(block_idx.x)
         )

         # Asynchronously copy tiles to shared memory with consistent orientation
@@ -155,7 +155,7 @@ fn softmax_gpu_kernel[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = thread_idx.x
+    global_i = Int(thread_idx.x)

     # Initialize out-of-bounds (shared_max[local_i], global_i >= input_size) shared memory addresses to the minimum
     # finite value for dtype, ensuring that if these elements are accessed in the parallel max reduction below they
diff --git a/problems/p21/op/embedding.mojo b/problems/p21/op/embedding.mojo
index 18f205e4..c25249ae 100644
--- a/problems/p21/op/embedding.mojo
+++ b/problems/p21/op/embedding.mojo
@@ -33,7 +33,7 @@ fn embedding_kernel_coalesced[
     """

     # Simple 1D indexing - each thread = one output element
-    global_idx = block_idx.x * block_dim.x + thread_idx.x
+    global_idx = Int(block_idx.x * block_dim.x + thread_idx.x)
     total_elements = batch_size * seq_len * embed_dim

     if global_idx >= total_elements:
@@ -77,8 +77,8 @@ fn embedding_kernel_2d[
     """

     # 2D grid indexing
-    batch_seq_idx = block_idx.x * block_dim.x + thread_idx.x
-    embed_idx = block_idx.y * block_dim.y + thread_idx.y
+    batch_seq_idx = Int(block_idx.x * block_dim.x + thread_idx.x)
+    embed_idx = Int(block_idx.y * block_dim.y + thread_idx.y)
     total_positions = batch_size * seq_len

     if batch_seq_idx >= total_positions or embed_idx >= embed_dim:
diff --git a/problems/p22/op/layernorm_linear.mojo b/problems/p22/op/layernorm_linear.mojo
index 2f3b4019..7792b60b 100644
--- a/problems/p22/op/layernorm_linear.mojo
+++ b/problems/p22/op/layernorm_linear.mojo
@@ -35,12 +35,12 @@ fn matmul_idiomatic_tiled[
     """Idiomatic tiled matrix multiplication from p19."""
     local_row = thread_idx.y
     local_col = thread_idx.x
-    tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row
-    tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col
+    tiled_row = Int(block_idx.y * MATMUL_BLOCK_DIM_XY + local_row)
+    tiled_col = Int(block_idx.x * MATMUL_BLOCK_DIM_XY + local_col)

     # Get the tile of the output matrix that this thread block is responsible for
     out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-        block_idx.y, block_idx.x
+        Int(block_idx.y), Int(block_idx.x)
     )
     a_shared = LayoutTensor[
         dtype,
@@ -67,10 +67,10 @@ fn matmul_idiomatic_tiled[
     for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY):
         # Get tiles from A and B matrices
         a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            block_idx.y, idx
+            Int(block_idx.y), idx
         )
         b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY](
-            idx, block_idx.x
+            idx, Int(block_idx.x)
         )

         # Asynchronously copy tiles to shared memory with consistent orientation
@@ -124,9 +124,9 @@ fn layernorm_kernel[
     ln_weight: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin],
     ln_bias: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin],
 ):
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
-    hidden_idx = thread_idx.x
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)
+    hidden_idx = Int(thread_idx.x)

     if (
         batch_idx >= batch_size
@@ -149,8 +149,8 @@ fn layernorm_kernel[
 fn transpose_kernel[
     layout_in: Layout,
     layout_out: Layout,
-    rows: Int,
-    cols: Int,
+    rows: UInt,
+    cols: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[dtype, layout_out, MutAnyOrigin],
@@ -203,9 +203,9 @@ fn add_bias_kernel[
     bias: LayoutTensor[dtype, bias_layout, ImmutAnyOrigin],
 ):
     """Simple bias addition."""
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
-    out_idx = thread_idx.x
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)
+    out_idx = Int(thread_idx.x)

     if batch_idx >= batch_size or seq_idx >= seq_len or out_idx >= output_dim:
         return
@@ -241,8 +241,8 @@ fn minimal_fused_kernel[
     """
     # Grid: (batch_size, seq_len) - one thread block per sequence position
     # Block: (1,) - single thread per sequence position to avoid redundant computation
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)

     if batch_idx >= batch_size or seq_idx >= seq_len:
         return
@@ -290,8 +290,8 @@ fn minimal_fused_kernel_backward[
     """
     # Grid: (batch_size, seq_len) - one thread per sequence position
     # Block: (1,) - single thread per sequence position
-    batch_idx = block_idx.x
-    seq_idx = block_idx.y
+    batch_idx = Int(block_idx.x)
+    seq_idx = Int(block_idx.y)

     if batch_idx >= batch_size or seq_idx >= seq_len:
         return
@@ -464,8 +464,8 @@ struct LayerNormLinearCustomOp:
         alias kernel2 = transpose_kernel[
             weight_layout,
             transposed_weight_tensor.layout,
-            output_dim,
-            hidden_dim,
+            UInt(output_dim),
+            UInt(hidden_dim),
         ]
         gpu_ctx.enqueue_function_checked[kernel2, kernel2](
             transposed_weight_tensor,
diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index c48c6b51..2339aa2b 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -47,8 +47,8 @@ fn traditional_dot_product_p12_style[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     if global_i < size:
         shared[local_i] = (a[global_i] * b[global_i]).reduce_add()
@@ -79,7 +79,7 @@ fn simple_warp_dot_product[
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL IN (6 lines at most)

diff --git a/problems/p25/p25.mojo b/problems/p25/p25.mojo
index 110e408a..76c62ad9 100644
--- a/problems/p25/p25.mojo
+++ b/problems/p25/p25.mojo
@@ -24,8 +24,8 @@ fn neighbor_difference[
     Uses shuffle_down(val, 1) to get the next neighbor's value.
     Works across multiple blocks, each processing one warp worth of data.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     # FILL IN (roughly 7 lines)

@@ -50,8 +50,8 @@ fn moving_average_3[
     Uses shuffle_down with offsets 1 and 2 to access neighbors.
     Works within warp boundaries across multiple blocks.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     # FILL IN (roughly 10 lines)

@@ -71,8 +71,8 @@ fn broadcast_shuffle_coordination[
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
     Each lane uses shuffle_down() for neighbor access and applies broadcast factor.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     if global_i < size:
         var scale_factor: output.element_type = 0.0
@@ -93,8 +93,8 @@ fn basic_broadcast[
     Basic broadcast: Lane 0 computes a block-local value, broadcasts it to all lanes.
     Each lane then uses this broadcast value in its own computation.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     if global_i < size:
         var broadcast_value: output.element_type = 0.0
@@ -115,8 +115,8 @@ fn conditional_broadcast[
     Conditional broadcast: Lane 0 makes a decision based on block-local data, broadcasts it to all lanes.
     All lanes apply different logic based on the broadcast decision.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())

     if global_i < size:
         var decision_value: output.element_type = 0.0
diff --git a/problems/p26/p26.mojo b/problems/p26/p26.mojo
index 87c9f408..6eacef9d 100644
--- a/problems/p26/p26.mojo
+++ b/problems/p26/p26.mojo
@@ -25,7 +25,7 @@ fn butterfly_pair_swap[
     Uses shuffle_xor(val, 1) to swap values within each pair.
     This is the foundation of butterfly network communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL ME IN (4 lines)

@@ -47,7 +47,7 @@ fn butterfly_parallel_max[
     This implements an efficient O(log n) parallel reduction algorithm
     that works for any WARP_SIZE (32, 64, etc.).
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL ME IN (roughly 7 lines)

@@ -73,7 +73,7 @@ fn butterfly_conditional_max[
     in even-numbered lanes. Odd-numbered lanes store the minimum value seen.
     Demonstrates conditional logic combined with butterfly communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     lane = lane_id()

     if global_i < size:
@@ -113,7 +113,7 @@ fn warp_inclusive_prefix_sum[
     NOTE: This implementation only works correctly within a single warp (WARP_SIZE threads).
     For multi-warp scenarios, additional coordination would be needed.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     # FILL ME IN (roughly 4 lines)

@@ -145,7 +145,7 @@ fn warp_partition[
     Input:  [3, 7, 1, 8, 2, 9, 4, 6]
     Result: [3, 1, 2, 4, 7, 8, 9, 6]  (< pivot | >= pivot).
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)

     if global_i < size:
         current_val = input[global_i]
diff --git a/problems/p27/p27.mojo b/problems/p27/p27.mojo
index e3eb8a93..88c30fba 100644
--- a/problems/p27/p27.mojo
+++ b/problems/p27/p27.mojo
@@ -28,8 +28,8 @@ fn traditional_dot_product[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Each thread computes partial product
     if global_i < size:
@@ -74,7 +74,7 @@ fn block_sum_dot_product[
     """Dot product using block.sum() - convenience function like warp.sum()!

    Replaces manual shared memory + barriers + tree reduction with one line."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x

     # FILL IN (roughly 6 lines)
@@ -104,8 +104,8 @@ fn block_histogram_bin_extract[
     3. Extract and pack only elements belonging to target_bin
     """

-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Step 1: Each thread determines its bin and element value

@@ -152,7 +152,7 @@ fn block_normalize_vector[
     4. Each thread normalizes: output[i] = input[i] / mean
     """

-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x

     # Step 1: Each thread loads its element
@@ -335,7 +335,7 @@ def main():
             # Execute histogram kernel for this specific bin
             alias kernel = block_histogram_bin_extract[
                 in_layout, bin_layout, out_layout, TPB
-            ],
+            ]
             ctx.enqueue_function_checked[kernel, kernel](
                 input_tensor,
                 bin_tensor,
@@ -409,9 +409,7 @@ def main():
             )

             # Execute vector normalization kernel
-            alias kernel = block_normalize_vector[
-                in_layout, vector_layout, TPB
-            ],
+            alias kernel = block_normalize_vector[in_layout, vector_layout, TPB]
             ctx.enqueue_function_checked[kernel, kernel](
                 input_tensor,
                 output_tensor,
diff --git a/problems/p29/p29.mojo b/problems/p29/p29.mojo
index 64a82d61..52ef8acf 100644
--- a/problems/p29/p29.mojo
+++ b/problems/p29/p29.mojo
@@ -57,8 +57,8 @@ fn multi_stage_image_blur_pipeline[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()

-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Stage 1: Load and preprocess (threads 0-127)

@@ -135,8 +135,8 @@ fn double_buffered_stencil_computation[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()

-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # Initialize barriers (only thread 0)
     if local_i == 0:
diff --git a/problems/p33/p33.mojo b/problems/p33/p33.mojo
index b5e5d491..148249d1 100644
--- a/problems/p33/p33.mojo
+++ b/problems/p33/p33.mojo
@@ -35,11 +35,13 @@ fn matmul_idiomatic_tiled[
     local_row = thread_idx.y
     local_col = thread_idx.x

-    tiled_row = block_idx.y * tile_size_y + local_row
-    tiled_col = block_idx.x * tile_size_x + local_col
+    tiled_row = Int(block_idx.y * tile_size_y + local_row)
+    tiled_col = Int(block_idx.x * tile_size_x + local_col)

     # Get the tile of the output matrix that this thread block is responsible for
-    out_tile = output.tile[TILE_SIZE, TILE_SIZE](block_idx.y, block_idx.x)
+    out_tile = output.tile[TILE_SIZE, TILE_SIZE](
+        Int(block_idx.y), Int(block_idx.x)
+    )
     a_shared = LayoutTensor[
         dtype,
         Layout.row_major(TILE_SIZE, TILE_SIZE),
@@ -62,8 +64,8 @@ fn matmul_idiomatic_tiled[

     for idx in range(size // TILE_SIZE):  # Iterate over K tiles
         # Get tiles from A and B matrices
-        a_tile = a.tile[TILE_SIZE, TILE_SIZE](block_idx.y, idx)
-        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, block_idx.x)
+        a_tile = a.tile[TILE_SIZE, TILE_SIZE](Int(block_idx.y), idx)
+        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, Int(block_idx.x))

         # Asynchronously copy tiles to shared memory with consistent orientation
         copy_dram_to_sram_async[
@@ -141,7 +143,7 @@ fn tensor_core_matrix_multiplication[
     alias N = C.shape[1]()
     alias K = A.shape[1]()

-    warp_id = thread_idx.x // WARP_SIZE
+    warp_id = Int(thread_idx.x) // WARP_SIZE
     warps_in_n = BN // WN
     warps_in_m = BM // WM
     warp_y = warp_id // warps_in_n
@@ -149,7 +151,7 @@ fn tensor_core_matrix_multiplication[

     warp_is_active = warp_y < warps_in_m

-    C_block_tile = C.tile[BM, BN](block_idx.y, block_idx.x)
+    C_block_tile = C.tile[BM, BN](Int(block_idx.y), Int(block_idx.x))
     C_warp_tile = C_block_tile.tile[WM, WN](warp_y, warp_x)

     mma_op = TensorCore[A.dtype, C.dtype, Index(MMA_M, MMA_N, MMA_K)]()
@@ -190,8 +192,8 @@ fn tensor_core_matrix_multiplication[
     for k_i in range(K // BK):
         barrier()

-        A_dram_tile = A.tile[BM, BK](block_idx.y, k_i)
-        B_dram_tile = B.tile[BK, BN](k_i, block_idx.x)
+        A_dram_tile = A.tile[BM, BK](Int(block_idx.y), k_i)
+        B_dram_tile = B.tile[BK, BN](k_i, Int(block_idx.x))

         copy_dram_to_sram_async[
             thread_layout = Layout.row_major(4, 8),
diff --git a/problems/p34/p34.mojo b/problems/p34/p34.mojo
index d0678593..c1d41a96 100644
--- a/problems/p34/p34.mojo
+++ b/problems/p34/p34.mojo
@@ -29,7 +29,7 @@ fn cluster_coordination_basics[
     size: Int,
 ):
     """Real cluster coordination using SM90+ cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x

     # Check what's happening with cluster ranks
@@ -87,8 +87,8 @@ fn cluster_collective_operations[
     size: Int,
 ):
     """Cluster-wide collective operations using real cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)

     # FILL IN (roughly 24 lines)

@@ -106,8 +106,8 @@ fn advanced_cluster_patterns[
 ):
     """Advanced cluster programming using cluster masks and relaxed synchronization.
""" - global_i = block_dim.x * block_idx.x + thread_idx.x - local_i = thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) + local_i = Int(thread_idx.x) # FILL IN (roughly 26 lines) diff --git a/solutions/p03/p03.mojo b/solutions/p03/p03.mojo index 2f90fa95..54cbc6ab 100644 --- a/solutions/p03/p03.mojo +++ b/solutions/p03/p03.mojo @@ -13,7 +13,7 @@ alias dtype = DType.float32 fn add_10_guard( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): i = thread_idx.x if i < size: @@ -36,7 +36,7 @@ def main(): ctx.enqueue_function_checked[add_10_guard, add_10_guard]( out, a, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p04/p04.mojo b/solutions/p04/p04.mojo index b0ae3aec..4c81df0a 100644 --- a/solutions/p04/p04.mojo +++ b/solutions/p04/p04.mojo @@ -13,7 +13,7 @@ alias dtype = DType.float32 fn add_10_2d( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -43,7 +43,7 @@ def main(): ctx.enqueue_function_checked[add_10_2d, add_10_2d]( out, a, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p04/p04_layout_tensor.mojo b/solutions/p04/p04_layout_tensor.mojo index a911f431..5df8a7ac 100644 --- a/solutions/p04/p04_layout_tensor.mojo +++ b/solutions/p04/p04_layout_tensor.mojo @@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE, SIZE) fn add_10_2d( output: LayoutTensor[dtype, layout, MutAnyOrigin], a: LayoutTensor[dtype, layout, MutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -51,7 +51,7 @@ def main(): ctx.enqueue_function_checked[add_10_2d, add_10_2d]( out_tensor, a_tensor, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p05/p05.mojo b/solutions/p05/p05.mojo index f5fb34db..0b1acd43 100644 --- a/solutions/p05/p05.mojo +++ b/solutions/p05/p05.mojo @@ -14,7 +14,7 @@ fn broadcast_add( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], b: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -48,7 +48,7 @@ def main(): out, a, b, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p05/p05_layout_tensor.mojo b/solutions/p05/p05_layout_tensor.mojo index d4963013..6701662f 100644 --- a/solutions/p05/p05_layout_tensor.mojo +++ b/solutions/p05/p05_layout_tensor.mojo @@ -21,7 +21,7 @@ fn broadcast_add[ output: LayoutTensor[dtype, out_layout, MutAnyOrigin], a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin], b: LayoutTensor[dtype, b_layout, ImmutAnyOrigin], - size: Int, + size: UInt, ): row = thread_idx.y col = thread_idx.x @@ -66,7 +66,7 @@ def main(): out_tensor, a_tensor, b_tensor, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p06/p06.mojo b/solutions/p06/p06.mojo index 870dc570..e5db902a 100644 --- a/solutions/p06/p06.mojo +++ b/solutions/p06/p06.mojo @@ -13,7 +13,7 @@ alias dtype = DType.float32 fn add_10_blocks( output: UnsafePointer[Scalar[dtype], MutAnyOrigin], a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - size: Int, + size: UInt, ): i = block_dim.x * block_idx.x + thread_idx.x if i < size: @@ -36,7 +36,7 @@ def main(): ctx.enqueue_function_checked[add_10_blocks, add_10_blocks]( out, a, 
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p07/p07.mojo b/solutions/p07/p07.mojo
index e54f2b00..84c58629 100644
--- a/solutions/p07/p07.mojo
+++ b/solutions/p07/p07.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks_2d(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -43,7 +43,7 @@ def main():
         ctx.enqueue_function_checked[add_10_blocks_2d, add_10_blocks_2d](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p07/p07_layout_tensor.mojo b/solutions/p07/p07_layout_tensor.mojo
index e4a80f31..658b1bc8 100644
--- a/solutions/p07/p07_layout_tensor.mojo
+++ b/solutions/p07/p07_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn add_10_blocks_2d[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
@@ -54,7 +54,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p08/p08.mojo b/solutions/p08/p08.mojo
index 80fe6938..544e3409 100644
--- a/solutions/p08/p08.mojo
+++ b/solutions/p08/p08.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn add_10_shared(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -54,7 +54,7 @@ def main():
         ctx.enqueue_function_checked[add_10_shared, add_10_shared](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p08/p08_layout_tensor.mojo b/solutions/p08/p08_layout_tensor.mojo
index 49c2c66c..0fe2f0d7 100644
--- a/solutions/p08/p08_layout_tensor.mojo
+++ b/solutions/p08/p08_layout_tensor.mojo
@@ -19,7 +19,7 @@ fn add_10_shared_layout_tensor[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
@@ -62,7 +62,7 @@ def main():
         ctx.enqueue_function_checked[kernel, kernel](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p10/p10.mojo b/solutions/p10/p10.mojo
index c7b04f0d..1014fd72 100644
--- a/solutions/p10/p10.mojo
+++ b/solutions/p10/p10.mojo
@@ -18,7 +18,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn shared_memory_race(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     """Fixed: sequential access with barriers eliminates race conditions."""
     row = thread_idx.y
@@ -55,7 +55,7 @@ fn shared_memory_race(
 fn add_10_2d(
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -105,7 +105,7 @@ def main():
         ctx.enqueue_function_checked[add_10_2d, add_10_2d](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
@@ -139,7 +139,7 @@ def main():
         ](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p11/p11.mojo b/solutions/p11/p11.mojo
index cef5ac59..c07ab3b7 100644
--- a/solutions/p11/p11.mojo
+++ b/solutions/p11/p11.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn pooling(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -34,7 +34,7 @@ fn pooling(
         output[0] = shared[0]
     elif global_i == 1:
         output[1] = shared[0] + shared[1]
-    elif 1 < global_i < size:
+    elif UInt(1) < global_i < size:
         output[global_i] = (
             shared[local_i - 2] + shared[local_i - 1] + shared[local_i]
         )
@@ -56,7 +56,7 @@ def main():
         ctx.enqueue_function_checked[pooling, pooling](
             out,
             a,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p11/p11_layout_tensor.mojo b/solutions/p11/p11_layout_tensor.mojo
index f1712756..c4607412 100644
--- a/solutions/p11/p11_layout_tensor.mojo
+++ b/solutions/p11/p11_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn pooling[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
@@ -44,7 +44,7 @@ fn pooling[
     elif global_i == 1:
         output[1] = shared[0] + shared[1]
     # Handle general case
-    elif 1 < global_i < size:
+    elif UInt(1) < global_i < size:
         output[global_i] = (
             shared[local_i - 2] + shared[local_i - 1] + shared[local_i]
         )
@@ -70,7 +70,7 @@ def main():
         ctx.enqueue_function_checked[pooling[layout], pooling[layout]](
             out_tensor,
             a_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p12/p12.mojo b/solutions/p12/p12.mojo
index f38f2d8c..793071fc 100644
--- a/solutions/p12/p12.mojo
+++ b/solutions/p12/p12.mojo
@@ -17,7 +17,7 @@ fn dot_product(
     output: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     b: UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
@@ -41,7 +41,7 @@ fn dot_product(
     # and warps can be scheduled independently.
     # However, shared memory does not have such issues as long as we use `barrier()`
     # correctly when we're in the same thread block.
-    stride = TPB // 2
+    stride = UInt(TPB // 2)
     while stride > 0:
         if local_i < stride:
             shared[local_i] += shared[local_i + stride]
@@ -74,7 +74,7 @@ def main():
             out,
             a,
             b,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p12/p12_layout_tensor.mojo b/solutions/p12/p12_layout_tensor.mojo
index fe2303e9..05f38dc6 100644
--- a/solutions/p12/p12_layout_tensor.mojo
+++ b/solutions/p12/p12_layout_tensor.mojo
@@ -20,7 +20,7 @@ fn dot_product[
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     shared = LayoutTensor[
         dtype,
@@ -39,7 +39,7 @@ fn dot_product[
     barrier()

     # Parallel reduction in shared memory
-    stride = TPB // 2
+    stride = UInt(TPB // 2)
     while stride > 0:
         if local_i < stride:
             shared[local_i] += shared[local_i + stride]
@@ -78,7 +78,7 @@ def main():
             out_tensor,
             a_tensor,
             b_tensor,
-            SIZE,
+            UInt(SIZE),
             grid_dim=BLOCKS_PER_GRID,
             block_dim=THREADS_PER_BLOCK,
         )
diff --git a/solutions/p13/p13.mojo b/solutions/p13/p13.mojo
index c24d2207..25946f5a 100644
--- a/solutions/p13/p13.mojo
+++ b/solutions/p13/p13.mojo
@@ -25,7 +25,7 @@ fn conv_1d_simple[
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    local_i = Int(thread_idx.x)
     shared_a = LayoutTensor[
         dtype,
         Layout.row_major(SIZE),
@@ -91,8 +91,8 @@ fn conv_1d_block_boundary[
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
     b: LayoutTensor[dtype, conv_layout, ImmutAnyOrigin],
 ):
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
     # first: need to account for padding
     shared_a = LayoutTensor[
         dtype,
diff --git a/solutions/p14/p14.mojo b/solutions/p14/p14.mojo
index e5082782..57c21475 100644
--- a/solutions/p14/p14.mojo
+++ b/solutions/p14/p14.mojo
@@ -20,7 +20,7 @@ fn prefix_sum_simple[
 ](
     output: LayoutTensor[dtype, layout, MutAnyOrigin],
     a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -35,7 +35,7 @@ fn prefix_sum_simple[

     barrier()

-    offset = 1
+    offset = UInt(1)
     for i in range(Int(log2(Scalar[dtype](TPB)))):
         var current_val: output.element_type = 0
         if local_i >= offset and local_i < size:
@@ -71,7 +71,7 @@ fn prefix_sum_local_phase[
 ](
     output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
     a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -102,7 +102,7 @@ fn prefix_sum_local_phase[
     # Iteration 3 (offset=4):
     #   Block 0: [0,1,3,6,10+0,14+1,18+3,22+6] = [0,1,3,6,10,15,21,28]
     #   Block 1 follows same pattern to get [8,17,27,38,50,63,77,???]
- offset = 1 + offset = UInt(1) for i in range(Int(log2(Scalar[dtype](TPB)))): var current_val: output.element_type = 0 if local_i >= offset and local_i < TPB: @@ -134,7 +134,7 @@ fn prefix_sum_local_phase[ # Kernel 2: Add block sums to their respective blocks fn prefix_sum_block_sum_phase[ layout: Layout -](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: Int): +](output: LayoutTensor[dtype, layout, MutAnyOrigin], size: UInt): global_i = block_dim.x * block_idx.x + thread_idx.x # Second pass: add previous block's sum to each element @@ -179,7 +179,7 @@ def main(): ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, - size, + UInt(size), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) @@ -195,7 +195,7 @@ def main(): ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, - size, + UInt(size), grid_dim=BLOCKS_PER_GRID_2, block_dim=THREADS_PER_BLOCK_2, ) @@ -204,7 +204,7 @@ def main(): alias kernel2 = prefix_sum_block_sum_phase[extended_layout] ctx.enqueue_function_checked[kernel2, kernel2]( out_tensor, - size, + UInt(size), grid_dim=BLOCKS_PER_GRID_2, block_dim=THREADS_PER_BLOCK_2, ) diff --git a/solutions/p15/p15.mojo b/solutions/p15/p15.mojo index d6ff9144..4c8d4050 100644 --- a/solutions/p15/p15.mojo +++ b/solutions/p15/p15.mojo @@ -21,7 +21,7 @@ fn axis_sum[ ]( output: LayoutTensor[dtype, out_layout, MutAnyOrigin], a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin], - size: Int, + size: UInt, ): global_i = block_dim.x * block_idx.x + thread_idx.x local_i = thread_idx.x @@ -50,7 +50,7 @@ fn axis_sum[ barrier() # do reduction sum per each block - stride = TPB // 2 + stride = UInt(TPB // 2) while stride > 0: # Read phase: all threads read the values they need first to avoid race conditions var temp_val: output.element_type = 0 @@ -92,7 +92,7 @@ def main(): ctx.enqueue_function_checked[kernel, kernel]( out_tensor, inp_tensor, - SIZE, + UInt(SIZE), grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) diff --git a/solutions/p16/p16.mojo b/solutions/p16/p16.mojo index e129735b..4ede3870 100644 --- a/solutions/p16/p16.mojo +++ b/solutions/p16/p16.mojo @@ -15,7 +15,7 @@ alias layout = Layout.row_major(SIZE, SIZE) # ANCHOR: naive_matmul_solution fn naive_matmul[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout, MutAnyOrigin], a: LayoutTensor[dtype, layout, ImmutAnyOrigin], @@ -39,7 +39,7 @@ fn naive_matmul[ # ANCHOR: single_block_matmul_solution fn single_block_matmul[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout, MutAnyOrigin], a: LayoutTensor[dtype, layout, ImmutAnyOrigin], @@ -90,7 +90,7 @@ alias layout_tiled = Layout.row_major(SIZE_TILED, SIZE_TILED) # ANCHOR: matmul_tiled_solution fn matmul_tiled[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout_tiled, MutAnyOrigin], a: LayoutTensor[dtype, layout_tiled, ImmutAnyOrigin], @@ -158,7 +158,7 @@ alias BLOCK_DIM_COUNT = 2 fn matmul_idiomatic_tiled[ - layout: Layout, size: Int + layout: Layout, size: UInt ]( output: LayoutTensor[dtype, layout_tiled, MutAnyOrigin], a: LayoutTensor[dtype, layout_tiled, ImmutAnyOrigin], @@ -170,7 +170,7 @@ fn matmul_idiomatic_tiled[ tiled_col = block_idx.x * TPB + local_col # Get the tile of the output matrix that this thread block is responsible for - out_tile = output.tile[TPB, TPB](block_idx.y, block_idx.x) + out_tile = output.tile[TPB, TPB](Int(block_idx.y), Int(block_idx.x)) a_shared = LayoutTensor[ dtype, Layout.row_major(TPB, 
TPB), @@ -194,8 +194,8 @@ fn matmul_idiomatic_tiled[ @parameter for idx in range(size // TPB): # Perfect division: 9 // 3 = 3 tiles # Get tiles from A and B matrices - a_tile = a.tile[TPB, TPB](block_idx.y, idx) - b_tile = b.tile[TPB, TPB](idx, block_idx.x) + a_tile = a.tile[TPB, TPB](Int(block_idx.y), Int(idx)) + b_tile = b.tile[TPB, TPB](Int(idx), Int(block_idx.x)) # Asynchronously copy tiles to shared memory with consistent orientation copy_dram_to_sram_async[ @@ -264,7 +264,7 @@ def main(): b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp2) if argv()[1] == "--naive": - alias kernel = naive_matmul[layout, SIZE] + alias kernel = naive_matmul[layout, UInt(SIZE)] ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, @@ -273,7 +273,7 @@ def main(): block_dim=THREADS_PER_BLOCK, ) elif argv()[1] == "--single-block": - alias kernel = single_block_matmul[layout, SIZE] + alias kernel = single_block_matmul[layout, UInt(SIZE)] ctx.enqueue_function_checked[kernel, kernel]( out_tensor, a_tensor, @@ -293,7 +293,7 @@ def main(): inp2 ) - alias kernel = matmul_tiled[layout_tiled, SIZE_TILED] + alias kernel = matmul_tiled[layout_tiled, UInt(SIZE_TILED)] ctx.enqueue_function_checked[kernel, kernel]( out_tensor_tiled, a_tensor_tiled, @@ -312,7 +312,9 @@ def main(): inp2 ) - alias kernel = matmul_idiomatic_tiled[layout_tiled, SIZE_TILED] + alias kernel = matmul_idiomatic_tiled[ + layout_tiled, UInt(SIZE_TILED) + ] ctx.enqueue_function_checked[kernel, kernel]( out_tensor_tiled, a_tensor_tiled, diff --git a/solutions/p17/op/conv1d.mojo b/solutions/p17/op/conv1d.mojo index 6f746c26..516f7de4 100644 --- a/solutions/p17/op/conv1d.mojo +++ b/solutions/p17/op/conv1d.mojo @@ -21,8 +21,8 @@ fn conv1d_kernel[ input: LayoutTensor[dtype, in_layout, MutAnyOrigin], kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin], ): - global_i = block_dim.x * block_idx.x + thread_idx.x - local_i = thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) + local_i = Int(thread_idx.x) # first: need to account for padding shared_a = LayoutTensor[ dtype, diff --git a/solutions/p18/op/softmax.mojo b/solutions/p18/op/softmax.mojo index 16308532..c0280d6d 100644 --- a/solutions/p18/op/softmax.mojo +++ b/solutions/p18/op/softmax.mojo @@ -36,7 +36,7 @@ fn softmax_gpu_kernel[ MutAnyOrigin, address_space = AddressSpace.SHARED, ].stack_allocation() - global_i = thread_idx.x + global_i = Int(thread_idx.x) # Initialize out-of-bounds (shared_max[local_i], global_i >= input_size) shared memory addresses to the minimum # finite value for dtype, ensuring that if these elements are accessed in the parallel max reduction below they diff --git a/solutions/p19/op/attention.mojo b/solutions/p19/op/attention.mojo index ce6ab1c2..21f6ca37 100644 --- a/solutions/p19/op/attention.mojo +++ b/solutions/p19/op/attention.mojo @@ -44,14 +44,14 @@ fn matmul_idiomatic_tiled[ b: LayoutTensor[dtype, b_layout, MutAnyOrigin], ): """Updated idiomatic tiled matrix multiplication from p16.""" - local_row = thread_idx.y - local_col = thread_idx.x - tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row - tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col + local_row = Int(thread_idx.y) + local_col = Int(thread_idx.x) + tiled_row = Int(block_idx.y) * MATMUL_BLOCK_DIM_XY + local_row + tiled_col = Int(block_idx.x) * MATMUL_BLOCK_DIM_XY + local_col # Get the tile of the output matrix that this thread block is responsible for out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, block_idx.x + 
Int(block_idx.y), Int(block_idx.x) ) a_shared = LayoutTensor[ dtype, @@ -78,10 +78,10 @@ fn matmul_idiomatic_tiled[ for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY): # Get tiles from A and B matrices a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, idx + Int(block_idx.y), idx ) b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - idx, block_idx.x + idx, Int(block_idx.x) ) # Asynchronously copy tiles to shared memory with consistent orientation @@ -137,19 +137,19 @@ fn transpose_kernel[ address_space = AddressSpace.SHARED, ].stack_allocation() - local_row = thread_idx.y - local_col = thread_idx.x + local_row = Int(thread_idx.y) + local_col = Int(thread_idx.x) - global_row = block_idx.y * TRANSPOSE_BLOCK_DIM_XY + local_row - global_col = block_idx.x * TRANSPOSE_BLOCK_DIM_XY + local_col + global_row = Int(block_idx.y) * TRANSPOSE_BLOCK_DIM_XY + local_row + global_col = Int(block_idx.x) * TRANSPOSE_BLOCK_DIM_XY + local_col if global_row < rows and global_col < cols: shared_tile[local_row, local_col] = inp[global_row, global_col] barrier() - out_row = block_idx.x * TRANSPOSE_BLOCK_DIM_XY + local_row - out_col = block_idx.y * TRANSPOSE_BLOCK_DIM_XY + local_col + out_row = Int(block_idx.x) * TRANSPOSE_BLOCK_DIM_XY + local_row + out_col = Int(block_idx.y) * TRANSPOSE_BLOCK_DIM_XY + local_col # Store data from shared memory to global memory (coalesced write) # Note: we transpose the shared memory access pattern @@ -181,7 +181,7 @@ fn softmax_gpu_kernel[ MutAnyOrigin, address_space = AddressSpace.SHARED, ].stack_allocation() - global_i = thread_idx.x + global_i = Int(thread_idx.x) # Initialize out-of-bounds (shared_max[local_i], global_i >= input_size) shared memory addresses to the minimum # finite value for dtype, ensuring that if these elements are accessed in the parallel max reduction below they diff --git a/solutions/p21/op/embedding.mojo b/solutions/p21/op/embedding.mojo index 4ce564a2..07365220 100644 --- a/solutions/p21/op/embedding.mojo +++ b/solutions/p21/op/embedding.mojo @@ -33,7 +33,7 @@ fn embedding_kernel_coalesced[ """ # Simple 1D indexing - each thread = one output element - global_idx = block_idx.x * block_dim.x + thread_idx.x + global_idx = Int(block_idx.x * block_dim.x + thread_idx.x) total_elements = batch_size * seq_len * embed_dim if global_idx >= total_elements: @@ -85,8 +85,8 @@ fn embedding_kernel_2d[ """ # 2D grid indexing - batch_seq_idx = block_idx.x * block_dim.x + thread_idx.x - embed_idx = block_idx.y * block_dim.y + thread_idx.y + batch_seq_idx = Int(block_idx.x * block_dim.x + thread_idx.x) + embed_idx = Int(block_idx.y * block_dim.y + thread_idx.y) total_positions = batch_size * seq_len diff --git a/solutions/p22/op/layernorm_linear.mojo b/solutions/p22/op/layernorm_linear.mojo index 8760c183..2eaf8e51 100644 --- a/solutions/p22/op/layernorm_linear.mojo +++ b/solutions/p22/op/layernorm_linear.mojo @@ -33,12 +33,12 @@ fn matmul_idiomatic_tiled[ """Idiomatic tiled matrix multiplication from p19.""" local_row = thread_idx.y local_col = thread_idx.x - tiled_row = block_idx.y * MATMUL_BLOCK_DIM_XY + local_row - tiled_col = block_idx.x * MATMUL_BLOCK_DIM_XY + local_col + tiled_row = Int(block_idx.y * MATMUL_BLOCK_DIM_XY + local_row) + tiled_col = Int(block_idx.x * MATMUL_BLOCK_DIM_XY + local_col) # Get the tile of the output matrix that this thread block is responsible for out_tile = output.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, block_idx.x + Int(block_idx.y), Int(block_idx.x) ) a_shared 
= LayoutTensor[ dtype, @@ -65,10 +65,10 @@ fn matmul_idiomatic_tiled[ for idx in range((inner + MATMUL_BLOCK_DIM_XY - 1) // MATMUL_BLOCK_DIM_XY): # Get tiles from A and B matrices a_tile = a.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - block_idx.y, idx + Int(block_idx.y), idx ) b_tile = b.tile[MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY]( - idx, block_idx.x + idx, Int(block_idx.x) ) # Asynchronously copy tiles to shared memory with consistent orientation @@ -123,9 +123,9 @@ fn layernorm_kernel[ ln_weight: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin], ln_bias: LayoutTensor[dtype, ln_params_layout, ImmutAnyOrigin], ): - batch_idx = block_idx.x - seq_idx = block_idx.y - hidden_idx = thread_idx.x + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) + hidden_idx = Int(thread_idx.x) if ( batch_idx >= batch_size @@ -163,8 +163,8 @@ fn layernorm_kernel[ fn transpose_kernel[ layout_in: Layout, layout_out: Layout, - rows: Int, - cols: Int, + rows: UInt, + cols: UInt, dtype: DType = DType.float32, ]( output: LayoutTensor[dtype, layout_out, MutAnyOrigin], @@ -218,9 +218,9 @@ fn add_bias_kernel[ bias: LayoutTensor[dtype, bias_layout, ImmutAnyOrigin], ): """Simple bias addition.""" - batch_idx = block_idx.x - seq_idx = block_idx.y - out_idx = thread_idx.x + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) + out_idx = Int(thread_idx.x) if batch_idx >= batch_size or seq_idx >= seq_len or out_idx >= output_dim: return @@ -257,8 +257,8 @@ fn minimal_fused_kernel[ """ # Grid: (batch_size, seq_len) - one thread block per sequence position # Block: (1,) - single thread per sequence position to avoid redundant computation - batch_idx = block_idx.x - seq_idx = block_idx.y + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) if batch_idx >= batch_size or seq_idx >= seq_len: return @@ -330,8 +330,8 @@ fn minimal_fused_kernel_backward[ """ # Grid: (batch_size, seq_len) - one thread per sequence position # Block: (1,) - single thread per sequence position - batch_idx = block_idx.x - seq_idx = block_idx.y + batch_idx = Int(block_idx.x) + seq_idx = Int(block_idx.y) if batch_idx >= batch_size or seq_idx >= seq_len: return @@ -601,8 +601,8 @@ struct LayerNormLinearCustomOp: alias kernel2 = transpose_kernel[ weight_layout, transposed_weight_tensor.layout, - output_dim, - hidden_dim, + UInt(output_dim), + UInt(hidden_dim), ] gpu_ctx.enqueue_function_checked[kernel2, kernel2]( transposed_weight_tensor, diff --git a/solutions/p24/p24.mojo b/solutions/p24/p24.mojo index 83d54757..6c922182 100644 --- a/solutions/p24/p24.mojo +++ b/solutions/p24/p24.mojo @@ -47,8 +47,8 @@ fn traditional_dot_product_p12_style[ MutAnyOrigin, address_space = AddressSpace.SHARED, ].stack_allocation() - global_i = block_dim.x * block_idx.x + thread_idx.x - local_i = thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) + local_i = Int(thread_idx.x) if global_i < size: shared[local_i] = (a[global_i] * b[global_i]).reduce_add() @@ -79,7 +79,7 @@ fn simple_warp_dot_product[ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin], b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin], ): - global_i = block_dim.x * block_idx.x + thread_idx.x + global_i = Int(block_dim.x * block_idx.x + thread_idx.x) # Each thread computes one partial product using vectorized approach as values in Mojo are SIMD based var partial_product: Scalar[dtype] = 0 diff --git a/solutions/p25/p25.mojo b/solutions/p25/p25.mojo index 660adbe9..efc012d8 100644 --- a/solutions/p25/p25.mojo +++ b/solutions/p25/p25.mojo @@ -25,8 +25,8 @@ fn 
diff --git a/solutions/p25/p25.mojo b/solutions/p25/p25.mojo
index 660adbe9..efc012d8 100644
--- a/solutions/p25/p25.mojo
+++ b/solutions/p25/p25.mojo
@@ -25,8 +25,8 @@ fn neighbor_difference[
     Uses shuffle_down(val, 1) to get the next neighbor's value.
     Works across multiple blocks, each processing one warp worth of data.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Get current value
@@ -67,8 +67,8 @@ fn moving_average_3[
     Uses shuffle_down with offsets 1 and 2 to access neighbors.
     Works within warp boundaries across multiple blocks.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Get current, next, and next+1 values
@@ -102,15 +102,15 @@ fn broadcast_shuffle_coordination[
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
     Each lane uses shuffle_down() for neighbor access and applies broadcast factor.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Step 1: Lane 0 computes block-local scaling factor
         var scale_factor: output.element_type = 0.0
         if lane == 0:
             # Compute average of first 4 elements in this block's data
-            block_start = block_idx.x * block_dim.x
+            block_start = Int(block_idx.x * block_dim.x)
             var sum: output.element_type = 0.0
             for i in range(4):
                 if block_start + i < size:
@@ -147,14 +147,14 @@ fn basic_broadcast[
     Basic broadcast: Lane 0 computes a block-local value, broadcasts it to all lanes.
     Each lane then uses this broadcast value in its own computation.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Step 1: Lane 0 computes special value (sum of first 4 elements in this block)
         var broadcast_value: output.element_type = 0.0
         if lane == 0:
-            block_start = block_idx.x * block_dim.x
+            block_start = Int(block_idx.x * block_dim.x)
             var sum: output.element_type = 0.0
             for i in range(4):
                 if block_start + i < size:
@@ -182,14 +182,14 @@ fn conditional_broadcast[
     Conditional broadcast: Lane 0 makes a decision based on block-local data, broadcasts it to all lanes.
     All lanes apply different logic based on the broadcast decision.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    lane = lane_id()
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    lane = Int(lane_id())
 
     if global_i < size:
         # Step 1: Lane 0 analyzes block-local data and makes decision (find max of first 8 in block)
         var decision_value: output.element_type = 0.0
         if lane == 0:
-            block_start = block_idx.x * block_dim.x
+            block_start = Int(block_idx.x * block_dim.x)
             decision_value = input[block_start] if block_start < size else 0.0
             for i in range(1, min(8, min(WARP_SIZE, size - block_start))):
                 if block_start + i < size:
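The p25 kernels all bind lane = Int(lane_id()) once at the top instead of sprinkling conversions through the shuffle logic, since lane_id() is unsigned too. A compressed sketch of the neighbor-difference shape (hypothetical kernel; the import paths are assumed to match those already used by these puzzle files):

    from gpu import WARP_SIZE, block_dim, block_idx, lane_id, thread_idx
    from gpu.warp import shuffle_down
    from memory import UnsafePointer

    fn neighbor_diff_sketch(
        output: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        input: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        size: Int,
    ):
        global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
        lane = Int(lane_id())  # UInt -> Int once, reused below
        if global_i < size:
            current = input[global_i]
            next_val = shuffle_down(current, 1)  # value held by lane + 1
            # The last lane of a warp has no right neighbor: keep its value.
            if lane < WARP_SIZE - 1 and global_i < size - 1:
                output[global_i] = next_val - current
            else:
                output[global_i] = current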
diff --git a/solutions/p26/p26.mojo b/solutions/p26/p26.mojo
index 3967a90f..32755c9d 100644
--- a/solutions/p26/p26.mojo
+++ b/solutions/p26/p26.mojo
@@ -26,7 +26,7 @@ fn butterfly_pair_swap[
     Uses shuffle_xor(val, 1) to swap values within each pair.
     This is the foundation of butterfly network communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         current_val = input[global_i]
@@ -56,7 +56,7 @@ fn butterfly_parallel_max[
     Each step reduces the active range by half until all threads have the maximum value.
     This implements an efficient O(log n) parallel reduction algorithm.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         max_val = input[global_i]
@@ -93,7 +93,7 @@ fn butterfly_conditional_max[
     in even-numbered lanes. Odd-numbered lanes store the minimum value seen.
     Demonstrates conditional logic combined with butterfly communication patterns.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     lane = lane_id()
 
     if global_i < size:
@@ -147,7 +147,7 @@ fn warp_inclusive_prefix_sum[
     NOTE: This implementation only works correctly within a single warp (WARP_SIZE threads).
     For multi-warp scenarios, additional coordination would be needed.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         current_val = input[global_i]
@@ -188,7 +188,7 @@ fn warp_partition[
     Input:  [3, 7, 1, 8, 2, 9, 4, 6]
     Result: [3, 1, 2, 4, 7, 8, 9, 6]  (< pivot | >= pivot).
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
 
     if global_i < size:
         current_val = input[global_i]
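In the butterfly hunks only the global index needs converting; the XOR shuffle itself takes small literal offsets. A minimal sketch of the pair-swap primitive the first p26 docstring describes (hypothetical kernel name; shuffle_xor assumed imported from the warp module as in these files):

    from gpu import block_dim, block_idx, thread_idx
    from gpu.warp import shuffle_xor
    from memory import UnsafePointer

    fn pair_swap_sketch(
        output: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        input: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        size: Int,
    ):
        global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
        if global_i < size:
            current_val = input[global_i]
            # XOR with 1 pairs lanes (0,1), (2,3), ...: each lane receives
            # its partner's value, swapping values within every pair.
            output[global_i] = shuffle_xor(current_val, 1)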
diff --git a/solutions/p27/p27.mojo b/solutions/p27/p27.mojo
index 797c9d9e..eb141f83 100644
--- a/solutions/p27/p27.mojo
+++ b/solutions/p27/p27.mojo
@@ -29,7 +29,7 @@ fn block_sum_dot_product[
     """Dot product using block.sum() - convenience function like warp.sum()!
     Replaces manual shared memory + barriers + tree reduction with one line."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x
 
     # Each thread computes partial product
@@ -70,8 +70,8 @@ fn traditional_dot_product[
         MutAnyOrigin,
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Each thread computes partial product
     if global_i < size:
@@ -118,8 +118,8 @@ fn block_histogram_bin_extract[
     3. Extract and pack only elements belonging to target_bin
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Step 1: Each thread determines its bin and element value
     var my_value: Scalar[dtype] = 0.0
@@ -180,7 +180,7 @@ fn block_normalize_vector[
     4. Each thread normalizes: output[i] = input[i] / mean
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x
 
     # Step 1: Each thread loads its element
diff --git a/solutions/p28/p28.mojo b/solutions/p28/p28.mojo
index f3d80d5a..f5018f26 100644
--- a/solutions/p28/p28.mojo
+++ b/solutions/p28/p28.mojo
@@ -48,11 +48,11 @@ fn async_copy_overlap_convolution[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
 
-    local_i = thread_idx.x
+    local_i = Int(thread_idx.x)
 
     # Phase 1: Launch async copy for input tile
     # Note: tile() does NOT perform bounds checking - ensure valid tile bounds
-    input_tile = input.tile[CONV_TILE_SIZE](block_idx.x)
+    input_tile = input.tile[CONV_TILE_SIZE](Int(block_idx.x))
 
     # Use async copy with thread layout matching p14 pattern
     alias load_layout = Layout.row_major(THREADS_PER_BLOCK_ASYNC)
@@ -67,7 +67,7 @@ fn async_copy_overlap_convolution[
     barrier()  # Sync all threads
 
     # Phase 4: Compute convolution
-    global_i = block_idx.x * CONV_TILE_SIZE + local_i
+    global_i = Int(block_idx.x) * CONV_TILE_SIZE + local_i
 
     if local_i < CONV_TILE_SIZE and global_i < output.shape[0]():
         var result: output.element_type = 0
diff --git a/solutions/p29/p29.mojo b/solutions/p29/p29.mojo
index 0233fa19..75ce6894 100644
--- a/solutions/p29/p29.mojo
+++ b/solutions/p29/p29.mojo
@@ -53,8 +53,8 @@ fn multi_stage_image_blur_pipeline[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
 
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Stage 1: Load and preprocess (threads 0-127)
     if local_i < STAGE1_THREADS:
@@ -180,8 +180,8 @@ fn double_buffered_stencil_computation[
         address_space = AddressSpace.SHARED,
     ].stack_allocation()
 
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     # Initialize barriers (only thread 0)
     if local_i == 0:
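The p28 hunk shows the other placement of the conversion: CONV_TILE_SIZE is an Int alias, so only the unsigned block index gets wrapped before the multiply, and the rest of the arithmetic stays signed. A small sketch of that mixed form (hypothetical names; TILE stands in for an Int alias like CONV_TILE_SIZE):

    from gpu import block_idx, thread_idx
    from memory import UnsafePointer

    alias TILE = 256

    fn tile_index_sketch(
        output: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
        size: Int,
    ):
        local_i = Int(thread_idx.x)
        # Convert just the unsigned factor, then mix freely with Int values.
        global_i = Int(block_idx.x) * TILE + local_i
        if local_i < TILE and global_i < size:
            output[global_i] = 1.0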
diff --git a/solutions/p33/p33.mojo b/solutions/p33/p33.mojo
index fba87415..b442f97f 100644
--- a/solutions/p33/p33.mojo
+++ b/solutions/p33/p33.mojo
@@ -35,11 +35,13 @@ fn matmul_idiomatic_tiled[
     local_row = thread_idx.y
     local_col = thread_idx.x
 
-    tiled_row = block_idx.y * tile_size_y + local_row
-    tiled_col = block_idx.x * tile_size_x + local_col
+    tiled_row = Int(block_idx.y * tile_size_y + local_row)
+    tiled_col = Int(block_idx.x * tile_size_x + local_col)
 
     # Get the tile of the output matrix that this thread block is responsible for
-    out_tile = output.tile[TILE_SIZE, TILE_SIZE](block_idx.y, block_idx.x)
+    out_tile = output.tile[TILE_SIZE, TILE_SIZE](
+        Int(block_idx.y), Int(block_idx.x)
+    )
     a_shared = LayoutTensor[
         dtype,
         Layout.row_major(TILE_SIZE, TILE_SIZE),
@@ -62,8 +64,8 @@ fn matmul_idiomatic_tiled[
 
     for idx in range(size // TILE_SIZE):  # Iterate over K tiles
         # Get tiles from A and B matrices
-        a_tile = a.tile[TILE_SIZE, TILE_SIZE](block_idx.y, idx)
-        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, block_idx.x)
+        a_tile = a.tile[TILE_SIZE, TILE_SIZE](Int(block_idx.y), idx)
+        b_tile = b.tile[TILE_SIZE, TILE_SIZE](idx, Int(block_idx.x))
 
         # Asynchronously copy tiles to shared memory with consistent orientation
         copy_dram_to_sram_async[
@@ -141,7 +143,7 @@ fn tensor_core_matrix_multiplication[
     alias N = C.shape[1]()
     alias K = A.shape[1]()
 
-    warp_id = thread_idx.x // WARP_SIZE
+    warp_id = Int(thread_idx.x) // WARP_SIZE
     warps_in_n = BN // WN
     warps_in_m = BM // WM
 
     warp_y = warp_id // warps_in_n
@@ -149,7 +151,7 @@ fn tensor_core_matrix_multiplication[
 
     warp_is_active = warp_y < warps_in_m
 
-    C_block_tile = C.tile[BM, BN](block_idx.y, block_idx.x)
+    C_block_tile = C.tile[BM, BN](Int(block_idx.y), Int(block_idx.x))
     C_warp_tile = C_block_tile.tile[WM, WN](warp_y, warp_x)
 
     mma_op = TensorCore[A.dtype, C.dtype, Index(MMA_M, MMA_N, MMA_K)]()
@@ -192,8 +194,8 @@ fn tensor_core_matrix_multiplication[
     for k_i in range(K // BK):
         barrier()
 
-        A_dram_tile = A.tile[BM, BK](block_idx.y, k_i)
-        B_dram_tile = B.tile[BK, BN](k_i, block_idx.x)
+        A_dram_tile = A.tile[BM, BK](Int(block_idx.y), k_i)
+        B_dram_tile = B.tile[BK, BN](k_i, Int(block_idx.x))
 
         copy_dram_to_sram_async[
             thread_layout = Layout.row_major(4, 8),
diff --git a/solutions/p34/p34.mojo b/solutions/p34/p34.mojo
index a87cd73b..1e852488 100644
--- a/solutions/p34/p34.mojo
+++ b/solutions/p34/p34.mojo
@@ -29,7 +29,7 @@ fn cluster_coordination_basics[
     size: Int,
 ):
     """Real cluster coordination using SM90+ cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
     local_i = thread_idx.x
 
     # Check what's happening with cluster ranks
@@ -87,8 +87,8 @@ fn cluster_collective_operations[
     size: Int,
 ):
     """Cluster-wide collective operations using real cluster APIs."""
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     my_block_rank = Int(block_rank_in_cluster())
     block_id = Int(block_idx.x)
@@ -143,8 +143,8 @@ fn advanced_cluster_patterns[
 ):
     """Advanced cluster programming using cluster masks and relaxed synchronization.
     """
-    global_i = block_dim.x * block_idx.x + thread_idx.x
-    local_i = thread_idx.x
+    global_i = Int(block_dim.x * block_idx.x + thread_idx.x)
+    local_i = Int(thread_idx.x)
 
     my_block_rank = Int(block_rank_in_cluster())
     block_id = Int(block_idx.x)
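The cluster kernels close the patch with the same idiom applied to API return values: block_rank_in_cluster() yields an unsigned rank, so it is converted once into an Int local and every later comparison stays signed. A final sketch (hypothetical kernel; assumes the cluster import p34 already uses):

    from gpu import WARP_SIZE, block_idx, thread_idx
    from gpu.cluster import block_rank_in_cluster

    fn cluster_rank_sketch():
        # Convert unsigned intrinsics once, at the top of the kernel.
        my_block_rank = Int(block_rank_in_cluster())
        warp_id = Int(thread_idx.x) // WARP_SIZE
        block_id = Int(block_idx.x)
        if my_block_rank == 0 and block_id == 0 and warp_id == 0:
            # Leader block / leader warp work would go here.
            pass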