From 1e6e9d30ea1498dbcecf211b1cbcaebc40205e18 Mon Sep 17 00:00:00 2001 From: StijnWoestenborghs Date: Tue, 14 May 2024 14:34:53 +0200 Subject: [PATCH 01/16] itertools.product & unvectorized fw index --- basalt/autograd/ops/mlops.mojo | 78 +++++++++++++++++++++++++++++++++- basalt/autograd/ops/ops.mojo | 9 +++- basalt/utils/itertools.mojo | 47 ++++++++++++++++++++ tests/mojo/test_mlops.mojo | 76 +++++++++++++++++++++++---------- 4 files changed, 185 insertions(+), 25 deletions(-) create mode 100644 basalt/utils/itertools.mojo diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index 0869919..0f9bb1f 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -4,6 +4,7 @@ from math.limit import min_finite, max_finite from basalt import Tensor, TensorShape from basalt.utils.tensorutils import elwise_transform +from basalt.utils.itertools import product from basalt.autograd.attributes import Attribute, AttributeVector @@ -491,4 +492,79 @@ struct SLICE: Self.slice_kernel[ug_shape, t1_shape, steps, starts, ends, True](res_grad, ug) - return res_grad ^ \ No newline at end of file + return res_grad ^ + + +struct INDEX: + @staticmethod + fn adjust_boundary(slice: Int, dim_size: Int) -> Int: + # Adjust negative indices & ensure they are within bounds. + var s = slice if slice >= 0 else dim_size + slice + return max(min(s, dim_size), 0) + + @staticmethod + fn to_indeces(shape: TensorShape, attrs: AttributeVector) -> List[List[Int]]: + var SLICE_LITERALS = List[StringLiteral]("dim_0s", "dim_1s", "dim_2s", "dim_3s", "dim_4s", "dim_5s", "dim_6s", "dim_7s") + var INDEX_LITERALS = List[StringLiteral]("dim_0i", "dim_1i", "dim_2i", "dim_3i", "dim_4i", "dim_5i", "dim_6i", "dim_7i") + + var indeces = List[List[Int]]() + for dim in range(shape.rank()): + var temp = List[Int]() + + # Option 1: Slice + if attrs[SLICE_LITERALS[dim]]: + var slice = attrs[SLICE_LITERALS[dim]].value().to_shape() + var step = slice[2] if slice.rank() == 3 else 1 + for i in range( + start=Self.adjust_boundary(slice[0], shape[dim]), + end=Self.adjust_boundary(slice[1], shape[dim]), + step=step + ): + temp.append(i) + + # Option 2: Indeces + elif attrs[INDEX_LITERALS[dim]]: + var indeces = attrs[INDEX_LITERALS[dim]].value().to_shape() + for i in range(indeces.rank()): + temp.append(indeces[i]) + + # All indeces + else: + for i in range(shape[dim]): + temp.append(i) + + indeces.append(temp) + + return indeces ^ + + @staticmethod + fn result_shape(shape: TensorShape, attrs: AttributeVector) -> TensorShape: + var indeces = Self.to_indeces(shape, attrs) + var new_shape = List[Int]() + for i in range(shape.rank()): + new_shape.append(len(indeces[i])) + return TensorShape(new_shape) + + @staticmethod + fn forward[ + t1_shape: TensorShape, + attributes: AttributeVector, + ](inout res: Tensor[dtype], t1: Tensor[dtype]): + alias indeces = Self.to_indeces(t1_shape, attributes) + alias strides = t1_shape.strides() + + var j = 0 + for comb in product(indeces): + var flat_index = 0 + for dim in range(t1_shape.rank()): + flat_index += comb[dim] * strides[dim] + res[j] = t1[flat_index] + j += 1 + + @staticmethod + fn backward[ + ug_shape: TensorShape, + t1_shape: TensorShape, + attributes: AttributeVector = AttributeVector(), + ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: + return Tensor[dtype]() \ No newline at end of file diff --git a/basalt/autograd/ops/ops.mojo b/basalt/autograd/ops/ops.mojo index 7198270..c737821 100644 --- a/basalt/autograd/ops/ops.mojo +++ 
b/basalt/autograd/ops/ops.mojo
@@ -15,7 +15,7 @@ from .basics import (
     TRANSPOSE,
     FMA,
 )
-from .mlops import SIGMOID, RELU, TANH, CLIP, SQUEEZE, UNSQUEEZE, SLICE
+from .mlops import SIGMOID, RELU, TANH, CLIP, SQUEEZE, UNSQUEEZE, SLICE, INDEX
 from .dynamics import CONCAT, SPLIT
 from .conv import CONV2D
 from .pool import MAXPOOL2D
@@ -61,6 +61,7 @@ struct OP(Stringable):
     alias CONCAT = OP(23, "CONCAT", dynamic=True)
     alias SPLIT = OP(24, "SPLIT", dynamic=True)
     alias SLICE = OP(25, "SLICE")
+    alias INDEX = OP(26, "INDEX")
 
     var id: UInt8
     var name: Bytes[16]
@@ -135,6 +136,8 @@ fn static_result_shape(
         return UNSQUEEZE.result_shape(t1_shape, attributes)
     elif op == OP.SLICE:
         return SLICE.result_shape(t1_shape, attributes)
+    elif op == OP.INDEX:
+        return INDEX.result_shape(t1_shape, attributes)
     else:
         print("[ERROR] Operator not found.")
         return TensorShape(-1)
@@ -249,6 +252,8 @@ fn forward_op[
         UNSQUEEZE.forward[t1_shape, attributes](res, t1)
     elif op == OP.SLICE:
         SLICE.forward[t1_shape, attributes](res, t1)
+    elif op == OP.INDEX:
+        INDEX.forward[t1_shape, attributes](res, t1)
     else:
         print("[ERROR] Operator not found.")
@@ -361,6 +366,8 @@ fn backward_op[
         res_grad = UNSQUEEZE.backward[ug_shape, t1_shape](ug, t1)
     elif op == OP.SLICE:
         res_grad = SLICE.backward[ug_shape, t1_shape, attributes](ug, t1)
+    elif op == OP.INDEX:
+        res_grad = INDEX.backward[ug_shape, t1_shape, attributes](ug, t1)
     else:
         print("[ERROR] Operator not found.")
         res_grad = Tensor[dtype](-1)
diff --git a/basalt/utils/itertools.mojo b/basalt/utils/itertools.mojo
new file mode 100644
index 0000000..aceda31
--- /dev/null
+++ b/basalt/utils/itertools.mojo
@@ -0,0 +1,47 @@
+
+@value
+struct _ProductIterator(Sized):
+    var lists: List[List[Int]]
+    var indeces: List[Int]
+    var _iters: Int
+
+    @always_inline("nodebug")
+    fn __init__(inout self, lists: List[List[Int]]):
+        self.lists = lists
+        self.indeces = List[Int]()
+        for i in range(len(lists)):
+            self.indeces.append(0)
+
+        self._iters = 1
+        for lst in self.lists:
+            self._iters *= len(lst[])
+
+    @always_inline("nodebug")
+    fn __len__(self) -> Int:
+        return self._iters
+
+    @always_inline("nodebug")
+    fn __iter__(self) -> Self:
+        return self
+
+    @always_inline("nodebug")
+    fn __next__(inout self) -> List[Int]:
+        var res = List[Int]()
+        for i in range(len(self.lists)):
+            res.append(self.lists[i][self.indeces[i]])
+        self._increment_indeces()
+        self._iters -= 1
+        return res ^
+
+    @always_inline("nodebug")
+    fn _increment_indeces(inout self):
+        for i in reversed(range(len(self.indeces))):
+            self.indeces[i] += 1
+            if self.indeces[i] < len(self.lists[i]):
+                break
+            self.indeces[i] = 0
+
+
+@always_inline("nodebug")
+fn product(lists: List[List[Int]]) -> _ProductIterator:
+    return _ProductIterator(lists)
\ No newline at end of file
diff --git a/tests/mojo/test_mlops.mojo b/tests/mojo/test_mlops.mojo
index 2ba723e..4d87bb1 100644
--- a/tests/mojo/test_mlops.mojo
+++ b/tests/mojo/test_mlops.mojo
@@ -620,33 +620,63 @@ fn test_backward_SLICE_multiple_axes() raises:
     ](t1, ug, expected_ug)
 
 
+from basalt.autograd.ops.mlops import INDEX
+
+fn test_INDEX() raises:
+    alias t1_shape = TensorShape(2, 3, 5)
+    var t = Tensor[dtype](t1_shape)
+    for i in range(t.num_elements()):
+        t[i] = i
+
+    # t[:, [0, 0], 0:5:2]
+    # TODO: needs a list attribute; this can only specify up to MAX_RANK indices
+    alias attr_1 = Attribute("dim_1i", TensorShape(0, 0))
+    alias attr_2 = Attribute("dim_2s", TensorShape(0, 5, 2))
+
+    var expected = Tensor[dtype](2, 2, 3)
+    for i in range(2):
+        for j in range(2):
+            for k in range(3):
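+                # Each element of t equals its flat index, so for t[:, [0, 0], 0:5:2]
+                # the value picked at output position (i, j, k) lives at flat index
+                # i*3*5 + 0*5 + k*2; j never appears because both selected indices
+                # along dim 1 are 0.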
+ expected[i*2*3 + j*3 + k] = i * 3 * 5 + k * 2 + + test_unary_op[ + OP.INDEX, t1_shape, AttributeVector( + attr_1, + attr_2, + ) + ](t, expected) + + print(expected) + + fn main(): try: - test_SIGMOID() - test_RELU() - test_TANH() - test_CLIP() - test_SQUEEZE() - test_UNSQUEEZE() - test_SLICE() - test_SLICE_step() - test_SLICE_neg() - test_SLICE_multiple_axes() + # test_SIGMOID() + # test_RELU() + # test_TANH() + # test_CLIP() + # test_SQUEEZE() + # test_UNSQUEEZE() + # test_SLICE() + # test_SLICE_step() + # test_SLICE_neg() + # test_SLICE_multiple_axes() + test_INDEX() except e: print("[ERROR] Error in forward mlops") print(e) return - try: - test_backward_SIGMOID() - test_backward_RELU() - test_backward_TANH() - test_backward_CLIP() - test_backward_SQUEEZE() - test_backward_UNSQUEEZE() - test_backward_SLICE() - test_backward_SLICE_multiple_axes() - except e: - print("[ERROR] Error in backward mlops") - print(e) - return + # try: + # test_backward_SIGMOID() + # test_backward_RELU() + # test_backward_TANH() + # test_backward_CLIP() + # test_backward_SQUEEZE() + # test_backward_UNSQUEEZE() + # test_backward_SLICE() + # test_backward_SLICE_multiple_axes() + # except e: + # print("[ERROR] Error in backward mlops") + # print(e) + # return From 60e510844596ede4fe51a22ed1c54fe27fc50292 Mon Sep 17 00:00:00 2001 From: StijnWoestenborghs Date: Tue, 14 May 2024 14:56:27 +0200 Subject: [PATCH 02/16] unoptimized index bw --- basalt/autograd/ops/mlops.mojo | 15 ++++++++- tests/mojo/test_mlops.mojo | 57 ++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index 0f9bb1f..5aa2d8b 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -567,4 +567,17 @@ struct INDEX: t1_shape: TensorShape, attributes: AttributeVector = AttributeVector(), ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: - return Tensor[dtype]() \ No newline at end of file + alias indeces = Self.to_indeces(t1_shape, attributes) + alias strides = t1_shape.strides() + + var res_grad = Tensor[dtype](t1_shape) + + var j = 0 + for comb in product(indeces): + var flat_index = 0 + for dim in range(t1_shape.rank()): + flat_index += comb[dim] * strides[dim] + res_grad[flat_index] += ug[j] + j += 1 + + return res_grad^ \ No newline at end of file diff --git a/tests/mojo/test_mlops.mojo b/tests/mojo/test_mlops.mojo index 4d87bb1..964e134 100644 --- a/tests/mojo/test_mlops.mojo +++ b/tests/mojo/test_mlops.mojo @@ -649,6 +649,36 @@ fn test_INDEX() raises: print(expected) +fn test_INDEX_backward() raises: + alias t1_shape = TensorShape(2, 3, 5) + var t = Tensor[dtype](t1_shape) + for i in range(t.num_elements()): + t[i] = i + + alias attr_1 = Attribute("dim_1i", TensorShape(0, 0)) + alias attr_2 = Attribute("dim_2s", TensorShape(0, 5, 2)) + + alias ug_shape = TensorShape(2, 2, 3) + var ug = Tensor[dtype](ug_shape) + fill(ug, 1.0) + + var expected = Tensor[dtype](t1_shape) + for i in range(2): + for j in range(2): + for k in range(3): + # NOTE: `+=` because selected indeces [0, 0] can repeat + expected[i * 3 * 5 + k * 2] += 1.0 + + test_unary_op_backward[ + OP.INDEX, t1_shape, ug_shape, AttributeVector( + attr_1, + attr_2, + ) + ](t, ug, expected) + + print(expected) + + fn main(): try: # test_SIGMOID() @@ -667,16 +697,17 @@ fn main(): print(e) return - # try: - # test_backward_SIGMOID() - # test_backward_RELU() - # test_backward_TANH() - # test_backward_CLIP() - # test_backward_SQUEEZE() - # 
test_backward_UNSQUEEZE() - # test_backward_SLICE() - # test_backward_SLICE_multiple_axes() - # except e: - # print("[ERROR] Error in backward mlops") - # print(e) - # return + try: + # test_backward_SIGMOID() + # test_backward_RELU() + # test_backward_TANH() + # test_backward_CLIP() + # test_backward_SQUEEZE() + # test_backward_UNSQUEEZE() + # test_backward_SLICE() + # test_backward_SLICE_multiple_axes() + test_INDEX_backward() + except e: + print("[ERROR] Error in backward mlops") + print(e) + return From 8d90c09d852356ac5f30c8328d5549d785e9c827 Mon Sep 17 00:00:00 2001 From: StijnWoestenborghs Date: Tue, 14 May 2024 18:10:38 +0200 Subject: [PATCH 03/16] getindex to product & vectorized fw --- basalt/autograd/ops/mlops.mojo | 55 +++++++++++++++++++++++++++++----- basalt/utils/itertools.mojo | 34 +++++++++++---------- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index 5aa2d8b..fd871fd 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -545,6 +545,26 @@ struct INDEX: new_shape.append(len(indeces[i])) return TensorShape(new_shape) + @staticmethod + fn map_indeces[ + nelts: Int, + strides: TensorShape, + indeces: List[List[Int]], + ](idx: Int) -> SIMD[DType.int64, nelts]: + alias indeces_product = product(indeces) + + var temp = SIMD[DType.int64, nelts]() + for i in range(idx, idx + nelts): + var comb = indeces_product[i] + var flat_index = 0 + + for dim in range(len(comb)): + flat_index += comb[dim] * strides[dim] + + temp[i % nelts] = flat_index + + return temp + @staticmethod fn forward[ t1_shape: TensorShape, @@ -552,14 +572,17 @@ struct INDEX: ](inout res: Tensor[dtype], t1: Tensor[dtype]): alias indeces = Self.to_indeces(t1_shape, attributes) alias strides = t1_shape.strides() + alias total_length = len(product(indeces)) + + @parameter + fn vec_index[nelts: Int](i: Int): + + res.store[nelts](i, + t1.data().gather(Self.map_indeces[nelts, strides, indeces](i)) + ) + + vectorize[vec_index, nelts](total_length) - var j = 0 - for comb in product(indeces): - var flat_index = 0 - for dim in range(t1_shape.rank()): - flat_index += comb[dim] * strides[dim] - res[j] = t1[flat_index] - j += 1 @staticmethod fn backward[ @@ -569,9 +592,25 @@ struct INDEX: ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: alias indeces = Self.to_indeces(t1_shape, attributes) alias strides = t1_shape.strides() + # alias total_length = len(product(indeces)) var res_grad = Tensor[dtype](t1_shape) + # @parameter + # fn vec_index[nelts: Int](i: Int): + + # var offset = Self.map_indeces[nelts, strides, indeces](i) + # res_grad.data().scatter( + # offset, + # res_grad.data().gather(offset) + ug.load[nelts](i), + # ) + + # vectorize[vec_index, nelts](total_length) + + # BUG: Edge case in vectorization: + # When the offset = [0, 2, 4, 0] and ug = [1, 1, 1, 1] + # It doesn't scatter to index 0 twice as it should be: res_grad[0] += 1 + 1 + var j = 0 for comb in product(indeces): var flat_index = 0 @@ -579,5 +618,5 @@ struct INDEX: flat_index += comb[dim] * strides[dim] res_grad[flat_index] += ug[j] j += 1 - + return res_grad^ \ No newline at end of file diff --git a/basalt/utils/itertools.mojo b/basalt/utils/itertools.mojo index aceda31..fd7a6ce 100644 --- a/basalt/utils/itertools.mojo +++ b/basalt/utils/itertools.mojo @@ -2,16 +2,14 @@ @value struct _ProductIterator(Sized): var lists: List[List[Int]] - var indeces: List[Int] + var _current: Int var _iters: Int @always_inline("nodebug") fn __init__(inout 
self, lists: List[List[Int]]): self.lists = lists - self.indeces = List[Int]() - for i in range(len(lists)): - self.indeces.append(0) - + self._current = 0 + self._iters = 1 for lst in self.lists: self._iters *= len(lst[]) @@ -26,20 +24,24 @@ struct _ProductIterator(Sized): @always_inline("nodebug") fn __next__(inout self) -> List[Int]: - var res = List[Int]() - for i in range(len(self.lists)): - res.append(self.lists[i][self.indeces[i]]) - self._increment_indeces() + self._current += 1 self._iters -= 1 - return res ^ + return self._get_combination(self._current - 1) + + @always_inline("nodebug") + fn _get_combination(self, current: Int) -> List[Int]: + var combination = List[Int]() + var count = current + for i in reversed(range(len(self.lists))): + var index = count % len(self.lists[i]) + combination.append(self.lists[i][index]) + count //= len(self.lists[i]) + combination._reverse() + return combination ^ @always_inline("nodebug") - fn _increment_indeces(inout self): - for i in reversed(range(len(self.indeces))): - self.indeces[i] += 1 - if self.indeces[i] < len(self.lists[i]): - break - self.indeces[i] = 0 + fn __getitem__(self, index: Int) -> List[Int]: + return self._get_combination(index) @always_inline("nodebug") From 113b5aeabbf5479638addbf3b9afe24c189c73aa Mon Sep 17 00:00:00 2001 From: StijnWoestenborghs Date: Tue, 14 May 2024 19:19:02 +0200 Subject: [PATCH 04/16] something inbetween --- basalt/autograd/ops/mlops.mojo | 39 ++++++++++++++++------------------ 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index fd871fd..6e38aaa 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -592,31 +592,28 @@ struct INDEX: ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: alias indeces = Self.to_indeces(t1_shape, attributes) alias strides = t1_shape.strides() - # alias total_length = len(product(indeces)) + alias total_length = len(product(indeces)) var res_grad = Tensor[dtype](t1_shape) - # @parameter - # fn vec_index[nelts: Int](i: Int): + @parameter + fn vec_index[nelts: Int](i: Int): - # var offset = Self.map_indeces[nelts, strides, indeces](i) - # res_grad.data().scatter( - # offset, - # res_grad.data().gather(offset) + ug.load[nelts](i), - # ) - - # vectorize[vec_index, nelts](total_length) - - # BUG: Edge case in vectorization: - # When the offset = [0, 2, 4, 0] and ug = [1, 1, 1, 1] - # It doesn't scatter to index 0 twice as it should be: res_grad[0] += 1 + 1 + var offset = Self.map_indeces[nelts, strides, indeces](i) + + # res_grad.data().scatter( + # offset, + # res_grad.data().gather(offset) + ug.load[nelts](i), + # ) + # BUG: Edge case in vectorization: + # When the offset = [0, 2, 4, 0] and ug = [1, 1, 1, 1] + # It doesn't scatter to index 0 twice as it should be: res_grad[0] += 1 + 1 + + # Workaround + var u = ug.load[nelts](i) + for j in range(nelts): + res_grad[int(offset[j])] += u[j] - var j = 0 - for comb in product(indeces): - var flat_index = 0 - for dim in range(t1_shape.rank()): - flat_index += comb[dim] * strides[dim] - res_grad[flat_index] += ug[j] - j += 1 + vectorize[vec_index, nelts](total_length) return res_grad^ \ No newline at end of file From 0c56fa4673263929ae86fee75d7a99e29d9301d6 Mon Sep 17 00:00:00 2001 From: StijnWoestenborghs Date: Thu, 16 May 2024 00:57:56 +0200 Subject: [PATCH 05/16] upsampling nearest --- basalt/autograd/attributes.mojo | 5 + basalt/nn/__init__.mojo | 1 + basalt/nn/layers/upsample.mojo | 117 +++++++++++++++++++++++ 
tests/python/test_upsample.mojo | 159 ++++++++++++++++++++++++++++++++ 4 files changed, 282 insertions(+) create mode 100644 basalt/nn/layers/upsample.mojo create mode 100644 tests/python/test_upsample.mojo diff --git a/basalt/autograd/attributes.mojo b/basalt/autograd/attributes.mojo index 9be1822..2e87300 100644 --- a/basalt/autograd/attributes.mojo +++ b/basalt/autograd/attributes.mojo @@ -67,6 +67,11 @@ struct AttributeVector(Sized, Stringable, CollectionElement): return self.attributes[i] return None + @always_inline("nodebug") + fn append(inout self, attribute: Attribute): + self.attributes[self.size] = attribute + self.size += 1 + @always_inline("nodebug") fn __str__(self) -> String: var s: String = "[" diff --git a/basalt/nn/__init__.mojo b/basalt/nn/__init__.mojo index 99b30a3..9c994a4 100644 --- a/basalt/nn/__init__.mojo +++ b/basalt/nn/__init__.mojo @@ -4,6 +4,7 @@ from .model import Model from .layers.linear import Linear from .layers.conv import Conv2d from .layers.pool import MaxPool2d +from .layers.upsample import Upsample from .loss import MSELoss, CrossEntropyLoss from .activations import Softmax, LogSoftmax, ReLU, Sigmoid, Tanh diff --git a/basalt/nn/layers/upsample.mojo b/basalt/nn/layers/upsample.mojo new file mode 100644 index 0000000..c70de00 --- /dev/null +++ b/basalt/nn/layers/upsample.mojo @@ -0,0 +1,117 @@ +from basalt import dtype +from basalt import Graph, Symbol, OP +from basalt import Tensor, TensorShape +from basalt.autograd.attributes import AttributeVector, Attribute +from basalt.utils.itertools import product + + +fn _scale_indeces(N: Int, scale: Scalar[dtype], align_corners: Bool, dim: Int, ndims: Int) -> List[Scalar[dtype]]: + var M = int(scale * N) + var indeces = List[Scalar[dtype]]() + if align_corners: + for i in range(M): + indeces.append(i * ((N - 1) / (M - 1))) + else: + var step = 1 / scale + var start = ((M - 1) * step - N + 1) / 2 + for i in range(M): + indeces.append(i * step - start) + + return indeces ^ + + +fn nearest_coeffs(N: Int, scale: Scalar[dtype], dim: Int, ndims: Int) -> List[Int]: + + @parameter + fn round_to_index(number: Scalar[dtype]) -> Int: + return int(number + 0.5) if number > 0 else int(number - 0.5) + + var indeces = List[Int]() + var scaled = _scale_indeces(N, scale, True, dim, ndims) + for i in range(len(scaled)): + indeces.append(round_to_index(scaled[i])) + return indeces ^ + + +fn linear_coeffs(N: Int, scale: Scalar[dtype], align_corners: Bool, dim: Int, ndims: Int) -> Tuple[List[Int], List[Int]]: + # TODO + return (List[Int](), List[Int]()) + + +fn cubic_coeffs(N: Int, scale: Scalar[dtype], align_corners: Bool, dim: Int, ndims: Int) -> Tuple[List[Int], List[Int]]: + # TODO + return (List[Int](), List[Int]()) + + +fn interpolate_nd[ + indices_fn: fn (Int, Scalar[dtype], Bool, Int, Int) -> Tuple[List[Int], List[Int]], +](inout g: Graph, input: Symbol, scale_factors: List[Scalar[dtype]], align_corners: Bool) -> Symbol: + + var spatial_dims = input.shape.rank() - 2 + + var indeces_weights = List[Tuple[List[Int], List[Int]]]() + for i in range(spatial_dims): + indeces_weights.append( + indices_fn( + input.shape[i + 2], + scale_factors[i], + align_corners, + i, + spatial_dims, + ) + ) + + # TODO: interpolation logic + # for idx_weight in product(indeces_weights): + # ... 
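+    # A possible shape for the missing logic (a hedged sketch, not the implemented
+    # behaviour): every output element is a weighted sum over the neighbouring
+    # input corners, one (index, weight) pair per spatial dimension, i.e. roughly
+    #
+    #   out[o_0, ..., o_n] = sum over corners of (w_0 * ... * w_n) * input[i_0, ..., i_n]
+    #
+    # so iterating the cartesian product of the per-dimension index lists and
+    # multiplying the matching weights into a single coefficient would realize
+    # the linear/cubic cases.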
+ + return Symbol(-1, dtype, TensorShape(), False) + + +fn Upsample( + inout g: Graph, + input: Symbol, + mode: StringLiteral, + scale_factors: List[Scalar[dtype]], + align_corners: Bool = False, +) -> Symbol: + + # Assumption: A scale needs to be provided for each spatial dimension. + # input shape (B, C, *N) with batch and channel considered as non-spatial dimensions. + # input.shape.rank() - 2 == len(scale_factor) + var spatial_dims = input.shape.rank() - 2 + + var res: Symbol + var attributes = AttributeVector() + var INDEX_LITERALS = List[StringLiteral]("dim_2i", "dim_3i", "dim_4i") + + if mode == "nearest": + # Nearest neighbor interpolation --> input[:, :, *indeces] + for i in range(spatial_dims): + attributes.append( + Attribute( + INDEX_LITERALS[i], + nearest_coeffs(input.shape[i + 2], scale_factors[i], i, spatial_dims) + ) + ) + + res = g.op(OP.INDEX, input, attributes=attributes) + + # elif mode == "linear": + # res = interpolate_nd[linear_coeffs](g, + # input, + # scale_factor, + # align_corners + # ) + + # elif mode == "cubic": + # res = interpolate_nd[cubic_coeffs](g, + # input, + # scale_factor, + # align_corners + # ) + else: + res = input + + return res + diff --git a/tests/python/test_upsample.mojo b/tests/python/test_upsample.mojo new file mode 100644 index 0000000..c5918ff --- /dev/null +++ b/tests/python/test_upsample.mojo @@ -0,0 +1,159 @@ +from python.python import Python, PythonObject + +import basalt.nn as nn +from basalt import dtype, Graph +from basalt import Tensor, TensorShape +from tests import assert_tensors_equal, to_numpy, to_tensor + + +fn test_upsample[ + shape: TensorShape, + mode: StringLiteral, + scale_factors: List[Scalar[dtype]], + align_corners: Bool +]( + t1: Tensor[dtype], + ug: Tensor[dtype], + expected: Tensor[dtype], + expected_grad: Tensor[dtype] +) raises: + + fn create_graph() -> Graph: + var g = Graph() + var t1 = g.input(shape, trainable=True) + var t2 = nn.Upsample(g, t1, mode, scale_factors, align_corners) + g.out(t2) + return g ^ + + alias graph = create_graph() + var model = nn.Model[graph](inference_only=True) + var res = model.inference(t1)[0] + + model.backward(ug) + var res_grad = model.parameters.grads[graph.inputs[0]] + + assert_tensors_equal["almost"](res, expected) + assert_tensors_equal["almost"](res_grad, expected_grad) + + +@value +struct torch_upsample_result: + var expected: Tensor[dtype] + var grad: Tensor[dtype] + + +fn test_upsample_torch[ + shape: TensorShape, + mode: StringLiteral, + scale_factors: List[Scalar[dtype]], + align_corners: Bool +](data: PythonObject, ug: PythonObject) raises -> torch_upsample_result: + + var py = Python.import_module("builtins") + var np = Python.import_module("numpy") + var torch = Python.import_module("torch") + + var py_scales = py.list() + for i in range(len(scale_factors)): + py_scales.append(scale_factors[i]) + + # if mode == "nearest": + # var ups = torch.nn.Upsample(scale_factor=py.tuple(py_scales), mode=mode) + # else: + # var ups = torch.nn.Upsample(scale_factor=py.tuple(py_scales), mode=mode, align_corners=align_corners) + + var ups = torch.nn.Upsample(scale_factor=py.tuple(py_scales), mode=mode) + + var tensor = torch.from_numpy(data).requires_grad_(True) + var expected = ups(tensor) + var upper_grad = torch.from_numpy(ug) + _ = expected.backward(upper_grad) + + return torch_upsample_result( + to_tensor(expected.detach().numpy()), + to_tensor(tensor.grad.numpy()), + ) + + + +fn test_UPSAMPLE_nearest() raises: + var np = Python.import_module("numpy") + + alias shape = 
TensorShape(1, 1, 2, 2) + alias mode: StringLiteral = "nearest" + alias scales = List[Scalar[dtype]](2.0, 3.0) + alias align_corners = False + + var data = np.array([ + 1, 2, + 3, 4 + ], dtype=np.float32).reshape(1, 1, 2, 2) + + var ug = np.ones((1, 1, 4, 6)) + + var torch_out = test_upsample_torch[shape, mode, scales, align_corners](data, ug) + test_upsample[shape, mode, scales, align_corners]( + to_tensor(data), + to_tensor(ug), + torch_out.expected, + torch_out.grad + ) + + _ = data + + +fn test_UPSAMPLE_linear() raises: + var np = Python.import_module("numpy") + + alias shape = TensorShape(1, 1, 2, 2) + alias mode: StringLiteral = "linear" + alias scales = List[Scalar[dtype]](2.0, 2.0) + + var data = np.array([ + 1, 2, + 3, 4 + ], dtype=np.float32).reshape(1, 1, 2, 2) + + # var expected = np.array([ + # 1., 1.25, 1.75, 2. , + # 1.5, 1.75, 2.25, 2.5 , + # 2.5, 2.75, 3.25, 3.5 , + # 3., 3.25, 3.75, 4. , + # ], dtype=np.float32).reshape(1, 1, 4, 4) + + +fn test_UPSAMPLE_cubic() raises: + var np = Python.import_module("numpy") + + alias shape = TensorShape(1, 1, 4, 4) + alias mode: StringLiteral = "cubic" + alias scales = List[Scalar[dtype]](2.0, 2.0) + + var data = np.array([ + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16, + ], dtype=np.float32).reshape(1, 1, 4, 4) + + # var expected = np.array([ + # 0.47265625, 0.76953125, 1.24609375, 1.875, 2.28125, 2.91015625, 3.38671875, 3.68359375, + # 1.66015625, 1.95703125, 2.43359375, 3.0625, 3.46875, 4.09765625, 4.57421875, 4.87109375, + # 3.56640625, 3.86328125, 4.33984375, 4.96875, 5.375, 6.00390625, 6.48046875, 6.77734375, + # 6.08203125, 6.37890625, 6.85546875, 7.484375, 7.890625, 8.51953125, 8.99609375, 9.29296875, + # 7.70703125, 8.00390625, 8.48046875, 9.109375, 9.515625, 10.14453125, 10.62109375, 10.91796875, + # 10.22265625, 10.51953125, 10.99609375, 11.625, 12.03125, 12.66015625, 13.13671875, 13.43359375, + # 12.12890625, 12.42578125, 12.90234375, 13.53125, 13.9375, 14.56640625, 15.04296875, 15.33984375, + # 13.31640625, 13.61328125, 14.08984375, 14.71875, 15.125, 15.75390625, 16.23046875, 16.52734375 + # ], dtype=np.float32).reshape(1, 1, 8, 8) + + +fn main(): + + try: + test_UPSAMPLE_nearest() + # test_UPSAMPLE_linear() + # test_UPSAMPLE_cubic() + except e: + print("[Error] Error in Upsample") + print(e) \ No newline at end of file From 0876ea3ba546ffd26a89c822f029089ddc51434a Mon Sep 17 00:00:00 2001 From: NKspartan Date: Mon, 27 May 2024 19:13:36 -0600 Subject: [PATCH 06/16] Added upsample nearest op and test --- basalt/autograd/ops/mlops.mojo | 84 +++++++++++++++++++++++++++++++++- basalt/autograd/ops/ops.mojo | 7 ++- tests/mojo/test_mlops.mojo | 48 ++++++++++++++++++- 3 files changed, 135 insertions(+), 4 deletions(-) diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index 6e38aaa..995e0c4 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -616,4 +616,86 @@ struct INDEX: vectorize[vec_index, nelts](total_length) - return res_grad^ \ No newline at end of file + return res_grad^ + + +struct UPSAMPLE: + @staticmethod + fn result_shape(t1_shape: TensorShape, attributes: AttributeVector) -> TensorShape: + var scales = attributes["scales"].value().to_shape() + var mode = attributes["mode"].value().to_string() + + var new_shape = List[Int]() + for i in range(0, t1_shape.rank()): + if i < 2: + new_shape.append(t1_shape[i]) + else: + new_shape.append(t1_shape[i] * scales[i - 2]) + + return TensorShape(new_shape) + + @staticmethod + fn recursive_iter[pos_shape: Int, 
shape: TensorShape, scales: TensorShape](inout res: Tensor[dtype], t1: Tensor[dtype], strides_res: StaticIntTuple[8], index_t1: Int, index_res: Int): + alias end_pos = shape.rank() - 1 + alias strides = shape.strides() + + @parameter + if pos_shape >= end_pos: + @parameter + fn v_iter[nelts: Int](i: Int): + var values = t1.load[nelts](index_t1 + i) + + var offset_res = index_res + i * scales[end_pos - 2] + for j in range(nelts * scales[pos_shape - 2]): + var temp = j // scales[pos_shape - 2] + + res[offset_res + j] = values[temp] + + vectorize[v_iter, nelts](shape[pos_shape]) + + return + else: + for i in range(shape[pos_shape] * scales[pos_shape - 2]): + var temp_i = i // scales[pos_shape - 2] + var temp_index_t1 = temp_i * strides[pos_shape] + index_t1 + var temp_index_res = i * strides_res[pos_shape] + index_res + + Self.recursive_iter[pos_shape + 1, shape, scales](res, t1, strides_res, temp_index_t1, temp_index_res) + + @staticmethod + fn forward[ + t1_shape: TensorShape, + attributes: AttributeVector, + ](inout res: Tensor[dtype], t1: Tensor[dtype]): + # Input is [N, C, D in, H in, W in], N is batch size and C is number of channels. Ranks 3-D, 4-D or 5-D tensors. + alias scales = attributes["scales"].value().to_shape() # Has to match input size (the last dimensions D, H and W) or just be one value + alias mode = attributes["mode"].value().to_string() + + alias strides = t1_shape.strides() + alias total_length = t1_shape.num_elements() + + alias first_loop = total_length // strides[1] + + var strides_res = res.strides() + + @parameter + if mode == "nearest": + @parameter + fn p_iter(i: Int): + var offset = i * strides[1] + var offset_res = i * strides_res[1] + + Self.recursive_iter[2, t1_shape, scales]( + res, t1, strides_res, offset, offset_res) + + parallelize[p_iter](first_loop) + else: + pass + + @staticmethod + fn backward[ + ug_shape: TensorShape, + t1_shape: TensorShape, + attributes: AttributeVector = AttributeVector(), + ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: + return t1 \ No newline at end of file diff --git a/basalt/autograd/ops/ops.mojo b/basalt/autograd/ops/ops.mojo index c737821..c47f5ce 100644 --- a/basalt/autograd/ops/ops.mojo +++ b/basalt/autograd/ops/ops.mojo @@ -15,7 +15,7 @@ from .basics import ( TRANSPOSE, FMA, ) -from .mlops import SIGMOID, RELU, TANH, CLIP, SQUEEZE, UNSQUEEZE, SLICE, INDEX +from .mlops import SIGMOID, RELU, TANH, CLIP, SQUEEZE, UNSQUEEZE, SLICE, INDEX, UPSAMPLE from .dynamics import CONCAT, SPLIT from .conv import CONV2D from .pool import MAXPOOL2D @@ -62,6 +62,7 @@ struct OP(Stringable): alias SPLIT = OP(24, "SPLIT", dynamic=True) alias SLICE = OP(25, "SLICE") alias INDEX = OP(26, "INDEX") + alias UPSAMPLE = OP(27, "UPSAMPLE") var id: UInt8 var name: Bytes[16] @@ -138,6 +139,8 @@ fn static_result_shape( return SLICE.result_shape(t1_shape, attributes) elif op == OP.INDEX: return INDEX.result_shape(t1_shape, attributes) + elif op == OP.UPSAMPLE: + return UPSAMPLE.result_shape(t1_shape, attributes) else: print("[ERROR] Operator not found.") return TensorShape(-1) @@ -254,6 +257,8 @@ fn forward_op[ SLICE.forward[t1_shape, attributes](res, t1) elif op == OP.INDEX: INDEX.forward[t1_shape, attributes](res, t1) + elif op == OP.UPSAMPLE: + UPSAMPLE.forward[t1_shape, attributes](res, t1) else: print("[ERROR] Operator not found.") diff --git a/tests/mojo/test_mlops.mojo b/tests/mojo/test_mlops.mojo index 964e134..045ae7b 100644 --- a/tests/mojo/test_mlops.mojo +++ b/tests/mojo/test_mlops.mojo @@ -679,6 +679,48 @@ fn test_INDEX_backward() 
raises: print(expected) +fn test_UPSAMPLE() raises: + alias t1_shape = TensorShape(2, 3, 5) + var t = Tensor[dtype](t1_shape) + for i in range(t.num_elements()): + t[i] = i + + var expected = Tensor[dtype](2, 3, 10) + for i in range(2): + for j in range(3): + for k in range(5): + for l in range(2): + expected[i*3*10 + j*10 + k*2 + l] = t[i*3*5 + j*5 + k] + + test_unary_op[ + OP.UPSAMPLE, t1_shape, AttributeVector( + Attribute("scales", TensorShape(2)), + Attribute("mode", "nearest") + ) + ](t, expected) + + + alias t2_shape = TensorShape(1, 1, 2, 2) + t = Tensor[dtype](t2_shape) + for i in range(t.num_elements()): + t[i] = i + + expected = Tensor[dtype](1, 1, 4, 6) + for i in range(1): + for j in range(1): + for k in range(4): + for l in range(6): + var pos = i*1*2*2 + j*2*2 + (k // 2) * 2 + (l // 3) + expected[i*1*4*6 + j*4*6 + k*6 + l] = t[pos] + + test_unary_op[ + OP.UPSAMPLE, t2_shape, AttributeVector( + Attribute("scales", TensorShape(2, 3)), + Attribute("mode", "nearest") + ) + ](t, expected) + + fn main(): try: # test_SIGMOID() @@ -691,7 +733,8 @@ fn main(): # test_SLICE_step() # test_SLICE_neg() # test_SLICE_multiple_axes() - test_INDEX() + # test_INDEX() + test_UPSAMPLE() except e: print("[ERROR] Error in forward mlops") print(e) @@ -706,7 +749,8 @@ fn main(): # test_backward_UNSQUEEZE() # test_backward_SLICE() # test_backward_SLICE_multiple_axes() - test_INDEX_backward() + # test_INDEX_backward() + pass except e: print("[ERROR] Error in backward mlops") print(e) From 1e5fc432123e83cfdb32a5a7c9c96d8c6cbdb43f Mon Sep 17 00:00:00 2001 From: NKspartan Date: Sun, 2 Jun 2024 16:34:23 -0600 Subject: [PATCH 07/16] Merged with main --- basalt/__init__.mojo | 1 + basalt/autograd/attributes.mojo | 4 +- basalt/autograd/graph.mojo | 116 +++++++------- basalt/autograd/ops/basics.mojo | 5 +- basalt/autograd/ops/conv.mojo | 1 - basalt/autograd/ops/matmul.mojo | 175 +++++++++++++++++++++ basalt/autograd/params.mojo | 6 +- basalt/utils/collection.mojo | 33 +++- basalt/utils/onnx_utils.mojo | 150 +++++++++++------- basalt/utils/tensor_creation_utils.mojo | 73 +++++++++ basalt/utils/tensorutils.mojo | 196 +----------------------- profile.sh | 118 -------------- tests/mojo/test_backward.mojo | 11 +- tests/mojo/test_tensorutils.mojo | 2 +- tests/testing_utils.mojo | 54 +------ 15 files changed, 445 insertions(+), 500 deletions(-) create mode 100644 basalt/autograd/ops/matmul.mojo create mode 100644 basalt/utils/tensor_creation_utils.mojo delete mode 100755 profile.sh diff --git a/basalt/__init__.mojo b/basalt/__init__.mojo index 137eec4..73d84f1 100644 --- a/basalt/__init__.mojo +++ b/basalt/__init__.mojo @@ -5,3 +5,4 @@ from basalt.utils.collection import Collection alias dtype = DType.float32 alias nelts = 2 * simdwidthof[dtype]() alias seed = 42 +alias epsilon = 1e-12 diff --git a/basalt/autograd/attributes.mojo b/basalt/autograd/attributes.mojo index 2e87300..5a57a9a 100644 --- a/basalt/autograd/attributes.mojo +++ b/basalt/autograd/attributes.mojo @@ -120,7 +120,7 @@ struct Attribute(Stringable, CollectionElement): self.size = N for i in range(self.size): - self.data[i] = value[i] + self.data_shape[i] = value[i] @always_inline("nodebug") fn __init__[dtype: DType](inout self, name: String, value: Scalar[dtype]): @@ -161,7 +161,7 @@ struct Attribute(Stringable, CollectionElement): var result = StaticIntTuple[N]() for i in range(N): - result[i] = int(self.data[i]) + result[i] = int(self.data_shape[i]) return result diff --git a/basalt/autograd/graph.mojo b/basalt/autograd/graph.mojo index 
2353774..4038bdc 100644 --- a/basalt/autograd/graph.mojo +++ b/basalt/autograd/graph.mojo @@ -11,7 +11,6 @@ from basalt import seed, dtype from basalt import Tensor, TensorShape -@value struct Graph: var inputs: List[Symbol] var params: ParamDict @@ -28,41 +27,42 @@ struct Graph: self.loss_out = None self.symbol_count = 0 - fn input(inout self, shape: TensorShape, trainable: Bool = False) -> Symbol: - var inp = Symbol(self.symbol_count, dtype, shape, trainable) - self.inputs.append(inp) - self.symbol_count += 1 - return inp + fn __moveinit__(inout self, owned other: Graph): + self.inputs = other.inputs^ + self.params = other.params^ + self.nodes = other.nodes^ + self.outputs = other.outputs^ + self.loss_out = other.loss_out + self.symbol_count = other.symbol_count - fn param( - inout self, shape: TensorShape, init: Param, trainable: Bool = True - ) -> Symbol: - var param_id = Symbol(self.symbol_count, dtype, shape, trainable) - self.params.put(param_id, init) + fn create_symbol(inout self, shape: TensorShape, data: Optional[Param] = None, trainable: Bool = False, is_input: Bool = False) -> Symbol: + var symbol = Symbol(self.symbol_count, dtype, shape, trainable) self.symbol_count += 1 - return param_id + + if data is not None: + self.params.put(symbol, data.take()) + else: + self.params.put(symbol) + + if is_input: + self.inputs.append(symbol) + + return symbol + + fn input(inout self, shape: TensorShape, trainable: Bool = False) -> Symbol: + return self.create_symbol(shape, trainable=trainable, is_input=True) + + fn param(inout self, shape: TensorShape, init: Param, trainable: Bool = True) -> Symbol: + return self.create_symbol(shape, init, trainable) fn param(inout self, shape: TensorShape, trainable: Bool = True) -> Symbol: - var param_id = Symbol(self.symbol_count, dtype, shape, trainable) - self.params.put(param_id) - self.symbol_count += 1 - return param_id + return self.create_symbol(shape, trainable=trainable) fn scalar(inout self, value: Scalar[dtype]) -> Symbol: - var scal = Param(value) - var scalar_id = Symbol( - self.symbol_count, dtype, TensorShape(1), trainable=False - ) - self.params.put(scalar_id, scal) - self.symbol_count += 1 - return scalar_id + return self.create_symbol(TensorShape(1), Param(value), trainable=False) fn constant(inout self, shape: TensorShape, data: List[Scalar[dtype]]) -> Symbol: - var cst = Param(data) - var constant_id = Symbol(self.symbol_count, dtype, shape, trainable=False) - self.params.put(constant_id, cst) - self.symbol_count += 1 - return constant_id + return self.create_symbol(shape, Param(data), trainable=False) fn out(inout self, symbol: Symbol): self.outputs.append(symbol) @@ -77,14 +77,15 @@ struct Graph: attributes: AttributeVector = AttributeVector(), ) -> Symbol: var res_shape = static_result_shape(op, operands, attributes) - var res = Symbol( - self.symbol_count, dtype, res_shape, self.result_trainable(operands) - ) + var res = Symbol(self.symbol_count, dtype, res_shape, self.result_trainable(operands)) self.symbol_count += 1 var inputs = List[Symbol]() + inputs.reserve(len(operands)) + for operand in operands: inputs.append(operand) + self.nodes.append(Node(op, inputs, List[Symbol](res), attributes)) return res @@ -95,8 +96,7 @@ struct Graph: operand_2: Float64, attributes: AttributeVector = AttributeVector(), ) -> Symbol: - var operand_2_symbol = self.scalar(operand_2) - return self.op(op, operand_1, operand_2_symbol, attributes=attributes) + return self.op(op, operand_1, self.scalar(operand_2), attributes=attributes) fn op( inout 
self, @@ -105,43 +105,43 @@ struct Graph: operand_2: Symbol, attributes: AttributeVector = AttributeVector(), ) -> Symbol: - var operand_1_symbol = self.scalar(operand_1) - return self.op(op, operand_1_symbol, operand_2, attributes=attributes) + return self.op(op, self.scalar(operand_1), operand_2, attributes=attributes) + + fn create_symbols(inout self, shapes: List[TensorShape], trainable: Bool = False) -> List[Symbol]: + var symbols = List[Symbol]() + symbols.reserve(len(shapes)) + + for shape in shapes: + symbols.append(Symbol(self.symbol_count, dtype, shape[], trainable)) + self.symbol_count += 1 + + return symbols + + fn add_node(inout self, op: OP, inputs: List[Symbol], outputs: List[Symbol], attributes: AttributeVector): + self.nodes.append(Node(op, inputs, outputs, attributes)) - # Dynamic ops fn concat(inout self, *operands: Symbol, dim: Int = 0) -> Symbol: - # NOTE: Concat could fit into g.op() given a different static_result_shape is called var attributes = AttributeVector(Attribute("dim", dim)) - var res_shape = dynamic_result_shape(OP.CONCAT, operands, attributes)[0] - var res = Symbol( - self.symbol_count, dtype, res_shape, self.result_trainable(operands) - ) - self.symbol_count += 1 + var res_symbols = self.create_symbols(List[TensorShape](res_shape), self.result_trainable(operands)) - var inputs = List[Symbol]() + var operand_list = List[Symbol]() + operand_list.reserve(len(operands)) for operand in operands: - inputs.append(operand) - self.nodes.append(Node(OP.CONCAT, inputs, List[Symbol](res), attributes)) - return res + operand_list.append(operand) + + self.add_node(OP.CONCAT, operand_list, res_symbols, attributes) + return res_symbols[0] fn split( inout self, operand: Symbol, sections: List[Int], dim: Int = 0 ) -> List[Symbol]: - var attributes = AttributeVector( - Attribute("sections", TensorShape(sections)), Attribute("dim", dim) - ) + var attributes = AttributeVector(Attribute("sections", TensorShape(sections)), Attribute("dim", dim)) var res_shapes = dynamic_result_shape(OP.SPLIT, operand, attributes) var trainable = self.result_trainable(operand) - - var results = List[Symbol]() - for i in range(len(res_shapes)): - var symbol = Symbol(self.symbol_count, dtype, res_shapes[i], trainable) - results.append(symbol) - self.symbol_count += 1 - - self.nodes.append(Node(OP.SPLIT, List[Symbol](operand), results, attributes)) - return results + var result_symbols = self.create_symbols(res_shapes, trainable) + self.add_node(OP.SPLIT, List[Symbol](operand), result_symbols, attributes) + return result_symbols @staticmethod fn result_trainable(operands: VariadicList[Symbol]) -> Bool: diff --git a/basalt/autograd/ops/basics.mojo b/basalt/autograd/ops/basics.mojo index 3a4a5ab..2305377 100644 --- a/basalt/autograd/ops/basics.mojo +++ b/basalt/autograd/ops/basics.mojo @@ -6,6 +6,7 @@ from basalt import Tensor, TensorShape from basalt.nn.tensor import MAX_RANK from basalt.utils.tensorutils import * from basalt.autograd.attributes import Attribute, AttributeVector +from basalt.autograd.ops.matmul import dot, dot_transpose_t1, dot_transpose_t2 """ Implement forward and backward operations for basic tensor manipulations. 
@@ -323,9 +324,7 @@ struct POW: @parameter fn vec_pow_bw_x[nelts: Int](i: Int): - res_grad.store[nelts]( - i, a * (t1.load[nelts](i) ** (a - 1)) * ug.load[nelts](i) - ) + res_grad.store[nelts](i, a * ((t1.load[nelts](i) + epsilon) ** (a - 1)) * ug.load[nelts](i)) vectorize[vec_pow_bw_x, nelts](t1_shape.num_elements()) diff --git a/basalt/autograd/ops/conv.mojo b/basalt/autograd/ops/conv.mojo index 3e1a18c..4592c51 100644 --- a/basalt/autograd/ops/conv.mojo +++ b/basalt/autograd/ops/conv.mojo @@ -1,6 +1,5 @@ from basalt import Tensor, TensorShape from basalt.autograd.attributes import AttributeVector -from basalt.utils.tensorutils import dot, dot_transpose_t1, dot_transpose_t2 from algorithm import parallelize, vectorize, tile from math import divmod diff --git a/basalt/autograd/ops/matmul.mojo b/basalt/autograd/ops/matmul.mojo new file mode 100644 index 0000000..e638f81 --- /dev/null +++ b/basalt/autograd/ops/matmul.mojo @@ -0,0 +1,175 @@ +from basalt.utils.tensorutils import transpose_2D +from algorithm import vectorize, parallelize + + +@always_inline +fn calculate_block[ + M: Int, N: Int, K: Int, BLOCK_M: Int, BLOCK_N: Int, nelts: Int +]( + res: DTypePointer[dtype], + t1: DTypePointer[dtype], + t2: DTypePointer[dtype], + bm: Int, + bn: Int, +): + # Compute tile + var acc = stack_allocation[BLOCK_M * BLOCK_N, dtype]() + memset_zero[dtype](acc, BLOCK_M * BLOCK_N) + + for k in range(K): + + @unroll + for m in range(BLOCK_M): + + @parameter + fn inner_n[nelts: Int](n: Int): + acc.store[width=nelts]( + m * BLOCK_N + n, + SIMD[dtype, nelts] + .splat(t1[(bm + m) * K + k]) + .fma( + t2.load[width=nelts](k * N + (bn + n)), + acc.load[width=nelts](m * BLOCK_N + n), + ), + ) + + vectorize[inner_n, nelts](BLOCK_N) + + # Store tile + for m in range(BLOCK_M): + + @parameter + fn vec_store[nelts: Int](n: Int): + res.store[width=nelts]( + (bm + m) * N + (bn + n), acc.load[width=nelts](m * BLOCK_N + n) + ) + + vectorize[vec_store, nelts](BLOCK_N) + + +@parameter +@always_inline +fn dot[ + t1_shape: TensorShape, t2_shape: TensorShape +](inout res: Tensor[dtype], t1: Tensor[dtype], t2: Tensor[dtype]): + dot[t1_shape, t2_shape](res.data(), t1.data(), t2.data()) + + +@parameter +@always_inline +fn dot[ + t1_shape: TensorShape, t2_shape: TensorShape +](res: DTypePointer[dtype], t1: DTypePointer[dtype], t2: DTypePointer[dtype]): + alias M = t1_shape[0] # t1[0] + alias K = t1_shape[1] # t1[1], t2[0] + alias N = t2_shape[1] # t2[1] + + # simdwidthof[dtype]() = 8 for float32 + alias nelts = simdwidthof[dtype]() + alias BLOCK_N = 8 * 2 + alias BLOCK_M = 6 + alias THREADS = 6 # num_logical_cores() + + alias BLOCK_N_REMAINDER = N % BLOCK_N + alias BLOCK_M_REMAINDER = M % BLOCK_M + + @parameter + fn bm_par(m_outer: Int): + var bm = m_outer * BLOCK_M + + for n_outer in range(0, N // BLOCK_N): + var bn = n_outer * BLOCK_N + + calculate_block[M, N, K, BLOCK_M, BLOCK_N, nelts](res, t1, t2, bm, bn) + + # Handle the remainder of N + @parameter + if BLOCK_N_REMAINDER > 0: + var bn = N - BLOCK_N_REMAINDER + + calculate_block[M, N, K, BLOCK_M, BLOCK_N_REMAINDER, nelts]( + res, t1, t2, bm, bn + ) + + parallelize[bm_par](M // BLOCK_M, M // BLOCK_M) + + # Handle the remainder of M + @parameter + if BLOCK_M_REMAINDER > 0: + var bm = M - BLOCK_M_REMAINDER + + for n_outer in range(0, N // BLOCK_N): + var bn = n_outer * BLOCK_N + + calculate_block[M, N, K, BLOCK_M_REMAINDER, BLOCK_N, nelts]( + res, t1, t2, bm, bn + ) + + # Handle corner remainder + @parameter + if BLOCK_N_REMAINDER > 0: + var bn = N - BLOCK_N_REMAINDER + + 
calculate_block[M, N, K, BLOCK_M_REMAINDER, BLOCK_N_REMAINDER, nelts]( + res, t1, t2, bm, bn + ) + + +fn dot_transpose_t2[ + A_shape: TensorShape, B_shape: TensorShape +](inout C: DTypePointer[dtype], A: DTypePointer[dtype], B: DTypePointer[dtype]): + dot[A_shape, TensorShape(B_shape[1], B_shape[0])](C, A, transpose_2D[B_shape](B)) + + +fn dot_transpose_t2[ + A_shape: TensorShape, B_shape: TensorShape +](inout C: Tensor[dtype], A: Tensor[dtype], B: Tensor[dtype]): + memset_zero[dtype](C.data(), C.num_elements()) + + dot[A_shape, TensorShape(B_shape[1], B_shape[0])](C, A, transpose_2D[B_shape](B)) + + # @parameter + # fn calc_row(i: Int): + # for j in range(B_shape[0]): + + # @parameter + # fn calc_row_A_B[nelts: Int](k: Int): + # var A_pos = i * A.dim(1) + k + # var B_pos = j * A.dim(1) + k + # var t_new_pos = i * C.dim(1) + j + + # C[t_new_pos] += ( + # A.load[nelts](A_pos) * B.load[nelts](B_pos) + # ).reduce_add() + + # vectorize[calc_row_A_B, nelts, size=A_shape[1]]() + + # parallelize[calc_row](A_shape[0], 1) + + +fn dot_transpose_t1[ + A_shape: TensorShape, B_shape: TensorShape +](inout C: Tensor[dtype], A: Tensor[dtype], B: Tensor[dtype]): + memset_zero[dtype](C.data(), C.num_elements()) + + dot[TensorShape(A_shape[1], A_shape[0]), B_shape](C, transpose_2D[A_shape](A), B) + + # @parameter + # fn calc_row(i: Int): + # for j in range(A_shape[0]): + + # @parameter + # fn calc_row_t_new_B[nelts: Int](k: Int): + # var A_pos = j * A.dim(1) + i + # var B_pos = j * B.dim(1) + k + # var t_new_pos = i * C.dim(1) + k + + # C.store[nelts]( + # t_new_pos, + # C.load[nelts](t_new_pos) + # + A[A_pos] * B.load[nelts](B_pos), + # ) + + # vectorize[calc_row_t_new_B, nelts, size=B_shape[1]]() + + # parallelize[calc_row](A_shape[1], 1) diff --git a/basalt/autograd/params.mojo b/basalt/autograd/params.mojo index 5d82848..37d682a 100644 --- a/basalt/autograd/params.mojo +++ b/basalt/autograd/params.mojo @@ -19,10 +19,8 @@ struct Param(CollectionElement, Stringable): self.data = data self.initializer = None - fn __init__(inout self, a: Scalar[dtype]): - var data = List[Scalar[dtype]]() - data.append(a) - self.data = data + fn __init__(inout self, data: Scalar[dtype]): + self.data = List[Scalar[dtype]](data) self.initializer = None fn __init__(inout self, initializer: String, *args: Scalar[dtype]): diff --git a/basalt/utils/collection.mojo b/basalt/utils/collection.mojo index 0a8aea9..16474a5 100644 --- a/basalt/utils/collection.mojo +++ b/basalt/utils/collection.mojo @@ -1,7 +1,7 @@ -from math import max -from memory.unsafe_pointer import UnsafePointer, move_from_pointee, initialize_pointee_copy, initialize_pointee_move, destroy_pointee +from math import max, divmod +from memory.unsafe_pointer import UnsafePointer, initialize_pointee_move, destroy_pointee -from basalt import Tensor, TensorShape, Symbol +from basalt import Tensor, Symbol struct Collection(CollectionElement, Sized): @@ -108,10 +108,29 @@ struct Collection(CollectionElement, Sized): fn get_index(self, symbol_name: UInt32) -> Int: """ Returns the index of the tensor with the given symbol name. 
- """ - for i in range(self.size): - if self.symbols[i] == symbol_name: - return i + """ + alias factor = 8 + # 2 -> 5.32s MNIST + # 4 -> 4.95s MNIST + # 8 -> 4.85s MNIST + # 16 -> 5.19s MNIST + # NOTE: This ideally should just be a hashmap + + for i in range(0, self.size, factor): + var elems = self.symbols.load[width=factor](i) == symbol_name + + for j in range(factor): + if elems[j]: + return i + j + + var split = divmod(self.size, factor) + + for i in range(split[1]): + var index = split[0] + i + + if self.symbols[index] == symbol_name: + return index + return -1 @always_inline("nodebug") diff --git a/basalt/utils/onnx_utils.mojo b/basalt/utils/onnx_utils.mojo index 9eeda44..3e67164 100644 --- a/basalt/utils/onnx_utils.mojo +++ b/basalt/utils/onnx_utils.mojo @@ -6,33 +6,13 @@ from basalt.nn.model import Parameters from basalt.nn.tensor import Tensor, TensorShape from basalt.autograd.attributes import Attribute, AttributeType from basalt.autograd.ops import OP +from basalt.autograd.graph import Node + +from .tensor_creation_utils import to_numpy, copy_np_data # NOTE: Maybe we could create our own model representation and from there convert to onnx or others (well we already have it in reallity) # NOTE: Torch doesn't import onnx, need onnx2torch and it doesn't support operators like reshape? -fn to_numpy(tensor: Tensor) raises -> PythonObject: - var np = Python.import_module("numpy") - - np.set_printoptions(4) - var rank = tensor.rank() - var pyarray: PythonObject = np.array([0]) - - if rank == 1: - pyarray = np.empty((tensor.dim(0))) - elif rank == 2: - pyarray = np.empty((tensor.dim(0), tensor.dim(1))) - elif rank == 3: - pyarray = np.empty((tensor.dim(0), tensor.dim(1), tensor.dim(2))) - elif rank == 4: - pyarray = np.empty((tensor.dim(0), tensor.dim(1), tensor.dim(2), tensor.dim(3))) - else: - print("Error: rank not supported: ", rank) - - for i in range(tensor.num_elements()): - pyarray.itemset((i), tensor[i]) - - return pyarray - fn make_onnx_attribute(op: OP, attr: Attribute) raises -> PythonObject: var onnx = Python.import_module("onnx") @@ -68,9 +48,7 @@ fn make_onnx_attribute(op: OP, attr: Attribute) raises -> PythonObject: else: raise Error("Unsupported attribute name for operator " + str(op)) - if (op == OP.CONV2D and attr_name) == "pads" or ( - op == OP.MAXPOOL2D and attr_name - ) == "pads": + if (op == OP.CONV2D or op == OP.MAXPOOL2D) and attr_name == "pads": # Special case for pads in conv and maxpool, onnx wants pads to be [x1_begin, x2_begin…x1_end, x2_end,…], attr_value.append(attr_value[0]) attr_value.append(attr_value[1]) @@ -190,15 +168,88 @@ fn load_onnx_model( + data_shape ) - var data = data_np.flatten() - - # It would be better to use memcpy here - for j in range(len(data)): - model_parameters.tensors[g.params.symbols[i]][j] = data[j].to_float64() + copy_np_data(model_parameters.tensors[g.params.symbols[i]], data_np) else: raise Error("Unsupported data type") +fn create_attributes_and_constant_inputs(node: Node, node_number: Int) raises -> (List[PythonObject], List[PythonObject]): + var onnx = Python.import_module("onnx") + var np = Python.import_module("numpy") + + var attributes = List[PythonObject]() + var inputs = List[PythonObject]() + + for i in range(len(node.attributes)): + var attr = node.attributes[i] + + @parameter + fn to_np_array(attr: Attribute) raises -> PythonObject: + if not attr.type == AttributeType.INTS: + raise Error("Attribute is not a shape") + + var values_np: PythonObject + if attr.type == AttributeType.INTS: + var shape = attr.to_shape() 
+ values_np = PythonObject([]) + for j in range(shape.rank()): + values_np.append(shape[j]) + elif attr.type == AttributeType.FLOAT: + values_np = attr.to_scalar[DType.float64]() + elif attr.type == AttributeType.INT: + values_np = attr.to_int() + else: + raise Error("Unsupported attribute type") + + var np_array = np.array(values_np, dtype=np.int64) + + return onnx.numpy_helper.from_array(np_array) + + # Special cases where attributes are considered as inputs, so we create Constant inputs + if node.operator == OP.RESHAPE: + if str(attr.name) == "shape": + var outputs = PythonObject([]) + outputs.append(str(node.operator) + "_" + str(attr.name) + "_" + str(node_number)) + var temp_node = onnx.helper.make_node( + op_type="Constant", + inputs=[], + outputs=outputs, + value=to_np_array(attr), + ) + + inputs.append(temp_node) + elif node.operator == OP.CLIP: + if str(attr.name) == "min" or str(attr.name) == "max": + var outputs = PythonObject([]) + outputs.append(str(node.operator) + "_" + str(attr.name) + "_" + str(node_number)) + var temp_node = onnx.helper.make_node( + op_type="Constant", + inputs=[], + outputs=outputs, + value=to_np_array(attr), + ) + + inputs.append(temp_node) + elif node.operator == OP.SQUEEZE or node.operator == OP.UNSQUEEZE: + if str(attr.name) == "dims": + var outputs = PythonObject([]) + outputs.append(str(node.operator) + "_" + str(attr.name) + "_" + str(node_number)) + var temp_node = onnx.helper.make_node( + op_type="Constant", + inputs=[], + outputs=outputs, + value=to_np_array(attr), + ) + + inputs.append(temp_node) + else: + var attr_value = make_onnx_attribute(node.operator, attr) + + attributes.append(attr_value) + + return (attributes, inputs) + + fn export_onnx_model(model_path: Path, model_parameters: Parameters, g: Graph) raises: # Create onnx model with data and nodes var onnx = Python.import_module("onnx") @@ -261,6 +312,14 @@ fn export_onnx_model(model_path: Path, model_parameters: Parameters, g: Graph) r var onnx_output = onnx_helper.make_tensor_value_info(name, dtype, shape) graph.value_info.append(onnx_output) + # Process attributes + var attributes_and_inputs = create_attributes_and_constant_inputs(node, i) + var attributes = attributes_and_inputs[0] + var inputs_constant = attributes_and_inputs[1] + for j in range(len(inputs_constant)): + inputs.append(inputs_constant[j].output[0]) + graph.node.append(inputs_constant[j]) + # Create onnx node var onnx_node = onnx_helper.make_node( op_type, @@ -268,33 +327,8 @@ fn export_onnx_model(model_path: Path, model_parameters: Parameters, g: Graph) r outputs, name, ) - - # Process attributes - for j in range(len(node.attributes)): - var attr = node.attributes[j] - var attr_value = make_onnx_attribute(node.operator, attr) - - # Special case for reshape, shape in reshape is not an attribute, instead it is an input because they can be dynamic - if not node.operator == OP.RESHAPE: - onnx_node.attribute.append(attr_value) - - # Special case for reshape, shape in reshape is not an attribute, instead it is an input because they can be dynamic (it can be the result of another operator, don't know why) - if node.operator == OP.RESHAPE: - var shape = node.attributes[0].to_shape() - var list_shape = PythonObject([]) - for j in range(shape.rank()): - list_shape.append(shape[j]) - - graph.initializer.append( - onnx_helper.make_tensor( - name=name + "_shape", - data_type=onnx.TensorProto.INT64, - dims=(shape.rank(),), - vals=list_shape, - ) - ) - - onnx_node.input.append(name + "_shape") + for attribute in attributes: + 
onnx_node.attribute.append(attribute[])
 
         graph.node.append(onnx_node)
 
diff --git a/basalt/utils/tensor_creation_utils.mojo b/basalt/utils/tensor_creation_utils.mojo
new file mode 100644
index 0000000..375279a
--- /dev/null
+++ b/basalt/utils/tensor_creation_utils.mojo
@@ -0,0 +1,73 @@
+from python import Python
+
+# Maybe these functions should live on the Tensor struct: tensor.to_numpy(), tensor.__init__(np_array: PythonObject) to create a tensor from a numpy array, and tensor.copy_np_data(np_array: PythonObject) to copy the numpy array into the tensor.
+
+
+fn to_numpy(tensor: Tensor) -> PythonObject:
+    try:
+        var np = Python.import_module("numpy")
+
+        np.set_printoptions(4)
+
+        var rank = tensor.rank()
+        var dims = PythonObject([])
+        for i in range(rank):
+            dims.append(tensor.dim(i))
+        var pyarray: PythonObject = np.empty(dims, dtype=np.float32)
+
+        var pointer = int(pyarray.__array_interface__["data"][0].to_float64())
+        var pointer_d = DTypePointer[tensor.dtype](address=pointer)
+        memcpy(pointer_d, tensor.data(), tensor.num_elements())
+
+        _ = tensor
+
+        return pyarray^
+    except e:
+        print("Error in to_numpy", e)
+        return PythonObject()
+
+
+fn to_tensor(np_array: PythonObject) raises -> Tensor[dtype]:
+    var shape = List[Int]()
+    for i in range(np_array.ndim):
+        shape.append(int(np_array.shape[i].to_float64()))
+    if np_array.ndim == 0:
+        # When the numpy array is a scalar, either reshape it to a size-1 ndarray or handle it like this; otherwise the memcpy hits a memory error (maybe because it is a register value?).
+        var tensor = Tensor[dtype](TensorShape(1))
+        tensor[0] = np_array.to_float64().cast[dtype]()
+        return tensor^
+
+    var tensor = Tensor[dtype](TensorShape(shape))
+
+    var np_array_2 = np_array.copy()
+    try:
+        var np = Python.import_module("numpy")
+        np_array_2 = np.float32(np_array_2)
+    except e:
+        print("Error in to_tensor", e)
+
+    var pointer = int(np_array_2.__array_interface__["data"][0].to_float64())
+    var pointer_d = DTypePointer[tensor.dtype](address=pointer)
+    memcpy(tensor.data(), pointer_d, tensor.num_elements())
+
+    _ = np_array_2
+    _ = np_array
+
+    return tensor^
+
+
+fn copy_np_data(tensor: Tensor, np_array: PythonObject) raises:
+    var np_array_2 = np_array.copy()
+    try:
+        var np = Python.import_module("numpy")
+        np_array_2 = np.float32(np_array_2)
+    except e:
+        print("Error in copy_np_data", e)
+
+    var pointer = int(np_array_2.__array_interface__["data"][0].to_float64())
+    var pointer_d = DTypePointer[tensor.dtype](address=pointer)
+    memcpy(tensor.data(), pointer_d, tensor.num_elements())
+
+    _ = np_array_2
+    _ = np_array
+    _ = tensor
diff --git a/basalt/utils/tensorutils.mojo b/basalt/utils/tensorutils.mojo
index fea82bc..b3b70de 100644
--- a/basalt/utils/tensorutils.mojo
+++ b/basalt/utils/tensorutils.mojo
@@ -48,14 +48,8 @@ fn broadcast_shapes(s1: TensorShape, s2: TensorShape) -> TensorShape:
     var ndim = max(s1.rank(), s2.rank())
     var diff = abs(s1.rank() - s2.rank())
 
-    var big: TensorShape
-    var small: TensorShape
-    if s1.rank() > s2.rank():
-        big = s1
-        small = s2
-    else:
-        big = s2
-        small = s1
+    var big = s1 if s1.rank() > s2.rank() else s2
+    var small = s2 if s1.rank() > s2.rank() else s1
 
     var res = StaticIntTuple[MAX_RANK](-1)
 
@@ -67,12 +61,7 @@
         elif a == 1 or b == 1:
             res[i] = a * b
         else:
-            # NOTE: consider assert and allow the function raises
-            var message: String = "[ERROR] Shapes " + str(s1) + " and " + str(
-                s2
-            ) + " cannot be broadcasted together."
- print(message) - # raise Error(message) + print("[ERROR] Shapes " + str(s1) + " and " + str(s2) + " cannot be broadcasted together.") for i in range(diff - 1, -1, -1): res[i] = big[i] @@ -91,9 +80,7 @@ fn broadcast_shapes(*s: TensorShape) -> TensorShape: @always_inline -fn broadcast_calculate_strides[ - size: Int, shape: TensorShape, broadcast_shape: TensorShape -]() -> StaticIntTuple[size]: +fn broadcast_calculate_strides[size: Int, shape: TensorShape, broadcast_shape: TensorShape]() -> StaticIntTuple[size]: alias shape_rank = shape.rank() alias diff = size - shape_rank @@ -107,181 +94,6 @@ fn broadcast_calculate_strides[ return strides - -# ----- Dot functions ----- -@always_inline -fn calculate_block[ - M: Int, N: Int, K: Int, BLOCK_M: Int, BLOCK_N: Int, nelts: Int -]( - res: DTypePointer[dtype], - t1: DTypePointer[dtype], - t2: DTypePointer[dtype], - bm: Int, - bn: Int, -): - # Compute tile - var acc = stack_allocation[BLOCK_M * BLOCK_N, dtype]() - memset_zero[dtype](acc, BLOCK_M * BLOCK_N) - - for k in range(K): - - @unroll - for m in range(BLOCK_M): - - @parameter - fn inner_n[nelts: Int](n: Int): - acc.store[width=nelts]( - m * BLOCK_N + n, - SIMD[dtype, nelts] - .splat(t1[(bm + m) * K + k]) - .fma( - t2.load[width=nelts](k * N + (bn + n)), - acc.load[width=nelts](m * BLOCK_N + n), - ), - ) - - vectorize[inner_n, nelts](BLOCK_N) - - # Store tile - for m in range(BLOCK_M): - - @parameter - fn vec_store[nelts: Int](n: Int): - res.store[width=nelts]( - (bm + m) * N + (bn + n), acc.load[width=nelts](m * BLOCK_N + n) - ) - - vectorize[vec_store, nelts](BLOCK_N) - - -@parameter -@always_inline -fn dot[ - t1_shape: TensorShape, t2_shape: TensorShape -](inout res: Tensor[dtype], t1: Tensor[dtype], t2: Tensor[dtype]): - dot[t1_shape, t2_shape](res.data(), t1.data(), t2.data()) - - -@parameter -@always_inline -fn dot[ - t1_shape: TensorShape, t2_shape: TensorShape -](res: DTypePointer[dtype], t1: DTypePointer[dtype], t2: DTypePointer[dtype]): - alias M = t1_shape[0] # t1[0] - alias K = t1_shape[1] # t1[1], t2[0] - alias N = t2_shape[1] # t2[1] - - # simdwidthof[dtype]() = 8 for float32 - alias nelts = simdwidthof[dtype]() - alias BLOCK_N = 8 * 2 - alias BLOCK_M = 6 - alias THREADS = 6 # num_logical_cores() - - alias BLOCK_N_REMAINDER = N % BLOCK_N - alias BLOCK_M_REMAINDER = M % BLOCK_M - - @parameter - fn bm_par(m_outer: Int): - var bm = m_outer * BLOCK_M - - for n_outer in range(0, N // BLOCK_N): - var bn = n_outer * BLOCK_N - - calculate_block[M, N, K, BLOCK_M, BLOCK_N, nelts](res, t1, t2, bm, bn) - - # Handle the remainder of N - @parameter - if BLOCK_N_REMAINDER > 0: - var bn = N - BLOCK_N_REMAINDER - - calculate_block[M, N, K, BLOCK_M, BLOCK_N_REMAINDER, nelts]( - res, t1, t2, bm, bn - ) - - parallelize[bm_par](M // BLOCK_M, M // BLOCK_M) - - # Handle the remainder of M - @parameter - if BLOCK_M_REMAINDER > 0: - var bm = M - BLOCK_M_REMAINDER - - for n_outer in range(0, N // BLOCK_N): - var bn = n_outer * BLOCK_N - - calculate_block[M, N, K, BLOCK_M_REMAINDER, BLOCK_N, nelts]( - res, t1, t2, bm, bn - ) - - # Handle corner remainder - @parameter - if BLOCK_N_REMAINDER > 0: - var bn = N - BLOCK_N_REMAINDER - - calculate_block[M, N, K, BLOCK_M_REMAINDER, BLOCK_N_REMAINDER, nelts]( - res, t1, t2, bm, bn - ) - - -fn dot_transpose_t2[ - A_shape: TensorShape, B_shape: TensorShape -](inout C: DTypePointer[dtype], A: DTypePointer[dtype], B: DTypePointer[dtype]): - dot[A_shape, TensorShape(B_shape[1], B_shape[0])](C, A, transpose_2D[B_shape](B)) - - -fn dot_transpose_t2[ - A_shape: TensorShape, 
B_shape: TensorShape -](inout C: Tensor[dtype], A: Tensor[dtype], B: Tensor[dtype]): - memset_zero[dtype](C.data(), C.num_elements()) - - dot[A_shape, TensorShape(B_shape[1], B_shape[0])](C, A, transpose_2D[B_shape](B)) - - # @parameter - # fn calc_row(i: Int): - # for j in range(B_shape[0]): - - # @parameter - # fn calc_row_A_B[nelts: Int](k: Int): - # var A_pos = i * A.dim(1) + k - # var B_pos = j * A.dim(1) + k - # var t_new_pos = i * C.dim(1) + j - - # C[t_new_pos] += ( - # A.load[nelts](A_pos) * B.load[nelts](B_pos) - # ).reduce_add() - - # vectorize[calc_row_A_B, nelts, size=A_shape[1]]() - - # parallelize[calc_row](A_shape[0], 1) - - -fn dot_transpose_t1[ - A_shape: TensorShape, B_shape: TensorShape -](inout C: Tensor[dtype], A: Tensor[dtype], B: Tensor[dtype]): - memset_zero[dtype](C.data(), C.num_elements()) - - dot[TensorShape(A_shape[1], A_shape[0]), B_shape](C, transpose_2D[A_shape](A), B) - - # @parameter - # fn calc_row(i: Int): - # for j in range(A_shape[0]): - - # @parameter - # fn calc_row_t_new_B[nelts: Int](k: Int): - # var A_pos = j * A.dim(1) + i - # var B_pos = j * B.dim(1) + k - # var t_new_pos = i * C.dim(1) + k - - # C.store[nelts]( - # t_new_pos, - # C.load[nelts](t_new_pos) - # + A[A_pos] * B.load[nelts](B_pos), - # ) - - # vectorize[calc_row_t_new_B, nelts, size=B_shape[1]]() - - # parallelize[calc_row](A_shape[1], 1) - - # ----- Element-wise unary operations ----- @always_inline fn elwise_transform[ diff --git a/profile.sh b/profile.sh deleted file mode 100755 index 870754f..0000000 --- a/profile.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash - -function profile() { - if [ ! -d ~/FlameGraph ]; then - InstallFlameGraph - fi - - if [ -f /proc/sys/fs/binfmt_misc/WSLInterop ]; then - profileLinux "$1" - else - case "$OSTYPE" in - darwin*) - profileMac "$1" - ;; - linux-gnu*|msys) - profileLinux "$1" - ;; - esac - fi -} - -function profileLinux() { - local mojo_file=$1 - LinuxInstallDependencies - LinuxPermissions - runProfile "$mojo_file" -} - -function profileMac() { - local mojo_file=$1 - MacInstallDependencies - MacPermissions - runProfile "$mojo_file" -} - -function runProfile() { - local mojo_file=$1 - local mojo_name="${mojo_file%.mojo}" - local temp_dir="./temp" - local perf_output="$temp_dir/out.perf" - local flamegraph_output="flamegraph.svg" - - echo "Profiling $mojo_file..." - - mkdir -p "$temp_dir" - - echo "Building $mojo_file..." - mojo build -I . "$mojo_file" - - echo "Stripping debug symbols..." - mv "$mojo_name" "$temp_dir/run.exe" - llvm-strip --strip-debug "$temp_dir/run.exe" - - echo "Running perf record..." - sudo perf record -F 99 -a -g -o "$perf_output" -- "$temp_dir/run.exe" - - echo "Generating flamegraph..." - sudo perf script -i "$perf_output" | ~/FlameGraph/stackcollapse-perf.pl | ~/FlameGraph/flamegraph.pl > "$flamegraph_output" - - echo "Opening flamegraph: $flamegraph_output" - - if command -v open &> /dev/null; then - open "$flamegraph_output" - elif command -v explorer.exe &> /dev/null; then - explorer.exe "$flamegraph_output" - elif command -v google-chrome &> /dev/null; then - google-chrome "$flamegraph_output" - fi - - echo "Cleaning up temporary files..." - rm -rf "$temp_dir" - - echo "Profiling completed." -} - -function LinuxInstallDependencies() { - if ! command -v perf &> /dev/null; then - echo "Installing perf for Linux/WSL" - sudo apt-get update - sudo apt-get install -y linux-tools-common linux-tools-generic - fi - - if ! 
command -v llvm-strip &> /dev/null; then - echo "Installing LLVM for Linux/WSL" - sudo apt-get install -y llvm - fi -} - -function MacInstallDependencies() { - if ! command -v perf &> /dev/null; then - echo "Installing perf for Mac" - brew install perf - fi - - if ! command -v llvm-strip &> /dev/null; then - echo "Installing LLVM for Mac" - brew install llvm - fi -} - -function InstallFlameGraph() { - echo "Installing FlameGraph" - git clone https://github.com/brendangregg/FlameGraph.git - mv FlameGraph ~/FlameGraph -} - -function LinuxPermissions() { - echo "Setting Linux/WSL permissions" - echo 0 | sudo tee /proc/sys/kernel/kptr_restrict > /dev/null - echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid > /dev/null - sudo sysctl -p > /dev/null -} - -function MacPermissions() { - echo "Setting Mac permissions" -} - -profile "$1" diff --git a/tests/mojo/test_backward.mojo b/tests/mojo/test_backward.mojo index 167a232..d8acc45 100644 --- a/tests/mojo/test_backward.mojo +++ b/tests/mojo/test_backward.mojo @@ -157,10 +157,15 @@ fn test_POW() raises: fill(temp, (2**2) * log[dtype, 1](2)) expected_grad2[0] = tsum(temp) - test_binary_op_backward[OP.POW, t1_shape, t2_shape, ug_shape]( - t1, t2, ug, expected_grad1, expected_grad2 - ) + test_binary_op_backward[OP.POW, t1_shape, t2_shape, ug_shape](t1, t2, ug, expected_grad1, expected_grad2) + + fill(t1, 0.0) + fill(t2, 0) + fill(ug, 1.0) + fill(expected_grad1, 0.0) + fill(expected_grad2, 0.0) + test_binary_op_backward[OP.POW, t1_shape, t2_shape, ug_shape](t1, t2, ug, expected_grad1, expected_grad2) fn test_SUM() raises: alias t1_shape = TensorShape(2, 3) diff --git a/tests/mojo/test_tensorutils.mojo b/tests/mojo/test_tensorutils.mojo index 1612f05..6c3dedf 100644 --- a/tests/mojo/test_tensorutils.mojo +++ b/tests/mojo/test_tensorutils.mojo @@ -3,9 +3,9 @@ from testing import assert_equal, assert_almost_equal from math import sqrt, exp, round, add, sub, mul, div from basalt import dtype, nelts +from basalt.autograd.ops.matmul import dot from basalt.utils.tensorutils import ( fill, - dot, elwise_transform, elwise_pow, elwise_op, diff --git a/tests/testing_utils.mojo b/tests/testing_utils.mojo index e28d311..40bb73b 100644 --- a/tests/testing_utils.mojo +++ b/tests/testing_utils.mojo @@ -7,6 +7,7 @@ from basalt.autograd import Graph, OP from basalt.autograd.ops.ops import backward_op from basalt.autograd.attributes import AttributeVector from basalt.nn import Tensor, TensorShape, Model +from basalt.utils.tensor_creation_utils import to_numpy, to_tensor # The below regex should be used to convert deprecated calls @@ -176,59 +177,6 @@ fn test_ternary_op_backward[ assert_tensors_equal["almost"](grad_3, grad_3_expected) -fn to_numpy(tensor: Tensor) -> PythonObject: - try: - var np = Python.import_module("numpy") - - np.set_printoptions(4) - - var rank = tensor.rank() - var dims = PythonObject([]) - for i in range(rank): - dims.append(tensor.dim(i)) - var pyarray: PythonObject = np.empty(dims, dtype=np.float32) - - var pointer = int(pyarray.__array_interface__['data'][0].to_float64()) - var pointer_d = DTypePointer[tensor.dtype](address=pointer) - memcpy(pointer_d, tensor.data(), tensor.num_elements()) - - _ = tensor - - return pyarray ^ - except e: - print("Error in to numpy", e) - return PythonObject() - - -fn to_tensor(np_array: PythonObject) raises -> Tensor[dtype]: - var shape = List[Int]() - for i in range(np_array.ndim): - shape.append(int(np_array.shape[i].to_float64())) - if np_array.ndim == 0: - # When the numpy array is a scalar, you need or 
the reshape to a size 1 ndarray or do this, if not the memcpy gets a memory error (Maybe because it is a register value?). - var tensor = Tensor[dtype](TensorShape(1)) - tensor[0] = np_array.to_float64().cast[dtype]() - return tensor ^ - - var tensor = Tensor[dtype](TensorShape(shape)) - - var np_array_2 = np_array.copy() - try: - var np = Python.import_module("numpy") - np_array_2 = np.float32(np_array_2) - except e: - print("Error in to tensor", e) - - var pointer = int(np_array_2.__array_interface__['data'][0].to_float64()) - var pointer_d = DTypePointer[tensor.dtype](address=pointer) - memcpy(tensor.data(), pointer_d, tensor.num_elements()) - - _ = np_array_2 - _ = np_array - - return tensor ^ - - fn create_graph_concat( t1_shape: TensorShape, t2_shape: TensorShape, t3_shape: TensorShape, dim: Int ) -> Graph: From aec36c74ea2a120036d0327eddb2d38c2dc053ad Mon Sep 17 00:00:00 2001 From: NKspartan Date: Sun, 2 Jun 2024 16:52:49 -0600 Subject: [PATCH 08/16] Fixed error in create_symbol when creating an input --- basalt/autograd/graph.mojo | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/basalt/autograd/graph.mojo b/basalt/autograd/graph.mojo index 4038bdc..b4fdf53 100644 --- a/basalt/autograd/graph.mojo +++ b/basalt/autograd/graph.mojo @@ -39,13 +39,13 @@ struct Graph: var symbol = Symbol(self.symbol_count, dtype, shape, trainable) self.symbol_count += 1 - if data is not None: - self.params.put(symbol, data.take()) - else: - self.params.put(symbol) - if is_input: self.inputs.append(symbol) + else: + if data is not None: + self.params.put(symbol, data.take()) + else: + self.params.put(symbol) return symbol From ab9174a7390aabd267749414171d0eb6c4a23c7a Mon Sep 17 00:00:00 2001 From: NKspartan Date: Sun, 2 Jun 2024 21:23:21 -0600 Subject: [PATCH 09/16] Added forward torch test for upsample --- tests/python/test_mlops_torch.mojo | 44 +++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/tests/python/test_mlops_torch.mojo b/tests/python/test_mlops_torch.mojo index 2f4747c..4821d52 100644 --- a/tests/python/test_mlops_torch.mojo +++ b/tests/python/test_mlops_torch.mojo @@ -108,6 +108,18 @@ fn torch_unary_op( indices[dim] = py.slice(start, end, step) expected = input_1.flip(flip_dims)[indices] + elif op == OP.UPSAMPLE: + var attrs = attrs.value() + var scales = attrs["scales"].value().to_shape() + var mode = attrs["mode"].value().to_string() + + var scales_py = PythonObject([]) + for i in range(scales.rank()): + scales_py.append(scales[i]) + + expected = torch.nn.functional.interpolate( + input_1, scale_factor=scales_py, mode=mode + ) else: print("Error: op not supported (returning the value input_1): ", op) expected = input_1 @@ -429,16 +441,34 @@ fn test_SLICE() raises: test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_0_2, attrs_0_2](t1, ug, expected_and_grad.grad_1) +fn test_UPSAMPLE() raises: + alias t1_shape = TensorShape(40, 40, 120, 120) + var t1: Tensor[dtype] = Tensor[dtype](t1_shape) + rand(t1.data(), t1.num_elements()) + + alias attributes = AttributeVector( + Attribute("scales", TensorShape(2, 2)), + Attribute("mode", "nearest") + ) + + alias ug_shape = TensorShape(40, 40, 240, 240) + var ug = Tensor[dtype](ug_shape) + + var expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes) + test_unary_op[OP.UPSAMPLE, t1_shape, attributes](t1, expected_and_grad.expected) + + fn main(): print("Running mlops (compare with torch) tests") try: - test_SIGMOID() - test_RELU() - test_TANH() - test_CLIP() - test_SQUEEZE() 
- test_UNSQUEEZE() - test_SLICE() + # test_SIGMOID() + # test_RELU() + # test_TANH() + # test_CLIP() + # test_SQUEEZE() + # test_UNSQUEEZE() + # test_SLICE() + test_UPSAMPLE() except e: print("[ERROR] Error in mlops (compare with torch)") print(e) From fa84ae523809b077b3ca1a99da8546b1081ccd48 Mon Sep 17 00:00:00 2001 From: NKspartan Date: Sun, 2 Jun 2024 21:26:11 -0600 Subject: [PATCH 10/16] Added yolov8 model --- examples/yolo_v8_utils.py | 85 +++++++++++ examples/yolov8.mojo | 305 ++++++++++++++++++++++++++++++++++++++ examples/yolov8n_onnx.py | 159 ++++++++++++++++++++ 3 files changed, 549 insertions(+) create mode 100644 examples/yolo_v8_utils.py create mode 100644 examples/yolov8.mojo create mode 100644 examples/yolov8n_onnx.py diff --git a/examples/yolo_v8_utils.py b/examples/yolo_v8_utils.py new file mode 100644 index 0000000..1081068 --- /dev/null +++ b/examples/yolo_v8_utils.py @@ -0,0 +1,85 @@ +import cv2 +import numpy as np + +CLASSES = { + 0: "person", 1: "bicycle", 2: "car", 3: "motorcycle", 4: "airplane", 5: "bus", 6: "train", 7: "truck", 8: "boat", 9: "traffic light", 10: "fire hydrant", 11: "stop sign", 12: "parking meter", 13: "bench", 14: "bird", 15: "cat", 16: "dog", 17: "horse", 18: "sheep", 19: "cow", 20: "elephant", 21: "bear", 22: "zebra", 23: "giraffe", 24: "backpack", 25: "umbrella", 26: "handbag", 27: "tie", 28: "suitcase", 29: "frisbee", 30: "skis", 31: "snowboard", 32: "sports ball", 33: "kite", 34: "baseball bat", 35: "baseball glove", 36: "skateboard", 37: "surfboard", 38: "tennis racket", + 39: "bottle", 40: "wine glass", 41: "cup", 42: "fork", 43: "knife", 44: "spoon", 45: "bowl", 46: "banana", 47: "apple", 48: "sandwich", 49: "orange", 50: "broccoli", 51: "carrot", 52: "hot dog", 53: "pizza", 54: "donut", 55: "cake", 56: "chair", 57: "couch", 58: "potted plant", 59: "bed", 60: "dining table", 61: "toilet", 62: "tv", 63: "laptop", 64: "mouse", 65: "remote", 66: "keyboard", 67: "cell phone", 68: "microwave", 69: "oven", 70: "toaster", 71: "sink", 72: "refrigerator", 73: "book", 74: "clock", 75: "vase", 76: "scissors", 77: "teddy bear", 78: "hair drier", 79: "toothbrush", +} + +colors = np.random.uniform(0, 255, size=(len(CLASSES), 3)) + +# This is needed because calling this code directly in mojo gives error of numpy array circular base dependency + + +def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h): + label = f'{CLASSES[class_id]} ({confidence:.2f})' + color = colors[class_id] + cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2) + cv2.putText(img, label, (x - 10, y - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + +def get_image(image_path): + # load the image + original_image: np.ndarray = cv2.imread(image_path) + [height, width, _] = original_image.shape + length = max((height, width)) + image = np.zeros((length, length, 3), np.uint8) + image[0:height, 0:width] = original_image + scale = length / 640 + + blob = cv2.dnn.blobFromImage( + image, scalefactor=1 / 255, size=(640, 640), swapRB=True) + + return blob + + +def draw_bounding_box_yolo(image_path, outputs): + original_image: np.ndarray = cv2.imread(image_path) + [height, width, _] = original_image.shape + length = max((height, width)) + image = np.zeros((length, length, 3), np.uint8) + image[0:height, 0:width] = original_image + scale = length / 640 + + outputs = np.array([cv2.transpose(outputs[0])]) + rows = outputs.shape[1] + + boxes = [] + scores = [] + class_ids = [] + + for i in range(rows): + classes_scores = outputs[0][i][4:] + (minScore, maxScore, 
minClassLoc, (x, maxClassIndex) + ) = cv2.minMaxLoc(classes_scores) + if maxScore >= 0.25: + box = [ + outputs[0][i][0] - (0.5 * outputs[0][i][2] + ), outputs[0][i][1] - (0.5 * outputs[0][i][3]), + outputs[0][i][2], outputs[0][i][3]] + boxes.append(box) + scores.append(maxScore) + class_ids.append(maxClassIndex) + + result_boxes = cv2.dnn.NMSBoxes(boxes, scores, 0.25, 0.45, 0.5) + + detections = [] + for i in range(len(result_boxes)): + index = result_boxes[i] + box = boxes[index] + detection = { + 'class_id': class_ids[index], + 'class_name': CLASSES[class_ids[index]], + 'confidence': scores[index], + 'box': box, + 'scale': scale} + detections.append(detection) + draw_bounding_box(original_image, class_ids[index], scores[index], round(box[0] * scale), round(box[1] * scale), + round((box[0] + box[2]) * scale), round((box[1] + box[3]) * scale)) + + cv2.imshow('image', original_image) + cv2.waitKey(0) + cv2.destroyAllWindows() + + print(detections) diff --git a/examples/yolov8.mojo b/examples/yolov8.mojo new file mode 100644 index 0000000..8f76344 --- /dev/null +++ b/examples/yolov8.mojo @@ -0,0 +1,305 @@ +import basalt.nn as nn +from basalt import Tensor, TensorShape +from basalt import Graph, Symbol, OP, dtype +from basalt.autograd.attributes import AttributeVector, Attribute +from basalt.utils.tensor_creation_utils import to_tensor, to_numpy + +from python import Python +from math import ceil, max + + +fn Conv( + inout g: Graph, + x: Symbol, + out_channels: Int, + kernel_size: Int, + padding: Int, + stride: Int, +) -> Symbol: + # NOTE: This is functionally equivalent to the Conv2D -> BatchNorm2D (removed in graph) -> SiLU (According to ONNX) + var conv = nn.Conv2d(g, x, out_channels, kernel_size, padding, stride) + var sigmoid = g.op(OP.SIGMOID, conv) + return g.op(OP.MUL, conv, sigmoid) + + +fn Conv( + inout g: Graph, + x: Symbol, + weight: Symbol, + bias: Symbol, + kernel_size: StaticIntTuple[2], + padding: StaticIntTuple[2], + stride: StaticIntTuple[2], +) -> Symbol: + # NOTE: This is functionally equivalent to the Conv2D -> BatchNorm2D (removed in graph) -> SiLU (According to ONNX) + var conv = g.op(OP.CONV2D, x, weight, bias, attributes=AttributeVector( + Attribute("padding", padding), + Attribute("stride", stride), + Attribute("dilation", StaticIntTuple[2](1, 1)), + )) + var sigmoid = g.op(OP.SIGMOID, conv) + return g.op(OP.MUL, conv, sigmoid) + + +fn C2f( + inout g: Graph, + x: Symbol, + out_channels: Int, + n: Int, + shortcut: Bool +) -> Symbol: + var conv = Conv(g, x, out_channels, 1, 0, 1) + + var split_size = out_channels // 2 + var split_sections = List[Int](split_size, split_size) + var split = g.split(conv, split_sections, dim=1) + + # declare the weights for the last conv here because that is the order in onnx file + var n_temp = 1 + if n > 1: + n_temp = 2 + var weight = g.param(TensorShape(out_channels, split_size * (n + 2), 1, 1)) + var bias = g.param(TensorShape(out_channels)) + + @parameter + fn bottleneck( + x: Symbol, out_channels: Int, shortcut: Bool = False + ) -> Symbol: + var conv1 = Conv(g, x, out_channels, 3, 1, 1) + var conv2 = Conv(g, conv1, out_channels, 3, 1, 1) + + if shortcut: + return g.op(OP.ADD, x, conv2) + else: + return conv2 + + var y1 = bottleneck(split[1], split_size, shortcut) + var y2 = y1 + + var concat_list = List[Symbol]() # add ability to concat to receive a list, becauase the the concatenation has to be done for each bottleneck layer that was run + + # NOTE: This assumes n >= 1 (Could add a constrained for it later) + for i in range(1, n): + 
y2 = bottleneck(y2, split_size, shortcut) + # concat_list.append(y2) + + # add ability to concat to receive a list, becauase the the concatenation has to be done for each bottleneck layer that was run + var y: Symbol + if n > 1: + y = g.concat(split[0], split[1], y1, y2, dim=1) + else: + y = g.concat(split[0], split[1], y1, dim=1) + + return Conv(g, y, weight, bias, 1, 0, 1) + + +fn SPPF(inout g: Graph, x: Symbol, out_channels: Int) -> Symbol: + var conv = Conv(g, x, out_channels // 2, 1, 0, 1) + + var maxpool2d_1 = nn.MaxPool2d(g, conv, kernel_size=5, stride=StaticIntTuple[2](1), padding=2) + var maxpool2d_2 = nn.MaxPool2d(g, maxpool2d_1, kernel_size=5, stride=StaticIntTuple[2](1), padding=2) + var maxpool2d_3 = nn.MaxPool2d(g, maxpool2d_2, kernel_size=5, stride=StaticIntTuple[2](1), padding=2) + + var y = g.concat(conv, maxpool2d_1, maxpool2d_2, maxpool2d_3, dim=1) + + return Conv(g, y, out_channels, 1, 0, 1) + + +fn Detect(inout g: Graph, x: Symbol, out_channels: Int, nc: Int, detect_conv: Int) -> Symbol: + # self.nc = nc # number of classes + # self.nl = len(ch) # number of detection layers + # self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x) + # self.no = nc + self.reg_max * 4 # number of outputs per anchor + + var reg_max = 16 + + var c2 = max(max(16, out_channels // 4), reg_max * 4) + var c3 = max(0, nc) # channels + + if detect_conv == 1: + var conv1 = Conv(g, x, c2, 3, 1, 1) + var conv1_2 = Conv(g, conv1, c2, 3, 1, 1) + var conv1_3 = nn.Conv2d(g, conv1_2, 4 * reg_max, 1, 0, 1) + + return conv1_3 + else: + var conv2 = Conv(g, x, c3, 3, 1, 1) + var conv2_2 = Conv(g, conv2, c3, 3, 1, 1) + var conv2_3 = nn.Conv2d(g, conv2_2, nc, 1, 0, 1) + + return conv2_3 + + +fn YoloV8(batch_size: Int, yolo_model_type: StaticTuple[Float64, 3]) -> Graph: + var g = Graph() + var x = g.input(TensorShape(batch_size, 3, 640, 640)) + + # Adapted from https://private-user-images.githubusercontent.com/27466624/239739723-57391d0f-1848-4388-9f30-88c2fb79233f.jpg?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MTUxMTk0MDYsIm5iZiI6MTcxNTExOTEwNiwicGF0aCI6Ii8yNzQ2NjYyNC8yMzk3Mzk3MjMtNTczOTFkMGYtMTg0OC00Mzg4LTlmMzAtODhjMmZiNzkyMzNmLmpwZz9Y>LUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDA1MDclMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwNTA3VDIxNTgyNlomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTNlZTdkY2ZiMDA0Y2VlOGZkYjllN2FkYTQ1MTY5OWY1YzYwNjIxZDM4OTZiYWRiMGU5YWQxNzkyMTcwNGNmNTQmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.0ocPCiokkivvk95bQCds6Nt0EblUrHZElycV311ImF4. Some values (output_channels, stride, etc..) are different in the onnx file and the graph image. 
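+    # NOTE: for yolov8-n (d=0.33, w=0.25, r=2) the widths computed below work
+    # out to 16, 32, 64, 128 and 256 channels, and the C2f repeat counts to
+    # 1 and 2: int(n * d + 1) approximates ceil(n * d) for these fractional
+    # multipliers (int(3 * 0.33 + 1) = 1, int(6 * 0.33 + 1) = 2).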
+ + # Backbone + var out_channels_1 = int(64 * yolo_model_type[1]) + var conv_1 = Conv(g, x, out_channels_1, 3, 1, 2) + var out_channels_2 = int(128 * yolo_model_type[1]) + var conv_2 = Conv(g, conv_1, out_channels_2, 3, 1, 2) + var C2F_n_1 = int((3 * yolo_model_type[0]) + 1) # ceil + var C2f_1 = C2f(g, conv_2, out_channels_2, n=C2F_n_1, shortcut=True) + var out_channels_3 = int(256 * yolo_model_type[1]) + var conv_3 = Conv(g, C2f_1, out_channels_3, 3, 1, 2) + var C2F_n_2 = int((6 * yolo_model_type[0]) + 1) # ceil + var C2f_2 = C2f(g, conv_3, out_channels_3, n=C2F_n_2, shortcut=True) + + var out_channels_4 = int(512 * yolo_model_type[1]) + var conv_4 = Conv(g, C2f_2, out_channels_4, 3, 1, 2) + var C2f_3 = C2f(g, conv_4, out_channels_4, n=C2F_n_2, shortcut=True) + + var out_channels_5 = int(512 * yolo_model_type[1] * yolo_model_type[2]) + var conv_5 = Conv(g, C2f_3, out_channels_5, 3, 1, 2) + var C2f_4 = C2f(g, conv_5, out_channels_5, n=C2F_n_1, shortcut=True) + var SPPF_1 = SPPF(g, C2f_4, out_channels_5) + + # Head + var upsample_1 = g.op(OP.UPSAMPLE, SPPF_1, attributes=AttributeVector(Attribute("mode", "nearest"), Attribute("scales", TensorShape(2, 2)))) + + # The order of concats was wrong + var concat_1 = g.concat(upsample_1, C2f_3, dim=1) + + var out_channels_6 = int(512 * yolo_model_type[1]) + var C2f_5 = C2f(g, concat_1, out_channels_6, n=C2F_n_1, shortcut=False) + + var upsample_2 = g.op(OP.UPSAMPLE, C2f_5, attributes=AttributeVector(Attribute("mode", "nearest"), Attribute("scales", TensorShape(2, 2)))) + + var concat_2 = g.concat(upsample_2, C2f_2, dim=1) + + var out_channels_7 = int(256 * yolo_model_type[1]) + var C2f_6 = C2f(g, concat_2, out_channels_7, n=C2F_n_1, shortcut=False) + + var conv_6 = Conv(g, C2f_6, out_channels_7, 3, 1, 2) + var concat_3 = g.concat(conv_6, C2f_5, dim=1) + var C2f_7 = C2f(g, concat_3, out_channels_6, n=C2F_n_1, shortcut=False) + + var conv_7 = Conv(g, C2f_7, out_channels_6, 3, 1, 2) + var concat_4 = g.concat(conv_7, SPPF_1, dim=1) + var out_channels_8 = int(512 * yolo_model_type[1] * yolo_model_type[2]) + var C2f_8 = C2f(g, concat_4, out_channels_8, n=C2F_n_1, shortcut=False) + + # Detect + # declare them this way because the order of initializers in the onnx file is like this + var detect_1 = Detect(g, C2f_6, out_channels_7, 80, 1) + var detect_2 = Detect(g, C2f_7, out_channels_6, 80, 1) + var detect_3 = Detect(g, C2f_8, out_channels_8, 80, 1) + + var detect_1_1 = Detect(g, C2f_6, out_channels_7, 80, 2) + var detect_2_1 = Detect(g, C2f_7, out_channels_6, 80, 2) + var detect_3_1 = Detect(g, C2f_8, out_channels_8, 80, 2) + + var concat_detect_1 = g.concat(detect_1, detect_1_1, dim=1) + var concat_detect_2 = g.concat(detect_2, detect_2_1, dim=1) + var concat_detect_3 = g.concat(detect_3, detect_3_1, dim=1) + + # -------- output + var reshape_1 = g.op(OP.RESHAPE, concat_detect_1, attributes=AttributeVector(Attribute("shape", TensorShape(1, 144, concat_detect_1.shape[2] * concat_detect_1.shape[3])))) + + var reshape_2 = g.op(OP.RESHAPE, concat_detect_2, attributes=AttributeVector(Attribute("shape", TensorShape(1, 144, concat_detect_2.shape[2] * concat_detect_2.shape[3])))) + + var reshape_3 = g.op(OP.RESHAPE, concat_detect_3, attributes=AttributeVector(Attribute("shape", TensorShape(1, 144, concat_detect_3.shape[2] * concat_detect_3.shape[3])))) + + # -- + + var concat_5 = g.concat(reshape_1, reshape_2, reshape_3, dim=2) + + var split_sections = List[Int](64, 80) + var split_1 = g.split(concat_5, split_sections, dim=1) + + var for_second_concat = 
g.op(OP.SIGMOID, split_1[1]) + + var reshape_4 = g.op(OP.RESHAPE, split_1[0], attributes=AttributeVector(Attribute("shape", TensorShape(1, 4, 16, 8400)))) + + var transpose_1 = g.op(OP.TRANSPOSE, reshape_4, attributes=AttributeVector(Attribute("axes", List[Int](0, 2, 1, 3)))) + + var softmax = nn.Softmax(g, transpose_1, axis=1) + + var conv_norm_1 = nn.Conv2d(g, softmax, 1, 1, 0, 1, 1) + + var reshape_5 = g.op(OP.RESHAPE, conv_norm_1, attributes=AttributeVector(Attribute("shape", TensorShape(1, 4, 8400)))) + + var slice_1 = g.op(OP.SLICE, reshape_5, attributes=AttributeVector( + Attribute("axes", List[Int](1)), + Attribute("starts", List[Int](0)), + Attribute("ends", List[Int](2)))) + var slice_2 = g.op(OP.SLICE, reshape_5, attributes=AttributeVector( + Attribute("axes", List[Int](1)), + Attribute("starts", List[Int](2)), + Attribute("ends", List[Int](4)))) + + var sub_constant_value = g.input(TensorShape(1, 2, 8400)) + var sub_with_constant_1 = g.op(OP.SUB, sub_constant_value, slice_1) + var add_constant_value = g.input(TensorShape(1, 2, 8400)) + var add_with_constant_2 = g.op(OP.ADD, add_constant_value, slice_2) + + var add_1 = g.op(OP.ADD, sub_with_constant_1, add_with_constant_2) + var sub_1 = g.op(OP.SUB, add_with_constant_2, sub_with_constant_1) + + var div_1 = g.op(OP.DIV, add_1, 2) + + var concat_6 = g.concat(div_1, sub_1, dim=1) + + var mul_constant_value = g.input(TensorShape(1, 8400)) + var mul_with_constant_1 = g.op(OP.MUL, concat_6, mul_constant_value) + + var concat_7 = g.concat(mul_with_constant_1, for_second_concat, dim=1) + + g.out(concat_7) + + return g ^ + + +alias yolov8_n = StaticTuple[Float64, 3]( + 0.33, 0.25, 2 +) # d (depth_multiplier), w (width_multiplier), r (ratio) +# var yolov8_s +# var yolov8_m + + +fn get_constant_values_from_onnx_model(model_path: String) raises -> List[Tensor[dtype]]: + var onnx = Python.import_module("onnx") + + var model = onnx.load(model_path) + + var result = List[Tensor[dtype]]() + + for node in model.graph.node: + if node.op_type == "Constant": + for attr in node.attribute: + if attr.name == 'value': + var tensor = onnx.numpy_helper.to_array(attr.t) + if node.name == "/model.22/Constant_9": + result.append(to_tensor(tensor)) + if node.name == "/model.22/Constant_10": + result.append(to_tensor(tensor)) + if node.name == "/model.22/Constant_12": + result.append(to_tensor(tensor)) + + return result + + +fn main() raises: + alias graph = YoloV8(1, yolov8_n) + var model = nn.Model[graph]() + + # try: graph.render("node") + # except: print("Could not render graph") + + + model.load_model_data("./examples/data/yolov8n.onnx") + + var constant_values = get_constant_values_from_onnx_model("./examples/data/yolov8n.onnx") + + Python.add_to_path("./examples") + var get_image = Python.import_module("yolo_v8_utils") + + var image_tensor = to_tensor(get_image.get_image('./examples/data/bus.jpg')) + + + var res = model.inference(image_tensor, constant_values[0], constant_values[1], constant_values[2]) + + get_image.draw_bounding_box_yolo("bus.jpg", to_numpy(res[0])) \ No newline at end of file diff --git a/examples/yolov8n_onnx.py b/examples/yolov8n_onnx.py new file mode 100644 index 0000000..b880cad --- /dev/null +++ b/examples/yolov8n_onnx.py @@ -0,0 +1,159 @@ +import cv2.dnn +import numpy as np +import onnxruntime as rt + + +CLASSES = {0: "person", + 1: "bicycle", + 2: "car", + 3: "motorcycle", + 4: "airplane", + 5: "bus", + 6: "train", + 7: "truck", + 8: "boat", + 9: "traffic light", + 10: "fire hydrant", + 11: "stop sign", + 12: "parking meter", 
+ 13: "bench", + 14: "bird", + 15: "cat", + 16: "dog", + 17: "horse", + 18: "sheep", + 19: "cow", + 20: "elephant", + 21: "bear", + 22: "zebra", + 23: "giraffe", + 24: "backpack", + 25: "umbrella", + 26: "handbag", + 27: "tie", + 28: "suitcase", + 29: "frisbee", + 30: "skis", + 31: "snowboard", + 32: "sports ball", + 33: "kite", + 34: "baseball bat", + 35: "baseball glove", + 36: "skateboard", + 37: "surfboard", + 38: "tennis racket", + 39: "bottle", + 40: "wine glass", + 41: "cup", + 42: "fork", + 43: "knife", + 44: "spoon", + 45: "bowl", + 46: "banana", + 47: "apple", + 48: "sandwich", + 49: "orange", + 50: "broccoli", + 51: "carrot", + 52: "hot dog", + 53: "pizza", + 54: "donut", + 55: "cake", + 56: "chair", + 57: "couch", + 58: "potted plant", + 59: "bed", + 60: "dining table", + 61: "toilet", + 62: "tv", + 63: "laptop", + 64: "mouse", + 65: "remote", + 66: "keyboard", + 67: "cell phone", + 68: "microwave", + 69: "oven", + 70: "toaster", + 71: "sink", + 72: "refrigerator", + 73: "book", + 74: "clock", + 75: "vase", + 76: "scissors", + 77: "teddy bear", + 78: "hair drier", + 79: "toothbrush", +} + +colors = np.random.uniform(0, 255, size=(len(CLASSES), 3)) + + +def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h): + label = f'{CLASSES[class_id]} ({confidence:.2f})' + color = colors[class_id] + cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2) + cv2.putText(img, label, (x - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + +def main(onnx_model, input_image): + model: cv2.dnn.Net = cv2.dnn.readNetFromONNX(onnx_model) + original_image: np.ndarray = cv2.imread(input_image) + [height, width, _] = original_image.shape + length = max((height, width)) + image = np.zeros((length, length, 3), np.uint8) + image[0:height, 0:width] = original_image + scale = length / 640 + + blob = cv2.dnn.blobFromImage(image, scalefactor=1 / 255, size=(640, 640), swapRB=True) + # model.setInput(blob) + # outputs = model.forward() + # print(outputs.shape) + + model2 = rt.InferenceSession(onnx_model) + + outputs = model2.run(None, {"images": blob})[0] + + + outputs = np.array([cv2.transpose(outputs[0])]) + rows = outputs.shape[1] + + boxes = [] + scores = [] + class_ids = [] + + for i in range(rows): + classes_scores = outputs[0][i][4:] + (minScore, maxScore, minClassLoc, (x, maxClassIndex)) = cv2.minMaxLoc(classes_scores) + if maxScore >= 0.25: + box = [ + outputs[0][i][0] - (0.5 * outputs[0][i][2]), outputs[0][i][1] - (0.5 * outputs[0][i][3]), + outputs[0][i][2], outputs[0][i][3]] + boxes.append(box) + scores.append(maxScore) + class_ids.append(maxClassIndex) + + result_boxes = cv2.dnn.NMSBoxes(boxes, scores, 0.25, 0.45, 0.5) + + detections = [] + for i in range(len(result_boxes)): + index = result_boxes[i] + box = boxes[index] + detection = { + 'class_id': class_ids[index], + 'class_name': CLASSES[class_ids[index]], + 'confidence': scores[index], + 'box': box, + 'scale': scale} + detections.append(detection) + draw_bounding_box(original_image, class_ids[index], scores[index], round(box[0] * scale), round(box[1] * scale), + round((box[0] + box[2]) * scale), round((box[1] + box[3]) * scale)) + + cv2.imshow('image', original_image) + cv2.waitKey(0) + cv2.destroyAllWindows() + + + # print(detections) + + +main('examples/data/yolov8n.onnx', "examples/data/bus.jpg") \ No newline at end of file From c5a2d8d80c3c893fe21e22602f98bb5ecbc1341a Mon Sep 17 00:00:00 2001 From: StijnWoestenborghs Date: Mon, 3 Jun 2024 13:09:58 +0200 Subject: [PATCH 11/16] add yolov8 cam example --- 
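Note (below the fold, ignored by `git am`): the reworked get_model_input()
letterboxes the frame to a square with np.pad before cv2.dnn.blobFromImage,
and draw_bounding_box_yolo() recomputes the same length / 640 scale to map
detections back to original pixels. A minimal NumPy/OpenCV sketch of that
preprocessing, with letterbox_blob as a hypothetical standalone helper that
returns the scale alongside the blob for illustration:

    import cv2
    import numpy as np

    def letterbox_blob(img):
        # Pad bottom/right with zeros to a square, as get_model_input does.
        h, w, _ = img.shape
        length = max(h, w)
        square = np.pad(img, ((0, length - h), (0, length - w), (0, 0)),
                        mode="constant", constant_values=0)
        # Factor that maps 640-grid detections back to original pixels.
        scale = length / 640
        # Normalize to [0, 1], resize to 640x640, and swap BGR -> RGB.
        blob = cv2.dnn.blobFromImage(square, scalefactor=1 / 255,
                                     size=(640, 640), swapRB=True)
        return blob, scale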
examples/yolo_v8_utils.py | 141 +++++++++++++++++++++++++++------- examples/yolov8.mojo | 8 +- examples/yolov8_cam.mojo | 85 +++++++++++++++++++++ examples/yolov8n_onnx.py | 157 +++----------------------------------- 4 files changed, 213 insertions(+), 178 deletions(-) create mode 100644 examples/yolov8_cam.mojo diff --git a/examples/yolo_v8_utils.py b/examples/yolo_v8_utils.py index 1081068..e7b22e4 100644 --- a/examples/yolo_v8_utils.py +++ b/examples/yolo_v8_utils.py @@ -1,17 +1,94 @@ import cv2 import numpy as np + CLASSES = { - 0: "person", 1: "bicycle", 2: "car", 3: "motorcycle", 4: "airplane", 5: "bus", 6: "train", 7: "truck", 8: "boat", 9: "traffic light", 10: "fire hydrant", 11: "stop sign", 12: "parking meter", 13: "bench", 14: "bird", 15: "cat", 16: "dog", 17: "horse", 18: "sheep", 19: "cow", 20: "elephant", 21: "bear", 22: "zebra", 23: "giraffe", 24: "backpack", 25: "umbrella", 26: "handbag", 27: "tie", 28: "suitcase", 29: "frisbee", 30: "skis", 31: "snowboard", 32: "sports ball", 33: "kite", 34: "baseball bat", 35: "baseball glove", 36: "skateboard", 37: "surfboard", 38: "tennis racket", - 39: "bottle", 40: "wine glass", 41: "cup", 42: "fork", 43: "knife", 44: "spoon", 45: "bowl", 46: "banana", 47: "apple", 48: "sandwich", 49: "orange", 50: "broccoli", 51: "carrot", 52: "hot dog", 53: "pizza", 54: "donut", 55: "cake", 56: "chair", 57: "couch", 58: "potted plant", 59: "bed", 60: "dining table", 61: "toilet", 62: "tv", 63: "laptop", 64: "mouse", 65: "remote", 66: "keyboard", 67: "cell phone", 68: "microwave", 69: "oven", 70: "toaster", 71: "sink", 72: "refrigerator", 73: "book", 74: "clock", 75: "vase", 76: "scissors", 77: "teddy bear", 78: "hair drier", 79: "toothbrush", + 0: "person", + 1: "bicycle", + 2: "car", + 3: "motorcycle", + 4: "airplane", + 5: "bus", + 6: "train", + 7: "truck", + 8: "boat", + 9: "traffic light", + 10: "fire hydrant", + 11: "stop sign", + 12: "parking meter", + 13: "bench", + 14: "bird", + 15: "cat", + 16: "dog", + 17: "horse", + 18: "sheep", + 19: "cow", + 20: "elephant", + 21: "bear", + 22: "zebra", + 23: "giraffe", + 24: "backpack", + 25: "umbrella", + 26: "handbag", + 27: "tie", + 28: "suitcase", + 29: "frisbee", + 30: "skis", + 31: "snowboard", + 32: "sports ball", + 33: "kite", + 34: "baseball bat", + 35: "baseball glove", + 36: "skateboard", + 37: "surfboard", + 38: "tennis racket", + 39: "bottle", + 40: "wine glass", + 41: "cup", + 42: "fork", + 43: "knife", + 44: "spoon", + 45: "bowl", + 46: "banana", + 47: "apple", + 48: "sandwich", + 49: "orange", + 50: "broccoli", + 51: "carrot", + 52: "hot dog", + 53: "pizza", + 54: "donut", + 55: "cake", + 56: "chair", + 57: "couch", + 58: "potted plant", + 59: "bed", + 60: "dining table", + 61: "toilet", + 62: "tv", + 63: "laptop", + 64: "mouse", + 65: "remote", + 66: "keyboard", + 67: "cell phone", + 68: "microwave", + 69: "oven", + 70: "toaster", + 71: "sink", + 72: "refrigerator", + 73: "book", + 74: "clock", + 75: "vase", + 76: "scissors", + 77: "teddy bear", + 78: "hair drier", + 79: "toothbrush", } colors = np.random.uniform(0, 255, size=(len(CLASSES), 3)) -# This is needed because calling this code directly in mojo gives error of numpy array circular base dependency - -def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h): +def bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h): label = f'{CLASSES[class_id]} ({confidence:.2f})' color = colors[class_id] cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2) @@ -19,27 +96,22 @@ def draw_bounding_box(img, 
class_id, confidence, x, y, x_plus_w, y_plus_h): cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) -def get_image(image_path): +def get_model_input(image_path): # load the image original_image: np.ndarray = cv2.imread(image_path) - [height, width, _] = original_image.shape + height, width, _ = original_image.shape length = max((height, width)) - image = np.zeros((length, length, 3), np.uint8) - image[0:height, 0:width] = original_image - scale = length / 640 - - blob = cv2.dnn.blobFromImage( - image, scalefactor=1 / 255, size=(640, 640), swapRB=True) + image = np.pad(original_image, ((0, length - height), (0, length - width), (0, 0)), mode='constant', constant_values=0) + blob = cv2.dnn.blobFromImage(image, scalefactor=1 / 255, size=(640, 640), swapRB=True) return blob -def draw_bounding_box_yolo(image_path, outputs): - original_image: np.ndarray = cv2.imread(image_path) - [height, width, _] = original_image.shape +def draw_bounding_box_yolo(original_image, outputs): + height, width, _ = original_image.shape length = max((height, width)) image = np.zeros((length, length, 3), np.uint8) - image[0:height, 0:width] = original_image + image[:height, :width] = original_image scale = length / 640 outputs = np.array([cv2.transpose(outputs[0])]) @@ -51,13 +123,14 @@ def draw_bounding_box_yolo(image_path, outputs): for i in range(rows): classes_scores = outputs[0][i][4:] - (minScore, maxScore, minClassLoc, (x, maxClassIndex) - ) = cv2.minMaxLoc(classes_scores) + (_, maxScore, _, (_, maxClassIndex)) = cv2.minMaxLoc(classes_scores) if maxScore >= 0.25: box = [ - outputs[0][i][0] - (0.5 * outputs[0][i][2] - ), outputs[0][i][1] - (0.5 * outputs[0][i][3]), - outputs[0][i][2], outputs[0][i][3]] + outputs[0][i][0] - (0.5 * outputs[0][i][2]), + outputs[0][i][1] - (0.5 * outputs[0][i][3]), + outputs[0][i][2], + outputs[0][i][3] + ] boxes.append(box) scores.append(maxScore) class_ids.append(maxClassIndex) @@ -75,10 +148,26 @@ def draw_bounding_box_yolo(image_path, outputs): 'box': box, 'scale': scale} detections.append(detection) - draw_bounding_box(original_image, class_ids[index], scores[index], round(box[0] * scale), round(box[1] * scale), - round((box[0] + box[2]) * scale), round((box[1] + box[3]) * scale)) - - cv2.imshow('image', original_image) + + bounding_box( + original_image, + class_ids[index], + scores[index], + round(box[0] * scale), + round(box[1] * scale), + round((box[0] + box[2]) * scale), + round((box[1] + box[3]) * scale) + ) + + return detections + + +def draw_bbox_from_image(image_path, outputs): + image: np.ndarray = cv2.imread(image_path) + + detections = draw_bounding_box_yolo(image, outputs) + + cv2.imshow('image', image) cv2.waitKey(0) cv2.destroyAllWindows() diff --git a/examples/yolov8.mojo b/examples/yolov8.mojo index 8f76344..eb3c16c 100644 --- a/examples/yolov8.mojo +++ b/examples/yolov8.mojo @@ -295,11 +295,9 @@ fn main() raises: var constant_values = get_constant_values_from_onnx_model("./examples/data/yolov8n.onnx") Python.add_to_path("./examples") - var get_image = Python.import_module("yolo_v8_utils") - - var image_tensor = to_tensor(get_image.get_image('./examples/data/bus.jpg')) - + var yolo_utils = Python.import_module("yolo_v8_utils") + var image_tensor = to_tensor(yolo_utils.get_model_input('./examples/data/bus.jpg')) var res = model.inference(image_tensor, constant_values[0], constant_values[1], constant_values[2]) - get_image.draw_bounding_box_yolo("bus.jpg", to_numpy(res[0])) \ No newline at end of file + yolo_utils.draw_bbox_from_image("./examples/data/bus.jpg", 
to_numpy(res[0])) \ No newline at end of file diff --git a/examples/yolov8_cam.mojo b/examples/yolov8_cam.mojo new file mode 100644 index 0000000..1c8b2c3 --- /dev/null +++ b/examples/yolov8_cam.mojo @@ -0,0 +1,85 @@ +import sys +from time.time import now +from python.python import Python +from math import max + +from yolov8 import YoloV8, get_constant_values_from_onnx_model + +import basalt.nn as nn +from basalt import Tensor, TensorShape, dtype +from basalt.utils.tensor_creation_utils import to_tensor, to_numpy + + +fn cam( + inout model: nn.Model, + constants: List[Tensor[dtype]] +) raises: + + Python.add_to_path("./examples") + var yolo_utils = Python.import_module("yolo_v8_utils") + + var cv2 = Python.import_module("cv2") + var np = Python.import_module("numpy") + var cap = cv2.VideoCapture(0) + + if not cap.isOpened(): + print("Error: Could not open webcam") + sys.exit(1) + + var height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT).to_float64() + var width = cap.get(cv2.CAP_PROP_FRAME_WIDTH).to_float64() + var length = max(height, width) + var pads = np.array([0, length - height, 0, length - width, 0, 0], dtype=np.int32).reshape(3, 2) + + var last_time = now() + + while True: + var r = cap.read() + + if not r[0]: + print("Error: Could not read frame") + break + + var image = np.pad(r[1], pads, mode='constant', constant_values=0) + var blob = cv2.dnn.blobFromImage(image, scalefactor=1/255, size=(640, 640), swapRB=True) + + var res = model.inference(to_tensor(blob), constants[0], constants[1], constants[2]) + + yolo_utils.draw_bounding_box_yolo(r[1], to_numpy(res[0])) + cv2.imshow( + 'Basalt', + cv2.putText( + r[1], + "FPS: " + String(1e9 / (now() - last_time)), + (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.30, (0, 0, 0), 1, cv2.LINE_AA + ) + ) + + last_time = now() + if int(cv2.waitKey(1) & 0xFF) == 27 or cv2.getWindowProperty('Basalt', cv2.WND_PROP_VISIBLE) < 1: + cv2.destroyAllWindows() + sys.exit() + + +fn main(): + + alias yolov8_n = StaticTuple[Float64, 3]( + 0.33, 0.25, 2 + ) # d (depth_multiplier), w (width_multiplier), r (ratio) + + alias graph = YoloV8(1, yolov8_n) + var model = nn.Model[graph]() + + model.load_model_data("./examples/data/yolov8n.onnx") + + try: + var constant_values = get_constant_values_from_onnx_model("./examples/data/yolov8n.onnx") + + cam( + model, + constant_values + ) + + except e: + print("Error in cam() function") + print(e) diff --git a/examples/yolov8n_onnx.py b/examples/yolov8n_onnx.py index b880cad..5b4eb2d 100644 --- a/examples/yolov8n_onnx.py +++ b/examples/yolov8n_onnx.py @@ -2,158 +2,21 @@ import numpy as np import onnxruntime as rt - -CLASSES = {0: "person", - 1: "bicycle", - 2: "car", - 3: "motorcycle", - 4: "airplane", - 5: "bus", - 6: "train", - 7: "truck", - 8: "boat", - 9: "traffic light", - 10: "fire hydrant", - 11: "stop sign", - 12: "parking meter", - 13: "bench", - 14: "bird", - 15: "cat", - 16: "dog", - 17: "horse", - 18: "sheep", - 19: "cow", - 20: "elephant", - 21: "bear", - 22: "zebra", - 23: "giraffe", - 24: "backpack", - 25: "umbrella", - 26: "handbag", - 27: "tie", - 28: "suitcase", - 29: "frisbee", - 30: "skis", - 31: "snowboard", - 32: "sports ball", - 33: "kite", - 34: "baseball bat", - 35: "baseball glove", - 36: "skateboard", - 37: "surfboard", - 38: "tennis racket", - 39: "bottle", - 40: "wine glass", - 41: "cup", - 42: "fork", - 43: "knife", - 44: "spoon", - 45: "bowl", - 46: "banana", - 47: "apple", - 48: "sandwich", - 49: "orange", - 50: "broccoli", - 51: "carrot", - 52: "hot dog", - 53: "pizza", - 54: "donut", - 55: "cake", - 56: 
"chair", - 57: "couch", - 58: "potted plant", - 59: "bed", - 60: "dining table", - 61: "toilet", - 62: "tv", - 63: "laptop", - 64: "mouse", - 65: "remote", - 66: "keyboard", - 67: "cell phone", - 68: "microwave", - 69: "oven", - 70: "toaster", - 71: "sink", - 72: "refrigerator", - 73: "book", - 74: "clock", - 75: "vase", - 76: "scissors", - 77: "teddy bear", - 78: "hair drier", - 79: "toothbrush", -} - -colors = np.random.uniform(0, 255, size=(len(CLASSES), 3)) - - -def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h): - label = f'{CLASSES[class_id]} ({confidence:.2f})' - color = colors[class_id] - cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2) - cv2.putText(img, label, (x - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) +from yolo_v8_utils import get_model_input, draw_bbox_from_image def main(onnx_model, input_image): + blob = get_model_input(input_image) + model: cv2.dnn.Net = cv2.dnn.readNetFromONNX(onnx_model) - original_image: np.ndarray = cv2.imread(input_image) - [height, width, _] = original_image.shape - length = max((height, width)) - image = np.zeros((length, length, 3), np.uint8) - image[0:height, 0:width] = original_image - scale = length / 640 - - blob = cv2.dnn.blobFromImage(image, scalefactor=1 / 255, size=(640, 640), swapRB=True) - # model.setInput(blob) - # outputs = model.forward() - # print(outputs.shape) - - model2 = rt.InferenceSession(onnx_model) - - outputs = model2.run(None, {"images": blob})[0] - - - outputs = np.array([cv2.transpose(outputs[0])]) - rows = outputs.shape[1] - - boxes = [] - scores = [] - class_ids = [] - - for i in range(rows): - classes_scores = outputs[0][i][4:] - (minScore, maxScore, minClassLoc, (x, maxClassIndex)) = cv2.minMaxLoc(classes_scores) - if maxScore >= 0.25: - box = [ - outputs[0][i][0] - (0.5 * outputs[0][i][2]), outputs[0][i][1] - (0.5 * outputs[0][i][3]), - outputs[0][i][2], outputs[0][i][3]] - boxes.append(box) - scores.append(maxScore) - class_ids.append(maxClassIndex) - - result_boxes = cv2.dnn.NMSBoxes(boxes, scores, 0.25, 0.45, 0.5) - - detections = [] - for i in range(len(result_boxes)): - index = result_boxes[i] - box = boxes[index] - detection = { - 'class_id': class_ids[index], - 'class_name': CLASSES[class_ids[index]], - 'confidence': scores[index], - 'box': box, - 'scale': scale} - detections.append(detection) - draw_bounding_box(original_image, class_ids[index], scores[index], round(box[0] * scale), round(box[1] * scale), - round((box[0] + box[2]) * scale), round((box[1] + box[3]) * scale)) - - cv2.imshow('image', original_image) - cv2.waitKey(0) - cv2.destroyAllWindows() + model = rt.InferenceSession(onnx_model) + outputs = model.run(None, {"images": blob})[0] - # print(detections) + draw_bbox_from_image(input_image, outputs) -main('examples/data/yolov8n.onnx', "examples/data/bus.jpg") \ No newline at end of file +main( + onnx_model='examples/data/yolov8n.onnx', + input_image="examples/data/bus.jpg" +) \ No newline at end of file From 5cc351e449a1acce03b5f88e6e00f17282638a8d Mon Sep 17 00:00:00 2001 From: NKspartan Date: Tue, 4 Jun 2024 22:58:34 -0600 Subject: [PATCH 12/16] Made assert_tensors_equal vectorized --- tests/testing_utils.mojo | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/testing_utils.mojo b/tests/testing_utils.mojo index 40bb73b..3706f54 100644 --- a/tests/testing_utils.mojo +++ b/tests/testing_utils.mojo @@ -1,6 +1,7 @@ from python.python import Python from collections import OptionalReg from testing import 
assert_equal, assert_almost_equal +from algorithm import vectorize from basalt import dtype from basalt.autograd import Graph, OP @@ -20,13 +21,24 @@ fn assert_tensors_equal[ mode == "exact" or mode == "almost", "Mode must be either 'exact' or 'almost'" ]() + alias nelts = simdwidthof[dtype]() + assert_equal(t1.shape(), t2.shape(), "Tensor shape mismatch") - for i in range(t1.num_elements()): + @parameter + fn v_iter[nelts: Int](i: Int) raises: + @parameter if mode == "almost": - assert_almost_equal(t1[i], t2[i], rtol=1e-5, atol=1e-5, msg=msg) + assert_almost_equal(t1.load[nelts](i), t2.load[nelts](i), rtol=1e-5, atol=1e-5, msg=msg) else: - assert_equal(t1[i], t2[i], msg=msg) + assert_equal(t1.load[nelts](i), t2.load[nelts](i), msg=msg) + + for i in range(0, t1.num_elements() - nelts + 1, nelts): + v_iter[nelts](i) + + # Check the remaining elements + for i in range(nelts * (t1.num_elements() // nelts), t1.num_elements()): + v_iter[1](i) fn test_unary_op[ From 4af6f50015018536ee8b354900d74358e1a4b816 Mon Sep 17 00:00:00 2001 From: NKspartan Date: Tue, 4 Jun 2024 23:00:33 -0600 Subject: [PATCH 13/16] Added linear upsample and more torch tests --- basalt/autograd/ops/mlops.mojo | 137 +++++++++++++++++++++++++---- tests/python/test_mlops_torch.mojo | 33 +++++-- 2 files changed, 150 insertions(+), 20 deletions(-) diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index 995e0c4..4713862 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -1,6 +1,7 @@ from algorithm import vectorize, parallelize -from math import exp, pow, max, min, abs +from math import exp, pow, max, min, abs, ceil, floor from math.limit import min_finite, max_finite +from memory import stack_allocation from basalt import Tensor, TensorShape from basalt.utils.tensorutils import elwise_transform @@ -667,30 +668,136 @@ struct UPSAMPLE: t1_shape: TensorShape, attributes: AttributeVector, ](inout res: Tensor[dtype], t1: Tensor[dtype]): - # Input is [N, C, D in, H in, W in], N is batch size and C is number of channels. Ranks 3-D, 4-D or 5-D tensors. - alias scales = attributes["scales"].value().to_shape() # Has to match input size (the last dimensions D, H and W) or just be one value + # Input is [N, C, D in, H in, W in], N is batch size and C is number of channels. Ranks 3-D, 4-D or 5-D tensors (only works on the spatial dimensions). 
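+        # NOTE: the two coordinate transforms selected below follow ONNX Resize
+        # semantics: "half_pixel" maps output index o to input coordinate
+        # max(0, (o + 0.5) / scale - 0.5), while "asymmetric" maps it to
+        # o // scale (floor division).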
+ alias scales = attributes["scales"].value().to_shape() # Has to match spatial input dims (the last dimensions D, H and W) alias mode = attributes["mode"].value().to_string() + # alias align_corners = attributes["align_corners"].value().to_bool() if attributes["align_corners"] else false + + @parameter + fn get_coordination_mode() -> String: + if mode == "linear": + return "half_pixel" + else: + return "asymmetric" + alias coordination_transforamtion = get_coordination_mode() alias strides = t1_shape.strides() - alias total_length = t1_shape.num_elements() + var strides_res = res.strides() + + var res_shape = res.shape() - alias first_loop = total_length // strides[1] + alias first_loop = t1_shape[0] * t1_shape[1] + + @always_inline + fn pos_asymmetric(pos: Int, scale: Int) -> Int: + return pos // scale + + @always_inline + fn pos_half_pixel(pos: Int, scale: Int) -> Float64: + return max(0.0, (pos + 0.5) / scale - 0.5) - var strides_res = res.strides() @parameter - if mode == "nearest": + @always_inline + fn get_value_interpolate[size: Int]( + indeces_t1: StaticTuple[Float64, size], + index_t1_sum: Float64 + ) -> SIMD[t1.dtype, 1]: @parameter - fn p_iter(i: Int): - var offset = i * strides[1] - var offset_res = i * strides_res[1] + if mode == "nearest": + return t1[int(index_t1_sum)] + elif mode == "linear": + var t1_pos_floor = floor(indeces_t1[1]) + var t1_pos_ceil = min(ceil(indeces_t1[1]), t1_shape[2] - 1) + + var v1 = t1[int(indeces_t1[0]) + int(t1_pos_floor)] + var v2 = t1[int(indeces_t1[0]) + int(t1_pos_ceil)] + + return v1 + (v2 - v1) * (indeces_t1[1] - t1_pos_floor) + else: + return 0 + + @always_inline + fn get_t1_position( + pos: Int, scale: Int, dim: Int + ) -> Float64: + @parameter + if coordination_transforamtion == "asymmetric": + return pos_asymmetric(pos, scale) + elif coordination_transforamtion == "half_pixel": + return pos_half_pixel(pos, scale) + else: + return 0 + + @parameter + fn p_iter(i: Int): + var offset_t1 = i * strides[1] + var offset_res = i * strides_res[1] + + @parameter + if t1_shape.rank() == 3: + var positions_t1 = StaticTuple[Float64, 2](0) + var positions_res = StaticIntTuple[2](0) + + positions_res[0] = offset_res + positions_t1[0] = offset_t1 + + @parameter + fn v_iter[nelts: Int](j: Int): + positions_res[1] = j + + var index_res = positions_res[0] + positions_res[1] + var values = res.load[nelts](index_res) + + for k in range(nelts): + positions_t1[1] = get_t1_position(j + k, scales[scales.rank() - 1], 0) + + values[k] = get_value_interpolate( + positions_t1, + positions_t1[0] + positions_t1[1]) + + res.store[nelts](index_res, values) + - Self.recursive_iter[2, t1_shape, scales]( - res, t1, strides_res, offset, offset_res) + vectorize[v_iter, nelts](res_shape[res.rank() - 1]) + elif t1_shape.rank() == 4: + var positions_t1 = StaticTuple[Float64, 3](0) + var positions_res = StaticIntTuple[3](0) - parallelize[p_iter](first_loop) - else: - pass + positions_res[0] = offset_res + positions_t1[0] = offset_t1 + + for j in range(res_shape[2]): + positions_res[1] = j * strides_res[2] + positions_t1[1] = get_t1_position(j, scales[0], 0) * strides[2] + + @parameter + fn v_iter_1[nelts: Int](k: Int): + positions_res[2] = k + + var index_res = positions_res[0] + positions_res[1] + positions_res[2] + var values = res.load[nelts](index_res) + + for l in range(nelts): + positions_t1[2] = get_t1_position(k + l, scales[scales.rank() - 1], 1) + + values[l] = get_value_interpolate( + positions_t1, + positions_t1[0] + positions_t1[1] + positions_t1[2]) + + 
res.store[nelts](index_res, values) + + vectorize[v_iter_1, nelts](res_shape[res.rank() - 1]) + + elif t1_shape.rank() == 5: + for j in range(res.shape()[2]): + for k in range(res.shape()[3]): + pass + else: + # Error + pass + + parallelize[p_iter](first_loop) @staticmethod fn backward[ diff --git a/tests/python/test_mlops_torch.mojo b/tests/python/test_mlops_torch.mojo index 4821d52..ccf9b2f 100644 --- a/tests/python/test_mlops_torch.mojo +++ b/tests/python/test_mlops_torch.mojo @@ -442,21 +442,44 @@ fn test_SLICE() raises: fn test_UPSAMPLE() raises: - alias t1_shape = TensorShape(40, 40, 120, 120) - var t1: Tensor[dtype] = Tensor[dtype](t1_shape) + alias t1_shape = TensorShape(41, 41, 43) + var t1 = Tensor[dtype](t1_shape) rand(t1.data(), t1.num_elements()) alias attributes = AttributeVector( - Attribute("scales", TensorShape(2, 2)), - Attribute("mode", "nearest") + Attribute("scales", TensorShape(3)), + Attribute("mode", "linear") ) - alias ug_shape = TensorShape(40, 40, 240, 240) + alias ug_shape = TensorShape(41, 41, 129) var ug = Tensor[dtype](ug_shape) var expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes) test_unary_op[OP.UPSAMPLE, t1_shape, attributes](t1, expected_and_grad.expected) + alias attributes_2 = AttributeVector( + Attribute("scales", TensorShape(3)), + Attribute("mode", "nearest") + ) + + expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_2) + test_unary_op[OP.UPSAMPLE, t1_shape, attributes_2](t1, expected_and_grad.expected) + + alias t1_shape_1 = TensorShape(40, 40, 120, 120) + t1 = Tensor[dtype](t1_shape_1) + rand(t1.data(), t1.num_elements()) + + alias attributes_3 = AttributeVector( + Attribute("scales", TensorShape(2, 3)), + Attribute("mode", "nearest") + ) + + alias ug_shape_1 = TensorShape(40, 40, 240, 360) + ug = Tensor[dtype](ug_shape_1) + + expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_3) + test_unary_op[OP.UPSAMPLE, t1_shape_1, attributes_3](t1, expected_and_grad.expected) + fn main(): print("Running mlops (compare with torch) tests") From 6295ce64b82eb37fe6aed322242035806b8591dc Mon Sep 17 00:00:00 2001 From: NKspartan Date: Thu, 6 Jun 2024 17:23:08 -0600 Subject: [PATCH 14/16] Added 5D ability to upsample --- basalt/autograd/ops/mlops.mojo | 30 +++++++++++++++++++++++++++++- tests/python/test_mlops_torch.mojo | 22 ++++++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index 4713862..58841f7 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -729,6 +729,7 @@ struct UPSAMPLE: else: return 0 + # It is possible to use gather here; the only problem is being able to create a SIMD arange in vectorized form (if it is built with a for loop it is probably no faster). (And from tests gather seems to be slower, maybe because of all the casts involved and because the arange of positions is not vectorized.) @parameter fn p_iter(i: Int): var offset_t1 = i * strides[1] @@ -790,9 +791,36 @@ struct UPSAMPLE: vectorize[v_iter_1, nelts](res_shape[res.rank() - 1]) elif t1_shape.rank() == 5: + var positions_t1 = StaticTuple[Float64, 4](0) + var positions_res = StaticIntTuple[4](0) + + positions_res[0] = offset_res + positions_t1[0] = offset_t1 + for j in range(res.shape()[2]): + positions_res[1] = j * strides_res[2] + positions_t1[1] = get_t1_position(j, scales[0], 0) * strides[2] for k in range(res.shape()[3]): - pass + positions_res[2] = k * strides_res[3] + positions_t1[2] = get_t1_position(k, scales[1], 1) * strides[3] + + @parameter + fn v_iter_2[nelts: Int](l: Int): + positions_res[3] = l + + var index_res = positions_res[0] + positions_res[1] + positions_res[2] + positions_res[3] + var values = res.load[nelts](index_res) + + for m in range(nelts): + positions_t1[3] = get_t1_position(l + m, scales[scales.rank() - 1], 2) + + values[m] = get_value_interpolate( + positions_t1, + positions_t1[0] + positions_t1[1] + positions_t1[2] + positions_t1[3]) + + res.store[nelts](index_res, values) + + vectorize[v_iter_2, nelts](res_shape[res.rank() - 1]) else: # Error pass diff --git a/tests/python/test_mlops_torch.mojo b/tests/python/test_mlops_torch.mojo index ccf9b2f..a3ae6e3 100644 --- a/tests/python/test_mlops_torch.mojo +++ b/tests/python/test_mlops_torch.mojo @@ -448,18 +448,19 @@ fn test_UPSAMPLE() raises: alias attributes = AttributeVector( Attribute("scales", TensorShape(3)), - Attribute("mode", "linear") + Attribute("mode", "nearest") ) alias ug_shape = TensorShape(41, 41, 129) var ug = Tensor[dtype](ug_shape) + rand(ug.data(), ug.num_elements()) var expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes) test_unary_op[OP.UPSAMPLE, t1_shape, attributes](t1, expected_and_grad.expected) alias attributes_2 = AttributeVector( Attribute("scales", TensorShape(3)), - Attribute("mode", "nearest") + Attribute("mode", "linear") ) expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_2) test_unary_op[OP.UPSAMPLE, t1_shape, attributes_2](t1, expected_and_grad.expected) @@ -476,10 +477,27 @@ fn test_UPSAMPLE() raises: alias ug_shape_1 = TensorShape(40, 40, 240, 360) ug = Tensor[dtype](ug_shape_1) + rand(ug.data(), ug.num_elements()) expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_3) test_unary_op[OP.UPSAMPLE, t1_shape_1, attributes_3](t1, expected_and_grad.expected) + alias t1_shape_2 = TensorShape(5, 5, 10, 20, 60) + t1 = Tensor[dtype](t1_shape_2) + rand(t1.data(), t1.num_elements()) + + alias attributes_4 = AttributeVector( + Attribute("scales", TensorShape(2, 3, 4)), + Attribute("mode", "nearest") + ) + + alias ug_shape_2 = TensorShape(5, 5, 20, 60, 240) + ug = Tensor[dtype](ug_shape_2) + rand(ug.data(), ug.num_elements()) + + expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_4) + test_unary_op[OP.UPSAMPLE, t1_shape_2, attributes_4](t1, expected_and_grad.expected) + fn main(): print("Running mlops (compare with torch) tests") From de1fb4149736cd17cf3f7befe57f838045db142f Mon Sep 17 00:00:00 2001 From: NKspartan Date: Thu, 6 Jun 2024 18:40:46 -0600 Subject: [PATCH 15/16] Added bilinear op --- basalt/autograd/ops/mlops.mojo | 49 ++++++++++++++++++++---------- tests/python/test_mlops_torch.mojo | 18 ++++++++--- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index 58841f7..e06dbe1 100644 --- a/basalt/autograd/ops/mlops.mojo
+++ b/basalt/autograd/ops/mlops.mojo @@ -675,7 +675,7 @@ struct UPSAMPLE: @parameter fn get_coordination_mode() -> String: - if mode == "linear": + if mode == "linear" or mode == "bilinear": return "half_pixel" else: return "asymmetric" @@ -700,12 +700,16 @@ struct UPSAMPLE: @parameter @always_inline fn get_value_interpolate[size: Int]( - indeces_t1: StaticTuple[Float64, size], - index_t1_sum: Float64 + indeces_t1: StaticTuple[Float64, size] ) -> SIMD[t1.dtype, 1]: @parameter if mode == "nearest": - return t1[int(index_t1_sum)] + var indeces_t1_sum = indeces_t1[0] + @unroll + for i in range(1, size): + indeces_t1_sum += indeces_t1[i] * strides[i + 1] + + return t1[int(indeces_t1_sum)] elif mode == "linear": var t1_pos_floor = floor(indeces_t1[1]) var t1_pos_ceil = min(ceil(indeces_t1[1]), t1_shape[2] - 1) @@ -714,6 +718,25 @@ struct UPSAMPLE: var v2 = t1[int(indeces_t1[0]) + int(t1_pos_ceil)] return v1 + (v2 - v1) * (indeces_t1[1] - t1_pos_floor) + elif mode == "bilinear": + var t1_pos_floor_y = floor(indeces_t1[1]) + var t1_pos_ceil_y = min(ceil(indeces_t1[1]), t1_shape[2] - 1) + + var t1_pos_floor_x = floor(indeces_t1[2]) + var t1_pos_ceil_x = min(ceil(indeces_t1[2]), t1_shape[3] - 1) + + var v1 = t1[int(indeces_t1[0]) + int(t1_pos_floor_y) * strides[2] + int(t1_pos_floor_x) * strides[3]] + var v2 = t1[int(indeces_t1[0]) + int(t1_pos_floor_y) * strides[2] + int(t1_pos_ceil_x) * strides[3]] + var v3 = t1[int(indeces_t1[0]) + int(t1_pos_ceil_y) * strides[2] + int(t1_pos_floor_x) * strides[3]] + var v4 = t1[int(indeces_t1[0]) + int(t1_pos_ceil_y) * strides[2] + int(t1_pos_ceil_x) * strides[3]] + + var wy = indeces_t1[1] - t1_pos_floor_y + var wx = indeces_t1[2] - t1_pos_floor_x + + var top_interp = v1 + (v2 - v1) * wx + var bottom_interp = v3 + (v4 - v3) * wx + + return top_interp + (bottom_interp - top_interp) * wy else: return 0 @@ -753,9 +776,7 @@ struct UPSAMPLE: for k in range(nelts): positions_t1[1] = get_t1_position(j + k, scales[scales.rank() - 1], 0) - values[k] = get_value_interpolate( - positions_t1, - positions_t1[0] + positions_t1[1]) + values[k] = get_value_interpolate(positions_t1) res.store[nelts](index_res, values) @@ -770,7 +791,7 @@ struct UPSAMPLE: for j in range(res_shape[2]): positions_res[1] = j * strides_res[2] - positions_t1[1] = get_t1_position(j, scales[0], 0) * strides[2] + positions_t1[1] = get_t1_position(j, scales[0], 0) @parameter fn v_iter_1[nelts: Int](k: Int): @@ -782,9 +803,7 @@ struct UPSAMPLE: for l in range(nelts): positions_t1[2] = get_t1_position(k + l, scales[scales.rank() - 1], 1) - values[l] = get_value_interpolate( - positions_t1, - positions_t1[0] + positions_t1[1] + positions_t1[2]) + values[l] = get_value_interpolate(positions_t1) res.store[nelts](index_res, values) @@ -799,10 +818,10 @@ struct UPSAMPLE: for j in range(res.shape()[2]): positions_res[1] = j * strides_res[2] - positions_t1[1] = get_t1_position(j, scales[0], 0) * strides[2] + positions_t1[1] = get_t1_position(j, scales[0], 0) for k in range(res.shape()[3]): positions_res[2] = k * strides_res[3] - positions_t1[2] = get_t1_position(k, scales[1], 1) * strides[3] + positions_t1[2] = get_t1_position(k, scales[1], 1) @parameter fn v_iter_2[nelts: Int](l: Int): @@ -814,9 +833,7 @@ struct UPSAMPLE: for m in range(nelts): positions_t1[3] = get_t1_position(l + m, scales[scales.rank() - 1], 2) - values[m] = get_value_interpolate( - positions_t1, - positions_t1[0] + positions_t1[1] + positions_t1[2] + positions_t1[3]) + values[m] = get_value_interpolate(positions_t1) 
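# (Editor's note.) The "bilinear" branch added above is the standard two-stage
# lerp over a 2x2 neighbourhood, with weights wx, wy in [0, 1]:
#   top    = v1 * (1 - wx) + v2 * wx
#   bottom = v3 * (1 - wx) + v4 * wx
#   out    = top * (1 - wy) + bottom * wy
# which expands to v1*(1-wx)*(1-wy) + v2*wx*(1-wy) + v3*(1-wx)*wy + v4*wx*wy.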
res.store[nelts](index_res, values) diff --git a/tests/python/test_mlops_torch.mojo b/tests/python/test_mlops_torch.mojo index a3ae6e3..69a4a12 100644 --- a/tests/python/test_mlops_torch.mojo +++ b/tests/python/test_mlops_torch.mojo @@ -466,7 +466,7 @@ fn test_UPSAMPLE() raises: expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_2) test_unary_op[OP.UPSAMPLE, t1_shape, attributes_2](t1, expected_and_grad.expected) - alias t1_shape_1 = TensorShape(40, 40, 120, 120) + alias t1_shape_1 = TensorShape(20, 20, 120, 120) t1 = Tensor[dtype](t1_shape_1) rand(t1.data(), t1.num_elements()) @@ -475,18 +475,26 @@ fn test_UPSAMPLE() raises: Attribute("mode", "nearest") ) - alias ug_shape_1 = TensorShape(40, 40, 240, 360) + alias ug_shape_1 = TensorShape(20, 20, 240, 360) ug = Tensor[dtype](ug_shape_1) rand(ug.data(), ug.num_elements()) expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_3) test_unary_op[OP.UPSAMPLE, t1_shape_1, attributes_3](t1, expected_and_grad.expected) + alias attributes_4 = AttributeVector( + Attribute("scales", TensorShape(2, 3)), + Attribute("mode", "bilinear") + ) + + expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_4) + test_unary_op[OP.UPSAMPLE, t1_shape_1, attributes_4](t1, expected_and_grad.expected) + alias t1_shape_2 = TensorShape(5, 5, 10, 20, 60) t1 = Tensor[dtype](t1_shape_2) rand(t1.data(), t1.num_elements()) - alias attributes_4 = AttributeVector( + alias attributes_5 = AttributeVector( Attribute("scales", TensorShape(2, 3, 4)), Attribute("mode", "nearest") ) @@ -495,8 +503,8 @@ fn test_UPSAMPLE() raises: ug = Tensor[dtype](ug_shape_2) rand(ug.data(), ug.num_elements()) - expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_4) - test_unary_op[OP.UPSAMPLE, t1_shape_2, attributes_4](t1, expected_and_grad.expected) + expected_and_grad = torch_unary_op(OP.UPSAMPLE, t1, ug, attributes_5) + test_unary_op[OP.UPSAMPLE, t1_shape_2, attributes_5](t1, expected_and_grad.expected) fn main(): From 11e84112320fadbd8049e1cd00de445fb85cc663 Mon Sep 17 00:00:00 2001 From: NKspartan Date: Fri, 19 Jul 2024 17:58:42 -0600 Subject: [PATCH 16/16] Merged with main --- basalt/autograd/attributes.mojo | 18 +- basalt/autograd/graph.mojo | 2 +- basalt/autograd/ops/basics.mojo | 16 +- basalt/autograd/ops/conv.mojo | 1 - basalt/autograd/ops/dynamics.mojo | 8 +- basalt/autograd/ops/matmul.mojo | 2 +- basalt/autograd/ops/mlops.mojo | 248 +++++++++---- basalt/autograd/ops/ops.mojo | 84 ++++- basalt/autograd/ops/pool.mojo | 6 +- basalt/nn/__init__.mojo | 9 +- basalt/nn/activations.mojo | 13 +- basalt/nn/model.mojo | 17 +- basalt/nn/optim.mojo | 12 +- basalt/nn/tensor.mojo | 15 +- basalt/utils/bytes.mojo | 16 +- basalt/utils/collection.mojo | 16 +- basalt/utils/datasets.mojo | 6 +- basalt/utils/itertools.mojo | 2 +- basalt/utils/math_util.mojo | 41 +++ basalt/utils/onnx_utils.mojo | 8 +- basalt/utils/perf_utils.mojo | 17 +- basalt/utils/rand_utils.mojo | 3 +- basalt/utils/tensor_creation_utils.mojo | 18 +- basalt/utils/tensorutils.mojo | 12 +- examples/yolov8.mojo | 3 +- examples/yolov8_cam.mojo | 2 +- tests/mojo/test_activations.mojo | 56 ++- tests/mojo/test_mlops.mojo | 409 +++++++++++++++------ tests/mojo/test_tensorutils.mojo | 5 +- tests/mojo/test_tensorutils_data.mojo | 3 +- tests/python/test_mlops_torch.mojo | 224 ++++++++--- tests/python/test_models_mnist.mojo | 2 +- tests/python/test_models_regression.mojo | 2 +- tests/python/test_models_sin_estimate.mojo | 4 +- 34 files changed, 915 insertions(+), 385 deletions(-) create mode 
100644 basalt/utils/math_util.mojo diff --git a/basalt/autograd/attributes.mojo b/basalt/autograd/attributes.mojo index 5a57a9a..3e1c3b3 100644 --- a/basalt/autograd/attributes.mojo +++ b/basalt/autograd/attributes.mojo @@ -1,4 +1,5 @@ from collections import Optional, OptionalReg +from utils.static_tuple import StaticTuple from basalt.nn.tensor import Tensor, TensorShape, MAX_RANK from basalt.utils.bytes import Bytes, scalar_to_bytes, bytes_to_scalar @@ -45,9 +46,8 @@ struct AttributeVector(Sized, Stringable, CollectionElement): var attributes: StaticTuple[Attribute, MAX_ATTRS] var size: Int - @always_inline("nodebug") fn __init__(inout self, *attributes: Attribute): - self.attributes = StaticTuple[Attribute, MAX_ATTRS]() + self.attributes = StaticTuple[Attribute, MAX_ATTRS](Attribute("", "")) self.size = len(attributes) for i in range(self.size): self.attributes[i] = attributes[i] @@ -67,12 +67,10 @@ struct AttributeVector(Sized, Stringable, CollectionElement): return self.attributes[i] return None - @always_inline("nodebug") fn append(inout self, attribute: Attribute): self.attributes[self.size] = attribute self.size += 1 - @always_inline("nodebug") fn __str__(self) -> String: var s: String = "[" for i in range(self.size): @@ -90,7 +88,6 @@ struct Attribute(Stringable, CollectionElement): var type: AttributeType var size: Int - @always_inline("nodebug") fn __init__(inout self, name: String, value: String): self.data_shape = StaticIntTuple[MAX_RANK]() self.name = Bytes[MAX_NAME_CHARS](name) @@ -98,7 +95,6 @@ struct Attribute(Stringable, CollectionElement): self.type = AttributeType.STRING self.size = len(value) - @always_inline("nodebug") fn __init__(inout self, name: String, value: TensorShape): self.data_shape = StaticIntTuple[MAX_RANK]() self.name = Bytes[MAX_NAME_CHARS](name) @@ -109,7 +105,6 @@ struct Attribute(Stringable, CollectionElement): for i in range(self.size): self.data_shape[i] = value._shape[i] - @always_inline("nodebug") fn __init__[N: Int](inout self, name: String, value: StaticIntTuple[N]): constrained[N < MAX_RANK, "Attribute rank must be less than MAX_RANK."]() @@ -122,7 +117,6 @@ struct Attribute(Stringable, CollectionElement): for i in range(self.size): self.data_shape[i] = value[i] - @always_inline("nodebug") fn __init__[dtype: DType](inout self, name: String, value: Scalar[dtype]): constrained[dtype.is_numeric(), "Attribute value must be numeric."]() @@ -132,29 +126,23 @@ struct Attribute(Stringable, CollectionElement): self.type = AttributeType(dtype) self.size = 1 - @always_inline("nodebug") fn __init__(inout self, name: String, value: Int): self.__init__(name, Int64(value)) self.data_shape[0] = 1 - @always_inline("nodebug") fn __init__(inout self, name: String, value: FloatLiteral): self.__init__(name, Float64(value)) self.data_shape[0] = 1 - @always_inline("nodebug") fn __str__(self) -> String: return "Attribute(" + str(self.name) + ", " + "..." 
+ ")" - @always_inline("nodebug") fn to_string(self) -> String: return str(self.data) - @always_inline("nodebug") fn to_shape(self) -> TensorShape: return TensorShape(rank=self.size, shape=self.data_shape) - @always_inline("nodebug") fn to_static[N: Int](self) -> StaticIntTuple[N]: constrained[N < MAX_RANK, "Attribute rank must be less than MAX_RANK."]() @@ -165,13 +153,11 @@ struct Attribute(Stringable, CollectionElement): return result - @always_inline("nodebug") fn to_scalar[dtype: DType](self) -> Scalar[dtype]: constrained[dtype.is_numeric(), "Attribute value must be numeric."]() return bytes_to_scalar[dtype](self.data) - @always_inline("nodebug") fn to_int(self) -> Int: return int(self.to_scalar[DType.int64]()) diff --git a/basalt/autograd/graph.mojo b/basalt/autograd/graph.mojo index b4fdf53..cd28b09 100644 --- a/basalt/autograd/graph.mojo +++ b/basalt/autograd/graph.mojo @@ -43,7 +43,7 @@ struct Graph: self.inputs.append(symbol) else: if data is not None: - self.params.put(symbol, data.take()) + self.params.put(symbol, data.value()[]) else: self.params.put(symbol) diff --git a/basalt/autograd/ops/basics.mojo b/basalt/autograd/ops/basics.mojo index 2305377..74662ca 100644 --- a/basalt/autograd/ops/basics.mojo +++ b/basalt/autograd/ops/basics.mojo @@ -1,12 +1,15 @@ -from math import add, sub, mul, div, log, exp +from math import log, exp from algorithm import vectorize from memory import memcpy +from utils.numerics import isinf from basalt import Tensor, TensorShape from basalt.nn.tensor import MAX_RANK from basalt.utils.tensorutils import * from basalt.autograd.attributes import Attribute, AttributeVector from basalt.autograd.ops.matmul import dot, dot_transpose_t1, dot_transpose_t2 +from basalt.utils.math_util import add, sub, mul, div + """ Implement forward and backward operations for basic tensor manipulations. 
@@ -316,7 +319,9 @@ struct POW: # d(x^y) / dx = y * x^(y-1) # d(x^y) / dy = sum( x^y * log(x) ) var res_grad: Tensor[dtype] - var a = int(t2[0]) + var a = t2[0] + + alias epsilon = 1e-12 @parameter if tensor_id == 0: @@ -329,13 +334,18 @@ struct POW: vectorize[vec_pow_bw_x, nelts](t1_shape.num_elements()) else: + # Gradient of the exponent res_grad = Tensor[dtype](t2_shape) # t2_shape == TensorShape(1) @parameter fn vec_pow_bw_y[nelts: Int](i: Int): + # Handle the case where the value passed to log is 0.0 (log returns -inf) + var temp_log = log(t1.load[nelts](i)) + var temp_log_is_inf = isinf(temp_log) + temp_log = temp_log_is_inf.select(0, temp_log) res_grad[0] += ( (t1.load[nelts](i) ** a) - * log(t1.load[nelts](i)) + * temp_log * ug.load[nelts](i) ).reduce_add() diff --git a/basalt/autograd/ops/conv.mojo b/basalt/autograd/ops/conv.mojo index 4592c51..774eb03 100644 --- a/basalt/autograd/ops/conv.mojo +++ b/basalt/autograd/ops/conv.mojo @@ -2,7 +2,6 @@ from basalt import Tensor, TensorShape from basalt.autograd.attributes import AttributeVector from algorithm import parallelize, vectorize, tile -from math import divmod from utils.loop import unroll diff --git a/basalt/autograd/ops/dynamics.mojo b/basalt/autograd/ops/dynamics.mojo index 0f304ef..5c30493 100644 --- a/basalt/autograd/ops/dynamics.mojo +++ b/basalt/autograd/ops/dynamics.mojo @@ -33,7 +33,7 @@ struct CONCAT: fn forward[attributes: AttributeVector]( inputs: List[Symbol], outputs: List[Symbol], - parameters: Parameters, + inout parameters: Parameters, ): alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0 var n_chunks = Self.calc_chunks(inputs[0].shape, dim) @@ -58,7 +58,7 @@ struct CONCAT: fn backward[input_id: Int, attributes: AttributeVector]( inputs: List[Symbol], outputs: List[Symbol], - parameters: Parameters, + inout parameters: Parameters, ) -> Tensor[dtype]: alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0 var n_chunks = Self.calc_chunks(inputs[0].shape, dim) @@ -113,7 +113,7 @@ struct SPLIT: fn forward[attributes: AttributeVector]( inputs: List[Symbol], outputs: List[Symbol], - parameters: Parameters, + inout parameters: Parameters, ): alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0 alias sections = attributes["sections"].value().to_shape() @@ -139,7 +139,7 @@ struct SPLIT: fn backward[input_id: Int, attributes: AttributeVector]( inputs: List[Symbol], outputs: List[Symbol], - parameters: Parameters, + inout parameters: Parameters, ) -> Tensor[dtype]: alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0 alias sections = attributes["sections"].value().to_shape() diff --git a/basalt/autograd/ops/matmul.mojo b/basalt/autograd/ops/matmul.mojo index e638f81..bc2cf2b 100644 --- a/basalt/autograd/ops/matmul.mojo +++ b/basalt/autograd/ops/matmul.mojo @@ -18,7 +18,7 @@ fn calculate_block[ for k in range(K): - @unroll + @parameter for m in range(BLOCK_M): @parameter diff --git a/basalt/autograd/ops/mlops.mojo b/basalt/autograd/ops/mlops.mojo index e06dbe1..30a61e8 100644 --- a/basalt/autograd/ops/mlops.mojo +++ b/basalt/autograd/ops/mlops.mojo @@ -1,7 +1,7 @@ from algorithm import vectorize, parallelize -from math import exp, pow, max, min, abs, ceil, floor +from math import exp, floor, ceil -from math.limit import min_finite, max_finite +from utils.numerics import min_finite, max_finite -from memory import stack_allocation +from utils.static_tuple import StaticTuple from basalt import Tensor, TensorShape from basalt.utils.tensorutils import elwise_transform
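[Editor's note] The LEAKYRELU op introduced in the next hunks computes, elementwise, f(x) = x for x > 0 and f(x) = negative_slope * x otherwise; its derivative is 1, respectively negative_slope. A minimal scalar sketch (illustrative only; the patch itself operates on SIMD vectors via select):

    fn leaky_relu_scalar(x: Float64, negative_slope: Float64 = 0.01) -> Float64:
        # f(x) = x if x > 0, else negative_slope * x
        return x if x > 0 else negative_slope * x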
@@ -54,7 +54,7 @@ struct SIGMOID: vectorize[vec_sigmoid_bw, nelts](ug_shape.num_elements()) - return res_grad ^ + return res_grad^ struct RELU: @@ -102,7 +102,62 @@ struct RELU: vectorize[vec_relu_bw, nelts](ug_shape.num_elements()) - return res_grad ^ + return res_grad^ + + +struct LEAKYRELU: + @staticmethod + fn result_shape(t1_shape: TensorShape) -> TensorShape: + return t1_shape + + @staticmethod + fn forward[ + t1_shape: TensorShape, + attributes: AttributeVector, + ](inout res: Tensor[dtype], t1: Tensor[dtype]): + """Forward operation of leaky_relu.""" + + fn leaky_relu[ + type: DType, + simd_width: Int, + ](x: SIMD[type, simd_width]) -> SIMD[type, simd_width]: + var negative_slope = attributes["negative_slope"].value().to_scalar[ + type + ]() + return (x > 0).select(x, x * negative_slope) + + elwise_transform[leaky_relu](res, t1) + + @staticmethod + fn backward[ + ug_shape: TensorShape, + t1_shape: TensorShape, + attributes: AttributeVector, + ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: + """Backward operation of leaky_relu.""" + + @always_inline + fn leaky_relu_bw[ + type: DType, simd_width: Int + ](x: SIMD[type, simd_width]) -> SIMD[type, simd_width]: + var negative_slope = attributes["negative_slope"].value().to_scalar[ + type + ]() + + return (x > 0).select[type](1, negative_slope) + + var res_grad = Tensor[dtype](ug_shape) + + @parameter + fn vec_leaky_relu_bw[nelts: Int](idx: Int): + res_grad.store[nelts]( + idx, + leaky_relu_bw(t1.load[nelts](idx)) * ug.load[nelts](idx), + ) + + vectorize[vec_leaky_relu_bw, nelts](ug_shape.num_elements()) + + return res_grad^ struct TANH: @@ -148,7 +203,7 @@ struct TANH: vectorize[vec_tanh_bw, nelts](ug_shape.num_elements()) - return res_grad ^ + return res_grad^ struct CLIP: @@ -166,12 +221,12 @@ struct CLIP: alias min_attr = attributes["min"] alias max_attr = attributes["max"] - var min_val = min_attr.value().to_scalar[dtype]() if min_attr else min_finite[ + var min_val = min_attr.value().to_scalar[ dtype - ]() - var max_val = max_attr.value().to_scalar[dtype]() if max_attr else max_finite[ + ]() if min_attr else min_finite[dtype]() + var max_val = max_attr.value().to_scalar[ dtype - ]() + ]() if max_attr else max_finite[dtype]() @parameter fn vec_clip[nelts: Int](i: Int): @@ -189,12 +244,12 @@ struct CLIP: alias min_attr = attributes["min"] alias max_attr = attributes["max"] - var min_val = min_attr.value().to_scalar[dtype]() if min_attr else min_finite[ + var min_val = min_attr.value().to_scalar[ dtype - ]() - var max_val = max_attr.value().to_scalar[dtype]() if max_attr else max_finite[ + ]() if min_attr else min_finite[dtype]() + var max_val = max_attr.value().to_scalar[ dtype - ]() + ]() if max_attr else max_finite[dtype]() var res_grad = Tensor[dtype](t_shape) @@ -203,17 +258,21 @@ struct CLIP: var val = t.load[nelts](i) res_grad.store[nelts]( i, - ((val >= min_val) * (val <= max_val)).select(ug.load[nelts](i), 0), + ((val >= min_val) * (val <= max_val)).select( + ug.load[nelts](i), 0 + ), ) vectorize[vec_clip_bw, nelts, size = t_shape.num_elements()]() - return res_grad ^ + return res_grad^ struct SQUEEZE: @staticmethod - fn result_shape(t1_shape: TensorShape, attributes: AttributeVector) -> TensorShape: + fn result_shape( + t1_shape: TensorShape, attributes: AttributeVector + ) -> TensorShape: var dim = attributes["dims"] var dims_to_squeeze = dim.value().to_shape() if dim else TensorShape() @@ -241,12 +300,14 @@ struct SQUEEZE: ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: var res_grad = 
Tensor[dtype](t1_shape) memcpy(res_grad.data(), ug.data(), ug.num_elements()) - return res_grad ^ + return res_grad^ struct UNSQUEEZE: @staticmethod - fn result_shape(t1_shape: TensorShape, attributes: AttributeVector) -> TensorShape: + fn result_shape( + t1_shape: TensorShape, attributes: AttributeVector + ) -> TensorShape: var dim = attributes["dims"] var dims_to_squeeze = dim.value().to_shape() if dim else TensorShape() @@ -278,7 +339,7 @@ struct UNSQUEEZE: ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: var res_grad = Tensor[dtype](t1_shape) memcpy(res_grad.data(), ug.data(), ug.num_elements()) - return res_grad ^ + return res_grad^ struct SLICE: @@ -287,7 +348,7 @@ struct SLICE: # Adjust negative indices & ensure they are within bounds. var s = slice if slice >= 0 else dim_size + slice return max(min(s, dim_size), 0) - + @staticmethod fn default_starts(shape: TensorShape) -> List[Int]: var starts = List[Int]() @@ -308,7 +369,7 @@ struct SLICE: for i in range(shape.rank()): steps.append(1) return steps^ - + @staticmethod fn default_axes(shape: TensorShape) -> List[Int]: # NOTE: axes can't be negative @@ -318,38 +379,55 @@ struct SLICE: return axes^ @staticmethod - fn result_shape(t1_shape: TensorShape, attributes: AttributeVector) -> TensorShape: + fn result_shape( + t1_shape: TensorShape, attributes: AttributeVector + ) -> TensorShape: # NOTE: Starts and ends have to be of the same size # NOTE: If axes not provided, starts and ends have to be of the same size as t1_shape var starts = attributes["starts"].value().to_shape() var ends = attributes["ends"].value().to_shape() - var steps = attributes["steps"].value().to_shape() if attributes["steps"] else Self.default_steps(starts) - var axes = attributes["axes"].value().to_shape() if attributes["axes"] else Self.default_axes(t1_shape) + var steps = attributes["steps"].value().to_shape() if attributes[ + "steps" + ] else Self.default_steps(starts) + var axes = attributes["axes"].value().to_shape() if attributes[ + "axes" + ] else Self.default_axes(t1_shape) var new_shape = t1_shape for i in range(starts.rank()): var axis = axes[i] - new_shape[axis] = len(range( - start = Self.adjust_boundary(starts[i], t1_shape[axis]), - end = Self.adjust_boundary(ends[i], t1_shape[axis]), - step = steps[i] - )) + new_shape[axis] = len( + range( + start=Self.adjust_boundary(starts[i], t1_shape[axis]), + end=Self.adjust_boundary(ends[i], t1_shape[axis]), + step=steps[i], + ) + ) return new_shape @staticmethod - fn reorder_positions[id: Int](original: TensorShape, axes: TensorShape, t1_shape: TensorShape) -> List[Int]: + fn reorder_positions[ + id: Int + ](original: TensorShape, axes: TensorShape, t1_shape: TensorShape) -> List[ + Int + ]: # Reorder the starts (id=0), ends (id=1) or steps (id=2) to match the order of the axes var updated: List[Int] @parameter - if id == 0: updated = Self.default_starts(t1_shape) - elif id == 1: updated = Self.default_ends(t1_shape) - else: updated = Self.default_steps(t1_shape) - + if id == 0: + updated = Self.default_starts(t1_shape) + elif id == 1: + updated = Self.default_ends(t1_shape) + else: + updated = Self.default_steps(t1_shape) + for i in range(axes.rank()): var axis = axes[i] - updated[axis] = original[i] if id == 2 else Self.adjust_boundary(original[i], t1_shape[axis]) + updated[axis] = original[i] if id == 2 else Self.adjust_boundary( + original[i], t1_shape[axis] + ) return updated^ @@ -362,12 +440,12 @@ struct SLICE: steps: List[Int], starts: List[Int], ends: List[Int], - backward_op: Bool = False + 
backward_op: Bool = False, ]( inout res: Tensor[dtype], t1: Tensor[dtype], last_dims: Int, - position: Int, + position: Int, last_position: Int, idx: Int, idx_original: Int, @@ -376,7 +454,9 @@ struct SLICE: alias t1_strides = original_shape.strides() var idx_temp = idx - var idx_original_temp = starts[position] * t1_strides[position] + idx_original + var idx_original_temp = starts[position] * t1_strides[ + position + ] + idx_original if position == last_position + 1: # Work on the last dimensions @@ -384,37 +464,50 @@ struct SLICE: alias stride = t1_strides[position] * steps[position] @parameter - fn v_slice[nelts: Int](k : Int): - + fn v_slice[nelts: Int](k: Int): @parameter if not backward_op: + @parameter if steps[position] == 1: - res.store[nelts](idx_temp + k, t1.load[nelts](idx_original_temp)) + res.store[nelts]( + idx_temp + k, t1.load[nelts](idx_original_temp) + ) else: res.store[nelts]( idx_temp + k, - t1.data().offset(idx_original_temp).simd_strided_load[nelts](stride) + t1.data() + .offset(idx_original_temp) + .simd_strided_load[nelts](stride), ) else: + @parameter if steps[position] == 1: res.store[nelts](idx_original_temp, t1.load[nelts](idx_temp + k)) else: - res.data().offset(idx_original_temp).simd_strided_store[nelts]( + res.data().offset(idx_original_temp).simd_strided_store[width=nelts]( t1.load[nelts](idx_temp + k), stride ) - + idx_original_temp += stride * nelts vectorize[v_slice, nelts](last_dims) - return + return for _ in range(shape[position]): - Self.recursive_iters_slice[shape, original_shape, steps, starts, ends, backward_op]( - res, t1, last_dims, position + 1, last_position, idx_temp, idx_original_temp + Self.recursive_iters_slice[ + shape, original_shape, steps, starts, ends, backward_op + ]( + res, + t1, + last_dims, + position + 1, + last_position, + idx_temp, + idx_original_temp, ) idx_temp += strides[position] @@ -427,10 +520,10 @@ struct SLICE: steps: List[Int], starts: List[Int], ends: List[Int], - backward_op: Bool = False + backward_op: Bool = False, ](inout res: Tensor[dtype], t1: Tensor[dtype]): alias strides = original_shape.strides() - + # Get the dimensions for vectorization var last_dims = 1 var positions_to_skip = 0 @@ -441,7 +534,7 @@ struct SLICE: positions_to_skip += 1 if starts[i] != 0 or ends[i] != original_shape[i] or steps[i] != 1: break - + # Get the dimensions for the first loop var first_dims = 1 var start_position = 0 @@ -452,31 +545,46 @@ struct SLICE: start_position += 1 var middle_dims = res_shape.num_elements() // last_dims // first_dims - + @parameter fn p_slice(i: Int): Self.recursive_iters_slice[ res_shape, original_shape, steps, starts, ends, backward_op ]( - res, t1, last_dims, start_position, res_shape.rank() - 1 - positions_to_skip, - i * middle_dims * last_dims, i * strides[start_position - 1] + res, + t1, + last_dims, + start_position, + res_shape.rank() - 1 - positions_to_skip, + i * middle_dims * last_dims, + i * strides[start_position - 1], ) parallelize[p_slice](first_dims) - + @staticmethod fn forward[ t1_shape: TensorShape, attributes: AttributeVector, ](inout res: Tensor[dtype], t1: Tensor[dtype]): - alias axes = attributes["axes"].value().to_shape() if attributes["axes"] else Self.default_axes(t1_shape) - alias starts = Self.reorder_positions[0](attributes["starts"].value().to_shape(), axes, t1_shape) - alias ends = Self.reorder_positions[1](attributes["ends"].value().to_shape(), axes, t1_shape) - alias steps = Self.reorder_positions[2](attributes["steps"].value().to_shape(), axes, t1_shape) if 
attributes["steps"] else Self.default_steps(t1_shape) + alias axes = attributes["axes"].value().to_shape() if attributes[ + "axes" + ] else Self.default_axes(t1_shape) + alias starts = Self.reorder_positions[0]( + attributes["starts"].value().to_shape(), axes, t1_shape + ) + alias ends = Self.reorder_positions[1]( + attributes["ends"].value().to_shape(), axes, t1_shape + ) + alias steps = Self.reorder_positions[2]( + attributes["steps"].value().to_shape(), axes, t1_shape + ) if attributes["steps"] else Self.default_steps(t1_shape) alias res_shape = Self.result_shape(t1_shape, attributes) - Self.slice_kernel[res_shape, t1_shape, steps, starts, ends, False](res, t1) + Self.slice_kernel[res_shape, t1_shape, steps, starts, ends, False]( + res, t1 + ) @staticmethod fn backward[ @@ -484,10 +592,18 @@ struct SLICE: t1_shape: TensorShape, attributes: AttributeVector = AttributeVector(), ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: - alias axes = attributes["axes"].value().to_shape() if attributes["axes"] else Self.default_axes(t1_shape) - alias starts = Self.reorder_positions[0](attributes["starts"].value().to_shape(), axes, t1_shape) - alias ends = Self.reorder_positions[1](attributes["ends"].value().to_shape(), axes, t1_shape) - alias steps = Self.reorder_positions[2](attributes["steps"].value().to_shape(), axes, t1_shape) if attributes["steps"] else Self.default_steps(t1_shape) + alias axes = attributes["axes"].value().to_shape() if attributes[ + "axes" + ] else Self.default_axes(t1_shape) + alias starts = Self.reorder_positions[0]( + attributes["starts"].value().to_shape(), axes, t1_shape + ) + alias ends = Self.reorder_positions[1]( + attributes["ends"].value().to_shape(), axes, t1_shape + ) + alias steps = Self.reorder_positions[2]( + attributes["steps"].value().to_shape(), axes, t1_shape + ) if attributes["steps"] else Self.default_steps(t1_shape) var res_grad = Tensor[dtype](t1_shape) @@ -705,7 +821,7 @@ struct UPSAMPLE: @parameter if mode == "nearest": var indeces_t1_sum = indeces_t1[0] - @unroll + @parameter for i in range(1, size): indeces_t1_sum += indeces_t1[i] * strides[i + 1] @@ -850,4 +966,4 @@ struct UPSAMPLE: t1_shape: TensorShape, attributes: AttributeVector = AttributeVector(), ](ug: Tensor[dtype], t1: Tensor[dtype]) -> Tensor[dtype]: - return t1 \ No newline at end of file + return t1 diff --git a/basalt/autograd/ops/ops.mojo b/basalt/autograd/ops/ops.mojo index c47f5ce..b870f78 100644 --- a/basalt/autograd/ops/ops.mojo +++ b/basalt/autograd/ops/ops.mojo @@ -15,7 +15,7 @@ from .basics import ( TRANSPOSE, FMA, ) -from .mlops import SIGMOID, RELU, TANH, CLIP, SQUEEZE, UNSQUEEZE, SLICE, INDEX, UPSAMPLE +from .mlops import SIGMOID, RELU, TANH, CLIP, SQUEEZE, UNSQUEEZE, SLICE, INDEX, UPSAMPLE, LEAKYRELU from .dynamics import CONCAT, SPLIT from .conv import CONV2D from .pool import MAXPOOL2D @@ -63,6 +63,7 @@ struct OP(Stringable): alias SLICE = OP(25, "SLICE") alias INDEX = OP(26, "INDEX") alias UPSAMPLE = OP(27, "UPSAMPLE") + alias LEAKYRELU = OP(28, "LEAKYRELU") var id: UInt8 var name: Bytes[16] @@ -89,10 +90,16 @@ fn static_result_shape( if len(operands) == 1: return static_result_shape(op, operands[0].shape, attributes) elif len(operands) == 2: - return static_result_shape(op, operands[0].shape, operands[1].shape, attributes) + return static_result_shape( + op, operands[0].shape, operands[1].shape, attributes + ) elif len(operands) == 3: return static_result_shape( - op, operands[0].shape, operands[1].shape, operands[2].shape, attributes + op, + operands[0].shape, 
+ operands[1].shape, + operands[2].shape, + attributes, ) else: print("Error: Invalid number of operands") @@ -123,6 +130,8 @@ fn static_result_shape( return SIGMOID.result_shape(t1_shape) elif op == OP.RELU: return RELU.result_shape(t1_shape) + elif op == OP.LEAKYRELU: + return LEAKYRELU.result_shape(t1_shape) elif op == OP.TANH: return TANH.result_shape(t1_shape) elif op == OP.TRANSPOSE: @@ -241,6 +250,8 @@ fn forward_op[ SIGMOID.forward[t1_shape](res, t1) elif op == OP.RELU: RELU.forward[t1_shape](res, t1) + elif op == OP.LEAKYRELU: + LEAKYRELU.forward[t1_shape, attributes](res, t1) elif op == OP.TANH: TANH.forward[t1_shape](res, t1) elif op == OP.TRANSPOSE: @@ -264,7 +275,10 @@ fn forward_op[ fn forward_op[ - op: OP, t1_shape: TensorShape, t2_shape: TensorShape, attributes: AttributeVector + op: OP, + t1_shape: TensorShape, + t2_shape: TensorShape, + attributes: AttributeVector, ](inout res: Tensor[dtype], t1: Tensor[dtype], t2: Tensor[dtype]): """ Forward pass for binary operators. @@ -293,14 +307,21 @@ fn forward_op[ t2_shape: TensorShape, t3_shape: TensorShape, attributes: AttributeVector, -](inout res: Tensor[dtype], t1: Tensor[dtype], t2: Tensor[dtype], t3: Tensor[dtype]): +]( + inout res: Tensor[dtype], + t1: Tensor[dtype], + t2: Tensor[dtype], + t3: Tensor[dtype], +): """ Forward pass for ternary operators. """ @parameter if op == OP.CONV2D: - CONV2D.forward[t1_shape, t2_shape, t3_shape, attributes](res, t1, t2, t3) + CONV2D.forward[t1_shape, t2_shape, t3_shape, attributes]( + res, t1, t2, t3 + ) elif op == OP.FMA: FMA.forward[t1_shape, t2_shape, t3_shape](res, t1, t2, t3) else: @@ -313,7 +334,7 @@ fn forward_op[ ]( inputs: List[Symbol], outputs: List[Symbol], - parameters: Parameters, + inout parameters: Parameters, ): """ Forward pass for dynamic operators. @@ -357,6 +378,8 @@ fn backward_op[ res_grad = SIGMOID.backward[ug_shape, t1_shape](ug, t1) elif op == OP.RELU: res_grad = RELU.backward[ug_shape, t1_shape](ug, t1) + elif op == OP.LEAKYRELU: + res_grad = LEAKYRELU.backward[ug_shape, t1_shape, attributes](ug, t1) elif op == OP.TANH: res_grad = TANH.backward[ug_shape, t1_shape](ug, t1) elif op == OP.TRANSPOSE: @@ -387,7 +410,12 @@ fn backward_op[ t1_shape: TensorShape, t2_shape: TensorShape, attributes: AttributeVector, -](ug: Tensor[dtype], t1: Tensor[dtype], t2: Tensor[dtype], inout grad: Tensor[dtype]): +]( + ug: Tensor[dtype], + t1: Tensor[dtype], + t2: Tensor[dtype], + inout grad: Tensor[dtype], +): """ Backward pass for binary operators. 
""" @@ -395,17 +423,29 @@ fn backward_op[ @parameter if op == OP.ADD: - res_grad = ADD.backward[tensor_id, ug_shape, t1_shape, t2_shape](ug, t1, t2) + res_grad = ADD.backward[tensor_id, ug_shape, t1_shape, t2_shape]( + ug, t1, t2 + ) elif op == OP.SUB: - res_grad = SUB.backward[tensor_id, ug_shape, t1_shape, t2_shape](ug, t1, t2) + res_grad = SUB.backward[tensor_id, ug_shape, t1_shape, t2_shape]( + ug, t1, t2 + ) elif op == OP.MUL: - res_grad = MUL.backward[tensor_id, ug_shape, t1_shape, t2_shape](ug, t1, t2) + res_grad = MUL.backward[tensor_id, ug_shape, t1_shape, t2_shape]( + ug, t1, t2 + ) elif op == OP.DIV: - res_grad = DIV.backward[tensor_id, ug_shape, t1_shape, t2_shape](ug, t1, t2) + res_grad = DIV.backward[tensor_id, ug_shape, t1_shape, t2_shape]( + ug, t1, t2 + ) elif op == OP.POW: - res_grad = POW.backward[tensor_id, ug_shape, t1_shape, t2_shape](ug, t1, t2) + res_grad = POW.backward[tensor_id, ug_shape, t1_shape, t2_shape]( + ug, t1, t2 + ) elif op == OP.DOT: - res_grad = DOT.backward[tensor_id, ug_shape, t1_shape, t2_shape](ug, t1, t2) + res_grad = DOT.backward[tensor_id, ug_shape, t1_shape, t2_shape]( + ug, t1, t2 + ) else: print("[ERROR] Operator not found.") res_grad = Tensor[dtype](-1, -1) @@ -449,9 +489,9 @@ fn backward_op[ tensor_id, ug_shape, t1_shape, t2_shape, t3_shape, attributes ](ug, t1, t2, t3) elif op == OP.FMA: - res_grad = FMA.backward[tensor_id, ug_shape, t1_shape, t2_shape, t3_shape]( - ug, t1, t2, t3 - ) + res_grad = FMA.backward[ + tensor_id, ug_shape, t1_shape, t2_shape, t3_shape + ](ug, t1, t2, t3) else: print("[ERROR] Operator not found.") res_grad = Tensor[dtype](-1, -1) @@ -467,7 +507,7 @@ fn backward_op[ inputs: List[Symbol], outputs: List[Symbol], inout grad: Tensor[dtype], - parameters: Parameters, + inout parameters: Parameters, ): """ Backward pass for dynamic operators. 
@@ -475,9 +515,13 @@ var res_grad: Tensor[dtype] if op == OP.CONCAT: - res_grad = CONCAT.backward[input_id, attributes](inputs, outputs, parameters) + res_grad = CONCAT.backward[input_id, attributes]( + inputs, outputs, parameters + ) elif op == OP.SPLIT: - res_grad = SPLIT.backward[input_id, attributes](inputs, outputs, parameters) + res_grad = SPLIT.backward[input_id, attributes]( + inputs, outputs, parameters + ) else: print("[ERROR] Operator not found.") res_grad = Tensor[dtype](-1, -1) diff --git a/basalt/autograd/ops/pool.mojo b/basalt/autograd/ops/pool.mojo index 5e92740..3149cc1 100644 --- a/basalt/autograd/ops/pool.mojo +++ b/basalt/autograd/ops/pool.mojo @@ -1,4 +1,4 @@ -from math.limit import neginf +from utils.numerics import min_or_neg_inf from basalt import Tensor, TensorShape from basalt.autograd.attributes import AttributeVector @@ -48,7 +48,7 @@ struct MAXPOOL2D: for in_ch in range(input_shape[1]): for x in range(output_shape[2]): for y in range(output_shape[3]): - var max_val: Scalar[dtype] = neginf[dtype]() + var max_val: Scalar[dtype] = min_or_neg_inf[dtype]() var ix_base = x * stride[0] - padding[0] var iy_base = y * stride[1] - padding[1] for kx in range(kernel_size[0]): @@ -107,7 +107,7 @@ struct MAXPOOL2D: for in_ch in range(input_shape[1]): for x in range(ug_shape[2]): for y in range(ug_shape[3]): - var max_val: Scalar[dtype] = neginf[dtype]() + var max_val: Scalar[dtype] = min_or_neg_inf[dtype]() var max_idx: Int = -1 var ix_base = x * stride[0] - padding[0] var iy_base = y * stride[1] - padding[1] diff --git a/basalt/nn/__init__.mojo b/basalt/nn/__init__.mojo index 9c994a4..d85ab27 100644 --- a/basalt/nn/__init__.mojo +++ b/basalt/nn/__init__.mojo @@ -7,4 +7,11 @@ from .layers.pool import MaxPool2d from .layers.upsample import Upsample from .loss import MSELoss, CrossEntropyLoss -from .activations import Softmax, LogSoftmax, ReLU, Sigmoid, Tanh +from .activations import ( + Softmax, + LogSoftmax, + ReLU, + LeakyReLU, + Sigmoid, + Tanh, +) diff --git a/basalt/nn/activations.mojo b/basalt/nn/activations.mojo index 2264a54..9a83a0f 100644 --- a/basalt/nn/activations.mojo +++ b/basalt/nn/activations.mojo @@ -2,13 +2,22 @@ from basalt import Tensor, TensorShape from basalt import Graph, Symbol, OP from basalt.autograd.attributes import Attribute, AttributeVector -# '''Activation functions.''' - +# '''Activation functions.''' fn ReLU(inout g: Graph, input: Symbol) -> Symbol: return g.op(OP.RELU, input) +fn LeakyReLU( + inout g: Graph, input: Symbol, negative_slope: Scalar[dtype] +) -> Symbol: + return g.op( + OP.LEAKYRELU, + input, + attributes=AttributeVector(Attribute("negative_slope", negative_slope)), + ) + + fn Sigmoid(inout g: Graph, input: Symbol) -> Symbol: return g.op(OP.SIGMOID, input) diff --git a/basalt/nn/model.mojo b/basalt/nn/model.mojo index ed80c7e..a8993cf 100644 --- a/basalt/nn/model.mojo +++ b/basalt/nn/model.mojo @@ -80,7 +80,7 @@ struct Model[ # TODO: remove when ability to concatenate graphs (modules) # Removes the need for splitting in forward and inference mode - fn forward(inout self, *t_inputs: Tensor[dtype]) -> Tensor[dtype]: + fn forward(inout self, *t_inputs: Tensor[dtype]) -> ref[__lifetime_of(self)] Tensor[dtype]: # NOTE: Important detail here is that the order of the inputs must be the same as the order the inputs were defined in the graph. # Example: If you were to define the y_true before the x when creating the graph # @@ -117,7 +117,7 @@ # 2. Loop over all nodes and execute forward operations @parameter - fn fw_unroll[i: Int](): + for i in range(num_nodes): alias op = g.nodes[i].operator alias attrs = g.nodes[i].attributes @@ -169,8 +169,6 @@ if DEBUG == 1: self.perf_metrics.end_forward_pass(i) - unroll[fw_unroll, num_nodes]() - fn backward(inout self, *upper_grads: Tensor[dtype]): """ Main entrypoint of backward pass. @@ -191,7 +189,7 @@ # 2. Loop over all nodes in reverse order and execute backward operations @parameter - fn bw_unroll[i: Int](): + for i in range(g.nodes.size): alias reverse_i = g.nodes.size - i - 1 alias op = g.nodes[reverse_i].operator alias attrs = g.nodes[reverse_i].attributes @@ -206,7 +204,7 @@ if op.dynamic: @parameter - fn unroll_dynamic[j: Int](): + for j in range(num_operands): @parameter if g.nodes[reverse_i].inputs[j].trainable: backward_op[j, op, attrs]( @@ -215,9 +213,6 @@ self.parameters.grads[g.nodes[reverse_i].inputs[j]], self.parameters, ) - - unroll[unroll_dynamic, num_operands]() - else: # Statically known shapes and number of operands alias out = g.nodes[reverse_i].outputs[0] # or upper_grad symbol @@ -302,8 +297,6 @@ if DEBUG == 1: self.perf_metrics.end_backward_pass(i) - unroll[bw_unroll, g.nodes.size]() - fn allocate_tensor_memory(inout self): for i in range(len(g.inputs)): self.parameters.tensors.append( @@ -375,7 +368,7 @@ except e: print("Error loading model data:", e) - fn export_model(self, model_path: String): + fn export_model(inout self, model_path: String): var path = Path(model_path) print("Exporting model to:", path) diff --git a/basalt/nn/optim.mojo b/basalt/nn/optim.mojo index 1ba90f2..db6210a 100644 --- a/basalt/nn/optim.mojo +++ b/basalt/nn/optim.mojo @@ -1,9 +1,10 @@ -from math import add, mul, div, sqrt, sub +from math import sqrt from algorithm import vectorize, parallelize from .model import Parameters from basalt import Graph, Tensor, TensorShape from basalt.utils.collection import Collection +from basalt.utils.math_util import add, sub, mul, div fn get_trainable_parameters(g: Graph) -> List[Symbol]: @@ -20,13 +21,14 @@ return trainable_parameters ^ +@value struct Adam[ + lifetime: MutableLifetime, # Using mutability and AnyLifetime seems to give problems for now, because the reference can't know for sure whether the lifetime is mutable or not + //, g: Graph, - mutability: __mlir_type.i1, - lifetime: AnyLifetime[mutability].type, trainable_parameters: List[Symbol] = get_trainable_parameters(g), ]: - var parameters: Reference[Parameters, mutability, lifetime] + var parameters: Reference[Parameters, True, lifetime] var lr: Scalar[dtype] var beta1: Scalar[dtype] @@ -39,7 +41,7 @@ fn __init__( inout self, - parameters: Reference[Parameters, mutability, lifetime], + parameters: Reference[Parameters, True, lifetime], lr: Scalar[dtype] = 0.001, beta1: Scalar[dtype] = 0.9, beta2: Scalar[dtype] = 0.999, diff --git a/basalt/nn/tensor.mojo b/basalt/nn/tensor.mojo index b3fa551..63fb02a 100644 --- a/basalt/nn/tensor.mojo +++ b/basalt/nn/tensor.mojo @@ -1,4 +1,3 @@ -from math import min from testing import assert_true from algorithm import vectorize @@ -14,40 +13,34 @@ struct TensorShape(Stringable): var _rank: Int var _shape: StaticIntTuple[MAX_RANK] - @always_inline("nodebug") fn __init__(inout self, *shape: Int): self._rank = len(shape) self._shape = StaticIntTuple[MAX_RANK]() for i in range(min(self._rank, MAX_RANK)): 
self._shape[i] = shape[i] - @always_inline("nodebug") fn __init__(inout self, shapes: VariadicList[Int]): self._rank = len(shapes) self._shape = StaticIntTuple[MAX_RANK]() for i in range(min(self._rank, MAX_RANK)): self._shape[i] = shapes[i] - @always_inline("nodebug") fn __init__(inout self, shape: List[Int]): self._rank = len(shape) self._shape = StaticIntTuple[MAX_RANK]() for i in range(min(self._rank, MAX_RANK)): self._shape[i] = shape[i] - @always_inline("nodebug") fn __init__[num: Int](inout self, shape: StaticIntTuple[num]): self._rank = num self._shape = StaticIntTuple[MAX_RANK]() for i in range(min(self._rank, MAX_RANK)): self._shape[i] = shape[i] - @always_inline("nodebug") fn __init__(inout self, rank: Int, shape: StaticIntTuple[MAX_RANK]): self._rank = rank self._shape = shape - @always_inline("nodebug") fn __init__(inout self, owned shape: _TensorShape): self._rank = shape.rank() self._shape = StaticIntTuple[MAX_RANK]() @@ -117,19 +110,16 @@ struct Tensor[dtype: DType](Stringable, Movable, CollectionElement): var _data: DTypePointer[dtype] var _shape: TensorShape - @always_inline("nodebug") fn __init__(inout self, *dims: Int): self._shape = TensorShape(dims) self._data = DTypePointer[dtype].alloc(self._shape.num_elements()) memset_zero(self._data, self._shape.num_elements()) - @always_inline("nodebug") fn __init__(inout self, owned shape: TensorShape): self._data = DTypePointer[dtype].alloc(shape.num_elements()) memset_zero(self._data, shape.num_elements()) self._shape = shape - @always_inline("nodebug") fn __init__( inout self, owned data: DTypePointer[dtype], owned shape: TensorShape ): @@ -140,20 +130,17 @@ struct Tensor[dtype: DType](Stringable, Movable, CollectionElement): memcpy(self._data, data, self._shape.num_elements()) _ = data - @always_inline("nodebug") fn __init__(inout self, owned tensor: _Tensor[dtype]): self._data = DTypePointer[dtype].alloc(tensor.num_elements()) self._shape = tensor.shape() - memcpy(self._data, tensor.data(), self._shape.num_elements()) + memcpy(self._data, tensor.unsafe_ptr(), self._shape.num_elements()) _ = tensor - @always_inline("nodebug") fn __moveinit__(inout self, owned other: Tensor[dtype]): self._data = other._data self._shape = other._shape - @always_inline("nodebug") fn __copyinit__(inout self, other: Tensor[dtype]): # print("[WARNING] Copying tensor") self._data = DTypePointer[dtype].alloc(other._shape.num_elements()) diff --git a/basalt/utils/bytes.mojo b/basalt/utils/bytes.mojo index 498851b..8125a30 100644 --- a/basalt/utils/bytes.mojo +++ b/basalt/utils/bytes.mojo @@ -1,5 +1,6 @@ from math import nan -from math.limit import inf +from utils.numerics import inf +from utils.static_tuple import StaticTuple alias ScalarBytes = DType.uint64.sizeof() @@ -12,22 +13,18 @@ struct Bytes[capacity: Int](Stringable, CollectionElement, EqualityComparable): var data: StaticTuple[UInt8, capacity] - @always_inline("nodebug") fn __init__(inout self): - var data = StaticTuple[UInt8, capacity]() + var data = StaticTuple[UInt8, capacity](0) - @unroll for i in range(capacity): data[i] = 0 self.data = data - @always_inline("nodebug") fn __init__(inout self, s: String): - var data = StaticTuple[UInt8, capacity]() + var data = StaticTuple[UInt8, capacity](0) var length = len(s) - @unroll for i in range(capacity): data[i] = ord(s[i]) if i < length else 0 @@ -47,7 +44,6 @@ struct Bytes[capacity: Int](Stringable, CollectionElement, EqualityComparable): @always_inline("nodebug") fn __eq__(self, other: Self) -> Bool: - @unroll for i in range(capacity): if 
self[i] != other[i]: return False @@ -55,7 +51,6 @@ struct Bytes[capacity: Int](Stringable, CollectionElement, EqualityComparable): @always_inline("nodebug") fn __ne__(self, other: Self) -> Bool: - @unroll for i in range(capacity): if self[i] != other[i]: return True @@ -65,7 +60,6 @@ struct Bytes[capacity: Int](Stringable, CollectionElement, EqualityComparable): fn __str__(self) -> String: var result: String = "" - @unroll for i in range(capacity): var val = self[i] if val != 0: @@ -82,7 +76,6 @@ fn scalar_to_bytes[ var bits = bitcast[DType.uint64](value.cast[expand_type[dtype]()]()) var data = Bytes[Size]() - @unroll for i in range(ScalarBytes): data[i] = (bits >> (i << 3)).cast[DType.uint8]() @@ -94,7 +87,6 @@ fn bytes_to_scalar[dtype: DType](data: Bytes) -> Scalar[dtype]: var bits: UInt64 = 0 - @unroll for i in range(ScalarBytes): bits |= data[i].cast[DType.uint64]() << (i << 3) diff --git a/basalt/utils/collection.mojo b/basalt/utils/collection.mojo index 16474a5..1528844 100644 --- a/basalt/utils/collection.mojo +++ b/basalt/utils/collection.mojo @@ -1,4 +1,3 @@ -from math import max, divmod from memory.unsafe_pointer import UnsafePointer, initialize_pointee_move, destroy_pointee from basalt import Tensor, Symbol @@ -133,20 +132,17 @@ struct Collection(CollectionElement, Sized): return -1 - @always_inline("nodebug") - fn __refitem__[ - mutability: __mlir_type.i1, - lifetime: AnyLifetime[mutability].type, - ]( - self: Reference[Self, mutability, lifetime]._mlir_type, + fn __getitem__( + inout self, symbol: Symbol, - ) -> Reference[Tensor[dtype], mutability, lifetime]: + ) -> ref[__lifetime_of(self)] Tensor[dtype]: """ Returns a reference to the tensor with the given symbol. """ - var index = Reference(self)[].get_index(symbol.name) + var index = self.get_index(symbol.name) + - return (Reference(self)[].data + index)[] + return (self.data + index)[0] @always_inline("nodebug") fn clear(inout self): diff --git a/basalt/utils/datasets.mojo b/basalt/utils/datasets.mojo index cb019ae..ff5b356 100644 --- a/basalt/utils/datasets.mojo +++ b/basalt/utils/datasets.mojo @@ -1,11 +1,15 @@ from algorithm import vectorize -from math import div from basalt import dtype from basalt import Tensor, TensorShape from basalt.utils.tensorutils import elwise_op, tmean, tstd +@always_inline +fn div[dtype: DType, simd_width: Int](a: SIMD[dtype, simd_width], b: Scalar[dtype]) -> SIMD[dtype, simd_width]: + return a / b + + struct BostonHousing: alias n_inputs = 13 diff --git a/basalt/utils/itertools.mojo b/basalt/utils/itertools.mojo index fd7a6ce..2b7d3ab 100644 --- a/basalt/utils/itertools.mojo +++ b/basalt/utils/itertools.mojo @@ -36,7 +36,7 @@ struct _ProductIterator(Sized): var index = count % len(self.lists[i]) combination.append(self.lists[i][index]) count //= len(self.lists[i]) - combination._reverse() + combination.reverse() return combination ^ @always_inline("nodebug") diff --git a/basalt/utils/math_util.mojo b/basalt/utils/math_util.mojo new file mode 100644 index 0000000..faeab90 --- /dev/null +++ b/basalt/utils/math_util.mojo @@ -0,0 +1,41 @@ +@always_inline +fn add[ + dtype: DType, simd_width: Int +](a: SIMD[dtype, simd_width], b: SIMD[dtype, simd_width]) -> SIMD[ + dtype, simd_width +]: + return a + b + + +@always_inline +fn sub[ + dtype: DType, simd_width: Int +](a: SIMD[dtype, simd_width], b: SIMD[dtype, simd_width]) -> SIMD[ + dtype, simd_width +]: + return a - b + + +@always_inline +fn mul[ + dtype: DType, simd_width: Int +](a: SIMD[dtype, simd_width], b: SIMD[dtype, simd_width]) -> SIMD[ + 
dtype, simd_width +]: + return a * b + + +@always_inline +fn div[ + dtype: DType, simd_width: Int +](a: SIMD[dtype, simd_width], b: SIMD[dtype, simd_width]) -> SIMD[ + dtype, simd_width +]: + return a / b + + +@always_inline +fn round_simd[ + dtype: DType, simd_width: Int +](x: SIMD[dtype, simd_width]) -> SIMD[dtype, simd_width]: + return round(x) diff --git a/basalt/utils/onnx_utils.mojo b/basalt/utils/onnx_utils.mojo index 3e67164..fde4d90 100644 --- a/basalt/utils/onnx_utils.mojo +++ b/basalt/utils/onnx_utils.mojo @@ -163,9 +163,9 @@ fn load_onnx_model( "Shape mismatch for tensor " + str(i) + ". Expected shape: " - + model_tensor_shape + + str(model_tensor_shape) + ", got shape: " - + data_shape + + str(data_shape) ) copy_np_data(model_parameters.tensors[g.params.symbols[i]], data_np) @@ -250,7 +250,7 @@ fn create_attributes_and_constant_inputs(node: Node, node_number: Int) raises -> return (attributes, inputs) -fn export_onnx_model(model_path: Path, model_parameters: Parameters, g: Graph) raises: +fn export_onnx_model(model_path: Path, inout model_parameters: Parameters, g: Graph) raises: # Create onnx model with data and nodes var onnx = Python.import_module("onnx") var onnx_helper = Python.import_module("onnx.helper") @@ -289,7 +289,7 @@ fn export_onnx_model(model_path: Path, model_parameters: Parameters, g: Graph) r var op_type = make_onnx_operator_type(node.operator) var inputs = PythonObject([]) var outputs = PythonObject([]) - var name = str(node.operator) + "_node" + i + var name = str(node.operator) + "_node" + str(i) for j in range(len(node.inputs)): inputs.append(str(node.inputs[j].name)) diff --git a/basalt/utils/perf_utils.mojo b/basalt/utils/perf_utils.mojo index 9cf076c..bacd940 100644 --- a/basalt/utils/perf_utils.mojo +++ b/basalt/utils/perf_utils.mojo @@ -1,5 +1,4 @@ from time import now -from math import min from memory import memset from basalt.autograd.node import Node @@ -7,10 +6,10 @@ from basalt.autograd.node import Node @always_inline("nodebug") fn fit_string[num: Int](s: String) -> String: - var data = DTypePointer[DType.int8]().alloc(num + 1) + var data = DTypePointer[DType.uint8]().alloc(num + 1) var copy_len = min(num, len(s)) - memcpy(data, s._as_ptr(), copy_len) + memcpy(data, s.unsafe_uint8_ptr(), copy_len) memset(data + copy_len, ord(" "), num - copy_len) data[num] = 0 @@ -20,11 +19,11 @@ fn fit_string[num: Int](s: String) -> String: @always_inline("nodebug") fn truncate_decimals[num: Int](s: String) -> String: try: - var parts = s.split(delimiter=".") + var parts = s.split(".") var truncated = parts[0] if len(parts) > 1: - var decimal_parts = parts[1].split(delimiter="e") + var decimal_parts = parts[1].split("e") truncated += "." 
+ fit_string[num](decimal_parts[0]) if len(decimal_parts) > 1: @@ -125,7 +124,7 @@ struct PerfMetrics: print(header) var header_length = len(header) - var seperator = DTypePointer[DType.int8]().alloc(header_length + 1) + var seperator = DTypePointer[DType.uint8]().alloc(header_length + 1) memset(seperator, ord("-"), header_length) seperator[header_length] = 0 @@ -146,11 +145,11 @@ struct PerfMetrics: var print_value = ( fit_string[5](str(i)) + "| " - + fit_string[15](value.node.operator) + + fit_string[15](str(value.node.operator)) + "| " - + fit_string[20](truncate_decimals[4](time)) + + fit_string[20](truncate_decimals[4](str(time))) + "| " - + fit_string[20](truncate_decimals[3](percentage) + " %") + + fit_string[20](truncate_decimals[3](str(percentage)) + " %") + "| " ) diff --git a/basalt/utils/rand_utils.mojo b/basalt/utils/rand_utils.mojo index 69fd80f..84b1925 100644 --- a/basalt/utils/rand_utils.mojo +++ b/basalt/utils/rand_utils.mojo @@ -1,6 +1,7 @@ from basalt import Tensor from random import rand, randn from algorithm import vectorize +from utils.static_tuple import StaticTuple @always_inline @@ -71,4 +72,4 @@ struct MersenneTwister: return y fn next_ui8(inout self) -> UInt8: - return self.next().value & 0xFF + return self.next().value & int(0xFF) diff --git a/basalt/utils/tensor_creation_utils.mojo b/basalt/utils/tensor_creation_utils.mojo index 375279a..7662331 100644 --- a/basalt/utils/tensor_creation_utils.mojo +++ b/basalt/utils/tensor_creation_utils.mojo @@ -39,12 +39,14 @@ fn to_tensor(np_array: PythonObject) raises -> Tensor[dtype]: var tensor = Tensor[dtype](TensorShape(shape)) - var np_array_2 = np_array.copy() + var np_array_2: PythonObject try: var np = Python.import_module("numpy") - np_array_2 = np.float32(np_array_2) + # copy is also necessary for ops like slices to make them contiguous instead of references. + np_array_2 = np.float32(np_array.copy()) except e: - print("Error in to tensor", e) + np_array_2 = np_array.copy() + print("Error in to_tensor", e) var pointer = int(np_array_2.__array_interface__["data"][0].to_float64()) var pointer_d = DTypePointer[tensor.dtype](address=pointer) @@ -56,13 +58,15 @@ fn to_tensor(np_array: PythonObject) raises -> Tensor[dtype]: return tensor^ -fn copy_np_data(tensor: Tensor, np_array: PythonObject) raises: - var np_array_2 = np_array.copy() +fn copy_np_data(inout tensor: Tensor, np_array: PythonObject) raises: + var np_array_2: PythonObject try: var np = Python.import_module("numpy") - np_array_2 = np.float32(np_array_2) + # copy is also necessary for ops like slices to make them contiguous instead of references. 
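# (Editor's note, hypothetical example.) A numpy slice such as arr[:, ::2] is a
# non-contiguous view, so __array_interface__["data"] would point at memory that
# cannot simply be memcpy'd element-for-element; np.float32(arr.copy()) first
# materializes a fresh, contiguous float32 buffer.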
+ np_array_2 = np.float32(np_array.copy()) except e: - print("Error in to tensor", e) + np_array_2 = np_array.copy() + print("Error in to_tensor", e) var pointer = int(np_array_2.__array_interface__["data"][0].to_float64()) var pointer_d = DTypePointer[tensor.dtype](address=pointer) diff --git a/basalt/utils/tensorutils.mojo b/basalt/utils/tensorutils.mojo index b3b70de..420ae5e 100644 --- a/basalt/utils/tensorutils.mojo +++ b/basalt/utils/tensorutils.mojo @@ -1,13 +1,17 @@ from sys.info import num_physical_cores -from algorithm import vectorize, parallelize, swap +from algorithm import vectorize, parallelize from memory import memset_zero, memset, stack_allocation -from math import sqrt, pow, equal, max, min, add, div, divmod, abs +from math import sqrt from random import rand +from utils.numerics import min_finite, max_finite from basalt import Tensor, TensorShape from basalt.nn.tensor import MAX_RANK +from basalt.utils.math_util import add, sub, mul, div +# ---- Start ----- + @always_inline fn fill[dtype: DType](inout t: Tensor[dtype], val: Scalar[dtype]): @parameter @@ -472,13 +476,13 @@ fn _reduce_max[ @always_inline fn tmax(t: Tensor[dtype]) -> Scalar[dtype]: - var starting_value = math.limit.min_finite[dtype]() + var starting_value = min_finite[dtype]() return reduce[max, _reduce_max](t, starting_value) @always_inline fn tmax(inout res: Tensor[dtype], t: Tensor[dtype], axis: Int): - var starting_value = math.limit.min_finite[dtype]() + var starting_value = min_finite[dtype]() reduce[max, _reduce_max](res, t, axis, starting_value) diff --git a/examples/yolov8.mojo b/examples/yolov8.mojo index eb3c16c..c23fd99 100644 --- a/examples/yolov8.mojo +++ b/examples/yolov8.mojo @@ -5,7 +5,8 @@ from basalt.autograd.attributes import AttributeVector, Attribute from basalt.utils.tensor_creation_utils import to_tensor, to_numpy from python import Python -from math import ceil, max +from math import ceil +from utils.static_tuple import StaticTuple fn Conv( diff --git a/examples/yolov8_cam.mojo b/examples/yolov8_cam.mojo index 1c8b2c3..2c0cb3d 100644 --- a/examples/yolov8_cam.mojo +++ b/examples/yolov8_cam.mojo @@ -1,7 +1,7 @@ import sys from time.time import now from python.python import Python -from math import max +from utils.static_tuple import StaticTuple from yolov8 import YoloV8, get_constant_values_from_onnx_model diff --git a/tests/mojo/test_activations.mojo b/tests/mojo/test_activations.mojo index 200215d..f2db8e9 100644 --- a/tests/mojo/test_activations.mojo +++ b/tests/mojo/test_activations.mojo @@ -8,6 +8,7 @@ from basalt.nn import ( Softmax, LogSoftmax, ReLU, + LeakyReLU, Sigmoid, Tanh, ) @@ -19,6 +20,9 @@ from tests import assert_tensors_equal alias Activation = fn (inout g: Graph, input: Symbol) -> Symbol alias AxisActivation = fn (inout g: Graph, input: Symbol, axis: Int) -> Symbol +alias LeakyReLUActivation = fn ( + inout g: Graph, input: Symbol, negative_slope: Scalar[dtype] +) -> Symbol fn create_graph[ @@ -30,7 +34,19 @@ fn create_graph[ var x = g.input(shape) var activation = func(g, x, axis) g.out(activation) - return g ^ + return g^ + + +fn create_graph[ + shape: TensorShape, + func: LeakyReLUActivation, + negative_slope: Scalar[dtype], +]() -> Graph: + var g = Graph() + var x = g.input(shape) + var activation = func(g, x, negative_slope) + g.out(activation) + return g^ fn create_graph[shape: TensorShape, func: Activation]() -> Graph: @@ -38,7 +54,7 @@ fn create_graph[shape: TensorShape, func: Activation]() -> Graph: var x = g.input(shape) var activation = func(g, x) 
g.out(activation) - return g ^ + return g^ fn test_graph[ @@ -56,6 +72,22 @@ fn test_graph[ assert_equal(len(graph.nodes), nodes) +fn test_graph[ + shape: TensorShape, + func: LeakyReLUActivation, + nodes: Int, + negative_slope: Scalar[dtype], +](input: Tensor[dtype], expected: Tensor[dtype]) raises: + alias graph = create_graph[shape, func, negative_slope]() + + var model = Model[graph](inference_only=True) + var res = model.inference(input)[0] + + assert_tensors_equal["almost"](res, expected) + assert_equal(len(graph.nodes), nodes) + + +# TODO: All these overloads feel redundant. Find a way to condense them fn test_graph[ shape: TensorShape, func: Activation, @@ -125,6 +157,25 @@ fn test_RELU() raises: test_graph[shape, ReLU, nodes](input, expected) +fn test_LEAKYRELU() raises: + alias negative_slope = 0.1 + + alias shape = TensorShape(2, 3) + alias nodes = 1 + + var input = Tensor[dtype](shape) + + for i in range(6): + input[i] = i - 3 + + var expected = Tensor[dtype](shape) + + for i in range(6): + expected[i] = i - 3 if i - 3 > 0 else negative_slope * (i - 3) + + test_graph[shape, LeakyReLU, nodes, negative_slope](input, expected) + + fn test_SIGMOID() raises: alias shape = TensorShape(2, 3) alias nodes = 1 @@ -156,6 +207,7 @@ fn main(): test_SOFTMAX() test_LOGSOFTMAX() test_RELU() + test_LEAKYRELU() test_SIGMOID() test_TANH() except e: diff --git a/tests/mojo/test_mlops.mojo b/tests/mojo/test_mlops.mojo index 045ae7b..07bdde5 100644 --- a/tests/mojo/test_mlops.mojo +++ b/tests/mojo/test_mlops.mojo @@ -1,11 +1,24 @@ from basalt import dtype, nelts from basalt.autograd import OP from basalt.autograd.attributes import AttributeVector, Attribute -from basalt.autograd.ops.mlops import SIGMOID, RELU, TANH, CLIP, SQUEEZE, UNSQUEEZE +from basalt.autograd.ops.mlops import ( + SIGMOID, + RELU, + LEAKYRELU, + TANH, + CLIP, + SQUEEZE, + UNSQUEEZE, +) from basalt.nn import Tensor, TensorShape from basalt.utils.tensorutils import fill -from tests import assert_tensors_equal, test_unary_op, test_unary_op_backward, to_numpy +from tests import ( + assert_tensors_equal, + test_unary_op, + test_unary_op_backward, + to_numpy, +) fn test_SIGMOID() raises: @@ -30,7 +43,9 @@ fn test_backward_SIGMOID() raises: expected_grad, 5.0 * 0.25 ) # 0.25 = d(sigmoid(0))/dx = sigmoid(0) * (1 - sigmoid(0)) - test_unary_op_backward[OP.SIGMOID, t1_shape, ug_shape](t1, ug, expected_grad) + test_unary_op_backward[OP.SIGMOID, t1_shape, ug_shape]( + t1, ug, expected_grad + ) fn test_RELU() raises: @@ -71,6 +86,53 @@ fn test_backward_RELU() raises: test_unary_op_backward[OP.RELU, t1_shape, ug_shape](t1, ug, expected_grad) +fn test_LEAKYRELU() raises: + alias t1_shape = TensorShape(2, 3) + var t1: Tensor[dtype] = Tensor[dtype](t1_shape) + # TODO: When tensors can do slices, this could be changed to two fill functions. 
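+    # The TODO above would reduce to something like this hypothetical sketch, assuming
+    # a future slice-assignment API on Tensor (not supported at the time of writing):
+    #     fill(t1[0:3], 3.0)
+    #     fill(t1[3:6], -3.0)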
+ for i in range(3): + t1[i] = 3 + for i in range(3, 6): + t1[i] = -3 + + var expected = Tensor[dtype](2, 3) + for i in range(3): + expected[i] = 3 + for i in range(3, 6): + expected[i] = -0.3 + + test_unary_op[ + OP.LEAKYRELU, + t1_shape, + AttributeVector(Attribute("negative_slope", 0.1)), + ](t1, expected) + + +fn test_backward_LEAKYRELU() raises: + alias t1_shape = TensorShape(2, 3) + alias ug_shape = TensorShape(2, 3) + var t1: Tensor[dtype] = Tensor[dtype](t1_shape) + var ug: Tensor[dtype] = Tensor[dtype](ug_shape) + for i in range(3): + t1[i] = 3 + for i in range(3, 6): + t1[i] = -3 + fill(ug, 5.0) + + var expected_grad = Tensor[dtype](2, 3) + for i in range(3): + expected_grad[i] = 1 * 5.0 + for i in range(3, 6): + expected_grad[i] = 0.1 * 5.0 + + test_unary_op_backward[ + OP.LEAKYRELU, + t1_shape, + ug_shape, + AttributeVector(Attribute("negative_slope", 0.1)), + ](t1, ug, expected_grad) + + fn test_TANH() raises: alias t1_shape = TensorShape(2, 3) var t1: Tensor[dtype] = Tensor[dtype](t1_shape) @@ -110,7 +172,9 @@ fn test_CLIP() raises: for i in range(6): var val = Scalar[dtype](i - 3) expected_min[i] = val if (val > -1.1) else -1.1 - test_unary_op[OP.CLIP, t1_shape, AttributeVector(min_attr)](t1, expected_min) + test_unary_op[OP.CLIP, t1_shape, AttributeVector(min_attr)]( + t1, expected_min + ) # Clip with max alias max_attr = Attribute("max", 1.1) @@ -118,7 +182,9 @@ fn test_CLIP() raises: for i in range(6): var val = Scalar[dtype](i - 3) expected_max[i] = val if (val < 1.1) else 1.1 - test_unary_op[OP.CLIP, t1_shape, AttributeVector(max_attr)](t1, expected_max) + test_unary_op[OP.CLIP, t1_shape, AttributeVector(max_attr)]( + t1, expected_max + ) # Clip with min and max var expected = Tensor[dtype](2, 3) @@ -130,7 +196,9 @@ fn test_CLIP() raises: expected[i] = 1.1 else: expected[i] = val - test_unary_op[OP.CLIP, t1_shape, AttributeVector(min_attr, max_attr)](t1, expected) + test_unary_op[OP.CLIP, t1_shape, AttributeVector(min_attr, max_attr)]( + t1, expected + ) fn test_backward_CLIP() raises: @@ -152,7 +220,9 @@ fn test_backward_CLIP() raises: for i in range(6): var val = Scalar[dtype](i - 3) expected_min[i] = 5.0 if (val > -1.1) else 0.0 - test_unary_op_backward[OP.CLIP, t1_shape, ug_shape, min_attr](t1, ug, expected_min) + test_unary_op_backward[OP.CLIP, t1_shape, ug_shape, min_attr]( + t1, ug, expected_min + ) # Clip with max alias max_attr = AttributeVector(Attribute("max", 1.1)) @@ -160,7 +230,9 @@ fn test_backward_CLIP() raises: for i in range(6): var val = Scalar[dtype](i - 3) expected_max[i] = 5.0 if (val < 1.1) else 0.0 - test_unary_op_backward[OP.CLIP, t1_shape, ug_shape, max_attr](t1, ug, expected_max) + test_unary_op_backward[OP.CLIP, t1_shape, ug_shape, max_attr]( + t1, ug, expected_max + ) # Clip with min and max alias attrs = AttributeVector(Attribute("min", -1.1), Attribute("max", 1.1)) @@ -201,7 +273,9 @@ fn test_SQUEEZE() raises: expected = Tensor[dtype](1, 2, 3) fill(expected, 5.0) test_unary_op[ - OP.SQUEEZE, t1_shape, AttributeVector(Attribute("dims", TensorShape(2, 4))) + OP.SQUEEZE, + t1_shape, + AttributeVector(Attribute("dims", TensorShape(2, 4))), ](t1, expected) @@ -216,7 +290,9 @@ fn test_backward_SQUEEZE() raises: var expected_grad = Tensor[dtype](2, 1, 3, 1) fill(expected_grad, 5.0) - test_unary_op_backward[OP.SQUEEZE, t1_shape, ug_shape](t1, ug, expected_grad) + test_unary_op_backward[OP.SQUEEZE, t1_shape, ug_shape]( + t1, ug, expected_grad + ) fn test_UNSQUEEZE() raises: @@ -228,26 +304,34 @@ fn test_UNSQUEEZE() raises: var expected = 
Tensor[dtype](2, 1, 3, 1) fill(expected, 5.0) test_unary_op[ - OP.UNSQUEEZE, t1_shape, AttributeVector(Attribute("dims", TensorShape(1, 3))) + OP.UNSQUEEZE, + t1_shape, + AttributeVector(Attribute("dims", TensorShape(1, 3))), ](t1, expected) expected = Tensor[dtype](2, 1, 3) fill(expected, 5.0) test_unary_op[ - OP.UNSQUEEZE, t1_shape, AttributeVector(Attribute("dims", TensorShape(1))) + OP.UNSQUEEZE, + t1_shape, + AttributeVector(Attribute("dims", TensorShape(1))), ](t1, expected) expected = Tensor[dtype](1, 2, 3) fill(expected, 5.0) test_unary_op[ - OP.UNSQUEEZE, t1_shape, AttributeVector(Attribute("dims", TensorShape(-3))) + OP.UNSQUEEZE, + t1_shape, + AttributeVector(Attribute("dims", TensorShape(-3))), ](t1, expected) expected = Tensor[dtype](2, 1, 3, 1) fill(expected, 5.0) test_unary_op[ - OP.UNSQUEEZE, t1_shape, AttributeVector(Attribute("dims", TensorShape(-1, -3))) + OP.UNSQUEEZE, + t1_shape, + AttributeVector(Attribute("dims", TensorShape(-1, -3))), ](t1, expected) @@ -262,7 +346,9 @@ fn test_backward_UNSQUEEZE() raises: var expected_grad = Tensor[dtype](2, 3) fill(expected_grad, 5.0) - test_unary_op_backward[OP.UNSQUEEZE, t1_shape, ug_shape](t1, ug, expected_grad) + test_unary_op_backward[OP.UNSQUEEZE, t1_shape, ug_shape]( + t1, ug, expected_grad + ) fn test_SLICE() raises: @@ -270,7 +356,7 @@ fn test_SLICE() raises: var t1: Tensor[dtype] = Tensor[dtype](t1_shape) for i in range(t1.num_elements()): t1[i] = i - + alias slice = Slice(1, 3, 1) # dim = 0 @@ -278,15 +364,17 @@ fn test_SLICE() raises: for i in range(2): for j in range(4): for k in range(5): - expected_0[i*4*5 + j*5 + k] = (i + 1) * 4 * 5 + j * 5 + k + expected_0[i * 4 * 5 + j * 5 + k] = (i + 1) * 4 * 5 + j * 5 + k test_unary_op[ - OP.SLICE, t1_shape, AttributeVector( + OP.SLICE, + t1_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(0)) - ) + Attribute("axes", TensorShape(0)), + ), ](t1, expected_0) # dim = 1 @@ -294,15 +382,17 @@ fn test_SLICE() raises: for i in range(3): for j in range(2): for k in range(5): - expected_1[i*2*5 + j*5 + k] = i * 4 * 5 + (j + 1) * 5 + k + expected_1[i * 2 * 5 + j * 5 + k] = i * 4 * 5 + (j + 1) * 5 + k test_unary_op[ - OP.SLICE, t1_shape, AttributeVector( + OP.SLICE, + t1_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(1)) - ) + Attribute("axes", TensorShape(1)), + ), ](t1, expected_1) # dim = 2 @@ -310,15 +400,17 @@ fn test_SLICE() raises: for i in range(3): for j in range(4): for k in range(2): - expected_2[i*4*2 + j*2 + k] = i * 4 * 5 + j * 5 + (k + 1) - + expected_2[i * 4 * 2 + j * 2 + k] = i * 4 * 5 + j * 5 + (k + 1) + test_unary_op[ - OP.SLICE, t1_shape, AttributeVector( + OP.SLICE, + t1_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(2)) - ) + Attribute("axes", TensorShape(2)), + ), ](t1, expected_2) @@ -335,15 +427,19 @@ fn test_SLICE_step() raises: for i in range(3): for j in range(2): for k in range(2): - expected_0[i*2*2 + j*2 + k] = (i*2 + 1) * 2 * 2 + j * 2 + k + expected_0[i * 2 * 2 + j * 2 + k] = ( + (i * 2 + 1) * 2 * 2 + j * 2 + k + ) test_unary_op[ - OP.SLICE, t0_shape, AttributeVector( + OP.SLICE, + t0_shape, + AttributeVector( 
Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(0)) - ) + Attribute("axes", TensorShape(0)), + ), ](t0, expected_0) # dim = 1 @@ -356,15 +452,19 @@ fn test_SLICE_step() raises: for i in range(2): for j in range(3): for k in range(2): - expected_1[i*3*2 + j*2 + k] = i * 10 * 2 + (j*2 + 1) * 2 + k + expected_1[i * 3 * 2 + j * 2 + k] = ( + i * 10 * 2 + (j * 2 + 1) * 2 + k + ) test_unary_op[ - OP.SLICE, t1_shape, AttributeVector( + OP.SLICE, + t1_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(1)) - ) + Attribute("axes", TensorShape(1)), + ), ](t1, expected_1) # dim = 2 @@ -377,15 +477,19 @@ fn test_SLICE_step() raises: for i in range(2): for j in range(2): for k in range(3): - expected_2[i*2*3 + j*3 + k] = i * 2 * 10 + j * 10 + (k*2 + 1) + expected_2[i * 2 * 3 + j * 3 + k] = ( + i * 2 * 10 + j * 10 + (k * 2 + 1) + ) test_unary_op[ - OP.SLICE, t2_shape, AttributeVector( + OP.SLICE, + t2_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(2)) - ) + Attribute("axes", TensorShape(2)), + ), ](t2, expected_2) @@ -402,15 +506,19 @@ fn test_SLICE_neg() raises: for i in range(3): for j in range(2): for k in range(2): - expected_0[i*2*2 + j*2 + k] = StaticIntTuple[3](6, 4, 2)[i] * 2 * 2 + j * 2 + k + expected_0[i * 2 * 2 + j * 2 + k] = ( + StaticIntTuple[3](6, 4, 2)[i] * 2 * 2 + j * 2 + k + ) test_unary_op[ - OP.SLICE, t0_shape, AttributeVector( + OP.SLICE, + t0_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(0)) - ) + Attribute("axes", TensorShape(0)), + ), ](t0, expected_0) # dim = 1 @@ -423,15 +531,19 @@ fn test_SLICE_neg() raises: for i in range(2): for j in range(3): for k in range(2): - expected_1[i*3*2 + j*2 + k] = i * 10 * 2 + StaticIntTuple[3](6, 4, 2)[j] * 2 + k + expected_1[i * 3 * 2 + j * 2 + k] = ( + i * 10 * 2 + StaticIntTuple[3](6, 4, 2)[j] * 2 + k + ) test_unary_op[ - OP.SLICE, t1_shape, AttributeVector( + OP.SLICE, + t1_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(1)) - ) + Attribute("axes", TensorShape(1)), + ), ](t1, expected_1) # dim = 2 @@ -444,15 +556,19 @@ fn test_SLICE_neg() raises: for i in range(2): for j in range(2): for k in range(3): - expected_2[i*2*3 + j*3 + k] = i * 2 * 10 + j * 10 + StaticIntTuple[3](6, 4, 2)[k] + expected_2[i * 2 * 3 + j * 3 + k] = ( + i * 2 * 10 + j * 10 + StaticIntTuple[3](6, 4, 2)[k] + ) test_unary_op[ - OP.SLICE, t2_shape, AttributeVector( + OP.SLICE, + t2_shape, + AttributeVector( Attribute("starts", TensorShape(slice.start)), Attribute("ends", TensorShape(slice.end)), Attribute("steps", TensorShape(slice.step)), - Attribute("axes", TensorShape(2)) - ) + Attribute("axes", TensorShape(2)), + ), ](t2, expected_2) @@ -470,22 +586,35 @@ fn test_SLICE_multiple_axes() raises: for i in range(3): for j in range(3): for k in range(5): - expected[i*3*5 + j*5 + k] = StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 + StaticIntTuple[3](3, 6, 9)[j] * 40 + StaticIntTuple[5](5, 7, 
9, 11, 13)[k] - + expected[i * 3 * 5 + j * 5 + k] = ( + StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 + + StaticIntTuple[3](3, 6, 9)[j] * 40 + + StaticIntTuple[5](5, 7, 9, 11, 13)[k] + ) + test_unary_op[ - OP.SLICE, t1_shape, AttributeVector( - Attribute("starts", TensorShape(slice_0.start, slice_1.start, slice_2.start)), - Attribute("ends", TensorShape(slice_0.end, slice_1.end, slice_2.end)), - Attribute("steps", TensorShape(slice_0.step, slice_1.step, slice_2.step)), + OP.SLICE, + t1_shape, + AttributeVector( + Attribute( + "starts", + TensorShape(slice_0.start, slice_1.start, slice_2.start), + ), + Attribute( + "ends", TensorShape(slice_0.end, slice_1.end, slice_2.end) + ), + Attribute( + "steps", TensorShape(slice_0.step, slice_1.step, slice_2.step) + ), # Attribute("axes", TensorShape(0, 1, 2)) - ) + ), ](t1, expected) alias t2_shape = TensorShape(20, 32, 40, 50) var t2: Tensor[dtype] = Tensor[dtype](t2_shape) for i in range(t2.num_elements()): t2[i] = i - + alias slice_2_1 = Slice(1, 6, 2) alias slice_2_2 = Slice(3, 10, 3) alias slice_2_3 = Slice(5, 15, 2) @@ -497,14 +626,42 @@ fn test_SLICE_multiple_axes() raises: for j in range(3): for k in range(5): for l in range(4): - expected_2[i*3*5*4 + j*5*4 + k*4 + l] = StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 * 50 + StaticIntTuple[3](3, 6, 9)[j] * 40 * 50 + StaticIntTuple[5](5, 7, 9, 11, 13)[k] * 50 + StaticIntTuple[4](7, 11, 15, 19)[l] - + expected_2[i * 3 * 5 * 4 + j * 5 * 4 + k * 4 + l] = ( + StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 * 50 + + StaticIntTuple[3](3, 6, 9)[j] * 40 * 50 + + StaticIntTuple[5](5, 7, 9, 11, 13)[k] * 50 + + StaticIntTuple[4](7, 11, 15, 19)[l] + ) + test_unary_op[ - OP.SLICE, t2_shape, AttributeVector( - Attribute("starts", TensorShape(slice_2_1.start, slice_2_2.start, slice_2_3.start, slice_2_4.start)), - Attribute("ends", TensorShape(slice_2_1.end, slice_2_2.end, slice_2_3.end, slice_2_4.end)), - Attribute("steps", TensorShape(slice_2_1.step, slice_2_2.step, slice_2_3.step, slice_2_4.step)), - ) + OP.SLICE, + t2_shape, + AttributeVector( + Attribute( + "starts", + TensorShape( + slice_2_1.start, + slice_2_2.start, + slice_2_3.start, + slice_2_4.start, + ), + ), + Attribute( + "ends", + TensorShape( + slice_2_1.end, slice_2_2.end, slice_2_3.end, slice_2_4.end + ), + ), + Attribute( + "steps", + TensorShape( + slice_2_1.step, + slice_2_2.step, + slice_2_3.step, + slice_2_4.step, + ), + ), + ), ](t2, expected_2) @@ -523,15 +680,18 @@ fn test_backward_SLICE() raises: for i in range(2): for j in range(4): for k in range(5): - expected_ug0[(i+1)*4*5 + j*5 + k] = 1.0 + expected_ug0[(i + 1) * 4 * 5 + j * 5 + k] = 1.0 test_unary_op_backward[ - OP.SLICE, t0_shape, ug0_shape, AttributeVector( + OP.SLICE, + t0_shape, + ug0_shape, + AttributeVector( Attribute("starts", TensorShape(slice_0.start)), Attribute("ends", TensorShape(slice_0.end)), Attribute("steps", TensorShape(slice_0.step)), - Attribute("axes", TensorShape(0)) - ) + Attribute("axes", TensorShape(0)), + ), ](t0, ug0, expected_ug0) # dim = 1 (step = 2) @@ -543,20 +703,23 @@ fn test_backward_SLICE() raises: alias ug1_shape = TensorShape(2, 3, 2) var ug1: Tensor[dtype] = Tensor[dtype](ug1_shape) fill(ug1, 1.0) - + var expected_ug1 = Tensor[dtype](t1_shape) for i in range(2): for j in range(3): for k in range(2): - expected_ug1[i*10*2 + (j*2 + 1)*2 + k] = 1.0 + expected_ug1[i * 10 * 2 + (j * 2 + 1) * 2 + k] = 1.0 test_unary_op_backward[ - OP.SLICE, t1_shape, ug1_shape, AttributeVector( + OP.SLICE, + t1_shape, + ug1_shape, + AttributeVector( 
Attribute("starts", TensorShape(slice_1.start)), Attribute("ends", TensorShape(slice_1.end)), Attribute("steps", TensorShape(slice_1.step)), - Attribute("axes", TensorShape(1)) - ) + Attribute("axes", TensorShape(1)), + ), ](t1, ug1, expected_ug1) # dim = 2 (step = -2) @@ -573,15 +736,20 @@ fn test_backward_SLICE() raises: for i in range(2): for j in range(2): for k in range(3): - expected_ug2[i*2*10 + j*10 + StaticIntTuple[3](6, 4, 2)[k]] = 1.0 + expected_ug2[ + i * 2 * 10 + j * 10 + StaticIntTuple[3](6, 4, 2)[k] + ] = 1.0 test_unary_op_backward[ - OP.SLICE, t2_shape, ug2_shape, AttributeVector( + OP.SLICE, + t2_shape, + ug2_shape, + AttributeVector( Attribute("starts", TensorShape(slice_2.start)), Attribute("ends", TensorShape(slice_2.end)), Attribute("steps", TensorShape(slice_2.step)), - Attribute("axes", TensorShape(2)) - ) + Attribute("axes", TensorShape(2)), + ), ](t2, ug2, expected_ug2) @@ -599,8 +767,12 @@ fn test_backward_SLICE_multiple_axes() raises: for i in range(3): for j in range(3): for k in range(5): - expected[i*3*5 + j*5 + k] = StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 + StaticIntTuple[3](3, 6, 9)[j] * 40 + StaticIntTuple[5](5, 7, 9, 11, 13)[k] - + expected[i * 3 * 5 + j * 5 + k] = ( + StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 + + StaticIntTuple[3](3, 6, 9)[j] * 40 + + StaticIntTuple[5](5, 7, 9, 11, 13)[k] + ) + alias ug_shape = TensorShape(3, 3, 5) var ug: Tensor[dtype] = Tensor[dtype](ug_shape) fill(ug, 1.0) @@ -609,14 +781,28 @@ fn test_backward_SLICE_multiple_axes() raises: for i in range(3): for j in range(3): for k in range(5): - expected_ug[StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 + StaticIntTuple[3](3, 6, 9)[j] * 40 + StaticIntTuple[5](5, 7, 9, 11, 13)[k]] = 1.0 + expected_ug[ + StaticIntTuple[5](1, 3, 5, 7, 9)[i] * 32 * 40 + + StaticIntTuple[3](3, 6, 9)[j] * 40 + + StaticIntTuple[5](5, 7, 9, 11, 13)[k] + ] = 1.0 test_unary_op_backward[ - OP.SLICE, t1_shape, ug_shape, AttributeVector( - Attribute("starts", TensorShape(slice_0.start, slice_1.start, slice_2.start)), - Attribute("ends", TensorShape(slice_0.end, slice_1.end, slice_2.end)), - Attribute("steps", TensorShape(slice_0.step, slice_1.step, slice_2.step)), - ) + OP.SLICE, + t1_shape, + ug_shape, + AttributeVector( + Attribute( + "starts", + TensorShape(slice_0.start, slice_1.start, slice_2.start), + ), + Attribute( + "ends", TensorShape(slice_0.end, slice_1.end, slice_2.end) + ), + Attribute( + "steps", TensorShape(slice_0.step, slice_1.step, slice_2.step) + ), + ), ](t1, ug, expected_ug) @@ -646,8 +832,6 @@ fn test_INDEX() raises: ) ](t, expected) - print(expected) - fn test_INDEX_backward() raises: alias t1_shape = TensorShape(2, 3, 5) @@ -676,9 +860,6 @@ fn test_INDEX_backward() raises: ) ](t, ug, expected) - print(expected) - - fn test_UPSAMPLE() raises: alias t1_shape = TensorShape(2, 3, 5) var t = Tensor[dtype](t1_shape) @@ -723,17 +904,17 @@ fn test_UPSAMPLE() raises: fn main(): try: - # test_SIGMOID() - # test_RELU() - # test_TANH() - # test_CLIP() - # test_SQUEEZE() - # test_UNSQUEEZE() - # test_SLICE() - # test_SLICE_step() - # test_SLICE_neg() - # test_SLICE_multiple_axes() - # test_INDEX() + test_SIGMOID() + test_RELU() + test_TANH() + test_CLIP() + test_SQUEEZE() + test_UNSQUEEZE() + test_SLICE() + test_SLICE_step() + test_SLICE_neg() + test_SLICE_multiple_axes() + test_INDEX() test_UPSAMPLE() except e: print("[ERROR] Error in forward mlops") @@ -741,15 +922,15 @@ fn main(): return try: - # test_backward_SIGMOID() - # test_backward_RELU() - # test_backward_TANH() - # 
test_backward_CLIP() - # test_backward_SQUEEZE() - # test_backward_UNSQUEEZE() - # test_backward_SLICE() - # test_backward_SLICE_multiple_axes() - # test_INDEX_backward() + test_backward_SIGMOID() + test_backward_RELU() + test_backward_TANH() + test_backward_CLIP() + test_backward_SQUEEZE() + test_backward_UNSQUEEZE() + test_backward_SLICE() + test_backward_SLICE_multiple_axes() + test_INDEX_backward() pass except e: print("[ERROR] Error in backward mlops") diff --git a/tests/mojo/test_tensorutils.mojo b/tests/mojo/test_tensorutils.mojo index 6c3dedf..7ea9cc0 100644 --- a/tests/mojo/test_tensorutils.mojo +++ b/tests/mojo/test_tensorutils.mojo @@ -1,6 +1,6 @@ from random import rand from testing import assert_equal, assert_almost_equal -from math import sqrt, exp, round, add, sub, mul, div +from math import sqrt, exp from basalt import dtype, nelts from basalt.autograd.ops.matmul import dot @@ -20,6 +20,7 @@ from basalt.utils.tensorutils import ( transpose, ) from basalt.nn import Tensor, TensorShape +from basalt.utils.math_util import add, sub, mul, div, round_simd from tests import assert_tensors_equal @@ -81,7 +82,7 @@ fn test_elwise_transform() raises: assert_tensors_equal(B_res, C) var C_res = Tensor[dtype](2, 10) - elwise_transform[round](C_res, C) + elwise_transform[round_simd](C_res, C) assert_tensors_equal(C_res, D) diff --git a/tests/mojo/test_tensorutils_data.mojo b/tests/mojo/test_tensorutils_data.mojo index 4cf956e..3a7466f 100644 --- a/tests/mojo/test_tensorutils_data.mojo +++ b/tests/mojo/test_tensorutils_data.mojo @@ -1,8 +1,7 @@ -from math import add - from basalt import dtype, nelts from basalt.nn import Tensor, TensorShape from basalt.utils.tensorutils import fill, elwise_op +from basalt.utils.math_util import add fn generate_tensor(*shape: Int) -> Tensor[dtype]: diff --git a/tests/python/test_mlops_torch.mojo b/tests/python/test_mlops_torch.mojo index 69a4a12..ba6288f 100644 --- a/tests/python/test_mlops_torch.mojo +++ b/tests/python/test_mlops_torch.mojo @@ -1,5 +1,5 @@ from random import rand -from math.limit import min_finite, max_finite +from utils.numerics import min_finite, max_finite from collections.optional import OptionalReg, Optional from python.python import Python from python.object import PythonObject @@ -47,6 +47,11 @@ fn torch_unary_op( expected = torch.sigmoid(input_1) elif op == OP.RELU: expected = torch.relu(input_1) + elif op == OP.LEAKYRELU: + expected = torch.nn.functional.leaky_relu( + input_1, + attrs.value()["negative_slope"].value().to_scalar[dtype](), + ) elif op == OP.TANH: expected = torch.tanh(input_1) elif op == OP.CLIP: @@ -65,7 +70,9 @@ fn torch_unary_op( var dim = attrs["dims"] if dim: - expected = torch.squeeze(input_1, dim=dim.value().to_shape()[0]) + expected = torch.squeeze( + input_1, dim=dim.value().to_shape()[0] + ) else: expected = torch.squeeze(input_1) elif attrs_tuple: @@ -78,7 +85,9 @@ fn torch_unary_op( var dim = attrs["dims"] if dim: - expected = torch.unsqueeze(input_1, dim=dim.value().to_shape()[0]) + expected = torch.unsqueeze( + input_1, dim=dim.value().to_shape()[0] + ) else: expected = torch.unsqueeze(input_1, 0) elif attrs_tuple: @@ -102,11 +111,11 @@ fn torch_unary_op( if step < 0: flip_dims.append(dim) - step = step *- 1 + step = step * -1 end, start = (end + 1) * -1, (start + 1) * -1 indices[dim] = py.slice(start, end, step) - + expected = input_1.flip(flip_dims)[indices] elif op == OP.UPSAMPLE: var attrs = attrs.value() @@ -171,6 +180,31 @@ fn test_RELU() raises: ) +fn test_LEAKYRELU() raises: + alias t1_shape = 
TensorShape(37, 63, 107) + alias ug_shape = TensorShape(37, 63, 107) + var t1: Tensor[dtype] = Tensor[dtype](t1_shape) + rand(t1.data(), t1.num_elements()) + + var ug = Tensor[dtype](ug_shape) + rand(ug.data(), ug.num_elements()) + + var expected_and_grad = torch_unary_op( + OP.LEAKYRELU, t1, ug, AttributeVector(Attribute("negative_slope", Float32(0.1))) + ) + test_unary_op[ + OP.LEAKYRELU, + t1_shape, + AttributeVector(Attribute("negative_slope", 0.1)), + ](t1, expected_and_grad.expected) + test_unary_op_backward[ + OP.LEAKYRELU, + t1_shape, + ug_shape, + AttributeVector(Attribute("negative_slope", 0.1)), + ](t1, ug, expected_and_grad.grad_1) + + fn test_TANH() raises: alias t1_shape = TensorShape(37, 63, 107) alias ug_shape = TensorShape(37, 63, 107) @@ -205,23 +239,27 @@ fn test_CLIP() raises: # Clip with min alias min_attr = Attribute("min", 0.3333) - expected_and_grad = torch_unary_op(OP.CLIP, t1, ug, AttributeVector(min_attr)) + expected_and_grad = torch_unary_op( + OP.CLIP, t1, ug, AttributeVector(min_attr) + ) test_unary_op[OP.CLIP, t1_shape, AttributeVector(min_attr)]( t1, expected_and_grad.expected ) - test_unary_op_backward[OP.CLIP, t1_shape, ug_shape, AttributeVector(min_attr)]( - t1, ug, expected_and_grad.grad_1 - ) + test_unary_op_backward[ + OP.CLIP, t1_shape, ug_shape, AttributeVector(min_attr) + ](t1, ug, expected_and_grad.grad_1) # Clip with max alias max_attr = Attribute("max", 0.6666) - expected_and_grad = torch_unary_op(OP.CLIP, t1, ug, AttributeVector(max_attr)) + expected_and_grad = torch_unary_op( + OP.CLIP, t1, ug, AttributeVector(max_attr) + ) test_unary_op[OP.CLIP, t1_shape, AttributeVector(max_attr)]( t1, expected_and_grad.expected ) - test_unary_op_backward[OP.CLIP, t1_shape, ug_shape, AttributeVector(max_attr)]( - t1, ug, expected_and_grad.grad_1 - ) + test_unary_op_backward[ + OP.CLIP, t1_shape, ug_shape, AttributeVector(max_attr) + ](t1, ug, expected_and_grad.grad_1) # Clip with min and max expected_and_grad = torch_unary_op( @@ -261,9 +299,9 @@ fn test_SQUEEZE() raises: test_unary_op[OP.SQUEEZE, t1_shape, AttributeVector(dim)]( t1, expected_and_grad.expected ) - test_unary_op_backward[OP.SQUEEZE, t1_shape, ug_shape_1, AttributeVector(dim)]( - t1, ug, expected_and_grad.grad_1 - ) + test_unary_op_backward[ + OP.SQUEEZE, t1_shape, ug_shape_1, AttributeVector(dim) + ](t1, ug, expected_and_grad.grad_1) alias ug_shape_2 = TensorShape(20, 28, 1) ug = Tensor[dtype](ug_shape_2) @@ -271,13 +309,15 @@ fn test_SQUEEZE() raises: alias dim_2 = Attribute("dims", TensorShape(1)) - expected_and_grad = torch_unary_op(OP.SQUEEZE, t1, ug, AttributeVector(dim_2)) + expected_and_grad = torch_unary_op( + OP.SQUEEZE, t1, ug, AttributeVector(dim_2) + ) test_unary_op[OP.SQUEEZE, t1_shape, AttributeVector(dim_2)]( t1, expected_and_grad.expected ) - test_unary_op_backward[OP.SQUEEZE, t1_shape, ug_shape_2, AttributeVector(dim_2)]( - t1, ug, expected_and_grad.grad_1 - ) + test_unary_op_backward[ + OP.SQUEEZE, t1_shape, ug_shape_2, AttributeVector(dim_2) + ](t1, ug, expected_and_grad.grad_1) # Squeeze with multiple dims ug = Tensor[dtype](ug_shape) @@ -294,9 +334,9 @@ fn test_SQUEEZE() raises: test_unary_op[OP.SQUEEZE, t1_shape, AttributeVector(dims)]( t1, expected_and_grad.expected ) - test_unary_op_backward[OP.SQUEEZE, t1_shape, ug_shape, AttributeVector(dims)]( - t1, ug, expected_and_grad.grad_1 - ) + test_unary_op_backward[ + OP.SQUEEZE, t1_shape, ug_shape, AttributeVector(dims) + ](t1, ug, expected_and_grad.grad_1) fn test_UNSQUEEZE() raises: @@ -310,13 +350,15 @@ fn 
test_UNSQUEEZE() raises: alias dim = Attribute("dims", TensorShape(1)) - var expected_and_grad = torch_unary_op(OP.UNSQUEEZE, t1, ug, AttributeVector(dim)) + var expected_and_grad = torch_unary_op( + OP.UNSQUEEZE, t1, ug, AttributeVector(dim) + ) test_unary_op[OP.UNSQUEEZE, t1_shape, AttributeVector(dim)]( t1, expected_and_grad.expected ) - test_unary_op_backward[OP.UNSQUEEZE, t1_shape, ug_shape, AttributeVector(dim)]( - t1, ug, expected_and_grad.grad_1 - ) + test_unary_op_backward[ + OP.UNSQUEEZE, t1_shape, ug_shape, AttributeVector(dim) + ](t1, ug, expected_and_grad.grad_1) # Unsqueeze with multiple dims alias ug_shape_2 = TensorShape(20, 1, 28, 1) @@ -333,9 +375,9 @@ fn test_UNSQUEEZE() raises: test_unary_op[OP.UNSQUEEZE, t1_shape, AttributeVector(dims)]( t1, expected_and_grad.expected ) - test_unary_op_backward[OP.UNSQUEEZE, t1_shape, ug_shape_2, AttributeVector(dims)]( - t1, ug, expected_and_grad.grad_1 - ) + test_unary_op_backward[ + OP.UNSQUEEZE, t1_shape, ug_shape_2, AttributeVector(dims) + ](t1, ug, expected_and_grad.grad_1) fn test_SLICE() raises: @@ -349,17 +391,23 @@ fn test_SLICE() raises: Attribute("starts", TensorShape(slice_0.start)), Attribute("ends", TensorShape(slice_0.end)), Attribute("steps", TensorShape(slice_0.step)), - Attribute("axes", TensorShape(0)) + Attribute("axes", TensorShape(0)), ) alias ug_shape = TensorShape(65, 322, 317) var ug = Tensor[dtype](ug_shape) rand(ug.data(), ug.num_elements()) - var attrs_tuple_0 = PythonObject((slice_0.start, slice_0.end, slice_0.step, 0)) - var expected_and_grad = torch_unary_op(OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_0) + var attrs_tuple_0 = PythonObject( + (slice_0.start, slice_0.end, slice_0.step, 0) + ) + var expected_and_grad = torch_unary_op( + OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_0 + ) test_unary_op[OP.SLICE, t1_shape, attrs_0](t1, expected_and_grad.expected) - test_unary_op_backward[OP.SLICE, t1_shape, ug_shape, attrs_0](t1, ug, expected_and_grad.grad_1) + test_unary_op_backward[OP.SLICE, t1_shape, ug_shape, attrs_0]( + t1, ug, expected_and_grad.grad_1 + ) # dim = 1 alias slice_1 = Slice(10, 311, 5) @@ -367,17 +415,23 @@ fn test_SLICE() raises: Attribute("starts", TensorShape(slice_1.start)), Attribute("ends", TensorShape(slice_1.end)), Attribute("steps", TensorShape(slice_1.step)), - Attribute("axes", TensorShape(1)) + Attribute("axes", TensorShape(1)), ) alias ug_shape_1 = TensorShape(430, 61, 317) ug = Tensor[dtype](ug_shape_1) rand(ug.data(), ug.num_elements()) - var attrs_tuple_1 = PythonObject((slice_1.start, slice_1.end, slice_1.step, 1)) - expected_and_grad = torch_unary_op(OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_1) + var attrs_tuple_1 = PythonObject( + (slice_1.start, slice_1.end, slice_1.step, 1) + ) + expected_and_grad = torch_unary_op( + OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_1 + ) test_unary_op[OP.SLICE, t1_shape, attrs_1](t1, expected_and_grad.expected) - test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_1, attrs_1](t1, ug, expected_and_grad.grad_1) + test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_1, attrs_1]( + t1, ug, expected_and_grad.grad_1 + ) # dim = 2 alias slice_2 = Slice(293, 33, -7) @@ -385,20 +439,26 @@ fn test_SLICE() raises: Attribute("starts", TensorShape(slice_2.start)), Attribute("ends", TensorShape(slice_2.end)), Attribute("steps", TensorShape(slice_2.step)), - Attribute("axes", TensorShape(2)) + Attribute("axes", TensorShape(2)), ) alias ug_shape_2 = TensorShape(430, 322, 38) ug = Tensor[dtype](ug_shape_2) rand(ug.data(), ug.num_elements()) - var attrs_tuple_2 = 
PythonObject((slice_2.start, slice_2.end, slice_2.step, 2)) - expected_and_grad = torch_unary_op(OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_2) + var attrs_tuple_2 = PythonObject( + (slice_2.start, slice_2.end, slice_2.step, 2) + ) + expected_and_grad = torch_unary_op( + OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_2 + ) test_unary_op[OP.SLICE, t1_shape, attrs_2](t1, expected_and_grad.expected) - test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_2, attrs_2](t1, ug, expected_and_grad.grad_1) + test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_2, attrs_2]( + t1, ug, expected_and_grad.grad_1 + ) # Multiple dims - + # dim = 0, 1 alias slice_0_1 = Slice(23, 340, 3) alias slice_1_1 = Slice(10, 250, 5) @@ -407,17 +467,32 @@ fn test_SLICE() raises: Attribute("starts", TensorShape(slice_0_1.start, slice_1_1.start)), Attribute("ends", TensorShape(slice_0_1.end, slice_1_1.end)), Attribute("steps", TensorShape(slice_0_1.step, slice_1_1.step)), - Attribute("axes", TensorShape(0, 1)) + Attribute("axes", TensorShape(0, 1)), ) alias ug_shape_0_1 = TensorShape(106, 48, 317) ug = Tensor[dtype](ug_shape_0_1) rand(ug.data(), ug.num_elements()) - var attrs_tuple_0_1 = PythonObject((slice_0_1.start, slice_0_1.end, slice_0_1.step, 0, slice_1_1.start, slice_1_1.end, slice_1_1.step, 1)) - expected_and_grad = torch_unary_op(OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_0_1) + var attrs_tuple_0_1 = PythonObject( + ( + slice_0_1.start, + slice_0_1.end, + slice_0_1.step, + 0, + slice_1_1.start, + slice_1_1.end, + slice_1_1.step, + 1, + ) + ) + expected_and_grad = torch_unary_op( + OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_0_1 + ) test_unary_op[OP.SLICE, t1_shape, attrs_0_1](t1, expected_and_grad.expected) - test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_0_1, attrs_0_1](t1, ug, expected_and_grad.grad_1) + test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_0_1, attrs_0_1]( + t1, ug, expected_and_grad.grad_1 + ) # dim = 0, 1, 2 alias slice_0_2 = Slice(-412, -5, 3) @@ -425,20 +500,46 @@ fn test_SLICE() raises: alias slice_2_2 = Slice(293, 33, -7) alias attrs_0_2 = AttributeVector( - Attribute("starts", TensorShape(slice_0_2.start, slice_1_2.start, slice_2_2.start)), - Attribute("ends", TensorShape(slice_0_2.end, slice_1_2.end, slice_2_2.end)), - Attribute("steps", TensorShape(slice_0_2.step, slice_1_2.step, slice_2_2.step)), - Attribute("axes", TensorShape(0, 1, 2)) + Attribute( + "starts", + TensorShape(slice_0_2.start, slice_1_2.start, slice_2_2.start), + ), + Attribute( + "ends", TensorShape(slice_0_2.end, slice_1_2.end, slice_2_2.end) + ), + Attribute( + "steps", TensorShape(slice_0_2.step, slice_1_2.step, slice_2_2.step) + ), + Attribute("axes", TensorShape(0, 1, 2)), ) alias ug_shape_0_2 = TensorShape(136, 35, 38) ug = Tensor[dtype](ug_shape_0_2) rand(ug.data(), ug.num_elements()) - var attrs_tuple_0_2 = PythonObject((slice_0_2.start, slice_0_2.end, slice_0_2.step, 0, slice_1_2.start, slice_1_2.end, slice_1_2.step, 1, slice_2_2.start, slice_2_2.end, slice_2_2.step, 2)) - expected_and_grad = torch_unary_op(OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_0_2) + var attrs_tuple_0_2 = PythonObject( + ( + slice_0_2.start, + slice_0_2.end, + slice_0_2.step, + 0, + slice_1_2.start, + slice_1_2.end, + slice_1_2.step, + 1, + slice_2_2.start, + slice_2_2.end, + slice_2_2.step, + 2, + ) + ) + expected_and_grad = torch_unary_op( + OP.SLICE, t1, ug, attrs_tuple=attrs_tuple_0_2 + ) test_unary_op[OP.SLICE, t1_shape, attrs_0_2](t1, expected_and_grad.expected) - test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_0_2, attrs_0_2](t1, ug, 
expected_and_grad.grad_1) + test_unary_op_backward[OP.SLICE, t1_shape, ug_shape_0_2, attrs_0_2]( + t1, ug, expected_and_grad.grad_1 + ) fn test_UPSAMPLE() raises: @@ -510,13 +611,14 @@ fn test_UPSAMPLE() raises: fn main(): print("Running mlops (compare with torch) tests") try: - # test_SIGMOID() - # test_RELU() - # test_TANH() - # test_CLIP() - # test_SQUEEZE() - # test_UNSQUEEZE() - # test_SLICE() + test_SIGMOID() + test_RELU() + test_LEAKYRELU() + test_TANH() + test_CLIP() + test_SQUEEZE() + test_UNSQUEEZE() + test_SLICE() test_UPSAMPLE() except e: print("[ERROR] Error in mlops (compare with torch)") diff --git a/tests/python/test_models_mnist.mojo b/tests/python/test_models_mnist.mojo index 85dd47d..5a0312d 100644 --- a/tests/python/test_models_mnist.mojo +++ b/tests/python/test_models_mnist.mojo @@ -120,7 +120,7 @@ fn run_mojo[ ) var model = Model[graph]() - var optim = optim.Adam[graph](Reference(model.parameters), lr=learning_rate) + var optim = optim.Adam[graph](model.parameters, lr=learning_rate) var losses = List[Scalar[dtype]]() diff --git a/tests/python/test_models_regression.mojo b/tests/python/test_models_regression.mojo index cc88444..1a36b77 100644 --- a/tests/python/test_models_regression.mojo +++ b/tests/python/test_models_regression.mojo @@ -1,6 +1,6 @@ from random import rand from python import Python -from math.limit import max_finite +from utils.numerics import max_finite from testing import assert_almost_equal from basalt import dtype diff --git a/tests/python/test_models_sin_estimate.mojo b/tests/python/test_models_sin_estimate.mojo index fe6e2a4..9b59231 100644 --- a/tests/python/test_models_sin_estimate.mojo +++ b/tests/python/test_models_sin_estimate.mojo @@ -1,6 +1,6 @@ from random import rand from python import Python -from math.limit import max_finite +from utils.numerics import max_finite from testing import assert_almost_equal from basalt import dtype @@ -81,7 +81,7 @@ fn run_mojo[ ) var model = Model[graph]() - var optim = optim.Adam[graph](Reference(model.parameters), lr=learning_rate) + var optim = optim.Adam[graph](model.parameters, lr=learning_rate) var losses = List[Scalar[dtype]]()
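For context on the two optimizer hunks above: `optim.Adam` now takes `model.parameters` directly, and the explicit `Reference(...)` wrapper at the call site is gone. A minimal sketch of the updated training setup, assuming the loop methods used elsewhere in Basalt's examples (`forward`, `zero_grad`, `backward`, `step`); `graph`, `batch_x`, `batch_y`, `learning_rate`, and `num_epochs` are placeholders:

    var model = Model[graph]()
    var optim = optim.Adam[graph](model.parameters, lr=learning_rate)

    for epoch in range(num_epochs):
        # Forward pass returns the loss tensor; backward/step update the parameters.
        var loss = model.forward(batch_x, batch_y)
        optim.zero_grad()
        model.backward()
        optim.step()
        print("epoch:", epoch, "loss:", loss[0])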