diff --git a/numojo/__init__.mojo b/numojo/__init__.mojo
index 6443793d..9488b874 100644
--- a/numojo/__init__.mojo
+++ b/numojo/__init__.mojo
@@ -22,30 +22,43 @@ from numojo.core.complex.complex_dtype import (
     ci16,
     ci32,
     ci64,
-    cisize,
-    cintp,
+    ci128,
+    ci256,
+    cint,
     cu8,
     cu16,
     cu32,
     cu64,
+    cu128,
+    cu256,
+    cuint,
+    cbf16,
     cf16,
     cf32,
     cf64,
     cboolean,
+    cinvalid,
 )
 from numojo.core.datatypes import (
     i8,
     i16,
     i32,
     i64,
-    isize,
+    i128,
+    i256,
+    int,
     u8,
     u16,
     u32,
     u64,
+    u128,
+    u256,
+    uint,
+    bf16,
     f16,
     f32,
     f64,
+    boolean,
 )
 from numojo.core.error import (
     ShapeError,
@@ -195,7 +208,7 @@ from numojo.routines.creation import (
 )
 
 from numojo.routines import indexing
-from numojo.routines.indexing import where, compress, take_along_axis
+from numojo.routines.indexing import `where`, compress, take_along_axis
 
 from numojo.routines.functional import apply_along_axis
 
diff --git a/numojo/core/__init__.mojo b/numojo/core/__init__.mojo
index c1223e63..e90a56d7 100644
--- a/numojo/core/__init__.mojo
+++ b/numojo/core/__init__.mojo
@@ -16,30 +16,42 @@ from .complex import (
     ci16,
     ci32,
     ci64,
-    cisize,
-    cintp,
+    ci128,
+    ci256,
+    cint,
     cu8,
     cu16,
     cu32,
     cu64,
+    cu128,
+    cu256,
+    cuint,
+    cbf16,
     cf16,
     cf32,
     cf64,
     cboolean,
+    cinvalid,
 )
 
 from .datatypes import (
     i8,
-    i16,
-    i32,
     i64,
+    i128,
+    i256,
+    int,
     u8,
     u16,
     u32,
     u64,
+    u128,
+    u256,
+    uint,
+    bf16,
     f16,
     f32,
     f64,
+    boolean,
 )
 
 from .error import (
diff --git a/numojo/core/complex/__init__.mojo b/numojo/core/complex/__init__.mojo
index e0bf1271..243acad3 100644
--- a/numojo/core/complex/__init__.mojo
+++ b/numojo/core/complex/__init__.mojo
@@ -6,14 +6,20 @@ from .complex_dtype import (
     ci16,
     ci32,
     ci64,
-    cisize,
-    cintp,
+    ci128,
+    ci256,
+    cint,
     cu8,
     cu16,
     cu32,
     cu64,
+    cu128,
+    cu256,
+    cuint,
+    cbf16,
     cf16,
     cf32,
     cf64,
     cboolean,
+    cinvalid,
 )
diff --git a/numojo/core/complex/complex_dtype.mojo b/numojo/core/complex/complex_dtype.mojo
index 8080a87f..38c88c5a 100644
--- a/numojo/core/complex/complex_dtype.mojo
+++ b/numojo/core/complex/complex_dtype.mojo
@@ -12,7 +12,7 @@
 from hashlib.hasher import Hasher
 from os import abort
 from sys import CompilationTarget
-from sys.info import bitwidthof, size_of
+from sys.info import bit_width_of, size_of
 from sys.intrinsics import _type_is_eq
 
 alias _mIsSigned = UInt8(1)
@@ -22,34 +22,45 @@ alias _mIsFloat = UInt8(1 << 6)
 
 # rust like aliases for complex data types.
 alias ci8 = ComplexDType.int8
-"""Data type alias cfor ComplexDType.int8"""
+"""Data type alias for ComplexDType.int8"""
 alias ci16 = ComplexDType.int16
-"""Data type alias cfor ComplexDType.int16"""
+"""Data type alias for ComplexDType.int16"""
 alias ci32 = ComplexDType.int32
-"""Data type alias cfor ComplexDType.int32"""
+"""Data type alias for ComplexDType.int32"""
 alias ci64 = ComplexDType.int64
-"""Data type alias cfor ComplexDType.int64"""
-alias cisize = ComplexDType.index
-"""Data type alias cfor ComplexDType.index"""
-alias cintp = ComplexDType.index
-"""Data type alias cfor ComplexDType.index"""
+"""Data type alias for ComplexDType.int64"""
+alias ci128 = ComplexDType.int128
+"""Data type alias for ComplexDType.int128"""
+alias ci256 = ComplexDType.int256
+"""Data type alias for ComplexDType.int256"""
+alias cint = ComplexDType.int
+"""Data type alias for ComplexDType.int"""
 alias cu8 = ComplexDType.uint8
-"""Data type alias cfor ComplexDType.uint8"""
+"""Data type alias for ComplexDType.uint8"""
 alias cu16 = ComplexDType.uint16
-"""Data type alias cfor ComplexDType.uint16"""
+"""Data type alias for ComplexDType.uint16"""
 alias cu32 = ComplexDType.uint32
-"""Data type alias cfor ComplexDType.uint32"""
+"""Data type alias for ComplexDType.uint32"""
 alias cu64 = ComplexDType.uint64
-"""Data type alias cfor ComplexDType.uint64"""
+"""Data type alias for ComplexDType.uint64"""
+alias cu128 = ComplexDType.uint128
+"""Data type alias for ComplexDType.uint128"""
+alias cu256 = ComplexDType.uint256
+"""Data type alias for ComplexDType.uint256"""
+alias cuint = ComplexDType.uint
+"""Data type alias for ComplexDType.uint"""
+alias cbf16 = ComplexDType.bfloat16
+"""Data type alias for ComplexDType.bfloat16"""
 alias cf16 = ComplexDType.float16
-"""Data type alias cfor ComplexDType.float16"""
+"""Data type alias for ComplexDType.float16"""
 alias cf32 = ComplexDType.float32
-"""Data type alias cfor ComplexDType.float32"""
+"""Data type alias for ComplexDType.float32"""
 alias cf64 = ComplexDType.float64
-"""Data type alias cfor ComplexDType.float64"""
+"""Data type alias for ComplexDType.float64"""
 alias cboolean = ComplexDType.bool
-"""Data type alias cfor ComplexDType.bool"""
-
+"""Data type alias for ComplexDType.bool"""
+alias cinvalid = ComplexDType.invalid
+"""Data type alias for ComplexDType.invalid"""
 
 # ===----------------------------------------------------------------------=== #
 # Implements the Complex Datatype.
@@ -90,25 +101,27 @@ struct ComplexDType(
     # ===-------------------------------------------------------------------===#
     # Aliases
     # ===-------------------------------------------------------------------===#
-
+    # Refer to DType documentation for details on each data type.
     alias _mlir_type = __mlir_type.`!kgen.dtype`
-
     alias invalid = ComplexDType(
         mlir_value=__mlir_attr.`#kgen.dtype.constant<invalid> : !kgen.dtype`
     )
     alias bool = ComplexDType(
         mlir_value=__mlir_attr.`#kgen.dtype.constant<bool> : !kgen.dtype`
     )
-    alias index = ComplexDType(
+    alias int = ComplexDType(
         mlir_value=__mlir_attr.`#kgen.dtype.constant<index> : !kgen.dtype`
     )
-    alias uint1 = ComplexDType(
+    alias uint = ComplexDType(
+        mlir_value=__mlir_attr.`#kgen.dtype.constant<uindex> : !kgen.dtype`
+    )
+    alias _uint1 = ComplexDType(
         mlir_value=__mlir_attr.`#kgen.dtype.constant<ui1> : !kgen.dtype`
     )
-    alias uint2 = ComplexDType(
+    alias _uint2 = ComplexDType(
         mlir_value=__mlir_attr.`#kgen.dtype.constant<ui2> : !kgen.dtype`
     )
-    alias uint4 = ComplexDType(
+    alias _uint4 = ComplexDType(
         mlir_value=__mlir_attr.`#kgen.dtype.constant<ui4> : !kgen.dtype`
     )
     alias uint8 = ComplexDType(
@@ -204,36 +217,30 @@ struct ComplexDType(
         """
         if str.startswith("ComplexDType."):
             return Self._from_str(str.removeprefix("ComplexDType."))
-        elif str == "bool":
-            return ComplexDType.bool
-        elif str == "index":
-            return ComplexDType.index
-
-        elif str == "uint8":
-            return ComplexDType.uint8
         elif str == "int8":
             return ComplexDType.int8
+        elif str == "int64":
+            return ComplexDType.int64
+        elif str == "int128":
+            return ComplexDType.int128
+        elif str == "int256":
+            return ComplexDType.int256
+        elif str == "int":
+            return ComplexDType.int
+        elif str == "uint8":
+            return ComplexDType.uint8
         elif str == "uint16":
             return ComplexDType.uint16
-        elif str == "int16":
-            return ComplexDType.int16
         elif str == "uint32":
             return ComplexDType.uint32
-        elif str == "int32":
-            return ComplexDType.int32
         elif str == "uint64":
             return ComplexDType.uint64
-        elif str == "int64":
-            return ComplexDType.int64
         elif str == "uint128":
             return ComplexDType.uint128
-        elif str == "int128":
-            return ComplexDType.int128
         elif str == "uint256":
             return ComplexDType.uint256
-        elif str == "int256":
-            return ComplexDType.int256
-
+        elif str == "uint":
+            return ComplexDType.uint
         elif str == "float8_e3m4":
             return ComplexDType.float8_e3m4
         elif str == "float8_e4m3fn":
@@ -244,7 +251,6 @@ struct ComplexDType(
             return ComplexDType.float8_e5m2
         elif str == "float8_e5m2fnuz":
             return ComplexDType.float8_e5m2fnuz
-
         elif str == "bfloat16":
             return ComplexDType.bfloat16
         elif str == "float16":
@@ -253,7 +259,8 @@ struct ComplexDType(
             return ComplexDType.float32
         elif str == "float64":
             return ComplexDType.float64
-
+        elif str == "bool":
+            return ComplexDType.bool
         else:
             return ComplexDType.invalid
 
@@ -276,35 +283,30 @@ struct ComplexDType(
             writer: The object to write to.
         """
 
-        if self is ComplexDType.bool:
-            return writer.write("bool")
-        elif self is ComplexDType.index:
-            return writer.write("index")
+        if self is ComplexDType.int8:
+            return writer.write("int8")
+        elif self is ComplexDType.int64:
+            return writer.write("int64")
+        elif self is ComplexDType.int128:
+            return writer.write("int128")
+        elif self is ComplexDType.int256:
+            return writer.write("int256")
+        elif self is ComplexDType.int:
+            return writer.write("int")
         elif self is ComplexDType.uint8:
             return writer.write("uint8")
-        elif self is ComplexDType.int8:
-            return writer.write("int8")
         elif self is ComplexDType.uint16:
             return writer.write("uint16")
-        elif self is ComplexDType.int16:
-            return writer.write("int16")
         elif self is ComplexDType.uint32:
             return writer.write("uint32")
-        elif self is ComplexDType.int32:
-            return writer.write("int32")
         elif self is ComplexDType.uint64:
             return writer.write("uint64")
-        elif self is ComplexDType.int64:
-            return writer.write("int64")
         elif self is ComplexDType.uint128:
             return writer.write("uint128")
-        elif self is ComplexDType.int128:
-            return writer.write("int128")
         elif self is ComplexDType.uint256:
             return writer.write("uint256")
-        elif self is ComplexDType.int256:
-            return writer.write("int256")
-
+        elif self is ComplexDType.uint:
+            return writer.write("uint")
         elif self is ComplexDType.float8_e3m4:
             return writer.write("float8_e3m4")
         elif self is ComplexDType.float8_e4m3fn:
@@ -315,18 +317,16 @@ struct ComplexDType(
             return writer.write("float8_e5m2")
         elif self is ComplexDType.float8_e5m2fnuz:
             return writer.write("float8_e5m2fnuz")
-
         elif self is ComplexDType.bfloat16:
             return writer.write("bfloat16")
         elif self is ComplexDType.float16:
             return writer.write("float16")
-
         elif self is ComplexDType.float32:
             return writer.write("float32")
-
         elif self is ComplexDType.float64:
             return writer.write("float64")
-
+        elif self is ComplexDType.bool:
+            return writer.write("bool")
         elif self is ComplexDType.invalid:
             return writer.write("invalid")
 
@@ -476,7 +476,7 @@ struct ComplexDType(
         Returns:
             Returns True if the input type parameter is an integer.
         """
-        return self is ComplexDType.index or self._is_non_index_integral()
+        return self in (DType.int, DType.uint) or self._is_non_index_integral()
 
     @always_inline("nodebug")
     fn is_floating_point(self) -> Bool:
@@ -529,10 +529,10 @@ struct ComplexDType(
 
     @always_inline
     fn size_of(self) -> Int:
-        """Returns the size in bytes of the current ComplexDType.
+        """Returns the size in bytes of the current DType.
 
         Returns:
-            Returns the size in bytes of the current ComplexDType.
+            Returns the size in bytes of the current DType.
         """
 
         if self._is_non_index_integral():
@@ -556,8 +556,10 @@ struct ComplexDType(
 
         elif self is ComplexDType.bool:
             return size_of[DType.bool]()
-        elif self is ComplexDType.index:
-            return size_of[DType.index]()
+        elif self is ComplexDType.int:
+            return size_of[DType.int]()
+        elif self is ComplexDType.uint:
+            return size_of[DType.uint]()
 
         elif self is ComplexDType.float8_e3m4:
             return size_of[DType.float8_e3m4]()
@@ -599,7 +601,7 @@ struct ComplexDType(
     # ===-------------------------------------------------------------------===#
     @always_inline("nodebug")
     fn __mlir_type(self) -> __mlir_type.`!kgen.deferred`:
-        """Returns the MLIR type of the current ComplexDType as an MLIR type.
+        """Returns the MLIR type of the current DType as an MLIR type.
 
         Returns:
             The MLIR type of the current ComplexDType.
@@ -607,7 +609,7 @@ struct ComplexDType(
         if self is ComplexDType.bool:
             return __mlir_attr.i1
 
-        if self is ComplexDType.index:
+        if self is ComplexDType.int:
             return __mlir_attr.index
 
         if self is ComplexDType.uint8:
@@ -657,97 +659,21 @@ struct ComplexDType(
         if self is ComplexDType.float64:
             return __mlir_attr.f64
 
-        return abort[__mlir_type.`!kgen.deferred`]("invalid ComplexDType")
-
-    @parameter
-    fn compare_dtype(self, dtype: DType) -> Bool:
-        if self.to_dtype() == dtype:
-            return True
-        return False
-
-    @parameter
-    fn to_dtype(self) -> DType:
-        # Floating point types
-        if self == ComplexDType.float16:
-            return DType.float16
-        elif self == ComplexDType.float32:
-            return DType.float32
-        elif self == ComplexDType.float64:
-            return DType.float64
-        elif self == ComplexDType.bfloat16:
-            return DType.bfloat16
-
-        # Float8 types
-        elif self == ComplexDType.float8_e3m4:
-            return DType.float8_e3m4
-        elif self == ComplexDType.float8_e4m3fn:
-            return DType.float8_e4m3fn
-        elif self == ComplexDType.float8_e4m3fnuz:
-            return DType.float8_e4m3fnuz
-        elif self == ComplexDType.float8_e5m2:
-            return DType.float8_e5m2
-        elif self == ComplexDType.float8_e5m2fnuz:
-            return DType.float8_e5m2fnuz
-
-        # Signed integer types
-        elif self == ComplexDType.int8:
-            return DType.int8
-        elif self == ComplexDType.int16:
-            return DType.int16
-        elif self == ComplexDType.int32:
-            return DType.int32
-        elif self == ComplexDType.int64:
-            return DType.int64
-        elif self == ComplexDType.int128:
-            return DType.int128
-        elif self == ComplexDType.int256:
-            return DType.int256
-
-        # Unsigned integer types
-        # elif self == ComplexDType.uint1:
-        #     return DType.uint1
-        # elif self == ComplexDType.uint2:
-        #     return DType.uint2
-        # elif self == ComplexDType.uint4:
-        # return DType.uint4
-        elif self == ComplexDType.uint8:
-            return DType.uint8
-        elif self == ComplexDType.uint16:
-            return DType.uint16
-        elif self == ComplexDType.uint32:
-            return DType.uint32
-        elif self == ComplexDType.uint64:
-            return DType.uint64
-        elif self == ComplexDType.uint128:
-            return DType.uint128
-        elif self == ComplexDType.uint256:
-            return DType.uint256
-
-        # Special types
-        elif self == ComplexDType.bool:
-            return DType.bool
-        elif self == ComplexDType.index:
-            return DType.index
-        elif self == ComplexDType.invalid:
-            return DType.invalid
-
-        # Default case
-        else:
-            return DType.invalid
+        return abort[__mlir_type.`!kgen.deferred`]("invalid dtype")
 
 
 fn _concise_dtype_str(cdtype: ComplexDType) -> String:
     """Returns a concise string representation of the complex data type."""
     if cdtype == ci8:
         return "ci8"
-    elif cdtype == ci16:
-        return "ci16"
-    elif cdtype == ci32:
-        return "ci32"
     elif cdtype == ci64:
         return "ci64"
-    elif cdtype == cisize:
-        return "cindex"
+    elif cdtype == ci128:
+        return "ci128"
+    elif cdtype == ci256:
+        return "ci256"
+    elif cdtype == cint:
+        return "cint"
     elif cdtype == cu8:
         return "cu8"
     elif cdtype == cu16:
@@ -756,6 +682,14 @@ fn _concise_dtype_str(cdtype: ComplexDType) -> String:
         return "cu32"
     elif cdtype == cu64:
         return "cu64"
+    elif cdtype == cu128:
+        return "cu128"
+    elif cdtype == cu256:
+        return "cu256"
+    elif cdtype == cuint:
+        return "cuint"
+    elif cdtype == cbf16:
+        return "cbf16"
     elif cdtype == cf16:
         return "cf16"
     elif cdtype == cf32:
@@ -764,7 +698,7 @@ fn _concise_dtype_str(cdtype: ComplexDType) -> String:
         return "cf64"
     elif cdtype == cboolean:
         return "cboolean"
-    elif cdtype == cisize:
-        return "cisize"
+    elif cdtype == cinvalid:
+        return "cinvalid"
     else:
         return "Unknown"
diff --git a/numojo/core/complex/complex_ndarray.mojo b/numojo/core/complex/complex_ndarray.mojo
index b155674b..7bf323e0 100644
--- a/numojo/core/complex/complex_ndarray.mojo
+++ b/numojo/core/complex/complex_ndarray.mojo
@@ -39,7 +39,7 @@ import builtin.bool as builtin_bool
 import builtin.math as builtin_math
 from builtin.type_aliases import Origin
 from collections.optional import Optional
-from math import log10
+from math import log10, sqrt
 from memory import UnsafePointer, memset_zero, memcpy
 from python import PythonObject
 from sys import simd_width_of
@@ -91,6 +91,9 @@ import numojo.routines.math._array_funcs as _af
 from numojo.routines.math._math_funcs import Vectorized
 import numojo.routines.math.arithmetic as arithmetic
 import numojo.routines.math.rounding as rounding
+import numojo.routines.math.trig as trig
+import numojo.routines.math.exponents as exponents
+import numojo.routines.math.misc as misc
 import numojo.routines.searching as searching
 
 
@@ -201,7 +204,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         self.strides = self._re.strides
         self.flags = self._re.flags
         self.print_options = PrintOptions(
-            precision=2, edge_items=2, line_width=80, formatted_width=6
+            precision=2, edge_items=2, line_width=100, formatted_width=6
         )
 
     @always_inline("nodebug")
@@ -225,7 +228,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         self.strides = self._re.strides
         self.flags = self._re.flags
         self.print_options = PrintOptions(
-            precision=2, edge_items=2, line_width=80, formatted_width=6
+            precision=2, edge_items=2, line_width=100, formatted_width=6
         )
 
     @always_inline("nodebug")
@@ -249,7 +252,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         self.strides = self._re.strides
         self.flags = self._re.flags
         self.print_options = PrintOptions(
-            precision=2, edge_items=2, line_width=80, formatted_width=6
+            precision=2, edge_items=2, line_width=100, formatted_width=6
         )
 
     fn __init__(
@@ -269,7 +272,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         self.strides = self._re.strides
         self.flags = self._re.flags
         self.print_options = PrintOptions(
-            precision=2, edge_items=2, line_width=80, formatted_width=6
+            precision=2, edge_items=2, line_width=100, formatted_width=6
         )
 
     fn __init__(
@@ -301,7 +304,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         self._re = NDArray[Self.dtype](shape, strides, ndim, size, flags)
         self._im = NDArray[Self.dtype](shape, strides, ndim, size, flags)
         self.print_options = PrintOptions(
-            precision=2, edge_items=2, line_width=80, formatted_width=6
+            precision=2, edge_items=2, line_width=100, formatted_width=6
         )
 
     fn __init__(
@@ -331,7 +334,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         self.strides = self._re.strides
         self.flags = self._re.flags
         self.print_options = PrintOptions(
-            precision=2, edge_items=2, line_width=80, formatted_width=6
+            precision=2, edge_items=2, line_width=100, formatted_width=6
         )
 
     @always_inline("nodebug")
@@ -427,7 +430,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         """
         var index_of_buffer: Int = 0
         for i in range(self.ndim):
-            index_of_buffer += indices[i] * self.strides._buf[i]
+            index_of_buffer += indices[i] * Int(self.strides._buf[i])
         return ComplexSIMD[cdtype](
             re=self._re._buf.ptr.load[width=1](index_of_buffer),
             im=self._im._buf.ptr.load[width=1](index_of_buffer),
@@ -457,7 +460,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         """
         var index_of_buffer: Int = 0
         for i in range(self.ndim):
-            index_of_buffer += indices[i] * self.strides._buf[i]
+            index_of_buffer += indices[i] * Int(self.strides._buf[i])
         return ComplexSIMD[cdtype](
             re=self._re._buf.ptr.load[width=1](index_of_buffer),
             im=self._im._buf.ptr.load[width=1](index_of_buffer),
@@ -586,11 +589,12 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         Examples:
             ```mojo
             import numojo as nm
-            var a = nm.arange[nm.cf32](nm.CScalar[nm.f32](0, 0), nm.CScalar[nm.f32](12, 12), nm.CScalar[nm.f32](1, 1)).reshape(nm.Shape(3, 4))
+            from numojo.prelude import *
+            var a = nm.arange[cf32](CScalar[cf32](0, 0), CScalar[cf32](12, 12), CScalar[cf32](1, 1)).reshape(nm.Shape(3, 4))
             print(a.shape)        # (3,4)
             print(a[1].shape)     # (4,)  -- 1-D slice
             print(a[-1].shape)    # (4,)  -- negative index
-            var b = nm.arange[nm.cf32](nm.CScalar[nm.f32](6, 6)).reshape(nm.Shape(6))
+            var b = nm.arange[cf32](CScalar[cf32](6, 6)).reshape(nm.Shape(6))
             print(b[2])           # 0-D array (scalar wrapper)
             ```
         """
@@ -642,13 +646,21 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         # Fast path for C-contiguous
         if self.flags.C_CONTIGUOUS:
             var block = self.size // self.shape[0]
-            memcpy(result._re._buf.ptr, self._re._buf.ptr + norm * block, block)
-            memcpy(result._im._buf.ptr, self._im._buf.ptr + norm * block, block)
+            memcpy(
+                dest=result._re._buf.ptr,
+                src=self._re._buf.ptr + norm * block,
+                count=block,
+            )
+            memcpy(
+                dest=result._im._buf.ptr,
+                src=self._im._buf.ptr + norm * block,
+                count=block,
+            )
             return result^
 
         # F layout
-        self._re._copy_first_axis_slice[Self.dtype](self._re, norm, result._re)
-        self._im._copy_first_axis_slice[Self.dtype](self._im, norm, result._im)
+        self._re._copy_first_axis_slice(self._re, norm, result._re)
+        self._im._copy_first_axis_slice(self._im, norm, result._im)
         return result^
 
     fn __getitem__(self, var *slices: Slice) raises -> Self:
@@ -751,7 +763,8 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         Examples:
             ```mojo
             import numojo as nm
-            var a = nm.arange[nm.cf32](nm.ComplexScalar(10.0, 10.0)).reshape(nm.Shape(2, 5))
+            from numojo.prelude import *
+            var a = nm.arange[cf32](CScalar[cf32](10.0, 10.0)).reshape(nm.Shape(2, 5))
             var b = a[List[Slice](Slice(0, 2, 1), Slice(2, 4, 1))]  # Equivalent to arr[:, 2:4], returns a 2x2 sliced array.
             print(b)
             ```
@@ -855,8 +868,9 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
 
         ```mojo
             import numojo as nm
-            var a = nm.fullC[nm.f32](nm.Shape(2, 5), ComplexSIMD[nm.f32](1.0, 1.0))
-            var b = a[1, 2:4]
+            from numojo.prelude import *
+            var a = nm.full[cf32](nm.Shape(2, 5), CScalar[cf32](1.0, 1.0))
+            var b = a[1, Slice(2,4)]
             print(b)
         ```
         """
@@ -921,7 +935,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         narr = self.__getitem__(slice_list^)
         return narr^
 
-    fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self:
+    fn __getitem__(self, indices: NDArray[DType.int]) raises -> Self:
         """
         Get items from 0-th dimension of a ComplexNDArray of indices.
         If the original array is of shape (i,j,k) and
@@ -963,14 +977,14 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
                     )
                 )
             memcpy(
-                result._re._buf.ptr + i * size_per_item,
-                self._re._buf.ptr + indices.item(i) * size_per_item,
-                size_per_item,
+                dest=result._re._buf.ptr + i * size_per_item,
+                src=self._re._buf.ptr + indices.item(i) * size_per_item,
+                count=size_per_item,
             )
             memcpy(
-                result._im._buf.ptr + i * size_per_item,
-                self._im._buf.ptr + indices.item(i) * size_per_item,
-                size_per_item,
+                dest=result._im._buf.ptr + i * size_per_item,
+                src=self._im._buf.ptr + indices.item(i) * size_per_item,
+                count=size_per_item,
             )
 
         return result^
@@ -992,7 +1006,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
 
         """
 
-        var indices_array = NDArray[DType.index](shape=Shape(len(indices)))
+        var indices_array = NDArray[DType.int](shape=Shape(len(indices)))
         for i in range(len(indices)):
             (indices_array._buf.ptr + i).init_pointee_copy(indices[i])
 
@@ -1102,14 +1116,14 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         for i in range(mask.size):
             if mask.item(i):
                 memcpy(
-                    result._re._buf.ptr + offset * size_per_item,
-                    self._re._buf.ptr + i * size_per_item,
-                    size_per_item,
+                    dest=result._re._buf.ptr + offset * size_per_item,
+                    src=self._re._buf.ptr + i * size_per_item,
+                    count=size_per_item,
                 )
                 memcpy(
-                    result._im._buf.ptr + offset * size_per_item,
-                    self._im._buf.ptr + i * size_per_item,
-                    size_per_item,
+                    dest=result._im._buf.ptr + offset * size_per_item,
+                    src=self._im._buf.ptr + i * size_per_item,
+                    count=size_per_item,
                 )
                 offset += 1
 
@@ -1528,7 +1542,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         """
         var index_of_buffer: Int = 0
         for i in range(self.ndim):
-            index_of_buffer += indices[i] * self.strides._buf[i]
+            index_of_buffer += indices[i] * Int(self.strides._buf[i])
         self._re._buf.ptr[index_of_buffer] = val.re
         self._im._buf.ptr[index_of_buffer] = val.im
 
@@ -1651,13 +1665,21 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
                         ),
                     )
                 )
-            memcpy(self._re._buf.ptr + norm * block, val._re._buf.ptr, block)
-            memcpy(self._im._buf.ptr + norm * block, val._im._buf.ptr, block)
+            memcpy(
+                dest=self._re._buf.ptr + norm * block,
+                src=val._re._buf.ptr,
+                count=block,
+            )
+            memcpy(
+                dest=self._im._buf.ptr + norm * block,
+                src=val._im._buf.ptr,
+                count=block,
+            )
             return
 
         # F order
-        self._re._write_first_axis_slice[Self.dtype](self._re, norm, val._re)
-        self._im._write_first_axis_slice[Self.dtype](self._im, norm, val._im)
+        self._re._write_first_axis_slice(self._re, norm, val._re)
+        self._im._write_first_axis_slice(self._im, norm, val._im)
 
     fn __setitem__(mut self, index: Item, val: ComplexSIMD[cdtype]) raises:
         """
@@ -1848,7 +1870,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
 
     #     self.__setitem__(slices=slice_list, val=val)
 
-    fn __setitem__(self, index: NDArray[DType.index], val: Self) raises:
+    fn __setitem__(self, index: NDArray[DType.int], val: Self) raises:
         """
         Returns the items of the ComplexNDArray from an array of indices.
 
@@ -1909,6 +1931,266 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             )
         return self * ComplexSIMD[cdtype](-1.0, -1.0)
 
+    fn __bool__(self) raises -> Bool:
+        """
+        Check if the complex array is non-zero.
+
+        For a 0-D or length-1 complex array, returns True if the complex number
+        is non-zero (i.e., either real or imaginary part is non-zero).
+
+        Returns:
+            True if the complex number is non-zero, False otherwise.
+
+        Raises:
+            Error: If the array is not 0-D or length-1.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape())  # 0-D array
+        A._re._buf.ptr[] = 1.0
+        A._im._buf.ptr[] = 0.0
+        var result = A.__bool__()  # True
+        ```
+        """
+        if (self.size == 1) or (self.ndim == 0):
+            var re_val = self._re._buf.ptr[]
+            var im_val = self._im._buf.ptr[]
+            return Bool((re_val != 0.0) or (im_val != 0.0))
+        else:
+            raise Error(
+                "\nError in `ComplexNDArray.__bool__(self)`: "
+                "Only 0-D arrays (numojo scalar) or length-1 arrays "
+                "can be converted to Bool. "
+                "The truth value of an array with more than one element is "
+                "ambiguous. Use a.any() or a.all()."
+            )
+
+    fn __int__(self) raises -> Int:
+        """
+        Gets `Int` representation of the complex array's real part.
+
+        Only 0-D arrays or length-1 arrays can be converted to scalars.
+        The imaginary part is discarded.
+
+        Returns:
+            Int representation of the real part of the array.
+
+        Raises:
+            Error: If the array is not 0-D or length-1.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape())  # 0-D array
+        A._re._buf.ptr[] = 42.7
+        A._im._buf.ptr[] = 3.14
+        print(A.__int__())  # 42 (only real part)
+        ```
+        """
+        if (self.size == 1) or (self.ndim == 0):
+            return Int(self._re._buf.ptr[])
+        else:
+            raise Error(
+                "\nError in `ComplexNDArray.__int__(self)`: "
+                "Only 0-D arrays (numojo scalar) or length-1 arrays "
+                "can be converted to scalars."
+            )
+
+    fn __float__(self) raises -> Float64:
+        """
+        Gets `Float64` representation of the complex array's magnitude.
+
+        Only 0-D arrays or length-1 arrays can be converted to scalars.
+        Returns the magnitude (absolute value) of the complex number.
+
+        Returns:
+            Float64 representation of the magnitude of the complex number.
+
+        Raises:
+            Error: If the array is not 0-D or length-1.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape())  # 0-D array
+        A._re._buf.ptr[] = 3.0
+        A._im._buf.ptr[] = 4.0
+        print(A.__float__())  # 5.0 (magnitude)
+        ```
+        """
+        if (self.size == 1) or (self.ndim == 0):
+            var re_val = self._re._buf.ptr[]
+            var im_val = self._im._buf.ptr[]
+            var magnitude_sq = Float64(re_val * re_val + im_val * im_val)
+            return sqrt(magnitude_sq)
+        else:
+            raise Error(
+                "\nError in `ComplexNDArray.__float__(self)`: "
+                "Only 0-D arrays (numojo scalar) or length-1 arrays "
+                "can be converted to scalars."
+            )
+
+    fn __abs__(self) raises -> NDArray[Self.dtype]:
+        """
+        Compute the magnitude (absolute value) of each complex element.
+
+        Returns an NDArray of real values containing the magnitude of each
+        complex number: sqrt(re^2 + im^2).
+
+        Returns:
+            NDArray containing the magnitude of each complex element.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        # Fill with some values
+        var mag = A.__abs__()  # Returns NDArray[f64] with magnitudes
+        ```
+        """
+        var re_sq = self._re * self._re
+        var im_sq = self._im * self._im
+        var sum_sq = re_sq + im_sq
+        return misc.sqrt[Self.dtype](sum_sq)
+
+    fn __pow__(self, p: Int) raises -> Self:
+        """
+        Raise complex array to integer power element-wise.
+
+        Uses De Moivre's formula for complex exponentiation:
+        (r * e^(i*theta))^n = r^n * e^(i*n*theta)
+
+        Args:
+            p: Integer exponent.
+
+        Returns:
+            ComplexNDArray with each element raised to power p.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var B = A ** 3  # Cube each element
+        ```
+        """
+        if p == 0:
+            var ones_re = creation.ones[Self.dtype](self.shape)
+            var zeros_im = creation.zeros[Self.dtype](self.shape)
+            return Self(ones_re^, zeros_im^)
+        elif p == 1:
+            return self.copy()
+        elif p < 0:
+            var pos_pow = self.__pow__(-p)
+            var denominator = (
+                pos_pow._re * pos_pow._re + pos_pow._im * pos_pow._im
+            )
+            var result_re = pos_pow._re / denominator
+            var result_im = -pos_pow._im / denominator
+            return Self(result_re^, result_im^)
+        else:
+            var result = self.copy()
+            for _ in range(p - 1):
+                var temp = result * self
+                result = temp^
+            return result^
+
+    fn __pow__(self, rhs: Scalar[Self.dtype]) raises -> Self:
+        """
+        Raise complex array to real scalar power element-wise.
+
+        Args:
+            rhs: Real scalar exponent.
+
+        Returns:
+            ComplexNDArray with each element raised to power rhs.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var B = A ** 2.5  # Raise to power 2.5
+        ```
+        """
+        var r = misc.sqrt[Self.dtype](self._re * self._re + self._im * self._im)
+        var theta = trig.atan2[Self.dtype](self._im, self._re)
+
+        var r_pow = r.__pow__(rhs)
+        var theta_p = theta * rhs
+
+        var result_re = r_pow * trig.cos[Self.dtype](theta_p)
+        var result_im = r_pow * trig.sin[Self.dtype](theta_p)
+
+        return Self(result_re^, result_im^)
+
+    fn __pow__(self, p: Self) raises -> Self:
+        """
+        Raise complex array to complex array power element-wise.
+
+        Args:
+            p: ComplexNDArray exponent.
+
+        Returns:
+            ComplexNDArray with each element raised to corresponding power.
+
+        Raises:
+            Error: If arrays have different sizes.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var B = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var C = A ** B  # Element-wise complex power
+        ```
+        """
+        if self.size != p.size:
+            raise Error(
+                String(
+                    "\nError in `ComplexNDArray.__pow__(self, p)`: "
+                    "Both arrays must have same number of elements! "
+                    "Self array has {} elements. "
+                    "Other array has {} elements"
+                ).format(self.size, p.size)
+            )
+
+        var mag = misc.sqrt[Self.dtype](
+            self._re * self._re + self._im * self._im
+        )
+        var arg = trig.atan2[Self.dtype](self._im, self._re)
+
+        var log_re = exponents.log[Self.dtype](mag)
+        var log_im = arg^
+
+        var exponent_re_temp1 = p._re * log_re
+        var exponent_re_temp2 = p._im * log_im
+        var exponent_re = exponent_re_temp1 - exponent_re_temp2
+        var exponent_im_temp1 = p._re * log_im
+        var exponent_im_temp2 = p._im * log_re
+        var exponent_im = exponent_im_temp1 + exponent_im_temp2
+
+        var exp_re = exponents.exp[Self.dtype](exponent_re)
+        var result_re = exp_re * trig.cos[Self.dtype](exponent_im)
+        var result_im = exp_re * trig.sin[Self.dtype](exponent_im)
+
+        return Self(result_re^, result_im^)
+
+    fn __ipow__(mut self, p: Int) raises:
+        """
+        In-place raise to integer power.
+
+        Args:
+            p: Integer exponent.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        A **= 3  # Cube in place
+        ```
+        """
+        self = self.__pow__(p)
+
     @always_inline("nodebug")
     fn __eq__(self, other: Self) raises -> NDArray[DType.bool]:
         """
@@ -1945,84 +2227,313 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             self._re, other.re
         ) and comparison.not_equal[Self.dtype](self._im, other.im)
 
-    # ===------------------------------------------------------------------=== #
-    # ARITHMETIC OPERATIONS
-    # ===------------------------------------------------------------------=== #
-
-    fn __add__(self, other: ComplexSIMD[cdtype]) raises -> Self:
+    @always_inline("nodebug")
+    fn __lt__(self, other: Self) raises -> NDArray[DType.bool]:
         """
-        Enables `ComplexNDArray + ComplexSIMD`.
+        Itemwise less than comparison by magnitude.
+
+        For complex numbers, compares the magnitudes: |self| < |other|.
+        This provides a natural ordering for complex numbers.
+
+        Args:
+            other: The other ComplexNDArray to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| < |other|.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var B = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var result = A < B  # Compare by magnitude
+        ```
+
+        Notes:
+            Complex number ordering is not naturally defined. This implementation
+            compares by magnitude (absolute value) to provide a consistent ordering.
         """
-        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other.re)
-        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other.im)
-        return Self(real^, imag^)
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other._re * other._re + other._im * other._im
+        return comparison.less[Self.dtype](self_mag, other_mag)
 
-    fn __add__(self, other: Scalar[Self.dtype]) raises -> Self:
+    @always_inline("nodebug")
+    fn __lt__(self, other: ComplexSIMD[cdtype]) raises -> NDArray[DType.bool]:
         """
-        Enables `ComplexNDArray + Scalar`.
+        Itemwise less than comparison with scalar by magnitude.
+
+        Args:
+            other: The ComplexSIMD scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| < |other|.
         """
-        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
-        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
-        return Self(real^, imag^)
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other.re * other.re + other.im * other.im
+        return comparison.less[Self.dtype](self_mag, other_mag)
 
-    fn __add__(self, other: Self) raises -> Self:
+    @always_inline("nodebug")
+    fn __lt__(self, other: Scalar[Self.dtype]) raises -> NDArray[DType.bool]:
         """
-        Enables `ComplexNDArray + ComplexNDArray`.
+        Itemwise less than comparison with real scalar by magnitude.
+
+        Args:
+            other: The real scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| < |other|.
         """
-        print("add complex arrays")
-        var real: NDArray[Self.dtype] = math.add[Self.dtype](
-            self._re, other._re
-        )
-        var imag: NDArray[Self.dtype] = math.add[Self.dtype](
-            self._im, other._im
-        )
-        return Self(real^, imag^)
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other * other
+        return comparison.less[Self.dtype](self_mag, other_mag)
 
-    fn __add__(self, other: NDArray[Self.dtype]) raises -> Self:
+    @always_inline("nodebug")
+    fn __le__(self, other: Self) raises -> NDArray[DType.bool]:
         """
-        Enables `ComplexNDArray + NDArray`.
+        Itemwise less than or equal comparison by magnitude.
+
+        For complex numbers, compares the magnitudes: |self| <= |other|.
+
+        Args:
+            other: The other ComplexNDArray to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| <= |other|.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var B = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var result = A <= B  # Compare by magnitude
+        ```
         """
-        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
-        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
-        return Self(real^, imag^)
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other._re * other._re + other._im * other._im
+        return comparison.less_equal[Self.dtype](self_mag, other_mag)
 
-    fn __radd__(mut self, other: ComplexSIMD[cdtype]) raises -> Self:
+    @always_inline("nodebug")
+    fn __le__(self, other: ComplexSIMD[cdtype]) raises -> NDArray[DType.bool]:
         """
-        Enables `ComplexSIMD + ComplexNDArray`.
+        Itemwise less than or equal comparison with scalar by magnitude.
+
+        Args:
+            other: The ComplexSIMD scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| <= |other|.
         """
-        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other.re)
-        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other.im)
-        return Self(real^, imag^)
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other.re * other.re + other.im * other.im
+        return comparison.less_equal[Self.dtype](self_mag, other_mag)
 
-    fn __radd__(mut self, other: Scalar[Self.dtype]) raises -> Self:
+    @always_inline("nodebug")
+    fn __le__(self, other: Scalar[Self.dtype]) raises -> NDArray[DType.bool]:
         """
-        Enables `Scalar + ComplexNDArray`.
+        Itemwise less than or equal comparison with real scalar by magnitude.
+
+        Args:
+            other: The real scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| <= |other|.
         """
-        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
-        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
-        return Self(real^, imag^)
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other * other
+        return comparison.less_equal[Self.dtype](self_mag, other_mag)
 
-    fn __radd__(mut self, other: NDArray[Self.dtype]) raises -> Self:
+    @always_inline("nodebug")
+    fn __gt__(self, other: Self) raises -> NDArray[DType.bool]:
         """
-        Enables `NDArray + ComplexNDArray`.
+        Itemwise greater than comparison by magnitude.
+
+        For complex numbers, compares the magnitudes: |self| > |other|.
+
+        Args:
+            other: The other ComplexNDArray to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| > |other|.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var B = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var result = A > B  # Compare by magnitude
+        ```
+
+        Notes:
+            Complex number ordering is not naturally defined. This implementation
+            compares by magnitude (absolute value) to provide a consistent ordering.
         """
-        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
-        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
-        return Self(real^, imag^)
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other._re * other._re + other._im * other._im
+        return comparison.greater[Self.dtype](self_mag, other_mag)
 
-    fn __iadd__(mut self, other: ComplexSIMD[cdtype]) raises:
+    @always_inline("nodebug")
+    fn __gt__(self, other: ComplexSIMD[cdtype]) raises -> NDArray[DType.bool]:
         """
-        Enables `ComplexNDArray += ComplexSIMD`.
+        Itemwise greater than comparison with scalar by magnitude.
+
+        Args:
+            other: The ComplexSIMD scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| > |other|.
         """
-        self._re += other.re
-        self._im += other.im
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other.re * other.re + other.im * other.im
+        return comparison.greater[Self.dtype](self_mag, other_mag)
 
-    fn __iadd__(mut self, other: Scalar[Self.dtype]) raises:
+    @always_inline("nodebug")
+    fn __gt__(self, other: Scalar[Self.dtype]) raises -> NDArray[DType.bool]:
         """
-        Enables `ComplexNDArray += Scalar`.
+        Itemwise greater than comparison with real scalar by magnitude.
+
+        Args:
+            other: The real scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| > |other|.
         """
-        self._re += other
-        self._im += other
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other * other
+        return comparison.greater[Self.dtype](self_mag, other_mag)
+
+    @always_inline("nodebug")
+    fn __ge__(self, other: Self) raises -> NDArray[DType.bool]:
+        """
+        Itemwise greater than or equal comparison by magnitude.
+
+        For complex numbers, compares the magnitudes: |self| >= |other|.
+
+        Args:
+            other: The other ComplexNDArray to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| >= |other|.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var B = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 2))
+        var result = A >= B  # Compare by magnitude
+        ```
+        """
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other._re * other._re + other._im * other._im
+        return comparison.greater_equal[Self.dtype](self_mag, other_mag)
+
+    @always_inline("nodebug")
+    fn __ge__(self, other: ComplexSIMD[cdtype]) raises -> NDArray[DType.bool]:
+        """
+        Itemwise greater than or equal comparison with scalar by magnitude.
+
+        Args:
+            other: The ComplexSIMD scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| >= |other|.
+        """
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other.re * other.re + other.im * other.im
+        return comparison.greater_equal[Self.dtype](self_mag, other_mag)
+
+    @always_inline("nodebug")
+    fn __ge__(self, other: Scalar[Self.dtype]) raises -> NDArray[DType.bool]:
+        """
+        Itemwise greater than or equal comparison with real scalar by magnitude.
+
+        Args:
+            other: The real scalar to compare with.
+
+        Returns:
+            An array of boolean values indicating where |self| >= |other|.
+        """
+        var self_mag = self._re * self._re + self._im * self._im
+        var other_mag = other * other
+        return comparison.greater_equal[Self.dtype](self_mag, other_mag)
+
+    # ===------------------------------------------------------------------=== #
+    # ARITHMETIC OPERATIONS
+    # ===------------------------------------------------------------------=== #
+
+    fn __add__(self, other: ComplexSIMD[cdtype]) raises -> Self:
+        """
+        Enables `ComplexNDArray + ComplexSIMD`.
+        """
+        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other.re)
+        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other.im)
+        return Self(real^, imag^)
+
+    fn __add__(self, other: Scalar[Self.dtype]) raises -> Self:
+        """
+        Enables `ComplexNDArray + Scalar`.
+        """
+        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
+        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
+        return Self(real^, imag^)
+
+    fn __add__(self, other: Self) raises -> Self:
+        """
+        Enables `ComplexNDArray + ComplexNDArray`.
+        """
+        print("add complex arrays")
+        var real: NDArray[Self.dtype] = math.add[Self.dtype](
+            self._re, other._re
+        )
+        var imag: NDArray[Self.dtype] = math.add[Self.dtype](
+            self._im, other._im
+        )
+        return Self(real^, imag^)
+
+    fn __add__(self, other: NDArray[Self.dtype]) raises -> Self:
+        """
+        Enables `ComplexNDArray + NDArray`.
+        """
+        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
+        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
+        return Self(real^, imag^)
+
+    fn __radd__(mut self, other: ComplexSIMD[cdtype]) raises -> Self:
+        """
+        Enables `ComplexSIMD + ComplexNDArray`.
+        """
+        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other.re)
+        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other.im)
+        return Self(real^, imag^)
+
+    fn __radd__(mut self, other: Scalar[Self.dtype]) raises -> Self:
+        """
+        Enables `Scalar + ComplexNDArray`.
+        """
+        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
+        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
+        return Self(real^, imag^)
+
+    fn __radd__(mut self, other: NDArray[Self.dtype]) raises -> Self:
+        """
+        Enables `NDArray + ComplexNDArray`.
+        """
+        var real: NDArray[Self.dtype] = math.add[Self.dtype](self._re, other)
+        var imag: NDArray[Self.dtype] = math.add[Self.dtype](self._im, other)
+        return Self(real^, imag^)
+
+    fn __iadd__(mut self, other: ComplexSIMD[cdtype]) raises:
+        """
+        Enables `ComplexNDArray += ComplexSIMD`.
+        """
+        self._re += other.re
+        self._im += other.im
+
+    fn __iadd__(mut self, other: Scalar[Self.dtype]) raises:
+        """
+        Enables `ComplexNDArray += Scalar`.
+        """
+        self._re += other
+        self._im += other
 
     fn __iadd__(mut self, other: Self) raises:
         """
@@ -2451,7 +2962,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             offset: The offset of the current dimension.
             summarize: Internal flag indicating summarization already chosen.
         """
-        var options: PrintOptions = self._re.print_options
+        var options: PrintOptions = self.print_options
         var separator = options.separator
         var padding = options.padding
         var edge_items = options.edge_items
@@ -2646,7 +3157,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
 
     fn __iter__(
         self,
-    ) raises -> _ComplexNDArrayIter[__origin_of(self._re), cdtype]:
+    ) raises -> _ComplexNDArrayIter[origin_of(self._re), cdtype]:
         """
         Iterates over elements of the ComplexNDArray and return sub-arrays as view.
 
@@ -2654,16 +3165,14 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             An iterator of ComplexNDArray elements.
         """
 
-        return _ComplexNDArrayIter[__origin_of(self._re), cdtype](
+        return _ComplexNDArrayIter[origin_of(self._re), cdtype](
             self,
             dimension=0,
         )
 
     fn __reversed__(
         self,
-    ) raises -> _ComplexNDArrayIter[
-        __origin_of(self._re), cdtype, forward=False
-    ]:
+    ) raises -> _ComplexNDArrayIter[origin_of(self._re), cdtype, forward=False]:
         """
         Iterates backwards over elements of the ComplexNDArray, returning
         copied value.
@@ -2672,9 +3181,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             A reversed iterator of NDArray elements.
         """
 
-        return _ComplexNDArrayIter[
-            __origin_of(self._re), cdtype, forward=False
-        ](
+        return _ComplexNDArrayIter[origin_of(self._re), cdtype, forward=False](
             self,
             dimension=0,
         )
@@ -2777,13 +3284,13 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             var result: NDArray[dtype = Self.dtype] = NDArray[
                 dtype = Self.dtype
             ](self.shape)
-            memcpy(result._buf.ptr, self._re._buf.ptr, self.size)
+            memcpy(dest=result._buf.ptr, src=self._re._buf.ptr, count=self.size)
             return result^
         elif type == "im":
             var result: NDArray[dtype = Self.dtype] = NDArray[
                 dtype = Self.dtype
             ](self.shape)
-            memcpy(result._buf.ptr, self._im._buf.ptr, self.size)
+            memcpy(dest=result._buf.ptr, src=self._im._buf.ptr, count=self.size)
             return result^
         else:
             raise Error(
@@ -2800,6 +3307,741 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
                 )
             )
 
+    fn squeeze(mut self, axis: Int) raises:
+        """
+        Remove (squeeze) a single dimension of size 1 from the array shape.
+
+        Args:
+            axis: The axis to squeeze. Supports negative indices.
+
+        Raises:
+            IndexError: If the axis is out of range.
+            ShapeError: If the dimension at the given axis is not of size 1.
+        """
+        var normalized_axis: Int = axis
+        if normalized_axis < 0:
+            normalized_axis += self.ndim
+        if (normalized_axis < 0) or (normalized_axis >= self.ndim):
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Axis {} is out of range for array with {} dimensions."
+                    ).format(axis, self.ndim),
+                    suggestion=String(
+                        "Use an axis value in the range [-{}, {})."
+                    ).format(self.ndim, self.ndim),
+                    location=String("NDArray.squeeze(axis: Int)"),
+                )
+            )
+
+        if self.shape[normalized_axis] != 1:
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Cannot squeeze axis {} with size {}."
+                    ).format(normalized_axis, self.shape[normalized_axis]),
+                    suggestion=String(
+                        "Only axes with length 1 can be removed."
+                    ),
+                    location=String("NDArray.squeeze(axis: Int)"),
+                )
+            )
+        self.shape = self.shape._pop(normalized_axis)
+        self.strides = self.strides._pop(normalized_axis)
+        self.ndim -= 1
+
+    # ===-------------------------------------------------------------------===#
+    # Statistical and Reduction Methods
+    # ===-------------------------------------------------------------------===#
+
+    fn all(self) raises -> Bool:
+        """
+        Check if all complex elements are non-zero.
+
+        A complex number is considered "true" if either its real or imaginary
+        part is non-zero.
+
+        Returns:
+            True if all elements are non-zero, False otherwise.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        # Fill with non-zero values
+        var result = A.all()  # True if all non-zero
+        ```
+        """
+        var re_nonzero = True
+        var im_nonzero = True
+
+        for i in range(self.size):
+            var re_val = self._re._buf.ptr.load(i)
+            var im_val = self._im._buf.ptr.load(i)
+            if (re_val == 0.0) and (im_val == 0.0):
+                return False
+        return True
+
+    fn any(self) raises -> Bool:
+        """
+        Check if any complex element is non-zero.
+
+        A complex number is considered "true" if either its real or imaginary
+        part is non-zero.
+
+        Returns:
+            True if any element is non-zero, False otherwise.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        # Fill with some values
+        var result = A.any()  # True if any non-zero
+        ```
+        """
+        for i in range(self.size):
+            var re_val = self._re._buf.ptr.load(i)
+            var im_val = self._im._buf.ptr.load(i)
+            if (re_val != 0.0) or (im_val != 0.0):
+                return True
+        return False
+
+    fn sum(self) raises -> ComplexSIMD[cdtype]:
+        """
+        Sum of all complex array elements.
+
+        Returns:
+            Complex scalar containing the sum of all elements.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var total = A.sum()  # Sum of all elements
+        ```
+        """
+        var sum_re = Scalar[Self.dtype](0)
+        var sum_im = Scalar[Self.dtype](0)
+
+        for i in range(self.size):
+            sum_re += self._re._buf.ptr.load(i)
+            sum_im += self._im._buf.ptr.load(i)
+
+        return ComplexSIMD[cdtype](sum_re, sum_im)
+
+    fn prod(self) raises -> ComplexSIMD[cdtype]:
+        """
+        Product of all complex array elements.
+
+        Returns:
+            Complex scalar containing the product of all elements.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var product = A.prod()  # Product of all elements
+        ```
+        """
+        var prod_re = Scalar[Self.dtype](1)
+        var prod_im = Scalar[Self.dtype](0)
+
+        for i in range(self.size):
+            var a_re = self._re._buf.ptr.load(i)
+            var a_im = self._im._buf.ptr.load(i)
+            var new_re = prod_re * a_re - prod_im * a_im
+            var new_im = prod_re * a_im + prod_im * a_re
+            prod_re = new_re
+            prod_im = new_im
+
+        return ComplexSIMD[cdtype](prod_re, prod_im)
+
+    fn mean(self) raises -> ComplexSIMD[cdtype]:
+        """
+        Mean (average) of all complex array elements.
+
+        Returns:
+            Complex scalar containing the mean of all elements.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var average = A.mean()  # Mean of all elements
+        ```
+        """
+        var total = self.sum()
+        var n = Scalar[Self.dtype](self.size)
+        return ComplexSIMD[cdtype](total.re / n, total.im / n)
+
+    fn max(self) raises -> ComplexSIMD[cdtype]:
+        """
+        Find the complex element with maximum magnitude.
+
+        Returns:
+            The complex element with the largest magnitude.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var max_elem = A.max()  # Element with largest magnitude
+        ```
+
+        Notes:
+            Returns the element with maximum |z| = sqrt(re^2 + im^2).
+        """
+        if self.size == 0:
+            raise Error("Cannot find max of empty array")
+
+        var max_mag_sq = self._re._buf.ptr.load(0) * self._re._buf.ptr.load(
+            0
+        ) + self._im._buf.ptr.load(0) * self._im._buf.ptr.load(0)
+        var max_idx = 0
+
+        for i in range(1, self.size):
+            var re_val = self._re._buf.ptr.load(i)
+            var im_val = self._im._buf.ptr.load(i)
+            var mag_sq = re_val * re_val + im_val * im_val
+            if mag_sq > max_mag_sq:
+                max_mag_sq = mag_sq
+                max_idx = i
+
+        return ComplexSIMD[cdtype](
+            self._re._buf.ptr.load(max_idx), self._im._buf.ptr.load(max_idx)
+        )
+
+    fn min(self) raises -> ComplexSIMD[cdtype]:
+        """
+        Find the complex element with minimum magnitude.
+
+        Returns:
+            The complex element with the smallest magnitude.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var min_elem = A.min()  # Element with smallest magnitude
+        ```
+
+        Notes:
+            Returns the element with minimum |z| = sqrt(re^2 + im^2).
+        """
+        if self.size == 0:
+            raise Error("Cannot find min of empty array")
+
+        var min_mag_sq = self._re._buf.ptr.load(0) * self._re._buf.ptr.load(
+            0
+        ) + self._im._buf.ptr.load(0) * self._im._buf.ptr.load(0)
+        var min_idx = 0
+
+        for i in range(1, self.size):
+            var re_val = self._re._buf.ptr.load(i)
+            var im_val = self._im._buf.ptr.load(i)
+            var mag_sq = re_val * re_val + im_val * im_val
+            if mag_sq < min_mag_sq:
+                min_mag_sq = mag_sq
+                min_idx = i
+
+        return ComplexSIMD[cdtype](
+            self._re._buf.ptr.load(min_idx), self._im._buf.ptr.load(min_idx)
+        )
+
+    fn argmax(self) raises -> Int:
+        """
+        Return the index of the element with maximum magnitude.
+
+        Returns:
+            Index (flattened) of the element with largest magnitude.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var idx = A.argmax()  # Index of element with largest magnitude
+        ```
+
+        Notes:
+            Compares by magnitude: |z| = sqrt(re^2 + im^2).
+        """
+        if self.size == 0:
+            raise Error("Cannot find argmax of empty array")
+
+        var max_mag_sq = self._re._buf.ptr.load(0) * self._re._buf.ptr.load(
+            0
+        ) + self._im._buf.ptr.load(0) * self._im._buf.ptr.load(0)
+        var max_idx = 0
+
+        for i in range(1, self.size):
+            var re_val = self._re._buf.ptr.load(i)
+            var im_val = self._im._buf.ptr.load(i)
+            var mag_sq = re_val * re_val + im_val * im_val
+            if mag_sq > max_mag_sq:
+                max_mag_sq = mag_sq
+                max_idx = i
+
+        return max_idx
+
+    fn argmin(self) raises -> Int:
+        """
+        Return the index of the element with minimum magnitude.
+
+        Returns:
+            Index (flattened) of the element with smallest magnitude.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var idx = A.argmin()  # Index of element with smallest magnitude
+        ```
+
+        Notes:
+            Compares by magnitude: |z| = sqrt(re^2 + im^2).
+        """
+        if self.size == 0:
+            raise Error("Cannot find argmin of empty array")
+
+        var min_mag_sq = self._re._buf.ptr.load(0) * self._re._buf.ptr.load(
+            0
+        ) + self._im._buf.ptr.load(0) * self._im._buf.ptr.load(0)
+        var min_idx = 0
+
+        for i in range(1, self.size):
+            var re_val = self._re._buf.ptr.load(i)
+            var im_val = self._im._buf.ptr.load(i)
+            var mag_sq = re_val * re_val + im_val * im_val
+            if mag_sq < min_mag_sq:
+                min_mag_sq = mag_sq
+                min_idx = i
+
+        return min_idx
+
+    fn cumsum(self) raises -> Self:
+        """
+        Cumulative sum of complex array elements.
+
+        Returns:
+            ComplexNDArray with cumulative sums.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(5))
+        var cumulative = A.cumsum()
+        ```
+
+        Notes:
+            For array [a, b, c, d], returns [a, a+b, a+b+c, a+b+c+d].
+        """
+        var result = Self(self.shape)
+        var cum_re = Scalar[Self.dtype](0)
+        var cum_im = Scalar[Self.dtype](0)
+
+        for i in range(self.size):
+            cum_re += self._re._buf.ptr.load(i)
+            cum_im += self._im._buf.ptr.load(i)
+            result._re._buf.ptr.store(i, cum_re)
+            result._im._buf.ptr.store(i, cum_im)
+
+        return result^
+
+    fn cumprod(self) raises -> Self:
+        """
+        Cumulative product of complex array elements.
+
+        Returns:
+            ComplexNDArray with cumulative products.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(5))
+        var cumulative = A.cumprod()
+        ```
+
+        Notes:
+            For array [a, b, c, d], returns [a, a*b, a*b*c, a*b*c*d].
+        """
+        var result = Self(self.shape)
+        var cum_re = Scalar[Self.dtype](1)
+        var cum_im = Scalar[Self.dtype](0)
+
+        for i in range(self.size):
+            var a_re = self._re._buf.ptr.load(i)
+            var a_im = self._im._buf.ptr.load(i)
+            var new_re = cum_re * a_re - cum_im * a_im
+            var new_im = cum_re * a_im + cum_im * a_re
+            cum_re = new_re
+            cum_im = new_im
+            result._re._buf.ptr.store(i, cum_re)
+            result._im._buf.ptr.store(i, cum_im)
+
+        return result^
+
+    # ===-------------------------------------------------------------------===#
+    # Array Manipulation Methods
+    # ===-------------------------------------------------------------------===#
+
+    fn flatten(self, order: String = "C") raises -> Self:
+        """
+        Return a copy of the array collapsed into one dimension.
+
+        Args:
+            order: Order of flattening - 'C' for row-major or 'F' for column-major.
+
+        Returns:
+            A 1D ComplexNDArray containing all elements.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 4))
+        var flat = A.flatten()  # Shape(12)
+        ```
+        """
+        var flat_re = self._re.flatten(order)
+        var flat_im = self._im.flatten(order)
+        return Self(flat_re^, flat_im^)
+
+    fn fill(mut self, val: ComplexSIMD[cdtype]):
+        """
+        Fill all items of array with a complex value.
+
+        Args:
+            val: Complex value to fill the array with.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        A.fill(nm.ComplexSIMD[nm.cf64](1.0, 2.0))  # Fill with 1+2i
+        ```
+        """
+        self._re.fill(val.re)
+        self._im.fill(val.im)
+
+    fn row(self, id: Int) raises -> Self:
+        """
+        Get the ith row of the matrix.
+
+        Args:
+            id: The row index.
+
+        Returns:
+            The ith row as a ComplexNDArray.
+
+        Raises:
+            Error: If ndim is greater than 2.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 4))
+        var first_row = A.row(0)  # Get first row
+        ```
+        """
+        if self.ndim > 2:
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Cannot extract row from array with {} dimensions."
+                    ).format(self.ndim),
+                    suggestion=String(
+                        "The row() method only works with 1D or 2D arrays."
+                    ),
+                    location=String("ComplexNDArray.row(id: Int)"),
+                )
+            )
+
+        var width: Int = self.shape[1]
+        var result = Self(Shape(width))
+        for i in range(width):
+            var idx = i + id * width
+            result._re._buf.ptr.store(i, self._re._buf.ptr.load(idx))
+            result._im._buf.ptr.store(i, self._im._buf.ptr.load(idx))
+        return result^
+
+    fn col(self, id: Int) raises -> Self:
+        """
+        Get the ith column of the matrix.
+
+        Args:
+            id: The column index.
+
+        Returns:
+            The ith column as a ComplexNDArray.
+
+        Raises:
+            Error: If ndim is greater than 2.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 4))
+        var first_col = A.col(0)  # Get first column
+        ```
+        """
+        if self.ndim > 2:
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Cannot extract column from array with {} dimensions."
+                    ).format(self.ndim),
+                    suggestion=String(
+                        "The col() method only works with 1D or 2D arrays."
+                    ),
+                    location=String("ComplexNDArray.col(id: Int)"),
+                )
+            )
+
+        var width: Int = self.shape[1]
+        var height: Int = self.shape[0]
+        var result = Self(Shape(height))
+        for i in range(height):
+            var idx = id + i * width
+            result._re._buf.ptr.store(i, self._re._buf.ptr.load(idx))
+            result._im._buf.ptr.store(i, self._im._buf.ptr.load(idx))
+        return result^
+
+    fn clip(
+        self, a_min: Scalar[Self.dtype], a_max: Scalar[Self.dtype]
+    ) raises -> Self:
+        """
+        Limit the magnitudes of complex values between [a_min, a_max].
+
+        Elements with magnitude less than a_min are scaled to have magnitude a_min.
+        Elements with magnitude greater than a_max are scaled to have magnitude a_max.
+        The phase (angle) of each complex number is preserved.
+
+        Args:
+            a_min: The minimum magnitude.
+            a_max: The maximum magnitude.
+
+        Returns:
+            A ComplexNDArray with clipped magnitudes.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(10))
+        var clipped = A.clip(1.0, 5.0)  # Clip magnitudes to [1, 5]
+        ```
+
+        Notes:
+            Clips by magnitude while preserving phase angle.
+        """
+        var result = Self(self.shape)
+
+        for i in range(self.size):
+            var re = self._re._buf.ptr.load(i)
+            var im = self._im._buf.ptr.load(i)
+            var mag_sq = re * re + im * im
+            var mag_val = sqrt(mag_sq)
+
+            if mag_val < a_min:
+                if mag_val > 0:
+                    var scale = a_min / mag_val
+                    result._re._buf.ptr.store(i, re * scale)
+                    result._im._buf.ptr.store(i, im * scale)
+                else:
+                    result._re._buf.ptr.store(i, a_min)
+                    result._im._buf.ptr.store(i, 0.0)
+            elif mag_val > a_max:
+                var scale = a_max / mag_val
+                result._re._buf.ptr.store(i, re * scale)
+                result._im._buf.ptr.store(i, im * scale)
+            else:
+                result._re._buf.ptr.store(i, re)
+                result._im._buf.ptr.store(i, im)
+
+        return result^
+
+    fn round(self) raises -> Self:
+        """
+        Round the real and imaginary parts of each element to the nearest integer.
+
+        Returns:
+            A ComplexNDArray with rounded components.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(10))
+        # A contains e.g. 1.7+2.3i
+        var rounded = A.round()  # Returns 2.0+2.0i
+        ```
+        """
+        var rounded_re = rounding.tround[Self.dtype](self._re)
+        var rounded_im = rounding.tround[Self.dtype](self._im)
+        return Self(rounded_re^, rounded_im^)
+
+    fn T(self) raises -> Self:
+        """
+        Transpose the complex array (reverse all axes).
+
+        Returns:
+            Transposed ComplexNDArray.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 4))
+        var A_T = A.T()  # Shape(4, 3)
+        ```
+        """
+        var transposed_re = self._re.T()
+        var transposed_im = self._im.T()
+        return Self(transposed_re^, transposed_im^)
+
+    fn T(self, axes: List[Int]) raises -> Self:
+        """
+        Transpose the complex array according to the given axes permutation.
+
+        Args:
+            axes: Permutation of axes (e.g., [1, 0, 2]).
+
+        Returns:
+            Transposed ComplexNDArray.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 3, 4))
+        var A_T = A.T(List[Int](2, 0, 1))  # Shape(4, 2, 3)
+        ```
+        """
+        var transposed_re = self._re.T(axes)
+        var transposed_im = self._im.T(axes)
+        return Self(transposed_re^, transposed_im^)
+
+    fn diagonal(self, offset: Int = 0) raises -> Self:
+        """
+        Extract the diagonal from a 2D complex array.
+
+        Args:
+            offset: Offset from the main diagonal (0 for main diagonal).
+
+        Returns:
+            1D ComplexNDArray containing the diagonal elements.
+
+        Raises:
+            Error: If array is not 2D.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(4, 4))
+        var diag = A.diagonal()      # Main diagonal
+        var upper = A.diagonal(1)    # First upper diagonal
+        ```
+        """
+        if self.ndim != 2:
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "diagonal() requires a 2D array, got {} dimensions."
+                    ).format(self.ndim),
+                    suggestion=String(
+                        "Use a 2D ComplexNDArray for diagonal extraction."
+                    ),
+                    location=String("ComplexNDArray.diagonal()"),
+                )
+            )
+
+        var diag_re = self._re.diagonal(offset)
+        var diag_im = self._im.diagonal(offset)
+        return Self(diag_re^, diag_im^)
+
+    fn trace(self) raises -> ComplexSIMD[cdtype]:
+        """
+        Return the sum of the diagonal elements (trace of the matrix).
+
+        Returns:
+            Complex scalar containing the trace.
+
+        Raises:
+            Error: If array is not 2D.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 3))
+        var tr = A.trace()  # Sum of diagonal elements
+        ```
+        """
+        var diag = self.diagonal()
+        return diag.sum()
+
+    fn tolist(self) -> List[ComplexSIMD[cdtype]]:
+        """
+        Convert the complex array to a List of complex scalars.
+
+        Returns:
+            A List containing all complex elements in row-major order.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 3))
+        var elements = A.tolist()  # List of 6 complex numbers
+        ```
+        """
+        var result = List[ComplexSIMD[cdtype]](capacity=self.size)
+        for i in range(self.size):
+            result.append(
+                ComplexSIMD[cdtype](
+                    self._re._buf.ptr.load(i), self._im._buf.ptr.load(i)
+                )
+            )
+        return result^
+
+    fn num_elements(self) -> Int:
+        """
+        Return the total number of elements in the array.
+
+        Returns:
+            The size of the array (same as self.size).
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(3, 4, 5))
+        print(A.num_elements())  # 60
+        ```
+        """
+        return self.size
+
+    fn resize(mut self, shape: NDArrayShape) raises:
+        """
+        Change shape and size of array in-place.
+
+        If the new shape requires more elements, they are filled with zero.
+        If the new shape requires fewer elements, the array is truncated.
+
+        Args:
+            shape: The new shape for the array.
+
+        Examples:
+        ```mojo
+        import numojo as nm
+        var A = nm.ComplexNDArray[nm.cf64](nm.Shape(2, 3))
+        A.resize(nm.Shape(3, 4))  # Now 3x4, filled with zeros as needed
+        ```
+
+        Notes:
+            This modifies the array in-place. To get a reshaped copy, use reshape().
+        """
+        self._re.resize(shape)
+        self._im.resize(shape)
+        self.shape = shape
+        self.ndim = shape.ndim
+        self.size = shape.size_of_array()
+        var order = "C" if self.flags.C_CONTIGUOUS else "F"
+        self.strides = NDArrayStrides(shape, order=order)
+
 
 struct _ComplexNDArrayIter[
     is_mutable: Bool, //,
diff --git a/numojo/core/complex/complex_simd.mojo b/numojo/core/complex/complex_simd.mojo
index 5fa03171..9c23f775 100644
--- a/numojo/core/complex/complex_simd.mojo
+++ b/numojo/core/complex/complex_simd.mojo
@@ -27,7 +27,9 @@ alias CScalar[cdtype: ComplexDType] = ComplexSIMD[cdtype, width=1]
 
 
 @register_passable("trivial")
-struct ComplexSIMD[cdtype: ComplexDType, width: Int = 1](Stringable, Writable):
+struct ComplexSIMD[cdtype: ComplexDType, width: Int = 1](
+    ImplicitlyCopyable, Movable, Stringable, Writable
+):
     """
     A SIMD-enabled complex number type that supports vectorized operations.
 
diff --git a/numojo/core/datatypes.mojo b/numojo/core/datatypes.mojo
index 04d5e9e2..d4c106ea 100644
--- a/numojo/core/datatypes.mojo
+++ b/numojo/core/datatypes.mojo
@@ -15,10 +15,14 @@ alias i32 = DType.int32
 """Data type alias for DType.int32"""
 alias i64 = DType.int64
 """Data type alias for DType.int64"""
-alias isize = DType.index
-"""Data type alias for DType.index"""
-alias intp = DType.index
-"""Data type alias for DType.index"""
+alias i128 = DType.int128
+"""Data type alias for DType.int128"""
+alias i256 = DType.int256
+"""Data type alias for DType.int256"""
+alias int = DType.int
+"""Data type alias for DType.int"""
+alias uint = DType.int
+"""Data type alias for DType.uint"""
 alias u8 = DType.uint8
 """Data type alias for DType.uint8"""
 alias u16 = DType.uint16
@@ -27,8 +31,14 @@ alias u32 = DType.uint32
 """Data type alias for DType.uint32"""
 alias u64 = DType.uint64
 """Data type alias for DType.uint64"""
+alias u128 = DType.uint128
+"""Data type alias for DType.uint128"""
+alias u256 = DType.uint256
+"""Data type alias for DType.uint256"""
 alias f16 = DType.float16
 """Data type alias for DType.float16"""
+alias bf16 = DType.bfloat16
+"""Data type alias for DType.bfloat16"""
 alias f32 = DType.float32
 """Data type alias for DType.float32"""
 alias f64 = DType.float64
@@ -174,14 +184,14 @@ fn _concise_dtype_str(dtype: DType) -> String:
     """Returns a concise string representation of the data type."""
     if dtype == i8:
         return "i8"
-    elif dtype == i16:
-        return "i16"
-    elif dtype == i32:
-        return "i32"
     elif dtype == i64:
         return "i64"
-    elif dtype == isize:
-        return "index"
+    elif dtype == i128:
+        return "i128"
+    elif dtype == i256:
+        return "i256"
+    elif dtype == int:
+        return "int"
     elif dtype == u8:
         return "u8"
     elif dtype == u16:
@@ -190,6 +200,14 @@ fn _concise_dtype_str(dtype: DType) -> String:
         return "u32"
     elif dtype == u64:
         return "u64"
+    elif dtype == u128:
+        return "u128"
+    elif dtype == u256:
+        return "u256"
+    elif dtype == uint:
+        return "uint"
+    elif dtype == bf16:
+        return "bf16"
     elif dtype == f16:
         return "f16"
     elif dtype == f32:
@@ -198,8 +216,6 @@ fn _concise_dtype_str(dtype: DType) -> String:
         return "f64"
     elif dtype == boolean:
         return "boolean"
-    elif dtype == isize:
-        return "isize"
     else:
         return "Unknown"
 
diff --git a/numojo/core/gpu/__init__.mojo b/numojo/core/gpu/__init__.mojo
new file mode 100644
index 00000000..e69de29b
diff --git a/numojo/core/gpu/device.mojo b/numojo/core/gpu/device.mojo
new file mode 100644
index 00000000..7016efc1
--- /dev/null
+++ b/numojo/core/gpu/device.mojo
@@ -0,0 +1,209 @@
+from gpu.host import DeviceBuffer, DeviceContext, HostBuffer
+from gpu import thread_idx, block_dim, block_idx, grid_dim, barrier
+from gpu.host.dim import Dim
+from gpu.memory import AddressSpace
+from memory import stack_allocation
+from collections.optional import Optional
+from sys import simd_width_of
+from testing.testing import assert_true
+from sys.info import (
+    has_accelerator,
+    has_nvidia_gpu_accelerator,
+    has_amd_gpu_accelerator,
+    has_apple_gpu_accelerator,
+)
+
+alias cpu = Device.CPU
+alias gpu = Device.GPU
+alias cuda = Device.CUDA
+alias rocm = Device.ROCM
+alias metal = Device.MPS
+alias mps = Device.MPS
+
+
+# Device descriptor
+struct Device(ImplicitlyCopyable, Movable, Representable, Stringable, Writable):
+    """Execution device for arrays/matrices.
+
+    Fields:
+    - type: "cpu" | "gpu"
+    - name: backend identifier ("" for CPU, "cuda" | "rocm" | "metal" for GPU)
+    - id:   device index on that backend (0-based)
+
+    Aliases:
+    - CPU       -> CPU execution
+    - GPU       -> Generic GPU "auto" selector (prefers available accelerator)
+    - CUDA      -> NVIDIA CUDA GPU
+    - ROCM      -> AMD ROCm GPU
+    - METAL     -> Apple Metal GPU
+    """
+
+    var type: String
+    var name: String
+    var id: Int
+
+    # Aliases for convenience
+    alias CPU = Device(type="cpu", name="", id=0)
+    alias GPU = Device(type="gpu", name="auto", id=0)
+
+    alias CUDA = Device(type="gpu", name="cuda", id=0)
+    alias ROCM = Device(type="gpu", name="rocm", id=0)
+    alias MPS = Device(type="gpu", name="mps", id=0)
+
+    @parameter
+    @staticmethod
+    fn detect_accelerator() -> String:
+        """Detect the best available accelerator on the system."""
+
+        @parameter
+        if has_nvidia_gpu_accelerator():
+            return "cuda"
+        if has_amd_gpu_accelerator():
+            return "rocm"
+        if has_apple_gpu_accelerator():
+            return "mps"
+        return "cpu"
+
+    fn __init__(out self, type: String, name: String, id: Int):
+        try:
+            assert_true(
+                type == "cpu" or type == "gpu",
+                "Device type must be 'cpu' or 'gpu'",
+            )
+            if type == "cpu":
+                assert_true(name == "", "CPU device name must be empty string")
+                assert_true(id == 0, "CPU device id must be 0")
+            else:
+                assert_true(
+                    name == "cuda"
+                    or name == "rocm"
+                    or name == "mps"
+                    or name == "auto",
+                    "Invalid GPU device name",
+                )
+                assert_true(id >= 0, "GPU device id must be non-negative")
+            self.type = type
+            self.name = name
+            if self.name == "auto":
+                self.name = Device.detect_accelerator()
+            self.id = id
+        except e:
+            print("Invalid device type provided. Defaulting to CPU.")
+            self.type = "cpu"
+            self.name = ""
+            self.id = 0
+
+    fn __repr__(self) -> String:
+        return self.__str__()
+
+    @staticmethod
+    fn default() -> Device:
+        """Choose a sensible default device: prefer any available GPU, else CPU.
+        """
+        if has_nvidia_gpu_accelerator():
+            return Device.CUDA
+        if has_amd_gpu_accelerator():
+            return Device.ROCM
+        if has_apple_gpu_accelerator():
+            return Device.MPS
+        return Device.CPU
+
+    @staticmethod
+    fn gpu_default() raises -> Device:
+        """Choose the best available GPU, or raise if none is available."""
+        if has_nvidia_gpu_accelerator():
+            return Device.CUDA
+        if has_amd_gpu_accelerator():
+            return Device.ROCM
+        if has_apple_gpu_accelerator():
+            return Device.MPS
+        raise Error("No GPU accelerator available on this system")
+
+    fn available(self) -> Bool:
+        """Check if this device is available on the current system."""
+        if self.type == "cpu":
+            return True
+        if self.name == "cuda":
+            return has_nvidia_gpu_accelerator()
+        if self.name == "rocm":
+            return has_amd_gpu_accelerator()
+        if self.name == "metal":
+            return has_apple_gpu_accelerator()
+        if self.type == "gpu" and self.name == "auto":
+            return has_accelerator()
+        return False
+
+    @staticmethod
+    @parameter
+    fn available_devices() -> String:
+        """List all available devices on the current system."""
+        var devices_string: String = "\n"
+        devices_string += (
+            "  • " + String(Device.CPU) + " (Default CPU device)\n"
+        )
+
+        if has_nvidia_gpu_accelerator():
+            devices_string += (
+                "  • " + String(Device.CUDA) + " (NVIDIA CUDA GPU)\n"
+            )
+        if has_amd_gpu_accelerator():
+            devices_string += "  • " + String(Device.ROCM) + " (AMD ROCm GPU)\n"
+        if has_apple_gpu_accelerator():
+            devices_string += (
+                "  • " + String(Device.MPS) + " (Apple Metal GPU)\n"
+            )
+
+        if not (
+            has_nvidia_gpu_accelerator()
+            or has_amd_gpu_accelerator()
+            or has_apple_gpu_accelerator()
+        ):
+            devices_string += "  (No GPU accelerators detected)"
+
+        return devices_string
+
+    fn __str__(self) -> String:
+        try:
+            return String("Device(type='{}', name='{}', id={})").format(
+                self.type, self.name, self.id
+            )
+        except:
+            return "Device(Invalid)"
+
+    fn write_to[W: Writer](self, mut writer: W):
+        writer.write(self.__str__())
+
+    fn __eq__(self, other: Self) -> Bool:
+        return (
+            (self.type == other.type)
+            and (self.id == other.id)
+            and (self.name == other.name)
+        )
+
+    fn __ne__(self, other: Self) -> Bool:
+        return not self.__eq__(other)
+
+
+@parameter
+fn check_accelerator[device: Device]() -> Bool:
+    @parameter
+    if device.type != "gpu":
+        return False
+
+    @parameter
+    if device.name == "auto":
+        return has_accelerator()
+
+    @parameter
+    if device.name == "" or device.name == "cpu":
+        return False
+
+    @parameter
+    if device.name == "cuda":
+        return has_nvidia_gpu_accelerator()
+    elif device.name == "rocm":
+        return has_amd_gpu_accelerator()
+    elif device.name == "mps" or device.name == "metal":
+        return has_apple_gpu_accelerator()
+    else:
+        return False
diff --git a/numojo/core/gpu/matrix_kernels.mojo b/numojo/core/gpu/matrix_kernels.mojo
new file mode 100644
index 00000000..998d0782
--- /dev/null
+++ b/numojo/core/gpu/matrix_kernels.mojo
@@ -0,0 +1,383 @@
+from gpu import thread_idx, block_dim, block_idx, grid_dim, barrier
+from gpu.globals import WARP_SIZE, MAX_THREADS_PER_BLOCK_METADATA
+from gpu.warp import sum as warp_sum
+from gpu.memory import load, AddressSpace
+from memory import UnsafePointer, stack_allocation
+from sys import simd_width_of
+
+# temporary defaults.
+alias MAX_THREADS_PER_BLOCK = 1024
+alias OPTIMAL_BLOCK_SIZE = 256
+alias TILE_SIZE = 32
+alias VECTOR_WIDTH = 4
+alias MAX_WARPS_PER_BLOCK = MAX_THREADS_PER_BLOCK // UInt(WARP_SIZE)
+
+
+fn matrix_add_kernel_vectorized[
+    dtype: DType
+](
+    output: UnsafePointer[Scalar[dtype]],
+    a: UnsafePointer[Scalar[dtype]],
+    b: UnsafePointer[Scalar[dtype]],
+    size: UInt,
+    total_threads: UInt,
+):
+    """Optimized GPU kernel for element-wise matrix addition with vectorization.
+
+    Features:
+    - Vectorized memory operations (dtype-specific widths)
+    - Coalesced memory access patterns
+    - Grid-stride loops
+    - Proper bounds checking
+
+    Parameters:
+        dtype: Data type of the matrices.
+
+    Args:
+        output: Output buffer for result.
+        a: First input matrix buffer.
+        b: Second input matrix buffer.
+        size: Total number of elements.
+        total_threads: Total number of threads launched (grid_size * block_size).
+    """
+    var tid = block_dim.x * block_idx.x + thread_idx.x
+    var grid_stride = total_threads
+
+    @parameter
+    if dtype == DType.float16:
+        # width = 8
+        var base_idx = tid * 8
+        var idx = base_idx
+        while idx + 8 <= size:
+            var a_vec = a.load[width=8](idx)
+            var b_vec = b.load[width=8](idx)
+            var result = a_vec + b_vec
+            output.store[width=8](idx, result)
+            idx += grid_stride * 8
+        while idx < size:
+            output[idx] = a[idx] + b[idx]
+            idx += grid_stride
+    elif dtype == DType.float32:
+        # width = 4
+        var base_idx = tid * 4
+        var idx = base_idx
+        while idx + 4 <= size:
+            var a_vec = a.load[width=4](idx)
+            var b_vec = b.load[width=4](idx)
+            var result = a_vec + b_vec
+            output.store[width=4](idx, result)
+            idx += grid_stride * 4
+        while idx < size:
+            output[idx] = a[idx] + b[idx]
+            idx += grid_stride
+    elif dtype == DType.float64:
+        # width = 2
+        var base_idx = tid * 2
+        var idx = base_idx
+        while idx + 2 <= size:
+            var a_vec = a.load[width=2](idx)
+            var b_vec = b.load[width=2](idx)
+            var result = a_vec + b_vec
+            output.store[width=2](idx, result)
+            idx += grid_stride * 2
+        while idx < size:
+            output[idx] = a[idx] + b[idx]
+            idx += grid_stride
+    else:
+        # Scalar fallback with grid-stride
+        var idx = tid
+        while idx < size:
+            output[idx] = a[idx] + b[idx]
+            idx += grid_stride
+
+
+fn matrix_sub_kernel_vectorized[
+    dtype: DType
+](
+    output: UnsafePointer[Scalar[dtype]],
+    a: UnsafePointer[Scalar[dtype]],
+    b: UnsafePointer[Scalar[dtype]],
+    size: UInt,
+    total_threads: UInt,
+):
+    """Optimized GPU kernel for element-wise matrix subtraction with vectorization.
+
+    Features:
+    - Vectorized memory operations (dtype-specific widths)
+    - Coalesced memory access patterns
+    - Grid-stride loops
+    - Proper bounds checking
+
+    Parameters:
+        dtype: Data type of the matrices.
+
+    Args:
+        output: Output buffer for result.
+        a: First input matrix buffer.
+        b: Second input matrix buffer.
+        size: Total number of elements.
+        total_threads: Total number of threads launched (grid_size * block_size).
+    """
+    var tid = block_dim.x * block_idx.x + thread_idx.x
+    var grid_stride = total_threads
+
+    @parameter
+    if dtype == DType.float16:
+        # width = 8
+        var base_idx = tid * 8
+        var idx = base_idx
+        while idx + 8 <= size:
+            var a_vec = a.load[width=8](idx)
+            var b_vec = b.load[width=8](idx)
+            var result = a_vec - b_vec
+            output.store[width=8](idx, result)
+            idx += grid_stride * 8
+        while idx < size:
+            output[idx] = a[idx] - b[idx]
+            idx += grid_stride
+    elif dtype == DType.float32:
+        # width = 4
+        var base_idx = tid * 4
+        var idx = base_idx
+        while idx + 4 <= size:
+            var a_vec = a.load[width=4](idx)
+            var b_vec = b.load[width=4](idx)
+            var result = a_vec - b_vec
+            output.store[width=4](idx, result)
+            idx += grid_stride * 4
+        while idx < size:
+            output[idx] = a[idx] - b[idx]
+            idx += grid_stride
+    elif dtype == DType.float64:
+        # width = 2
+        var base_idx = tid * 2
+        var idx = base_idx
+        while idx + 2 <= size:
+            var a_vec = a.load[width=2](idx)
+            var b_vec = b.load[width=2](idx)
+            var result = a_vec - b_vec
+            output.store[width=2](idx, result)
+            idx += grid_stride * 2
+        while idx < size:
+            output[idx] = a[idx] - b[idx]
+            idx += grid_stride
+    else:
+        # Scalar fallback with grid-stride
+        var idx = tid
+        while idx < size:
+            output[idx] = a[idx] - b[idx]
+            idx += grid_stride
+
+
+fn matrix_mul_kernel_vectorized[
+    dtype: DType
+](
+    output: UnsafePointer[Scalar[dtype]],
+    a: UnsafePointer[Scalar[dtype]],
+    b: UnsafePointer[Scalar[dtype]],
+    size: UInt,
+    total_threads: UInt,
+):
+    """Optimized GPU kernel for element-wise matrix multiplication with vectorization.
+
+    Features:
+    - Vectorized memory operations (dtype-specific widths)
+    - Coalesced memory access
+    - Grid-stride loops
+
+    Args:
+        output: Output buffer for result.
+        a: First input matrix buffer.
+        b: Second input matrix buffer.
+        size: Total number of elements.
+        total_threads: Total number of threads launched (grid_size * block_size).
+    """
+    var tid = block_dim.x * block_idx.x + thread_idx.x
+    var grid_stride = total_threads
+
+    @parameter
+    if dtype == DType.float16:
+        var base_idx = tid * 8
+        var idx = base_idx
+        while idx + 8 <= size:
+            var a_vec = a.load[width=8](idx)
+            var b_vec = b.load[width=8](idx)
+            var result = a_vec * b_vec
+            output.store[width=8](idx, result)
+            idx += grid_stride * 8
+        while idx < size:
+            output[idx] = a[idx] * b[idx]
+            idx += grid_stride
+    elif dtype == DType.float32:
+        var base_idx = tid * 4
+        var idx = base_idx
+        while idx + 4 <= size:
+            var a_vec = a.load[width=4](idx)
+            var b_vec = b.load[width=4](idx)
+            var result = a_vec * b_vec
+            output.store[width=4](idx, result)
+            idx += grid_stride * 4
+        while idx < size:
+            output[idx] = a[idx] * b[idx]
+            idx += grid_stride
+    elif dtype == DType.float64:
+        var base_idx = tid * 2
+        var idx = base_idx
+        while idx + 2 <= size:
+            var a_vec = a.load[width=2](idx)
+            var b_vec = b.load[width=2](idx)
+            var result = a_vec * b_vec
+            output.store[width=2](idx, result)
+            idx += grid_stride * 2
+        while idx < size:
+            output[idx] = a[idx] * b[idx]
+            idx += grid_stride
+    else:
+        var idx = tid
+        while idx < size:
+            output[idx] = a[idx] * b[idx]
+            idx += grid_stride
+
+
+fn matrix_fill_kernel_vectorized[
+    dtype: DType
+](
+    output: UnsafePointer[Scalar[dtype]],
+    value: Scalar[dtype],
+    size: UInt,
+    total_threads: UInt,
+):
+    """Optimized GPU kernel for filling matrix with a value.
+
+    Features:
+    - Vectorized stores for better memory bandwidth (dtype-specific widths)
+    - Grid-stride loops
+    - Coalesced memory writes
+
+    Args:
+        output: Output buffer to fill.
+        value: Value to fill with.
+        size: Total number of elements.
+        total_threads: Total number of threads launched (grid_size * block_size).
+    """
+    var tid = block_dim.x * block_idx.x + thread_idx.x
+    var grid_stride = total_threads
+
+    @parameter
+    if dtype == DType.float16:
+        # width = 8
+        alias VecType = SIMD[dtype, 8]
+        var vec_value = VecType(value)
+        var base_idx = tid * 8
+        var idx = base_idx
+        while idx + 8 <= size:
+            output.store[width=8](idx, vec_value)
+            idx += grid_stride * 8
+        while idx < size:
+            output[idx] = value
+            idx += grid_stride
+    elif dtype == DType.float32:
+        # width = 4
+        alias VecType = SIMD[dtype, 4]
+        var vec_value = VecType(value)
+        var base_idx = tid * 4
+        var idx = base_idx
+        while idx + 4 <= size:
+            output.store[width=4](idx, vec_value)
+            idx += grid_stride * 4
+        while idx < size:
+            output[idx] = value
+            idx += grid_stride
+    elif dtype == DType.float64:
+        # width = 2
+        alias VecType = SIMD[dtype, 2]
+        var vec_value = VecType(value)
+        var base_idx = tid * 2
+        var idx = base_idx
+        while idx + 2 <= size:
+            output.store[width=2](idx, vec_value)
+            idx += grid_stride * 2
+        while idx < size:
+            output[idx] = value
+            idx += grid_stride
+    else:
+        var idx = tid
+        while idx < size:
+            output[idx] = value
+            idx += grid_stride
+
+
+fn matrix_reduce_sum_kernel[
+    dtype: DType
+](
+    input: UnsafePointer[Scalar[dtype]],
+    output: UnsafePointer[Scalar[dtype]],
+    size: UInt,
+):
+    """Optimized reduction sum kernel using warp primitives.
+
+    This kernel computes a per-block reduction (one output element per
+    block) by performing:
+      1) a grid-stride accumulation per thread,
+      2) a warp-level reduction (fast shuffle) to produce one partial sum
+         per warp,
+      3) writing warp partials to shared memory,
+      4) a single-thread final accumulation of warp partials and write-out.
+
+    Args:
+        input: Input buffer to reduce.
+        output: Output buffer for per-block sums (indexed by blockIdx.x).
+        size: Total number of elements.
+    """
+    var tid = block_dim.x * block_idx.x + thread_idx.x
+    var grid_stride = block_dim.x * grid_dim.x
+
+    var thread_sum = Scalar[dtype](0)
+    var idx = tid
+    while idx < size:
+        thread_sum += input[idx]
+        idx += grid_stride
+
+    var lane_id: UInt = thread_idx.x % UInt(WARP_SIZE)
+    var warp_result = warp_sum(thread_sum)
+
+    var num_warps_per_block: UInt = (block_dim.x + UInt(WARP_SIZE) - 1) // UInt(
+        WARP_SIZE
+    )
+
+    var warp_shared = stack_allocation[
+        MAX_WARPS_PER_BLOCK,
+        Scalar[dtype],
+        address_space = AddressSpace.SHARED,
+    ]()
+
+    if lane_id == 0:
+        var warp_id_in_block: UInt = thread_idx.x // UInt(WARP_SIZE)
+        warp_shared[warp_id_in_block] = warp_result
+
+    barrier()
+    if thread_idx.x == 0:
+        var block_total = Scalar[dtype](0)
+        var i: UInt = 0
+        while i < num_warps_per_block:
+            block_total += warp_shared[i]
+            i += 1
+        output[block_idx.x] = block_total
+
+
+# TODO: move to compile time and specialize based on dtype.
+fn calculate_1d_launch_params_for_dtype[
+    dtype: DType
+](total_elements: Int) -> Tuple[Int, Int]:
+    """Calculate 1D launch parameters specialized for dtype, with warp alignment
+    and vector-width-aware grid sizing to reduce kernel launch overhead on small inputs.
+    """
+
+    # For Apple Metal
+    var block_size = 32
+    var grid_size = (total_elements + block_size - 1) // block_size
+
+    if grid_size > 8192:
+        grid_size = 8192
+
+    return (grid_size, block_size)
diff --git a/numojo/core/gpu/storage.mojo b/numojo/core/gpu/storage.mojo
new file mode 100644
index 00000000..901c021c
--- /dev/null
+++ b/numojo/core/gpu/storage.mojo
@@ -0,0 +1,158 @@
+from time import perf_counter_ns
+from gpu.host import DeviceBuffer, DeviceContext, HostBuffer
+from gpu import thread_idx, block_dim, block_idx, grid_dim, barrier
+from gpu.host.dim import Dim
+from gpu.memory import AddressSpace
+from memory import stack_allocation
+from gpu.globals import WARP_SIZE, MAX_THREADS_PER_BLOCK_METADATA
+from collections.optional import Optional
+from memory import UnsafePointer
+from sys import simd_width_of
+from sys.info import (
+    has_accelerator,
+    has_nvidia_gpu_accelerator,
+    has_amd_gpu_accelerator,
+    has_apple_gpu_accelerator,
+)
+
+from numojo.core.gpu.device import Device, check_accelerator
+
+
+struct HostStorage[dtype: DType, device: Device](
+    Copyable, Movable
+):  # similar to owndata
+    var ptr: UnsafePointer[Scalar[dtype]]
+
+    fn __init__(out self, size: Int):
+        """
+        Allocate given space on memory.
+        The bytes allocated is `size` * `byte size of dtype`.
+
+        Notes:
+        `ndarray.flags['OWN_DATA']` should be set as True.
+        The memory should be freed by `__del__`.
+        """
+        self.ptr = UnsafePointer[Scalar[dtype]]().alloc(size)
+
+    fn __init__(out self, ptr: UnsafePointer[Scalar[dtype]]):
+        """
+        Do not use this if you know what it means.
+        If the pointer is associated with another array, it might cause
+        dangling pointer problem.
+
+        Notes:
+        `ndarray.flags['OWN_DATA']` should be set as False.
+        The memory should not be freed by `__del__`.
+        """
+        self.ptr = ptr
+
+    fn __moveinit__(out self, deinit other: Self):
+        self.ptr = other.ptr
+
+    fn get_ptr(self) -> UnsafePointer[Scalar[dtype]]:
+        return self.ptr
+
+
+struct DeviceStorage[dtype: DType, device: Device](Copyable, Movable):
+    var buffer: DeviceBuffer[dtype]
+
+    fn __init__(out self, size: Int) raises:
+        constrained[
+            has_accelerator(), "No GPU device available for DeviceStorage"
+        ]()
+        var buf = DeviceContext().enqueue_create_buffer[dtype](size)
+        self.buffer = buf^
+
+    fn __moveinit__(out self, deinit other: Self):
+        self.buffer = other.buffer^
+
+    fn get_buffer(self) -> DeviceBuffer[dtype]:
+        return self.buffer
+
+    fn get_device_ptr(self) -> UnsafePointer[Scalar[dtype]]:
+        return self.buffer.unsafe_ptr()
+
+    fn get_device_context(self) raises -> DeviceContext:
+        return self.buffer.context()
+
+
+struct DataContainer[dtype: DType, device: Device = Device.CPU](
+    Copyable, Movable
+):
+    """Storage container using Optional approach for CPU/GPU memory.
+
+    Performance characteristics:
+    - Memory: Only allocates space for the storage type being used
+    - Access: Direct .value() call, no pattern matching needed
+    - Type safety: Clear which storage type is active
+    """
+
+    var host_storage: Optional[HostStorage[dtype, device]]
+    var device_storage: Optional[DeviceStorage[dtype, device]]
+    var size: Int
+
+    fn __init__(out self, size: Int) raises:
+        self.size = size
+
+        @parameter
+        if device.type == "cpu":
+            self.host_storage = HostStorage[dtype, device](size)
+            self.device_storage = None
+        elif device.type == "gpu":
+            if not check_accelerator[device]():
+                raise Error(
+                    "\n Requested GPU device: "
+                    + String(device)
+                    + " is not available. The available devices are: "
+                    + Device.available_devices()
+                )
+            self.host_storage = None
+            self.device_storage = DeviceStorage[dtype, device](size)
+        else:
+            raise Error("Unsupported device type: " + String(device.type))
+
+    fn __moveinit__(out self, deinit other: Self):
+        self.host_storage = other.host_storage^
+        self.device_storage = other.device_storage^
+        self.size = other.size
+
+    fn get_host_ptr(self) -> UnsafePointer[Scalar[dtype]]:
+        """Get pointer for CPU access. For GPU, use map_to_host context."""
+        constrained[
+            device == Device.CPU,
+            "Cannot retrieve a pointer to Device memory for a HostBuffer",
+        ]()
+        return self.host_storage.unsafe_value().get_ptr()
+
+    fn get_device_ptr(self) raises -> UnsafePointer[Scalar[dtype]]:
+        """Get pointer for GPU access."""
+        constrained[
+            (
+                device == Device.GPU
+                or device == Device.CUDA
+                or device == Device.ROCM
+                or device == Device.MPS
+            ),
+            "Cannot retrieve a pointer to Host memory for a DeviceBuffer",
+        ]()
+        return self.device_storage.value().get_device_ptr()
+
+    @parameter
+    fn is_cpu(self) -> Bool:
+        return device.type == "cpu"
+
+    @parameter
+    fn is_gpu(self) -> Bool:
+        return device.type == "gpu"
+
+    fn get_device_buffer(self) raises -> DeviceBuffer[dtype]:
+        constrained[
+            (
+                device == Device.GPU
+                or device == Device.CUDA
+                or device == Device.ROCM
+                or device == Device.MPS
+            ),
+            "Cannot retrieve a DeviceBuffer for a HostBuffer",
+        ]()
+        return self.device_storage.value().get_buffer()
diff --git a/numojo/core/item.mojo b/numojo/core/item.mojo
index 776b3eea..252f2d1c 100644
--- a/numojo/core/item.mojo
+++ b/numojo/core/item.mojo
@@ -5,11 +5,14 @@ Implements Item type.
 """
 
 from builtin.type_aliases import Origin
+from builtin.int import index as index_int
 from memory import UnsafePointer, memset_zero, memcpy
+from memory import memcmp
 from os import abort
 from sys import simd_width_of
 from utils import Variant
 
+from numojo.core.error import IndexError, ValueError
 from numojo.core.traits.indexer_collection_element import (
     IndexerCollectionElement,
 )
@@ -19,12 +22,18 @@ alias item = Item
 
 
 @register_passable
-struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
+struct Item(
+    ImplicitlyCopyable, Movable, Representable, Sized, Stringable, Writable
+):
     """
     Specifies the indices of an item of an array.
     """
 
-    var _buf: UnsafePointer[Int]
+    # Aliases
+    alias _type: DType = DType.int
+
+    # Fields
+    var _buf: UnsafePointer[Scalar[Self._type]]
     var ndim: Int
 
     @always_inline("nodebug")
@@ -37,7 +46,7 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
         Args:
             args: Initial values.
         """
-        self._buf = UnsafePointer[Int]().alloc(args.__len__())
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(args.__len__())
         self.ndim = args.__len__()
         for i in range(args.__len__()):
             self._buf[i] = index(args[i])
@@ -53,7 +62,7 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
             args: Initial values.
         """
         self.ndim = len(args)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             (self._buf + i).init_pointee_copy(index(args[i]))
 
@@ -65,7 +74,7 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
             args: Initial values.
         """
         self.ndim = len(args)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             (self._buf + i).init_pointee_copy(Int(args[i]))
 
@@ -103,58 +112,15 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
                 )
             )
 
-        self.ndim = ndim
-        self._buf = UnsafePointer[Int]().alloc(ndim)
-        if initialized:
-            for i in range(ndim):
-                (self._buf + i).init_pointee_copy(0)
-
-    fn __init__(out self, idx: Int, shape: NDArrayShape) raises:
-        """
-        Get indices of the i-th item of the array of the given shape.
-        The item traverse the array in C-order.
-
-        Args:
-            idx: The i-th item of the array.
-            shape: The strides of the array.
-
-        Examples:
-
-        The following example demonstrates how to get the indices (coordinates)
-        of the 123-th item of a 3D array with shape (20, 30, 40).
-
-        ```console
-        >>> from numojo.prelude import *
-        >>> var item = Item(123, Shape(20, 30, 40))
-        >>> print(item)
-        Item at index: (0,3,3)  Length: 3
-        ```
-        """
-
-        if (idx < 0) or (idx >= shape.size_of_array()):
-            raise Error(
-                IndexError(
-                    message=String(
-                        "Linear index {} out of range [0, {})."
-                    ).format(idx, shape.size_of_array()),
-                    suggestion=String(
-                        "Ensure 0 <= idx < total size ({})."
-                    ).format(shape.size_of_array()),
-                    location=String(
-                        "Item.__init__(idx: Int, shape: NDArrayShape)"
-                    ),
-                )
-            )
-
-        self.ndim = shape.ndim
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
-
-        var strides = NDArrayStrides(shape, order="C")
-        var remainder = idx
-
-        for i in range(self.ndim):
-            (self._buf + i).init_pointee_copy(remainder // strides._buf[i])
-            remainder %= strides._buf[i]
+        if ndim == 0:
+            self.ndim = 0
+            self._buf = UnsafePointer[Scalar[Self._type]]()
+        else:
+            self.ndim = ndim
+            self._buf = UnsafePointer[Scalar[Self._type]]().alloc(ndim)
+            if initialized:
+                for i in range(ndim):
+                    (self._buf + i).init_pointee_copy(0)
 
     @always_inline("nodebug")
     fn __copyinit__(out self, other: Self):
@@ -164,12 +130,13 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
             other: The tuple to copy.
         """
         self.ndim = other.ndim
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
-        memcpy(self._buf, other._buf, self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
+        memcpy(dest=self._buf, src=other._buf, count=self.ndim)
 
     @always_inline("nodebug")
     fn __del__(deinit self):
-        self._buf.free()
+        if self.ndim > 0:
+            self._buf.free()
 
     @always_inline("nodebug")
     fn __len__(self) -> Int:
@@ -180,6 +147,21 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
         """
         return self.ndim
 
+    fn normalize_index(self, index: Int) -> Int:
+        """
+        Normalizes the given index to be within the valid range.
+
+        Args:
+            index: The index to normalize.
+
+        Returns:
+            The normalized index.
+        """
+        var normalized_idx: Int = index
+        if normalized_idx < 0:
+            normalized_idx += self.ndim
+        return normalized_idx
+
     @always_inline("nodebug")
     fn __getitem__[T: Indexer](self, idx: T) raises -> Int:
         """Gets the value at the specified index.
@@ -193,16 +175,12 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
         Returns:
             The value at the specified index.
         """
-
-        var normalized_idx: Int = index(idx)
-        if normalized_idx < 0:
-            normalized_idx = index(idx) + self.ndim
-
-        if normalized_idx < 0 or normalized_idx >= self.ndim:
+        var index: Int = index_int(idx)
+        if index >= self.ndim or index < -self.ndim:
             raise Error(
                 IndexError(
                     message=String("Index {} out of range [{} , {}).").format(
-                        index(idx), -self.ndim, self.ndim
+                        index_int(idx), -self.ndim, self.ndim
                     ),
                     suggestion=String(
                         "Use indices in [-ndim, ndim) (negative indices wrap)."
@@ -210,8 +188,46 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
                     location=String("Item.__getitem__"),
                 )
             )
+        var normalized_idx: Int = self.normalize_index(index_int(idx))
+        return Int(self._buf[normalized_idx])
 
-        return self._buf[normalized_idx]
+    @always_inline("nodebug")
+    fn __getitem__(self, slice_index: Slice) raises -> Self:
+        """
+        Return a sliced view of the item as a new Item.
+        Delegates normalization & validation to _compute_slice_params.
+
+        Args:
+            slice_index: The slice to extract.
+
+        Returns:
+            A new Item containing the sliced values.
+
+        Example:
+        ```mojo
+        from numojo.prelude import *
+        var item = Item(10, 20, 30, 40, 50)
+        print(item[1:4])  # Item: (20, 30, 40)
+        print(item[::2])  # Item: (10, 30, 50)
+        ```
+        """
+        var updated_slice: Tuple[Int, Int, Int] = self._compute_slice_params(
+            slice_index
+        )
+        var start = updated_slice[0]
+        var step = updated_slice[1]
+        var length = updated_slice[2]
+
+        if length <= 0:
+            var empty_result = Self(ndim=0, initialized=False)
+            return empty_result
+
+        var result = Self(ndim=length, initialized=False)
+        var idx = start
+        for i in range(length):
+            (result._buf + i).init_pointee_copy(self._buf[idx])
+            idx += step
+        return result^
 
     @always_inline("nodebug")
     fn __setitem__[T: Indexer, U: Indexer](self, idx: T, val: U) raises:
@@ -226,15 +242,15 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
             val: The value to set.
         """
 
-        var normalized_idx: Int = index(idx)
+        var normalized_idx: Int = index_int(idx)
         if normalized_idx < 0:
-            normalized_idx = index(idx) + self.ndim
+            normalized_idx = index_int(idx) + self.ndim
 
         if normalized_idx < 0 or normalized_idx >= self.ndim:
             raise Error(
                 IndexError(
                     message=String("Index {} out of range [{} , {}).").format(
-                        index(idx), -self.ndim, self.ndim
+                        index_int(idx), -self.ndim, self.ndim
                     ),
                     suggestion=String(
                         "Use indices in [-ndim, ndim) (negative indices wrap)."
@@ -282,36 +298,399 @@ struct Item(ImplicitlyCopyable, Movable, Stringable, Writable):
             + String(self.ndim)
         )
 
+    @always_inline("nodebug")
+    fn __eq__(self, other: Self) -> Bool:
+        """
+        Checks if two items have identical dimensions and values.
+
+        Args:
+            other: The item to compare with.
+
+        Returns:
+            True if both items have identical dimensions and values.
+        """
+        if self.ndim != other.ndim:
+            return False
+        if memcmp(self._buf, other._buf, self.ndim) != 0:
+            return False
+        return True
+
+    @always_inline("nodebug")
+    fn __ne__(self, other: Self) -> Bool:
+        """
+        Checks if two items have different dimensions or values.
+
+        Args:
+            other: The item to compare with.
+
+        Returns:
+            True if both items do not have identical dimensions or values.
+        """
+        return not self.__eq__(other)
+
+    @always_inline("nodebug")
+    fn __contains__(self, val: Int) -> Bool:
+        """
+        Checks if the given value is present in the item.
+
+        Args:
+            val: The value to search for.
+
+        Returns:
+            True if the given value is present in the item.
+        """
+        for i in range(self.ndim):
+            if self._buf[i] == val:
+                return True
+        return False
+
     # ===-------------------------------------------------------------------===#
     # Other methods
     # ===-------------------------------------------------------------------===#
 
-    fn offset(self, strides: NDArrayStrides) -> Int:
+    @always_inline("nodebug")
+    fn copy(read self) raises -> Self:
         """
-        Calculates the offset of the item according to strides.
+        Returns a deep copy of the item.
+
+        Returns:
+            A new Item with the same values.
+        """
+        var res = Self(ndim=self.ndim, initialized=False)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
+        return res^
+
+    fn swapaxes(self, axis1: Int, axis2: Int) raises -> Self:
+        """
+        Returns a new item with the given axes swapped.
 
         Args:
-            strides: The strides of the array.
+            axis1: The first axis to swap.
+            axis2: The second axis to swap.
 
         Returns:
-            The offset of the item.
+            A new item with the given axes swapped.
+        """
+        var res: Self = Self(ndim=self.ndim, initialized=False)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
+        res[axis1] = self[axis2]
+        res[axis2] = self[axis1]
+        return res
+
+    fn join(self, *others: Self) raises -> Self:
+        """
+        Join multiple items into a single item.
+
+        Args:
+            others: Variable number of Item objects.
+
+        Returns:
+            A new Item object with all values concatenated.
 
         Examples:
+        ```mojo
+        from numojo.prelude import *
+        var item1 = Item(1, 2)
+        var item2 = Item(3, 4)
+        var item3 = Item(5)
+        var joined = item1.join(item2, item3)
+        print(joined)  # Item at index: (1,2,3,4,5)
+        ```
+        """
+        var total_dims: Int = self.ndim
+        for i in range(len(others)):
+            total_dims += others[i].ndim
+
+        var new_item: Self = Self(ndim=total_dims, initialized=False)
+
+        var index: UInt = 0
+        for i in range(self.ndim):
+            (new_item._buf + index).init_pointee_copy(self[i])
+            index += 1
+
+        for i in range(len(others)):
+            for j in range(others[i].ndim):
+                (new_item._buf + index).init_pointee_copy(others[i][j])
+                index += 1
+
+        return new_item^
+
+    # ===-------------------------------------------------------------------===#
+    # Other private methods
+    # ===-------------------------------------------------------------------===#
+
+    fn _flip(self) raises -> Self:
+        """
+        Returns a new item by flipping the items.
+        ***UNSAFE!*** No boundary check!
 
+        Returns:
+            A new item with the items flipped.
+
+        Example:
         ```mojo
         from numojo.prelude import *
         var item = Item(1, 2, 3)
-        var strides = nm.Strides(4, 3, 2)
-        print(item.offset(strides))
-        # This prints `16`.
+        print(item)          # Item: (1, 2, 3)
+        print(item._flip())  # Item: (3, 2, 1)
+        ```
+        """
+        var result: Self = Self(ndim=self.ndim, initialized=False)
+        memcpy(dest=result._buf, src=self._buf, count=self.ndim)
+        for i in range(result.ndim):
+            result._buf[i] = self._buf[self.ndim - 1 - i]
+        return result^
+
+    fn _move_axis_to_end(self, var axis: Int) raises -> Self:
+        """
+        Returns a new item by moving the value of axis to the end.
+        ***UNSAFE!*** No boundary check!
+
+        Args:
+            axis: The axis (index) to move. It should be in `[-ndim, ndim)`.
+
+        Returns:
+            A new item with the specified axis moved to the end.
+
+        Example:
+        ```mojo
+        from numojo.prelude import *
+        var item = Item(10, 20, 30)
+        print(item._move_axis_to_end(0))  # Item: (20, 30, 10)
+        print(item._move_axis_to_end(1))  # Item: (10, 30, 20)
         ```
-        .
         """
+        if axis < 0:
+            axis += self.ndim
+
+        var result: Self = Self(ndim=self.ndim, initialized=False)
+        memcpy(dest=result._buf, src=self._buf, count=self.ndim)
+
+        if axis == self.ndim - 1:
+            return result^
+
+        var value: Scalar[Self._type] = result._buf[axis]
+        for i in range(axis, result.ndim - 1):
+            result._buf[i] = result._buf[i + 1]
+        result._buf[result.ndim - 1] = value
+        return result^
+
+    fn _pop(self, axis: Int) raises -> Self:
+        """
+        Drops information of certain axis.
+        ***UNSAFE!*** No boundary check!
+
+        Args:
+            axis: The axis (index) to drop. It should be in `[0, ndim)`.
+
+        Returns:
+            A new item with the item at the given axis (index) dropped.
+        """
+        var res: Self = Self(ndim=self.ndim - 1, initialized=False)
+        memcpy(dest=res._buf, src=self._buf, count=axis)
+        memcpy(
+            dest=res._buf + axis,
+            src=self._buf.offset(axis + 1),
+            count=self.ndim - axis - 1,
+        )
+        return res^
+
+    fn _extend(self, *values: Int) raises -> Self:
+        """
+        Extend the item by additional values.
+        ***UNSAFE!*** No boundary check!
 
-        var offset: Int = 0
+        Args:
+            values: Additional values to append.
+
+        Returns:
+            A new Item object with the extended values.
+
+        Example:
+        ```mojo
+        from numojo.prelude import *
+        var item = Item(1, 2, 3)
+        var extended = item._extend(4, 5)
+        print(extended)  # Item: (1, 2, 3, 4, 5)
+        ```
+        """
+        var total_dims: Int = self.ndim + len(values)
+        var new_item: Self = Self(ndim=total_dims, initialized=False)
+
+        var offset: UInt = 0
         for i in range(self.ndim):
-            offset += self._buf[i] * strides._buf[i]
-        return offset
+            (new_item._buf + offset).init_pointee_copy(self[i])
+            offset += 1
+        for value in values:
+            (new_item._buf + offset).init_pointee_copy(value)
+            offset += 1
+
+        return new_item^
+
+    fn _compute_slice_params(
+        self, slice_index: Slice
+    ) raises -> Tuple[Int, Int, Int]:
+        """
+        Compute normalized slice parameters (start, step, length).
+
+        Args:
+            slice_index: The slice to compute parameters for.
+
+        Returns:
+            A tuple of (start, step, length).
+
+        Raises:
+            Error: If the slice step is zero.
+        """
+        var n = self.ndim
+        if n == 0:
+            return (0, 1, 0)
+
+        var step = slice_index.step.or_else(1)
+        if step == 0:
+            raise Error(
+                ValueError(
+                    message="Slice step cannot be zero.",
+                    suggestion="Use a non-zero step value.",
+                    location="Item._compute_slice_params",
+                )
+            )
+
+        var start: Int
+        var stop: Int
+        if step > 0:
+            start = slice_index.start.or_else(0)
+            stop = slice_index.end.or_else(n)
+        else:
+            start = slice_index.start.or_else(n - 1)
+            stop = slice_index.end.or_else(-1)
+
+        if start < 0:
+            start += n
+        if stop < 0:
+            stop += n
+
+        if step > 0:
+            if start < 0:
+                start = 0
+            if start > n:
+                start = n
+            if stop < 0:
+                stop = 0
+            if stop > n:
+                stop = n
+        else:
+            if start >= n:
+                start = n - 1
+            if start < -1:
+                start = -1
+            if stop >= n:
+                stop = n - 1
+            if stop < -1:
+                stop = -1
+
+        var length: Int = 0
+        if step > 0:
+            if start < stop:
+                length = Int((stop - start + step - 1) / step)
+        else:
+            if start > stop:
+                var neg_step = -step
+                length = Int((start - stop + neg_step - 1) / neg_step)
+
+        return (start, step, length)
+
+    fn load[width: Int = 1](self, idx: Int) raises -> SIMD[Self._type, width]:
+        """
+        Load a SIMD vector from the Item at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to load from.
+
+        Returns:
+            A SIMD vector containing the loaded values.
+
+        Raises:
+            Error: If the load exceeds the bounds of the Item.
+        """
+        if idx < 0 or idx + width > self.ndim:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Load operation out of bounds: idx={} width={} ndim={}"
+                    ).format(idx, width, self.ndim),
+                    suggestion=(
+                        "Ensure that idx and width are within valid range."
+                    ),
+                    location="Item.load",
+                )
+            )
+
+        return self._buf.load[width=width](idx)
+
+    fn store[
+        width: Int = 1
+    ](self, idx: Int, value: SIMD[Self._type, width]) raises:
+        """
+        Store a SIMD vector into the Item at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to store to.
+            value: The SIMD vector to store.
+
+        Raises:
+            Error: If the store exceeds the bounds of the Item.
+        """
+        if idx < 0 or idx + width > self.ndim:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Store operation out of bounds: idx={} width={} ndim={}"
+                    ).format(idx, width, self.ndim),
+                    suggestion=(
+                        "Ensure that idx and width are within valid range."
+                    ),
+                    location="Item.store",
+                )
+            )
+
+        self._buf.store[width=width](idx, value)
+
+    fn unsafe_load[width: Int = 1](self, idx: Int) -> SIMD[Self._type, width]:
+        """
+        Unsafely load a SIMD vector from the Item at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to load from.
+
+        Returns:
+            A SIMD vector containing the loaded values.
+        """
+        return self._buf.load[width=width](idx)
+
+    fn unsafe_store[
+        width: Int = 1
+    ](self, idx: Int, value: SIMD[Self._type, width]):
+        """
+        Unsafely store a SIMD vector into the Item at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to store to.
+            value: The SIMD vector to store.
+        """
+        self._buf.store[width=width](idx, value)
 
 
 struct _ItemIter[
@@ -346,7 +725,7 @@ struct _ItemIter[
         else:
             return self.index > 0
 
-    fn __next__(mut self) raises -> Scalar[DType.index]:
+    fn __next__(mut self) raises -> Scalar[DType.int]:
         @parameter
         if forward:
             var current_index = self.index
diff --git a/numojo/core/matrix.mojo b/numojo/core/matrix.mojo
index 93fbf3a5..66c3b208 100644
--- a/numojo/core/matrix.mojo
+++ b/numojo/core/matrix.mojo
@@ -178,9 +178,9 @@ struct Matrix[dtype: DType = DType.float64](
         if data.flags["C_CONTIGUOUS"]:
             for i in range(data.shape[0]):
                 memcpy(
-                    self._buf.ptr.offset(i * self.shape[0]),
-                    data._buf.ptr.offset(i * data.shape[0]),
-                    self.shape[0],
+                    dest=self._buf.ptr.offset(i * self.shape[0]),
+                    src=data._buf.ptr.offset(i * data.shape[0]),
+                    count=self.shape[0],
                 )
         else:
             for i in range(data.shape[0]):
@@ -196,7 +196,7 @@ struct Matrix[dtype: DType = DType.float64](
         self.strides = (other.strides[0], other.strides[1])
         self.size = other.size
         self._buf = OwnData[dtype](other.size)
-        memcpy(self._buf.ptr, other._buf.ptr, other.size)
+        memcpy(dest=self._buf.ptr, src=other._buf.ptr, count=other.size)
         self.flags = other.flags
 
     @always_inline("nodebug")
@@ -274,7 +274,7 @@ struct Matrix[dtype: DType = DType.float64](
 
         if self.flags.C_CONTIGUOUS:
             var ptr = self._buf.ptr.offset(x * self.strides[0])
-            memcpy(res._buf.ptr, ptr, self.shape[1])
+            memcpy(dest=res._buf.ptr, src=ptr, count=self.shape[1])
         else:
             for j in range(self.shape[1]):
                 res[0, j] = self[x, j]
@@ -437,7 +437,7 @@ struct Matrix[dtype: DType = DType.float64](
             )
 
         var ptr = self._buf.ptr.offset(x * self.shape[1])
-        memcpy(ptr, value._buf.ptr, value.size)
+        memcpy(dest=ptr, src=value._buf.ptr, count=value.size)
 
     fn _store[
         width: Int = 1
@@ -452,7 +452,7 @@ struct Matrix[dtype: DType = DType.float64](
     # Other dunders and auxiliary methods
     # ===-------------------------------------------------------------------===#
 
-    fn __iter__(self) raises -> _MatrixIter[__origin_of(self), dtype]:
+    fn __iter__(self) raises -> _MatrixIter[origin_of(self), dtype]:
         """Iterate over elements of the Matrix, returning copied value.
 
         Example:
@@ -467,7 +467,7 @@ struct Matrix[dtype: DType = DType.float64](
             An iterator of Matrix elements.
         """
 
-        return _MatrixIter[__origin_of(self), dtype](
+        return _MatrixIter[origin_of(self), dtype](
             matrix=self,
             length=self.shape[0],
         )
@@ -480,7 +480,7 @@ struct Matrix[dtype: DType = DType.float64](
 
     fn __reversed__(
         self,
-    ) raises -> _MatrixIter[__origin_of(self), dtype, forward=False]:
+    ) raises -> _MatrixIter[origin_of(self), dtype, forward=False]:
         """Iterate backwards over elements of the Matrix, returning
         copied value.
 
@@ -488,7 +488,7 @@ struct Matrix[dtype: DType = DType.float64](
             A reversed iterator of Matrix elements.
         """
 
-        return _MatrixIter[__origin_of(self), dtype, forward=False](
+        return _MatrixIter[origin_of(self), dtype, forward=False](
             matrix=self,
             length=self.shape[0],
         )
@@ -924,37 +924,37 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.logic.any(self, axis=axis)
 
-    fn argmax(self) raises -> Scalar[DType.index]:
+    fn argmax(self) raises -> Scalar[DType.int]:
         """
         Index of the max. It is first flattened before sorting.
         """
         return numojo.math.argmax(self)
 
-    fn argmax(self, axis: Int) raises -> Matrix[DType.index]:
+    fn argmax(self, axis: Int) raises -> Matrix[DType.int]:
         """
         Index of the max along the given axis.
         """
         return numojo.math.argmax(self, axis=axis)
 
-    fn argmin(self) raises -> Scalar[DType.index]:
+    fn argmin(self) raises -> Scalar[DType.int]:
         """
         Index of the min. It is first flattened before sorting.
         """
         return numojo.math.argmin(self)
 
-    fn argmin(self, axis: Int) raises -> Matrix[DType.index]:
+    fn argmin(self, axis: Int) raises -> Matrix[DType.int]:
         """
         Index of the min along the given axis.
         """
         return numojo.math.argmin(self, axis=axis)
 
-    fn argsort(self) raises -> Matrix[DType.index]:
+    fn argsort(self) raises -> Matrix[DType.int]:
         """
         Argsort the Matrix. It is first flattened before sorting.
         """
         return numojo.math.argsort(self)
 
-    fn argsort(self, axis: Int) raises -> Matrix[DType.index]:
+    fn argsort(self, axis: Int) raises -> Matrix[DType.int]:
         """
         Argsort the Matrix along the given axis.
         """
@@ -1021,7 +1021,7 @@ struct Matrix[dtype: DType = DType.float64](
         Return a flattened copy of the matrix.
         """
         var res = Self(shape=(1, self.size), order=self.order())
-        memcpy(res._buf.ptr, self._buf.ptr, res.size)
+        memcpy(dest=res._buf.ptr, src=self._buf.ptr, count=res.size)
         return res^
 
     fn inv(self) raises -> Self:
@@ -1118,10 +1118,10 @@ struct Matrix[dtype: DType = DType.float64](
         var res = Self(shape=shape, order="C")
         if self.flags.F_CONTIGUOUS:
             var temp = self.reorder_layout()
-            memcpy(res._buf.ptr, temp._buf.ptr, res.size)
+            memcpy(dest=res._buf.ptr, src=temp._buf.ptr, count=res.size)
             res = res.reorder_layout()
         else:
-            memcpy(res._buf.ptr, self._buf.ptr, res.size)
+            memcpy(dest=res._buf.ptr, src=self._buf.ptr, count=res.size)
         return res^
 
     fn resize(mut self, shape: Tuple[Int, Int]) raises:
@@ -1131,7 +1131,7 @@ struct Matrix[dtype: DType = DType.float64](
         if shape[0] * shape[1] > self.size:
             var other = Self(shape=shape)
             if self.flags.C_CONTIGUOUS:
-                memcpy(other._buf.ptr, self._buf.ptr, self.size)
+                memcpy(dest=other._buf.ptr, src=self._buf.ptr, count=self.size)
                 for i in range(self.size, other.size):
                     other._buf.ptr[i] = 0
             else:
@@ -1277,61 +1277,13 @@ struct Matrix[dtype: DType = DType.float64](
         var ndarray: NDArray[dtype] = NDArray[dtype](
             shape=List[Int](self.shape[0], self.shape[1]), order="C"
         )
-        memcpy(ndarray._buf.ptr, self._buf.ptr, ndarray.size)
+        memcpy(dest=ndarray._buf.ptr, src=self._buf.ptr, count=ndarray.size)
 
         return ndarray^
 
     fn to_numpy(self) raises -> PythonObject:
         """See `numojo.core.utility.to_numpy`."""
-        try:
-            var np = Python.import_module("numpy")
-
-            var np_arr_dim = Python.list()
-            np_arr_dim.append(self.shape[0])
-            np_arr_dim.append(self.shape[1])
-
-            np.set_printoptions(4)
-
-            # Implement a dictionary for this later
-            var numpyarray: PythonObject
-            var np_dtype = np.float64
-            if dtype == DType.float16:
-                np_dtype = np.float16
-            elif dtype == DType.float32:
-                np_dtype = np.float32
-            elif dtype == DType.int64:
-                np_dtype = np.int64
-            elif dtype == DType.int32:
-                np_dtype = np.int32
-            elif dtype == DType.int16:
-                np_dtype = np.int16
-            elif dtype == DType.int8:
-                np_dtype = np.int8
-            elif dtype == DType.uint64:
-                np_dtype = np.uint64
-            elif dtype == DType.uint32:
-                np_dtype = np.uint32
-            elif dtype == DType.uint16:
-                np_dtype = np.uint16
-            elif dtype == DType.uint8:
-                np_dtype = np.uint8
-            elif dtype == DType.bool:
-                np_dtype = np.bool_
-            elif dtype == DType.index:
-                np_dtype = np.int64
-
-            var order = "C" if self.flags.C_CONTIGUOUS else "F"
-            numpyarray = np.empty(np_arr_dim, dtype=np_dtype, order=order)
-            var pointer_d = numpyarray.__array_interface__["data"][
-                0
-            ].unsafe_get_as_pointer[dtype]()
-            memcpy(pointer_d, self._buf.ptr, self.size)
-
-            return numpyarray^
-
-        except e:
-            print("Error in converting to numpy", e)
-            return PythonObject()
+        return numojo.core.utility.to_numpy(self)
 
     # ===-----------------------------------------------------------------------===#
     # Static methods to construct matrix
@@ -1339,12 +1291,12 @@ struct Matrix[dtype: DType = DType.float64](
 
     @staticmethod
     fn full[
-        dtype: DType = DType.float64
+        datatype: DType = DType.float64
     ](
         shape: Tuple[Int, Int],
-        fill_value: Scalar[dtype] = 0,
+        fill_value: Scalar[datatype] = 0,
         order: String = "C",
-    ) -> Matrix[dtype]:
+    ) -> Matrix[datatype]:
         """Return a matrix with given shape and filled value.
 
         Example:
@@ -1354,7 +1306,7 @@ struct Matrix[dtype: DType = DType.float64](
         ```
         """
 
-        var matrix = Matrix[dtype](shape, order)
+        var matrix = Matrix[datatype](shape, order)
         for i in range(shape[0] * shape[1]):
             matrix._buf.ptr.store(i, fill_value)
 
@@ -1362,8 +1314,8 @@ struct Matrix[dtype: DType = DType.float64](
 
     @staticmethod
     fn zeros[
-        dtype: DType = DType.float64
-    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[datatype]:
         """Return a matrix with given shape and filled with zeros.
 
         Example:
@@ -1373,14 +1325,14 @@ struct Matrix[dtype: DType = DType.float64](
         ```
         """
 
-        var M = Matrix[dtype](shape, order)
+        var M = Matrix[datatype](shape, order)
         memset_zero(M._buf.ptr, M.size)
         return M^
 
     @staticmethod
     fn ones[
-        dtype: DType = DType.float64
-    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[datatype]:
         """Return a matrix with given shape and filled with ones.
 
         Example:
@@ -1390,12 +1342,12 @@ struct Matrix[dtype: DType = DType.float64](
         ```
         """
 
-        return Matrix.full[dtype](shape=shape, fill_value=1)
+        return Matrix.full[datatype](shape=shape, fill_value=1)
 
     @staticmethod
     fn identity[
-        dtype: DType = DType.float64
-    ](len: Int, order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](len: Int, order: String = "C") -> Matrix[datatype]:
         """Return an identity matrix with given size.
 
         Example:
@@ -1404,7 +1356,7 @@ struct Matrix[dtype: DType = DType.float64](
         var A = Matrix.identity(12)
         ```
         """
-        var matrix = Matrix.zeros[dtype]((len, len), order)
+        var matrix = Matrix.zeros[datatype]((len, len), order)
         for i in range(len):
             matrix._buf.ptr.store(
                 i * matrix.strides[0] + i * matrix.strides[1], 1
@@ -1413,8 +1365,8 @@ struct Matrix[dtype: DType = DType.float64](
 
     @staticmethod
     fn rand[
-        dtype: DType = DType.float64
-    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[datatype]:
         """Return a matrix with random values uniformed distributed between 0 and 1.
 
         Example:
@@ -1424,25 +1376,25 @@ struct Matrix[dtype: DType = DType.float64](
         ```
 
         Parameters:
-            dtype: The data type of the NDArray elements.
+            datatype: The data type of the NDArray elements.
 
         Args:
             shape: The shape of the Matrix.
             order: The order of the Matrix. "C" or "F".
         """
-        var result = Matrix[dtype](shape, order)
+        var result = Matrix[datatype](shape, order)
         for i in range(result.size):
-            result._buf.ptr.store(i, random_float64(0, 1).cast[dtype]())
+            result._buf.ptr.store(i, random_float64(0, 1).cast[datatype]())
         return result^
 
     @staticmethod
     fn fromlist[
-        dtype: DType
+        datatype: DType
     ](
-        object: List[Scalar[dtype]],
+        object: List[Scalar[datatype]],
         shape: Tuple[Int, Int] = (0, 0),
         order: String = "C",
-    ) raises -> Matrix[dtype]:
+    ) raises -> Matrix[datatype]:
         """Create a matrix from a 1-dimensional list into given shape.
 
         If no shape is passed, the return matrix will be a row vector.
@@ -1456,8 +1408,8 @@ struct Matrix[dtype: DType = DType.float64](
         """
 
         if (shape[0] == 0) and (shape[1] == 0):
-            var M = Matrix[dtype](shape=(1, len(object)))
-            memcpy(M._buf.ptr, object.unsafe_ptr(), M.size)
+            var M = Matrix[datatype](shape=(1, len(object)))
+            memcpy(dest=M._buf.ptr, src=object.unsafe_ptr(), count=M.size)
             return M^
 
         if shape[0] * shape[1] != len(object):
@@ -1465,18 +1417,18 @@ struct Matrix[dtype: DType = DType.float64](
                 "The input has {} elements, but the target has the shape {}x{}"
             ).format(len(object), shape[0], shape[1])
             raise Error(message)
-        var M = Matrix[dtype](shape=shape, order="C")
-        memcpy(M._buf.ptr, object.unsafe_ptr(), M.size)
+        var M = Matrix[datatype](shape=shape, order="C")
+        memcpy(dest=M._buf.ptr, src=object.unsafe_ptr(), count=M.size)
         if order == "F":
             M = M.reorder_layout()
         return M^
 
     @staticmethod
     fn fromstring[
-        dtype: DType = DType.float64
+        datatype: DType = DType.float64
     ](
         text: String, shape: Tuple[Int, Int] = (0, 0), order: String = "C"
-    ) raises -> Matrix[dtype]:
+    ) raises -> Matrix[datatype]:
         """Matrix initialization from string representation of an matrix.
 
         Comma, right brackets, and whitespace are treated as seperators of numbers.
@@ -1506,7 +1458,7 @@ struct Matrix[dtype: DType = DType.float64](
             order: Order of the matrix. "C" or "F".
         """
 
-        var data = List[Scalar[dtype]]()
+        var data = List[Scalar[datatype]]()
         var bytes = text.as_bytes()
         var number_as_str: String = ""
         var size = shape[0] * shape[1]
@@ -1520,7 +1472,7 @@ struct Matrix[dtype: DType = DType.float64](
             ):
                 number_as_str = number_as_str + chr(Int(b))
                 if i == len(bytes) - 1:  # Last byte
-                    var number = atof(number_as_str).cast[dtype]()
+                    var number = atof(number_as_str).cast[datatype]()
                     data.append(number)  # Add the number to the data buffer
                     number_as_str = ""  # Clean the number cache
             if (
@@ -1529,7 +1481,7 @@ struct Matrix[dtype: DType = DType.float64](
                 or (chr(Int(b)) == " ")
             ):
                 if number_as_str != "":
-                    var number = atof(number_as_str).cast[dtype]()
+                    var number = atof(number_as_str).cast[datatype]()
                     data.append(number)  # Add the number to the data buffer
                     number_as_str = ""  # Clean the number cache
 
@@ -1543,7 +1495,7 @@ struct Matrix[dtype: DType = DType.float64](
             ).format(len(data), shape[0], shape[1])
             raise Error(message)
 
-        var result = Matrix[dtype](shape=shape)
+        var result = Matrix[datatype](shape=shape)
         for i in range(len(data)):
             result._buf.ptr[i] = data[i]
         return result^
diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index d4e0cf92..34464b9b 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -109,7 +109,7 @@ import numojo.routines.searching as searching
 # ===-----------------------------------------------------------------------===#
 # Implements the N-Dimensional Array.
 # ===-----------------------------------------------------------------------===#
-struct NDArray[dtype: DType = DType.float64](
+struct NDArray[dtype: DType = DType.float64,](
     Absable,
     Copyable,
     FloatableRaising,
@@ -315,7 +315,7 @@ struct NDArray[dtype: DType = DType.float64](
         self.size = other.size
         self.strides = other.strides
         self._buf = OwnData[dtype](self.size)
-        memcpy(self._buf.ptr, other._buf.ptr, other.size)
+        memcpy(dest=self._buf.ptr, src=other._buf.ptr, count=other.size)
         self.flags = Flags(
             c_contiguous=other.flags.C_CONTIGUOUS,
             f_contiguous=other.flags.F_CONTIGUOUS,
@@ -371,7 +371,7 @@ struct NDArray[dtype: DType = DType.float64](
     # fn __getitem__(self, *slices: Variant[Slice, Int]) raises -> Self         # Get by mix of slices/ints
     #
     # 4. Advanced Indexing
-    # fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self        # Get by index array
+    # fn __getitem__(self, indices: NDArray[DType.int]) raises -> Self        # Get by index array
     # fn __getitem__(self, indices: List[Int]) raises -> Self                   # Get by list of indices
     # fn __getitem__(self, mask: NDArray[DType.bool]) raises -> Self            # Get by boolean mask
     # fn __getitem__(self, mask: List[Bool]) raises -> Self                     # Get by boolean list
@@ -401,14 +401,15 @@ struct NDArray[dtype: DType = DType.float64](
         Examples:
 
         ```mojo
-        import numojo
-        var A = numojo.ones(numojo.Shape(2,3,4))
+        import numojo as nm
+        from numojo.prelude import *
+        var A = nm.ones[f32](nm.Shape(2,3,4))
         print(A._getitem(1,2,3))
         ```
         """
         var index_of_buffer: Int = 0
         for i in range(self.ndim):
-            index_of_buffer += indices[i] * self.strides._buf[i]
+            index_of_buffer += indices[i] * Int(self.strides._buf[i])
         return self._buf.ptr[index_of_buffer]
 
     fn _getitem(self, indices: List[Int]) -> Scalar[dtype]:
@@ -428,14 +429,15 @@ struct NDArray[dtype: DType = DType.float64](
         Examples:
 
         ```mojo
-        import numojo
-        var A = numojo.ones(numojo.Shape(2,3,4))
+        import numojo as nm
+        from numojo.prelude import *
+        var A = nm.ones[f32](nm.Shape(2,3,4))
         print(A._getitem(List[Int](1,2,3)))
         ```
         """
         var index_of_buffer: Int = 0
         for i in range(self.ndim):
-            index_of_buffer += indices[i] * self.strides._buf[i]
+            index_of_buffer += indices[i] * Int(self.strides._buf[i])
         return self._buf.ptr[index_of_buffer]
 
     fn __getitem__(self) raises -> SIMD[dtype, 1]:
@@ -609,18 +611,22 @@ struct NDArray[dtype: DType = DType.float64](
         # Fast path for C-contiguous arrays
         if self.flags.C_CONTIGUOUS:
             var block = self.size // self.shape[0]
-            memcpy(result._buf.ptr, self._buf.ptr + norm * block, block)
+            memcpy(
+                dest=result._buf.ptr,
+                src=self._buf.ptr + norm * block,
+                count=block,
+            )
             return result^
 
         # (F-order or arbitrary stride layout)
         # TODO: Optimize this further (multi-axis unrolling / smarter linear index without div/mod)
-        self._copy_first_axis_slice[dtype](self, norm, result)
+        self._copy_first_axis_slice(self, norm, result)
         return result^
 
     # perhaps move these to a utility module
-    fn _copy_first_axis_slice[
-        dtype: DType
-    ](self, src: NDArray[dtype], norm_idx: Int, mut dst: NDArray[dtype]):
+    fn _copy_first_axis_slice(
+        self, src: NDArray[dtype], norm_idx: Int, mut dst: NDArray[dtype]
+    ):
         """Generic stride-based copier for first-axis slice (works for any layout).
         """
         var out_ndim = dst.ndim
@@ -634,7 +640,7 @@ struct NDArray[dtype: DType = DType.float64](
         for lin in range(total):
             var rem = lin
             for d in range(out_ndim - 1, -1, -1):
-                var dim = dst.shape._buf[d]
+                var dim = Int(dst.shape._buf[d])
                 coords[d] = rem % dim
                 rem //= dim
             var off = base
@@ -642,7 +648,7 @@ struct NDArray[dtype: DType = DType.float64](
                 off += coords[d] * src.strides._buf[d + 1]
             var dst_off = 0
             for d in range(out_ndim):
-                dst_off += coords[d] * dst.strides._buf[d]
+                dst_off += coords[d] * Int(dst.strides._buf[d])
             dst._buf.ptr[dst_off] = src._buf.ptr[off]
 
     fn __getitem__(self, var *slices: Slice) raises -> Self:
@@ -789,9 +795,132 @@ struct NDArray[dtype: DType = DType.float64](
                 slice_len: Int = max(0, (end - start + (step - 1)) // step)
             else:
                 slice_len: Int = max(0, (start - end - step - 1) // (-step))
-            if (
-                slice_len > 1
-            ):  # TODO: remember to remove this behaviour -> Numpy doesn't dimension reduce when slicing. But I am keeping it for now since it messes up the sum, means etc tests due to shape inconsistencies.
+            nshape.append(slice_len)
+            ncoefficients.append(self.strides[i] * step)
+            ndims += 1
+            noffset += start * self.strides[i]
+
+        if len(nshape) == 0:
+            nshape.append(1)
+            ncoefficients.append(1)
+
+        # only C & F order are supported
+        var nstrides: List[Int] = self._calculate_strides_efficient(
+            nshape,
+        )
+        var narr: Self = Self(offset=noffset, shape=nshape, strides=nstrides)
+        var index: List[Int] = List[Int](length=ndims, fill=0)
+
+        _traverse_iterative[dtype](
+            self, narr, nshape, ncoefficients, nstrides, noffset, index, 0
+        )
+
+        return narr^
+
+    fn _getitem_variadic_slices(self, var *slices: Slice) raises -> Self:
+        """
+        Alternative implementation of `__getitem__(self, owned *slices: Slice)` which reduces dimension unlike the original one which is compatible with numpy slicing.
+
+        Args:
+            slices: Variadic list of `Slice` objects, one for each dimension to be sliced.
+
+        Constraints:
+            - The number of slices provided must not exceed the number of array dimensions.
+            - Each slice must be valid for its corresponding dimension.
+
+        Returns:
+            Self: A new array instance representing the sliced view of the original array.
+
+        Raises:
+            IndexError: If any slice is out of bounds for its corresponding dimension.
+            ValueError: If the number of slices does not match the array's dimensions.
+
+        NOTES:
+            - This method is for internal purposes only and is not exposed to users.
+        """
+        var n_slices: Int = slices.__len__()
+        if n_slices > self.ndim:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Too many slices provided: expected at most {} but"
+                        " got {}."
+                    ).format(self.ndim, n_slices),
+                    suggestion=String(
+                        "Provide at most {} slices for an array with {}"
+                        " dimensions."
+                    ).format(self.ndim, self.ndim),
+                    location=String("NDArray.__getitem__(slices: Slice)"),
+                )
+            )
+        var slice_list: List[Slice] = List[Slice](capacity=self.ndim)
+        for i in range(len(slices)):
+            slice_list.append(slices[i])
+
+        if n_slices < self.ndim:
+            for i in range(n_slices, self.ndim):
+                slice_list.append(Slice(0, self.shape[i], 1))
+
+        var narr: Self = self[slice_list^]
+        return narr^
+
+    fn _getitem_list_slices(self, var slice_list: List[Slice]) raises -> Self:
+        """
+        Alternative implementation of `__getitem__(self, owned slice_list: List[Slice])` for which reduces dimension unlike the original one which is compatible with numpy slicing.
+
+        Args:
+            slice_list: List of Slice objects, where each Slice defines the start, stop, and step for the corresponding dimension.
+
+        Returns:
+            Self: A new array instance representing the sliced view of the original array.
+
+        Raises:
+            Error: If slice_list is empty or contains invalid slices.
+            Error: The length of slice_list must not exceed the number of dimensions in the array.
+            Error: Each Slice in slice_list must be valid for its respective dimension.
+
+        Notes:
+            This function is only for internal use since it's not compatible with numpy slicing.
+        """
+        var n_slices: Int = slice_list.__len__()
+        if n_slices == 0:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Empty slice list provided to NDArray.__getitem__."
+                    ),
+                    suggestion=String(
+                        "Provide a List with at least one slice to index the"
+                        " array."
+                    ),
+                    location=String(
+                        "NDArray.__getitem__(slice_list: List[Slice])"
+                    ),
+                )
+            )
+
+        # adjust slice values for user provided slices
+        var slices: List[Slice] = self._adjust_slice(slice_list)
+        if n_slices < self.ndim:
+            for i in range(n_slices, self.ndim):
+                slices.append(Slice(0, self.shape[i], 1))
+
+        var ndims: Int = 0
+        var nshape: List[Int] = List[Int]()
+        var ncoefficients: List[Int] = List[Int]()
+        var noffset: Int = 0
+
+        for i in range(self.ndim):
+            var start: Int = slices[i].start.value()
+            var end: Int = slices[i].end.value()
+            var step: Int = slices[i].step.or_else(1)
+
+            var slice_len: Int
+            if step > 0:
+                slice_len: Int = max(0, (end - start + (step - 1)) // step)
+            else:
+                slice_len: Int = max(0, (start - end - step - 1) // (-step))
+            if slice_len > 1:
                 nshape.append(slice_len)
                 ncoefficients.append(self.strides[i] * step)
                 ndims += 1
@@ -1052,7 +1181,7 @@ struct NDArray[dtype: DType = DType.float64](
         narr = self.__getitem__(slice_list^)
         return narr^
 
-    fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self:
+    fn __getitem__(self, indices: NDArray[DType.int]) raises -> Self:
         """
         Get items from 0-th dimension of an ndarray of indices.
         If the original array is of shape (i,j,k) and
@@ -1118,14 +1247,14 @@ struct NDArray[dtype: DType = DType.float64](
                             " ({})."
                         ).format(self.shape[0]),
                         location=String(
-                            "NDArray.__getitem__(indices: NDArray[DType.index])"
+                            "NDArray.__getitem__(indices: NDArray[DType.int])"
                         ),
                     )
                 )
             memcpy(
-                result._buf.ptr + i * size_per_item,
-                self._buf.ptr + indices.item(i) * size_per_item,
-                size_per_item,
+                dest=result._buf.ptr + i * size_per_item,
+                src=self._buf.ptr + indices.item(i) * size_per_item,
+                count=size_per_item,
             )
 
         return result^
@@ -1134,7 +1263,7 @@ struct NDArray[dtype: DType = DType.float64](
         # TODO: Use trait IntLike when it is supported by Mojo.
         """
         Get items from 0-th dimension of an array. It is an overload of
-        `__getitem__(self, indices: NDArray[DType.index]) raises -> Self`.
+        `__getitem__(self, indices: NDArray[DType.int]) raises -> Self`.
 
         Args:
             indices: A list of Int.
@@ -1174,7 +1303,7 @@ struct NDArray[dtype: DType = DType.float64](
         ```.
         """
 
-        var indices_array = NDArray[DType.index](shape=Shape(len(indices)))
+        var indices_array = NDArray[DType.int](shape=Shape(len(indices)))
         for i in range(len(indices)):
             (indices_array._buf.ptr + i).init_pointee_copy(indices[i])
 
@@ -1268,9 +1397,9 @@ struct NDArray[dtype: DType = DType.float64](
             for i in range(mask.size):
                 if mask.item(i):
                     memcpy(
-                        result._buf.ptr + offset * size_per_item,
-                        self._buf.ptr + i * size_per_item,
-                        size_per_item,
+                        dest=result._buf.ptr + offset * size_per_item,
+                        src=self._buf.ptr + i * size_per_item,
+                        count=size_per_item,
                     )
                     offset += 1
 
@@ -1681,7 +1810,7 @@ struct NDArray[dtype: DType = DType.float64](
     # fn __setitem__(mut self, *slices: Variant[Slice, Int], val: Self) raises  # Set by mix of slices/ints
 
     # Index-based Setters
-    # fn __setitem__(self, indices: NDArray[DType.index], val: NDArray) raises  # Set by index array
+    # fn __setitem__(self, indices: NDArray[DType.int], val: NDArray) raises  # Set by index array
     # fn __setitem__(mut self, mask: NDArray[DType.bool], val: NDArray[dtype])  # Set by boolean mask array
 
     # Helper Methods
@@ -1706,14 +1835,15 @@ struct NDArray[dtype: DType = DType.float64](
         Examples:
 
         ```mojo
-        import numojo
-        var A = numojo.ones(numojo.Shape(2,3,4))
+        import numojo as nm
+        from numojo.prelude import *
+        var A = nm.ones[f32](nm.Shape(2,3,4))
         A._setitem(1,2,3, val=10)
         ```
         """
         var index_of_buffer: Int = 0
         for i in range(self.ndim):
-            index_of_buffer += indices[i] * self.strides._buf[i]
+            index_of_buffer += indices[i] * Int(self.strides._buf[i])
         self._buf.ptr[index_of_buffer] = val
 
     fn __setitem__(self, idx: Int, val: Self) raises:
@@ -1817,16 +1947,18 @@ struct NDArray[dtype: DType = DType.float64](
         # Fast path for C-contiguous arrays (single block)
         if self.flags.C_CONTIGUOUS and val.flags.C_CONTIGUOUS:
             var block = self.size // self.shape[0]
-            memcpy(self._buf.ptr + norm * block, val._buf.ptr, block)
+            memcpy(
+                dest=self._buf.ptr + norm * block, src=val._buf.ptr, count=block
+            )
             return
 
         # Generic stride path (F-order or irregular)
-        self._write_first_axis_slice[dtype](self, norm, val)
+        self._write_first_axis_slice(self, norm, val)
 
     # perhaps move these to a utility module
-    fn _write_first_axis_slice[
-        dtype: DType
-    ](self, dst: NDArray[dtype], norm_idx: Int, src: NDArray[dtype]):
+    fn _write_first_axis_slice(
+        self, dst: NDArray[dtype], norm_idx: Int, src: NDArray[dtype]
+    ):
         var out_ndim = src.ndim
         var total = src.size
         if total == 0:
@@ -1838,14 +1970,14 @@ struct NDArray[dtype: DType = DType.float64](
         for lin in range(total):
             var rem = lin
             for d in range(out_ndim - 1, -1, -1):
-                var dim = src.shape._buf[d]
+                var dim = Int(src.shape._buf[d])
                 coords[d] = rem % dim
                 rem //= dim
             var dst_off = base
             var src_off = 0
             for d in range(out_ndim):
-                var stride_src = src.strides._buf[d]
-                var stride_dst = dst.strides._buf[d + 1]
+                var stride_src = Int(src.strides._buf[d])
+                var stride_dst = Int(dst.strides._buf[d + 1])
                 var c = coords[d]
                 dst_off += c * stride_dst
                 src_off += c * stride_src
@@ -2191,7 +2323,7 @@ struct NDArray[dtype: DType = DType.float64](
 
     # TODO: fix this setter, add bound checks. Not sure about it's use case.
     fn __setitem__(
-        mut self, index: NDArray[DType.index], val: NDArray[dtype]
+        mut self, index: NDArray[DType.int], val: NDArray[dtype]
     ) raises:
         """
         Returns the items of the array from an array of indices.
@@ -2226,7 +2358,7 @@ struct NDArray[dtype: DType = DType.float64](
                         " each axis separately."
                     ),
                     location=String(
-                        "NDArray.__setitem__(index: NDArray[DType.index], val:"
+                        "NDArray.__setitem__(index: NDArray[DType.int], val:"
                         " NDArray)"
                     ),
                 )
@@ -2244,7 +2376,7 @@ struct NDArray[dtype: DType = DType.float64](
                         " first dimension ({})."
                     ).format(self.shape[0]),
                     location=String(
-                        "NDArray.__setitem__(index: NDArray[DType.index], val:"
+                        "NDArray.__setitem__(index: NDArray[DType.int], val:"
                         " NDArray)"
                     ),
                 )
@@ -2271,7 +2403,7 @@ struct NDArray[dtype: DType = DType.float64](
                             " ({})."
                         ).format(self.shape[0]),
                         location=String(
-                            "NDArray.__setitem__(index: NDArray[DType.index],"
+                            "NDArray.__setitem__(index: NDArray[DType.int],"
                             " val: NDArray)"
                         ),
                     )
@@ -3689,11 +3821,11 @@ struct NDArray[dtype: DType = DType.float64](
         """
         Returns length of 0-th dimension.
         """
-        return self.shape._buf[0]
+        return Int(self.shape._buf[0])
 
     fn __iter__(
         self,
-    ) raises -> _NDArrayIter[__origin_of(self), dtype]:
+    ) raises -> _NDArrayIter[origin_of(self), dtype]:
         """
         Iterates over elements of the NDArray and return sub-arrays as view.
 
@@ -3717,14 +3849,14 @@ struct NDArray[dtype: DType = DType.float64](
         ```.
         """
 
-        return _NDArrayIter[__origin_of(self), dtype](
+        return _NDArrayIter[origin_of(self), dtype](
             self,
             dimension=0,
         )
 
     fn __reversed__(
         self,
-    ) raises -> _NDArrayIter[__origin_of(self), dtype, forward=False]:
+    ) raises -> _NDArrayIter[origin_of(self), dtype, forward=False]:
         """
         Iterates backwards over elements of the NDArray, returning
         copied value.
@@ -3733,7 +3865,7 @@ struct NDArray[dtype: DType = DType.float64](
             A reversed iterator of NDArray elements.
         """
 
-        return _NDArrayIter[__origin_of(self), dtype, forward=False](
+        return _NDArrayIter[origin_of(self), dtype, forward=False](
             self,
             dimension=0,
         )
@@ -4051,33 +4183,33 @@ struct NDArray[dtype: DType = DType.float64](
         vectorize[vectorized_any, self.width](self.size)
         return result
 
-    fn argmax(self) raises -> Scalar[DType.index]:
+    fn argmax(self) raises -> Scalar[DType.int]:
         """Returns the indices of the maximum values along an axis.
         When no axis is specified, the array is flattened.
         See `numojo.argmax()` for more details.
         """
         return searching.argmax(self)
 
-    fn argmax(self, axis: Int) raises -> NDArray[DType.index]:
+    fn argmax(self, axis: Int) raises -> NDArray[DType.int]:
         """Returns the indices of the maximum values along an axis.
         See `numojo.argmax()` for more details.
         """
         return searching.argmax(self, axis=axis)
 
-    fn argmin(self) raises -> Scalar[DType.index]:
+    fn argmin(self) raises -> Scalar[DType.int]:
         """Returns the indices of the minimum values along an axis.
         When no axis is specified, the array is flattened.
         See `numojo.argmin()` for more details.
         """
         return searching.argmin(self)
 
-    fn argmin(self, axis: Int) raises -> NDArray[DType.index]:
+    fn argmin(self, axis: Int) raises -> NDArray[DType.int]:
         """Returns the indices of the minimum values along an axis.
         See `numojo.argmin()` for more details.
         """
         return searching.argmin(self, axis=axis)
 
-    fn argsort(mut self) raises -> NDArray[DType.index]:
+    fn argsort(mut self) raises -> NDArray[DType.int]:
         """
         Sort the NDArray and return the sorted indices.
         See `numojo.argsort()` for more details.
@@ -4088,7 +4220,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.sorting.argsort(self)
 
-    fn argsort(mut self, axis: Int) raises -> NDArray[DType.index]:
+    fn argsort(mut self, axis: Int) raises -> NDArray[DType.int]:
         """
         Sort the NDArray and return the sorted indices.
         See `numojo.argsort()` for more details.
@@ -4127,17 +4259,12 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.clip(self, a_min, a_max)
 
-    fn compress[
-        dtype: DType
-    ](self, condition: NDArray[DType.bool], axis: Int) raises -> Self:
+    fn compress(self, condition: NDArray[DType.bool], axis: Int) raises -> Self:
         # TODO: @forFudan try using parallelization for this function
         """
         Return selected slices of an array along given axis.
         If no axis is provided, the array is flattened before use.
 
-        Parameters:
-            dtype: DType.
-
         Args:
             condition: 1-D array of booleans that selects which entries to return.
                 If length of condition is less than the size of the array along the
@@ -4157,17 +4284,12 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.compress(condition=condition, a=self, axis=axis)
 
-    fn compress[
-        dtype: DType
-    ](self, condition: NDArray[DType.bool]) raises -> Self:
+    fn compress(self, condition: NDArray[DType.bool]) raises -> Self:
         """
         Return selected slices of an array along given axis.
         If no axis is provided, the array is flattened before use.
         This is a function ***OVERLOAD***.
 
-        Parameters:
-            dtype: DType.
-
         Args:
             condition: 1-D array of booleans that selects which entries to return.
                 If length of condition is less than the size of the array along the
@@ -4266,7 +4388,7 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return numojo.math.cumsum[dtype](self.copy(), axis=axis)
 
-    fn diagonal[dtype: DType](self, offset: Int = 0) raises -> Self:
+    fn diagonal(self, offset: Int = 0) raises -> Self:
         """
         Returns specific diagonals.
         Currently supports only 2D arrays.
@@ -4275,9 +4397,6 @@ struct NDArray[dtype: DType = DType.float64](
             Error: If the array is not 2D.
             Error: If the offset is beyond the shape of the array.
 
-        Parameters:
-            dtype: Data type of the array.
-
         Args:
             offset: Offset of the diagonal from the main diagonal.
 
@@ -4312,7 +4431,7 @@ struct NDArray[dtype: DType = DType.float64](
     fn iter_along_axis[
         forward: Bool = True
     ](self, axis: Int, order: String = "C") raises -> _NDAxisIter[
-        __origin_of(self), dtype, forward
+        origin_of(self), dtype, forward
     ]:
         """
         Returns an iterator yielding 1-d array slices along the given axis.
@@ -4405,7 +4524,7 @@ struct NDArray[dtype: DType = DType.float64](
                 ).format(axis, -self.ndim, self.ndim)
             )
 
-        return _NDAxisIter[__origin_of(self), dtype, forward](
+        return _NDAxisIter[origin_of(self), dtype, forward](
             self,
             axis=normalized_axis,
             order=order,
@@ -4414,7 +4533,7 @@ struct NDArray[dtype: DType = DType.float64](
     fn iter_over_dimension[
         forward: Bool = True
     ](read self, dimension: Int) raises -> _NDArrayIter[
-        __origin_of(self), dtype, forward
+        origin_of(self), dtype, forward
     ]:
         """
         Returns an iterator yielding `ndim-1` arrays over the given dimension.
@@ -4444,7 +4563,7 @@ struct NDArray[dtype: DType = DType.float64](
                 ).format(dimension, -self.ndim, self.ndim)
             )
 
-        return _NDArrayIter[__origin_of(self), dtype, forward](
+        return _NDArrayIter[origin_of(self), dtype, forward](
             a=self,
             dimension=normalized_dim,
         )
@@ -4602,7 +4721,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.math.min(self, axis=axis)
 
-    fn nditer(self) raises -> _NDIter[__origin_of(self), dtype]:
+    fn nditer(self) raises -> _NDIter[origin_of(self), dtype]:
         """
         ***Overload*** Return an iterator yielding the array elements according
         to the memory layout of the array.
@@ -4633,7 +4752,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         return self.nditer(order=order)
 
-    fn nditer(self, order: String) raises -> _NDIter[__origin_of(self), dtype]:
+    fn nditer(self, order: String) raises -> _NDIter[origin_of(self), dtype]:
         """
         Return an iterator yielding the array elements according to the order.
 
@@ -4672,7 +4791,7 @@ struct NDArray[dtype: DType = DType.float64](
         else:
             axis = 0
 
-        return _NDIter[__origin_of(self), dtype](a=self, order=order, axis=axis)
+        return _NDIter[origin_of(self), dtype](a=self, order=order, axis=axis)
 
     fn num_elements(self) -> Int:
         """
@@ -4765,7 +4884,6 @@ struct NDArray[dtype: DType = DType.float64](
         Returns:
             Array of the same data with a new shape.
         """
-        print("WTF IS HAPPENING")
         return numojo.reshape(self, shape=shape, order=order)
 
     fn resize(mut self, shape: NDArrayShape) raises:
@@ -4783,7 +4901,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         if shape.size_of_array() > self.size:
             var other = Self(shape=shape, order=order)
-            memcpy(other._buf.ptr, self._buf.ptr, self.size)
+            memcpy(dest=other._buf.ptr, src=self._buf.ptr, count=self.size)
             for i in range(self.size, other.size):
                 (other._buf.ptr + i).init_pointee_copy(0)
             self = other^
@@ -5041,8 +5159,8 @@ struct NDArray[dtype: DType = DType.float64](
         ref self,
     ) -> UnsafePointer[
         Scalar[dtype],
-        mut = Origin(__origin_of(self)).mut,
-        origin = __origin_of(self),
+        mut = Origin(origin_of(self)).mut,
+        origin = origin_of(self),
     ]:
         """
         Retreive pointer without taking ownership.
@@ -5051,9 +5169,10 @@ struct NDArray[dtype: DType = DType.float64](
             Unsafe pointer to the data buffer.
 
         """
-        return self._buf.ptr.origin_cast[
-            Origin(__origin_of(self)).mut, __origin_of(self)
-        ]()
+        # TODO: figure out safe way to do this!
+        return self._buf.ptr.unsafe_mut_cast[
+            Origin(origin_of(self)).mut
+        ]().unsafe_origin_cast[origin_of(self)]()
 
     fn variance[
         returned_dtype: DType = DType.float64
@@ -5123,6 +5242,49 @@ struct NDArray[dtype: DType = DType.float64](
             sum = sum + self.load(i) * other.load(i)
         return sum
 
+    fn squeeze(mut self, axis: Int) raises:
+        """
+        Remove (squeeze) a single dimension of size 1 from the array shape.
+
+        Args:
+            axis: The axis to squeeze. Supports negative indices.
+
+        Raises:
+            IndexError: If the axis is out of range.
+            ShapeError: If the dimension at the given axis is not of size 1.
+        """
+        var normalized_axis: Int = axis
+        if normalized_axis < 0:
+            normalized_axis += self.ndim
+        if (normalized_axis < 0) or (normalized_axis >= self.ndim):
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Axis {} is out of range for array with {} dimensions."
+                    ).format(axis, self.ndim),
+                    suggestion=String(
+                        "Use an axis value in the range [-{}, {})."
+                    ).format(self.ndim, self.ndim),
+                    location=String("NDArray.squeeze(axis: Int)"),
+                )
+            )
+
+        if self.shape[normalized_axis] != 1:
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Cannot squeeze axis {} with size {}."
+                    ).format(normalized_axis, self.shape[normalized_axis]),
+                    suggestion=String(
+                        "Only axes with length 1 can be removed."
+                    ),
+                    location=String("NDArray.squeeze(axis: Int)"),
+                )
+            )
+        self.shape = self.shape._pop(normalized_axis)
+        self.strides = self.strides._pop(normalized_axis)
+        self.ndim -= 1
+
 
 # ===----------------------------------------------------------------------===#
 # NDArrayIterator
@@ -5460,9 +5622,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is C-contiguous or F-contiguous
             memcpy(
-                res._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=res._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
 
         else:
@@ -5522,9 +5684,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is C-contiguous or F-contiguous
             memcpy(
-                elements._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=elements._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
         else:
             for j in range(self.size_of_item):
@@ -5537,7 +5699,7 @@ struct _NDAxisIter[
 
     fn ith_with_offsets(
         self, index: Int
-    ) raises -> Tuple[NDArray[DType.index], NDArray[dtype]]:
+    ) raises -> Tuple[NDArray[DType.int], NDArray[dtype]]:
         """
         Gets the i-th 1-d array of the iterator and the offsets (in C-order)
         of its elements.
@@ -5549,7 +5711,7 @@ struct _NDAxisIter[
             Offsets (in C-order) and elements of the i-th 1-d array of the
             iterator.
         """
-        var offsets: NDArray[DType.index] = NDArray[DType.index](
+        var offsets: NDArray[DType.int] = NDArray[DType.int](
             Shape(self.size_of_item)
         )
         var elements: NDArray[dtype] = NDArray[dtype](Shape(self.size_of_item))
@@ -5578,9 +5740,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is C-contiguous
             memcpy(
-                elements._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=elements._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
             var begin_offset = _get_offset(item, new_strides)
             for j in range(self.size_of_item):
@@ -5591,9 +5753,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is F-contiguous
             memcpy(
-                elements._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=elements._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
             for j in range(self.size_of_item):
                 (offsets._buf.ptr + j).init_pointee_copy(
@@ -5683,7 +5845,7 @@ struct _NDIter[is_mutable: Bool, //, origin: Origin[is_mutable], dtype: DType](
                     (indices._buf + i).init_pointee_copy(
                         remainder // self.strides_compatible._buf[i]
                     )
-                    remainder %= self.strides_compatible._buf[i]
+                    remainder %= Int(self.strides_compatible._buf[i])
             (indices._buf + self.axis).init_pointee_copy(remainder)
 
         else:
@@ -5692,7 +5854,7 @@ struct _NDIter[is_mutable: Bool, //, origin: Origin[is_mutable], dtype: DType](
                     (indices._buf + i).init_pointee_copy(
                         remainder // self.strides_compatible._buf[i]
                     )
-                    remainder %= self.strides_compatible._buf[i]
+                    remainder %= Int(self.strides_compatible._buf[i])
             (indices._buf + self.axis).init_pointee_copy(remainder)
 
         return self.ptr[_get_offset(indices, self.strides)]
@@ -5725,7 +5887,7 @@ struct _NDIter[is_mutable: Bool, //, origin: Origin[is_mutable], dtype: DType](
                     (indices._buf + i).init_pointee_copy(
                         remainder // self.strides_compatible._buf[i]
                     )
-                    remainder %= self.strides_compatible._buf[i]
+                    remainder %= Int(self.strides_compatible._buf[i])
             (indices._buf + self.axis).init_pointee_copy(remainder)
         else:
             for i in range(self.ndim - 1, -1, -1):
@@ -5733,7 +5895,7 @@ struct _NDIter[is_mutable: Bool, //, origin: Origin[is_mutable], dtype: DType](
                     (indices._buf + i).init_pointee_copy(
                         remainder // self.strides_compatible._buf[i]
                     )
-                    remainder %= self.strides_compatible._buf[i]
+                    remainder %= Int(self.strides_compatible._buf[i])
             (indices._buf + self.axis).init_pointee_copy(remainder)
 
         return self.ptr[_get_offset(indices, self.strides)]
diff --git a/numojo/core/ndshape.mojo b/numojo/core/ndshape.mojo
index 5c47c566..6647ace2 100644
--- a/numojo/core/ndshape.mojo
+++ b/numojo/core/ndshape.mojo
@@ -10,13 +10,15 @@ Implements NDArrayShape type.
 
 from memory import UnsafePointer, memcpy, memcmp
 
+from numojo.core.error import IndexError, ShapeError, ValueError
+
 alias Shape = NDArrayShape
 """An alias of the NDArrayShape."""
 
 
 @register_passable
 struct NDArrayShape(
-    ImplicitlyCopyable, Sized, Stringable & Representable, Writable
+    ImplicitlyCopyable, Movable, Sized, Stringable & Representable, Writable
 ):
     """
     Presents the shape of `NDArray` type.
@@ -28,8 +30,11 @@ struct NDArrayShape(
     creation of the shape.
     """
 
+    # Aliases
+    alias _type: DType = DType.int
+
     # Fields
-    var _buf: UnsafePointer[Int]
+    var _buf: UnsafePointer[Scalar[Self._type]]
     """Data buffer."""
     var ndim: Int
     """Number of dimensions of array. It must be larger than 0."""
@@ -47,10 +52,18 @@ struct NDArrayShape(
         """
 
         if shape < 1:
-            raise Error(String("Items of shape must be positive."))
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Shape dimension must be positive, got {}."
+                    ).format(shape),
+                    suggestion="Use positive integers for shape dimensions.",
+                    location="NDArrayShape.__init__(shape: Int)",
+                )
+            )
 
         self.ndim = 1
-        self._buf = UnsafePointer[Int]().alloc(shape)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(shape)
         self._buf.init_pointee_copy(shape)
 
     @always_inline("nodebug")
@@ -66,17 +79,31 @@ struct NDArrayShape(
         """
         if len(shape) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(shape))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(shape)),
+                    suggestion="Provide at least one shape dimension.",
+                    location="NDArrayShape.__init__(*shape: Int)",
+                )
             )
 
         self.ndim = len(shape)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             if shape[i] < 1:
-                raise Error(String("Items of shape must be positive."))
+                raise Error(
+                    ShapeError(
+                        message=String(
+                            "Shape dimension at index {} must be positive,"
+                            " got {}."
+                        ).format(i, shape[i]),
+                        suggestion=(
+                            "Use positive integers for all shape dimensions."
+                        ),
+                        location="NDArrayShape.__init__(*shape: Int)",
+                    )
+                )
             (self._buf + i).init_pointee_copy(shape[i])
 
     @always_inline("nodebug")
@@ -95,19 +122,46 @@ struct NDArrayShape(
         """
         if len(shape) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(shape))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(shape)),
+                    suggestion="Provide at least one shape dimension.",
+                    location="NDArrayShape.__init__(*shape: Int, size: Int)",
+                )
             )
         self.ndim = len(shape)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             if shape[i] < 1:
-                raise Error(String("Items of shape must be positive."))
+                raise Error(
+                    ShapeError(
+                        message=String(
+                            "Shape dimension at index {} must be positive,"
+                            " got {}."
+                        ).format(i, shape[i]),
+                        suggestion=(
+                            "Use positive integers for all shape dimensions."
+                        ),
+                        location=(
+                            "NDArrayShape.__init__(*shape: Int, size: Int)"
+                        ),
+                    )
+                )
             (self._buf + i).init_pointee_copy(shape[i])
         if self.size_of_array() != size:
-            raise Error("Cannot create NDArray: shape and size mismatch")
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Shape size {} does not match provided size {}."
+                    ).format(self.size_of_array(), size),
+                    suggestion=(
+                        "Ensure the product of shape dimensions equals the"
+                        " size."
+                    ),
+                    location="NDArrayShape.__init__(*shape: Int, size: Int)",
+                )
+            )
 
     @always_inline("nodebug")
     fn __init__(out self, shape: List[Int]) raises:
@@ -123,16 +177,30 @@ struct NDArrayShape(
         """
         if len(shape) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(shape))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(shape)),
+                    suggestion="Provide at least one shape dimension.",
+                    location="NDArrayShape.__init__(shape: List[Int])",
+                )
             )
         self.ndim = len(shape)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             if shape[i] < 0:
-                raise Error("Items of shape must be non negative.")
+                raise Error(
+                    ShapeError(
+                        message=String(
+                            "Shape dimension at index {} must be non-negative,"
+                            " got {}."
+                        ).format(i, shape[i]),
+                        suggestion=(
+                            "Use non-negative integers for shape dimensions."
+                        ),
+                        location="NDArrayShape.__init__(shape: List[Int])",
+                    )
+                )
             (self._buf + i).init_pointee_copy(shape[i])
 
     @always_inline("nodebug")
@@ -152,20 +220,51 @@ struct NDArrayShape(
 
         if len(shape) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(shape))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(shape)),
+                    suggestion="Provide at least one shape dimension.",
+                    location=(
+                        "NDArrayShape.__init__(shape: List[Int], size: Int)"
+                    ),
+                )
             )
 
         self.ndim = len(shape)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             if shape[i] < 1:
-                raise Error("Items of shape must be positive.")
+                raise Error(
+                    ShapeError(
+                        message=String(
+                            "Shape dimension at index {} must be positive,"
+                            " got {}."
+                        ).format(i, shape[i]),
+                        suggestion=(
+                            "Use positive integers for all shape dimensions."
+                        ),
+                        location=(
+                            "NDArrayShape.__init__(shape: List[Int], size: Int)"
+                        ),
+                    )
+                )
             (self._buf + i).init_pointee_copy(shape[i])
         if self.size_of_array() != size:
-            raise Error("Cannot create NDArray: shape and size mismatch")
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Shape size {} does not match provided size {}."
+                    ).format(self.size_of_array(), size),
+                    suggestion=(
+                        "Ensure the product of shape dimensions equals the"
+                        " size."
+                    ),
+                    location=(
+                        "NDArrayShape.__init__(shape: List[Int], size: Int)"
+                    ),
+                )
+            )
 
     @always_inline("nodebug")
     fn __init__(out self, shape: VariadicList[Int]) raises:
@@ -182,17 +281,33 @@ struct NDArrayShape(
 
         if len(shape) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(shape))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(shape)),
+                    suggestion="Provide at least one shape dimension.",
+                    location="NDArrayShape.__init__(shape: VariadicList[Int])",
+                )
             )
 
         self.ndim = len(shape)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             if shape[i] < 1:
-                raise Error("Items of shape must be positive.")
+                raise Error(
+                    ShapeError(
+                        message=String(
+                            "Shape dimension at index {} must be positive,"
+                            " got {}."
+                        ).format(i, shape[i]),
+                        suggestion=(
+                            "Use positive integers for all shape dimensions."
+                        ),
+                        location=(
+                            "NDArrayShape.__init__(shape: VariadicList[Int])"
+                        ),
+                    )
+                )
             (self._buf + i).init_pointee_copy(shape[i])
 
     @always_inline("nodebug")
@@ -212,21 +327,55 @@ struct NDArrayShape(
 
         if len(shape) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(shape))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(shape)),
+                    suggestion="Provide at least one shape dimension.",
+                    location=(
+                        "NDArrayShape.__init__(shape: VariadicList[Int], size:"
+                        " Int)"
+                    ),
+                )
             )
 
         self.ndim = len(shape)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             if shape[i] < 1:
-                raise Error("Items of shape must be positive.")
+                raise Error(
+                    ShapeError(
+                        message=String(
+                            "Shape dimension at index {} must be positive,"
+                            " got {}."
+                        ).format(i, shape[i]),
+                        suggestion=(
+                            "Use positive integers for all shape dimensions."
+                        ),
+                        location=(
+                            "NDArrayShape.__init__(shape: VariadicList[Int],"
+                            " size: Int)"
+                        ),
+                    )
+                )
             (self._buf + i).init_pointee_copy(shape[i])
 
         if self.size_of_array() != size:
-            raise Error("Cannot create NDArray: shape and size mismatch")
+            raise Error(
+                ShapeError(
+                    message=String(
+                        "Shape size {} does not match provided size {}."
+                    ).format(self.size_of_array(), size),
+                    suggestion=(
+                        "Ensure the product of shape dimensions equals the"
+                        " size."
+                    ),
+                    location=(
+                        "NDArrayShape.__init__(shape: VariadicList[Int], size:"
+                        " Int)"
+                    ),
+                )
+            )
 
     @always_inline("nodebug")
     fn __init__(out self, shape: NDArrayShape):
@@ -238,8 +387,8 @@ struct NDArrayShape(
             shape: Another NDArrayShape to initialize from.
         """
         self.ndim = shape.ndim
-        self._buf = UnsafePointer[Int]().alloc(shape.ndim)
-        memcpy(self._buf, shape._buf, shape.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(shape.ndim)
+        memcpy(dest=self._buf, src=shape._buf, count=shape.ndim)
         for i in range(self.ndim):
             (self._buf + i).init_pointee_copy(shape._buf[i])
 
@@ -263,26 +412,74 @@ struct NDArrayShape(
             initialized: Whether the shape is initialized.
                 If yes, the values will be set to 1.
                 If no, the values will be uninitialized.
+
+        Note:
+            After creating the shape with uninitialized values,
+            you must set the values before using it! Otherwise, it may lead to undefined behavior.
         """
         if ndim < 0:
             raise Error(
-                "Error in `numojo.NDArrayShape.__init__(out self, ndim:"
-                " Int, initialized: Bool,)`. \n"
-                "Number of dimensions must be non-negative."
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be non-negative, got {}."
+                    ).format(ndim),
+                    suggestion="Provide ndim >= 0.",
+                    location="NDArrayShape.__init__(ndim, initialized)",
+                )
             )
 
         if ndim == 0:
             # This is a 0darray (numojo scalar)
             self.ndim = ndim
-            self._buf = UnsafePointer[Int]()
+            self._buf = UnsafePointer[Scalar[Self._type]]()
 
         else:
             self.ndim = ndim
-            self._buf = UnsafePointer[Int]().alloc(ndim)
+            self._buf = UnsafePointer[Scalar[Self._type]]().alloc(ndim)
             if initialized:
                 for i in range(ndim):
                     (self._buf + i).init_pointee_copy(1)
 
+    fn row_major(self) raises -> NDArrayStrides:
+        """
+        Create row-major (C-style) strides from a shape.
+
+        Row-major means the last dimension has stride 1 and strides increase
+        going backwards through dimensions.
+
+        Returns:
+            A new NDArrayStrides object with row-major memory layout.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var shape = nm.Shape(2, 3, 4)
+        var strides = nm.Strides.row_major(shape)
+        print(strides)  # Strides: (12, 4, 1)
+        ```
+        """
+        return NDArrayStrides(shape=self, order="C")
+
+    fn col_major(self) raises -> NDArrayStrides:
+        """
+        Create column-major (Fortran-style) strides from a shape.
+
+        Column-major means the first dimension has stride 1 and strides increase
+        going forward through dimensions.
+
+        Returns:
+            A new NDArrayStrides object with column-major memory layout.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var shape = nm.Shape(2, 3, 4)
+        var strides = nm.Strides.col_major(shape)
+        print(strides)  # Strides: (1, 2, 6)
+        ```
+        """
+        return NDArrayStrides(shape=self, order="F")
+
     @always_inline("nodebug")
     fn __copyinit__(out self, other: Self):
         """
@@ -293,8 +490,31 @@ struct NDArrayShape(
             other: Another NDArrayShape to initialize from.
         """
         self.ndim = other.ndim
-        self._buf = UnsafePointer[Int]().alloc(other.ndim)
-        memcpy(self._buf, other._buf, other.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(other.ndim)
+        memcpy(dest=self._buf, src=other._buf, count=other.ndim)
+
+    fn __del__(deinit self):
+        """
+        Destructor for NDArrayShape.
+        Frees the allocated memory for the data buffer.
+        """
+        if self.ndim > 0:
+            self._buf.free()
+
+    fn normalize_index(self, index: Int) -> Int:
+        """
+        Normalizes the given index to be within the valid range.
+
+        Args:
+            index: The index to normalize.
+
+        Returns:
+            The normalized index.
+        """
+        var normalized_idx: Int = index
+        if normalized_idx < 0:
+            normalized_idx += self.ndim
+        return normalized_idx
 
     @always_inline("nodebug")
     fn __getitem__(self, index: Int) raises -> Int:
@@ -310,31 +530,37 @@ struct NDArrayShape(
         Returns:
            Shape value at the given index.
         """
-
-        var normalized_index: Int = index
-        if normalized_index < 0:
-            normalized_index += self.ndim
-        if (normalized_index >= self.ndim) or (normalized_index < 0):
+        if index >= self.ndim or index < -self.ndim:
             raise Error(
-                String("Index {} out of bound [{}, {})").format(
-                    -self.ndim, self.ndim
+                IndexError(
+                    message=String("Index {} out of range [{}, {}).").format(
+                        index, -self.ndim, self.ndim
+                    ),
+                    suggestion="Use indices in [-ndim, ndim).",
+                    location="NDArrayShape.__getitem__",
                 )
             )
-
-        return self._buf[normalized_index]
+        var normalized_idx: Int = self.normalize_index(index)
+        return Int(self._buf[normalized_idx])
 
     # TODO: Check the negative steps result
     @always_inline("nodebug")
     fn _compute_slice_params(
         self, slice_index: Slice
-    ) raises -> (Int, Int, Int):
+    ) raises -> Tuple[Int, Int, Int]:
         var n = self.ndim
         if n == 0:
             return (0, 1, 0)
 
         var step = slice_index.step.or_else(1)
         if step == 0:
-            raise Error("Slice step cannot be zero.")
+            raise Error(
+                ValueError(
+                    message="Slice step cannot be zero.",
+                    suggestion="Use a non-zero step value.",
+                    location="NDArrayShape._compute_slice_params",
+                )
+            )
 
         var start: Int
         var stop: Int
@@ -404,7 +630,7 @@ struct NDArrayShape(
         return result^
 
     @always_inline("nodebug")
-    fn __setitem__(mut self, index: Int, val: Int) raises:
+    fn __setitem__(mut self, index: Int, val: Scalar[Self._type]) raises:
         """
         Sets shape at specified index.
 
@@ -416,26 +642,21 @@ struct NDArrayShape(
           index: Index to get the shape.
           val: Value to set at the given index.
         """
-
-        var normalized_index: Int = index
-        if normalized_index < 0:
-            normalized_index += self.ndim
-        if (normalized_index >= self.ndim) or (normalized_index < 0):
-            raise Error(
-                String("Index {} out of bound [{}, {})").format(
-                    -self.ndim, self.ndim
-                )
-            )
-
-        if val <= 0:
+        if index >= self.ndim or index < -self.ndim:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__setitem__`: "
-                    "Value to be set is not positive."
+                IndexError(
+                    message=String("Index {} out of range [{}, {}).").format(
+                        index, -self.ndim, self.ndim
+                    ),
+                    suggestion="Use indices in [-ndim, ndim).",
+                    location=(
+                        "NDArrayStrides.__setitem__(index: Int, val:"
+                        " Scalar[DType.int])"
+                    ),
                 )
             )
-
-        self._buf[normalized_index] = val
+        var normalized_idx: Int = self.normalize_index(index)
+        self._buf[normalized_idx] = val
 
     @always_inline("nodebug")
     fn __len__(self) -> Int:
@@ -519,6 +740,26 @@ struct NDArrayShape(
                 return True
         return False
 
+    fn __iter__(self) raises -> _ShapeIter:
+        """
+        Iterate over elements of the NDArrayShape, returning copied values.
+
+        Returns:
+            An iterator of NDArrayShape elements.
+
+        Example:
+        ```mojo
+        from numojo.prelude import *
+        var shape = Shape(2, 3, 4)
+        for dim in shape:
+            print(dim)  # Prints: 2, 3, 4
+        ```
+        """
+        return _ShapeIter(
+            shape=self,
+            length=self.ndim,
+        )
+
     # ===-------------------------------------------------------------------===#
     # Other methods
     # ===-------------------------------------------------------------------===#
@@ -530,7 +771,7 @@ struct NDArrayShape(
         """
 
         var res = Self(ndim=self.ndim, initialized=False)
-        memcpy(res._buf, self._buf, self.ndim)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
         return res
 
     fn join(self, *shapes: Self) raises -> Self:
@@ -549,7 +790,7 @@ struct NDArrayShape(
 
         var new_shape = Self(ndim=total_dims, initialized=False)
 
-        var index = 0
+        var index: Int = 0
         for i in range(self.ndim):
             (new_shape._buf + index).init_pointee_copy(self[i])
             index += 1
@@ -568,10 +809,10 @@ struct NDArrayShape(
         Returns:
           The total number of elements in the corresponding array.
         """
-        var size = 1
+        var size: Scalar[Self._type] = 1
         for i in range(self.ndim):
             size *= self._buf[i]
-        return size
+        return Int(size)
 
     fn swapaxes(self, axis1: Int, axis2: Int) raises -> Self:
         """
@@ -689,26 +930,147 @@ struct NDArrayShape(
             src=self._buf + axis + 1,
             count=self.ndim - axis - 1,
         )
-        return res
+        return res^
+
+    fn load[width: Int = 1](self, idx: Int) raises -> SIMD[Self._type, width]:
+        """
+        Load a SIMD vector from the Shape at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to load from.
+
+        Returns:
+            A SIMD vector containing the loaded values.
+
+        Raises:
+            Error: If the load exceeds the bounds of the Shape.
+        """
+        if idx < 0 or idx + width > self.ndim:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Load operation out of bounds: idx={} width={} ndim={}"
+                    ).format(idx, width, self.ndim),
+                    suggestion=(
+                        "Ensure that idx and width are within valid range."
+                    ),
+                    location="Shape.load",
+                )
+            )
+
+        return self._buf.load[width=width](idx)
+
+    fn store[
+        width: Int = 1
+    ](self, idx: Int, value: SIMD[Self._type, width]) raises:
+        """
+        Store a SIMD vector into the Shape at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to store to.
+            value: The SIMD vector to store.
 
-    # # can be used for vectorized index calculation
-    # @always_inline("nodebug")
-    # fn load[width: Int = 1](self, index: Int) raises -> SIMD[dtype, width]:
-    #     """
-    #     SIMD load dimensional information.
-    #     """
-    #     if index >= self.ndim:
-    #         raise Error("Index out of bound")
-    #     return self._buf.load[width=width](index)
-
-    # # can be used for vectorized index retrieval
-    # @always_inline("nodebug")
-    # fn store[
-    #     width: Int = 1
-    # ](out self, index: Int, val: SIMD[dtype, width]) raises:
-    #     """
-    #     SIMD store dimensional information.
-    #     """
-    #     # if index >= self.ndim:
-    #     #     raise Error("Index out of bound")
-    #     self._buf.ptr.store(index, val)
+        Raises:
+            Error: If the store exceeds the bounds of the Shape.
+        """
+        if idx < 0 or idx + width > self.ndim:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Store operation out of bounds: idx={} width={} ndim={}"
+                    ).format(idx, width, self.ndim),
+                    suggestion=(
+                        "Ensure that idx and width are within valid range."
+                    ),
+                    location="Shape.store",
+                )
+            )
+
+        self._buf.store[width=width](idx, value)
+
+    fn unsafe_load[width: Int = 1](self, idx: Int) -> SIMD[Self._type, width]:
+        """
+        Unsafely load a SIMD vector from the Shape at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to load from.
+
+        Returns:
+            A SIMD vector containing the loaded values.
+        """
+        return self._buf.load[width=width](idx)
+
+    fn unsafe_store[
+        width: Int = 1
+    ](self, idx: Int, value: SIMD[Self._type, width]):
+        """
+        Unsafely store a SIMD vector into the Shape at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to store to.
+            value: The SIMD vector to store.
+        """
+        self._buf.store[width=width](idx, value)
+
+
+struct _ShapeIter[
+    forward: Bool = True,
+](ImplicitlyCopyable, Movable):
+    """Iterator for NDArrayShape.
+
+    Parameters:
+        forward: The iteration direction. `False` is backwards.
+    """
+
+    var index: Int
+    var shape: NDArrayShape
+    var length: Int
+
+    fn __init__(
+        out self,
+        shape: NDArrayShape,
+        length: Int,
+    ):
+        self.index = 0 if forward else length
+        self.length = length
+        self.shape = shape
+
+    fn __iter__(self) -> Self:
+        return self
+
+    fn __has_next__(self) -> Bool:
+        @parameter
+        if forward:
+            return self.index < self.length
+        else:
+            return self.index > 0
+
+    fn __next__(mut self) raises -> Scalar[DType.int]:
+        @parameter
+        if forward:
+            var current_index = self.index
+            self.index += 1
+            return self.shape.__getitem__(current_index)
+        else:
+            var current_index = self.index
+            self.index -= 1
+            return self.shape.__getitem__(current_index)
+
+    fn __len__(self) -> Int:
+        @parameter
+        if forward:
+            return self.length - self.index
+        else:
+            return self.index
diff --git a/numojo/core/ndstrides.mojo b/numojo/core/ndstrides.mojo
index fee56c6d..8be0f91f 100644
--- a/numojo/core/ndstrides.mojo
+++ b/numojo/core/ndstrides.mojo
@@ -10,12 +10,17 @@ Implements NDArrayStrides type.
 
 from memory import UnsafePointer, memcmp, memcpy
 
+from numojo.core.error import IndexError, ValueError
+
+alias strides = NDArrayStrides
 alias Strides = NDArrayStrides
 """An alias of the NDArrayStrides."""
 
 
 @register_passable
-struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
+struct NDArrayStrides(
+    ImplicitlyCopyable, Movable, Representable, Sized, Stringable, Writable
+):
     """
     Presents the strides of `NDArray` type.
 
@@ -24,8 +29,11 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
     The number of dimension is checked upon creation of the strides.
     """
 
+    # Aliases
+    alias _type: DType = DType.int
+
     # Fields
-    var _buf: UnsafePointer[Int]
+    var _buf: UnsafePointer[Scalar[Self._type]]
     """Data buffer."""
     var ndim: Int
     """Number of dimensions of array. It must be larger than 0."""
@@ -43,14 +51,17 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         """
         if len(strides) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(strides))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(strides)),
+                    suggestion="Provide at least one stride value.",
+                    location="NDArrayStrides.__init__(*strides: Int)",
+                )
             )
 
         self.ndim = len(strides)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             (self._buf + i).init_pointee_copy(strides[i])
 
@@ -67,14 +78,17 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         """
         if len(strides) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(strides))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(strides)),
+                    suggestion="Provide a non-empty list of strides.",
+                    location="NDArrayStrides.__init__(strides: List[Int])",
+                )
             )
 
         self.ndim = len(strides)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             (self._buf + i).init_pointee_copy(strides[i])
 
@@ -91,14 +105,19 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         """
         if len(strides) <= 0:
             raise Error(
-                String(
-                    "\nError in `NDArrayShape.__init__()`: Number of dimensions"
-                    " of array must be positive. However, it is {}."
-                ).format(len(strides))
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be positive, got {}."
+                    ).format(len(strides)),
+                    suggestion="Provide a non-empty variadic list of strides.",
+                    location=(
+                        "NDArrayStrides.__init__(strides: VariadicList[Int])"
+                    ),
+                )
             )
 
         self.ndim = len(strides)
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
         for i in range(self.ndim):
             (self._buf + i).init_pointee_copy(strides[i])
 
@@ -113,8 +132,8 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         """
 
         self.ndim = strides.ndim
-        self._buf = UnsafePointer[Int]().alloc(self.ndim)
-        memcpy(self._buf, strides._buf, strides.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
+        memcpy(dest=self._buf, src=strides._buf, count=strides.ndim)
 
     @always_inline("nodebug")
     fn __init__(
@@ -136,7 +155,7 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         """
 
         self.ndim = shape.ndim
-        self._buf = UnsafePointer[Int]().alloc(shape.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(shape.ndim)
 
         if order == "C":
             var temp = 1
@@ -145,19 +164,27 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
                 temp *= shape[i]
         elif order == "F":
             var temp = 1
+            # Should we check for temp overflow here? Maybe no?
             for i in range(0, self.ndim):
                 (self._buf + i).init_pointee_copy(temp)
                 temp *= shape[i]
         else:
             raise Error(
-                "Invalid order: Only C style row major `C` & Fortran style"
-                " column major `F` are supported"
+                ValueError(
+                    message=String(
+                        "Invalid order '{}'; expected 'C' or 'F'."
+                    ).format(order),
+                    suggestion=(
+                        "Use 'C' for row-major or 'F' for column-major layout."
+                    ),
+                    location="NDArrayStrides.__init__(shape, order)",
+                )
             )
 
     @always_inline("nodebug")
     fn __init__(out self, *shape: Int, order: String) raises:
         """
-        Overloads the function `__init__(shape: NDArrayShape, order: String)`.
+        Overloads the function `__init__(shape: NDArrayStrides, order: String)`.
         Initializes the NDArrayStrides from a given shapes and an order.
 
         Raises:
@@ -174,7 +201,7 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
     @always_inline("nodebug")
     fn __init__(out self, shape: List[Int], order: String = "C") raises:
         """
-        Overloads the function `__init__(shape: NDArrayShape, order: String)`.
+        Overloads the function `__init__(shape: NDArrayStrides, order: String)`.
         Initializes the NDArrayStrides from a given shapes and an order.
 
         Raises:
@@ -195,7 +222,7 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         order: String = "C",
     ) raises:
         """
-        Overloads the function `__init__(shape: NDArrayShape, order: String)`.
+        Overloads the function `__init__(shape: NDArrayStrides, order: String)`.
         Initializes the NDArrayStrides from a given shapes and an order.
 
         Raises:
@@ -232,23 +259,75 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         """
         if ndim < 0:
             raise Error(
-                "Error in `numojo.NDArrayStrides.__init__(out self, ndim:"
-                " Int, initialized: Bool,)`. \n"
-                "Number of dimensions must be non-negative."
+                ValueError(
+                    message=String(
+                        "Number of dimensions must be non-negative, got {}."
+                    ).format(ndim),
+                    suggestion="Provide ndim >= 0.",
+                    location="NDArrayStrides.__init__(ndim, initialized)",
+                )
             )
 
         if ndim == 0:
             # This is a 0darray (numojo scalar)
             self.ndim = ndim
-            self._buf = UnsafePointer[Int]()
+            self._buf = UnsafePointer[Scalar[Self._type]]()
 
         else:
             self.ndim = ndim
-            self._buf = UnsafePointer[Int]().alloc(ndim)
+            self._buf = UnsafePointer[Scalar[Self._type]]().alloc(ndim)
             if initialized:
                 for i in range(ndim):
                     (self._buf + i).init_pointee_copy(0)
 
+    @staticmethod
+    fn row_major(shape: NDArrayShape) raises -> NDArrayStrides:
+        """
+        Create row-major (C-style) strides from a shape.
+
+        Row-major means the last dimension has stride 1 and strides increase
+        going backwards through dimensions.
+
+        Args:
+            shape: The shape of the array.
+
+        Returns:
+            A new NDArrayStrides object with row-major memory layout.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var shape = nm.Shape(2, 3, 4)
+        var strides = nm.Strides.row_major(shape)
+        print(strides)  # Strides: (12, 4, 1)
+        ```
+        """
+        return NDArrayStrides(shape=shape, order="C")
+
+    @staticmethod
+    fn col_major(shape: NDArrayShape) raises -> NDArrayStrides:
+        """
+        Create column-major (Fortran-style) strides from a shape.
+
+        Column-major means the first dimension has stride 1 and strides increase
+        going forward through dimensions.
+
+        Args:
+            shape: The shape of the array.
+
+        Returns:
+            A new NDArrayStrides object with column-major memory layout.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var shape = nm.Shape(2, 3, 4)
+        var strides = nm.Strides.col_major(shape)
+        print(strides)  # Strides: (1, 2, 6)
+        ```
+        """
+        return NDArrayStrides(shape=shape, order="F")
+
     @always_inline("nodebug")
     fn __copyinit__(out self, other: Self):
         """
@@ -259,8 +338,31 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
             other: Strides of the array.
         """
         self.ndim = other.ndim
-        self._buf = UnsafePointer[Int]().alloc(other.ndim)
-        memcpy(self._buf, other._buf, other.ndim)
+        self._buf = UnsafePointer[Scalar[Self._type]]().alloc(other.ndim)
+        memcpy(dest=self._buf, src=other._buf, count=other.ndim)
+
+    fn __del__(deinit self):
+        """
+        Destructor for NDArrayStrides.
+        Frees the allocated memory for the data buffer.
+        """
+        if self.ndim > 0:
+            self._buf.free()
+
+    fn normalize_index(self, index: Int) -> Int:
+        """
+        Normalizes the given index to be within the valid range.
+
+        Args:
+            index: The index to normalize.
+
+        Returns:
+            The normalized index.
+        """
+        var normalized_idx: Int = index
+        if normalized_idx < 0:
+            normalized_idx += self.ndim
+        return normalized_idx
 
     @always_inline("nodebug")
     fn __getitem__(self, index: Int) raises -> Int:
@@ -276,21 +378,132 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         Returns:
            Stride value at the given index.
         """
+        if index >= self.ndim or index < -self.ndim:
+            raise Error(
+                IndexError(
+                    message=String("Index {} out of range [{}, {}).").format(
+                        index, -self.ndim, self.ndim
+                    ),
+                    suggestion="Use indices in [-ndim, ndim).",
+                    location="NDArrayStrides.__getitem__",
+                )
+            )
+        var normalized_idx: Int = self.normalize_index(index)
+        return Int(self._buf[normalized_idx])
+
+    @always_inline("nodebug")
+    fn _compute_slice_params(
+        self, slice_index: Slice
+    ) raises -> Tuple[Int, Int, Int]:
+        """
+        Compute normalized slice parameters (start, step, length).
+
+        Args:
+            slice_index: The slice to compute parameters for.
+
+        Returns:
+            A tuple of (start, step, length).
+
+        Raises:
+            Error: If the slice step is zero.
+        """
+        var n: Int = self.ndim
+        if n == 0:
+            return (0, 1, 0)
 
-        var normalized_index: Int = index
-        if normalized_index < 0:
-            normalized_index += self.ndim
-        if (normalized_index >= self.ndim) or (normalized_index < 0):
+        var step = slice_index.step.or_else(1)
+        if step == 0:
             raise Error(
-                String("Index {} out of bound [{}, {})").format(
-                    -self.ndim, self.ndim
+                ValueError(
+                    message="Slice step cannot be zero.",
+                    suggestion="Use a non-zero step value.",
+                    location="NDArrayStrides._compute_slice_params",
                 )
             )
 
-        return self._buf[normalized_index]
+        var start: Int
+        var stop: Int
+        if step > 0:
+            start = slice_index.start.or_else(0)
+            stop = slice_index.end.or_else(n)
+        else:
+            start = slice_index.start.or_else(n - 1)
+            stop = slice_index.end.or_else(-1)
+
+        if start < 0:
+            start += n
+        if stop < 0:
+            stop += n
+
+        if step > 0:
+            if start < 0:
+                start = 0
+            if start > n:
+                start = n
+            if stop < 0:
+                stop = 0
+            if stop > n:
+                stop = n
+        else:
+            if start >= n:
+                start = n - 1
+            if start < -1:
+                start = -1
+            if stop >= n:
+                stop = n - 1
+            if stop < -1:
+                stop = -1
+
+        var length: Int = 0
+        if step > 0:
+            if start < stop:
+                length = Int((stop - start + step - 1) / step)
+        else:
+            if start > stop:
+                var neg_step = -step
+                length = Int((start - stop + neg_step - 1) / neg_step)
+
+        return (start, step, length)
+
+    @always_inline("nodebug")
+    fn __getitem__(self, slice_index: Slice) raises -> NDArrayStrides:
+        """
+        Return a sliced view of the strides as a new NDArrayStrides.
+        Delegates normalization & validation to _compute_slice_params.
+
+        Args:
+            slice_index: The slice to extract.
+
+        Returns:
+            A new NDArrayStrides containing the sliced values.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var strides = nm.Strides(12, 4, 1)
+        print(strides[1:])  # Strides: (4, 1)
+        ```
+        """
+        var updated_slice: Tuple[Int, Int, Int] = self._compute_slice_params(
+            slice_index
+        )
+        var start = updated_slice[0]
+        var step = updated_slice[1]
+        var length = updated_slice[2]
+
+        if length <= 0:
+            var empty_result = NDArrayStrides(ndim=0, initialized=False)
+            return empty_result
+
+        var result = NDArrayStrides(ndim=length, initialized=False)
+        var idx = start
+        for i in range(length):
+            (result._buf + i).init_pointee_copy(self._buf[idx])
+            idx += step
+        return result^
 
     @always_inline("nodebug")
-    fn __setitem__(mut self, index: Int, val: Int) raises:
+    fn __setitem__(mut self, index: Int, val: Scalar[Self._type]) raises:
         """
         Sets stride at specified index.
 
@@ -302,18 +515,21 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
           index: Index to get the stride.
           val: Value to set at the given index.
         """
-
-        var normalized_index: Int = index
-        if normalized_index < 0:
-            normalized_index += self.ndim
-        if (normalized_index >= self.ndim) or (normalized_index < 0):
+        if index >= self.ndim or index < -self.ndim:
             raise Error(
-                String("Index {} out of bound [{}, {})").format(
-                    -self.ndim, self.ndim
+                IndexError(
+                    message=String("Index {} out of range [{}, {}).").format(
+                        index, -self.ndim, self.ndim
+                    ),
+                    suggestion="Use indices in [-ndim, ndim).",
+                    location=(
+                        "NDArrayStrides.__setitem__(index: Int, val:"
+                        " Scalar[DType.int])"
+                    ),
                 )
             )
-
-        self._buf[normalized_index] = val
+        var normalized_idx: Int = self.normalize_index(index)
+        self._buf[normalized_idx] = val
 
     @always_inline("nodebug")
     fn __len__(self) -> Int:
@@ -397,6 +613,26 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
                 return True
         return False
 
+    fn __iter__(self) raises -> _StrideIter:
+        """
+        Iterate over elements of the NDArrayStrides, returning copied values.
+
+        Returns:
+            An iterator of NDArrayStrides elements.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var strides = nm.Strides(12, 4, 1)
+        for stride in strides:
+            print(stride)  # Prints: 12, 4, 1
+        ```
+        """
+        return _StrideIter(
+            strides=self,
+            length=self.ndim,
+        )
+
     # ===-------------------------------------------------------------------===#
     # Other methods
     # ===-------------------------------------------------------------------===#
@@ -408,7 +644,7 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         """
 
         var res = Self(ndim=self.ndim, initialized=False)
-        memcpy(res._buf, self._buf, self.ndim)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
         return res
 
     fn swapaxes(self, axis1: Int, axis2: Int) raises -> Self:
@@ -422,15 +658,109 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         Returns:
             A new strides with the given axes swapped.
         """
-        var res = self
+        var norm_axis1: Int = self.normalize_index(axis1)
+        var norm_axis2: Int = self.normalize_index(axis2)
+
+        if norm_axis1 < 0 or norm_axis1 >= self.ndim:
+            raise Error(
+                IndexError(
+                    message=String("axis1 {} out of range [0, {}).").format(
+                        norm_axis1, self.ndim
+                    ),
+                    suggestion="Provide axis1 in [-ndim, ndim).",
+                    location="NDArrayStrides.swapaxes",
+                )
+            )
+        if norm_axis2 < 0 or norm_axis2 >= self.ndim:
+            raise Error(
+                IndexError(
+                    message=String("axis2 {} out of range [0, {}).").format(
+                        norm_axis2, self.ndim
+                    ),
+                    suggestion="Provide axis2 in [-ndim, ndim).",
+                    location="NDArrayStrides.swapaxes",
+                )
+            )
+
+        var res = Self(ndim=self.ndim, initialized=False)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
         res[axis1] = self[axis2]
         res[axis2] = self[axis1]
-        return res
+        return res^
+
+    fn join(self, *strides: Self) raises -> Self:
+        """
+        Join multiple strides into a single strides.
+
+        Args:
+            strides: Variable number of NDArrayStrides objects.
+
+        Returns:
+            A new NDArrayStrides object with all values concatenated.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var s1 = nm.Strides(12, 4)
+        var s2 = nm.Strides(1)
+        var joined = s1.join(s2)
+        print(joined)  # Strides: (12, 4, 1)
+        ```
+        """
+        var total_dims: Int = self.ndim
+        for i in range(len(strides)):
+            total_dims += strides[i].ndim
+
+        var new_strides: Self = Self(ndim=total_dims, initialized=False)
+
+        var index: Int = 0
+        for i in range(self.ndim):
+            (new_strides._buf + index).init_pointee_copy(self[i])
+            index += 1
+
+        for i in range(len(strides)):
+            for j in range(strides[i].ndim):
+                (new_strides._buf + index).init_pointee_copy(strides[i][j])
+                index += 1
+
+        return new_strides
 
     # ===-------------------------------------------------------------------===#
     # Other private methods
     # ===-------------------------------------------------------------------===#
 
+    fn _extend(self, *values: Int) raises -> Self:
+        """
+        Extend the strides by additional values.
+        ***UNSAFE!*** No boundary check!
+
+        Args:
+            values: Additional stride values to append.
+
+        Returns:
+            A new NDArrayStrides object with the extended values.
+
+        Example:
+        ```mojo
+        import numojo as nm
+        var strides = nm.Strides(12, 4)
+        var extended = strides._extend(1)
+        print(extended)  # Strides: (12, 4, 1)
+        ```
+        """
+        var total_dims: Int = self.ndim + len(values)
+        var new_strides: Self = Self(ndim=total_dims, initialized=False)
+
+        var offset: UInt = 0
+        for i in range(self.ndim):
+            (new_strides._buf + offset).init_pointee_copy(self[i])
+            offset += 1
+        for value in values:
+            (new_strides._buf + offset).init_pointee_copy(value)
+            offset += 1
+
+        return new_strides^
+
     fn _flip(self) -> Self:
         """
         Returns a new strides by flipping the items.
@@ -502,27 +832,145 @@ struct NDArrayStrides(ImplicitlyCopyable, Sized, Stringable, Writable):
         )
         return res
 
+    fn load[width: Int = 1](self, idx: Int) raises -> SIMD[Self._type, width]:
+        """
+        Load a SIMD vector from the Strides at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to load from.
+
+        Returns:
+            A SIMD vector containing the loaded values.
 
-# @always_inline("nodebug")
-# fn load[width: Int = 1](self, index: Int) raises -> SIMD[dtype, width]:
-#     # if index >= self.ndim:
-#     #     raise Error("Index out of bound")
-#     return self._buf.ptr.load[width=width](index)
-
-# @always_inline("nodebug")
-# fn store[
-#     width: Int = 1
-# ](mut self, index: Int, val: SIMD[dtype, width]) raises:
-#     # if index >= self.ndim:
-#     #     raise Error("Index out of bound")
-#     self._buf.ptr.store(index, val)
-
-# @always_inline("nodebug")
-# fn load_unsafe[width: Int = 1](self, index: Int) -> Int:
-#     return self._buf.ptr.load[width=width](index).__int__()
-
-# @always_inline("nodebug")
-# fn store_unsafe[
-#     width: Int = 1
-# ](mut self, index: Int, val: SIMD[dtype, width]):
-#     self._buf.ptr.store(index, val)
+        Raises:
+            Error: If the load exceeds the bounds of the Strides.
+        """
+        if idx < 0 or idx + width > self.ndim:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Load operation out of bounds: idx={} width={} ndim={}"
+                    ).format(idx, width, self.ndim),
+                    suggestion=(
+                        "Ensure that idx and width are within valid range."
+                    ),
+                    location="Strides.load",
+                )
+            )
+
+        return self._buf.load[width=width](idx)
+
+    fn store[
+        width: Int = 1
+    ](self, idx: Int, value: SIMD[Self._type, width]) raises:
+        """
+        Store a SIMD vector into the Strides at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to store to.
+            value: The SIMD vector to store.
+
+        Raises:
+            Error: If the store exceeds the bounds of the Strides.
+        """
+        if idx < 0 or idx + width > self.ndim:
+            raise Error(
+                IndexError(
+                    message=String(
+                        "Store operation out of bounds: idx={} width={} ndim={}"
+                    ).format(idx, width, self.ndim),
+                    suggestion=(
+                        "Ensure that idx and width are within valid range."
+                    ),
+                    location="Strides.store",
+                )
+            )
+
+        self._buf.store[width=width](idx, value)
+
+    fn unsafe_load[width: Int = 1](self, idx: Int) -> SIMD[Self._type, width]:
+        """
+        Unsafely load a SIMD vector from the Strides at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to load from.
+
+        Returns:
+            A SIMD vector containing the loaded values.
+        """
+        return self._buf.load[width=width](idx)
+
+    fn unsafe_store[
+        width: Int = 1
+    ](self, idx: Int, value: SIMD[Self._type, width]):
+        """
+        Unsafely store a SIMD vector into the Strides at the specified index.
+
+        Parameters:
+            width: The width of the SIMD vector.
+
+        Args:
+            idx: The starting index to store to.
+            value: The SIMD vector to store.
+        """
+        self._buf.store[width=width](idx, value)
+
+
+struct _StrideIter[
+    forward: Bool = True,
+](ImplicitlyCopyable, Movable):
+    """Iterator for NDArrayStrides.
+
+    Parameters:
+        forward: The iteration direction. `False` is backwards.
+    """
+
+    var index: Int
+    var strides: NDArrayStrides
+    var length: Int
+
+    fn __init__(
+        out self,
+        strides: NDArrayStrides,
+        length: Int,
+    ):
+        self.index = 0 if forward else length
+        self.length = length
+        self.strides = strides
+
+    fn __iter__(self) -> Self:
+        return self
+
+    fn __has_next__(self) -> Bool:
+        @parameter
+        if forward:
+            return self.index < self.length
+        else:
+            return self.index > 0
+
+    fn __next__(mut self) raises -> Scalar[DType.int]:
+        @parameter
+        if forward:
+            var current_index = self.index
+            self.index += 1
+            return self.strides.__getitem__(current_index)
+        else:
+            var current_index = self.index
+            self.index -= 1
+            return self.strides.__getitem__(current_index)
+
+    fn __len__(self) -> Int:
+        @parameter
+        if forward:
+            return self.length - self.index
+        else:
+            return self.index
diff --git a/numojo/core/own_data.mojo b/numojo/core/own_data.mojo
index 9ccabc9c..aaa5019c 100644
--- a/numojo/core/own_data.mojo
+++ b/numojo/core/own_data.mojo
@@ -11,7 +11,9 @@ from memory import UnsafePointer
 from numojo.core.traits.bufferable import Bufferable
 
 
-struct OwnData[dtype: DType]:  # TODO: implement `Bufferable` trait
+struct OwnData[dtype: DType](
+    Copyable, Movable
+):  # TODO: implement `Bufferable` trait
     var ptr: UnsafePointer[Scalar[dtype]]
 
     fn __init__(out self, size: Int):
diff --git a/numojo/core/staticmatrix.mojo b/numojo/core/staticmatrix.mojo
new file mode 100644
index 00000000..67996283
--- /dev/null
+++ b/numojo/core/staticmatrix.mojo
@@ -0,0 +1,910 @@
+# ===----------------------------------------------------------------------=== #
+# StaticMatrix2D for CPU and GPU
+# ===----------------------------------------------------------------------=== #
+from time import perf_counter_ns
+from gpu.host import DeviceBuffer, DeviceContext, HostBuffer
+from gpu import thread_idx, block_dim, block_idx, grid_dim, barrier
+from gpu.host.dim import Dim
+from gpu.memory import AddressSpace
+from memory import stack_allocation
+from gpu.globals import WARP_SIZE, MAX_THREADS_PER_BLOCK_METADATA
+from collections.optional import Optional
+from memory import UnsafePointer
+from sys import simd_width_of
+from sys.info import (
+    has_accelerator,
+    has_nvidia_gpu_accelerator,
+    has_amd_gpu_accelerator,
+    has_apple_gpu_accelerator,
+)
+
+from numojo.core.flags import Flags
+from numojo.core.own_data import OwnData
+from numojo.core.error import ValueError
+from numojo.core.gpu.matrix_kernels import (
+    matrix_add_kernel_vectorized,
+    matrix_sub_kernel_vectorized,
+    matrix_mul_kernel_vectorized,
+    calculate_1d_launch_params_for_dtype,
+)
+from testing.testing import assert_true
+
+from numojo.core.gpu.device import Device, check_accelerator
+from numojo.core.gpu.storage import DataContainer, HostStorage, DeviceStorage
+
+
+struct StaticMatrix[dtype: DType = DType.float32, device: Device = Device.CPU](
+    Movable, Stringable, Writable
+):
+    """Matrix implementation using Optional storage approach with corrected kernels.
+    """
+
+    alias width: Int = simd_width_of[dtype]()
+    var _buf: DataContainer[dtype, device]
+    var shape: Tuple[Int, Int]
+    var size: Int
+    var strides: Tuple[Int, Int]
+    var flags: Flags
+
+    @always_inline("nodebug")
+    fn __init__(
+        out self,
+        shape: Tuple[Int, Int],
+        order: String = "C",
+        fill_value: Scalar[dtype] = 0.0,
+    ) raises:
+        self.shape = (shape[0], shape[1])
+        if order == "C":
+            self.strides = (shape[1], 1)
+        else:
+            self.strides = (1, shape[0])
+        self.size = shape[0] * shape[1]
+        self._buf = DataContainer[dtype, device](self.size)
+        self.flags = Flags(
+            self.shape, self.strides, owndata=True, writeable=True
+        )
+        self.fill(fill_value)
+
+    @always_inline("nodebug")
+    fn __copyinit__(out self, other: Self):
+        """
+        Copy other into self.
+        """
+        self.shape = (other.shape[0], other.shape[1])
+        self.strides = (other.strides[0], other.strides[1])
+        self.size = other.size
+        self._buf = other._buf.copy()
+        self.flags = other.flags
+
+    fn __moveinit__(out self, deinit other: Self):
+        self._buf = other._buf^
+        self.shape = other.shape
+        self.size = other.size
+        self.strides = other.strides
+        self.flags = other.flags
+
+    @always_inline("nodebug")
+    fn __del__(deinit self):
+        @parameter
+        if device.type == "cpu":
+            owndata = self.flags.OWNDATA and self._buf.host_storage
+            if owndata:
+                self._buf.host_storage.value().ptr.free()
+
+    fn _check_bounds(self, row: Int, col: Int) raises:
+        if row < 0 or row >= self.shape[0] or col < 0 or col >= self.shape[1]:
+            raise Error(
+                ValueError(
+                    message=String(
+                        "Index out of bounds: ({}, {}) for shape ({}, {})."
+                    ).format(row, col, self.shape[0], self.shape[1]),
+                    suggestion="Ensure 0 <= row < rows and 0 <= col < cols",
+                    location="StaticMatrix._check_bounds",
+                )
+            )
+
+    fn _index(self, row: Int, col: Int) -> Int:
+        return row * self.strides[0] + col * self.strides[1]
+
+    # ! perhaps we should remove indexing into gpu arrays since they are extremely slow and results in too many copies!
+    fn __getitem__(self, row: Int, col: Int) raises -> Scalar[dtype]:
+        self._check_bounds(row, col)
+        var idx = self._index(row, col)
+
+        @parameter
+        if device.type == "cpu":
+            return self._buf.get_host_ptr()[idx]
+        elif device.type == "gpu":
+            with self._buf.get_device_buffer().map_to_host() as host_ptr:
+                return host_ptr[idx]
+        else:
+            raise Error("Unsupported device type for __getitem__")
+
+    fn __getitem__(self, var x: Int) raises -> Self:
+        """1D indexing to get a row as a new StaticMatrix."""
+        if x < 0 or x >= self.shape[0]:
+            raise Error(
+                ValueError(
+                    message=String(
+                        "Row index out of bounds: {} for shape ({}, {})."
+                    ).format(x, self.shape[0], self.shape[1]),
+                    suggestion="Ensure 0 <= row < rows",
+                    location="StaticMatrix.__getitem__",
+                )
+            )
+        var row_StaticMatrix = StaticMatrix[dtype, device](
+            (1, self.shape[1]), order=String("C")
+        )
+        for j in range(self.shape[1]):
+            row_StaticMatrix[0, j] = self[x, j]
+        return row_StaticMatrix^
+
+    fn __setitem__(mut self, row: Int, col: Int, value: Scalar[dtype]) raises:
+        self._check_bounds(row, col)
+        var idx = self._index(row, col)
+
+        @parameter
+        if device.type == "cpu":
+            self._buf.get_host_ptr()[idx] = value
+        elif device.type == "gpu":
+            with self._buf.get_device_buffer().map_to_host() as host_ptr:
+                host_ptr[idx] = value
+        else:
+            raise Error("Unsupported device type for __setitem__")
+
+    fn fill(mut self, value: Scalar[dtype]) raises:
+        """Fill matrix with value - optimized per device type."""
+
+        @parameter
+        if device.type == "cpu":
+            var ptr = self._buf.get_host_ptr()
+            for i in range(self.size):
+                ptr[i] = value
+        elif (
+            device.type == "gpu"
+        ):  # I should create a kernel for this instead to do it safely.
+            with self._buf.get_device_buffer().map_to_host() as host_ptr:
+                for i in range(self.size):
+                    host_ptr[i] = value
+        else:
+            raise Error("Unsupported device type for fill")
+
+    fn __str__(self) -> String:
+        try:
+            var result = ""
+            if self.size == 0:
+                return result + "[]"
+
+            if self.shape[0] > 6 or self.shape[1] > 6:
+                return self._format_large_StaticMatrix()
+
+            for i in range(self.shape[0]):
+                result += "["
+                for j in range(self.shape[1]):
+                    var v = self[i, j]
+                    result += String(v)
+                    if j < self.shape[1] - 1:
+                        result += ", "
+                result += "]"
+                if i < self.shape[0] - 1:
+                    result += "\n"
+
+            result += String("\nStaticMatrix(")
+            result += (
+                String(self.shape[0])
+                + "x"
+                + String(self.shape[1])
+                + ", dtype="
+                + String(dtype)
+                + ", device="
+                + String(device)
+                + ")\n"
+            )
+            return result
+        except:
+            return "Error generating string representation of StaticMatrix."
+
+    fn _format_large_StaticMatrix(self) -> String:
+        """Format large StaticMatrixs with truncated representation and shape info.
+        """
+        try:
+            var result = String("StaticMatrix(")
+            result += (
+                String(self.shape[0])
+                + "x"
+                + String(self.shape[1])
+                + ", dtype="
+                + String(dtype)
+                + ", device="
+                + String(device)
+                + ")\n"
+            )
+
+            for i in range(min(3, self.shape[0])):
+                result += "["
+                for j in range(min(3, self.shape[1])):
+                    result += String(self[i, j])
+                    if j < min(3, self.shape[1]) - 1:
+                        result += ", "
+                if self.shape[1] > 3:
+                    result += ", ..., " + String(self[i, self.shape[1] - 1])
+                result += "]"
+                if i < min(3, self.shape[0]) - 1:
+                    result += "\n"
+
+            if self.shape[0] > 3:
+                result += "\n...\n"
+                result += "["
+                for j in range(min(3, self.shape[1])):
+                    result += String(self[self.shape[0] - 1, j])
+                    if j < min(3, self.shape[1]) - 1:
+                        result += ", "
+                if self.shape[1] > 3:
+                    result += ", ..., " + String(
+                        self[self.shape[0] - 1, self.shape[1] - 1]
+                    )
+                result += "]"
+
+            return result
+        except:
+            return (
+                "Error generating truncated representation of large"
+                " StaticMatrix."
+            )
+
+    fn write_to[W: Writer](self, mut writer: W):
+        writer.write(self.__str__())
+
+    @staticmethod
+    fn zeros(rows: Int, cols: Int) raises -> StaticMatrix[dtype, device]:
+        return StaticMatrix[dtype, device](
+            shape=(rows, cols), fill_value=Scalar[dtype](0)
+        )
+
+    @staticmethod
+    fn ones(rows: Int, cols: Int) raises -> StaticMatrix[dtype, device]:
+        return StaticMatrix[dtype, device](
+            shape=(rows, cols), fill_value=Scalar[dtype](1)
+        )
+
+    @staticmethod
+    fn identity(size: Int) raises -> StaticMatrix[dtype, device]:
+        var mat = StaticMatrix[dtype, device](
+            shape=(size, size), fill_value=Scalar[dtype](0)
+        )
+        for i in range(size):
+            mat[i, i] = Scalar[dtype](1)
+        return mat^
+
+    fn __add__(self, other: Self) raises -> Self:
+        """Element-wise addition of two matrices."""
+        if self.shape[0] != other.shape[0] or self.shape[1] != other.shape[1]:
+            raise Error(
+                ValueError(
+                    message=String(
+                        "Matrix shapes ({}, {}) and ({}, {}) are incompatible"
+                        " for addition."
+                    ).format(
+                        self.shape[0],
+                        self.shape[1],
+                        other.shape[0],
+                        other.shape[1],
+                    ),
+                    suggestion="Ensure both matrices have the same dimensions",
+                    location="StaticMatrix.__add__",
+                )
+            )
+
+        if self.device != other.device:
+            raise Error(
+                ValueError(
+                    message="Cannot add matrices on different devices",
+                    suggestion="Move matrices to the same device before adding",
+                    location="StaticMatrix.__add__",
+                )
+            )
+
+        return add[dtype, device](self, other)
+
+    fn __sub__(self, other: Self) raises -> Self:
+        """Element-wise subtraction of two matrices."""
+        if self.shape[0] != other.shape[0] or self.shape[1] != other.shape[1]:
+            raise Error(
+                ValueError(
+                    message=String(
+                        "Matrix shapes ({}, {}) and ({}, {}) are incompatible"
+                        " for subtraction."
+                    ).format(
+                        self.shape[0],
+                        self.shape[1],
+                        other.shape[0],
+                        other.shape[1],
+                    ),
+                    suggestion="Ensure both matrices have the same dimensions",
+                    location="StaticMatrix.__sub__",
+                )
+            )
+
+        if self.device != other.device:
+            raise Error(
+                ValueError(
+                    message="Cannot subtract matrices on different devices",
+                    suggestion=(
+                        "Move matrices to the same device before subtracting"
+                    ),
+                    location="StaticMatrix.__sub__",
+                )
+            )
+
+        return sub[dtype, device](self, other)
+
+    # Element-wise multiplication
+    fn __mul__(self, other: Self) raises -> Self:
+        """Element-wise multiplication of two matrices."""
+        if self.shape[0] != other.shape[0] or self.shape[1] != other.shape[1]:
+            raise Error(
+                ValueError(
+                    message=String(
+                        "Matrix shapes ({},t {}) and ({}, {}) are incompatible"
+                        " for element-wise multiplication."
+                    ).format(
+                        self.shape[0],
+                        self.shape[1],
+                        other.shape[0],
+                        other.shape[1],
+                    ),
+                    suggestion="Ensure both matrices have the same dimensions",
+                    location="StaticMatrix.__mul__",
+                )
+            )
+
+        if self._buf.device != other._buf.device:
+            raise Error(
+                ValueError(
+                    message="Cannot multiply matrices on different devices",
+                    suggestion=(
+                        "Move matrices to the same device before multiplying"
+                    ),
+                    location="StaticMatrix.__mul__",
+                )
+            )
+
+        @parameter
+        if device.type == "cpu":
+            return self._mul_cpu(other)
+        else:
+            return self._mul_gpu(other)
+
+    fn __matmul__(self, other: Self) raises -> Self:
+        """Matrix multiplication of two matrices."""
+        if self.shape[1] != other.shape[0]:
+            raise Error(
+                ValueError(
+                    message=String(
+                        "Matrix shapes ({}, {}) and ({}, {}) are incompatible"
+                        " for matrix multiplication."
+                    ).format(
+                        self.shape[0],
+                        self.shape[1],
+                        other.shape[0],
+                        other.shape[1],
+                    ),
+                    suggestion=(
+                        "Ensure first matrix columns equal second matrix rows"
+                    ),
+                    location="StaticMatrix.__matmul__",
+                )
+            )
+
+        if self.device != other.device:
+            raise Error(
+                ValueError(
+                    message="Cannot multiply matrices on different devices",
+                    suggestion=(
+                        "Move matrices to the same device before multiplying"
+                    ),
+                    location="StaticMatrix.__matmul__",
+                )
+            )
+
+        return matmul[dtype, device](self, other)
+
+    fn _mul_cpu(self, other: Self) raises -> Self:
+        """CPU implementation of element-wise multiplication with vectorization.
+        """
+        var result = StaticMatrix[dtype, device]((self.shape[0], self.shape[1]))
+        var n = self.size
+        var self_ptr = self._buf.get_host_ptr()
+        var other_ptr = other._buf.get_host_ptr()
+        var result_ptr = result._buf.get_host_ptr()
+
+        alias simd_width = simd_width_of[dtype]()
+
+        # TODO: use vectorize function.
+        for i in range(0, n, simd_width):
+            var remaining = min(simd_width, n - i)
+            if remaining == simd_width:
+                var a_simd = self_ptr.load[width=simd_width](i)
+                var b_simd = other_ptr.load[width=simd_width](i)
+                result_ptr.store[width=simd_width](i, a_simd * b_simd)
+            else:
+                # Handle remaining elements
+                for j in range(remaining):
+                    result_ptr[i + j] = self_ptr[i + j] * other_ptr[i + j]
+
+        return result^
+
+    fn _mul_gpu(self, other: Self) raises -> Self:
+        """GPU implementation of element-wise multiplication with optimized vectorized kernel.
+        """
+        var result = StaticMatrix[dtype, device]((self.shape[0], self.shape[1]))
+        var (grid_size, block_size) = calculate_1d_launch_params_for_dtype[
+            dtype
+        ](self.size)
+        var grid_dim = Dim(grid_size)
+        var block_dim = Dim(block_size)
+        var total_threads = grid_size * block_size
+
+        with DeviceContext() as ctx:
+            var a_buf = self._buf.get_device_buffer()
+            var b_buf = other._buf.get_device_buffer()
+            var out_buf = result._buf.get_device_buffer()
+
+            # Launch simple multiplication kernel
+            ctx.enqueue_function[matrix_mul_kernel_vectorized[dtype]](
+                out_buf.unsafe_ptr(),
+                a_buf.unsafe_ptr(),
+                b_buf.unsafe_ptr(),
+                self.size,
+                total_threads,
+                grid_dim=grid_dim,
+                block_dim=block_dim,
+            )
+
+            ctx.synchronize()
+
+        return result^
+
+    fn to[
+        target_device: Device
+    ](self) raises -> StaticMatrix[dtype, target_device]:
+        """Return a new matrix on the target device, copying data as needed (PyTorch-like .to).
+        """
+
+        # CPU -> CPU deep copy
+        @parameter
+        if device.type == "cpu" and target_device.type == "cpu":
+            raise Error("Redundant CPU to CPU copy attempted")
+        elif device.type == "gpu" and target_device.type == "gpu":
+            var out = StaticMatrix[dtype, target_device](
+                (self.shape[0], self.shape[1])
+            )
+            with DeviceContext() as ctx:
+                ctx.enqueue_copy(
+                    out._buf.get_device_buffer(), self._buf.get_device_buffer()
+                )
+                ctx.synchronize()
+            return out^
+        elif device.type == "cpu" and target_device.type == "gpu":
+            var out = StaticMatrix[dtype, target_device](
+                (self.shape[0], self.shape[1])
+            )
+            with DeviceContext() as ctx:
+                ctx.enqueue_copy(
+                    out._buf.get_device_buffer(), self._buf.get_host_ptr()
+                )
+                ctx.synchronize()
+            return out^
+        elif device.type == "gpu" and target_device.type == "cpu":
+            var out = StaticMatrix[dtype, target_device](
+                (self.shape[0], self.shape[1])
+            )
+            with DeviceContext() as ctx:
+                ctx.enqueue_copy(
+                    out._buf.get_host_ptr(), self._buf.get_device_buffer()
+                )
+                ctx.synchronize()
+            return out^
+
+        else:
+            raise Error(
+                "Unsupported device type conversion: "
+                + String(device)
+                + " to "
+                + String(target_device)
+            )
+
+    @always_inline
+    fn cpu(self) raises -> StaticMatrix[dtype, Device.CPU]:
+        return self.to[Device.CPU]()
+
+    @always_inline
+    fn cuda(self) raises -> StaticMatrix[dtype, Device.CUDA]:
+        return self.to[Device.CUDA]()
+
+    @always_inline
+    fn rocm(self) raises -> StaticMatrix[dtype, Device.ROCM]:
+        return self.to[Device.ROCM]()
+
+    @always_inline
+    fn metal(self) raises -> StaticMatrix[dtype, Device.MPS]:
+        return self.to[Device.MPS]()
+
+
+fn _add_cpu[
+    dtype: DType,
+    device: Device = Device.CPU,
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """CPU StaticMatrix addition using vectorized operations."""
+    var result = StaticMatrix[dtype, device]((a.shape[0], a.shape[1]))
+    var n = a.size
+    var a_ptr = a._buf.get_host_ptr()
+    var b_ptr = b._buf.get_host_ptr()
+    var result_ptr = result._buf.get_host_ptr()
+
+    alias simd_width = simd_width_of[dtype]()
+
+    for i in range(0, n, simd_width):
+        var remaining = min(simd_width, n - i)
+        if remaining == simd_width:
+            var a_simd = a_ptr.load[width=simd_width](i)
+            var b_simd = b_ptr.load[width=simd_width](i)
+            result_ptr.store[width=simd_width](i, a_simd + b_simd)
+        else:
+            for j in range(remaining):
+                result_ptr[i + j] = a_ptr[i + j] + b_ptr[i + j]
+
+    return result^
+
+
+fn _add_gpu[
+    dtype: DType,
+    device: Device = Device.GPU,
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """GPU StaticMatrix addition using optimized vectorized kernel."""
+    var result = StaticMatrix[dtype, device]((a.shape[0], a.shape[1]))
+    var total_elems = a.size
+    var (grid_size, block_size) = calculate_1d_launch_params_for_dtype[dtype](
+        total_elems
+    )
+    var total_threads = grid_size * block_size
+    var grid_dim = (grid_size, 1)
+    var block_dim_1d = (block_size, 1)
+
+    with DeviceContext() as ctx:
+        ctx.enqueue_function[matrix_add_kernel_vectorized[dtype]](
+            result._buf.get_device_ptr(),
+            a._buf.get_device_ptr(),
+            b._buf.get_device_ptr(),
+            total_elems,
+            total_threads,
+            grid_dim=grid_dim,
+            block_dim=block_dim_1d,
+        )
+
+    return result^
+
+
+fn _sub_cpu[
+    dtype: DType,
+    device: Device = Device.CPU,
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """CPU StaticMatrix addition using vectorized operations."""
+    var result = StaticMatrix[dtype, device]((a.shape[0], a.shape[1]))
+    var n = a.size
+    var a_ptr = a._buf.get_host_ptr()
+    var b_ptr = b._buf.get_host_ptr()
+    var result_ptr = result._buf.get_host_ptr()
+
+    alias simd_width = simd_width_of[dtype]()
+
+    for i in range(0, n, simd_width):
+        var remaining = min(simd_width, n - i)
+        if remaining == simd_width:
+            var a_simd = a_ptr.load[width=simd_width](i)
+            var b_simd = b_ptr.load[width=simd_width](i)
+            result_ptr.store[width=simd_width](i, a_simd - b_simd)
+        else:
+            for j in range(remaining):
+                result_ptr[i + j] = a_ptr[i + j] - b_ptr[i + j]
+
+    return result^
+
+
+fn _sub_gpu[
+    dtype: DType,
+    device: Device = Device.GPU,
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """GPU StaticMatrix subtraction using optimized vectorized kernel."""
+    var result = StaticMatrix[dtype, device]((a.shape[0], a.shape[1]))
+    var total_elems = a.size
+    var (grid_size, block_size) = calculate_1d_launch_params_for_dtype[dtype](
+        total_elems
+    )
+    var total_threads = grid_size * block_size
+    var grid_dim = (grid_size, 1)
+    var block_dim_1d = (block_size, 1)
+
+    with DeviceContext() as ctx:
+        ctx.enqueue_function[matrix_sub_kernel_vectorized[dtype]](
+            result._buf.get_device_ptr(),
+            a._buf.get_device_ptr(),
+            b._buf.get_device_ptr(),
+            total_elems,
+            total_threads,
+            grid_dim=grid_dim,
+            block_dim=block_dim_1d,
+        )
+
+    return result^
+
+
+fn add[
+    dtype: DType, device: Device
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """StaticMatrix addition with device-specific optimized kernels."""
+
+    @parameter
+    if device.type == "cpu":
+        return _add_cpu[dtype, device](a, b)
+    else:
+        return _add_gpu[dtype, device](a, b)
+
+
+fn sub[
+    dtype: DType, device: Device
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """StaticMatrix subtraction with device-specific optimized kernels."""
+
+    @parameter
+    if device.type == "cpu":
+        return _sub_cpu[dtype, device](a, b)
+    else:
+        return _sub_gpu[dtype, device](a, b)
+
+
+# ===----------------------------------------------------------------------=== #
+# GPU Matrix Multiplication Kernels
+# ===----------------------------------------------------------------------=== #
+fn matrix_matmul_kernel_naive[
+    dtype: DType
+](
+    output: UnsafePointer[Scalar[dtype]],
+    a: UnsafePointer[Scalar[dtype]],
+    b: UnsafePointer[Scalar[dtype]],
+    m: Int,
+    n: Int,
+    k: Int,
+):
+    """Naive GPU matrix multiplication kernel - O(n³) with no optimizations."""
+    var row: Int = block_dim.y * block_idx.y + thread_idx.y
+    var col: Int = block_dim.x * block_idx.x + thread_idx.x
+
+    if row < m and col < n:
+        var acc: Scalar[dtype] = 0
+
+        for ki in range(k):
+            acc += a[row * k + ki] * b[ki * n + col]
+
+        output[row * n + col] = acc
+
+
+fn matrix_matmul_kernel_tiled[
+    dtype: DType, tile_size: UInt
+](
+    output: UnsafePointer[Scalar[dtype]],
+    a: UnsafePointer[Scalar[dtype]],
+    b: UnsafePointer[Scalar[dtype]],
+    m: UInt,
+    n: UInt,
+    k: UInt,
+):
+    """Tiled GPU matrix multiplication kernel using shared memory tiles."""
+    var local_row = thread_idx.y
+    var local_col = thread_idx.x
+    var tiled_row: UInt = block_idx.y * tile_size + local_row
+    var tiled_col: UInt = block_idx.x * tile_size + local_col
+
+    # Allocate shared memory using stack_allocation
+    var a_shared = stack_allocation[
+        tile_size * tile_size,
+        Scalar[dtype],
+        address_space = AddressSpace.SHARED,
+    ]()
+    var b_shared = stack_allocation[
+        tile_size * tile_size,
+        Scalar[dtype],
+        address_space = AddressSpace.SHARED,
+    ]()
+
+    var acc: Scalar[dtype] = 0
+
+    # Iterate over tiles to compute matrix product
+    var num_tiles: UInt = (k + tile_size - 1) // tile_size
+    for tile in range(num_tiles):
+        # Load A tile - global row stays the same, col determined by tile
+        if tiled_row < m and (tile * tile_size + local_col) < k:
+            a_shared[local_row * tile_size + local_col] = a[
+                tiled_row * k + (tile * tile_size + local_col)
+            ]
+        else:
+            a_shared[local_row * tile_size + local_col] = 0
+
+        # Load B tile - row determined by tile, global col stays the same
+        if (tile * tile_size + local_row) < k and tiled_col < n:
+            b_shared[local_row * tile_size + local_col] = b[
+                (tile * tile_size + local_row) * n + tiled_col
+            ]
+        else:
+            b_shared[local_row * tile_size + local_col] = 0
+
+        barrier()
+
+        # Matrix multiplication within the tile
+        if tiled_row < m and tiled_col < n:
+            for ki in range(tile_size):
+                acc += (
+                    a_shared[local_row * tile_size + ki]
+                    * b_shared[ki * tile_size + local_col]
+                )
+
+        barrier()
+
+    # Write out final result
+    if tiled_row < m and tiled_col < n:
+        output[tiled_row * n + tiled_col] = acc
+
+
+# ===----------------------------------------------------------------------=== #
+# Matrix Multiplication Functions
+# ===----------------------------------------------------------------------=== #
+
+
+fn _matmul_cpu[
+    dtype: DType,
+    device: Device = Device.CPU,
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """CPU matrix multiplication using vectorized operations."""
+    var m = a.shape[0]
+    var k = a.shape[1]
+    var n = b.shape[1]
+
+    var result = StaticMatrix[dtype, device]((m, n))
+
+    var a_ptr = a._buf.get_host_ptr()
+    var b_ptr = b._buf.get_host_ptr()
+    var c_ptr = result._buf.get_host_ptr()
+
+    # Initialize result
+    for i in range(m * n):
+        c_ptr[i] = 0
+
+    alias simd_width = simd_width_of[dtype]()
+
+    # Optimized loops with vectorization
+    for i in range(m):
+        for ki in range(k):
+            var a_val = a_ptr[i * k + ki]
+            var a_broadcast = SIMD[dtype, simd_width](a_val)
+
+            var j = 0
+            while j + simd_width <= n:
+                var b_vec = b_ptr.load[width=simd_width](ki * n + j)
+                var c_vec = c_ptr.load[width=simd_width](i * n + j)
+                c_ptr.store[width=simd_width](
+                    i * n + j, c_vec + a_broadcast * b_vec
+                )
+                j += simd_width
+
+            while j < n:
+                c_ptr[i * n + j] += a_val * b_ptr[ki * n + j]
+                j += 1
+
+    return result^
+
+
+fn _matmul_gpu_naive[
+    dtype: DType,
+    device: Device = Device.GPU,
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """GPU matrix multiplication using naive kernel."""
+    var m = a.shape[0]
+    var k = a.shape[1]
+    var n = b.shape[1]
+
+    var result = StaticMatrix[dtype, device]((m, n))
+
+    # Calculate grid and block dimensions
+    alias TILE_SIZE = 16
+    var blocks_x = (n + TILE_SIZE - 1) // TILE_SIZE
+    var blocks_y = (m + TILE_SIZE - 1) // TILE_SIZE
+    var grid_dim_2d = (blocks_x, blocks_y)
+    var block_dim_2d = (TILE_SIZE, TILE_SIZE)
+
+    # Get device buffers as UnsafePointers
+    var a_ptr = a._buf.get_device_ptr()
+    var b_ptr = b._buf.get_device_ptr()
+    var result_ptr = result._buf.get_device_ptr()
+
+    with DeviceContext() as ctx:
+        ctx.enqueue_function[matrix_matmul_kernel_naive[dtype]](
+            result_ptr,
+            a_ptr,
+            b_ptr,
+            m,
+            n,
+            k,
+            grid_dim=grid_dim_2d,
+            block_dim=block_dim_2d,
+        )
+
+    return result^
+
+
+fn _matmul_gpu_tiled[
+    dtype: DType,
+    device: Device = Device.GPU,
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """GPU matrix multiplication using tiled kernel."""
+    var m = a.shape[0]
+    var k = a.shape[1]
+    var n = b.shape[1]
+
+    var result = StaticMatrix[dtype, device]((m, n))
+
+    # Calculate grid and block dimensions
+    alias TILE_SIZE = 32
+    var blocks_x = (n + TILE_SIZE - 1) // TILE_SIZE
+    var blocks_y = (m + TILE_SIZE - 1) // TILE_SIZE
+    var grid_dim_2d = (blocks_x, blocks_y)
+    var block_dim_2d = (TILE_SIZE, TILE_SIZE)
+
+    var a_ptr = a._buf.get_device_ptr()
+    var b_ptr = b._buf.get_device_ptr()
+    var result_ptr = result._buf.get_device_ptr()
+
+    with DeviceContext() as ctx:
+        ctx.enqueue_function[matrix_matmul_kernel_tiled[dtype, TILE_SIZE]](
+            result_ptr,
+            a_ptr,
+            b_ptr,
+            m,
+            n,
+            k,
+            grid_dim=grid_dim_2d,
+            block_dim=block_dim_2d,
+        )
+
+    return result^
+
+
+fn matmul[
+    dtype: DType, device: Device
+](
+    a: StaticMatrix[dtype, device], b: StaticMatrix[dtype, device]
+) raises -> StaticMatrix[dtype, device]:
+    """Matrix multiplication with device-specific optimized kernels."""
+
+    @parameter
+    if device.type == "cpu":
+        return _matmul_cpu[dtype, device](a, b)
+    else:
+        return _matmul_gpu_tiled[dtype, device](a, b)
diff --git a/numojo/core/utility.mojo b/numojo/core/utility.mojo
index e6c035e7..3e31904d 100644
--- a/numojo/core/utility.mojo
+++ b/numojo/core/utility.mojo
@@ -169,7 +169,7 @@ fn _transfer_offset(offset: Int, strides: NDArrayStrides) raises -> Int:
 
 
 fn _traverse_buffer_according_to_shape_and_strides(
-    mut ptr: UnsafePointer[Scalar[DType.index]],
+    mut ptr: UnsafePointer[Scalar[DType.int]],
     shape: NDArrayShape,
     strides: NDArrayStrides,
     current_dim: Int = 0,
@@ -194,7 +194,7 @@ fn _traverse_buffer_according_to_shape_and_strides(
     Example:
     ```console
     # A is a 2x3x4 array
-    var I = nm.NDArray[DType.index](nm.Shape(A.size))
+    var I = nm.NDArray[DType.int](nm.Shape(A.size))
     var ptr = I._buf
     _traverse_buffer_according_to_shape_and_strides(
         ptr, A.shape._flip(), A.strides._flip()
@@ -384,31 +384,163 @@ fn to_numpy[dtype: DType](array: NDArray[dtype]) raises -> PythonObject:
         for i in range(dimension):
             np_arr_dim.append(array.shape[i])
 
-        # Implement a dictionary for this later
+        # Implement a dictionary for this later.
+        # TODO: move to compile time checking.
         var numpyarray: PythonObject
         var np_dtype = np.float64
-        if dtype == DType.float16:
+        if dtype == DType.int8:
+            np_dtype = np.int8
+        elif dtype == DType.int16:
+            np_dtype = np.int16
+        elif dtype == DType.int32:
+            np_dtype = np.int32
+        elif dtype == DType.int64:
+            np_dtype = np.int64
+        elif dtype == DType.int128:
+            raise Error(
+                "NumPy has no native int128 type. Cannot convert"
+                " NDArray[DType.int128] to numpy array."
+            )
+        elif dtype == DType.int256:
+            raise Error(
+                "NumPy has no native int256 type. Cannot convert"
+                " NDArray[DType.int256] to numpy array."
+            )
+        elif dtype == DType.int:
+            np_dtype = np.intp
+        elif dtype == DType.uint8:
+            np_dtype = np.uint8
+        elif dtype == DType.uint16:
+            np_dtype = np.uint16
+        elif dtype == DType.uint32:
+            np_dtype = np.uint32
+        elif dtype == DType.uint64:
+            np_dtype = np.uint64
+        elif dtype == DType.uint128:
+            raise Error(
+                "NumPy has no native uint128 type. Cannot convert"
+                " NDArray[DType.uint128] to numpy array."
+            )
+        elif dtype == DType.uint256:
+            raise Error(
+                "NumPy has no native uint256 type. Cannot convert"
+                " NDArray[DType.uint256] to numpy array."
+            )
+        elif dtype == DType.uint:
+            np_dtype = np.uintp
+        elif dtype == DType.bfloat16:
+            raise Error(
+                "NumPy has no native bfloat16 type. Cannot convert"
+                " NDArray[DType.bfloat16] to numpy array."
+            )
+        elif dtype == DType.float16:
             np_dtype = np.float16
         elif dtype == DType.float32:
             np_dtype = np.float32
-        elif dtype == DType.int64:
-            np_dtype = np.int64
-        elif dtype == DType.int32:
-            np_dtype = np.int32
+        elif dtype == DType.float64:
+            np_dtype = np.float64
+        elif dtype == DType.bool:
+            np_dtype = np.bool_
+
+        var order = "C" if array.flags.C_CONTIGUOUS else "F"
+        numpyarray = np.empty(np_arr_dim, dtype=np_dtype, order=order)
+        var pointer_d = numpyarray.__array_interface__["data"][
+            0
+        ].unsafe_get_as_pointer[dtype]()
+        memcpy(dest=pointer_d, src=array.unsafe_ptr(), count=array.size)
+        _ = array
+
+        return numpyarray^
+
+    except e:
+        print("Error in converting to numpy", e)
+        return PythonObject()
+
+
+fn to_numpy[dtype: DType](array: Matrix[dtype]) raises -> PythonObject:
+    """
+    Convert a Matrix to a numpy matrix.
+
+    Example:
+    ```console
+    var arr = NDArray[DType.float32](3, 3, 3)
+    var np_arr = to_numpy(arr)
+    var np_arr1 = arr.to_numpy()
+    ```
+
+    Parameters:
+        dtype: The data type of the NDArray elements.
+
+    Args:
+        array: The NDArray to convert.
+
+    Returns:
+        The converted numpy array.
+    """
+    try:
+        var np = Python.import_module("numpy")
+
+        var np_arr_dim = Python.list()
+        np_arr_dim.append(array.shape[0])
+        np_arr_dim.append(array.shape[1])
+
+        np.set_printoptions(4)
+
+        # TODO: Add all the new types!
+        # Implement a dictionary for this later
+        var numpyarray: PythonObject
+        var np_dtype = np.float64
+        if dtype == DType.int8:
+            np_dtype = np.int8
         elif dtype == DType.int16:
             np_dtype = np.int16
-        elif dtype == DType.int8:
-            np_dtype = np.int8
-        elif dtype == DType.index:
+        elif dtype == DType.int32:
+            np_dtype = np.int32
+        elif dtype == DType.int64:
+            np_dtype = np.int64
+        elif dtype == DType.int128:
+            raise Error(
+                "NumPy has no native int128 type. Cannot convert"
+                " Matrix[DType.int128] to numpy array."
+            )
+        elif dtype == DType.int256:
+            raise Error(
+                "NumPy has no native int256 type. Cannot convert"
+                " Matrix[DType.int256] to numpy array."
+            )
+        elif dtype == DType.int:
             np_dtype = np.intp
-        elif dtype == DType.uint64:
-            np_dtype = np.uint64
-        elif dtype == DType.uint32:
-            np_dtype = np.uint32
-        elif dtype == DType.uint16:
-            np_dtype = np.uint16
         elif dtype == DType.uint8:
             np_dtype = np.uint8
+        elif dtype == DType.uint16:
+            np_dtype = np.uint16
+        elif dtype == DType.uint32:
+            np_dtype = np.uint32
+        elif dtype == DType.uint64:
+            np_dtype = np.uint64
+        elif dtype == DType.uint128:
+            raise Error(
+                "NumPy has no native uint128 type. Cannot convert"
+                " Matrix[DType.uint128] to numpy array."
+            )
+        elif dtype == DType.uint256:
+            raise Error(
+                "NumPy has no native uint256 type. Cannot convert"
+                " Matrix[DType.uint256] to numpy array."
+            )
+        elif dtype == DType.uint:
+            np_dtype = np.uintp
+        elif dtype == DType.bfloat16:
+            raise Error(
+                "NumPy has no native bfloat16 type. Cannot convert"
+                " Matrix[DType.bfloat16] to numpy array."
+            )
+        elif dtype == DType.float16:
+            np_dtype = np.float16
+        elif dtype == DType.float32:
+            np_dtype = np.float32
+        elif dtype == DType.float64:
+            np_dtype = np.float64
         elif dtype == DType.bool:
             np_dtype = np.bool_
 
@@ -417,8 +549,7 @@ fn to_numpy[dtype: DType](array: NDArray[dtype]) raises -> PythonObject:
         var pointer_d = numpyarray.__array_interface__["data"][
             0
         ].unsafe_get_as_pointer[dtype]()
-        memcpy(pointer_d, array.unsafe_ptr(), array.size)
-        _ = array
+        memcpy(dest=pointer_d, src=array._buf.ptr, count=array.size)
 
         return numpyarray^
 
diff --git a/numojo/prelude.mojo b/numojo/prelude.mojo
index f0254dd4..7edd1297 100644
--- a/numojo/prelude.mojo
+++ b/numojo/prelude.mojo
@@ -33,28 +33,39 @@ from numojo.core.complex.complex_dtype import (
     ci16,
     ci32,
     ci64,
-    cisize,
-    cintp,
+    ci128,
+    ci256,
+    cint,
     cu8,
     cu16,
     cu32,
     cu64,
+    cu128,
+    cu256,
+    cuint,
+    cbf16,
     cf16,
     cf32,
     cf64,
     cboolean,
+    cinvalid,
 )
 from numojo.core.datatypes import (
     i8,
     i16,
     i32,
     i64,
-    isize,
-    intp,
+    i128,
+    i256,
+    int,
     u8,
     u16,
     u32,
     u64,
+    u128,
+    u256,
+    uint,
+    bf16,
     f16,
     f32,
     f64,
diff --git a/numojo/routines/creation.mojo b/numojo/routines/creation.mojo
index edb34243..a3b06659 100644
--- a/numojo/routines/creation.mojo
+++ b/numojo/routines/creation.mojo
@@ -10,14 +10,11 @@ Array creation routine.
 # TODO (In order of priority)
 1) Implement axis argument for the NDArray creation functions
 2) Separate `array(object)` and `NDArray.__init__(shape)`.
-3) Use `Shapelike` trait to replace `NDArrayShape`, `List`, `VariadicList` and
-3) Use `Shapelike` trait to replace `NDArrayShape`, `List`, `VariadicList` and
-    reduce the number of function reloads.
+3) Use `Shapelike` trait to replace `NDArrayShape`, `List`, `VariadicList` and reduce the number of function reloads.
 4) Simplify complex overloads into sum of real methods.
 
 ---
 
-Use more uniformed way of calling functions, i.e., using one specific
 Use more uniformed way of calling functions, i.e., using one specific
 overload for each function. This makes maintenance easier. Example:
 
@@ -26,15 +23,10 @@ overload for each function. This makes maintenance easier. Example:
 - `zeros`, `ones` calls `full`.
 - Other functions calls `zeros`, `ones`, `full`.
 
-If overloads are needed, it is better to call the default signature in other
-overloads. Example: `zeros(shape: NDArrayShape)`. All other overloads call this
-If overloads are needed, it is better to call the default signature in other
-overloads. Example: `zeros(shape: NDArrayShape)`. All other overloads call this
-function. So it is easy for modification.
+If overloads are needed, it is better to call the default signature in other overloads. Example: `zeros(shape: NDArrayShape)`. All other overloads call this function. So it is easy for modification.
 
 """
 
-from algorithm import parallelize, vectorize
 from algorithm import parallelize, vectorize
 from builtin.math import pow
 from collections import Dict
@@ -64,23 +56,31 @@ fn arange[
     step: Scalar[dtype] = Scalar[dtype](1),
 ) raises -> NDArray[dtype]:
     """
-    Function that computes a series of values starting from "start" to "stop"
-    with given "step" size.
-
-    Raises:
-        Error if both dtype and dtype are integers or if dtype is a float and
-        dtype is an integer.
+    Generate evenly spaced values within a given interval.
 
     Parameters:
         dtype: Datatype of the output array.
 
     Args:
-        start: Scalar[dtype] - Start value.
-        stop: Scalar[dtype]  - End value.
-        step: Scalar[dtype]  - Step size between each element (default 1).
+        start: Start value (inclusive).
+        stop: End value (exclusive).
+        step: Step size between consecutive elements (default 1).
 
     Returns:
-        A NDArray of datatype `dtype` with elements ranging from `start` to `stop` incremented with `step`.
+        A NDArray of `dtype` with elements in range [start, stop) with step increments.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        # Basic usage
+        var arr = nm.arange[nm.f64](0.0, 10.0, 2.0)
+        print(arr)  # [0.0, 2.0, 4.0, 6.0, 8.0]
+
+        # With negative step
+        var arr2 = nm.arange[nm.f64](10.0, 0.0, -2.0)
+        print(arr2)  # [10.0, 8.0, 6.0, 4.0, 2.0]
+        ```
     """
     var num: Int = ((stop - start) / step).__int__()
     var result: NDArray[dtype] = NDArray[dtype](NDArrayShape(num))
@@ -94,7 +94,26 @@ fn arange[
     dtype: DType = DType.float64
 ](stop: Scalar[dtype]) raises -> NDArray[dtype]:
     """
-    (Overload) When start is 0 and step is 1.
+    Generate evenly spaced values from 0 to stop.
+
+    Overload with start=0 and step=1 for convenience.
+
+    Parameters:
+        dtype: Datatype of the output array.
+
+    Args:
+        stop: End value (exclusive).
+
+    Returns:
+        A NDArray of `dtype` with elements [0, 1, 2, ..., stop-1].
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        var arr = nm.arange[nm.f64](5.0)
+        print(arr)  # [0.0, 1.0, 2.0, 3.0, 4.0]
+        ```
     """
 
     var size: Int = Int(stop)  # TODO: handle negative values.
@@ -113,23 +132,28 @@ fn arange[
     step: ComplexSIMD[cdtype] = ComplexSIMD[cdtype](1, 1),
 ) raises -> ComplexNDArray[cdtype]:
     """
-    Function that computes a series of values starting from "start" to "stop"
-    with given "step" size.
-
-    Raises:
-        Error if both dtype and dtype are integers or if dtype is a float and
-        dtype is an integer.
+    Generate evenly spaced complex values within a given interval.
 
     Parameters:
         cdtype: Complex datatype of the output array.
 
     Args:
-        start: ComplexSIMD[cdtype] - Start value.
-        stop: ComplexSIMD[cdtype]  - End value.
-        step: ComplexSIMD[cdtype]  - Step size between each element (default 1).
+        start: Start value (inclusive).
+        stop: End value (exclusive).
+        step: Step size between consecutive elements (default 1+1j).
 
     Returns:
-        A ComplexNDArray of datatype `dtype` with elements ranging from `start` to `stop` incremented with `step`.
+        A ComplexNDArray of `cdtype` with elements in range [start, stop) with step increments.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        var start = nm.CScalar[nm.cf64](0.0, 0.0)
+        var stop = nm.CScalar[nm.cf64](5.0, 5.0)
+        var step = nm.CScalar[nm.cf64](1.0, 1.0)
+        var arr = nm.arange[nm.cf64](start, stop, step)
+        ```
     """
     var num_re: Int = ((stop.re - start.re) / step.re).__int__()
     var num_im: Int = ((stop.im - start.im) / step.im).__int__()
@@ -155,7 +179,18 @@ fn arange[
     cdtype: ComplexDType = ComplexDType.float64,
 ](stop: ComplexSIMD[cdtype]) raises -> ComplexNDArray[cdtype]:
     """
-    (Overload) When start is 0 and step is 1.
+    Generate evenly spaced complex values from 0 to stop.
+
+    Overload with start=0+0j and step=1+1j for convenience.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        stop: End value (exclusive).
+
+    Returns:
+        A ComplexNDArray of `cdtype` with elements from origin to stop.
     """
     var size_re = Int(stop.re)
     var size_im = Int(stop.im)
@@ -180,35 +215,49 @@ fn arange[
 # Linear Spacing NDArray Generation
 # ===------------------------------------------------------------------------===#
 fn linspace[
-    dtype: DType = DType.float64
+    dtype: DType = DType.float64,
+    parallel: Bool = False,
 ](
     start: Scalar[dtype],
     stop: Scalar[dtype],
     num: Int = 50,
     endpoint: Bool = True,
-    parallel: Bool = False,
 ) raises -> NDArray[dtype]:
     """
-    Function that computes a series of linearly spaced values starting from "start" to "stop" with given size. Wrapper function for _linspace_serial, _linspace_parallel.
-
-    Raises:
-        Error if dtype is an integer.
+    Generate evenly spaced numbers over a specified interval.
 
     Parameters:
-        dtype: Datatype of the output array.
+        dtype: Datatype of the output array (must be floating-point).
+        parallel: Whether to use parallelization for computation (default False).
 
     Args:
-        start: Start value.
-        stop: End value.
-        num: No of linearly spaced elements.
-        endpoint: Specifies whether to include endpoint in the final NDArray, defaults to True.
-        parallel: Specifies whether the linspace should be calculated using parallelization, deafults to False.
+        start: Starting value of the sequence.
+        stop: End value of the sequence.
+        num: Number of samples to generate (default 50).
+        endpoint: Whether to include `stop` in the result (default True).
 
     Returns:
-        A NDArray of datatype `dtype` with elements ranging from `start` to `stop` with num elements.
+        A NDArray of `dtype` with `num` evenly spaced elements between `start` and `stop`.
 
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        # Basic usage
+        var arr = nm.linspace[nm.f64](0.0, 10.0, 5)
+        print(arr)  # [0.0, 2.5, 5.0, 7.5, 10.0]
+
+        # Without endpoint
+        var arr2 = nm.linspace[nm.f64](0.0, 10.0, 5, endpoint=False)
+        print(arr2)  # [0.0, 2.0, 4.0, 6.0, 8.0]
+
+        # Parallel computation for large arrays
+        var large = nm.linspace[nm.f64](0.0, 1000.0, 10000, parallel=True)
+        ```
     """
     constrained[not dtype.is_integral()]()
+
+    @parameter
     if parallel:
         return _linspace_parallel[dtype](start, stop, num, endpoint)
     else:
@@ -300,34 +349,41 @@ fn _linspace_parallel[
 
 fn linspace[
     cdtype: ComplexDType = ComplexDType.float64,
+    parallel: Bool = False,
 ](
     start: ComplexSIMD[cdtype],
     stop: ComplexSIMD[cdtype],
     num: Int = 50,
     endpoint: Bool = True,
-    parallel: Bool = False,
 ) raises -> ComplexNDArray[cdtype]:
     """
-    Function that computes a series of linearly spaced values starting from "start" to "stop" with given size. Wrapper function for _linspace_serial, _linspace_parallel.
-
-    Raises:
-        Error if dtype is an integer.
+    Generate evenly spaced complex numbers over a specified interval.
 
     Parameters:
-        cdtype: Complex datatype of the output array.
+        cdtype: Complex datatype of the output array (must be floating-point).
+        parallel: Whether to use parallelization for computation (default False).
 
     Args:
-        start: Start value.
-        stop: End value.
-        num: No of linearly spaced elements.
-        endpoint: Specifies whether to include endpoint in the final ComplexNDArray, defaults to True.
-        parallel: Specifies whether the linspace should be calculated using parallelization, deafults to False.
+        start: Starting complex value of the sequence.
+        stop: End complex value of the sequence.
+        num: Number of samples to generate (default 50).
+        endpoint: Whether to include `stop` in the result (default True).
 
     Returns:
-        A ComplexNDArray of `dtype` with `num` linearly spaced elements between `start` and `stop`.
+        A ComplexNDArray of `cdtype` with `num` evenly spaced elements between `start` and `stop`.
+
+    Examples:
+        ```mojo
+        import numojo as nm
 
+        var start = nm.CScalar[nm.cf64](0.0, 0.0)
+        var stop = nm.CScalar[nm.cf64](10.0, 10.0)
+        var arr = nm.linspace[nm.cf64](start, stop, 5)
+        ```
     """
     constrained[not cdtype.is_integral()]()
+
+    @parameter
     if parallel:
         return _linspace_parallel[cdtype](start, stop, num, endpoint)
     else:
@@ -457,36 +513,50 @@ fn _linspace_parallel[
 # Logarithmic Spacing NDArray Generation
 # ===------------------------------------------------------------------------===#
 fn logspace[
-    dtype: DType = DType.float64
+    dtype: DType = DType.float64,
+    parallel: Bool = False,
 ](
     start: Scalar[dtype],
     stop: Scalar[dtype],
     num: Int,
     endpoint: Bool = True,
     base: Scalar[dtype] = 10.0,
-    parallel: Bool = False,
 ) raises -> NDArray[dtype]:
     """
-    Generate a logrithmic spaced NDArray of `num` elements between `start` and `stop`. Wrapper function for _logspace_serial, _logspace_parallel functions.
+    Generate logarithmically spaced numbers over a specified interval.
 
-    Raises:
-        Error if dtype is an integer.
+    The sequence starts at base^start and ends at base^stop.
 
     Parameters:
-        dtype: Datatype of the output array.
+        dtype: Datatype of the output array (must be floating-point).
+        parallel: Whether to use parallelization for computation (default False).
 
     Args:
-        start: The starting value of the NDArray.
-        stop: The ending value of the NDArray.
-        num: The number of elements in the NDArray.
-        endpoint: Whether to include the `stop` value in the NDArray. Defaults to True.
-        base: Base value of the logarithm, defaults to 10.
-        parallel: Specifies whether to calculate the logarithmic spaced values using parallelization.
+        start: Base^start is the starting value of the sequence.
+        stop: Base^stop is the final value of the sequence.
+        num: Number of samples to generate.
+        endpoint: Whether to include base^stop in the result (default True).
+        base: The base of the logarithm (default 10.0).
 
     Returns:
-    - A NDArray of `dtype` with `num` logarithmic spaced elements between `start` and `stop`.
+        A NDArray of `dtype` with `num` logarithmically spaced elements.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        # Logarithmic spacing from 10^0 to 10^3
+        var arr = nm.logspace[nm.f64](0.0, 3.0, 4)
+        print(arr)  # [1.0, 10.0, 100.0, 1000.0]
+
+        # Base 2 logarithmic spacing
+        var arr2 = nm.logspace[nm.f64](0.0, 4.0, 5, base=2.0)
+        print(arr2)  # [1.0, 2.0, 4.0, 8.0, 16.0]
+        ```
     """
     constrained[not dtype.is_integral()]()
+
+    @parameter
     if parallel:
         return _logspace_parallel[dtype](
             start,
@@ -593,33 +663,32 @@ fn _logspace_parallel[
 
 fn logspace[
     cdtype: ComplexDType = ComplexDType.float64,
+    parallel: Bool = False,
 ](
     start: ComplexSIMD[cdtype],
     stop: ComplexSIMD[cdtype],
     num: Int,
     endpoint: Bool = True,
     base: ComplexSIMD[cdtype] = ComplexSIMD[cdtype](10.0, 10.0),
-    parallel: Bool = False,
 ) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a logrithmic spaced ComplexNDArray of `num` elements between `start` and `stop`. Wrapper function for _logspace_serial, _logspace_parallel functions.
+    Generate logarithmically spaced complex numbers over a specified interval.
 
-    Raises:
-        Error if dtype is an integer.
+    The sequence starts at base^start and ends at base^stop.
 
     Parameters:
-        cdtype: Complex datatype of the output array.
+        cdtype: Complex datatype of the output array (must be floating-point).
+        parallel: Whether to use parallelization for computation (default False).
 
     Args:
-        start: The starting value of the ComplexNDArray.
-        stop: The ending value of the ComplexNDArray.
-        num: The number of elements in the ComplexNDArray.
-        endpoint: Whether to include the `stop` value in the ComplexNDArray. Defaults to True.
-        base: Base value of the logarithm, defaults to 10.
-        parallel: Specifies whether to calculate the logarithmic spaced values using parallelization.
+        start: Base^start is the starting complex value of the sequence.
+        stop: Base^stop is the final complex value of the sequence.
+        num: Number of samples to generate.
+        endpoint: Whether to include base^stop in the result (default True).
+        base: The complex base of the logarithm (default 10+10j).
 
     Returns:
-    - A ComplexNDArray of `dtype` with `num` logarithmic spaced elements between `start` and `stop`.
+        A ComplexNDArray of `cdtype` with `num` logarithmically spaced elements.
     """
     constrained[not cdtype.is_integral()]()
     if parallel:
@@ -776,22 +845,31 @@ fn geomspace[
     endpoint: Bool = True,
 ) raises -> NDArray[dtype]:
     """
-    Generate a NDArray of `num` elements between `start` and `stop` in a geometric series.
-
-    Raises:
-        Error if dtype is an integer.
+    Generate numbers spaced evenly on a log scale (geometric progression).
 
     Parameters:
-        dtype: Datatype of the input values.
+        dtype: Datatype of the output array (must be floating-point).
 
     Args:
-        start: The starting value of the NDArray.
-        stop: The ending value of the NDArray.
-        num: The number of elements in the NDArray.
-        endpoint: Whether to include the `stop` value in the NDArray. Defaults to True.
+        start: The starting value of the sequence.
+        stop: The final value of the sequence.
+        num: Number of samples to generate.
+        endpoint: Whether to include `stop` in the result (default True).
 
     Returns:
         A NDArray of `dtype` with `num` geometrically spaced elements between `start` and `stop`.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        # Geometric progression from 1 to 1000
+        var arr = nm.geomspace[nm.f64](1.0, 1000.0, 4)
+        print(arr)  # [1.0, 10.0, 100.0, 1000.0]
+        ```
+
+    Notes:
+        This is similar to logspace, but with endpoints specified directly.
     """
     constrained[
         not dtype.is_integral(), "Int type will result to precision errors."
@@ -826,22 +904,22 @@ fn geomspace[
     endpoint: Bool = True,
 ) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a ComplexNDArray of `num` elements between `start` and `stop` in a geometric series.
-
-    Raises:
-        Error if dtype is an integer.
+    Generate complex numbers spaced evenly on a log scale (geometric progression).
 
     Parameters:
-        cdtype: Complex datatype of the output array.
+        cdtype: Complex datatype of the output array (must be floating-point).
 
     Args:
-        start: The starting value of the ComplexNDArray.
-        stop: The ending value of the ComplexNDArray.
-        num: The number of elements in the ComplexNDArray.
-        endpoint: Whether to include the `stop` value in the ComplexNDArray. Defaults to True.
+        start: The starting complex value of the sequence.
+        stop: The final complex value of the sequence.
+        num: Number of samples to generate.
+        endpoint: Whether to include `stop` in the result (default True).
 
     Returns:
-        A ComplexNDArray of `dtype` with `num` geometrically spaced elements between `start` and `stop`.
+        A ComplexNDArray of `cdtype` with `num` geometrically spaced elements between `start` and `stop`.
+
+    Notes:
+        This is similar to logspace, but with endpoints specified directly.
     """
     constrained[
         not cdtype.is_integral(), "Int type will result to precision errors."
@@ -902,14 +980,40 @@ fn empty[
 fn empty[
     dtype: DType = DType.float64
 ](shape: List[Int]) raises -> NDArray[dtype]:
-    """Overload of function `empty` that reads a list of ints."""
+    """
+    Generate an empty NDArray from a list of integers.
+
+    Overload of `empty` that accepts a list of integers for the shape.
+
+    Parameters:
+        dtype: Datatype of the NDArray elements.
+
+    Args:
+        shape: Shape as a list of integers.
+
+    Returns:
+        A NDArray of `dtype` with given `shape`.
+    """
     return empty[dtype](shape=NDArrayShape(shape))
 
 
 fn empty[
     dtype: DType = DType.float64
 ](shape: VariadicList[Int]) raises -> NDArray[dtype]:
-    """Overload of function `empty` that reads a variadic list of ints."""
+    """
+    Generate an empty NDArray from variadic integer arguments.
+
+    Overload of `empty` that accepts variadic integers for the shape.
+
+    Parameters:
+        dtype: Datatype of the NDArray elements.
+
+    Args:
+        shape: Shape as variadic integers.
+
+    Returns:
+        A NDArray of `dtype` with given `shape`.
+    """
     return empty[dtype](shape=NDArrayShape(shape))
 
 
@@ -952,14 +1056,40 @@ fn empty[
 fn empty[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: List[Int]) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `empty` that reads a list of ints."""
+    """
+    Generate an empty ComplexNDArray from a list of integers.
+
+    Overload of `empty` that accepts a list of integers for the shape.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as a list of integers.
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape`.
+    """
     return empty[cdtype](shape=NDArrayShape(shape))
 
 
 fn empty[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: VariadicList[Int]) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `empty` that reads a variadic list of ints."""
+    """
+    Generate an empty ComplexNDArray from variadic integer arguments.
+
+    Overload of `empty` that accepts variadic integers for the shape.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as variadic integers.
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape`.
+    """
     return empty[cdtype](shape=NDArrayShape(shape))
 
 
@@ -994,6 +1124,16 @@ fn eye[dtype: DType = DType.float64](N: Int, M: Int) raises -> NDArray[dtype]:
 
     Returns:
         A NDArray of `dtype` with size N x M and ones on the diagonals.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        var arr = nm.eye[nm.f64](3, 4)
+        # [[1, 0, 0, 0],
+        #  [0, 1, 0, 0],
+        #  [0, 0, 1, 0]]
+        ```
     """
     var result: NDArray[dtype] = zeros[dtype](NDArrayShape(N, M))
     var one: Scalar[dtype] = Scalar[dtype](1)
@@ -1016,7 +1156,7 @@ fn eye[
         M: Number of columns in the matrix.
 
     Returns:
-        A ComplexNDArray of `dtype` with size N x M and ones on the diagonals.
+        A ComplexNDArray of `cdtype` with size N x M and ones on the diagonals.
     """
     var result: ComplexNDArray[cdtype] = zeros[cdtype](NDArrayShape(N, M))
     var one: ComplexSIMD[cdtype] = ComplexSIMD[cdtype](1, 1)
@@ -1033,10 +1173,20 @@ fn identity[dtype: DType = DType.float64](N: Int) raises -> NDArray[dtype]:
         dtype: Datatype of the NDArray elements.
 
     Args:
-        N: Size of the matrix.
+        N: Size of the square matrix.
 
     Returns:
         A NDArray of `dtype` with size N x N and ones on the diagonals.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        var I = nm.identity[nm.f64](3)
+        # [[1, 0, 0],
+        #  [0, 1, 0],
+        #  [0, 0, 1]]
+        ```
     """
     var result: NDArray[dtype] = zeros[dtype](NDArrayShape(N, N))
     var one: Scalar[dtype] = Scalar[dtype](1)
@@ -1049,16 +1199,16 @@ fn identity[
     cdtype: ComplexDType = ComplexDType.float64,
 ](N: Int) raises -> ComplexNDArray[cdtype]:
     """
-    Generate an Complex identity matrix of size N x N.
+    Generate a complex identity matrix of size N x N.
 
     Parameters:
         cdtype: Complex datatype of the output array.
 
     Args:
-        N: Size of the matrix.
+        N: Size of the square matrix.
 
     Returns:
-        A ComplexNDArray of `dtype` with size N x N and ones on the diagonals.
+        A ComplexNDArray of `cdtype` with size N x N and ones on the diagonals.
     """
     var result: ComplexNDArray[cdtype] = zeros[cdtype](NDArrayShape(N, N))
     var one: ComplexSIMD[cdtype] = ComplexSIMD[cdtype](1, 1)
@@ -1071,9 +1221,7 @@ fn ones[
     dtype: DType = DType.float64
 ](shape: NDArrayShape) raises -> NDArray[dtype]:
     """
-    Generate a NDArray of ones with given shape filled with ones.
-
-    It calls the function `full`.
+    Generate a NDArray filled with ones.
 
     Parameters:
         dtype: Datatype of the NDArray.
@@ -1082,7 +1230,16 @@ fn ones[
         shape: Shape of the NDArray.
 
     Returns:
-        A NDArray of `dtype` with given `shape`.
+        A NDArray of `dtype` with given `shape` filled with ones.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        var arr = nm.ones[nm.f64](Shape(2, 3))
+        # [[1, 1, 1],
+        #  [1, 1, 1]]
+        ```
     """
     return full[dtype](shape=shape, fill_value=1)
 
@@ -1090,14 +1247,36 @@ fn ones[
 fn ones[
     dtype: DType = DType.float64
 ](shape: List[Int]) raises -> NDArray[dtype]:
-    """Overload of function `ones` that reads a list of ints."""
+    """
+    Generate a NDArray filled with ones from a list of integers.
+
+    Parameters:
+        dtype: Datatype of the NDArray.
+
+    Args:
+        shape: Shape as a list of integers.
+
+    Returns:
+        A NDArray of `dtype` with given `shape` filled with ones.
+    """
     return ones[dtype](shape=NDArrayShape(shape))
 
 
 fn ones[
     dtype: DType = DType.float64
 ](shape: VariadicList[Int]) raises -> NDArray[dtype]:
-    """Overload of function `ones` that reads a variadic of ints."""
+    """
+    Generate a NDArray filled with ones from variadic integer arguments.
+
+    Parameters:
+        dtype: Datatype of the NDArray.
+
+    Args:
+        shape: Shape as variadic integers.
+
+    Returns:
+        A NDArray of `dtype` with given `shape` filled with ones.
+    """
     return ones[dtype](shape=NDArrayShape(shape))
 
 
@@ -1123,9 +1302,7 @@ fn ones[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: NDArrayShape) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a ComplexNDArray of ones with given shape filled with ones.
-
-    It calls the function `full`.
+    Generate a ComplexNDArray filled with ones.
 
     Parameters:
         cdtype: Complex datatype of the output array.
@@ -1134,7 +1311,7 @@ fn ones[
         shape: Shape of the ComplexNDArray.
 
     Returns:
-        A ComplexNDArray of `dtype` with given `shape`.
+        A ComplexNDArray of `cdtype` with given `shape` filled with ones.
     """
     return full[cdtype](shape=shape, fill_value=ComplexSIMD[cdtype](1, 1))
 
@@ -1142,14 +1319,36 @@ fn ones[
 fn ones[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: List[Int]) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `ones` that reads a list of ints."""
+    """
+    Generate a ComplexNDArray filled with ones from a list of integers.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as a list of integers.
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape` filled with ones.
+    """
     return ones[cdtype](shape=NDArrayShape(shape))
 
 
 fn ones[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: VariadicList[Int]) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `ones` that reads a variadic of ints."""
+    """
+    Generate a ComplexNDArray filled with ones from variadic integer arguments.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as variadic integers.
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape` filled with ones.
+    """
     return ones[cdtype](shape=NDArrayShape(shape))
 
 
@@ -1157,7 +1356,7 @@ fn ones_like[
     cdtype: ComplexDType = ComplexDType.float64,
 ](array: ComplexNDArray[cdtype]) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a ComplexNDArray of the same shape as `a` filled with ones.
+    Generate a ComplexNDArray of the same shape as `array` filled with ones.
 
     Parameters:
         cdtype: Complex datatype of the output array.
@@ -1166,7 +1365,7 @@ fn ones_like[
         array: ComplexNDArray to be used as a reference for the shape.
 
     Returns:
-        A ComplexNDArray of `dtype` with the same shape as `a` filled with ones.
+        A ComplexNDArray of `cdtype` with the same shape as `array` filled with ones.
     """
     return full[cdtype](shape=array.shape, fill_value=ComplexSIMD[cdtype](1, 1))
 
@@ -1175,9 +1374,7 @@ fn zeros[
     dtype: DType = DType.float64
 ](shape: NDArrayShape) raises -> NDArray[dtype]:
     """
-    Generate a NDArray of zeros with given shape.
-
-    It calls the function `full`.
+    Generate a NDArray filled with zeros.
 
     Parameters:
         dtype: Datatype of the NDArray elements.
@@ -1186,9 +1383,16 @@ fn zeros[
         shape: Shape of the NDArray.
 
     Returns:
-        A NDArray of `dtype` with given `shape`.
+        A NDArray of `dtype` with given `shape` filled with zeros.
 
-    of NDArray.
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        var arr = nm.zeros[nm.f64](Shape(2, 3))
+        # [[0, 0, 0],
+        #  [0, 0, 0]]
+        ```
     """
     return full[dtype](shape=shape, fill_value=0)
 
@@ -1196,14 +1400,36 @@ fn zeros[
 fn zeros[
     dtype: DType = DType.float64
 ](shape: List[Int]) raises -> NDArray[dtype]:
-    """Overload of function `zeros` that reads a list of ints."""
+    """
+    Generate a NDArray filled with zeros from a list of integers.
+
+    Parameters:
+        dtype: Datatype of the NDArray elements.
+
+    Args:
+        shape: Shape as a list of integers.
+
+    Returns:
+        A NDArray of `dtype` with given `shape` filled with zeros.
+    """
     return zeros[dtype](shape=NDArrayShape(shape))
 
 
 fn zeros[
     dtype: DType = DType.float64
 ](shape: VariadicList[Int]) raises -> NDArray[dtype]:
-    """Overload of function `zeros` that reads a variadic list of ints."""
+    """
+    Generate a NDArray filled with zeros from variadic integer arguments.
+
+    Parameters:
+        dtype: Datatype of the NDArray elements.
+
+    Args:
+        shape: Shape as variadic integers.
+
+    Returns:
+        A NDArray of `dtype` with given `shape` filled with zeros.
+    """
     return zeros[dtype](shape=NDArrayShape(shape))
 
 
@@ -1211,7 +1437,7 @@ fn zeros_like[
     dtype: DType = DType.float64
 ](array: NDArray[dtype]) raises -> NDArray[dtype]:
     """
-    Generate a NDArray of the same shape as `a` filled with zeros.
+    Generate a NDArray of the same shape as `array` filled with zeros.
 
     Parameters:
         dtype: Datatype of the NDArray elements.
@@ -1220,7 +1446,7 @@ fn zeros_like[
         array: NDArray to be used as a reference for the shape.
 
     Returns:
-        A NDArray of `dtype` with the same shape as `a` filled with zeros.
+        A NDArray of `dtype` with the same shape as `array` filled with zeros.
     """
     return full[dtype](shape=array.shape, fill_value=0)
 
@@ -1229,9 +1455,7 @@ fn zeros[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: NDArrayShape) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a ComplexNDArray of zeros with given shape.
-
-    It calls the function `full` with `fill_value` set to `ComplexSIMD[cdtype](0, 0)`.
+    Generate a ComplexNDArray filled with zeros.
 
     Parameters:
         cdtype: Complex datatype of the output array.
@@ -1240,8 +1464,7 @@ fn zeros[
         shape: Shape of the ComplexNDArray.
 
     Returns:
-        A ComplexNDArray of `dtype` with given `shape`.
-
+        A ComplexNDArray of `cdtype` with given `shape` filled with zeros.
     """
     return full[cdtype](shape=shape, fill_value=ComplexSIMD[cdtype](0, 0))
 
@@ -1249,14 +1472,36 @@ fn zeros[
 fn zeros[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: List[Int]) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `zeros` that reads a list of ints."""
+    """
+    Generate a ComplexNDArray filled with zeros from a list of integers.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as a list of integers.
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape` filled with zeros.
+    """
     return zeros[cdtype](shape=NDArrayShape(shape))
 
 
 fn zeros[
     cdtype: ComplexDType = ComplexDType.float64,
 ](shape: VariadicList[Int]) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `zeros` that reads a variadic list of ints."""
+    """
+    Generate a ComplexNDArray filled with zeros from variadic integer arguments.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as variadic integers.
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape` filled with zeros.
+    """
     return zeros[cdtype](shape=NDArrayShape(shape))
 
 
@@ -1264,7 +1509,7 @@ fn zeros_like[
     cdtype: ComplexDType = ComplexDType.float64,
 ](array: ComplexNDArray[cdtype]) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a ComplexNDArray of the same shape as `a` filled with zeros.
+    Generate a ComplexNDArray of the same shape as `array` filled with zeros.
 
     Parameters:
         cdtype: Complex datatype of the output array.
@@ -1273,7 +1518,7 @@ fn zeros_like[
         array: ComplexNDArray to be used as a reference for the shape.
 
     Returns:
-        A ComplexNDArray of `dtype` with the same shape as `a` filled with zeros.
+        A ComplexNDArray of `cdtype` with the same shape as `array` filled with zeros.
     """
     return full[cdtype](shape=array.shape, fill_value=ComplexSIMD[cdtype](0, 0))
 
@@ -1283,18 +1528,27 @@ fn full[
 ](
     shape: NDArrayShape, fill_value: Scalar[dtype], order: String = "C"
 ) raises -> NDArray[dtype]:
-    """Initialize an NDArray of certain shape fill it with a given value.
+    """
+    Create a NDArray filled with a specified value.
+
+    Parameters:
+        dtype: Datatype of the NDArray elements.
 
     Args:
         shape: Shape of the array.
-        fill_value: Set all the values to this.
-        order: Memory order C or F.
+        fill_value: Value to fill all elements with.
+        order: Memory layout order ('C' for row-major or 'F' for column-major).
 
-    Example:
+    Returns:
+        A NDArray of `dtype` with given `shape` filled with `fill_value`.
+
+    Examples:
         ```mojo
         import numojo as nm
-        from numojo.prelude import *
-        var a = nm.full(Shape(2,3,4), fill_value=10)
+
+        var arr = nm.full[nm.f64](Shape(2, 3), fill_value=7.0)
+        # [[7, 7, 7],
+        #  [7, 7, 7]]
         ```
     """
 
@@ -1309,7 +1563,20 @@ fn full[
 ](
     shape: List[Int], fill_value: Scalar[dtype], order: String = "C"
 ) raises -> NDArray[dtype]:
-    """Overload of function `full` that reads a list of ints."""
+    """
+    Create a NDArray filled with a specified value from a list of integers.
+
+    Parameters:
+        dtype: Datatype of the NDArray elements.
+
+    Args:
+        shape: Shape as a list of integers.
+        fill_value: Value to fill all elements with.
+        order: Memory layout order ('C' or 'F').
+
+    Returns:
+        A NDArray of `dtype` with given `shape` filled with `fill_value`.
+    """
     return full[dtype](
         shape=NDArrayShape(shape), fill_value=fill_value, order=order
     )
@@ -1320,7 +1587,20 @@ fn full[
 ](
     shape: VariadicList[Int], fill_value: Scalar[dtype], order: String = "C"
 ) raises -> NDArray[dtype]:
-    """Overload of function `full` that reads a variadic list of ints."""
+    """
+    Create a NDArray filled with a specified value from variadic integer arguments.
+
+    Parameters:
+        dtype: Datatype of the NDArray elements.
+
+    Args:
+        shape: Shape as variadic integers.
+        fill_value: Value to fill all elements with.
+        order: Memory layout order ('C' or 'F').
+
+    Returns:
+        A NDArray of `dtype` with given `shape` filled with `fill_value`.
+    """
     return full[dtype](
         shape=NDArrayShape(shape), fill_value=fill_value, order=order
     )
@@ -1332,7 +1612,7 @@ fn full_like[
     array: NDArray[dtype], fill_value: Scalar[dtype], order: String = "C"
 ) raises -> NDArray[dtype]:
     """
-    Generate a NDArray of the same shape as `a` filled with `fill_value`.
+    Generate a NDArray of the same shape as `array` filled with `fill_value`.
 
     Parameters:
         dtype: Datatype of the NDArray elements.
@@ -1340,10 +1620,10 @@ fn full_like[
     Args:
         array: NDArray to be used as a reference for the shape.
         fill_value: Value to fill the NDArray with.
-        order: Memory order C or F.
+        order: Memory layout order ('C' or 'F').
 
     Returns:
-        A NDArray of `dtype` with the same shape as `a` filled with `fill_value`.
+        A NDArray of `dtype` with the same shape as `array` filled with `fill_value`.
     """
     return full[dtype](shape=array.shape, fill_value=fill_value, order=order)
 
@@ -1355,21 +1635,26 @@ fn full[
     fill_value: ComplexSIMD[cdtype],
     order: String = "C",
 ) raises -> ComplexNDArray[cdtype]:
-    """Initialize an ComplexNDArray of certain shape fill it with a given value.
+    """
+    Create a ComplexNDArray filled with a specified complex value.
 
     Parameters:
         cdtype: Complex datatype of the output array.
 
     Args:
         shape: Shape of the ComplexNDArray.
-        fill_value: Set all the values to this.
-        order: Memory order C or F.
+        fill_value: Complex value to fill all elements with.
+        order: Memory layout order ('C' for row-major or 'F' for column-major).
 
-    Example:
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape` filled with `fill_value`.
+
+    Examples:
         ```mojo
         import numojo as nm
-        from numojo.prelude import *
-        var a = nm.full[nm.cf32](Shape(2,3,4), fill_value=CScalar[nm.cf32](10, 10))
+
+        var val = nm.CScalar[nm.cf64](3.0, 4.0)
+        var arr = nm.full[nm.cf64](Shape(2, 2), fill_value=val)
         ```
     """
     var A = ComplexNDArray[cdtype](shape=shape, order=order)
@@ -1386,7 +1671,20 @@ fn full[
     fill_value: ComplexSIMD[cdtype],
     order: String = "C",
 ) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `full` that reads a list of ints."""
+    """
+    Create a ComplexNDArray filled with a specified value from a list of integers.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as a list of integers.
+        fill_value: Complex value to fill all elements with.
+        order: Memory layout order ('C' or 'F').
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape` filled with `fill_value`.
+    """
     return full[cdtype](
         shape=NDArrayShape(shape), fill_value=fill_value, order=order
     )
@@ -1399,7 +1697,20 @@ fn full[
     fill_value: ComplexSIMD[cdtype],
     order: String = "C",
 ) raises -> ComplexNDArray[cdtype]:
-    """Overload of function `full` that reads a variadic list of ints."""
+    """
+    Create a ComplexNDArray filled with a specified value from variadic integer arguments.
+
+    Parameters:
+        cdtype: Complex datatype of the output array.
+
+    Args:
+        shape: Shape as variadic integers.
+        fill_value: Complex value to fill all elements with.
+        order: Memory layout order ('C' or 'F').
+
+    Returns:
+        A ComplexNDArray of `cdtype` with given `shape` filled with `fill_value`.
+    """
     return full[cdtype](
         shape=NDArrayShape(shape), fill_value=fill_value, order=order
     )
@@ -1413,18 +1724,18 @@ fn full_like[
     order: String = "C",
 ) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a ComplexNDArray of the same shape as `a` filled with `fill_value`.
+    Generate a ComplexNDArray of the same shape as `array` filled with `fill_value`.
 
     Parameters:
         cdtype: Complex datatype of the output array.
 
     Args:
         array: ComplexNDArray to be used as a reference for the shape.
-        fill_value: Value to fill the ComplexNDArray with.
-        order: Memory order C or F.
+        fill_value: Complex value to fill the ComplexNDArray with.
+        order: Memory layout order ('C' or 'F').
 
     Returns:
-        A ComplexNDArray of `dtype` with the same shape as `a` filled with `fill_value`.
+        A ComplexNDArray of `cdtype` with the same shape as `array` filled with `fill_value`.
     """
     return full[cdtype](shape=array.shape, fill_value=fill_value, order=order)
 
@@ -1442,11 +1753,29 @@ fn diag[
         dtype: Datatype of the NDArray elements.
 
     Args:
-        v: NDArray to extract the diagonal from.
-        k: Diagonal offset.
+        v: If 1-D, creates a 2-D array with v on the diagonal. If 2-D, extracts the diagonal.
+        k: Diagonal offset (0 for main diagonal, positive for upper, negative for lower).
 
     Returns:
-        A 1-D NDArray with the diagonal of the input NDArray.
+        If v is 1-D: A 2-D NDArray with v on the k-th diagonal.
+        If v is 2-D: A 1-D NDArray containing the k-th diagonal.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        # Create diagonal matrix from 1-D array
+        var v = nm.arange[nm.f64](3.0)
+        var diag_mat = nm.diag[nm.f64](v)
+        # [[0, 0, 0],
+        #  [0, 1, 0],
+        #  [0, 0, 2]]
+
+        # Extract diagonal from 2-D array
+        var mat = nm.ones[nm.f64](Shape(3, 3))
+        var d = nm.diag[nm.f64](mat)
+        # [1, 1, 1]
+        ```
     """
     if v.ndim == 1:
         var n: Int = v.size
@@ -1490,11 +1819,12 @@ fn diag[
         cdtype: Complex datatype of the output array.
 
     Args:
-        v: ComplexNDArray to extract the diagonal from.
-        k: Diagonal offset.
+        v: If 1-D, creates a 2-D array with v on the diagonal. If 2-D, extracts the diagonal.
+        k: Diagonal offset (0 for main diagonal, positive for upper, negative for lower).
 
     Returns:
-        A 1-D ComplexNDArray with the diagonal of the input ComplexNDArray.
+        If v is 1-D: A 2-D ComplexNDArray with v on the k-th diagonal.
+        If v is 2-D: A 1-D ComplexNDArray containing the k-th diagonal.
     """
     return ComplexNDArray[cdtype](
         re=diag[cdtype._dtype](v._re, k),
@@ -1506,17 +1836,29 @@ fn diagflat[
     dtype: DType = DType.float64
 ](v: NDArray[dtype], k: Int = 0) raises -> NDArray[dtype]:
     """
-    Generate a 2-D NDArray with the flattened input as the diagonal.
+    Create a 2-D array with the flattened input as the diagonal.
 
     Parameters:
         dtype: Datatype of the NDArray elements.
 
     Args:
-        v: NDArray to be flattened and used as the diagonal.
-        k: Diagonal offset.
+        v: NDArray to be flattened and used as the diagonal (any shape).
+        k: Diagonal offset (0 for main diagonal, positive for upper, negative for lower).
 
     Returns:
-        A 2-D NDArray with the flattened input as the diagonal.
+        A 2-D NDArray with the flattened input as the k-th diagonal.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        var v = nm.arange[nm.f64](4.0).reshape(Shape(2, 2))  # 2x2 array
+        var d = nm.diagflat[nm.f64](v)  # Flattens to [0,1,2,3] then creates diagonal
+        # [[0, 0, 0, 0],
+        #  [0, 1, 0, 0],
+        #  [0, 0, 2, 0],
+        #  [0, 0, 0, 3]]
+        ```
     """
     var n: Int = v.size
     var result: NDArray[dtype] = zeros[dtype](
@@ -1538,17 +1880,17 @@ fn diagflat[
     cdtype: ComplexDType = ComplexDType.float64,
 ](v: ComplexNDArray[cdtype], k: Int = 0) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a 2-D ComplexNDArray with the flattened input as the diagonal.
+    Create a 2-D complex array with the flattened input as the diagonal.
 
     Parameters:
         cdtype: Complex datatype of the output array.
 
     Args:
-        v: ComplexNDArray to be flattened and used as the diagonal.
-        k: Diagonal offset.
+        v: ComplexNDArray to be flattened and used as the diagonal (any shape).
+        k: Diagonal offset (0 for main diagonal, positive for upper, negative for lower).
 
     Returns:
-        A 2-D ComplexNDArray with the flattened input as the diagonal.
+        A 2-D ComplexNDArray with the flattened input as the k-th diagonal.
     """
     return ComplexNDArray[cdtype](
         re=diagflat[cdtype._dtype](v._re, k),
@@ -1560,7 +1902,9 @@ fn tri[
     dtype: DType = DType.float64
 ](N: Int, M: Int, k: Int = 0) raises -> NDArray[dtype]:
     """
-    Generate a 2-D NDArray with ones on and below the k-th diagonal.
+    Generate a lower triangular matrix.
+
+    Creates an array with ones on and below the k-th diagonal, zeros elsewhere.
 
     Parameters:
         dtype: Datatype of the NDArray elements.
@@ -1568,10 +1912,27 @@ fn tri[
     Args:
         N: Number of rows in the matrix.
         M: Number of columns in the matrix.
-        k: Diagonal offset.
+        k: Diagonal offset (0 for main diagonal, positive shifts right, negative shifts left).
 
     Returns:
-        A 2-D NDArray with ones on and below the k-th diagonal.
+        A 2-D NDArray of shape (N, M) with ones on and below the k-th diagonal.
+
+    Examples:
+        ```mojo
+        import numojo as nm
+
+        # Lower triangular matrix
+        var L = nm.tri[nm.f64](3, 3)
+        # [[1, 0, 0],
+        #  [1, 1, 0],
+        #  [1, 1, 1]]
+
+        # With offset
+        var L2 = nm.tri[nm.f64](3, 3, k=1)
+        # [[1, 1, 0],
+        #  [1, 1, 1],
+        #  [1, 1, 1]]
+        ```
     """
     var result: NDArray[dtype] = zeros[dtype](NDArrayShape(N, M))
     for i in range(N):
@@ -1585,7 +1946,9 @@ fn tri[
     cdtype: ComplexDType = ComplexDType.float64,
 ](N: Int, M: Int, k: Int = 0) raises -> ComplexNDArray[cdtype]:
     """
-    Generate a 2-D ComplexNDArray with ones on and below the k-th diagonal.
+    Generate a lower triangular complex matrix.
+
+    Creates a complex array with ones on and below the k-th diagonal, zeros elsewhere.
 
     Parameters:
         cdtype: Complex datatype of the output array.
@@ -1593,10 +1956,10 @@ fn tri[
     Args:
         N: Number of rows in the matrix.
         M: Number of columns in the matrix.
-        k: Diagonal offset.
+        k: Diagonal offset (0 for main diagonal, positive shifts right, negative shifts left).
 
     Returns:
-        A 2-D ComplexNDArray with ones on and below the k-th diagonal.
+        A 2-D ComplexNDArray of shape (N, M) with ones on and below the k-th diagonal.
     """
     return ComplexNDArray[cdtype](
         re=tri[cdtype._dtype](N, M, k),
@@ -2193,7 +2556,7 @@ fn array[
         np_dtype = np.int16
     elif dtype == DType.int8:
         np_dtype = np.int8
-    elif dtype == DType.index:
+    elif dtype == DType.int:
         np_dtype = np.intp
     elif dtype == DType.uint64:
         np_dtype = np.uint64
@@ -2212,7 +2575,7 @@ fn array[
         dtype
     ]()
     var A: NDArray[dtype] = NDArray[dtype](array_shape, order)
-    memcpy[Scalar[dtype]](A._buf.ptr, pointer, A.size)
+    memcpy[Scalar[dtype]](dest=A._buf.ptr, src=pointer, count=A.size)
     return A^
 
 
@@ -2271,7 +2634,7 @@ fn array[
         np_dtype = np.int16
     elif dtype == DType.int8:
         np_dtype = np.int8
-    elif dtype == DType.index:
+    elif dtype == DType.int:
         np_dtype = np.intp
     elif dtype == DType.uint64:
         np_dtype = np.uint64
@@ -2294,8 +2657,10 @@ fn array[
         0
     ].unsafe_get_as_pointer[dtype]()
     var A: ComplexNDArray[cdtype] = ComplexNDArray[cdtype](array_shape, order)
-    memcpy[Scalar[dtype]](A._re._buf.ptr, pointer, A._re.size)
-    memcpy[Scalar[dtype]](A._im._buf.ptr, pointer_imag, A._im.size)
+    memcpy[Scalar[dtype]](dest=A._re._buf.ptr, src=pointer, count=A._re.size)
+    memcpy[Scalar[dtype]](
+        dest=A._im._buf.ptr, src=pointer_imag, count=A._im.size
+    )
     return A^
 
 
diff --git a/numojo/routines/functional.mojo b/numojo/routines/functional.mojo
index b5eb8ac5..cafdbc60 100644
--- a/numojo/routines/functional.mojo
+++ b/numojo/routines/functional.mojo
@@ -82,12 +82,12 @@ fn apply_along_axis[
 fn apply_along_axis[
     dtype: DType,
     func1d: fn[dtype_func: DType] (NDArray[dtype_func]) raises -> Scalar[
-        DType.index
+        DType.int
     ],
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """
     Applies a function to a NDArray by axis and reduce that dimension.
-    The returned data type is DType.index.
+    The returned data type is DType.int.
     When the array is 1-d, the returned array will be a 0-d array.
 
     Parameters:
@@ -105,14 +105,14 @@ fn apply_along_axis[
     # The iterator along the axis
     var iterator = a.iter_along_axis(axis=axis)
     # The final output array will have 1 less dimension than the input array
-    var res: NDArray[DType.index]
+    var res: NDArray[DType.int]
 
     if a.ndim == 1:
-        res = numojo.creation._0darray[DType.index](0)
+        res = numojo.creation._0darray[DType.int](0)
         (res._buf.ptr).init_pointee_copy(func1d[dtype](a))
 
     else:
-        res = NDArray[DType.index](a.shape._pop(axis=axis))
+        res = NDArray[DType.int](a.shape._pop(axis=axis))
 
         @parameter
         fn parallelized_func(i: Int):
@@ -221,9 +221,9 @@ fn apply_along_axis[
             try:
                 var elements: NDArray[dtype] = func1d[dtype](iterator.ith(i))
                 memcpy(
-                    result._buf.ptr + i * elements.size,
-                    elements._buf.ptr,
-                    elements.size,
+                    dest=result._buf.ptr + i * elements.size,
+                    src=elements._buf.ptr,
+                    count=elements.size,
                 )
             except e:
                 print("Error in parallelized_func", e)
@@ -236,7 +236,7 @@ fn apply_along_axis[
         fn parallelized_func(i: Int):
             try:
                 # The indices of the input array in each iteration
-                var indices: NDArray[DType.index]
+                var indices: NDArray[DType.int]
                 # The elements of the input array in each iteration
                 var elements: NDArray[dtype]
                 # The array after applied the function
@@ -293,9 +293,9 @@ fn apply_along_axis[
                 var elements: NDArray[dtype] = iterator.ith(i)
                 func1d[dtype](elements)
                 memcpy(
-                    a._buf.ptr + i * elements.size,
-                    elements._buf.ptr,
-                    elements.size,
+                    dest=a._buf.ptr + i * elements.size,
+                    src=elements._buf.ptr,
+                    count=elements.size,
                 )
             except e:
                 print("Error in parallelized_func", e)
@@ -308,7 +308,7 @@ fn apply_along_axis[
         fn parallelized_func(i: Int):
             try:
                 # The indices of the input array in each iteration
-                var indices: NDArray[DType.index]
+                var indices: NDArray[DType.int]
                 # The elements of the input array in each iteration
                 var elements: NDArray[dtype]
                 # The array after applied the function
@@ -333,9 +333,9 @@ fn apply_along_axis[
 fn apply_along_axis[
     dtype: DType,
     func1d: fn[dtype_func: DType] (NDArray[dtype_func]) raises -> NDArray[
-        DType.index
+        DType.int
     ],
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """
     Applies a function to a NDArray by axis without reducing that dimension.
     The resulting array will have the same shape as the input array.
@@ -357,20 +357,20 @@ fn apply_along_axis[
     # The iterator along the axis
     var iterator = a.iter_along_axis(axis=axis)
     # The final output array will have the same shape as the input array
-    var res = NDArray[DType.index](a.shape)
+    var res = NDArray[DType.int](a.shape)
 
     if a.flags.C_CONTIGUOUS and (axis == a.ndim - 1):
         # The memory layout is C-contiguous
         @parameter
         fn parallelized_func_c(i: Int):
             try:
-                var elements: NDArray[DType.index] = func1d[dtype](
+                var elements: NDArray[DType.int] = func1d[dtype](
                     iterator.ith(i)
                 )
                 memcpy(
-                    res._buf.ptr + i * elements.size,
-                    elements._buf.ptr,
-                    elements.size,
+                    dest=res._buf.ptr + i * elements.size,
+                    src=elements._buf.ptr,
+                    count=elements.size,
                 )
             except e:
                 print("Error in parallelized_func", e)
@@ -383,7 +383,7 @@ fn apply_along_axis[
         fn parallelized_func(i: Int):
             try:
                 # The indices of the input array in each iteration
-                var indices: NDArray[DType.index]
+                var indices: NDArray[DType.int]
                 # The elements of the input array in each iteration
                 var elements: NDArray[dtype]
                 # The array after applied the function
@@ -391,9 +391,7 @@ fn apply_along_axis[
                 indices = indices_elements[0].copy()
                 elements = indices_elements[1].copy()
 
-                var res_along_axis: NDArray[DType.index] = func1d[dtype](
-                    elements
-                )
+                var res_along_axis: NDArray[DType.int] = func1d[dtype](elements)
 
                 for j in range(a.shape[axis]):
                     (res._buf.ptr + Int(indices[j])).init_pointee_copy(
diff --git a/numojo/routines/indexing.mojo b/numojo/routines/indexing.mojo
index 38a6f34f..3a63a5cc 100644
--- a/numojo/routines/indexing.mojo
+++ b/numojo/routines/indexing.mojo
@@ -25,11 +25,10 @@ import numojo.core.utility as utility
 # ===----------------------------------------------------------------------=== #
 
 
-fn where[
+# ! It's not properly being formatted by pixi for some reason!
+fn `where`[
     dtype: DType
-](
-    mut x: NDArray[dtype], scalar: SIMD[dtype, 1], mask: NDArray[DType.bool]
-) raises:
+](x: NDArray[dtype], scalar: Scalar[dtype], mask: NDArray[DType.bool]) raises:
     """
     Replaces elements in `x` with `scalar` where `mask` is True.
 
@@ -48,7 +47,7 @@ fn where[
 
 
 # TODO: do it with vectorization
-fn where[
+fn `where`[
     dtype: DType
 ](mut x: NDArray[dtype], y: NDArray[dtype], mask: NDArray[DType.bool]) raises:
     """
@@ -173,7 +172,7 @@ fn compress[
                 (item._buf + j).init_pointee_copy(
                     remainder // res_strides._buf[j]
                 )
-                remainder %= res_strides._buf[j]
+                remainder %= Int(res_strides._buf[j])
 
                 # Then along other axes
                 for j in range(result.ndim):
@@ -181,7 +180,7 @@ fn compress[
                         (item._buf + j).init_pointee_copy(
                             remainder // res_strides._buf[j]
                         )
-                        remainder %= res_strides._buf[j]
+                        remainder %= Int(res_strides._buf[j])
 
                 (
                     result._buf.ptr + utility._get_offset(item, result.strides)
@@ -237,7 +236,7 @@ fn compress[
 fn take_along_axis[
     dtype: DType, //,
 ](
-    arr: NDArray[dtype], indices: NDArray[DType.index], axis: Int = 0
+    arr: NDArray[dtype], indices: NDArray[DType.int], axis: Int = 0
 ) raises -> NDArray[dtype]:
     """
     Takes values from the input array along the given axis based on indices.
@@ -303,7 +302,7 @@ fn take_along_axis[
     # except along the axis
 
     var broadcasted_indices: NDArray[
-        DType.index
+        DType.int
     ] = indices.copy()  # make this owned and don't copy
 
     if arr.shape != indices.shape:
@@ -337,15 +336,15 @@ fn take_along_axis[
                 indices_slice
             ]
             memcpy(
-                result._buf.ptr + i * result.shape[normalized_axis],
-                arr_slice_after_applying_indices._buf.ptr,
-                result.shape[normalized_axis],
+                dest=result._buf.ptr + i * result.shape[normalized_axis],
+                src=arr_slice_after_applying_indices._buf.ptr,
+                count=result.shape[normalized_axis],
             )
     else:
         # If axis is not the last axis, the data is not contiguous.
         for i in range(length_of_iterator):
-            var indices_slice_offsets: NDArray[DType.index]
-            var indices_slice: NDArray[DType.index]
+            var indices_slice_offsets: NDArray[DType.int]
+            var indices_slice: NDArray[DType.int]
             var indices_slice_offsets_slice = indices_iterator.ith_with_offsets(
                 i
             )
diff --git a/numojo/routines/linalg/products.mojo b/numojo/routines/linalg/products.mojo
index 64961039..bb78ec39 100644
--- a/numojo/routines/linalg/products.mojo
+++ b/numojo/routines/linalg/products.mojo
@@ -339,20 +339,20 @@ fn matmul[
 
     for i in range(result.size // result_sub_matrix.size):
         memcpy(
-            A_sub_matrix._buf.ptr,
-            A._buf.ptr + (i * A_sub_matrix.size),
-            A_sub_matrix.size,
+            dest=A_sub_matrix._buf.ptr,
+            src=A._buf.ptr + (i * A_sub_matrix.size),
+            count=A_sub_matrix.size,
         )
         memcpy(
-            B_sub_matrix._buf.ptr,
-            B._buf.ptr + (i * B_sub_matrix.size),
-            B_sub_matrix.size,
+            dest=B_sub_matrix._buf.ptr,
+            src=B._buf.ptr + (i * B_sub_matrix.size),
+            count=B_sub_matrix.size,
         )
         result_sub_matrix = matmul_2darray(A_sub_matrix, B_sub_matrix)
         memcpy(
-            result._buf.ptr + (i * result_sub_matrix.size),
-            result_sub_matrix._buf.ptr,
-            result_sub_matrix.size,
+            dest=result._buf.ptr + (i * result_sub_matrix.size),
+            src=result_sub_matrix._buf.ptr,
+            count=result_sub_matrix.size,
         )
     return result^
 
diff --git a/numojo/routines/manipulation.mojo b/numojo/routines/manipulation.mojo
index 97868575..cff82aa0 100644
--- a/numojo/routines/manipulation.mojo
+++ b/numojo/routines/manipulation.mojo
@@ -141,7 +141,6 @@ fn reshape[
     Returns:
         Array of the same data with a new shape.
     """
-    print("HOLY")
     if A.size != shape.size_of_array():
         raise Error("Cannot reshape: Number of elements do not match.")
 
@@ -208,7 +207,7 @@ fn ravel[
 # TODO: Remove this one if the following function is working well:
 # `numojo.core.utility._traverse_buffer_according_to_shape_and_strides`
 fn _set_values_according_to_shape_and_strides(
-    mut I: NDArray[DType.index],
+    mut I: NDArray[DType.int],
     mut index: Int,
     current_dim: Int,
     previous_sum: Int,
@@ -255,10 +254,10 @@ fn transpose[
     Examples.
     ```mojo
     import numojo as nm
-    # A is a 2darray
-    print(nm.transpose(A, axes=List(0, 1)))  # equal to transpose of matrix
-    # A is a 3darray
-    print(nm.transpose(A, axes=List(2, 1, 0)))  # transpose 0-th and 2-th dimensions
+    var arr2d = nm.random.rand(2,3)
+    print(nm.transpose(arr2d, axes=List(0, 1)))  # equal to transpose of matrix
+    var arr3d = nm.random.rand(2,3,4)
+    print(nm.transpose(arr3d, axes=List(2, 1, 0)))  # transpose 0-th and 2-th dimensions
     ```
     """
     if len(axes) != A.ndim:
@@ -286,8 +285,8 @@ fn transpose[
         new_strides._buf[i] = A.strides[axes[i]]
 
     var array_order: String = "C" if A.flags.C_CONTIGUOUS else "F"
-    var I = NDArray[DType.index](Shape(A.size), order=array_order)
-    var ptr: UnsafePointer[Scalar[DType.index]] = I._buf.ptr
+    var I = NDArray[DType.int](Shape(A.size), order=array_order)
+    var ptr: UnsafePointer[Scalar[DType.int]] = I._buf.ptr
     numojo.core.utility._traverse_buffer_according_to_shape_and_strides(
         ptr, new_shape, new_strides
     )
@@ -299,20 +298,19 @@ fn transpose[
 
 
 # TODO: Make this operation in place to match numpy.
-fn transpose[dtype: DType](var A: NDArray[dtype]) raises -> NDArray[dtype]:
+fn transpose[dtype: DType](A: NDArray[dtype]) raises -> NDArray[dtype]:
     """
     (overload) Transpose the array when `axes` is not given.
     If `axes` is not given, it is equal to flipping the axes.
     See docstring of `transpose`.
     """
-
     if A.ndim == 1:
-        return A^
+        return A.copy()
     if A.ndim == 2:
         var array_order = "C" if A.flags.C_CONTIGUOUS else "F"
         var B = NDArray[dtype](Shape(A.shape[1], A.shape[0]), order=array_order)
         if A.shape[0] == 1 or A.shape[1] == 1:
-            memcpy(B._buf.ptr, A._buf.ptr, A.size)
+            memcpy(dest=B._buf.ptr, src=A._buf.ptr, count=A.size)
         else:
             for i in range(B.shape[0]):
                 for j in range(B.shape[1]):
@@ -337,7 +335,7 @@ fn transpose[dtype: DType](A: Matrix[dtype]) -> Matrix[dtype]:
     var B = Matrix[dtype](Tuple(A.shape[1], A.shape[0]), order=order)
 
     if A.shape[0] == 1 or A.shape[1] == 1:
-        memcpy(B._buf.ptr, A._buf.ptr, A.size)
+        memcpy(dest=B._buf.ptr, src=A._buf.ptr, count=A.size)
     else:
         for i in range(B.shape[0]):
             for j in range(B.shape[1]):
@@ -616,7 +614,7 @@ fn flip[
             String("Invalid index: index out of bound [0, {}).").format(A.ndim)
         )
 
-    var I = NDArray[DType.index](Shape(A.size))
+    var I = NDArray[DType.int](Shape(A.size))
     var ptr = I._buf.ptr
 
     numojo.core.utility._traverse_buffer_according_to_shape_and_strides(
diff --git a/numojo/routines/math/extrema.mojo b/numojo/routines/math/extrema.mojo
index 85b2abee..12ca3ae3 100644
--- a/numojo/routines/math/extrema.mojo
+++ b/numojo/routines/math/extrema.mojo
@@ -230,7 +230,7 @@ fn max[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 fn _max[
     dtype: DType
 ](A: Matrix[dtype], start: Int, end: Int) raises -> Tuple[
-    Scalar[dtype], Scalar[DType.index]
+    Scalar[dtype], Scalar[DType.int]
 ]:
     """
     Auxiliary function that find the max value in a range of the buffer.
@@ -243,7 +243,7 @@ fn _max[
             ).format(start, end, A.size)
         )
 
-    var max_index: Scalar[DType.index] = start
+    var max_index: Scalar[DType.int] = start
 
     var rows = A.shape[0]
     var cols = A.shape[1]
@@ -350,7 +350,7 @@ fn min[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 fn _min[
     dtype: DType
 ](A: Matrix[dtype], start: Int, end: Int) raises -> Tuple[
-    Scalar[dtype], Scalar[DType.index]
+    Scalar[dtype], Scalar[DType.int]
 ]:
     """
     Auxiliary function that find the min value in a range of the buffer.
@@ -363,7 +363,7 @@ fn _min[
             ).format(start, end, A.size)
         )
 
-    var min_index: Scalar[DType.index] = start
+    var min_index: Scalar[DType.int] = start
 
     var rows = A.shape[0]
     var cols = A.shape[1]
diff --git a/numojo/routines/math/products.mojo b/numojo/routines/math/products.mojo
index 031ca4db..d01e06aa 100644
--- a/numojo/routines/math/products.mojo
+++ b/numojo/routines/math/products.mojo
@@ -71,10 +71,11 @@ fn prod[
             slices.append(Slice(0, A.shape[i]))
         else:
             slices.append(Slice(0, 0))  # Temp value
-    var result = ones[dtype](NDArrayShape(result_shape))
+    var result: NDArray[dtype] = ones[dtype](NDArrayShape(result_shape))
     for i in range(size_of_axis):
         slices[axis] = Slice(i, i + 1)
-        var arr_slice = A[slices.copy()]
+        # TODO: modify slicing getter to avoid copy.
+        var arr_slice: NDArray[dtype] = A._getitem_list_slices(slices.copy())
         result *= arr_slice
 
     return result^
@@ -204,7 +205,7 @@ fn cumprod[
             String("Invalid index: index out of bound [0, {}).").format(A.ndim)
         )
 
-    var I = NDArray[DType.index](Shape(A.size))
+    var I = NDArray[DType.int](Shape(A.size))
     var ptr = I._buf.ptr
 
     var _shape = B.shape._move_axis_to_end(axis)
diff --git a/numojo/routines/math/sums.mojo b/numojo/routines/math/sums.mojo
index b9021e49..64653f3b 100644
--- a/numojo/routines/math/sums.mojo
+++ b/numojo/routines/math/sums.mojo
@@ -62,7 +62,7 @@ fn sum[dtype: DType](A: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
         An NDArray.
     """
 
-    var normalized_axis = axis
+    var normalized_axis: Int = axis
     if normalized_axis < 0:
         normalized_axis += A.ndim
 
@@ -99,10 +99,10 @@ fn sum[dtype: DType](A: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
             slices.append(Slice(0, A.shape[i]))
         else:
             slices.append(Slice(0, 0))  # Temp value
-    var result = zeros[dtype](NDArrayShape(result_shape))
+    var result: NDArray[dtype] = zeros[dtype](NDArrayShape(result_shape))
     for i in range(size_of_axis):
         slices[normalized_axis] = Slice(i, i + 1)
-        var arr_slice = A[slices.copy()]
+        var arr_slice: NDArray[dtype] = A._getitem_list_slices(slices.copy())
         result += arr_slice
 
     return result^
@@ -263,7 +263,7 @@ fn cumsum[
             String("Invalid index: index out of bound [0, {}).").format(A.ndim)
         )
 
-    var I = NDArray[DType.index](Shape(A.size))
+    var I = NDArray[DType.int](Shape(A.size))
     var ptr = I._buf.ptr
 
     var _shape = B.shape._move_axis_to_end(axis)
diff --git a/numojo/routines/searching.mojo b/numojo/routines/searching.mojo
index a6bb2fe2..52c78327 100644
--- a/numojo/routines/searching.mojo
+++ b/numojo/routines/searching.mojo
@@ -17,7 +17,7 @@ from numojo.routines.sorting import binary_sort
 from numojo.routines.math.extrema import _max, _min
 
 
-fn argmax_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmax_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the index of the maximum value in the buffer.
     Regardless of the shape of input, it is treated as a 1-d array.
 
@@ -44,7 +44,7 @@ fn argmax_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
     return result
 
 
-fn argmin_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmin_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the index of the minimum value in the buffer.
     Regardless of the shape of input, it is treated as a 1-d array.
 
@@ -71,7 +71,7 @@ fn argmin_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
     return result
 
 
-fn argmax[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmax[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the indices of the maximum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -98,7 +98,7 @@ fn argmax[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
 
 fn argmax[
     dtype: DType, //
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """Returns the indices of the maximum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -161,11 +161,11 @@ fn argmax[
 @always_inline
 fn find_extrema_index[
     dtype: DType, find_max: Bool
-](A: Matrix[dtype]) raises -> Scalar[DType.index]:
+](A: Matrix[dtype]) raises -> Scalar[DType.int]:
     """Find index of min/max value, either in whole matrix or along an axis."""
 
     var extreme_val = A[0, 0]
-    var extreme_idx: Scalar[DType.index] = 0
+    var extreme_idx: Scalar[DType.int] = 0
 
     for i in range(A.shape[0]):
         for j in range(A.shape[1]):
@@ -187,13 +187,13 @@ fn find_extrema_index[
 @always_inline
 fn find_extrema_index[
     dtype: DType, find_max: Bool
-](A: Matrix[dtype], axis: Optional[Int]) raises -> Matrix[DType.index]:
+](A: Matrix[dtype], axis: Optional[Int]) raises -> Matrix[DType.int]:
     """Find index of min/max value, either in whole matrix or along an axis."""
 
     if axis != 0 and axis != 1:
         raise Error(String("The axis can either be 1 or 0!"))
 
-    var B = Matrix[DType.index](
+    var B = Matrix[DType.int](
         shape=(A.shape[0], 1) if axis == 1 else (1, A.shape[1])
     )
 
@@ -237,19 +237,19 @@ fn find_extrema_index[
     return B^
 
 
-fn argmax[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.index]:
+fn argmax[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.int]:
     """Find index of max value in a flattened matrix."""
     return find_extrema_index[dtype, True](A)
 
 
 fn argmax[
     dtype: DType
-](A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
+](A: Matrix[dtype], axis: Int) raises -> Matrix[DType.int]:
     """Find indices of max values along the given axis."""
     return find_extrema_index[dtype, True](A, axis)
 
 
-fn argmin[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmin[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the indices of the minimum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -276,7 +276,7 @@ fn argmin[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
 
 fn argmin[
     dtype: DType, //
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """Returns the indices of the minimum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -309,7 +309,7 @@ fn argmin[
     return numojo.apply_along_axis[func1d=argmin_1d](a=a, axis=normalized_axis)
 
 
-fn argmin[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.index]:
+fn argmin[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.int]:
     """
     Index of the min. It is first flattened before sorting.
     """
@@ -318,7 +318,7 @@ fn argmin[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.index]:
 
 fn argmin[
     dtype: DType
-](A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
+](A: Matrix[dtype], axis: Int) raises -> Matrix[DType.int]:
     """
     Index of the min along the given axis.
     """
diff --git a/numojo/routines/sorting.mojo b/numojo/routines/sorting.mojo
index 54408959..4f0bb331 100644
--- a/numojo/routines/sorting.mojo
+++ b/numojo/routines/sorting.mojo
@@ -149,7 +149,7 @@ fn sort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
     """
     Sort the Matrix. It is first flattened before sorting.
     """
-    var I = Matrix.zeros[DType.index](shape=A.shape)
+    var I = Matrix.zeros[DType.int](shape=A.shape)
     var B = A.flatten()
     _quick_sort_inplace(B, I, 0, A.size - 1)
 
@@ -167,7 +167,7 @@ fn sort[dtype: DType](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 
         for i in range(A.shape[0]):
             var row = Matrix[dtype](shape=(1, A.shape[1]), order="C")
-            var indices = Matrix.zeros[DType.index](
+            var indices = Matrix.zeros[DType.int](
                 shape=(1, A.shape[1]), order="C"
             )
 
@@ -186,7 +186,7 @@ fn sort[dtype: DType](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 
         for j in range(A.shape[1]):
             var col = Matrix[dtype](shape=(A.shape[0], 1), order="C")
-            var indices = Matrix.zeros[DType.index](
+            var indices = Matrix.zeros[DType.int](
                 shape=(A.shape[0], 1), order="C"
             )
 
@@ -203,7 +203,7 @@ fn sort[dtype: DType](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
         raise Error(String("The axis can either be 1 or 0!"))
 
 
-fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.index]:
+fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.int]:
     """
     Returns the indices that would sort an array.
     It is not guaranteed to be unstable.
@@ -224,7 +224,7 @@ fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.index]:
     else:
         a_flattened = ravel(a)
 
-    var indices = arange[DType.index](a_flattened.size)
+    var indices = arange[DType.int](a_flattened.size)
 
     _quick_sort_inplace(a_flattened, indices)
 
@@ -233,7 +233,7 @@ fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.index]:
 
 fn argsort[
     dtype: DType
-](mut a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](mut a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """
     Returns the indices that would sort an array.
     It is not guaranteed to be unstable.
@@ -272,11 +272,11 @@ fn argsort[
     )
 
 
-fn argsort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[DType.index]:
+fn argsort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[DType.int]:
     """
     Argsort the Matrix. It is first flattened before sorting.
     """
-    var I = Matrix[DType.index](shape=(1, A.size), order=A.order())
+    var I = Matrix[DType.int](shape=(1, A.size), order=A.order())
     for i in range(I.size):
         I._buf.ptr[i] = i
     var B: Matrix[dtype]
@@ -291,18 +291,18 @@ fn argsort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[DType.index]:
 
 fn argsort[
     dtype: DType
-](var A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
+](var A: Matrix[dtype], axis: Int) raises -> Matrix[DType.int]:
     """
     Argsort the Matrix along the given axis.
     """
     var order = A.order()
 
     if axis == 1:
-        var result = Matrix[DType.index](shape=A.shape, order=order)
+        var result = Matrix[DType.int](shape=A.shape, order=order)
 
         for i in range(A.shape[0]):
             var row = Matrix[dtype](shape=(1, A.shape[1]), order="C")
-            var idx = Matrix[DType.index](shape=(1, A.shape[1]), order="C")
+            var idx = Matrix[DType.int](shape=(1, A.shape[1]), order="C")
 
             for j in range(A.shape[1]):
                 row._store(0, j, A._load(i, j))
@@ -316,11 +316,11 @@ fn argsort[
         return result^
 
     elif axis == 0:
-        var result = Matrix[DType.index](shape=A.shape, order=order)
+        var result = Matrix[DType.int](shape=A.shape, order=order)
 
         for j in range(A.shape[1]):
             var col = Matrix[dtype](shape=(A.shape[0], 1), order="C")
-            var idx = Matrix[DType.index](shape=(A.shape[0], 1), order="C")
+            var idx = Matrix[DType.int](shape=(A.shape[0], 1), order="C")
 
             for i in range(A.shape[0]):
                 col._store(i, 0, A._load(i, j))
@@ -542,7 +542,7 @@ fn quick_sort_stable_inplace_1d[dtype: DType](mut a: NDArray[dtype]) raises:
 
 fn argsort_quick_sort_1d[
     dtype: DType
-](a: NDArray[dtype]) raises -> NDArray[DType.index]:
+](a: NDArray[dtype]) raises -> NDArray[DType.int]:
     """
     Returns the indices that would sort the buffer of an array.
     Regardless of the shape of input, it is treated as a 1-d array.
@@ -559,7 +559,7 @@ fn argsort_quick_sort_1d[
     """
 
     var result: NDArray[dtype] = a.copy()
-    var indices = arange[DType.index](result.size)
+    var indices = arange[DType.int](result.size)
     _quick_sort_inplace(result, indices)
     return indices^
 
@@ -831,7 +831,7 @@ fn _quick_sort_inplace[dtype: DType](mut A: NDArray[dtype]) raises:
 
 fn _quick_sort_inplace[
     dtype: DType
-](mut A: NDArray[dtype], mut I: NDArray[DType.index]) raises:
+](mut A: NDArray[dtype], mut I: NDArray[DType.int]) raises:
     """
     Sort in-place array's buffer using quick sort method.
     The indices are also sorted.
diff --git a/pixi.toml b/pixi.toml
index c68bb089..ba9ef404 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -1,5 +1,6 @@
-[project]
+[workspace]
 channels = [
+    "https://conda.modular.com/max-nightly/",
     "conda-forge",
     "https://conda.modular.com/max",
     "https://repo.prefix.dev/modular-community",
@@ -34,13 +35,13 @@ backend = {name = "pixi-build-mojo", version = "0.*", channels = [
 name = "numojo"
 
 [package.host-dependencies]
-modular = ">=25.6.0,<26"
+modular = ">=25.6.1,<26"
 
 [package.build-dependencies]
-modular = ">=25.6.0,<26"
+modular = ">=25.6.1,<26"
 
 [package.run-dependencies]
-modular = ">=25.6.0,<26"
+modular = ">=25.6.1,<26"
 
 [tasks]
 # compile the package and copy it to the tests folder
@@ -78,7 +79,7 @@ doc_pages = "mojo doc numojo/ -o docs.json"
 release = "clear && pixi run final && pixi run doc_pages"
 
 [dependencies]
-python = ">=3.13.5,<3.14"
-numpy = ">=2.3.2,<3"
-scipy = ">=1.16.0,<2"
-modular = ">=25.6.0,<26"
+python = ">=3.13.9,<3.14"
+numpy = ">=2.3.3,<3"
+scipy = ">=1.16.2,<2"
+mojo = ">=0.25.7.0.dev2025102519,<0.26"
diff --git a/tests/core/test_array_indexing_and_slicing.mojo b/tests/core/test_array_indexing_and_slicing.mojo
index ba7033c4..8c08098f 100644
--- a/tests/core/test_array_indexing_and_slicing.mojo
+++ b/tests/core/test_array_indexing_and_slicing.mojo
@@ -95,7 +95,7 @@ def test_slicing_getter6():
     var np = Python.import_module("numpy")
 
     var b = nm.arange[i8](60).reshape(nm.Shape(3, 4, 5))
-    var ind = nm.array[isize]("[[2,0,1], [1,0,1]]")
+    var ind = nm.array[int]("[[2,0,1], [1,0,1]]")
     var mask = nm.array[boolean]("[1,0,1]")
 
     var bnp = b.to_numpy()
diff --git a/tests/routines/test_indexing.mojo b/tests/routines/test_indexing.mojo
index 861b7a99..72c44e2c 100644
--- a/tests/routines/test_indexing.mojo
+++ b/tests/routines/test_indexing.mojo
@@ -73,7 +73,7 @@ fn test_take_along_axis() raises:
     # Test 1-D array
     var a1d = nm.arange[i8](10)
     var a1d_np = a1d.to_numpy()
-    var indices1d = nm.array[intp]("[2, 3, 1, 8]")
+    var indices1d = nm.array[int]("[2, 3, 1, 8]")
     var indices1d_np = indices1d.to_numpy()
 
     check(
@@ -85,7 +85,7 @@ fn test_take_along_axis() raises:
     # Test 2-D array with axis=0
     var a2d = nm.arange[i8](12).reshape(Shape(3, 4))
     var a2d_np = a2d.to_numpy()
-    var indices2d_0 = nm.array[intp]("[[0, 1, 2, 0], [1, 2, 0, 1]]")
+    var indices2d_0 = nm.array[int]("[[0, 1, 2, 0], [1, 2, 0, 1]]")
     var indices2d_0_np = indices2d_0.to_numpy()
 
     check(
@@ -95,7 +95,7 @@ fn test_take_along_axis() raises:
     )
 
     # Test 2-D array with axis=1
-    var indices2d_1 = nm.array[intp](
+    var indices2d_1 = nm.array[int](
         "[[3, 0, 2, 1], [1, 3, 0, 0], [2, 1, 0, 3]]"
     )
     var indices2d_1_np = indices2d_1.to_numpy()
@@ -111,7 +111,7 @@ fn test_take_along_axis() raises:
     var a3d_np = a3d.to_numpy()
 
     # Test with axis=0
-    var indices3d_0 = nm.zeros[intp](Shape(1, 3, 4))
+    var indices3d_0 = nm.zeros[int](Shape(1, 3, 4))
     var indices3d_0_np = indices3d_0.to_numpy()
 
     check(
@@ -121,7 +121,7 @@ fn test_take_along_axis() raises:
     )
 
     # Test with axis=1
-    var indices3d_1 = nm.array[intp](
+    var indices3d_1 = nm.array[int](
         "[[[0, 1, 0, 2], [2, 1, 0, 1], [1, 2, 2, 0]], [[1, 0, 1, 2], [0, 2, 1,"
         " 0], [2, 0, 0, 1]]]"
     )
@@ -134,7 +134,7 @@ fn test_take_along_axis() raises:
     )
 
     # Test with axis=2
-    var indices3d_2 = nm.array[intp](
+    var indices3d_2 = nm.array[int](
         "[[[2, 0, 3, 1], [1, 3, 0, 2], [3, 1, 2, 0]], [[0, 2, 1, 3], [2, 0, 3,"
         " 1], [1, 3, 0, 2]]]"
     )
@@ -160,7 +160,7 @@ fn test_take_along_axis() raises:
     var a2d_test_np = a2d_test.to_numpy()
 
     # For axis=0, using indices of shape (2, 4) - different first dim, same second dim
-    var indices2d_axis0 = nm.array[intp]("[[0, 1, 2, 0], [1, 0, 2, 1]]")
+    var indices2d_axis0 = nm.array[int]("[[0, 1, 2, 0], [1, 0, 2, 1]]")
     var indices2d_axis0_np = indices2d_axis0.to_numpy()
 
     check(
@@ -173,7 +173,7 @@ fn test_take_along_axis() raises:
     )
 
     # For axis=1, using indices of shape (3, 2) - same first dim, different second dim
-    var indices2d_axis1 = nm.array[intp]("[[0, 3], [2, 1], [1, 3]]")
+    var indices2d_axis1 = nm.array[int]("[[0, 3], [2, 1], [1, 3]]")
     var indices2d_axis1_np = indices2d_axis1.to_numpy()
 
     check(
@@ -191,7 +191,7 @@ fn test_take_along_axis() raises:
     var a3d_test_np = a3d_test.to_numpy()
 
     # For axis=0, indices of shape (1, 3, 4) - same shape except dim 0
-    var ind_axis0 = nm.zeros[intp](Shape(1, 3, 4))
+    var ind_axis0 = nm.zeros[int](Shape(1, 3, 4))
     var ind_axis0_np = ind_axis0.to_numpy()
 
     check(
@@ -204,7 +204,7 @@ fn test_take_along_axis() raises:
     )
 
     # For axis=2, indices of shape (2, 3, 2) - same shape except dim 2
-    var ind_axis2 = nm.array[intp](
+    var ind_axis2 = nm.array[int](
         "[[[0, 3], [2, 1], [3, 0]], [[1, 2], [0, 3], [2, 1]]]"
     )
     var ind_axis2_np = ind_axis2.to_numpy()
@@ -227,7 +227,7 @@ fn test_take_along_axis_fortran_order() raises:
     var a3d_f_np = a3d_f.to_numpy()
 
     # Test with axis=0
-    var indices3d_0 = nm.zeros[intp](Shape(1, 3, 4))
+    var indices3d_0 = nm.zeros[int](Shape(1, 3, 4))
     var indices3d_0_np = indices3d_0.to_numpy()
 
     check(
@@ -237,7 +237,7 @@ fn test_take_along_axis_fortran_order() raises:
     )
 
     # Test with axis=1
-    var indices3d_1 = nm.array[intp](
+    var indices3d_1 = nm.array[int](
         "[[[0, 1, 0, 2], [2, 1, 0, 1], [1, 2, 2, 0]], [[1, 0, 1, 2], [0, 2, 1,"
         " 0], [2, 0, 0, 1]]]"
     )
@@ -250,7 +250,7 @@ fn test_take_along_axis_fortran_order() raises:
     )
 
     # Test with axis=2
-    var indices3d_2 = nm.array[intp](
+    var indices3d_2 = nm.array[int](
         "[[[2, 0, 3, 1], [1, 3, 0, 2], [3, 1, 2, 0]], [[0, 2, 1, 3], [2, 0, 3,"
         " 1], [1, 3, 0, 2]]]"
     )