22
33from algorithm import vectorize, parallelize
44from memory.memory import _malloc, stack_allocation
5- from sys import CompilationTarget, num_performance_cores, simdwidthof, sizeof
5+ from sys import CompilationTarget, num_performance_cores, simd_width_of, size_of
66import benchmark
77from testing import assert_equal
88from utils import IndexList
@@ -163,7 +163,7 @@ fn pack_A[
163163 ](Ac.stride[0 ]()),
164164 )
165165
166- vectorize[pack_col, simdwidthof [Type]()](min (Ac.shape[0 ]() - i, mr))
166+ vectorize[pack_col, simd_width_of [Type]()](min (Ac.shape[0 ]() - i, mr))
167167
168168 for l in range (min (Ac.shape[0 ]() - i, mr), mr):
169169 dst_ptr[l] = Scalar[Type](0 )
@@ -193,15 +193,15 @@ fn pack_B[
193193 @parameter
194194 fn pack_row [width : Int](l : Int):
195195 (dst_ptr + l).store[
196- alignment = sizeof [Type]() * simdwidthof [Type]()
196+ alignment = size_of [Type]() * simd_width_of [Type]()
197197 ](
198198 (src_ptr + l).load[width=width](),
199199 )
200200
201201 vectorize[
202202 pack_row,
203- simdwidthof [Type](),
204- unroll_factor = nr // simdwidthof [Type](),
203+ simd_width_of [Type](),
204+ unroll_factor = nr // simd_width_of [Type](),
205205 ](min (Bc.shape[1 ]() - i, nr))
206206
207207 for l in range (min (Bc.shape[1 ]() - i, nr), nr):
@@ -223,8 +223,8 @@ fn matmul_impl[
223223 mr : Int,
224224 nr : Int,
225225](mc : Int, nc : Int, mut C : Matrix[Type], A : Matrix[Type], B : Matrix[Type]):
226- var Ac_buffer = _malloc[Scalar[Type], alignment= 64 ](
227- mc * kc * sizeof [Type]()
226+ var Ac_buffer = _malloc[Scalar[Type]](
227+ mc * kc * size_of [Type](), alignment = 64
228228 )
229229
230230 var M = C.shape[0 ]()
@@ -268,8 +268,8 @@ fn loop_n[
268268 @parameter
269269 fn parallelize_balanced_part (idx : Int):
270270 var Bc_buffer = UnsafePointer[Scalar[Type]](
271- _malloc[Scalar[Type], alignment= 64 ](
272- kc * nc_per_thread * sizeof [Type]()
271+ _malloc[Scalar[Type]](
272+ kc * nc_per_thread * size_of [Type](), alignment = 64
273273 )
274274 )
275275
@@ -291,8 +291,8 @@ fn loop_n[
291291 @parameter
292292 fn parallelize_remainder (idx : Int):
293293 var Bc_buffer = UnsafePointer[Scalar[Type]](
294- _malloc[Scalar[Type], alignment= 64 ](
295- kc * remainder_per_thread * sizeof [Type]()
294+ _malloc[Scalar[Type]](
295+ kc * remainder_per_thread * size_of [Type](), alignment = 64
296296 )
297297 )
298298 var j = balanced_part + idx * remainder_per_thread
@@ -348,7 +348,7 @@ fn macro_kernel[
348348fn micro_kernel [
349349 Type : DType, //, mr : Int, nr : Int, padding : Bool
350350](mut Cr : Matrix[Type], Ar : Matrix[Type], Br : Matrix[Type]):
351- alias simd_width = simdwidthof [Type]()
351+ alias simd_width = simd_width_of [Type]()
352352 constrained[nr % simd_width == 0 , " nr must be multiple of simd_width" ]()
353353
354354 var Ar_ptr = Ar.data
@@ -391,7 +391,7 @@ fn micro_kernel[
391391 @parameter
392392 for j in range (0 , nr, simd_width):
393393 br[j // simd_width] = (Br_ptr + j).load[
394- width=simd_width, alignment = sizeof [Type]() * simdwidthof [Type]()
394+ width=simd_width, alignment = size_of [Type]() * simd_width_of [Type]()
395395 ]()
396396
397397 @parameter
@@ -440,15 +440,15 @@ fn micro_kernel[
440440
441441@always_inline
442442fn matmul_params [Type : DType]() -> IndexList[5 ]:
443- alias mc = 8192 // sizeof [Type]() # fix this for simplicity
444- alias N = simdwidthof [Type]()
443+ alias mc = 8192 // size_of [Type]() # fix this for simplicity
444+ alias N = simd_width_of [Type]()
445445
446446 alias Vectors = 32 if CompilationTarget.has_avx512f() else 16
447447
448448 @parameter
449449 fn compute_kc [mr : Int, nr : Int]() -> Int:
450450 alias CBr = Int((L1_ASSOCIATIVITY - 1 ) / (1 + mr / nr))
451- return (CBr * L1_CACHE_SIZE ) // (nr * sizeof [Type]() * L1_ASSOCIATIVITY )
451+ return (CBr * L1_CACHE_SIZE ) // (nr * size_of [Type]() * L1_ASSOCIATIVITY )
452452
453453 @parameter
454454 fn compute_params [C : Int]() -> IndexList[5 ]:
@@ -458,7 +458,7 @@ fn matmul_params[Type: DType]() -> IndexList[5]:
458458 alias CBr = Int((L1_ASSOCIATIVITY - 1 ) / (1 + mr / nr))
459459 alias kc = compute_kc[mr, nr]()
460460 alias nc = (L2_ASSOCIATIVITY - 1 ) * L2_CACHE_SIZE // (
461- kc * sizeof [Type]() * L2_ASSOCIATIVITY
461+ kc * size_of [Type]() * L2_ASSOCIATIVITY
462462 ) - mr
463463 return IndexList[5 ](mc, nc, kc, mr, nr)
464464
0 commit comments