8787 # miter decreases in each iteration of factors
8888 miter, niter = factors[i]
8989
90- r = ( MoW % miter)
90+ r = MoW % miter
9191 # if ((miter * W * U * 2) ≤ M - (W+W)) & ((r == 0) | (miter == (r+1)))
9292 mlarge = (miter * (U * 2 )) ≤ MoW - 2
9393 # we want `mlarge` enough, or there to be no remainder (`r == 0`)
103103 @inbounds factors[(length (factors)+ 1 )>>> 1 ]
104104end
105105
106- # struct ChooseNumBlocks{U,C} <: Function end
107- # function (cnb::ChooseNumBlocks{U,C})(M::UInt) where {U,C}
108- # choose_num_blocks(M, StaticInt{U}(), StaticInt{C}())
109- # end
110-
111- # @generated function choose_num_block_table(::StaticInt{U}, ::StaticInt{NC}) where {U,NC}
112- # t = Expr(:tuple)
113- # for n ∈ 1:NC
114- # cnb = :(ChooseNumBlocks{$U,$n}())
115- # push!(t.args, :(@cfunction($cnb, Tuple{UInt,UInt}, (UInt,))))
116- # end
117- # t
118- # end
119106@generated function choose_num_block_table (:: StaticInt{NC} ) where {NC}
120107 t = Expr (:tuple )
121108 for n ∈ 1 : NC
130117 nt,
131118 :: StaticInt{NTMAX} ,
132119) where {U,NTMAX}
133- # valid range for nt: 2 ≤ nt ≤ NTMAX
134- # if NTMAX > 8
135- # return quote
136- # $(Expr(:meta,:inline))
137- # choose_num_blocks_table(M, StaticInt{$U}(), nt, StaticInt{$NTMAX}())
138- # end
139- # else
140120 if NTMAX == 2 # `nt` must be `2`
141121 return quote
142122 $ (Expr (:meta , :inline ))
@@ -166,16 +146,6 @@ function add_bisecting_if_branches!(q, lb, ub, U, isfirst::Bool)
166146 return
167147end
168148
169- # @inline function choose_num_blocks_table(M, ::StaticInt{U}, nt, ::StaticInt{NTMAX}) where {U,NTMAX}
170- # if nt == NTMAX
171- # choose_num_blocks(M % UInt, StaticInt{U}(), StaticInt{NTMAX}())
172- # else
173- # @inbounds fptr = choose_num_block_table(StaticInt{U}(), StaticInt{NTMAX}())[nt]
174- # VectorizationBase.assume(fptr ≠ C_NULL)
175- # ccall(fptr, Tuple{UInt,UInt}, (UInt,), M%UInt)
176- # end
177- # end
178-
179149# if a threaded loop is vectorized, call
180150@inline function choose_num_blocks (M, :: StaticInt{U} , nt) where {U}
181151 _choose_num_blocks (M % UInt, StaticInt {U} (), nt, lv_max_num_threads ())
# Table-lookup method: returns element `nt` of the tuple produced by
# `choose_num_block_table(StaticInt{NC}())`. `NC` defaults to `lv_max_num_threads()`.
# The lookup is wrapped in `@inbounds`, so no bounds check is performed on `nt`;
# NOTE(review): presumably callers guarantee `1 ≤ nt ≤ NC` — confirm against the table construction.
184154@inline choose_num_blocks (nt, :: StaticInt{NC} = lv_max_num_threads ()) where {NC} =
185155 @inbounds choose_num_block_table (StaticInt {NC} ())[nt]
186156
187-
188-
189- # The goal is to minimize the maximum costs...
190- # But maybe 'relatively even sizes' heuristics are more robust than fancy modeling?
191- # At least early on, before lots of test cases with different sorts of loops have informed the modeling.
192- #
193- # goal is to produce `nblocks` roughly even block sizes (bM, bN), such that `bM % fM == bN % fN == 0`.
194- # function roughly_even_blocks(M, N, fM, fN, nblocks)
195- # M_N_ratio = M / N
196- # block_per_m = sqrt(nblocks * M_N_ratio) # obv not even
197- # blocks_per_n = block_per_m / M_N_ratio
198- # mi = cld(M, fM)
199- # ni = cld(N, fN)
200- # block_per_m, blocks_per_n
201- # end
202157if Sys. ARCH === :x86_64
203158 @inline function choose_num_threads (
204159 C:: T ,
@@ -280,7 +235,6 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv)
280235 # push!(q.args, :(@show var"#load#thread#ret#"))
281236 for (i, or) ∈ enumerate (ls. outer_reductions)
282237 op = ls. operations[or]
283- var = name (op)
284238 mvar = mangledvar (op)
285239 instr = instruction (op)
286240 out = Symbol (mvar, " ##onevec##" )
@@ -328,7 +282,6 @@ function thread_loop_summary!(
328282)
329283 W = ls. vector_width
330284 @unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
331- u₂ = u₂max
332285 threadloopnumtag = Int (issecondthreadloop)
333286 lensym = Symbol (" #len#thread#$threadloopnumtag #" )
334287 define_len = if isstaticloop (threadedloop)
@@ -505,7 +458,7 @@ function thread_one_loops_expr(
505458 thread_loop_summary! (ls, ua, threadedloop, false )
506459 loopboundexpr = Expr (:tuple ) # for launched threads
507460 lastboundexpr = Expr (:tuple ) # remainder, started on main thread
508- for (i, loop) ∈ enumerate ( ls. loops)
461+ for loop ∈ ls. loops
509462 if loop === threadedloop
510463 push! (loopboundexpr. args, looprange)
511464 push! (lastboundexpr. args, lastrange)
@@ -546,8 +499,6 @@ function thread_one_loops_expr(
546499 if var"##do#thread##"
547500 var"#threads#tuple#" , var"#torelease#tuple#" =
548501 PolyesterWeave. request_threads (var"#nrequest#" )
549- # var"#threads#tuple#", var"#torelease#tuple#" = PolyesterWeave.request_threads(Threads.threadid()%UInt32, var"#nrequest#")
550-
551502 var"#thread#factor#0#" = var"#nthreads#"
552503 $ iterdef
553504 var"#thread#id#" = 0x00000000
@@ -627,7 +578,6 @@ function thread_one_loops_expr(
627578 end
628579 $ retexpr
629580 end
630- # Expr(:block, Expr(:meta,:inline), ls.preamble, q)
631581 Expr (:block , ls. preamble, q)
632582end
633583function define_vthread_blocks (vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn)
@@ -710,7 +660,6 @@ function thread_two_loops_expr(
710660 end
711661 @unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
712662 u₂ = u₂max
713- W = ls. vector_width
714663 threadedloop1 = getloop (ls, threadedid1)
715664 threadedloop2 = getloop (ls, threadedid2)
716665 define_len1, define_num_unrolls1, loopstart1, iterstop1, looprange1, lastrange1 =
@@ -719,7 +668,7 @@ function thread_two_loops_expr(
719668 thread_loop_summary! (ls, ua, threadedloop2, true )
720669 loopboundexpr = Expr (:tuple )
721670 lastboundexpr = Expr (:tuple )
722- for (i, loop) ∈ enumerate ( ls. loops)
671+ for loop ∈ ls. loops
723672 if loop === threadedloop1
724673 push! (loopboundexpr. args, looprange1)
725674 push! (lastboundexpr. args, lastrange1)
@@ -741,11 +690,9 @@ function thread_two_loops_expr(
741690 Val (typeof (var"#avx#call#args#" )),
742691 flatten_to_tuple (var"#avx#call#args#" )... ,
743692 ))
744- # _turbo_orig_ = :(_turbo_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, var"#lv#tuple#args#"))
745693 update_return_values = if length (ls. outer_reductions) > 0
746694 retv = loopset_return_value (ls, Val (false ))
747695 _turbo_call_ = Expr (:(= ), retv, _turbo_call_)
748- # _turbo_orig_ = Expr(:(=), retv, _turbo_orig_)
749696 outer_reduct_combine_expressions (ls, retv)
750697 else
751698 nothing
@@ -757,15 +704,10 @@ function thread_two_loops_expr(
757704 retexpr = length (ls. outer_reductions) > 0 ? :(return $ retv) : :(return nothing )
758705 q = quote
759706 $ choose_nthread # UInt
760- # @show var"#nthreads#"
761707 $ loopstart1
762708 $ loopstart2
763709 var"##do#thread##" = var"#nthreads#" > one (var"#nthreads#" )
764710 if var"##do#thread##"
765- # if var"#nthreads#" ≤ 1
766- # $_turbo_orig_
767- # return $retexpr
768- # end
769711 $ define_len1
770712 $ define_len2
771713 $ define_num_unrolls1
@@ -913,7 +855,6 @@ function thread_two_loops_expr(
913855 end
914856 $ retexpr
915857 end
916- # Expr(:block, Expr(:meta,:inline), ls.preamble, q)
917858 Expr (:block , ls. preamble, q)
918859end
919860
@@ -957,8 +898,6 @@ function avx_threads_expr(
957898)
958899 valid_thread_loop, ua, c = valid_thread_loops (ls)
959900 num_candiates = sum (valid_thread_loop)
960- # num_to_thread = min(num_candiates, 2)
961- # candidate_ids =
962901 if (num_candiates == 0 ) || (nt ≤ 1 ) # it was called from `avx_body` but now `nt` was set to `1`
963902 avx_body (ls, UNROLL)
964903 elseif (num_candiates == 1 ) || (nt ≤ 3 )
0 commit comments