@@ -1501,6 +1501,7 @@ struct LoopOrders
15011501 syms_nr:: Vector{Symbol}
15021502 syms_r:: Vector{Symbol}
15031503 buff:: Vector{Symbol}
1504+ state:: Vector{Int}
15041505end
15051506
15061507function outer_reduct_loopordersplit (ls:: LoopSet )
@@ -1549,18 +1550,20 @@ function LoopOrders(ls::LoopSet)
15491550 LoopOrders (
15501551 nonreductsyms,
15511552 reductsyms,
1552- Vector {Symbol} (undef, length (ls. loopsymbols))
1553+ Vector {Symbol} (undef, length (ls. loopsymbols)),
1554+ Vector {Int} (undef, length (ls. loopsymbols))
15531555 )
15541556end
15551557
# Accessors returning views into `lo.buff`:
#   nonreductview -> indices 1:length(lo.syms_nr)
#   reductview    -> indices 1+length(lo.syms_nr):length(lo.buff)
# The `-` lines are the previous definitions; the `+` lines wrap the `view` call in `@inbounds`.
# NOTE(review): `@inbounds` presumes length(lo.syms_nr) <= length(lo.buff) — confirm against
# how the constructor sizes `buff`.
1556- nonreductview (lo:: LoopOrders ) = view (lo. buff, 1 : length (lo. syms_nr))
1557- reductview (lo:: LoopOrders ) = view (lo. buff, 1 + length (lo. syms_nr): length (lo. buff))
1558+ nonreductview (lo:: LoopOrders ) = @inbounds view (lo. buff, 1 : length (lo. syms_nr))
1559+ reductview (lo:: LoopOrders ) = @inbounds view (lo. buff, 1 + length (lo. syms_nr): length (lo. buff))
# First step of iterating a `LoopOrders`.
# Copies `lo.syms_nr` and `lo.syms_r` into the two halves of `lo.buff`, resets the state
# vector to zeros, and returns `lo.buff` together with a tuple of two views of the state
# (indices 1:nr and 1+nr:nr+r).
# In the `+` version the state vector is the preallocated `lo.state` instead of a fresh
# `zeros(Int, nr + r)`, so the returned views alias storage owned by `lo`.
# NOTE(review): because that storage is shared, two iterations over the same `lo` that are
# in flight at once would overwrite each other's state — confirm no caller does this.
# NOTE(review): the views require length(lo.state) >= nr + r — confirm against the constructor.
15581560function Base. iterate (lo:: LoopOrders )
1559- copyto ! (nonreductview (lo), lo. syms_nr)
1560- copyto ! (reductview (lo), lo. syms_r)
1561+ _copyto ! (nonreductview (lo), lo. syms_nr)
1562+ _copyto ! (reductview (lo), lo. syms_r)
15611563 nr = length (lo. syms_nr)
15621564 r = length (lo. syms_r)
1563- state = zeros (Int, nr + r)
1565+ state = lo. state
1566+ _fill! (state,0 )
15641567 lo. buff, (view (state, 1 : nr), view (state, 1 + nr: nr+ r))
15651568end
15661569
@@ -1582,10 +1585,20 @@ function advance_state!(state)
15821585 end
15831586 true
15841587end
1585- function advance_state! (state, Nr)
# Element-wise copy of `y` into `x`, iterating `eachindex(x, y)` with the loop under `@inbounds`.
# `eachindex` with two arrays raises an error when their index sets do not match, so the
# loop only runs over indices valid for both arguments.
# The last expression is the `for` loop, so the call evaluates to `nothing` (it does not
# return `x`).
1588+ function _copyto! (x,y)
1589+ @inbounds for i = eachindex (x,y)
1590+ x[i]= y[i]
1591+ end
1592+ end
# Assigns the value `y` to every index of `x`, iterating `eachindex(x)` with the loop
# under `@inbounds`.
# The last expression is the `for` loop, so the call evaluates to `nothing` (it does not
# return `x`).
1593+ function _fill! (x,y)
1594+ @inbounds for i = eachindex (x)
1595+ x[i]= y
1596+ end
1597+ end
# Two-part state advance. Tries the one-argument `advance_state!` on the first `Nr`
# entries of `state`; if that returns true, returns true immediately. Otherwise resets
# those `Nr` entries to 0 and returns the result of advancing the remaining entries
# (indices 1+Nr:length(state)).
# The `+` signature line declares the return type as `Bool`.
1598+ function advance_state! (state, Nr):: Bool
15861599 state_nr = view (state, 1 : Nr)
15871600 advance_state! (state_nr) && return true
1588- fill ! (state_nr, 0 )
1601+ _fill ! (state_nr, 0 )
15891602 advance_state! (view (state, 1 + Nr: length (state)))
15901603end
# Exchanges the elements of `x` at positions `i` and `j` in place via a tuple assignment.
15911604swap! (x:: AbstractVector , i:: Int , j:: Int ) = (x[j], x[i]) = (x[i], x[j])
@@ -1594,20 +1607,21 @@ function swap!(
15941607 src:: AbstractVector{Symbol} ,
15951608 offs:: AbstractVector{Int}
15961609)
1597- copyto ! (dest, src)
1610+ _copyto ! (dest, src)
15981611 for i ∈ eachindex (offs)
15991612 sᵢ = offs[i]
16001613 sᵢ == 0 || swap! (dest, i, i + sᵢ)
16011614 end
16021615end
16031616# This is not a good algorithm
1604- function Base. iterate (lo:: LoopOrders , (state_nr, state_r))
1617+ @inline function Base. iterate (lo:: LoopOrders , states)
1618+ (state_nr, state_r) = states
16051619 if advance_state! (state_nr)
16061620 swap! (nonreductview (lo), lo. syms_nr, state_nr)
16071621 else
16081622 advance_state! (state_r) || return nothing
1609- fill ! (state_nr, 0 )
1610- copyto ! (nonreductview (lo), lo. syms_nr)
1623+ _fill ! (state_nr, 0 )
1624+ _copyto ! (nonreductview (lo), lo. syms_nr)
16111625 swap! (reductview (lo), lo. syms_r, state_r)
16121626 end
16131627 lo. buff, (state_nr, state_r)
@@ -1644,7 +1658,7 @@ function choose_unroll_order(
16441658 cost_temp = evaluate_cost_unroll (ls, new_order, new_vec, lowest_cost, sld)
16451659 if cost_temp < lowest_cost
16461660 lowest_cost = cost_temp
1647- copyto ! (best_order, new_order)
1661+ _copyto ! (best_order, new_order)
16481662 best_vec = new_vec
16491663 end
16501664 end
@@ -1769,7 +1783,7 @@ function choose_tile(
17691783 bestu₂ = newu₂
17701784 bestu₁ = newu₁
17711785 loadelim = loadelim_temp
1772- copyto ! (best_order, new_order)
1786+ _copyto ! (best_order, new_order)
17731787 save_tilecost! (ls)
17741788 end
17751789 end
@@ -1822,7 +1836,7 @@ function choose_order_cost(ls::LoopSet, v::Int = 0)
18221836 mismatched = mismatchedstorereductions (ls)
18231837 if num_loops (ls) > 1 && tc ≤ uc
18241838 @assert ls. loop_order. bestorder === torder
1825- # copyto !(ls.loop_order.bestorder, torder)
1839+ # _copyto !(ls.loop_order.bestorder, torder)
18261840 return torder,
18271841 tunroll,
18281842 ttile,
@@ -1833,7 +1847,7 @@ function choose_order_cost(ls::LoopSet, v::Int = 0)
18331847 shouldinline
18341848 # return torder, tvec, 4, 4#5, 5
18351849 else
1836- copyto ! (ls. loop_order. bestorder, uorder)
1850+ _copyto ! (ls. loop_order. bestorder, uorder)
18371851 UF, uunroll = determine_unroll_factor (ls, uorder, uvec)
18381852 return uorder,
18391853 uunroll,
0 commit comments