@@ -111,7 +111,7 @@ function cost(
111111 # cannot shuffle false means reject curly
112112 # either false means shuffle
113113 dont_shuffle =
114- rejectinterleave (op) && (cannot_shuffle (op, u₁, u₂, contigind, indices))
114+ (Wshift > 3 ) || ( rejectinterleave (op) && (cannot_shuffle (op, u₁, u₂, contigind, indices) ))
115115 if dont_shuffle
116116 # offset = 0.0 # gather/scatter, alignment doesn't matter
117117 r = 1 << shifter
@@ -301,7 +301,6 @@ function unroll_no_reductions(ls, order, vloopsym)
301301 u = if compute_rt ≤ 1
302302 4
303303 elseif compute_rt > memory_rt
304- # @show load_rt, store_rt, compute_rt, compute_l, rpc, rpp
305304 # if compute_rt > 40
306305 # max(VectorizationBase.nextpow2( min( 4, round(Int, compute_rt / memory_rt) ) ), 1)
307306 # else
@@ -312,7 +311,6 @@ function unroll_no_reductions(ls, order, vloopsym)
312311 else
313312 max (1 , min (4 , round (Int, 1.75 compute_rt / load_rt)))
314313 end
315- # @show load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp
316314 # u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
317315 # commented out here is to decide to align loops
318316 # if memory_rt > compute_rt && isone(u) && (length(order) > 1) && (last(order) === vloopsym) && length(getloop(ls, last(order))) > 8W
@@ -381,7 +379,6 @@ function determine_unroll_factor(
381379 end
382380 recip_throughput =
383381 max (compute_recip_throughput, load_recip_throughput, store_recip_throughput)
384- # @show latency, recip_throughput
385382 recip_throughput, latency
386383end
387384function count_reductions (ls:: LoopSet )
@@ -992,7 +989,6 @@ function load_elimination_cost_factor!(
992989 # cost_vec[1] -= rt
993990 # cost_vec[1] -= 0.5625 * iters
994991 # cost_vec[1] -= 0.5625 * iters / 2
995- # @show rto, 0.8rt, op
996992 # reg_pressure[1] += 0.25rp
997993 reg_pressure[1 ] += 0.25 rp
998994 cost_vec[2 ] += rt
@@ -1156,7 +1152,6 @@ function evaluate_cost_tile!(
11561152 @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
11571153 cacheunrolled! (ls, u₁loopsym, u₂loopsym, vloopsym)
11581154 # println("\n")
1159- # @show order unrollsyms
11601155 # u₂loopsym = order[1]
11611156 # u₁loopsym = order[2]
11621157 ops = operations (ls)
@@ -1248,9 +1243,6 @@ function evaluate_cost_tile!(
12481243 inner₁ = u₁reached | depends_on_u₂
12491244 inner₂ = u₂reached | depends_on_u₁
12501245 # if isconstantop(op)
1251- # if iscompute(op)
1252- # @show inner₁, depends_on_u₁, inner₂, depends_on_u₂, op
1253- # end
12541246 reduced_by_unrolling[1 , 2 , id] = inner₁ & ! depends_on_u₁
12551247 reduced_by_unrolling[2 , 2 , id] = inner₂ & ! depends_on_u₂
12561248 # else
@@ -1285,7 +1277,6 @@ function evaluate_cost_tile!(
12851277 size_T,
12861278 opisininnerloop,
12871279 )
1288- # println("constoffelim")
12891280 continue
12901281 elseif load_elimination_cost_factor! (
12911282 cost_vec,
@@ -1298,11 +1289,8 @@ function evaluate_cost_tile!(
12981289 Wshift,
12991290 size_T,
13001291 )
1301- # println("loadelim")
1302- # A[i,j-1], A[i,j]
13031292 continue
13041293 end
1305- # elseif isconstant(op)
13061294 end
13071295 rt, lat, rp = cost (ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
13081296 if isload (op) & (! prefetch_good_idea)
@@ -1315,9 +1303,8 @@ function evaluate_cost_tile!(
13151303 else # FIXME : hack to not go crazy
13161304 max (zero (rp), rp - one (rp))
13171305 end
1318- rto = rt
1306+ # rto = rt
13191307 rt *= iters[id]
1320- # @show (u₁reducesrt, u₂reducesrt), (u₁reducesrp, u₂reducesrp), rto, rt, lat, rp, op
13211308 if isstore (op) & (! u₁reducesrt) & (! u₂reducesrt)
13221309 irreducible_storecosts += rt
13231310 end
@@ -1340,7 +1327,6 @@ function evaluate_cost_tile!(
13401327 end
13411328 end
13421329 end
1343- # @show u₁reducesrp, u₂reducesrp, rp, op
13441330 update_reg_pres! (reg_pressure, rp, u₁reducesrp, u₂reducesrp)
13451331 # end
13461332 # update_costs!(reg_pressure, rp, u₁reducesrp, u₂reducesrp)
0 commit comments