@@ -34,3 +34,326 @@ pg_neon_at_runtime_but_uncompiled()
3434 }
3535 return 0 ;
3636}
37+
38+ #if (defined(__SSE2__ ) || defined(PG_ENABLE_ARM_NEON ))
39+
40+ void
41+ filter_shrink_X_SSE2_multi (Uint8 * srcpix , Uint8 * dstpix , int height ,
42+ int srcpitch , int dstpitch , int srcwidth ,
43+ int dstwidth )
44+ {
45+ // FIXME TODO: assumes height is multiple of 2
46+
47+ int srcdiff = srcpitch - (srcwidth * 4 ) + srcpitch ;
48+ int dstdiff = dstpitch - (dstwidth * 4 ) + dstpitch ;
49+ int x , y ;
50+
51+ __m128i src , src2 , dst , dst2 , accumulate , mm_xcounter , mm_xfrac ;
52+
53+ Uint8 * srcpix2 = srcpix + srcpitch ;
54+ Uint8 * dstpix2 = dstpix + dstpitch ;
55+
56+ int xspace = 0x04000 * srcwidth / dstwidth ; /* must be > 1 */
57+
58+ __m128i xrecip = _mm_set1_epi16 ((Uint16 )(0x40000000 / xspace ));
59+
60+ for (y = 0 ; y < height ; y += 2 ) {
61+ accumulate = _mm_setzero_si128 ();
62+ int xcounter = xspace ;
63+ for (x = 0 ; x < srcwidth ; x ++ ) {
64+ if (xcounter > 0x04000 ) {
65+ src = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix ),
66+ _mm_setzero_si128 ());
67+
68+ src2 = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix2 ),
69+ _mm_setzero_si128 ());
70+ src2 = _mm_slli_si128 (src2 , 8 ); // replace with
71+ src = _mm_add_epi16 (src , src2 ); // _mm_unpacklo_epi64?
72+
73+ accumulate = _mm_add_epi16 (accumulate , src );
74+ srcpix += 4 ;
75+ srcpix2 += 4 ;
76+ xcounter -= 0x04000 ;
77+ }
78+ else {
79+ int xfrac = 0x04000 - xcounter ;
80+ /* write out a destination pixel */
81+
82+ mm_xcounter = _mm_set1_epi16 (xcounter );
83+ mm_xfrac = _mm_set1_epi16 (xfrac );
84+
85+ src = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix ),
86+ _mm_setzero_si128 ());
87+
88+ src2 = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix2 ),
89+ _mm_setzero_si128 ());
90+ src2 = _mm_slli_si128 (src2 , 8 ); // replace with
91+ src = _mm_add_epi16 (src , src2 ); // _mm_unpacklo_epi64?
92+
93+ src = _mm_slli_epi16 (src , 2 );
94+ dst = _mm_mulhi_epu16 (src , mm_xcounter );
95+ dst = _mm_add_epi16 (dst , accumulate );
96+ accumulate = _mm_mulhi_epu16 (src , mm_xfrac );
97+
98+ dst = _mm_mulhi_epu16 (dst , xrecip );
99+ dst = _mm_packus_epi16 (dst , _mm_setzero_si128 ());
100+ _mm_storeu_si32 (dstpix , dst );
101+
102+ _mm_storeu_si32 (dstpix2 , _mm_srli_si128 (dst , 4 ));
103+
104+ dstpix += 4 ;
105+ dstpix2 += 4 ;
106+ srcpix += 4 ;
107+ srcpix2 += 4 ;
108+ xcounter = xspace - xfrac ;
109+ }
110+ }
111+ srcpix += srcdiff ;
112+ srcpix2 += srcdiff ;
113+ dstpix += dstdiff ;
114+ dstpix2 += dstdiff ;
115+ }
116+ }
117+
118+ void
119+ filter_shrink_X_SSE2 (Uint8 * srcpix , Uint8 * dstpix , int height , int srcpitch ,
120+ int dstpitch , int srcwidth , int dstwidth )
121+ {
122+ int srcdiff = srcpitch - (srcwidth * 4 );
123+ int dstdiff = dstpitch - (dstwidth * 4 );
124+ int x , y ;
125+ __m128i src , dst , accumulate , mm_xcounter , mm_xfrac ;
126+
127+ int xspace = 0x04000 * srcwidth / dstwidth ; /* must be > 1 */
128+ __m128i xrecip = _mm_set1_epi16 (0x40000000 / xspace );
129+
130+ for (y = 0 ; y < height ; y ++ ) {
131+ accumulate = _mm_setzero_si128 ();
132+ int xcounter = xspace ;
133+ for (x = 0 ; x < srcwidth ; x ++ ) {
134+ if (xcounter > 0x04000 ) {
135+ src = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix ),
136+ _mm_setzero_si128 ());
137+
138+ accumulate = _mm_add_epi16 (accumulate , src );
139+ srcpix += 4 ;
140+ xcounter -= 0x04000 ;
141+ }
142+ else {
143+ int xfrac = 0x04000 - xcounter ;
144+ /* write out a destination pixel */
145+
146+ mm_xcounter = _mm_set1_epi16 (xcounter );
147+ mm_xfrac = _mm_set1_epi16 (xfrac );
148+
149+ src = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix ),
150+ _mm_setzero_si128 ());
151+
152+ src = _mm_slli_epi16 (src , 2 );
153+ dst = _mm_mulhi_epu16 (src , mm_xcounter );
154+ dst = _mm_add_epi16 (dst , accumulate );
155+ accumulate = _mm_mulhi_epu16 (src , mm_xfrac );
156+
157+ dst = _mm_mulhi_epu16 (dst , xrecip );
158+ dst = _mm_packus_epi16 (dst , _mm_setzero_si128 ());
159+ _mm_storeu_si32 (dstpix , dst );
160+
161+ dstpix += 4 ;
162+ srcpix += 4 ;
163+
164+ xcounter = xspace - xfrac ;
165+ }
166+ }
167+ srcpix += srcdiff ;
168+ dstpix += dstdiff ;
169+ }
170+ }
171+
172+ void
173+ filter_shrink_Y_SSE2 (Uint8 * srcpix , Uint8 * dstpix , int width , int srcpitch ,
174+ int dstpitch , int srcheight , int dstheight )
175+ {
176+ int srcdiff = srcpitch - (width * 4 );
177+ int dstdiff = dstpitch - (width * 4 );
178+ int x , y ;
179+ __m128i src , dst , mm_acc , mm_yfrac , mm_ycounter ;
180+
181+ int yspace = 0x04000 * srcheight / dstheight ; /* must be > 1 */
182+ __m128i yrecip = _mm_set1_epi16 (0x40000000 / yspace );
183+ int ycounter = yspace ;
184+
185+ Uint16 * templine ;
186+ // TODO replace malloc+memset with calloc?
187+ /* allocate and clear a memory area for storing the accumulator line */
188+ templine = (Uint16 * )malloc (dstpitch * 2 );
189+ if (templine == NULL )
190+ return ;
191+ memset (templine , 0 , dstpitch * 2 );
192+
193+ for (y = 0 ; y < srcheight ; y ++ ) {
194+ Uint16 * accumulate = templine ;
195+ if (ycounter > 0x04000 ) {
196+ // TODO could iterate multipixel at a time
197+ for (x = 0 ; x < width ; x ++ ) {
198+ src = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix ),
199+ _mm_setzero_si128 ());
200+ _mm_storeu_si64 (
201+ accumulate ,
202+ _mm_add_epi16 (_mm_loadu_si64 (accumulate ), src ));
203+ accumulate += 4 ; // 4 Uint16s, so 8 bytes
204+ srcpix += 4 ;
205+ }
206+ ycounter -= 0x04000 ;
207+ }
208+ else {
209+ int yfrac = 0x04000 - ycounter ;
210+ /* write out a destination line */
211+ // TODO could iterate multipixel at a time
212+ for (x = 0 ; x < width ; x ++ ) {
213+ src = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcpix ),
214+ _mm_setzero_si128 ());
215+ srcpix += 4 ;
216+
217+ mm_acc = _mm_loadu_si64 (accumulate );
218+
219+ mm_yfrac = _mm_set1_epi16 (yfrac );
220+ mm_ycounter = _mm_set1_epi16 (ycounter );
221+
222+ src = _mm_slli_epi16 (src , 2 );
223+ dst = _mm_mulhi_epu16 (src , mm_yfrac );
224+ src = _mm_mulhi_epu16 (src , mm_ycounter );
225+
226+ _mm_storeu_si64 (accumulate , dst );
227+ accumulate += 4 ; // 4 Uint16s, so 8 bytes
228+
229+ dst = _mm_add_epi16 (src , mm_acc );
230+ dst = _mm_mulhi_epu16 (dst , yrecip );
231+ dst = _mm_packus_epi16 (dst , _mm_setzero_si128 ());
232+ _mm_storeu_si32 (dstpix , dst );
233+ dstpix += 4 ;
234+ }
235+ dstpix += dstdiff ;
236+ ycounter = yspace - yfrac ;
237+ }
238+ srcpix += srcdiff ;
239+ } /* for (int y = 0; y < srcheight; y++) */
240+
241+ /* free the temporary memory */
242+ free (templine );
243+ }
244+
245+ void
246+ filter_expand_X_SSE2 (Uint8 * srcpix , Uint8 * dstpix , int height , int srcpitch ,
247+ int dstpitch , int srcwidth , int dstwidth )
248+ {
249+ int dstdiff = dstpitch - (dstwidth * 4 );
250+ int * xidx0 , * xmult_combined ;
251+ int x , y ;
252+ const int factorwidth = 8 ;
253+
254+ #ifdef _MSC_VER
255+ /* Make MSVC static analyzer happy by assuring dstwidth >= 2 to suppress
256+ * a false analyzer report */
257+ __analysis_assume (dstwidth >= 2 );
258+ #endif
259+
260+ /* Allocate memory for factors */
261+ xidx0 = malloc (dstwidth * 4 );
262+ if (xidx0 == 0 )
263+ return ;
264+ xmult_combined = (int * )malloc (dstwidth * factorwidth );
265+ if (xmult_combined == 0 ) {
266+ free (xidx0 );
267+ return ;
268+ }
269+
270+ /* Create multiplier factors and starting indices and put them in arrays */
271+ for (x = 0 ; x < dstwidth ; x ++ ) {
272+ // Could it be worth it to reduce the fixed point there to fit
273+ // inside 16 bits (0xFF), and then pack xidx0 in with mult factors?
274+ int xm1 = 0x100 * ((x * (srcwidth - 1 )) % dstwidth ) / dstwidth ;
275+ int xm0 = 0x100 - xm1 ;
276+ xidx0 [x ] = x * (srcwidth - 1 ) / dstwidth ;
277+
278+ // packs xm0 and xm1 scaling factors into a combined array, for easy
279+ // loading
280+ xmult_combined [x * 2 ] = xm0 | (xm0 << 16 );
281+ xmult_combined [x * 2 + 1 ] = xm1 | (xm1 << 16 );
282+ }
283+
284+ __m128i src , mmxid , mult0 , mult1 , multcombined , dst ;
285+
286+ /* Do the scaling in raster order so we don't trash the cache */
287+ for (y = 0 ; y < height ; y ++ ) {
288+ Uint8 * srcrow0 = srcpix + y * srcpitch ;
289+ for (x = 0 ; x < dstwidth ; x ++ ) {
290+ Uint8 * src_p =
291+ srcrow0 + xidx0 [x ] * 4 ; // *8 now because of factorwidth?
292+
293+ src =
294+ _mm_unpacklo_epi8 (_mm_loadu_si64 (src_p ), _mm_setzero_si128 ());
295+
296+ // uses combined multipliers against 2 src pixels
297+ // xm0 against src[0-3] (1 px), and xm1 against xrc[4-7] (1 px)
298+ multcombined = _mm_shuffle_epi32 (
299+ _mm_loadu_si64 (xmult_combined + x * 2 ), 0b01010000 );
300+
301+ src = _mm_mullo_epi16 (src , multcombined );
302+
303+ dst = _mm_bsrli_si128 (src , 8 );
304+ dst = _mm_add_epi16 (src , dst );
305+ dst = _mm_srli_epi16 (dst , 8 );
306+ dst = _mm_packus_epi16 (dst , _mm_setzero_si128 ());
307+ _mm_storeu_si32 (dstpix , dst );
308+
309+ dstpix += 4 ;
310+ }
311+ dstpix += dstdiff ;
312+ }
313+
314+ /* free memory */
315+ free (xidx0 );
316+ free (xmult_combined );
317+ }
318+
319+ void
320+ filter_expand_Y_SSE2 (Uint8 * srcpix , Uint8 * dstpix , int width , int srcpitch ,
321+ int dstpitch , int srcheight , int dstheight )
322+ {
323+ int x , y ;
324+
325+ __m128i src0 , src1 , dst , ymult0_mm , ymult1_mm ;
326+
327+ for (y = 0 ; y < dstheight ; y ++ ) {
328+ int yidx0 = y * (srcheight - 1 ) / dstheight ;
329+ Uint8 * srcrow0 = srcpix + yidx0 * srcpitch ;
330+ Uint8 * srcrow1 = srcrow0 + srcpitch ;
331+ int ymult1 = 0x0100 * ((y * (srcheight - 1 )) % dstheight ) / dstheight ;
332+ int ymult0 = 0x0100 - ymult1 ;
333+
334+ ymult0_mm = _mm_set1_epi16 (ymult0 );
335+ ymult1_mm = _mm_set1_epi16 (ymult1 );
336+
337+ for (x = 0 ; x < width ; x ++ ) {
338+ src0 = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcrow0 ),
339+ _mm_setzero_si128 ());
340+ src1 = _mm_unpacklo_epi8 (_mm_loadu_si32 (srcrow1 ),
341+ _mm_setzero_si128 ());
342+
343+ src0 = _mm_mullo_epi16 (src0 , ymult0_mm );
344+ src1 = _mm_mullo_epi16 (src1 , ymult1_mm );
345+
346+ dst = _mm_add_epi16 (src0 , src1 );
347+ dst = _mm_srli_epi16 (dst , 8 );
348+ dst = _mm_packus_epi16 (dst , _mm_setzero_si128 ());
349+ _mm_storeu_si32 (dstpix , dst );
350+
351+ srcrow0 += 4 ;
352+ srcrow1 += 4 ;
353+ dstpix += 4 ;
354+ }
355+ Uint8 * dstrow = dstpix + y * dstpitch ;
356+ }
357+ }
358+
359+ #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/
0 commit comments