Commit 4e9418a

Initial smoothscale intrinsic implementation
1 parent 644fc36 commit 4e9418a

File tree: 4 files changed (+360, −3 lines)

src_c/simd_transform.h

Lines changed: 15 additions & 0 deletions
@@ -9,6 +9,21 @@
 
 // SSE2 functions
 #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+
+// smoothscale filters
+void
+filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
+                     int dstpitch, int srcwidth, int dstwidth);
+void
+filter_shrink_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+                     int dstpitch, int srcheight, int dstheight);
+void
+filter_expand_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
+                     int dstpitch, int srcwidth, int dstwidth);
+void
+filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+                     int dstpitch, int srcheight, int dstheight);
+
 #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */
 
 // AVX2 functions

src_c/simd_transform_sse2.c

Lines changed: 323 additions & 0 deletions
@@ -34,3 +34,326 @@ pg_neon_at_runtime_but_uncompiled()
     }
     return 0;
 }
+
+#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON))
+
+void
+filter_shrink_X_SSE2_multi(Uint8 *srcpix, Uint8 *dstpix, int height,
+                           int srcpitch, int dstpitch, int srcwidth,
+                           int dstwidth)
+{
+    // FIXME TODO: assumes height is multiple of 2
+
+    int srcdiff = srcpitch - (srcwidth * 4) + srcpitch;
+    int dstdiff = dstpitch - (dstwidth * 4) + dstpitch;
+    int x, y;
+
+    __m128i src, src2, dst, dst2, accumulate, mm_xcounter, mm_xfrac;
+
+    Uint8 *srcpix2 = srcpix + srcpitch;
+    Uint8 *dstpix2 = dstpix + dstpitch;
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+
+    __m128i xrecip = _mm_set1_epi16((Uint16)(0x40000000 / xspace));
+
+    for (y = 0; y < height; y += 2) {
+        accumulate = _mm_setzero_si128();
+        int xcounter = xspace;
+        for (x = 0; x < srcwidth; x++) {
+            if (xcounter > 0x04000) {
+                src = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix),
+                                        _mm_setzero_si128());
+
+                src2 = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix2),
+                                         _mm_setzero_si128());
+                src2 = _mm_slli_si128(src2, 8); // replace with
+                src = _mm_add_epi16(src, src2); // _mm_unpacklo_epi64?
+
+                accumulate = _mm_add_epi16(accumulate, src);
+                srcpix += 4;
+                srcpix2 += 4;
+                xcounter -= 0x04000;
+            }
+            else {
+                int xfrac = 0x04000 - xcounter;
+                /* write out a destination pixel */
+
+                mm_xcounter = _mm_set1_epi16(xcounter);
+                mm_xfrac = _mm_set1_epi16(xfrac);
+
+                src = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix),
+                                        _mm_setzero_si128());
+
+                src2 = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix2),
+                                         _mm_setzero_si128());
+                src2 = _mm_slli_si128(src2, 8); // replace with
+                src = _mm_add_epi16(src, src2); // _mm_unpacklo_epi64?
+
+                src = _mm_slli_epi16(src, 2);
+                dst = _mm_mulhi_epu16(src, mm_xcounter);
+                dst = _mm_add_epi16(dst, accumulate);
+                accumulate = _mm_mulhi_epu16(src, mm_xfrac);
+
+                dst = _mm_mulhi_epu16(dst, xrecip);
+                dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+                _mm_storeu_si32(dstpix, dst);
+
+                _mm_storeu_si32(dstpix2, _mm_srli_si128(dst, 4));
+
+                dstpix += 4;
+                dstpix2 += 4;
+                srcpix += 4;
+                srcpix2 += 4;
+                xcounter = xspace - xfrac;
+            }
+        }
+        srcpix += srcdiff;
+        srcpix2 += srcdiff;
+        dstpix += dstdiff;
+        dstpix2 += dstdiff;
+    }
+}
+
+void
+filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
+                     int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+    int x, y;
+    __m128i src, dst, accumulate, mm_xcounter, mm_xfrac;
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    __m128i xrecip = _mm_set1_epi16(0x40000000 / xspace);
+
+    for (y = 0; y < height; y++) {
+        accumulate = _mm_setzero_si128();
+        int xcounter = xspace;
+        for (x = 0; x < srcwidth; x++) {
+            if (xcounter > 0x04000) {
+                src = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix),
+                                        _mm_setzero_si128());
+
+                accumulate = _mm_add_epi16(accumulate, src);
+                srcpix += 4;
+                xcounter -= 0x04000;
+            }
+            else {
+                int xfrac = 0x04000 - xcounter;
+                /* write out a destination pixel */
+
+                mm_xcounter = _mm_set1_epi16(xcounter);
+                mm_xfrac = _mm_set1_epi16(xfrac);
+
+                src = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix),
+                                        _mm_setzero_si128());
+
+                src = _mm_slli_epi16(src, 2);
+                dst = _mm_mulhi_epu16(src, mm_xcounter);
+                dst = _mm_add_epi16(dst, accumulate);
+                accumulate = _mm_mulhi_epu16(src, mm_xfrac);
+
+                dst = _mm_mulhi_epu16(dst, xrecip);
+                dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+                _mm_storeu_si32(dstpix, dst);
+
+                dstpix += 4;
+                srcpix += 4;
+
+                xcounter = xspace - xfrac;
+            }
+        }
+        srcpix += srcdiff;
+        dstpix += dstdiff;
+    }
+}
+
+void
+filter_shrink_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+                     int dstpitch, int srcheight, int dstheight)
+{
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int x, y;
+    __m128i src, dst, mm_acc, mm_yfrac, mm_ycounter;
+
+    int yspace = 0x04000 * srcheight / dstheight; /* must be > 1 */
+    __m128i yrecip = _mm_set1_epi16(0x40000000 / yspace);
+    int ycounter = yspace;
+
+    Uint16 *templine;
+    // TODO replace malloc+memset with calloc?
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *)malloc(dstpitch * 2);
+    if (templine == NULL)
+        return;
+    memset(templine, 0, dstpitch * 2);
+
+    for (y = 0; y < srcheight; y++) {
+        Uint16 *accumulate = templine;
+        if (ycounter > 0x04000) {
+            // TODO could iterate multipixel at a time
+            for (x = 0; x < width; x++) {
+                src = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix),
+                                        _mm_setzero_si128());
+                _mm_storeu_si64(
+                    accumulate,
+                    _mm_add_epi16(_mm_loadu_si64(accumulate), src));
+                accumulate += 4; // 4 Uint16s, so 8 bytes
+                srcpix += 4;
+            }
+            ycounter -= 0x04000;
+        }
+        else {
+            int yfrac = 0x04000 - ycounter;
+            /* write out a destination line */
+            // TODO could iterate multipixel at a time
+            for (x = 0; x < width; x++) {
+                src = _mm_unpacklo_epi8(_mm_loadu_si32(srcpix),
+                                        _mm_setzero_si128());
+                srcpix += 4;
+
+                mm_acc = _mm_loadu_si64(accumulate);
+
+                mm_yfrac = _mm_set1_epi16(yfrac);
+                mm_ycounter = _mm_set1_epi16(ycounter);
+
+                src = _mm_slli_epi16(src, 2);
+                dst = _mm_mulhi_epu16(src, mm_yfrac);
+                src = _mm_mulhi_epu16(src, mm_ycounter);
+
+                _mm_storeu_si64(accumulate, dst);
+                accumulate += 4; // 4 Uint16s, so 8 bytes
+
+                dst = _mm_add_epi16(src, mm_acc);
+                dst = _mm_mulhi_epu16(dst, yrecip);
+                dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+                _mm_storeu_si32(dstpix, dst);
+                dstpix += 4;
+            }
+            dstpix += dstdiff;
+            ycounter = yspace - yfrac;
+        }
+        srcpix += srcdiff;
+    } /* for (int y = 0; y < srcheight; y++) */
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+void
+filter_expand_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
+                     int dstpitch, int srcwidth, int dstwidth)
+{
+    int dstdiff = dstpitch - (dstwidth * 4);
+    int *xidx0, *xmult_combined;
+    int x, y;
+    const int factorwidth = 8;
+
+#ifdef _MSC_VER
+    /* Make MSVC static analyzer happy by assuring dstwidth >= 2 to suppress
+     * a false analyzer report */
+    __analysis_assume(dstwidth >= 2);
+#endif
+
+    /* Allocate memory for factors */
+    xidx0 = malloc(dstwidth * 4);
+    if (xidx0 == 0)
+        return;
+    xmult_combined = (int *)malloc(dstwidth * factorwidth);
+    if (xmult_combined == 0) {
+        free(xidx0);
+        return;
+    }
+
+    /* Create multiplier factors and starting indices and put them in arrays */
+    for (x = 0; x < dstwidth; x++) {
+        // Could it be worth it to reduce the fixed point there to fit
+        // inside 16 bits (0xFF), and then pack xidx0 in with mult factors?
+        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
+        int xm0 = 0x100 - xm1;
+        xidx0[x] = x * (srcwidth - 1) / dstwidth;
+
+        // packs xm0 and xm1 scaling factors into a combined array, for easy
+        // loading
+        xmult_combined[x * 2] = xm0 | (xm0 << 16);
+        xmult_combined[x * 2 + 1] = xm1 | (xm1 << 16);
+    }
+
+    __m128i src, mmxid, mult0, mult1, multcombined, dst;
+
+    /* Do the scaling in raster order so we don't trash the cache */
+    for (y = 0; y < height; y++) {
+        Uint8 *srcrow0 = srcpix + y * srcpitch;
+        for (x = 0; x < dstwidth; x++) {
+            Uint8 *src_p =
+                srcrow0 + xidx0[x] * 4; // *8 now because of factorwidth?
+
+            src =
+                _mm_unpacklo_epi8(_mm_loadu_si64(src_p), _mm_setzero_si128());
+
+            // uses combined multipliers against 2 src pixels
+            // xm0 against src[0-3] (1 px), and xm1 against src[4-7] (1 px)
+            multcombined = _mm_shuffle_epi32(
+                _mm_loadu_si64(xmult_combined + x * 2), 0b01010000);
+
+            src = _mm_mullo_epi16(src, multcombined);
+
+            dst = _mm_bsrli_si128(src, 8);
+            dst = _mm_add_epi16(src, dst);
+            dst = _mm_srli_epi16(dst, 8);
+            dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+            _mm_storeu_si32(dstpix, dst);
+
+            dstpix += 4;
+        }
+        dstpix += dstdiff;
+    }
+
+    /* free memory */
+    free(xidx0);
+    free(xmult_combined);
+}
+
+void
+filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+                     int dstpitch, int srcheight, int dstheight)
+{
+    int x, y;
+
+    __m128i src0, src1, dst, ymult0_mm, ymult1_mm;
+
+    for (y = 0; y < dstheight; y++) {
+        int yidx0 = y * (srcheight - 1) / dstheight;
+        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
+        Uint8 *srcrow1 = srcrow0 + srcpitch;
+        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
+        int ymult0 = 0x0100 - ymult1;
+
+        ymult0_mm = _mm_set1_epi16(ymult0);
+        ymult1_mm = _mm_set1_epi16(ymult1);
+
+        for (x = 0; x < width; x++) {
+            src0 = _mm_unpacklo_epi8(_mm_loadu_si32(srcrow0),
+                                     _mm_setzero_si128());
+            src1 = _mm_unpacklo_epi8(_mm_loadu_si32(srcrow1),
+                                     _mm_setzero_si128());
+
+            src0 = _mm_mullo_epi16(src0, ymult0_mm);
+            src1 = _mm_mullo_epi16(src1, ymult1_mm);
+
+            dst = _mm_add_epi16(src0, src1);
+            dst = _mm_srli_epi16(dst, 8);
+            dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+            _mm_storeu_si32(dstpix, dst);
+
+            srcrow0 += 4;
+            srcrow1 += 4;
+            dstpix += 4;
+        }
+        Uint8 *dstrow = dstpix + y * dstpitch;
+    }
+}
+
+#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/
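
For reference, here is a minimal scalar sketch (not part of this commit; the function and variable names are illustrative) of the 16.14 fixed-point accumulation that filter_shrink_X_SSE2 performs on four channels at once. In this scheme 0x4000 stands for one source pixel of coverage, and xrecip = 0x40000000 / xspace supplies the final division by the shrink factor:

    /* Scalar sketch of the fixed-point X-shrink for one 8-bit channel.
     * Assumes srcwidth > dstwidth (a true shrink) and a modest shrink
     * factor so the 16.14 terms stay in range. */
    #include <stdint.h>

    static void
    shrink_x_one_channel(const uint8_t *src, uint8_t *dst, int srcwidth,
                         int dstwidth)
    {
        int xspace = 0x4000 * srcwidth / dstwidth; /* coverage per dst pixel */
        int xrecip = 0x40000000 / xspace;          /* 1/shrink factor, 16.16 */
        int xcounter = xspace;
        int accumulate = 0;

        for (int x = 0; x < srcwidth; x++) {
            if (xcounter > 0x4000) {
                /* whole source pixel falls inside the current dst pixel */
                accumulate += src[x];
                xcounter -= 0x4000;
            }
            else {
                /* source pixel straddles a dst boundary: split it */
                int xfrac = 0x4000 - xcounter;
                int total = accumulate + ((src[x] * xcounter) >> 14);
                *dst++ = (uint8_t)((total * xrecip) >> 16); /* divide by factor */
                accumulate = (src[x] * xfrac) >> 14;        /* carry remainder */
                xcounter = xspace - xfrac;
            }
        }
    }

The SSE2 version gets the same >>14 scaling by shifting the unpacked pixels left by 2 and letting the implicit >>16 of _mm_mulhi_epu16 do the rest.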

src_c/transform.c

Lines changed: 19 additions & 1 deletion
@@ -1221,7 +1221,14 @@ smoothscale_init(struct _module_state *st)
     }
 
 #ifdef SCALE_MMX_SUPPORT
-    if (SDL_HasSSE()) {
+    if (SDL_HasSSE2()) {
+        st->filter_type = "SSE2";
+        st->filter_shrink_X = filter_shrink_X_SSE2;
+        st->filter_shrink_Y = filter_shrink_Y_SSE2;
+        st->filter_expand_X = filter_expand_X_SSE2;
+        st->filter_expand_Y = filter_expand_Y_SSE2;
+    }
+    else if (SDL_HasSSE()) {
         st->filter_type = "SSE";
         st->filter_shrink_X = filter_shrink_X_SSE;
         st->filter_shrink_Y = filter_shrink_Y_SSE;
@@ -1585,6 +1592,17 @@ surf_set_smoothscale_backend(PyObject *self, PyObject *args, PyObject *kwargs)
         st->filter_expand_X = filter_expand_X_SSE;
         st->filter_expand_Y = filter_expand_Y_SSE;
     }
+    else if (strcmp(type, "SSE2") == 0) {
+        if (!SDL_HasSSE2()) {
+            return RAISE(PyExc_ValueError,
+                         "SSE2 not supported on this machine");
+        }
+        st->filter_type = "SSE2";
+        st->filter_shrink_X = filter_shrink_X_SSE2;
+        st->filter_shrink_Y = filter_shrink_Y_SSE2;
+        st->filter_expand_X = filter_expand_X_SSE2;
+        st->filter_expand_Y = filter_expand_Y_SSE2;
+    }
     else {
         return PyErr_Format(PyExc_ValueError, "Unknown backend type %s", type);
     }