Skip to content

Commit 6111af4

Browse files
authored
Merge pull request #2566 from itzpr3d4t0r/add-sse2-fillers
Add SSE2 fillers
2 parents 5482b63 + ecce2b0 commit 6111af4

File tree

6 files changed

+364
-38
lines changed

6 files changed

+364
-38
lines changed

buildconfig/Setup.Android.SDL2.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ key src_c/key.c $(SDL) $(DEBUG)
5050
mouse src_c/mouse.c $(SDL) $(DEBUG)
5151
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
5252
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
53-
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
53+
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG)
5454
surflock src_c/surflock.c $(SDL) $(DEBUG)
5555
time src_c/time.c $(SDL) $(DEBUG)
5656
joystick src_c/joystick.c $(SDL) $(DEBUG)

buildconfig/Setup.Emscripten.SDL2.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ _sdl2.controller_old src_c/void.c
6969
#_sdl2.touch src_c/_sdl2/touch.c $(SDL) $(DEBUG) -Isrc_c
7070
_sdl2.touch src_c/void.c
7171

72-
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
72+
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
7373
transform src_c/void.c
7474

7575

buildconfig/Setup.SDL2.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ key src_c/key.c $(SDL) $(DEBUG)
6060
mouse src_c/mouse.c $(SDL) $(DEBUG)
6161
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
6262
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
63-
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
63+
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG)
6464
surflock src_c/surflock.c $(SDL) $(DEBUG)
6565
time src_c/time.c $(SDL) $(DEBUG)
6666
joystick src_c/joystick.c $(SDL) $(DEBUG)

src_c/simd_fill.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,56 @@
11
#define NO_PYGAME_C_API
22
#include "_surface.h"
33

4+
#if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__)
5+
// arm64 has neon optimisations enabled by default, even when fpu=neon is not
6+
// passed
7+
#define PG_ENABLE_ARM_NEON 1
8+
#endif
9+
10+
/* See if we are compiled 64 bit on GCC or MSVC */
11+
#if _WIN32 || _WIN64
12+
#if _WIN64
13+
#define ENV64BIT
14+
#endif
15+
#endif
16+
17+
// Check GCC
18+
#if __GNUC__
19+
#if __x86_64__ || __ppc64__ || __aarch64__
20+
#define ENV64BIT
21+
#endif
22+
#endif
23+
24+
#if PG_ENABLE_ARM_NEON
25+
// sse2neon.h is from here: https://github.com/DLTcollab/sse2neon
26+
#include "include/sse2neon.h"
27+
#endif /* PG_ENABLE_ARM_NEON */
28+
29+
#if defined(__SSE2__)
30+
#define PG_ENABLE_SSE_NEON 1
31+
#elif PG_ENABLE_ARM_NEON
32+
#define PG_ENABLE_SSE_NEON 1
33+
#else
34+
#define PG_ENABLE_SSE_NEON 0
35+
#endif
36+
437
int
538
_pg_has_avx2();
639

40+
/* This returns True if either SSE2 or NEON is present at runtime.
41+
* Relevant because they use the same codepaths. Only the relevant runtime
42+
* SDL cpu feature check is compiled in.*/
43+
int
44+
_pg_HasSSE_NEON();
45+
746
// AVX2 functions
847
int
948
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
1049
Uint32 color);
1150
int
1251
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
1352
Uint32 color);
53+
1454
int
1555
surface_fill_blend_sub_avx2(SDL_Surface *surface, SDL_Rect *rect,
1656
Uint32 color);
@@ -35,3 +75,34 @@ surface_fill_blend_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
3575
int
3676
surface_fill_blend_rgba_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
3777
Uint32 color);
78+
// SSE2 functions
79+
int
80+
surface_fill_blend_add_sse2(SDL_Surface *surface, SDL_Rect *rect,
81+
Uint32 color);
82+
int
83+
surface_fill_blend_rgba_add_sse2(SDL_Surface *surface, SDL_Rect *rect,
84+
Uint32 color);
85+
int
86+
surface_fill_blend_sub_sse2(SDL_Surface *surface, SDL_Rect *rect,
87+
Uint32 color);
88+
int
89+
surface_fill_blend_rgba_sub_sse2(SDL_Surface *surface, SDL_Rect *rect,
90+
Uint32 color);
91+
int
92+
surface_fill_blend_mult_sse2(SDL_Surface *surface, SDL_Rect *rect,
93+
Uint32 color);
94+
int
95+
surface_fill_blend_rgba_mult_sse2(SDL_Surface *surface, SDL_Rect *rect,
96+
Uint32 color);
97+
int
98+
surface_fill_blend_min_sse2(SDL_Surface *surface, SDL_Rect *rect,
99+
Uint32 color);
100+
int
101+
surface_fill_blend_rgba_min_sse2(SDL_Surface *surface, SDL_Rect *rect,
102+
Uint32 color);
103+
int
104+
surface_fill_blend_max_sse2(SDL_Surface *surface, SDL_Rect *rect,
105+
Uint32 color);
106+
int
107+
surface_fill_blend_rgba_max_sse2(SDL_Surface *surface, SDL_Rect *rect,
108+
Uint32 color);

src_c/simd_surface_fill_sse2.c

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#include "simd_fill.h"
2+
3+
/* Abort helper used by the stub fillers that are compiled when neither SSE2
 * nor NEON support is available at build time. It prints a diagnostic and
 * then terminates through PG_EXIT(1).
 *
 * The two statements are wrapped in do { ... } while (0) so the macro always
 * behaves as a single statement, e.g. under an unbraced `if`. The caller
 * supplies the trailing semicolon. */
#define BAD_SSE2_FUNCTION_CALL \
    do { \
        printf( \
            "Fatal Error: Attempted calling an SSE2 function when both compile " \
            "time and runtime support is missing. If you are seeing this " \
            "message, you have stumbled across a pygame bug, please report it " \
            "to the devs!"); \
        PG_EXIT(1); \
    } while (0)
10+
11+
/* Runtime probe for the 128-bit SIMD fill path.
 *
 * Exactly one SDL CPU-feature query is compiled in, chosen at build time:
 * SDL_HasSSE2() when __SSE2__ is defined, otherwise SDL_HasNEON() when
 * PG_ENABLE_ARM_NEON is non-zero. When neither applies the function is a
 * constant 0.
 *
 * Takes no arguments; the explicit (void) gives the definition a real
 * prototype so that stray arguments are diagnosed by the compiler.
 *
 * Returns the result of the selected SDL query (non-zero when the
 * instruction set is reported as available), or 0 when no SIMD path was
 * compiled in. */
int
_pg_HasSSE_NEON(void)
{
#if defined(__SSE2__)
    return SDL_HasSSE2();
#elif PG_ENABLE_ARM_NEON
    return SDL_HasNEON();
#else
    return 0;
#endif
}
22+
23+
/* SETUP_SSE2_FILLER(COLOR_PROCESS_CODE): prologue shared by every generated
 * filler. It expects `surface`, `rect` and `color` to be in scope and
 * declares the locals that RUN_SSE2_FILLER and the FILL_CODE fragments use
 * by name:
 *   width, height  - taken from rect->w / rect->h
 *   skip           - Uint32 elements between the end of one rect row and the
 *                    start of the next (pitch / 4 - width)
 *   n_iters_4      - number of whole 4-pixel groups per row
 *   pxl_excess     - leftover pixels per row (width % 4)
 *   i, j           - loop counters
 *   pixels         - pointer to the first pixel of the rect
 *   mm128_dst      - working register for destination pixels
 *   amask          - alpha mask of the surface format
 *   mm128_color    - color replicated into all four 32-bit lanes
 * COLOR_PROCESS_CODE runs only when amask is non-zero, and before the color
 * is broadcast, so it can adjust `color` in place.
 * The pitch / 4 arithmetic treats every pixel as one Uint32; the macro does
 * not check the surface's pixel size itself. */
#define SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
24+
/* initialize surface data */ \
25+
int width = rect->w, height = rect->h; \
26+
int skip = surface->pitch / 4 - width; \
27+
/* indicates the number of pixels that can't be processed in 4-pixel \
28+
* blocks */ \
29+
int pxl_excess = width % 4; \
30+
/* indicates the number of 4-pixel blocks that can be processed */ \
31+
int n_iters_4 = width / 4; \
32+
int i, j; \
33+
/* load pixel data */ \
34+
Uint32 *pixels = \
35+
(Uint32 *)surface->pixels + rect->y * (surface->pitch / 4) + rect->x; \
36+
\
37+
__m128i mm128_dst; \
38+
/* prep and load the color */ \
39+
Uint32 amask = surface->format->Amask; \
40+
if (amask) { \
41+
{ \
42+
COLOR_PROCESS_CODE \
43+
} \
44+
} \
45+
__m128i mm128_color = _mm_set1_epi32(color);
46+
47+
/* RUN_SSE2_FILLER(FILL_CODE): row-by-row driver for one fill operation.
 * Uses the locals declared by SETUP_SSE2_FILLER (height, n_iters_4,
 * pxl_excess, skip, i, j, pixels, mm128_dst). FILL_CODE is expanded twice
 * per row pass and must transform mm128_dst in place. */
#define RUN_SSE2_FILLER(FILL_CODE) \
    while (height--) { \
        /* whole groups of four pixels: unaligned 128-bit load, apply the \
         * operation, unaligned 128-bit store */ \
        for (i = 0; i < n_iters_4; i++, pixels += 4) { \
            mm128_dst = _mm_loadu_si128((__m128i *)pixels); \
            {FILL_CODE} \
            _mm_storeu_si128((__m128i *)pixels, mm128_dst); \
        } \
        /* leftover pixels of the row go through the low 32-bit lane one at \
         * a time; the loop body never runs when there is no excess */ \
        for (j = 0; j < pxl_excess; j++) { \
            mm128_dst = _mm_cvtsi32_si128(*pixels); \
            {FILL_CODE} \
            *pixels = _mm_cvtsi128_si32(mm128_dst); \
            pixels++; \
        } \
        /* step over the part of the surface row outside the rect */ \
        pixels += skip; \
    }
72+
73+
/* SETUP_SHUFFLE: declarations required by RUN_16BIT_SHUFFLE_OUT. It reads
 * mm128_color, so it has to come after SETUP_SSE2_FILLER. The broadcast
 * color is widened from 8-bit to 16-bit channels by interleaving it with
 * zero bytes: mm128_colorA gets the low 8 bytes, mm128_colorB the high 8
 * bytes. shuff_dst and _shuff16_temp are declared here as scratch registers
 * for the 16-bit math. */
74+
#define SETUP_SHUFFLE \
75+
__m128i shuff_dst, _shuff16_temp, mm128_colorA, mm128_colorB; \
76+
mm128_colorA = _mm_unpacklo_epi8(mm128_color, _mm_setzero_si128()); \
77+
mm128_colorB = _mm_unpackhi_epi8(mm128_color, _mm_setzero_si128());
78+
79+
/* RUN_16BIT_SHUFFLE_OUT(FILL_CODE): runs FILL_CODE on 16-bit channels.
 * The 8-bit channels held in mm128_dst are zero-extended in two halves (low
 * 8 bytes first, then high 8 bytes) into shuff_dst. FILL_CODE is expanded
 * once per half and is expected to update shuff_dst; before each expansion
 * mm128_color is set to the matching widened color half (mm128_colorA, then
 * mm128_colorB). The first result is parked in _shuff16_temp, and both
 * halves are finally packed back to 8 bits with unsigned saturation into
 * mm128_dst.
 * Needs the variables declared by SETUP_SHUFFLE. Note that mm128_color is
 * overwritten on every use. */
#define RUN_16BIT_SHUFFLE_OUT(FILL_CODE) \
80+
/* ==== shuffle pixels out into two registers each, src */ \
81+
/* and dst set up for 16 bit math, like 0A0R0G0B ==== */ \
82+
shuff_dst = _mm_unpacklo_epi8(mm128_dst, _mm_setzero_si128()); \
83+
mm128_color = mm128_colorA; \
84+
\
85+
{FILL_CODE} \
86+
\
87+
_shuff16_temp = shuff_dst; \
88+
\
89+
shuff_dst = _mm_unpackhi_epi8(mm128_dst, _mm_setzero_si128()); \
90+
mm128_color = mm128_colorB; \
91+
\
92+
{FILL_CODE} \
93+
\
94+
/* ==== recombine A and B pixels ==== */ \
95+
mm128_dst = _mm_packus_epi16(_shuff16_temp, shuff_dst);
96+
97+
/* FILLERS(NAME, COLOR_PROCESS_CODE, FILL_CODE): defines the function pair
 * surface_fill_blend_NAME_sse2 and surface_fill_blend_rgba_NAME_sse2.
 * Both take (SDL_Surface *surface, SDL_Rect *rect, Uint32 color), apply
 * FILL_CODE to the pixels of rect through RUN_SSE2_FILLER, and return 0.
 * The plain variant passes COLOR_PROCESS_CODE to SETUP_SSE2_FILLER, where it
 * runs on `color` only if the surface format has an alpha mask; the rgba
 * variant passes an empty block, so the color is used unchanged. */
#define FILLERS(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
98+
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
99+
SDL_Rect *rect, Uint32 color) \
100+
{ \
101+
SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
102+
RUN_SSE2_FILLER(FILL_CODE) \
103+
return 0; \
104+
} \
105+
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
106+
SDL_Rect *rect, Uint32 color) \
107+
{ \
108+
SETUP_SSE2_FILLER({}) \
109+
RUN_SSE2_FILLER(FILL_CODE) \
110+
return 0; \
111+
}
112+
113+
/* FILLERS_SHUFF(NAME, COLOR_PROCESS_CODE, FILL_CODE): same function pair as
 * FILLERS (surface_fill_blend_NAME_sse2 and
 * surface_fill_blend_rgba_NAME_sse2, both returning 0), but for operations
 * that need 16-bit intermediate precision. SETUP_SHUFFLE is expanded after
 * the common setup, and FILL_CODE is wrapped in RUN_16BIT_SHUFFLE_OUT so it
 * operates on shuff_dst rather than directly on mm128_dst. */
#define FILLERS_SHUFF(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
114+
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
115+
SDL_Rect *rect, Uint32 color) \
116+
{ \
117+
SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
118+
SETUP_SHUFFLE \
119+
RUN_SSE2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
120+
return 0; \
121+
} \
122+
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
123+
SDL_Rect *rect, Uint32 color) \
124+
{ \
125+
SETUP_SSE2_FILLER({}) \
126+
SETUP_SHUFFLE \
127+
RUN_SSE2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
128+
return 0; \
129+
}
130+
131+
/* INVALID_DEFS(NAME): stub versions of surface_fill_blend_NAME_sse2 and
 * surface_fill_blend_rgba_NAME_sse2, keeping the same signatures as the real
 * fillers. Each stub expands BAD_SSE2_FUNCTION_CALL, which prints a
 * diagnostic and calls PG_EXIT(1); the `return -1` after it is only reached
 * if PG_EXIT returns. */
#define INVALID_DEFS(NAME) \
132+
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
133+
SDL_Rect *rect, Uint32 color) \
134+
{ \
135+
BAD_SSE2_FUNCTION_CALL; \
136+
return -1; \
137+
} \
138+
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
139+
SDL_Rect *rect, Uint32 color) \
140+
{ \
141+
BAD_SSE2_FUNCTION_CALL; \
142+
return -1; \
143+
}
144+
145+
/* Per-pixel operations passed to the filler generators as FILL_CODE.
 * ADD_CODE / SUB_CODE: per-byte unsigned saturating add / subtract of
 *   mm128_color to / from mm128_dst.
 * MIN_CODE / MAX_CODE: per-byte unsigned minimum / maximum of mm128_dst and
 *   mm128_color.
 * MULT_CODE: operates on shuff_dst (16-bit lanes) instead of mm128_dst and
 *   computes (shuff_dst * mm128_color + 255) >> 8 per lane, so it is meant
 *   to be used through RUN_16BIT_SHUFFLE_OUT. */
#define ADD_CODE mm128_dst = _mm_adds_epu8(mm128_dst, mm128_color);
146+
#define SUB_CODE mm128_dst = _mm_subs_epu8(mm128_dst, mm128_color);
147+
#define MIN_CODE mm128_dst = _mm_min_epu8(mm128_dst, mm128_color);
148+
#define MAX_CODE mm128_dst = _mm_max_epu8(mm128_dst, mm128_color);
149+
#define MULT_CODE \
150+
{ \
151+
shuff_dst = _mm_mullo_epi16(shuff_dst, mm128_color); \
152+
shuff_dst = _mm_adds_epu16(shuff_dst, _mm_set1_epi16(255)); \
153+
shuff_dst = _mm_srli_epi16(shuff_dst, 8); \
154+
}
155+
156+
#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
157+
FILLERS(add, color &= ~amask;, ADD_CODE)
158+
FILLERS(sub, color &= ~amask;, SUB_CODE)
159+
FILLERS(min, color |= amask;, MIN_CODE)
160+
FILLERS(max, color &= ~amask;, MAX_CODE)
161+
FILLERS_SHUFF(mult, color |= amask;, MULT_CODE)
162+
#else
163+
INVALID_DEFS(add)
164+
INVALID_DEFS(sub)
165+
INVALID_DEFS(min)
166+
INVALID_DEFS(max)
167+
INVALID_DEFS(mult)
168+
#endif /* defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) */

0 commit comments

Comments
 (0)