Skip to content

Commit 68685f4

Browse files
authored
Merge pull request #2565 from itzpr3d4t0r/add-missing-avx-fillers
Added missing AVX2 fillers
2 parents 6fed43d + 8f3a26e commit 68685f4

File tree

3 files changed

+195
-30
lines changed

3 files changed

+195
-30
lines changed

src_c/simd_fill.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,27 @@ surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
1111
int
1212
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
1313
Uint32 color);
14+
int
15+
surface_fill_blend_sub_avx2(SDL_Surface *surface, SDL_Rect *rect,
16+
Uint32 color);
17+
int
18+
surface_fill_blend_rgba_sub_avx2(SDL_Surface *surface, SDL_Rect *rect,
19+
Uint32 color);
20+
int
21+
surface_fill_blend_mult_avx2(SDL_Surface *surface, SDL_Rect *rect,
22+
Uint32 color);
23+
int
24+
surface_fill_blend_rgba_mult_avx2(SDL_Surface *surface, SDL_Rect *rect,
25+
Uint32 color);
26+
int
27+
surface_fill_blend_min_avx2(SDL_Surface *surface, SDL_Rect *rect,
28+
Uint32 color);
29+
int
30+
surface_fill_blend_rgba_min_avx2(SDL_Surface *surface, SDL_Rect *rect,
31+
Uint32 color);
32+
int
33+
surface_fill_blend_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
34+
Uint32 color);
35+
int
36+
surface_fill_blend_rgba_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
37+
Uint32 color);

src_c/simd_surface_fill_avx2.c

Lines changed: 103 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -84,38 +84,111 @@ _pg_has_avx2()
8484
pixels += skip; \
8585
}
8686

87+
/* Setup for RUN_16BIT_SHUFFLE_OUT */
88+
#define SETUP_SHUFFLE \
89+
__m256i shuff_out_A = \
90+
_mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, \
91+
0x80, 18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, \
92+
5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0); \
93+
\
94+
__m256i shuff_out_B = _mm256_set_epi8( \
95+
0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80, 26, 0x80, 25, \
96+
0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10, \
97+
0x80, 9, 0x80, 8); \
98+
\
99+
__m256i shuff_dst, _shuff16_temp, mm256_colorA, mm256_colorB; \
100+
mm256_colorA = _mm256_shuffle_epi8(mm256_color, shuff_out_A); \
101+
mm256_colorB = _mm256_shuffle_epi8(mm256_color, shuff_out_B);
102+
103+
#define RUN_16BIT_SHUFFLE_OUT(FILL_CODE) \
104+
/* ==== shuffle pixels out into two registers each, src */ \
105+
/* and dst set up for 16 bit math, like 0A0R0G0B ==== */ \
106+
shuff_dst = _mm256_shuffle_epi8(mm256_dst, shuff_out_A); \
107+
mm256_color = mm256_colorA; \
108+
\
109+
{FILL_CODE} \
110+
\
111+
_shuff16_temp = shuff_dst; \
112+
\
113+
shuff_dst = _mm256_shuffle_epi8(mm256_dst, shuff_out_B); \
114+
mm256_color = mm256_colorB; \
115+
\
116+
{FILL_CODE} \
117+
\
118+
/* ==== recombine A and B pixels ==== */ \
119+
mm256_dst = _mm256_packus_epi16(_shuff16_temp, shuff_dst);
120+
121+
#define FILLERS(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
122+
int surface_fill_blend_##NAME##_avx2(SDL_Surface *surface, \
123+
SDL_Rect *rect, Uint32 color) \
124+
{ \
125+
SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \
126+
RUN_AVX2_FILLER(FILL_CODE) \
127+
return 0; \
128+
} \
129+
int surface_fill_blend_rgba_##NAME##_avx2(SDL_Surface *surface, \
130+
SDL_Rect *rect, Uint32 color) \
131+
{ \
132+
SETUP_AVX2_FILLER({}) \
133+
RUN_AVX2_FILLER(FILL_CODE) \
134+
return 0; \
135+
}
136+
137+
#define FILLERS_SHUFF(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
138+
int surface_fill_blend_##NAME##_avx2(SDL_Surface *surface, \
139+
SDL_Rect *rect, Uint32 color) \
140+
{ \
141+
SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \
142+
SETUP_SHUFFLE \
143+
RUN_AVX2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
144+
return 0; \
145+
} \
146+
int surface_fill_blend_rgba_##NAME##_avx2(SDL_Surface *surface, \
147+
SDL_Rect *rect, Uint32 color) \
148+
{ \
149+
SETUP_AVX2_FILLER({}) \
150+
SETUP_SHUFFLE \
151+
RUN_AVX2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
152+
return 0; \
153+
}
154+
155+
#define INVALID_DEFS(NAME) \
156+
int surface_fill_blend_##NAME##_avx2(SDL_Surface *surface, \
157+
SDL_Rect *rect, Uint32 color) \
158+
{ \
159+
BAD_AVX2_FUNCTION_CALL; \
160+
return -1; \
161+
} \
162+
int surface_fill_blend_rgba_##NAME##_avx2(SDL_Surface *surface, \
163+
SDL_Rect *rect, Uint32 color) \
164+
{ \
165+
BAD_AVX2_FUNCTION_CALL; \
166+
return -1; \
167+
}
168+
169+
#define ADD_CODE mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color);
170+
#define SUB_CODE mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_color);
171+
#define MIN_CODE mm256_dst = _mm256_min_epu8(mm256_dst, mm256_color);
172+
#define MAX_CODE mm256_dst = _mm256_max_epu8(mm256_dst, mm256_color);
173+
#define MULT_CODE \
174+
{ \
175+
shuff_dst = _mm256_mullo_epi16(shuff_dst, mm256_color); \
176+
shuff_dst = _mm256_adds_epu16(shuff_dst, _mm256_set1_epi16(255)); \
177+
shuff_dst = _mm256_srli_epi16(shuff_dst, 8); \
178+
}
179+
87180
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
88181
!defined(SDL_DISABLE_IMMINTRIN_H)
89-
int
90-
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
91-
{
92-
SETUP_AVX2_FILLER({ color &= ~amask; })
93-
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
94-
return 0;
95-
}
96-
97-
int
98-
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
99-
Uint32 color)
100-
{
101-
SETUP_AVX2_FILLER({})
102-
RUN_AVX2_FILLER({ mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); });
103-
return 0;
104-
}
182+
FILLERS(add, color &= ~amask;, ADD_CODE)
183+
FILLERS(sub, color &= ~amask;, SUB_CODE)
184+
FILLERS(min, color |= amask;, MIN_CODE)
185+
FILLERS(max, color &= ~amask;, MAX_CODE)
186+
FILLERS_SHUFF(mult, color |= amask;, MULT_CODE)
105187
#else
106-
int
107-
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
108-
{
109-
BAD_AVX2_FUNCTION_CALL;
110-
return -1;
111-
}
112-
113-
int
114-
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
115-
Uint32 color)
116-
{
117-
BAD_AVX2_FUNCTION_CALL;
118-
return -1;
119-
}
188+
INVALID_DEFS(add)
189+
INVALID_DEFS(sub)
190+
INVALID_DEFS(min)
191+
INVALID_DEFS(max)
192+
INVALID_DEFS(mult)
120193
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
121194
!defined(SDL_DISABLE_IMMINTRIN_H) */

src_c/surface_fill.c

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,18 +879,50 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,
879879
break;
880880
}
881881
case PYGAME_BLEND_SUB: {
882+
#if !defined(__EMSCRIPTEN__)
883+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
884+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
885+
result = surface_fill_blend_sub_avx2(surface, rect, color);
886+
break;
887+
}
888+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
889+
#endif /* __EMSCRIPTEN__ */
882890
result = surface_fill_blend_sub(surface, rect, color);
883891
break;
884892
}
885893
case PYGAME_BLEND_MULT: {
894+
#if !defined(__EMSCRIPTEN__)
895+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
896+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
897+
result = surface_fill_blend_mult_avx2(surface, rect, color);
898+
break;
899+
}
900+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
901+
#endif /* __EMSCRIPTEN__ */
886902
result = surface_fill_blend_mult(surface, rect, color);
887903
break;
888904
}
889905
case PYGAME_BLEND_MIN: {
906+
#if !defined(__EMSCRIPTEN__)
907+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
908+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
909+
result = surface_fill_blend_min_avx2(surface, rect, color);
910+
break;
911+
}
912+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
913+
#endif /* __EMSCRIPTEN__ */
890914
result = surface_fill_blend_min(surface, rect, color);
891915
break;
892916
}
893917
case PYGAME_BLEND_MAX: {
918+
#if !defined(__EMSCRIPTEN__)
919+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
920+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
921+
result = surface_fill_blend_max_avx2(surface, rect, color);
922+
break;
923+
}
924+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
925+
#endif /* __EMSCRIPTEN__ */
894926
result = surface_fill_blend_max(surface, rect, color);
895927
break;
896928
}
@@ -909,18 +941,54 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,
909941
break;
910942
}
911943
case PYGAME_BLEND_RGBA_SUB: {
944+
#if !defined(__EMSCRIPTEN__)
945+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
946+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
947+
result =
948+
surface_fill_blend_rgba_sub_avx2(surface, rect, color);
949+
break;
950+
}
951+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
952+
#endif /* __EMSCRIPTEN__ */
912953
result = surface_fill_blend_rgba_sub(surface, rect, color);
913954
break;
914955
}
915956
case PYGAME_BLEND_RGBA_MULT: {
957+
#if !defined(__EMSCRIPTEN__)
958+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
959+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
960+
result =
961+
surface_fill_blend_rgba_mult_avx2(surface, rect, color);
962+
break;
963+
}
964+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
965+
#endif /* __EMSCRIPTEN__ */
916966
result = surface_fill_blend_rgba_mult(surface, rect, color);
917967
break;
918968
}
919969
case PYGAME_BLEND_RGBA_MIN: {
970+
#if !defined(__EMSCRIPTEN__)
971+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
972+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
973+
result =
974+
surface_fill_blend_rgba_min_avx2(surface, rect, color);
975+
break;
976+
}
977+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
978+
#endif /* __EMSCRIPTEN__ */
920979
result = surface_fill_blend_rgba_min(surface, rect, color);
921980
break;
922981
}
923982
case PYGAME_BLEND_RGBA_MAX: {
983+
#if !defined(__EMSCRIPTEN__)
984+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
985+
if (surface->format->BytesPerPixel == 4 && _pg_has_avx2()) {
986+
result =
987+
surface_fill_blend_rgba_max_avx2(surface, rect, color);
988+
break;
989+
}
990+
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
991+
#endif /* __EMSCRIPTEN__ */
924992
result = surface_fill_blend_rgba_max(surface, rect, color);
925993
break;
926994
}

0 commit comments

Comments
 (0)