@@ -84,38 +84,111 @@ _pg_has_avx2()
8484 pixels += skip ; \
8585 }
8686
87+ /* Setup for RUN_16BIT_SHUFFLE_OUT */
88+ #define SETUP_SHUFFLE \
89+ __m256i shuff_out_A = \
90+ _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, \
91+ 0x80, 18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, \
92+ 5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0); \
93+ \
94+ __m256i shuff_out_B = _mm256_set_epi8( \
95+ 0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80, 26, 0x80, 25, \
96+ 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10, \
97+ 0x80, 9, 0x80, 8); \
98+ \
99+ __m256i shuff_dst, _shuff16_temp, mm256_colorA, mm256_colorB; \
100+ mm256_colorA = _mm256_shuffle_epi8(mm256_color, shuff_out_A); \
101+ mm256_colorB = _mm256_shuffle_epi8(mm256_color, shuff_out_B);
102+
103+ #define RUN_16BIT_SHUFFLE_OUT (FILL_CODE ) \
104+ /* ==== shuffle pixels out into two registers each, src */ \
105+ /* and dst set up for 16 bit math, like 0A0R0G0B ==== */ \
106+ shuff_dst = _mm256_shuffle_epi8(mm256_dst, shuff_out_A); \
107+ mm256_color = mm256_colorA; \
108+ \
109+ {FILL_CODE} \
110+ \
111+ _shuff16_temp = shuff_dst; \
112+ \
113+ shuff_dst = _mm256_shuffle_epi8(mm256_dst, shuff_out_B); \
114+ mm256_color = mm256_colorB; \
115+ \
116+ {FILL_CODE} \
117+ \
118+ /* ==== recombine A and B pixels ==== */ \
119+ mm256_dst = _mm256_packus_epi16 (_shuff16_temp , shuff_dst );
120+
121+ #define FILLERS (NAME , COLOR_PROCESS_CODE , FILL_CODE ) \
122+ int surface_fill_blend_##NAME##_avx2(SDL_Surface *surface, \
123+ SDL_Rect *rect, Uint32 color) \
124+ { \
125+ SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \
126+ RUN_AVX2_FILLER(FILL_CODE) \
127+ return 0; \
128+ } \
129+ int surface_fill_blend_rgba_##NAME##_avx2(SDL_Surface *surface, \
130+ SDL_Rect *rect, Uint32 color) \
131+ { \
132+ SETUP_AVX2_FILLER({}) \
133+ RUN_AVX2_FILLER(FILL_CODE) \
134+ return 0; \
135+ }
136+
137+ #define FILLERS_SHUFF (NAME , COLOR_PROCESS_CODE , FILL_CODE ) \
138+ int surface_fill_blend_##NAME##_avx2(SDL_Surface *surface, \
139+ SDL_Rect *rect, Uint32 color) \
140+ { \
141+ SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \
142+ SETUP_SHUFFLE \
143+ RUN_AVX2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
144+ return 0; \
145+ } \
146+ int surface_fill_blend_rgba_##NAME##_avx2(SDL_Surface *surface, \
147+ SDL_Rect *rect, Uint32 color) \
148+ { \
149+ SETUP_AVX2_FILLER({}) \
150+ SETUP_SHUFFLE \
151+ RUN_AVX2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
152+ return 0; \
153+ }
154+
155+ #define INVALID_DEFS (NAME ) \
156+ int surface_fill_blend_##NAME##_avx2(SDL_Surface *surface, \
157+ SDL_Rect *rect, Uint32 color) \
158+ { \
159+ BAD_AVX2_FUNCTION_CALL; \
160+ return -1; \
161+ } \
162+ int surface_fill_blend_rgba_##NAME##_avx2(SDL_Surface *surface, \
163+ SDL_Rect *rect, Uint32 color) \
164+ { \
165+ BAD_AVX2_FUNCTION_CALL; \
166+ return -1; \
167+ }
168+
169+ #define ADD_CODE mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color);
170+ #define SUB_CODE mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_color);
171+ #define MIN_CODE mm256_dst = _mm256_min_epu8(mm256_dst, mm256_color);
172+ #define MAX_CODE mm256_dst = _mm256_max_epu8(mm256_dst, mm256_color);
173+ #define MULT_CODE \
174+ { \
175+ shuff_dst = _mm256_mullo_epi16(shuff_dst, mm256_color); \
176+ shuff_dst = _mm256_adds_epu16(shuff_dst, _mm256_set1_epi16(255)); \
177+ shuff_dst = _mm256_srli_epi16(shuff_dst, 8); \
178+ }
179+
87180#if defined(__AVX2__ ) && defined(HAVE_IMMINTRIN_H ) && \
88181 !defined(SDL_DISABLE_IMMINTRIN_H )
89- int
90- surface_fill_blend_add_avx2 (SDL_Surface * surface , SDL_Rect * rect , Uint32 color )
91- {
92- SETUP_AVX2_FILLER ({ color &= ~amask ; })
93- RUN_AVX2_FILLER ({ mm256_dst = _mm256_adds_epu8 (mm256_dst , mm256_color ); });
94- return 0 ;
95- }
96-
97- int
98- surface_fill_blend_rgba_add_avx2 (SDL_Surface * surface , SDL_Rect * rect ,
99- Uint32 color )
100- {
101- SETUP_AVX2_FILLER ({})
102- RUN_AVX2_FILLER ({ mm256_dst = _mm256_adds_epu8 (mm256_dst , mm256_color ); });
103- return 0 ;
104- }
182+ FILLERS (add , color &= ~amask ;, ADD_CODE )
183+ FILLERS (sub , color &= ~amask ;, SUB_CODE )
184+ FILLERS (min , color |= amask ;, MIN_CODE )
185+ FILLERS (max , color &= ~amask ;, MAX_CODE )
186+ FILLERS_SHUFF (mult , color |= amask ;, MULT_CODE )
105187#else
106- int
107- surface_fill_blend_add_avx2 (SDL_Surface * surface , SDL_Rect * rect , Uint32 color )
108- {
109- BAD_AVX2_FUNCTION_CALL ;
110- return -1 ;
111- }
112-
113- int
114- surface_fill_blend_rgba_add_avx2 (SDL_Surface * surface , SDL_Rect * rect ,
115- Uint32 color )
116- {
117- BAD_AVX2_FUNCTION_CALL ;
118- return -1 ;
119- }
188+ INVALID_DEFS (add )
189+ INVALID_DEFS (sub )
190+ INVALID_DEFS (min )
191+ INVALID_DEFS (max )
192+ INVALID_DEFS (mult )
120193#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
121194 !defined(SDL_DISABLE_IMMINTRIN_H) */
0 commit comments