Skip to content

Commit c43140e

Browse files
committed
Upgrade alpha blitter from 2px to 4px batches
1 parent 34c0b6d commit c43140e

File tree

1 file changed

+84
-26
lines changed

1 file changed

+84
-26
lines changed

src_c/simd_blitters_sse2.c

Lines changed: 84 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -482,55 +482,105 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
482482
Uint32 *srcp32 = (Uint32 *)info->s_pixels;
483483
Uint32 *dstp32 = (Uint32 *)info->d_pixels;
484484

485-
int pre_2_width = width % 2;
486-
int post_2_width = width / 2;
485+
int pre_4_width = width % 4;
486+
int post_4_width = width / 4;
487487

488488
__m128i src1, dst1, sub_dst, mm_src_alpha;
489+
__m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst;
490+
__m128i *srcp128, *dstp128;
489491
__m128i mm_rgb_mask = _mm_set1_epi32(0x00FFFFFF);
490492
__m128i mm_zero = _mm_setzero_si128();
491493

492494
while (height--) {
495+
srcp128 = (__m128i *)srcp32;
496+
dstp128 = (__m128i *)dstp32;
497+
493498
LOOP_UNROLLED4(
494499
{
495-
/* src(ARGB) -> src1 (00000000ARGBARGB) */
496-
LOAD_64_INTO_M128((Uint64 *)srcp32, &src1);
500+
/*
501+
* 4 pixel preparations
502+
*/
503+
504+
/* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */
505+
pixels_src = _mm_loadu_si128(srcp128);
497506

498507
/* isolate alpha channels
499-
* 00000000A1000A2000 -> mm_src_alpha */
500-
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1);
508+
* A1000A2000A3000A4000 -> mm_src_alpha */
509+
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);
501510

502511
/* shift right to position alpha channels for manipulation
503-
* 000000000A1000A200 -> mm_src_alpha*/
512+
* 0A1000A2000A3000A400 -> mm_src_alpha*/
504513
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);
505514

515+
/* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */
516+
pixels_dst = _mm_loadu_si128(dstp128);
517+
518+
/*
519+
* BATCH A (the 2 low pixels)
520+
*/
521+
506522
/* shuffle alpha channels to duplicate 16 bit pairs
507523
* shuffle (3, 3, 1, 1) (packed 2 bit numbers)
508-
* [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha
509-
* [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ]
510-
* Therefore the previous contents of 16 bit number #1
511-
* Goes into 16 bit number #1 and #2, and the previous
512-
* content of 16 bit number #3 goes into #2 and #3 */
513-
mm_src_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
524+
* [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> unpacked_alpha
525+
* [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ]
526+
* Therefore the previous contents of 16 bit lane 1
527+
* goes into 16 bit lanes 0 and 1, and the previous
528+
* content of 16 bit lane 3 goes into lanes 2 and 3*/
529+
unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
514530

515531
/* finally move into final config
516532
* spread out so they can be multiplied in 16 bit math
517533
* against all RGBA of both pixels being blit
518-
* 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */
519-
mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
534+
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha
535+
*/
536+
unpacked_alpha =
537+
_mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);
538+
539+
/* 0A0R0G0B0A0R0G0B -> src1 */
540+
src1 = _mm_unpacklo_epi8(pixels_src, mm_zero);
541+
542+
/* 0A0R0G0B0A0R0G0B -> dst1 */
543+
dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero);
544+
545+
/* (srcRGB - dstRGB) */
546+
sub_dst = _mm_sub_epi16(src1, dst1);
547+
548+
/* (srcRGB - dstRGB) * srcA */
549+
sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha);
550+
551+
/* (srcRGB - dstRGB) * srcA + srcRGB */
552+
sub_dst = _mm_add_epi16(sub_dst, src1);
553+
554+
/* (dstRGB << 8) */
555+
dst1 = _mm_slli_epi16(dst1, 8);
556+
557+
/* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
558+
sub_dst = _mm_add_epi16(sub_dst, dst1);
559+
560+
/* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
561+
* 8)*/
562+
batch_a_dst = _mm_srli_epi16(sub_dst, 8);
563+
564+
/*
565+
* BATCH B (the 2 high pixels)
566+
*/
567+
568+
unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101);
569+
570+
unpacked_alpha =
571+
_mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);
520572

521573
/* 0A0R0G0B0A0R0G0B -> src1 */
522-
src1 = _mm_unpacklo_epi8(src1, mm_zero);
574+
src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);
523575

524-
/* dst(ARGB) -> dst1 (00000000ARGBARGB) */
525-
LOAD_64_INTO_M128((Uint64 *)dstp32, &dst1);
526576
/* 0A0R0G0B0A0R0G0B -> dst1 */
527-
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
577+
dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);
528578

529579
/* (srcRGB - dstRGB) */
530580
sub_dst = _mm_sub_epi16(src1, dst1);
531581

532582
/* (srcRGB - dstRGB) * srcA */
533-
sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha);
583+
sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha);
534584

535585
/* (srcRGB - dstRGB) * srcA + srcRGB */
536586
sub_dst = _mm_add_epi16(sub_dst, src1);
@@ -545,18 +595,26 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
545595
* 8)*/
546596
sub_dst = _mm_srli_epi16(sub_dst, 8);
547597

598+
/*
599+
* Combine the batches and store
600+
*/
601+
548602
/* pack everything back into a pixel with zeroed out alpha
549603
*/
550-
sub_dst = _mm_packus_epi16(sub_dst, mm_zero);
604+
sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
551605
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
552-
STORE_M128_INTO_64(&sub_dst, (Uint64 *)dstp32);
553606

554-
srcp32 += 2;
555-
dstp32 += 2;
607+
_mm_storeu_si128(dstp128, sub_dst);
608+
609+
srcp128++;
610+
dstp128++;
556611
},
557-
n, post_2_width);
612+
n, post_4_width);
613+
614+
srcp32 = (Uint32 *)srcp128;
615+
dstp32 = (Uint32 *)dstp128;
558616

559-
for (int i = 0; i < pre_2_width; i++) {
617+
for (int i = 0; i < pre_4_width; i++) {
560618
/* Do the actual blend */
561619
/* src(ARGB) -> src1 (000000000000ARGB) */
562620
src1 = _mm_cvtsi32_si128(*srcp32);

0 commit comments

Comments
 (0)