Skip to content

Commit 717d032

Browse files
authored
Merge pull request #2601 from Starbuck5/improve-alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst
Improve performance of SSE2 no_surf_alpha_opaque_dst blitter
2 parents ff166fe + 1f1ebf0 commit 717d032

File tree

1 file changed

+111
-125
lines changed

1 file changed

+111
-125
lines changed

src_c/simd_blitters_sse2.c

Lines changed: 111 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,27 @@ alphablit_alpha_sse2_argb_no_surf_alpha(SDL_BlitInfo *info)
470470
}
471471
}
472472

473+
/* Defines the blit procedure at the core of
474+
* alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst
475+
*
476+
* Input variables: src1, dst1, unpacked_alpha
477+
* containing unpacked 16 bit lanes of src, dst, and src alpha
478+
* Output variables: sub_dst (note: dst1 is overwritten with dstRGB << 8)
479+
* */
480+
#define ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE \
481+
/* (srcRGB - dstRGB) */ \
482+
sub_dst = _mm_sub_epi16(src1, dst1); \
483+
/* (srcRGB - dstRGB) * srcA */ \
484+
sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); \
485+
/* (srcRGB - dstRGB) * srcA + srcRGB */ \
486+
sub_dst = _mm_add_epi16(sub_dst, src1); \
487+
/* (dstRGB << 8) */ \
488+
dst1 = _mm_slli_epi16(dst1, 8); \
489+
/* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ \
490+
sub_dst = _mm_add_epi16(sub_dst, dst1); \
491+
/* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> 8) */ \
492+
sub_dst = _mm_srli_epi16(sub_dst, 8);
493+
473494
void
474495
alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
475496
{
@@ -479,164 +500,129 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
479500
int srcskip = info->s_skip >> 2;
480501
int dstskip = info->d_skip >> 2;
481502

482-
Uint64 *srcp64 = (Uint64 *)info->s_pixels;
483-
Uint64 *dstp64 = (Uint64 *)info->d_pixels;
484-
485-
Uint64 rgb_mask64 = 0x00FFFFFF00FFFFFF;
486-
Uint32 rgb_mask32 = 0x00FFFFFF;
487-
488503
Uint32 *srcp32 = (Uint32 *)info->s_pixels;
489504
Uint32 *dstp32 = (Uint32 *)info->d_pixels;
490505

491-
__m128i src1, dst1, sub_dst, mm_src_alpha, mm_zero, mm_rgb_mask;
506+
int pxl_excess = width % 4;
507+
int n_iters_4 = width / 4;
492508

493-
/* There are two paths through this blitter:
494-
1. Two pixels at once.
495-
2. One pixel at a time.
496-
*/
497-
if (((width % 2) == 0) && ((srcskip % 2) == 0) && ((dstskip % 2) == 0)) {
498-
width = width / 2;
499-
srcskip = srcskip / 2;
500-
dstskip = dstskip / 2;
509+
__m128i src1, dst1, sub_dst, mm_src_alpha;
510+
__m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst;
511+
__m128i *srcp128, *dstp128;
512+
__m128i mm_rgb_mask = _mm_set1_epi32(0x00FFFFFF);
513+
__m128i mm_zero = _mm_setzero_si128();
501514

502-
mm_zero = _mm_setzero_si128();
515+
while (height--) {
516+
srcp128 = (__m128i *)srcp32;
517+
dstp128 = (__m128i *)dstp32;
503518

504-
/* two pixels at a time */
505-
LOAD_64_INTO_M128(&rgb_mask64, &mm_rgb_mask);
506-
while (height--) {
507-
LOOP_UNROLLED4(
508-
{
509-
/* src(ARGB) -> src1 (00000000ARGBARGB) */
510-
LOAD_64_INTO_M128(srcp64, &src1);
519+
LOOP_UNROLLED4(
520+
{
521+
/* ==== load 4 pixels into SSE registers ==== */
511522

512-
/* isolate alpha channels
513-
* 00000000A1000A2000 -> mm_src_alpha */
514-
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1);
523+
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
524+
pixels_src = _mm_loadu_si128(srcp128);
515525

516-
/* shift right to position alpha channels for manipulation
517-
* 000000000A1000A200 -> mm_src_alpha*/
518-
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);
526+
/* isolate alpha channels
527+
* [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
528+
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);
519529

520-
/* shuffle alpha channels to duplicate 16 bit pairs
521-
* shuffle (3, 3, 1, 1) (packed 2 bit numbers)
522-
* [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha
523-
* [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ]
524-
* Therefore the previous contents of 16 bit number #1
525-
* goes into 16 bit numbers #0 and #1, and the previous
526-
* content of 16 bit number #3 goes into #2 and #3 */
527-
mm_src_alpha =
528-
_mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
530+
/* shift right to position alpha channels for manipulation
531+
* [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
532+
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);
529533

530-
/* finally move into final config
531-
* spread out so they can be multiplied in 16 bit math
532-
* against all RGBA of both pixels being blit
533-
* 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */
534-
mm_src_alpha =
535-
_mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
534+
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
535+
pixels_dst = _mm_loadu_si128(dstp128);
536536

537-
/* 0A0R0G0B0A0R0G0B -> src1 */
538-
src1 = _mm_unpacklo_epi8(src1, mm_zero);
537+
/* ==== BATCH A (the 2 low pixels) ==== */
539538

540-
/* dst(ARGB) -> dst1 (00000000ARGBARGB) */
541-
LOAD_64_INTO_M128(dstp64, &dst1);
542-
/* 0A0R0G0B0A0R0G0B -> dst1 */
543-
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
539+
/* shuffle alpha channels to duplicate 16 bit pairs
540+
* [0A1][00 ][0A2][00 ][0A3][0A3][0A4][0A4] -> unpacked_alpha*/
541+
unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
544542

545-
/* (srcRGB - dstRGB) */
546-
sub_dst = _mm_sub_epi16(src1, dst1);
543+
/* spread alpha into final config for 16 bit math
544+
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
545+
unpacked_alpha =
546+
_mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);
547547

548-
/* (srcRGB - dstRGB) * srcA */
549-
sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha);
548+
/* 0A0R0G0B0A0R0G0B -> src1 */
549+
src1 = _mm_unpacklo_epi8(pixels_src, mm_zero);
550550

551-
/* (srcRGB - dstRGB) * srcA + srcRGB */
552-
sub_dst = _mm_add_epi16(sub_dst, src1);
551+
/* 0A0R0G0B0A0R0G0B -> dst1 */
552+
dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero);
553553

554-
/* (dstRGB << 8) */
555-
dst1 = _mm_slli_epi16(dst1, 8);
554+
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
556555

557-
/* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
558-
sub_dst = _mm_add_epi16(sub_dst, dst1);
556+
batch_a_dst = sub_dst;
559557

560-
/* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
561-
* 8)*/
562-
sub_dst = _mm_srli_epi16(sub_dst, 8);
558+
/* ==== BATCH B (the 2 high pixels) ==== */
563559

564-
/* pack everything back into a pixel with zeroed out alpha
565-
*/
566-
sub_dst = _mm_packus_epi16(sub_dst, mm_zero);
567-
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
568-
STORE_M128_INTO_64(&sub_dst, dstp64);
560+
/*[0A1][0A1][0A2][0A2][0A3][00 ][0A4][00 ] -> unpacked_alpha*/
561+
unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101);
569562

570-
++srcp64;
571-
++dstp64;
572-
},
573-
n, width);
574-
srcp64 += srcskip;
575-
dstp64 += dstskip;
576-
}
577-
}
578-
else {
579-
/* one pixel at a time */
580-
mm_zero = _mm_setzero_si128();
581-
mm_rgb_mask = _mm_cvtsi32_si128(rgb_mask32);
563+
/*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
564+
unpacked_alpha =
565+
_mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);
582566

583-
while (height--) {
584-
LOOP_UNROLLED4(
585-
{
586-
/* Do the actual blend */
587-
/* src(ARGB) -> src1 (000000000000ARGB) */
588-
src1 = _mm_cvtsi32_si128(*srcp32);
589-
/* src1 >> ashift -> mm_src_alpha(000000000000000A) */
590-
mm_src_alpha = _mm_srli_si128(src1, 3);
567+
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
568+
src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);
591569

592-
/* Then Calc RGB */
593-
/* 0000000000000A0A -> rgb_src_alpha */
594-
mm_src_alpha =
595-
_mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
596-
/* 000000000A0A0A0A -> rgb_src_alpha */
597-
mm_src_alpha =
598-
_mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha);
570+
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
571+
dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);
599572

600-
/* 000000000A0R0G0B -> src1 */
601-
src1 = _mm_unpacklo_epi8(src1, mm_zero);
573+
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
602574

603-
/* dst(ARGB) -> dst1 (000000000000ARGB) */
604-
dst1 = _mm_cvtsi32_si128(*dstp32);
605-
/* 000000000A0R0G0B -> dst1 */
606-
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
575+
/* ==== combine batches and store ==== */
607576

608-
/* (srcRGB - dstRGB) */
609-
sub_dst = _mm_sub_epi16(src1, dst1);
577+
sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
578+
/* zero out alpha */
579+
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
580+
_mm_storeu_si128(dstp128, sub_dst);
610581

611-
/* (srcRGB - dstRGB) * srcA */
612-
sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha);
582+
srcp128++;
583+
dstp128++;
584+
},
585+
n, n_iters_4);
613586

614-
/* (srcRGB - dstRGB) * srcA + srcRGB */
615-
sub_dst = _mm_add_epi16(sub_dst, src1);
587+
srcp32 = (Uint32 *)srcp128;
588+
dstp32 = (Uint32 *)dstp128;
616589

617-
/* (dstRGB << 8) */
618-
dst1 = _mm_slli_epi16(dst1, 8);
590+
for (int i = 0; i < pxl_excess; i++) {
591+
/*[00][00][00][00][00][00][AR][GB] -> src1*/
592+
src1 = _mm_cvtsi32_si128(*srcp32);
619593

620-
/* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
621-
sub_dst = _mm_add_epi16(sub_dst, dst1);
594+
/*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/
595+
mm_src_alpha = _mm_srli_si128(src1, 3);
622596

623-
/* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
624-
* 8)*/
625-
sub_dst = _mm_srli_epi16(sub_dst, 8);
597+
/*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/
598+
mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
626599

627-
/* pack everything back into a pixel */
628-
sub_dst = _mm_packus_epi16(sub_dst, mm_zero);
629-
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
630-
/* reset alpha to 0 */
631-
*dstp32 = _mm_cvtsi128_si32(sub_dst);
600+
/*[00][00][00][00][0A][0A][0A][0A] -> mm_src_alpha*/
601+
unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha);
632602

633-
++srcp32;
634-
++dstp32;
635-
},
636-
n, width);
637-
srcp32 += srcskip;
638-
dstp32 += dstskip;
603+
/*[00][00][00][00][0A][0R][0G][0B] -> src1*/
604+
src1 = _mm_unpacklo_epi8(src1, mm_zero);
605+
606+
/*[00][00][00][00][00][00][AR][GB] -> dst1*/
607+
dst1 = _mm_cvtsi32_si128(*dstp32);
608+
609+
/*[00][00][00][00][0A][0R][0G][0B] -> dst1*/
610+
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
611+
612+
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
613+
614+
/* pack everything back into a pixel */
615+
sub_dst = _mm_packus_epi16(sub_dst, mm_zero);
616+
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
617+
/* reset alpha to 0 */
618+
*dstp32 = _mm_cvtsi128_si32(sub_dst);
619+
620+
srcp32++;
621+
dstp32++;
639622
}
623+
624+
srcp32 += srcskip;
625+
dstp32 += dstskip;
640626
}
641627
}
642628

0 commit comments

Comments
 (0)