Skip to content

Commit 1f1ebf0

Browse files
committed
Comments pass for alpha_opaque_dst blitter
1 parent 8ee4f9b commit 1f1ebf0

File tree

1 file changed

+31
-43
lines changed

1 file changed

+31
-43
lines changed

src_c/simd_blitters_sse2.c

Lines changed: 31 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -503,8 +503,8 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
503503
Uint32 *srcp32 = (Uint32 *)info->s_pixels;
504504
Uint32 *dstp32 = (Uint32 *)info->d_pixels;
505505

506-
int pre_4_width = width % 4;
507-
int post_4_width = width / 4;
506+
int pxl_excess = width % 4;
507+
int n_iters_4 = width / 4;
508508

509509
__m128i src1, dst1, sub_dst, mm_src_alpha;
510510
__m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst;
@@ -518,42 +518,30 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
518518

519519
LOOP_UNROLLED4(
520520
{
521-
/*
522-
* 4 pixel preparations
523-
*/
521+
/* ==== load 4 pixels into SSE registers ==== */
524522

525-
/* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */
523+
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
526524
pixels_src = _mm_loadu_si128(srcp128);
527525

528526
/* isolate alpha channels
529-
* A1000A2000A3000A4000 -> mm_src_alpha */
527+
* [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
530528
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);
531529

532530
/* shift right to position alpha channels for manipulation
533-
* 0A1000A2000A3000A400 -> mm_src_alpha*/
531+
* [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
534532
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);
535533

536-
/* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */
534+
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
537535
pixels_dst = _mm_loadu_si128(dstp128);
538536

539-
/*
540-
* BATCH A (the 2 low pixels)
541-
*/
537+
/* ==== BATCH A (the 2 low pixels) ==== */
542538

543539
/* shuffle alpha channels to duplicate 16 bit pairs
544-
* shuffle (3, 3, 1, 1) (packed 2 bit numbers)
545-
* [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha
546-
* [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ]
547-
* Therefore the previous contents of 16 bit lane 1
548-
* Goes into 16 bit lanes 0 and 1, and the previous
549-
* content of 16 bit lane 3 goes into lanes 2 and 3*/
540+
* [0A1][00 ][0A2][00 ][0A3][0A3][0A4][0A4] -> unpacked_alpha*/
550541
unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
551542

552-
/* finally move into final config
553-
* spread out so they can be multiplied in 16 bit math
554-
* against all RGBA of both pixels being blit
555-
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha
556-
*/
543+
/* spread alpha into final config for 16 bit math
544+
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
557545
unpacked_alpha =
558546
_mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);
559547

@@ -567,58 +555,58 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
567555

568556
batch_a_dst = sub_dst;
569557

570-
/*
571-
* BATCH B (the 2 high pixels)
572-
*/
558+
/* ==== BATCH B (the 2 high pixels) ==== */
573559

560+
/*[0A1][0A1][0A2][0A2][0A3][00 ][0A4][00 ] -> unpacked_alpha*/
574561
unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101);
575562

563+
/*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
576564
unpacked_alpha =
577565
_mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);
578566

579-
/* 0A0R0G0B0A0R0G0B -> src1 */
567+
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
580568
src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);
581569

582-
/* 0A0R0G0B0A0R0G0B -> dst1 */
570+
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
583571
dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);
584572

585573
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
586574

587-
/*
588-
* Combine the batches and store
589-
* pack everything back into a pixel with zeroed out alpha
590-
*/
575+
/* ==== combine batches and store ==== */
576+
591577
sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
578+
/* zero out alpha */
592579
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
593580
_mm_storeu_si128(dstp128, sub_dst);
594581

595582
srcp128++;
596583
dstp128++;
597584
},
598-
n, post_4_width);
585+
n, n_iters_4);
599586

600587
srcp32 = (Uint32 *)srcp128;
601588
dstp32 = (Uint32 *)dstp128;
602589

603-
for (int i = 0; i < pre_4_width; i++) {
604-
/* Do the actual blend */
605-
/* src(ARGB) -> src1 (000000000000ARGB) */
590+
for (int i = 0; i < pxl_excess; i++) {
591+
/*[00][00][00][00][00][00][AR][GB] -> src1*/
606592
src1 = _mm_cvtsi32_si128(*srcp32);
607-
/* src1 >> ashift -> mm_src_alpha(000000000000000A) */
593+
594+
/*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/
608595
mm_src_alpha = _mm_srli_si128(src1, 3);
609596

610-
/* Then Calc RGB */
611-
/* 0000000000000A0A -> rgb_src_alpha */
597+
/*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/
612598
mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
613-
/* 000000000A0A0A0A -> rgb_src_alpha */
599+
600+
/*[00][00][00][00][0A][0A][0A][0A] -> unpacked_alpha*/
614601
unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha);
615602

616-
/* 000000000A0R0G0B -> src1 */
603+
/*[00][00][00][00][0A][0R][0G][0B] -> src1*/
617604
src1 = _mm_unpacklo_epi8(src1, mm_zero);
618605

619-
/* dst(ARGB) -> dst1 (000000000000ARGB) */
606+
/*[00][00][00][00][00][00][AR][GB] -> dst1*/
620607
dst1 = _mm_cvtsi32_si128(*dstp32);
621-
/* 000000000A0R0G0B -> dst1 */
608+
609+
/*[00][00][00][00][0A][0R][0G][0B] -> dst1*/
622610
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
623611

624612
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE

0 commit comments

Comments
 (0)