@@ -503,8 +503,8 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
503503 Uint32 * srcp32 = (Uint32 * )info -> s_pixels ;
504504 Uint32 * dstp32 = (Uint32 * )info -> d_pixels ;
505505
506- int pre_4_width = width % 4 ;
507- int post_4_width = width / 4 ;
506+ int pxl_excess = width % 4 ;
507+ int n_iters_4 = width / 4 ;
508508
509509 __m128i src1 , dst1 , sub_dst , mm_src_alpha ;
510510 __m128i unpacked_alpha , pixels_src , pixels_dst , batch_a_dst ;
@@ -518,42 +518,30 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
518518
519519 LOOP_UNROLLED4 (
520520 {
521- /*
522- * 4 pixel preparations
523- */
521+ /* ==== load 4 pixels into SSE registers ==== */
524522
525- /* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */
523+ /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
526524 pixels_src = _mm_loadu_si128 (srcp128 );
527525
528526 /* isolate alpha channels
529- * A1000A2000A3000A4000 -> mm_src_alpha */
527+ * [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
530528 mm_src_alpha = _mm_andnot_si128 (mm_rgb_mask , pixels_src );
531529
532530 /* shift right to position alpha channels for manipulation
533- * 0A1000A2000A3000A400 -> mm_src_alpha*/
531+ * [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
534532 mm_src_alpha = _mm_srli_si128 (mm_src_alpha , 1 );
535533
536- /* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */
534+ /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
537535 pixels_dst = _mm_loadu_si128 (dstp128 );
538536
539- /*
540- * BATCH A (the 2 low pixels)
541- */
537+ /* ==== BATCH A (the 2 low pixels) ==== */
542538
543539 /* shuffle alpha channels to duplicate 16 bit pairs
544- * shuffle (3, 3, 1, 1) (backed 2 bit numbers)
545- * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha
546- * [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ]
547- * Therefore the previous contents of 16 bit lane 1
548- * Goes into 16 bit lanes 0 and 1, and the previous
549- * content of 16 bit lane 3 goes into lanes 2 and 3*/
540+ * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> unpacked_alpha*/
550541 unpacked_alpha = _mm_shufflelo_epi16 (mm_src_alpha , 0b11110101 );
551542
552- /* finally move into final config
553- * spread out so they can be multiplied in 16 bit math
554- * against all RGBA of both pixels being blit
555- * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha
556- */
543+ /* spread alpha into final config for 16 bit math
544+ * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
557545 unpacked_alpha =
558546 _mm_unpacklo_epi16 (unpacked_alpha , unpacked_alpha );
559547
@@ -567,58 +555,58 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
567555
568556 batch_a_dst = sub_dst ;
569557
570- /*
571- * BATCH B (the 2 high pixels)
572- */
558+ /* ==== BATCH B (the 2 high pixels) ==== */
573559
560+ /*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] -> unpacked_alpha*/
574561 unpacked_alpha = _mm_shufflehi_epi16 (mm_src_alpha , 0b11110101 );
575562
563+ /*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
576564 unpacked_alpha =
577565 _mm_unpackhi_epi16 (unpacked_alpha , unpacked_alpha );
578566
579- /* 0A0R0G0B0A0R0G0B -> src1 */
567+ /*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
580568 src1 = _mm_unpackhi_epi8 (pixels_src , mm_zero );
581569
582- /* 0A0R0G0B0A0R0G0B -> dst1 */
570+ /*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
583571 dst1 = _mm_unpackhi_epi8 (pixels_dst , mm_zero );
584572
585573 ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
586574
587- /*
588- * Combine the batches and store
589- * pack everything back into a pixel with zeroed out alpha
590- */
575+ /* ==== combine batches and store ==== */
576+
591577 sub_dst = _mm_packus_epi16 (batch_a_dst , sub_dst );
578+ /* zero out alpha */
592579 sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
593580 _mm_storeu_si128 (dstp128 , sub_dst );
594581
595582 srcp128 ++ ;
596583 dstp128 ++ ;
597584 },
598- n , post_4_width );
585+ n , n_iters_4 );
599586
600587 srcp32 = (Uint32 * )srcp128 ;
601588 dstp32 = (Uint32 * )dstp128 ;
602589
603- for (int i = 0 ; i < pre_4_width ; i ++ ) {
604- /* Do the actual blend */
605- /* src(ARGB) -> src1 (000000000000ARGB) */
590+ for (int i = 0 ; i < pxl_excess ; i ++ ) {
591+ /*[00][00][00][00][00][00][AR][GB] -> src1*/
606592 src1 = _mm_cvtsi32_si128 (* srcp32 );
607- /* src1 >> ashift -> mm_src_alpha(000000000000000A) */
593+
594+ /*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/
608595 mm_src_alpha = _mm_srli_si128 (src1 , 3 );
609596
610- /* Then Calc RGB */
611- /* 0000000000000A0A -> rgb_src_alpha */
597+ /*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/
612598 mm_src_alpha = _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
613- /* 000000000A0A0A0A -> rgb_src_alpha */
599+
600+ /*[00][00][00][00][0A][0A][0A][0A] -> unpacked_alpha*/
614601 unpacked_alpha = _mm_unpacklo_epi32 (mm_src_alpha , mm_src_alpha );
615602
616- /* 000000000A0R0G0B -> src1 */
603+ /*[00][00][00][00][0A][0R][0G][0B] -> src1*/
617604 src1 = _mm_unpacklo_epi8 (src1 , mm_zero );
618605
619- /* dst(ARGB) -> dst1 (000000000000ARGB) */
606+ /*[00][00][00][00][00][00][AR][GB] -> dst1*/
620607 dst1 = _mm_cvtsi32_si128 (* dstp32 );
621- /* 000000000A0R0G0B -> dst1 */
608+
609+ /*[00][00][00][00][0A][0R][0G][0B] -> dst1*/
622610 dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
623611
624612 ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
0 commit comments