@@ -482,55 +482,105 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
482482 Uint32 * srcp32 = (Uint32 * )info -> s_pixels ;
483483 Uint32 * dstp32 = (Uint32 * )info -> d_pixels ;
484484
485- int pre_2_width = width % 2 ;
486- int post_2_width = width / 2 ;
485+ int pre_4_width = width % 4 ;
486+ int post_4_width = width / 4 ;
487487
488488 __m128i src1 , dst1 , sub_dst , mm_src_alpha ;
489+ __m128i unpacked_alpha , pixels_src , pixels_dst , batch_a_dst ;
490+ __m128i * srcp128 , * dstp128 ;
489491 __m128i mm_rgb_mask = _mm_set1_epi32 (0x00FFFFFF );
490492 __m128i mm_zero = _mm_setzero_si128 ();
491493
492494 while (height -- ) {
495+ srcp128 = (__m128i * )srcp32 ;
496+ dstp128 = (__m128i * )dstp32 ;
497+
493498 LOOP_UNROLLED4 (
494499 {
495- /* src(ARGB) -> src1 (00000000ARGBARGB) */
496- LOAD_64_INTO_M128 ((Uint64 * )srcp32 , & src1 );
500+ /*
501+ * 4 pixel preparations
502+ */
503+
504+ /* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */
505+ pixels_src = _mm_loadu_si128 (srcp128 );
497506
498507 /* isolate alpha channels
499- * 00000000A1000A2000 -> mm_src_alpha */
500- mm_src_alpha = _mm_andnot_si128 (mm_rgb_mask , src1 );
508+ * A1000A2000A3000A4000 -> mm_src_alpha */
509+ mm_src_alpha = _mm_andnot_si128 (mm_rgb_mask , pixels_src );
501510
502511 /* shift right to position alpha channels for manipulation
503- * 000000000A1000A200 -> mm_src_alpha*/
512+ * 0A1000A2000A3000A400 -> mm_src_alpha*/
504513 mm_src_alpha = _mm_srli_si128 (mm_src_alpha , 1 );
505514
515+ /* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */
516+ pixels_dst = _mm_loadu_si128 (dstp128 );
517+
518+ /*
519+ * BATCH A (the 2 low pixels)
520+ */
521+
506522 /* shuffle alpha channels to duplicate 16 bit pairs
507523 * shuffle (3, 3, 1, 1) (packed 2 bit numbers)
508- * [00][00][00][00][0A1][00][0A2][00 ] -> mm_src_alpha
509- * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ]
510- * Therefore the previous contents of 16 bit number # 1
511- * Goes into 16 bit number #1 and #2 , and the previous
512- * content of 16 bit number # 3 goes into # 2 and #3 */
513- mm_src_alpha = _mm_shufflelo_epi16 (mm_src_alpha , 0b11110101 );
524+ * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4 ] -> unpacked_alpha
525+ * [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ]
526+ * Therefore the previous contents of 16 bit lane 1
527+ * go into 16 bit lanes 0 and 1, and the previous
528+ * contents of 16 bit lane 3 go into lanes 2 and 3 */
529+ unpacked_alpha = _mm_shufflelo_epi16 (mm_src_alpha , 0b11110101 );
514530
515531 /* finally move into final config
516532 * spread out so they can be multiplied in 16 bit math
517533 * against all RGBA of both pixels being blit
518- * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */
519- mm_src_alpha = _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
534+ * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha
535+ */
536+ unpacked_alpha =
537+ _mm_unpacklo_epi16 (unpacked_alpha , unpacked_alpha );
538+
539+ /* 0A0R0G0B0A0R0G0B -> src1 */
540+ src1 = _mm_unpacklo_epi8 (pixels_src , mm_zero );
541+
542+ /* 0A0R0G0B0A0R0G0B -> dst1 */
543+ dst1 = _mm_unpacklo_epi8 (pixels_dst , mm_zero );
544+
545+ /* (srcRGB - dstRGB) */
546+ sub_dst = _mm_sub_epi16 (src1 , dst1 );
547+
548+ /* (srcRGB - dstRGB) * srcA */
549+ sub_dst = _mm_mullo_epi16 (sub_dst , unpacked_alpha );
550+
551+ /* (srcRGB - dstRGB) * srcA + srcRGB */
552+ sub_dst = _mm_add_epi16 (sub_dst , src1 );
553+
554+ /* (dstRGB << 8) */
555+ dst1 = _mm_slli_epi16 (dst1 , 8 );
556+
557+ /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
558+ sub_dst = _mm_add_epi16 (sub_dst , dst1 );
559+
560+ /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
561+ * 8)*/
562+ batch_a_dst = _mm_srli_epi16 (sub_dst , 8 );
563+
564+ /*
565+ * BATCH B (the 2 high pixels)
566+ */
567+
568+ unpacked_alpha = _mm_shufflehi_epi16 (mm_src_alpha , 0b11110101 );
569+
570+ unpacked_alpha =
571+ _mm_unpackhi_epi16 (unpacked_alpha , unpacked_alpha );
520572
521573 /* 0A0R0G0B0A0R0G0B -> src1 */
522- src1 = _mm_unpacklo_epi8 ( src1 , mm_zero );
574+ src1 = _mm_unpackhi_epi8 ( pixels_src , mm_zero );
523575
524- /* dst(ARGB) -> dst1 (00000000ARGBARGB) */
525- LOAD_64_INTO_M128 ((Uint64 * )dstp32 , & dst1 );
526576 /* 0A0R0G0B0A0R0G0B -> dst1 */
527- dst1 = _mm_unpacklo_epi8 ( dst1 , mm_zero );
577+ dst1 = _mm_unpackhi_epi8 ( pixels_dst , mm_zero );
528578
529579 /* (srcRGB - dstRGB) */
530580 sub_dst = _mm_sub_epi16 (src1 , dst1 );
531581
532582 /* (srcRGB - dstRGB) * srcA */
533- sub_dst = _mm_mullo_epi16 (sub_dst , mm_src_alpha );
583+ sub_dst = _mm_mullo_epi16 (sub_dst , unpacked_alpha );
534584
535585 /* (srcRGB - dstRGB) * srcA + srcRGB */
536586 sub_dst = _mm_add_epi16 (sub_dst , src1 );
@@ -545,18 +595,26 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
545595 * 8)*/
546596 sub_dst = _mm_srli_epi16 (sub_dst , 8 );
547597
598+ /*
599+ * Combine the batches and store
600+ */
601+
548602 /* pack everything back into a pixel with zeroed out alpha
549603 */
550- sub_dst = _mm_packus_epi16 (sub_dst , mm_zero );
604+ sub_dst = _mm_packus_epi16 (batch_a_dst , sub_dst );
551605 sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
552- STORE_M128_INTO_64 (& sub_dst , (Uint64 * )dstp32 );
553606
554- srcp32 += 2 ;
555- dstp32 += 2 ;
607+ _mm_storeu_si128 (dstp128 , sub_dst );
608+
609+ srcp128 ++ ;
610+ dstp128 ++ ;
556611 },
557- n , post_2_width );
612+ n , post_4_width );
613+
614+ srcp32 = (Uint32 * )srcp128 ;
615+ dstp32 = (Uint32 * )dstp128 ;
558616
559- for (int i = 0 ; i < pre_2_width ; i ++ ) {
617+ for (int i = 0 ; i < pre_4_width ; i ++ ) {
560618 /* Do the actual blend */
561619 /* src(ARGB) -> src1 (000000000000ARGB) */
562620 src1 = _mm_cvtsi32_si128 (* srcp32 );
0 commit comments