@@ -470,6 +470,27 @@ alphablit_alpha_sse2_argb_no_surf_alpha(SDL_BlitInfo *info)
470470 }
471471}
472472
473+ /* Defines the blend procedure at the core of
474+ * alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst
475+ * Per 16 bit lane: sub_dst = ((dst1 << 8) + (src1 - dst1) * unpacked_alpha + src1) >> 8
476+ * Input variables: src1, dst1, unpacked_alpha
477+ * containing unpacked 16 bit lanes of src, dst, and src alpha
478+ * Output variables: sub_dst (dst1 is also overwritten with dst1 << 8)
479+ * Expands to bare statements ending in ';', so it is invoked without a trailing semicolon */
480+ #define ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE \
481+ /* (srcRGB - dstRGB) */ \
482+ sub_dst = _mm_sub_epi16(src1, dst1); \
483+ /* (srcRGB - dstRGB) * srcA */ \
484+ sub_dst = _mm_mullo_epi16 (sub_dst , unpacked_alpha ); \
485+ /* (srcRGB - dstRGB) * srcA + srcRGB */ \
486+ sub_dst = _mm_add_epi16 (sub_dst , src1 ); \
487+ /* (dstRGB << 8) */ \
488+ dst1 = _mm_slli_epi16 (dst1 , 8 ); \
489+ /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ \
490+ sub_dst = _mm_add_epi16 (sub_dst , dst1 ); \
491+ /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> 8) */ \
492+ sub_dst = _mm_srli_epi16 (sub_dst , 8 );
493+
473494void
474495alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst (SDL_BlitInfo * info )
475496{
@@ -479,164 +500,129 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
479500 int srcskip = info -> s_skip >> 2 ;
480501 int dstskip = info -> d_skip >> 2 ;
481502
482- Uint64 * srcp64 = (Uint64 * )info -> s_pixels ;
483- Uint64 * dstp64 = (Uint64 * )info -> d_pixels ;
484-
485- Uint64 rgb_mask64 = 0x00FFFFFF00FFFFFF ;
486- Uint32 rgb_mask32 = 0x00FFFFFF ;
487-
488503 Uint32 * srcp32 = (Uint32 * )info -> s_pixels ;
489504 Uint32 * dstp32 = (Uint32 * )info -> d_pixels ;
490505
491- __m128i src1 , dst1 , sub_dst , mm_src_alpha , mm_zero , mm_rgb_mask ;
506+ int pxl_excess = width % 4 ;
507+ int n_iters_4 = width / 4 ;
492508
493- /* There are two paths through this blitter:
494- 1. Two pixels at once.
495- 2. One pixel at a time.
496- */
497- if (((width % 2 ) == 0 ) && ((srcskip % 2 ) == 0 ) && ((dstskip % 2 ) == 0 )) {
498- width = width / 2 ;
499- srcskip = srcskip / 2 ;
500- dstskip = dstskip / 2 ;
509+ __m128i src1 , dst1 , sub_dst , mm_src_alpha ;
510+ __m128i unpacked_alpha , pixels_src , pixels_dst , batch_a_dst ;
511+ __m128i * srcp128 , * dstp128 ;
512+ __m128i mm_rgb_mask = _mm_set1_epi32 (0x00FFFFFF );
513+ __m128i mm_zero = _mm_setzero_si128 ();
501514
502- mm_zero = _mm_setzero_si128 ();
515+ while (height -- ) {
516+ srcp128 = (__m128i * )srcp32 ;
517+ dstp128 = (__m128i * )dstp32 ;
503518
504- /* two pixels at a time */
505- LOAD_64_INTO_M128 (& rgb_mask64 , & mm_rgb_mask );
506- while (height -- ) {
507- LOOP_UNROLLED4 (
508- {
509- /* src(ARGB) -> src1 (00000000ARGBARGB) */
510- LOAD_64_INTO_M128 (srcp64 , & src1 );
519+ LOOP_UNROLLED4 (
520+ {
521+ /* ==== load 4 pixels into SSE registers ==== */
511522
512- /* isolate alpha channels
513- * 00000000A1000A2000 -> mm_src_alpha */
514- mm_src_alpha = _mm_andnot_si128 (mm_rgb_mask , src1 );
523+ /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
524+ pixels_src = _mm_loadu_si128 (srcp128 );
515525
516- /* shift right to position alpha channels for manipulation
517- * 000000000A1000A200 -> mm_src_alpha*/
518- mm_src_alpha = _mm_srli_si128 ( mm_src_alpha , 1 );
526+ /* isolate alpha channels
527+ * [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
528+ mm_src_alpha = _mm_andnot_si128 ( mm_rgb_mask , pixels_src );
519529
520- /* shuffle alpha channels to duplicate 16 bit pairs
521- * shuffle (3, 3, 1, 1) (backed 2 bit numbers)
522- * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha
523- * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ]
524- * Therefore the previous contents of 16 bit number #1
525- * Goes into 16 bit number #1 and #2, and the previous
526- * content of 16 bit number #3 goes into #2 and #3 */
527- mm_src_alpha =
528- _mm_shufflelo_epi16 (mm_src_alpha , 0b11110101 );
530+ /* shift right to position alpha channels for manipulation
531+ * [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
532+ mm_src_alpha = _mm_srli_si128 (mm_src_alpha , 1 );
529533
530- /* finally move into final config
531- * spread out so they can be multiplied in 16 bit math
532- * against all RGBA of both pixels being blit
533- * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */
534- mm_src_alpha =
535- _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
534+ /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
535+ pixels_dst = _mm_loadu_si128 (dstp128 );
536536
537- /* 0A0R0G0B0A0R0G0B -> src1 */
538- src1 = _mm_unpacklo_epi8 (src1 , mm_zero );
537+ /* ==== BATCH A (the 2 low pixels) ==== */
539538
540- /* dst(ARGB) -> dst1 (00000000ARGBARGB) */
541- LOAD_64_INTO_M128 (dstp64 , & dst1 );
542- /* 0A0R0G0B0A0R0G0B -> dst1 */
543- dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
539+ /* shuffle alpha channels to duplicate 16 bit pairs
540+ * [0A1][00 ][0A2][00 ][0A3][0A3][0A4][0A4] -> unpacked_alpha*/
541+ unpacked_alpha = _mm_shufflelo_epi16 (mm_src_alpha , 0b11110101 );
544542
545- /* (srcRGB - dstRGB) */
546- sub_dst = _mm_sub_epi16 (src1 , dst1 );
543+ /* spread alpha into final config for 16 bit math
544+ * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
545+ unpacked_alpha =
546+ _mm_unpacklo_epi16 (unpacked_alpha , unpacked_alpha );
547547
548- /* (srcRGB - dstRGB) * srcA */
549- sub_dst = _mm_mullo_epi16 ( sub_dst , mm_src_alpha );
548+ /* 0A0R0G0B0A0R0G0B -> src1 */
549+ src1 = _mm_unpacklo_epi8 ( pixels_src , mm_zero );
550550
551- /* (srcRGB - dstRGB) * srcA + srcRGB */
552- sub_dst = _mm_add_epi16 ( sub_dst , src1 );
551+ /* 0A0R0G0B0A0R0G0B -> dst1 */
552+ dst1 = _mm_unpacklo_epi8 ( pixels_dst , mm_zero );
553553
554- /* (dstRGB << 8) */
555- dst1 = _mm_slli_epi16 (dst1 , 8 );
554+ ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
556555
557- /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
558- sub_dst = _mm_add_epi16 (sub_dst , dst1 );
556+ batch_a_dst = sub_dst ;
559557
560- /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
561- * 8)*/
562- sub_dst = _mm_srli_epi16 (sub_dst , 8 );
558+ /* ==== BATCH B (the 2 high pixels) ==== */
563559
564- /* pack everything back into a pixel with zeroed out alpha
565- */
566- sub_dst = _mm_packus_epi16 (sub_dst , mm_zero );
567- sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
568- STORE_M128_INTO_64 (& sub_dst , dstp64 );
560+ /*[0A1][0A1][0A2][0A2][0A3][00 ][0A4][00 ] -> unpacked_alpha*/
561+ unpacked_alpha = _mm_shufflehi_epi16 (mm_src_alpha , 0b11110101 );
569562
570- ++ srcp64 ;
571- ++ dstp64 ;
572- },
573- n , width );
574- srcp64 += srcskip ;
575- dstp64 += dstskip ;
576- }
577- }
578- else {
579- /* one pixel at a time */
580- mm_zero = _mm_setzero_si128 ();
581- mm_rgb_mask = _mm_cvtsi32_si128 (rgb_mask32 );
563+ /*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
564+ unpacked_alpha =
565+ _mm_unpackhi_epi16 (unpacked_alpha , unpacked_alpha );
582566
583- while (height -- ) {
584- LOOP_UNROLLED4 (
585- {
586- /* Do the actual blend */
587- /* src(ARGB) -> src1 (000000000000ARGB) */
588- src1 = _mm_cvtsi32_si128 (* srcp32 );
589- /* src1 >> ashift -> mm_src_alpha(000000000000000A) */
590- mm_src_alpha = _mm_srli_si128 (src1 , 3 );
567+ /*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
568+ src1 = _mm_unpackhi_epi8 (pixels_src , mm_zero );
591569
592- /* Then Calc RGB */
593- /* 0000000000000A0A -> rgb_src_alpha */
594- mm_src_alpha =
595- _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
596- /* 000000000A0A0A0A -> rgb_src_alpha */
597- mm_src_alpha =
598- _mm_unpacklo_epi32 (mm_src_alpha , mm_src_alpha );
570+ /*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
571+ dst1 = _mm_unpackhi_epi8 (pixels_dst , mm_zero );
599572
600- /* 000000000A0R0G0B -> src1 */
601- src1 = _mm_unpacklo_epi8 (src1 , mm_zero );
573+ ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
602574
603- /* dst(ARGB) -> dst1 (000000000000ARGB) */
604- dst1 = _mm_cvtsi32_si128 (* dstp32 );
605- /* 000000000A0R0G0B -> dst1 */
606- dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
575+ /* ==== combine batches and store ==== */
607576
608- /* (srcRGB - dstRGB) */
609- sub_dst = _mm_sub_epi16 (src1 , dst1 );
577+ sub_dst = _mm_packus_epi16 (batch_a_dst , sub_dst );
578+ /* zero out alpha */
579+ sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
580+ _mm_storeu_si128 (dstp128 , sub_dst );
610581
611- /* (srcRGB - dstRGB) * srcA */
612- sub_dst = _mm_mullo_epi16 (sub_dst , mm_src_alpha );
582+ srcp128 ++ ;
583+ dstp128 ++ ;
584+ },
585+ n , n_iters_4 );
613586
614- /* (srcRGB - dstRGB) * srcA + srcRGB */
615- sub_dst = _mm_add_epi16 ( sub_dst , src1 ) ;
587+ srcp32 = ( Uint32 * ) srcp128 ;
588+ dstp32 = ( Uint32 * ) dstp128 ;
616589
617- /* (dstRGB << 8) */
618- dst1 = _mm_slli_epi16 (dst1 , 8 );
590+ for (int i = 0 ; i < pxl_excess ; i ++ ) {
591+ /*[00][00][00][00][00][00][AR][GB] -> src1*/
592+ src1 = _mm_cvtsi32_si128 (* srcp32 );
619593
620- /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
621- sub_dst = _mm_add_epi16 ( sub_dst , dst1 );
594+ /*[00][00][00][00][00][00][00][0A] -> mm_src_alpha */
595+ mm_src_alpha = _mm_srli_si128 ( src1 , 3 );
622596
623- /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
624- * 8)*/
625- sub_dst = _mm_srli_epi16 (sub_dst , 8 );
597+ /*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/
598+ mm_src_alpha = _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
626599
627- /* pack everything back into a pixel */
628- sub_dst = _mm_packus_epi16 (sub_dst , mm_zero );
629- sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
630- /* reset alpha to 0 */
631- * dstp32 = _mm_cvtsi128_si32 (sub_dst );
600+ /*[00][00][00][00][0A][0A][0A][0A] -> unpacked_alpha*/
601+ unpacked_alpha = _mm_unpacklo_epi32 (mm_src_alpha , mm_src_alpha );
632602
633- ++ srcp32 ;
634- ++ dstp32 ;
635- },
636- n , width );
637- srcp32 += srcskip ;
638- dstp32 += dstskip ;
603+ /*[00][00][00][00][0A][0R][0G][0B] -> src1*/
604+ src1 = _mm_unpacklo_epi8 (src1 , mm_zero );
605+
606+ /*[00][00][00][00][00][00][AR][GB] -> dst1*/
607+ dst1 = _mm_cvtsi32_si128 (* dstp32 );
608+
609+ /*[00][00][00][00][0A][0R][0G][0B] -> dst1*/
610+ dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
611+
612+ ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
613+
614+ /* pack everything back into a pixel */
615+ sub_dst = _mm_packus_epi16 (sub_dst , mm_zero );
616+ sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
617+ /* reset alpha to 0 */
618+ * dstp32 = _mm_cvtsi128_si32 (sub_dst );
619+
620+ srcp32 ++ ;
621+ dstp32 ++ ;
639622 }
623+
624+ srcp32 += srcskip ;
625+ dstp32 += dstskip ;
640626 }
641627}
642628
0 commit comments