@@ -479,164 +479,135 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
479479 int srcskip = info -> s_skip >> 2 ;
480480 int dstskip = info -> d_skip >> 2 ;
481481
482- Uint64 * srcp64 = (Uint64 * )info -> s_pixels ;
483- Uint64 * dstp64 = (Uint64 * )info -> d_pixels ;
484-
485- Uint64 rgb_mask64 = 0x00FFFFFF00FFFFFF ;
486- Uint32 rgb_mask32 = 0x00FFFFFF ;
487-
488482 Uint32 * srcp32 = (Uint32 * )info -> s_pixels ;
489483 Uint32 * dstp32 = (Uint32 * )info -> d_pixels ;
490484
491- __m128i src1 , dst1 , sub_dst , mm_src_alpha , mm_zero , mm_rgb_mask ;
485+ int pre_2_width = width % 2 ;
486+ int post_2_width = width / 2 ;
492487
493- /* There are two paths through this blitter:
494- 1. Two pixels at once.
495- 2. One pixel at a time.
496- */
497- if (((width % 2 ) == 0 ) && ((srcskip % 2 ) == 0 ) && ((dstskip % 2 ) == 0 )) {
498- width = width / 2 ;
499- srcskip = srcskip / 2 ;
500- dstskip = dstskip / 2 ;
488+ __m128i src1 , dst1 , sub_dst , mm_src_alpha ;
489+ __m128i mm_rgb_mask = _mm_set1_epi32 (0x00FFFFFF );
490+ __m128i mm_zero = _mm_setzero_si128 ();
501491
502- mm_zero = _mm_setzero_si128 ();
492+ while (height -- ) {
493+ LOOP_UNROLLED4 (
494+ {
495+ /* src(ARGB) -> src1 (00000000ARGBARGB) */
496+ LOAD_64_INTO_M128 ((Uint64 * )srcp32 , & src1 );
497+
498+ /* isolate alpha channels
499+ * 00000000A1000A2000 -> mm_src_alpha */
500+ mm_src_alpha = _mm_andnot_si128 (mm_rgb_mask , src1 );
501+
502+ /* shift right to position alpha channels for manipulation
503+ * 000000000A1000A200 -> mm_src_alpha*/
504+ mm_src_alpha = _mm_srli_si128 (mm_src_alpha , 1 );
505+
506+ /* shuffle alpha channels to duplicate 16 bit pairs
507+ * shuffle (3, 3, 1, 1) (backed 2 bit numbers)
508+ * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha
509+ * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ]
510+ * Therefore the previous contents of 16 bit number #1
511+ * Goes into 16 bit number #1 and #2, and the previous
512+ * content of 16 bit number #3 goes into #2 and #3 */
513+ mm_src_alpha = _mm_shufflelo_epi16 (mm_src_alpha , 0b11110101 );
514+
515+ /* finally move into final config
516+ * spread out so they can be multiplied in 16 bit math
517+ * against all RGBA of both pixels being blit
518+ * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */
519+ mm_src_alpha = _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
520+
521+ /* 0A0R0G0B0A0R0G0B -> src1 */
522+ src1 = _mm_unpacklo_epi8 (src1 , mm_zero );
523+
524+ /* dst(ARGB) -> dst1 (00000000ARGBARGB) */
525+ LOAD_64_INTO_M128 ((Uint64 * )dstp32 , & dst1 );
526+ /* 0A0R0G0B0A0R0G0B -> dst1 */
527+ dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
528+
529+ /* (srcRGB - dstRGB) */
530+ sub_dst = _mm_sub_epi16 (src1 , dst1 );
531+
532+ /* (srcRGB - dstRGB) * srcA */
533+ sub_dst = _mm_mullo_epi16 (sub_dst , mm_src_alpha );
534+
535+ /* (srcRGB - dstRGB) * srcA + srcRGB */
536+ sub_dst = _mm_add_epi16 (sub_dst , src1 );
537+
538+ /* (dstRGB << 8) */
539+ dst1 = _mm_slli_epi16 (dst1 , 8 );
540+
541+ /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
542+ sub_dst = _mm_add_epi16 (sub_dst , dst1 );
543+
544+ /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
545+ * 8)*/
546+ sub_dst = _mm_srli_epi16 (sub_dst , 8 );
547+
548+ /* pack everything back into a pixel with zeroed out alpha
549+ */
550+ sub_dst = _mm_packus_epi16 (sub_dst , mm_zero );
551+ sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
552+ STORE_M128_INTO_64 (& sub_dst , (Uint64 * )dstp32 );
553+
554+ srcp32 += 2 ;
555+ dstp32 += 2 ;
556+ },
557+ n , post_2_width );
503558
504- /* two pixels at a time */
505- LOAD_64_INTO_M128 (& rgb_mask64 , & mm_rgb_mask );
506- while (height -- ) {
507- LOOP_UNROLLED4 (
508- {
509- /* src(ARGB) -> src1 (00000000ARGBARGB) */
510- LOAD_64_INTO_M128 (srcp64 , & src1 );
559+ for (int i = 0 ; i < pre_2_width ; i ++ ) {
560+ /* Do the actual blend */
561+ /* src(ARGB) -> src1 (000000000000ARGB) */
562+ src1 = _mm_cvtsi32_si128 (* srcp32 );
563+ /* src1 >> ashift -> mm_src_alpha(000000000000000A) */
564+ mm_src_alpha = _mm_srli_si128 (src1 , 3 );
511565
512- /* isolate alpha channels
513- * 00000000A1000A2000 -> mm_src_alpha */
514- mm_src_alpha = _mm_andnot_si128 (mm_rgb_mask , src1 );
566+ /* Then Calc RGB */
567+ /* 0000000000000A0A -> rgb_src_alpha */
568+ mm_src_alpha = _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
569+ /* 000000000A0A0A0A -> rgb_src_alpha */
570+ mm_src_alpha = _mm_unpacklo_epi32 (mm_src_alpha , mm_src_alpha );
515571
516- /* shift right to position alpha channels for manipulation
517- * 000000000A1000A200 -> mm_src_alpha*/
518- mm_src_alpha = _mm_srli_si128 (mm_src_alpha , 1 );
572+ /* 000000000A0R0G0B -> src1 */
573+ src1 = _mm_unpacklo_epi8 (src1 , mm_zero );
519574
520- /* shuffle alpha channels to duplicate 16 bit pairs
521- * shuffle (3, 3, 1, 1) (backed 2 bit numbers)
522- * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha
523- * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ]
524- * Therefore the previous contents of 16 bit number #1
525- * Goes into 16 bit number #1 and #2, and the previous
526- * content of 16 bit number #3 goes into #2 and #3 */
527- mm_src_alpha =
528- _mm_shufflelo_epi16 (mm_src_alpha , 0b11110101 );
575+ /* dst(ARGB) -> dst1 (000000000000ARGB) */
576+ dst1 = _mm_cvtsi32_si128 (* dstp32 );
577+ /* 000000000A0R0G0B -> dst1 */
578+ dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
529579
530- /* finally move into final config
531- * spread out so they can be multiplied in 16 bit math
532- * against all RGBA of both pixels being blit
533- * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */
534- mm_src_alpha =
535- _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
580+ /* (srcRGB - dstRGB) */
581+ sub_dst = _mm_sub_epi16 (src1 , dst1 );
536582
537- /* 0A0R0G0B0A0R0G0B -> src1 */
538- src1 = _mm_unpacklo_epi8 ( src1 , mm_zero );
583+ /* (srcRGB - dstRGB) * srcA */
584+ sub_dst = _mm_mullo_epi16 ( sub_dst , mm_src_alpha );
539585
540- /* dst(ARGB) -> dst1 (00000000ARGBARGB) */
541- LOAD_64_INTO_M128 (dstp64 , & dst1 );
542- /* 0A0R0G0B0A0R0G0B -> dst1 */
543- dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
586+ /* (srcRGB - dstRGB) * srcA + srcRGB */
587+ sub_dst = _mm_add_epi16 (sub_dst , src1 );
544588
545- /* (srcRGB - dstRGB) */
546- sub_dst = _mm_sub_epi16 (src1 , dst1 );
547-
548- /* (srcRGB - dstRGB) * srcA */
549- sub_dst = _mm_mullo_epi16 (sub_dst , mm_src_alpha );
550-
551- /* (srcRGB - dstRGB) * srcA + srcRGB */
552- sub_dst = _mm_add_epi16 (sub_dst , src1 );
553-
554- /* (dstRGB << 8) */
555- dst1 = _mm_slli_epi16 (dst1 , 8 );
589+ /* (dstRGB << 8) */
590+ dst1 = _mm_slli_epi16 (dst1 , 8 );
556591
557- /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
558- sub_dst = _mm_add_epi16 (sub_dst , dst1 );
592+ /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
593+ sub_dst = _mm_add_epi16 (sub_dst , dst1 );
559594
560- /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
561- * 8)*/
562- sub_dst = _mm_srli_epi16 (sub_dst , 8 );
595+ /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
596+ * 8)*/
597+ sub_dst = _mm_srli_epi16 (sub_dst , 8 );
563598
564- /* pack everything back into a pixel with zeroed out alpha
565- */
566- sub_dst = _mm_packus_epi16 (sub_dst , mm_zero );
567- sub_dst = _mm_and_si128 ( sub_dst , mm_rgb_mask );
568- STORE_M128_INTO_64 ( & sub_dst , dstp64 );
599+ /* pack everything back into a pixel */
600+ sub_dst = _mm_packus_epi16 ( sub_dst , mm_zero );
601+ sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
602+ /* reset alpha to 0 */
603+ * dstp32 = _mm_cvtsi128_si32 ( sub_dst );
569604
570- ++ srcp64 ;
571- ++ dstp64 ;
572- },
573- n , width );
574- srcp64 += srcskip ;
575- dstp64 += dstskip ;
605+ srcp32 ++ ;
606+ dstp32 ++ ;
576607 }
577- }
578- else {
579- /* one pixel at a time */
580- mm_zero = _mm_setzero_si128 ();
581- mm_rgb_mask = _mm_cvtsi32_si128 (rgb_mask32 );
582-
583- while (height -- ) {
584- LOOP_UNROLLED4 (
585- {
586- /* Do the actual blend */
587- /* src(ARGB) -> src1 (000000000000ARGB) */
588- src1 = _mm_cvtsi32_si128 (* srcp32 );
589- /* src1 >> ashift -> mm_src_alpha(000000000000000A) */
590- mm_src_alpha = _mm_srli_si128 (src1 , 3 );
591-
592- /* Then Calc RGB */
593- /* 0000000000000A0A -> rgb_src_alpha */
594- mm_src_alpha =
595- _mm_unpacklo_epi16 (mm_src_alpha , mm_src_alpha );
596- /* 000000000A0A0A0A -> rgb_src_alpha */
597- mm_src_alpha =
598- _mm_unpacklo_epi32 (mm_src_alpha , mm_src_alpha );
599-
600- /* 000000000A0R0G0B -> src1 */
601- src1 = _mm_unpacklo_epi8 (src1 , mm_zero );
602-
603- /* dst(ARGB) -> dst1 (000000000000ARGB) */
604- dst1 = _mm_cvtsi32_si128 (* dstp32 );
605- /* 000000000A0R0G0B -> dst1 */
606- dst1 = _mm_unpacklo_epi8 (dst1 , mm_zero );
607-
608- /* (srcRGB - dstRGB) */
609- sub_dst = _mm_sub_epi16 (src1 , dst1 );
610-
611- /* (srcRGB - dstRGB) * srcA */
612- sub_dst = _mm_mullo_epi16 (sub_dst , mm_src_alpha );
613-
614- /* (srcRGB - dstRGB) * srcA + srcRGB */
615- sub_dst = _mm_add_epi16 (sub_dst , src1 );
616-
617- /* (dstRGB << 8) */
618- dst1 = _mm_slli_epi16 (dst1 , 8 );
619-
620- /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
621- sub_dst = _mm_add_epi16 (sub_dst , dst1 );
622-
623- /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
624- * 8)*/
625- sub_dst = _mm_srli_epi16 (sub_dst , 8 );
626608
627- /* pack everything back into a pixel */
628- sub_dst = _mm_packus_epi16 (sub_dst , mm_zero );
629- sub_dst = _mm_and_si128 (sub_dst , mm_rgb_mask );
630- /* reset alpha to 0 */
631- * dstp32 = _mm_cvtsi128_si32 (sub_dst );
632-
633- ++ srcp32 ;
634- ++ dstp32 ;
635- },
636- n , width );
637- srcp32 += srcskip ;
638- dstp32 += dstskip ;
639- }
609+ srcp32 += srcskip ;
610+ dstp32 += dstskip ;
640611 }
641612}
642613
0 commit comments