2121
2222using namespace v8 ;
2323
// Platform shims.
//  - isnan / isinf: on MSVC, map the C99 names onto the CRT's _isnan/_finite.
//  - bswap32: 32-bit byte-order reversal, mapped to the compiler's intrinsic.
#ifdef _MSC_VER
// Windows doesn't support the C99 names for these. TODO unnecessary,
// should be using std::isnan.
#  define isnan(x) _isnan(x)
#  define isinf(x) (!_finite(x))
#  include <intrin.h>
#  define bswap32 _byteswap_ulong
#else
#  ifdef __x86_64__
// Declares the SSE intrinsics (_mm_*) used when compiling for x86-64.
#    include <x86intrin.h>
#  endif
#  define bswap32 __builtin_bswap32
#endif
2937
/**
 * Rotates the 32-bit value `n` right by `c` bit positions.
 *
 * The rotate count is reduced modulo the bit width of `n` (32), so `c == 0`
 * and `c == 32` both return `n` unchanged.
 *
 * @param n Value to rotate.
 * @param c Number of bit positions to rotate right by.
 * @return `n` with its bits rotated right by `c mod 32`.
 */
static inline uint32_t rotr(uint32_t n, unsigned int c) {
  // GCC has no portable _rotr intrinsic, so rely on idiom recognition. Works
  // for all supported versions of MSVC, GCC x86, GCC ARM, Clang.
  // https://stackoverflow.com/a/776523/1218408
  const unsigned int mask = CHAR_BIT * sizeof(n) - 1;
  c &= mask;
  // (~c + 1) is the two's-complement negation of c. Masking it keeps the
  // left-shift count in [0, 31], so c == 0 never produces a shift by 32.
  return (n >> c) | (n << ((~c + 1) & mask));
}
46+
3047#ifndef isnan
3148#define isnan (x ) std::isnan(x)
3249#define isinf (x ) std::isinf(x)
@@ -852,32 +869,70 @@ NAN_METHOD(Context2d::PutImageData) {
852869 for (int y = 0 ; y < rows; ++y) {
853870 uint8_t *dstRow = dst;
854871 uint8_t *srcRow = src;
855- for (int x = 0 ; x < cols; ++x) {
856- // rgba
857- uint8_t r = *srcRow++;
858- uint8_t g = *srcRow++;
859- uint8_t b = *srcRow++;
860- uint8_t a = *srcRow++;
872+ #if defined(__x86_64__) || defined(_M_X64)
873+ int x = 0 ;
874+ for (; x < cols - 1 ; x += 2 ) { // Two columns at a time
875+ // Fast path if both alphas are 0.
876+ uint64_t px64;
877+ memcpy (&px64, srcRow, 8 );
878+ const uint64_t aMask = 0xFF000000'FF000000 ;
879+ const uint64_t aOnly = px64 & aMask;
880+ if (aOnly == 0 ) {
881+ memset (dstRow, 0 , 8 );
882+ dstRow += 8 ;
883+ srcRow += 8 ;
884+ continue ;
885+ }
861886
862- // argb
863- // performance optimization: fully transparent/opaque pixels can be
864- // processed more efficiently.
887+ __m128i px;
888+ memcpy (&px, srcRow, 8 ); // gcc doesn't define _mm_loadu_si64
889+ px = _mm_unpacklo_epi8 (px, _mm_setzero_si128 ());
890+ // rgba -> bgra
891+ px = _mm_shufflelo_epi16 (px, 0b11000110 );
892+ px = _mm_shufflehi_epi16 (px, 0b11000110 );
893+
894+ // Fast path if both alphas are 255.
895+ if (aOnly != aMask) {
896+ // broadcast alpha
897+ __m128i av = _mm_shufflelo_epi16 (px, 0b11111111 );
898+ av = _mm_shufflehi_epi16 (av, 0b11111111 );
899+ // Multiply by alpha.
900+ // Set alpha channel multiplier to 255 to undo upcoming division by 255
901+ const __m128i a255 = _mm_set_epi16 (0xFF , 0 , 0 , 0 , 0xFF , 0 , 0 , 0 );
902+ av = _mm_or_si128 (av, a255);
903+ px = _mm_mullo_epi16 (px, av);
904+ // divide by 255
905+ px = _mm_mulhi_epu16 (px, _mm_set1_epi16 (0x8081 ));
906+ px = _mm_srli_epi16 (px, 7 );
907+ }
908+
909+ // pack int16 to int8
910+ px = _mm_packus_epi16 (px, px);
911+ memcpy (dstRow, &px, 8 );
912+ dstRow += 8 ;
913+ srcRow += 8 ;
914+ }
915+ if (cols & 1 ) {
916+ #else
917+ for (int x = 0 ; x < cols; x++) {
918+ #endif
919+ uint32_t c;
920+ memcpy (&c, srcRow, 4 ); // rgba (LE)
921+ srcRow += 4 ;
922+ uint32_t a = c >> 24 ;
865923 if (a == 0 ) {
866- *dstRow++ = 0 ;
867- *dstRow++ = 0 ;
868- *dstRow++ = 0 ;
869- *dstRow++ = 0 ;
870- } else if (a == 255 ) {
871- *dstRow++ = b;
872- *dstRow++ = g;
873- *dstRow++ = r;
874- *dstRow++ = a;
924+ uint32_t zero = 0 ;
925+ memcpy (dstRow, &zero, 4 );
926+ } else if (a == 255 ) { // rgba (LE)
927+ c = bswap32 (c); // abgr
928+ c = rotr (c, 8 ); // bgra
929+ memcpy (dstRow, &c, 4 );
875930 } else {
876- float alpha = (float ) a / 255 ;
877- *dstRow++ = b * alpha ;
878- *dstRow++ = g * alpha ;
879- *dstRow++ = r * alpha ;
880- * dstRow++ = a ;
931+ uint8_t r = (c & 0xFF ) * a / 255 ;
932+ uint8_t g = (c >> 8 & 0xFF ) * a / 255 ;
933+ uint8_t b = (c >> 16 & 0xFF ) * a / 255 ;
934+ uint32_t bgra = (a << 24 ) | (r << 16 ) | (g << 8 ) | b ;
935+ memcpy ( dstRow, &bgra, 4 ) ;
881936 }
882937 }
883938 dst += dstStride;
@@ -892,13 +947,13 @@ NAN_METHOD(Context2d::PutImageData) {
892947 uint8_t *dstRow = dst;
893948 uint8_t *srcRow = src;
894949 for (int x = 0 ; x < cols; ++x) {
895- // rgba
950+ // rgb[a]
896951 uint8_t r = *srcRow++;
897952 uint8_t g = *srcRow++;
898953 uint8_t b = *srcRow++;
899954 srcRow++;
900955
901- // argb
956+ // bgra
902957 *dstRow++ = b;
903958 *dstRow++ = g;
904959 *dstRow++ = r;
0 commit comments