@@ -816,6 +816,42 @@ struct VECTOR_SHL_V128
       }
       e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
     } else {
+      if (e.IsFeatureEnabled(kX64EmitGFNI | kX64EmitAVX512Ortho |
+                             kX64EmitAVX512VBMI)) {
+        // gf2p8mulb's "x^8 + x^4 + x^3 + x + 1" polynomial reduction only
+        // applies when the multiplication overflows. Masking away any bits
+        // that would have overflowed turns the polynomial multiplication
+        // into regular modular multiplication.
+        const uint64_t shift_mask = UINT64_C(0x01'03'07'0f'1f'3f'7f'ff);
+        e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
+        e.vpermb(e.xmm0, i.src2, e.xmm0);
+        e.vpand(e.xmm0, i.src1, e.xmm0);
+
+        // n << 0 == n * 1 | n << 1 == n * 2 | n << 2 == n * 4 | etc.
+        const uint64_t multiply_table = UINT64_C(0x80'40'20'10'08'04'02'01);
+        e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
+        e.vpermb(e.xmm1, i.src2, e.xmm1);
+
+        e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
+        return;
+      } else if (e.IsFeatureEnabled(kX64EmitGFNI)) {
+        // Only use the lower 4 bits.
+        // This also prevents vpshufb from writing zero when the MSB is set.
+        e.LoadConstantXmm(e.xmm0, vec128b(0x0F));
+        e.vpand(e.xmm2, i.src2, e.xmm0);
+
+        const uint64_t shift_mask = UINT64_C(0x01'03'07'0f'1f'3f'7f'ff);
+        e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
+        e.vpshufb(e.xmm0, e.xmm0, e.xmm2);
+        e.vpand(e.xmm0, i.src1, e.xmm0);
+
+        const uint64_t multiply_table = UINT64_C(0x80'40'20'10'08'04'02'01);
+        e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
+        e.vpshufb(e.xmm1, e.xmm1, e.xmm2);
+
+        e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
+        return;
+      }
       e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
     }
     e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
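
For reference, a minimal scalar sketch of what the GFNI sequence above computes for each byte lane (not part of the patch; the function name `byte_shl` is made up for illustration): look up a mask and a power-of-two multiplier by the lane's shift amount, mask away the bits that would overflow the byte so gf2p8mulb's carry-less product never reaches the x^8 + x^4 + x^3 + x + 1 reduction, then multiply.

```cpp
#include <cstdint>

// Scalar model of one byte lane of the GFNI-based VECTOR_SHL_V128 path.
// shift_mask / multiply_table mirror the 0x01'03'07'0f'1f'3f'7f'ff and
// 0x80'40'20'10'08'04'02'01 constants, indexed by the per-lane shift amount.
static uint8_t byte_shl(uint8_t value, uint8_t shift) {
  static const uint8_t shift_mask[8] = {0xff, 0x7f, 0x3f, 0x1f,
                                        0x0f, 0x07, 0x03, 0x01};
  static const uint8_t multiply_table[8] = {0x01, 0x02, 0x04, 0x08,
                                            0x10, 0x20, 0x40, 0x80};
  shift &= 7;  // the duplicated 64-bit tables make the lookup wrap mod 8
  // Discard bits that would overflow the byte, so the carry-less multiply
  // in gf2p8mulb never triggers the x^8 + x^4 + x^3 + x + 1 reduction.
  const uint8_t masked = value & shift_mask[shift];
  // Multiplying by 1 << shift is now identical to an 8-bit left shift.
  return static_cast<uint8_t>(masked * multiply_table[shift]);
}
```

For example, `byte_shl(0xB1, 3)` returns 0x88, the same as `(uint8_t)(0xB1 << 3)`.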