@@ -816,6 +816,41 @@ struct VECTOR_SHL_V128
       }
       e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
     } else {
+      // gf2p8mulb's "x^8 + x^4 + x^3 + x + 1" polynomial reduction only
+      // applies when the multiplication overflows. Masking away any bits
+      // that would have overflowed turns the polynomial multiplication
+      // into an ordinary multiplication modulo 256.
+      const uint64_t shift_mask = UINT64_C(0x01'03'07'0f'1f'3f'7f'ff);
+      // n << 0 == n * 1 | n << 1 == n * 2 | n << 2 == n * 4 | etc
+      const uint64_t multiply_table = UINT64_C(0x80'40'20'10'08'04'02'01);
+
+      if (e.IsFeatureEnabled(kX64EmitGFNI | kX64EmitAVX512Ortho |
+                             kX64EmitAVX512VBMI)) {
+        e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
+        e.vpermb(e.xmm0, i.src2, e.xmm0);
+        e.vpand(e.xmm0, i.src1, e.xmm0);
+
+        e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
+        e.vpermb(e.xmm1, i.src2, e.xmm1);
+
+        e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
+        return;
+      } else if (e.IsFeatureEnabled(kX64EmitGFNI)) {
+        // Only use the lower 4 bits.
+        // This also prevents vpshufb from writing zero when the MSB is set.
+        e.LoadConstantXmm(e.xmm0, vec128b(0x0F));
+        e.vpand(e.xmm2, i.src2, e.xmm0);
+
+        e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
+        e.vpshufb(e.xmm0, e.xmm0, e.xmm2);
+        e.vpand(e.xmm0, i.src1, e.xmm0);
+
+        e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
+        e.vpshufb(e.xmm1, e.xmm1, e.xmm2);
+
+        e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
+        return;
+      }
       e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
     }
     e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
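
For reference, here is a minimal scalar sketch of the trick the added branch relies on. The helper names `gf2p8mulb_ref` and `shl_via_gf` are illustrative only, not part of the patch: masking the value with `0xFF >> shift` keeps the carry-less product below bit 8, so vgf2p8mulb's reduction polynomial never fires, and multiplying by `1 << shift` then reproduces an ordinary per-byte left shift.

```cpp
#include <cstdint>

// Reference model of what vgf2p8mulb does in one byte lane: carry-less
// multiply, then reduce modulo x^8 + x^4 + x^3 + x + 1 (0x11B).
static uint8_t gf2p8mulb_ref(uint8_t a, uint8_t b) {
  uint16_t acc = 0;
  for (int bit = 0; bit < 8; ++bit) {
    if (b & (1u << bit)) {
      acc ^= static_cast<uint16_t>(a) << bit;
    }
  }
  // Reduction only touches bits 8..14, i.e. only when the product overflowed.
  for (int bit = 14; bit >= 8; --bit) {
    if (acc & (1u << bit)) {
      acc ^= 0x11Bu << (bit - 8);
    }
  }
  return static_cast<uint8_t>(acc);
}

// Per-byte "value << shift" (shift in [0, 7]) built from the same two tables
// as the patch: shift_mask byte = 0xFF >> shift, multiply_table byte = 1 << shift.
static uint8_t shl_via_gf(uint8_t value, uint8_t shift) {
  const uint8_t mask = static_cast<uint8_t>(0xFF >> shift);
  const uint8_t multiplier = static_cast<uint8_t>(1u << shift);
  // The mask clears every bit that could overflow, so the carry-less product
  // stays below bit 8, the reduction never applies, and the result equals
  // (value << shift) & 0xFF.
  return gf2p8mulb_ref(value & mask, multiplier);
}
```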