@@ -3249,3 +3249,209 @@ define double @v_fmaximum3_f64_const1_const2(double %a) {
32493249 %max1 = call double @llvm.maximum.f64 (double %max0 , double 16 .0 )
32503250 ret double %max1
32513251}
3252+
; f32 multi-use case: %max0 is consumed by the second llvm.maximum.f32 call and
; is also returned as element 0 of the result vector; %max1 is element 1.
; The GFX12 checks expect two separate v_maximum_f32 instructions. The GFX9
; checks expect each maximum to be expanded into v_max_f32 + v_cmp_o_f32 +
; v_cndmask_b32 against the constant 0x7fc00000.
; NOTE(review): the "no_fmaximum3" name suggests the point is that the two
; maximums are not merged into one three-operand maximum because %max0 has a
; second user -- confirm against the combine under test.
3253+ define <2 x float > @v_no_fmaximum3_f32__multi_use (float %a , float %b , float %c ) {
3254+ ; GFX12-LABEL: v_no_fmaximum3_f32__multi_use:
3255+ ; GFX12: ; %bb.0:
3256+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3257+ ; GFX12-NEXT: s_wait_expcnt 0x0
3258+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3259+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3260+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3261+ ; GFX12-NEXT: v_maximum_f32 v0, v0, v1
3262+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3263+ ; GFX12-NEXT: v_maximum_f32 v1, v0, v2
3264+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3265+ ;
3266+ ; GFX9-LABEL: v_no_fmaximum3_f32__multi_use:
3267+ ; GFX9: ; %bb.0:
3268+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3269+ ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
3270+ ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
3271+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
3272+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
3273+ ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
3274+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
3275+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
3276+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3277+ %max0 = call float @llvm.maximum.f32 (float %a , float %b )
3278+ %max1 = call float @llvm.maximum.f32 (float %max0 , float %c )
3279+ %insert.0 = insertelement <2 x float > poison, float %max0 , i32 0
3280+ %insert.1 = insertelement <2 x float > %insert.0 , float %max1 , i32 1
3281+ ret <2 x float > %insert.1
3282+ }
3283+
; f32 multi-use case with inreg arguments in an amdgpu_ps function. Both %max0
; and %max1 are bitcast to i32, passed through llvm.amdgcn.readfirstlane, and
; returned together as <2 x i32>, so %max0 has a user besides the second
; llvm.maximum.f32 call.
; The GFX12 checks expect two s_maximum_f32 instructions. The GFX9 checks
; expect the v_max_f32 + v_cmp_o_f32 + v_cndmask_b32 expansion on VGPRs,
; followed by v_readfirstlane_b32 for each result.
3284+ define amdgpu_ps <2 x i32 > @s_no_fmaximum3_f32__multi_use (float inreg %a , float inreg %b , float inreg %c ) {
3285+ ; GFX12-LABEL: s_no_fmaximum3_f32__multi_use:
3286+ ; GFX12: ; %bb.0:
3287+ ; GFX12-NEXT: s_maximum_f32 s0, s0, s1
3288+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
3289+ ; GFX12-NEXT: s_maximum_f32 s1, s0, s2
3290+ ; GFX12-NEXT: ; return to shader part epilog
3291+ ;
3292+ ; GFX9-LABEL: s_no_fmaximum3_f32__multi_use:
3293+ ; GFX9: ; %bb.0:
3294+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
3295+ ; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
3296+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
3297+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
3298+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
3299+ ; GFX9-NEXT: v_max_f32_e32 v1, s2, v0
3300+ ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
3301+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
3302+ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3303+ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3304+ ; GFX9-NEXT: ; return to shader part epilog
3305+ %max0 = call float @llvm.maximum.f32 (float %a , float %b )
3306+ %max1 = call float @llvm.maximum.f32 (float %max0 , float %c )
3307+ %cast0 = bitcast float %max0 to i32
3308+ %cast1 = bitcast float %max1 to i32
3309+ %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane (i32 %cast0 )
3310+ %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane (i32 %cast1 )
3311+ %insert.0 = insertelement <2 x i32 > poison, i32 %readfirstlane0 , i32 0
3312+ %insert.1 = insertelement <2 x i32 > %insert.0 , i32 %readfirstlane1 , i32 1
3313+ ret <2 x i32 > %insert.1
3314+ }
3315+
; f16 multi-use case: %max0 is consumed by the second llvm.maximum.f16 call and
; is also inserted as element 0 of the returned <2 x half>; %max1 is element 1.
; The GFX12 checks expect two v_maximum_f16 instructions followed by
; v_pack_b32_f16 to build the result. The GFX9 checks expect each maximum to be
; expanded into v_max_f16 + v_cmp_o_f16 + v_cndmask_b32 against the constant
; 0x7e00, again followed by v_pack_b32_f16.
3316+ define <2 x half > @v_no_fmaximum3_f16__multi_use (half %a , half %b , half %c ) {
3317+ ; GFX12-LABEL: v_no_fmaximum3_f16__multi_use:
3318+ ; GFX12: ; %bb.0:
3319+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3320+ ; GFX12-NEXT: s_wait_expcnt 0x0
3321+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3322+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3323+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3324+ ; GFX12-NEXT: v_maximum_f16 v0, v0, v1
3325+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3326+ ; GFX12-NEXT: v_maximum_f16 v1, v0, v2
3327+ ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
3328+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3329+ ;
3330+ ; GFX9-LABEL: v_no_fmaximum3_f16__multi_use:
3331+ ; GFX9: ; %bb.0:
3332+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3333+ ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
3334+ ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
3335+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
3336+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
3337+ ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
3338+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
3339+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
3340+ ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
3341+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3342+ %max0 = call half @llvm.maximum.f16 (half %a , half %b )
3343+ %max1 = call half @llvm.maximum.f16 (half %max0 , half %c )
3344+ %insert.0 = insertelement <2 x half > poison, half %max0 , i32 0
3345+ %insert.1 = insertelement <2 x half > %insert.0 , half %max1 , i32 1
3346+ ret <2 x half > %insert.1
3347+ }
3348+
; f16 multi-use case with inreg arguments in an amdgpu_ps function. Both %max0
; and %max1 are bitcast to i16, zero-extended to i32, passed through
; llvm.amdgcn.readfirstlane, and returned together as <2 x i32>, so %max0 has a
; user besides the second llvm.maximum.f16 call.
; The GFX12 checks expect two s_maximum_f16 instructions followed by s_and_b32
; with 0xffff on each result (the zext). The GFX9 checks expect the v_max_f16 +
; v_cmp_o_f16 + v_cndmask_b32 expansion, v_and_b32 with 0xffff, and then
; v_readfirstlane_b32 for each result.
3349+ define amdgpu_ps <2 x i32 > @s_no_fmaximum3_f16__multi_use (half inreg %a , half inreg %b , half inreg %c ) {
3350+ ; GFX12-LABEL: s_no_fmaximum3_f16__multi_use:
3351+ ; GFX12: ; %bb.0:
3352+ ; GFX12-NEXT: s_maximum_f16 s0, s0, s1
3353+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
3354+ ; GFX12-NEXT: s_maximum_f16 s1, s0, s2
3355+ ; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
3356+ ; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
3357+ ; GFX12-NEXT: ; return to shader part epilog
3358+ ;
3359+ ; GFX9-LABEL: s_no_fmaximum3_f16__multi_use:
3360+ ; GFX9: ; %bb.0:
3361+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
3362+ ; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
3363+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
3364+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
3365+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
3366+ ; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
3367+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
3368+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
3369+ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
3370+ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
3371+ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3372+ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3373+ ; GFX9-NEXT: ; return to shader part epilog
3374+ %max0 = call half @llvm.maximum.f16 (half %a , half %b )
3375+ %max1 = call half @llvm.maximum.f16 (half %max0 , half %c )
3376+ %cast0 = bitcast half %max0 to i16
3377+ %cast1 = bitcast half %max1 to i16
3378+ %ext0 = zext i16 %cast0 to i32
3379+ %ext1 = zext i16 %cast1 to i32
3380+ %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane (i32 %ext0 )
3381+ %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane (i32 %ext1 )
3382+ %insert.0 = insertelement <2 x i32 > poison, i32 %readfirstlane0 , i32 0
3383+ %insert.1 = insertelement <2 x i32 > %insert.0 , i32 %readfirstlane1 , i32 1
3384+ ret <2 x i32 > %insert.1
3385+ }
3386+
; <2 x half> multi-use case: %max0 is consumed by the second maximum and is
; also the low half of the returned <4 x half> (the shufflevector concatenates
; %max0 and %max1), so %max0 has a user besides the second maximum.
; The GFX12 checks expect two v_pk_maximum_f16 instructions. The GFX9 checks
; expect each packed maximum to be expanded into v_pk_max_f16 plus a per-half
; v_cmp_o_f16 / v_cndmask_b32 against the constant 0x7e00, with v_perm_b32
; re-packing the two halves.
; The intrinsic is spelled with the .v2f16 overload suffix so the name matches
; the <2 x half> operand type; the scalar tests in this file use .f16 for half.
3387+ define <4 x half > @v_no_fmaximum3_v2f16__multi_use (<2 x half > %a , <2 x half > %b , <2 x half > %c ) {
3388+ ; GFX12-LABEL: v_no_fmaximum3_v2f16__multi_use:
3389+ ; GFX12: ; %bb.0:
3390+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3391+ ; GFX12-NEXT: s_wait_expcnt 0x0
3392+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3393+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3394+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3395+ ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
3396+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3397+ ; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2
3398+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3399+ ;
3400+ ; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use:
3401+ ; GFX9: ; %bb.0:
3402+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3403+ ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
3404+ ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
3405+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
3406+ ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
3407+ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3408+ ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
3409+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
3410+ ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3411+ ; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4
3412+ ; GFX9-NEXT: v_pk_max_f16 v3, v0, v2
3413+ ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
3414+ ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
3415+ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3416+ ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
3417+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
3418+ ; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4
3419+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3420+ %max0 = call <2 x half > @llvm.maximum.v2f16 (<2 x half > %a , <2 x half > %b )
3421+ %max1 = call <2 x half > @llvm.maximum.v2f16 (<2 x half > %max0 , <2 x half > %c )
3422+ %concat = shufflevector <2 x half > %max0 , <2 x half > %max1 , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 >
3423+ ret <4 x half > %concat
3424+ }
3425+
; f64 multi-use case: %max0 is consumed by the second llvm.maximum.f64 call and
; is also returned as element 0 of the result vector; %max1 is element 1.
; The GFX12 checks expect two v_maximum_f64 instructions. The GFX9 checks
; expect each maximum to be expanded into v_max_f64 + v_cmp_u_f64 and a pair of
; v_cndmask_b32 selects, one per 32-bit half of the result: the high half is
; selected against the constant 0x7ff80000 and the low half against 0.
3426+ define <2 x double > @v_no_fmaximum3_f64__multi_use (double %a , double %b , double %c ) {
3427+ ; GFX12-LABEL: v_no_fmaximum3_f64__multi_use:
3428+ ; GFX12: ; %bb.0:
3429+ ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3430+ ; GFX12-NEXT: s_wait_expcnt 0x0
3431+ ; GFX12-NEXT: s_wait_samplecnt 0x0
3432+ ; GFX12-NEXT: s_wait_bvhcnt 0x0
3433+ ; GFX12-NEXT: s_wait_kmcnt 0x0
3434+ ; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
3435+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3436+ ; GFX12-NEXT: v_maximum_f64 v[2:3], v[0:1], v[4:5]
3437+ ; GFX12-NEXT: s_setpc_b64 s[30:31]
3438+ ;
3439+ ; GFX9-LABEL: v_no_fmaximum3_f64__multi_use:
3440+ ; GFX9: ; %bb.0:
3441+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3442+ ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
3443+ ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
3444+ ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
3445+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
3446+ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
3447+ ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
3448+ ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
3449+ ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3450+ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
3451+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
3452+ %max0 = call double @llvm.maximum.f64 (double %a , double %b )
3453+ %max1 = call double @llvm.maximum.f64 (double %max0 , double %c )
3454+ %insert.0 = insertelement <2 x double > poison, double %max0 , i32 0
3455+ %insert.1 = insertelement <2 x double > %insert.0 , double %max1 , i32 1
3456+ ret <2 x double > %insert.1
3457+ }
0 commit comments