From 8e85e3138dd485c4221cc12aff9eb60ab48ed3b5 Mon Sep 17 00:00:00 2001 From: Chaitanya Date: Fri, 14 Nov 2025 21:17:47 +0530 Subject: [PATCH 1/7] [SWDEV-531975] Implement workdistribute construct lowering (#140523) (#541) This PR introduces a new pass "lower-workdistribute". Fortran array statements are lowered to fir as fir.do_loop unordered. "lower-workdistribute" pass works mainly on identifying "fir.do_loop unordered" that is nested in target{teams{workdistribute{fir.do_loop unordered}}} and lowers it to target{teams{parallel{distribute{wsloop{loop_nest}}}}}. It hoists all the other ops outside target region. Replaces heap allocation on target with omp.target_allocmem and deallocation with omp.target_freemem from host. Also replaces runtime function "Assign" with omp.target_memcpy from host. This pass implements the following rewrites and optimisations: - **FissionWorkdistribute**: finds the parallelizable ops within teams {workdistribute} region and moves them to their own teams{workdistribute} region. - **WorkdistributeRuntimeCallLower**: finds the FortranAAssign calls nested in teams {workdistribute{}} and lowers them to an unordered do loop if src is scalar and dest is array. Other runtime calls are not handled currently. - **WorkdistributeDoLower**: finds the fir.do_loop unordered nested in teams {workdistribute{fir.do_loop unordered}} and lowers it to teams {parallel { distribute {wsloop {loop_nest}}}}. - **TeamsWorkdistributeToSingle**: hoists all the ops inside teams {workdistribute{}} before teams op. 
The work in this PR is C-P and updated from @ivanradanov's commits from the coexecute implementation: [flang_workdistribute_iwomp_2024](https://github.com/ivanradanov/llvm-project/commits/flang_workdistribute_iwomp_2024) Paper related to this work by @ivanradanov ["Automatic Parallelization and OpenMP Offloading of Fortran Array Notation"](https://www.osti.gov/servlets/purl/2449728) --- .../include/flang/Optimizer/OpenMP/Passes.td | 4 + flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + .../Optimizer/OpenMP/LowerWorkdistribute.cpp | 1852 +++++++++++++++++ flang/lib/Optimizer/Passes/Pipelines.cpp | 4 +- flang/test/Fir/basic-program.fir | 1 + .../Lower/OpenMP/workdistribute-multiple.f90 | 20 + .../Lower/OpenMP/workdistribute-saxpy-1d.f90 | 39 + .../Lower/OpenMP/workdistribute-saxpy-2d.f90 | 45 + .../Lower/OpenMP/workdistribute-saxpy-3d.f90 | 47 + ...workdistribute-saxpy-and-scalar-assign.f90 | 53 + .../OpenMP/workdistribute-saxpy-two-2d.f90 | 68 + .../OpenMP/workdistribute-scalar-assign.f90 | 29 + .../workdistribute-target-teams-clauses.f90 | 32 + ...workdistribute-teams-unsupported-after.f90 | 22 + ...orkdistribute-teams-unsupported-before.f90 | 22 + .../OpenMP/lower-workdistribute-doloop.mlir | 33 + .../lower-workdistribute-fission-host.mlir | 117 ++ .../lower-workdistribute-fission-target.mlir | 118 ++ .../OpenMP/lower-workdistribute-fission.mlir | 71 + ...-workdistribute-runtime-assign-scalar.mlir | 108 + 20 files changed, 2685 insertions(+), 1 deletion(-) create mode 100644 flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp create mode 100644 flang/test/Lower/OpenMP/workdistribute-multiple.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-saxpy-1d.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-saxpy-2d.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-saxpy-3d.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 create mode 100644 
flang/test/Lower/OpenMP/workdistribute-saxpy-two-2d.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-target-teams-clauses.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-teams-unsupported-after.f90 create mode 100644 flang/test/Lower/OpenMP/workdistribute-teams-unsupported-before.f90 create mode 100644 flang/test/Transforms/OpenMP/lower-workdistribute-doloop.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workdistribute-fission.mlir create mode 100644 flang/test/Transforms/OpenMP/lower-workdistribute-runtime-assign-scalar.mlir diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 2b39c81178084..8d30f165dd8b6 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -110,6 +110,10 @@ def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> { let summary = "Lower workshare construct"; } +def LowerWorkdistribute : Pass<"lower-workdistribute", "::mlir::ModuleOp"> { + let summary = "Lower workdistribute construct"; +} + def GenericLoopConversionPass : Pass<"omp-generic-loop-conversion", "mlir::func::FuncOp"> { let summary = "Converts OpenMP generic `omp.loop` to semantically " diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 579e47268afea..304333fa8830e 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -9,6 +9,7 @@ add_flang_library(FlangOpenMPTransforms MapsForPrivatizedSymbols.cpp MapInfoFinalization.cpp MarkDeclareTarget.cpp + LowerWorkdistribute.cpp LowerWorkshare.cpp LowerNontemporal.cpp SimdOnly.cpp diff --git 
a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp new file mode 100644 index 0000000000000..cfa39e142907c --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp @@ -0,0 +1,1852 @@ +//===- LowerWorkdistribute.cpp +//-------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the lowering and optimisations of omp.workdistribute. +// +// Fortran array statements are lowered to fir as fir.do_loop unordered. +// lower-workdistribute pass works mainly on identifying fir.do_loop unordered +// that is nested in target{teams{workdistribute{fir.do_loop unordered}}} and +// lowers it to target{teams{parallel{distribute{wsloop{loop_nest}}}}}. +// It hoists all the other ops outside target region. +// Replaces heap allocation on target with omp.target_allocmem and +// deallocation with omp.target_freemem from host. Also replaces +// runtime function "Assign" with omp_target_memcpy. 
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Utils.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Value.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace flangomp { +#define GEN_PASS_DEF_LOWERWORKDISTRIBUTE +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp + +#define DEBUG_TYPE "lower-workdistribute" + +using namespace mlir; + +namespace { + +/// This string is used to identify the Fortran-specific runtime FortranAAssign. +static constexpr llvm::StringRef FortranAssignStr = "_FortranAAssign"; + +/// The isRuntimeCall function is a utility designed to determine +/// if a given operation is a call to a Fortran-specific runtime function. +static bool isRuntimeCall(Operation *op) { + if (auto callOp = dyn_cast(op)) { + auto callee = callOp.getCallee(); + if (!callee) + return false; + auto *func = op->getParentOfType().lookupSymbol(*callee); + if (func->getAttr(fir::FIROpsDialect::getFirRuntimeAttrName())) + return true; + } + return false; +} + +/// This is the single source of truth about whether we should parallelize an +/// operation nested in an omp.workdistribute region. +/// Parallelize here refers to dividing into units of work. 
+static bool shouldParallelize(Operation *op) {
+  // A runtime call to Assign is always considered parallelizable.
+  if (isRuntimeCall(op)) {
+    fir::CallOp runtimeCall = cast<fir::CallOp>(op);
+    auto funcName = runtimeCall.getCallee()->getRootReference().getValue();
+    if (funcName == FortranAssignStr)
+      return true;
+  }
+  // An operation whose results are used by other operations is not
+  // parallelized: parallelizable operations should not produce values that
+  // other operations depend on.
+  if (llvm::any_of(op->getResults(),
+                   [](OpResult v) -> bool { return !v.use_empty(); }))
+    return false;
+  // We will parallelize unordered loops - these come from array syntax
+  if (auto loop = dyn_cast<fir::DoLoopOp>(op)) {
+    auto unordered = loop.getUnordered();
+    if (!unordered)
+      return false;
+    return *unordered;
+  }
+  // We cannot parallelize anything else.
+  return false;
+}
+
+/// The getPerfectlyNested function is a generic utility for finding
+/// a single, "perfectly nested" operation within a parent operation.
+///
+/// Returns the first operation of \p op's only block as a \p T when
+///   - \p op has exactly one region and that region has exactly one block,
+///   - the first operation in that block is a \p T, and
+///   - that operation is immediately followed by the block terminator.
+/// Returns a null \p T in every other case.
+template <typename T>
+static T getPerfectlyNested(Operation *op) {
+  if (op->getNumRegions() != 1)
+    return nullptr;
+  auto &region = op->getRegion(0);
+  if (region.getBlocks().size() != 1)
+    return nullptr;
+  auto *block = &region.front();
+  // front() must not be called on an empty block.
+  if (block->empty())
+    return nullptr;
+  auto *firstOp = &block->front();
+  if (auto nested = dyn_cast<T>(firstOp))
+    if (firstOp->getNextNode() == block->getTerminator())
+      return nested;
+  return nullptr;
+}
+
+/// verifyTargetTeamsWorkdistribute method verifies that
+/// omp.target { teams { workdistribute { ... } } } is well formed
+/// and fails for function calls that don't have lowering implemented yet.
+static LogicalResult
+verifyTargetTeamsWorkdistribute(omp::WorkdistributeOp workdistribute) {
+  auto loc = workdistribute->getLoc();
+  // Structural checks: workdistribute must sit directly inside a teams op and
+  // both ops must have single-block regions.
+  auto teams = dyn_cast<omp::TeamsOp>(workdistribute->getParentOp());
+  if (!teams) {
+    emitError(loc, "workdistribute not nested in teams\n");
+    return failure();
+  }
+  if (workdistribute.getRegion().getBlocks().size() != 1) {
+    emitError(loc, "workdistribute with multiple blocks\n");
+    return failure();
+  }
+  if (teams.getRegion().getBlocks().size() != 1) {
+    emitError(loc, "teams with multiple blocks\n");
+    return failure();
+  }
+
+  // teams may contain exactly one workdistribute and no other OpenMP dialect
+  // op besides its terminator.
+  bool foundWorkdistribute = false;
+  for (auto &op : teams.getOps()) {
+    if (isa<omp::WorkdistributeOp>(op)) {
+      if (foundWorkdistribute) {
+        emitError(loc, "teams has multiple workdistribute ops.\n");
+        return failure();
+      }
+      foundWorkdistribute = true;
+      continue;
+    }
+    // Identify any omp dialect ops present before/after workdistribute.
+    if (op.getDialect() && isa<omp::OpenMPDialect>(op.getDialect()) &&
+        !isa<omp::TerminatorOp>(op)) {
+      emitError(loc, "teams has omp ops other than workdistribute. Lowering "
+                     "not implemented yet.\n");
+      return failure();
+    }
+  }
+
+  // The runtime-call restriction below only applies when teams is nested
+  // directly in omp.target.
+  omp::TargetOp targetOp = dyn_cast<omp::TargetOp>(teams->getParentOp());
+  if (!targetOp)
+    return success();
+
+  for (auto &op : workdistribute.getOps()) {
+    auto callOp = dyn_cast<fir::CallOp>(op);
+    if (!callOp || !isRuntimeCall(&op))
+      continue;
+    auto funcName = (*callOp.getCallee()).getRootReference().getValue();
+    // _FortranAAssign is handled. Other runtime calls are not supported
+    // in omp.workdistribute yet.
+    if (funcName == FortranAssignStr)
+      continue;
+    emitError(loc, "Runtime call " + funcName +
+                       " lowering not supported for workdistribute yet.");
+    return failure();
+  }
+  return success();
+}
+
+/// fissionWorkdistribute method finds the parallelizable ops
+/// within teams {workdistribute} region and moves them to their
+/// own teams{workdistribute} region.
+///
+/// If B() and D() are parallelizable,
+///
+/// omp.teams {
+///   omp.workdistribute {
+///     A()
+///     B()
+///     C()
+///     D()
+///     E()
+///   }
+/// }
+///
+/// becomes
+///
+/// A()
+/// omp.teams {
+///   omp.workdistribute {
+///     B()
+///   }
+/// }
+/// C()
+/// omp.teams {
+///   omp.workdistribute {
+///     D()
+///   }
+/// }
+/// E()
+///
+/// Returns true if any op was moved or cloned, false if nothing changed, and
+/// failure if workdistribute is not nested in teams or a parallelizable op
+/// precedes the workdistribute inside teams.
+static FailureOr<bool>
+fissionWorkdistribute(omp::WorkdistributeOp workdistribute) {
+  OpBuilder rewriter(workdistribute);
+  auto loc = workdistribute->getLoc();
+  auto teams = dyn_cast<omp::TeamsOp>(workdistribute->getParentOp());
+  // Same diagnostic as the other entry points of this file; dereferencing a
+  // null teams op below would crash.
+  if (!teams) {
+    emitError(loc, "workdistribute not nested in teams\n");
+    return failure();
+  }
+  auto *teamsBlock = &teams.getRegion().front();
+  bool changed = false;
+  // Move the ops inside teams and before workdistribute outside.
+  IRMapping irMapping;
+  llvm::SmallVector<Operation *> teamsHoisted;
+  for (auto &op : teams.getOps()) {
+    if (&op == workdistribute.getOperation())
+      break;
+    if (shouldParallelize(&op)) {
+      emitError(loc, "teams has parallelize ops before first workdistribute\n");
+      return failure();
+    }
+    rewriter.setInsertionPoint(teams);
+    rewriter.clone(op, irMapping);
+    teamsHoisted.push_back(&op);
+    changed = true;
+  }
+  // Erase the originals in reverse order so users go before their producers.
+  for (auto *op : llvm::reverse(teamsHoisted)) {
+    op->replaceAllUsesWith(irMapping.lookup(op));
+    op->erase();
+  }
+
+  // While we have unhandled operations in the original workdistribute
+  auto *workdistributeBlock = &workdistribute.getRegion().front();
+  auto *terminator = workdistributeBlock->getTerminator();
+  while (&workdistributeBlock->front() != terminator) {
+    rewriter.setInsertionPoint(teams);
+    IRMapping mapping;
+    llvm::SmallVector<Operation *> hoisted;
+    Operation *parallelize = nullptr;
+    // Clone leading non-parallelizable ops in front of teams until the first
+    // parallelizable op (or the terminator) is reached.
+    for (auto &op : workdistribute.getOps()) {
+      if (&op == terminator)
+        break;
+      if (shouldParallelize(&op)) {
+        parallelize = &op;
+        break;
+      }
+      rewriter.clone(op, mapping);
+      hoisted.push_back(&op);
+      changed = true;
+    }
+
+    for (auto *op : llvm::reverse(hoisted)) {
+      op->replaceAllUsesWith(mapping.lookup(op));
+      op->erase();
+    }
+
+    // The parallelizable op is the only op left: the original
+    // teams{workdistribute} already has the desired shape.
+    if (parallelize && hoisted.empty() &&
+        parallelize->getNextNode() == terminator)
+      break;
+    if (parallelize) {
+      // Build a fresh teams{workdistribute{...}} in front of the original
+      // teams and move the parallelizable op into it.
+      auto newTeams = rewriter.cloneWithoutRegions(teams);
+      auto *newTeamsBlock = rewriter.createBlock(
+          &newTeams.getRegion(), newTeams.getRegion().begin(), {}, {});
+      for (auto arg : teamsBlock->getArguments())
+        newTeamsBlock->addArgument(arg.getType(), arg.getLoc());
+      auto newWorkdistribute = rewriter.create<omp::WorkdistributeOp>(loc);
+      // Terminator of the new teams block.
+      rewriter.create<omp::TerminatorOp>(loc);
+      rewriter.createBlock(&newWorkdistribute.getRegion(),
+                           newWorkdistribute.getRegion().begin(), {}, {});
+      auto *cloned = rewriter.clone(*parallelize);
+      parallelize->replaceAllUsesWith(cloned);
+      parallelize->erase();
+      // Terminator of the new workdistribute block.
+      rewriter.create<omp::TerminatorOp>(loc);
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+/// Generate omp.parallel operation with an empty region.
+/// On return the builder's insertion point is immediately before the
+/// terminator of the new omp.parallel block.
+static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) {
+  auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
+  parallelOp.setComposite(composite);
+  rewriter.createBlock(&parallelOp.getRegion());
+  rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+}
+
+/// Generate omp.distribute operation with an empty region.
+/// On return the builder's insertion point is at the start of the new
+/// omp.distribute block.
+static void genDistributeOp(Location loc, OpBuilder &rewriter, bool composite) {
+  mlir::omp::DistributeOperands distributeClauseOps;
+  auto distributeOp =
+      rewriter.create<mlir::omp::DistributeOp>(loc, distributeClauseOps);
+  distributeOp.setComposite(composite);
+  auto distributeBlock = rewriter.createBlock(&distributeOp.getRegion());
+  rewriter.setInsertionPointToStart(distributeBlock);
+}
+
+/// Generate loop nest clause operands from fir.do_loop operation.
+static void
+genLoopNestClauseOps(OpBuilder &rewriter, fir::DoLoopOp loop,
+                     mlir::omp::LoopNestOperands &loopNestClauseOps) {
+  assert(loopNestClauseOps.loopLowerBounds.empty() &&
+         "Loop nest bounds were already emitted!");
+  // A single fir.do_loop contributes exactly one (lb, ub, step) triple.
+  loopNestClauseOps.loopLowerBounds.push_back(loop.getLowerBound());
+  loopNestClauseOps.loopUpperBounds.push_back(loop.getUpperBound());
+  loopNestClauseOps.loopSteps.push_back(loop.getStep());
+  // Mark the generated loop nest as inclusive of its upper bound.
+  loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
+}
+
+/// Generate omp.wsloop operation with an empty region and
+/// clone the body of fir.do_loop operation inside the loop nest region.
+/// The omp.wsloop is created at the builder's current insertion point and the
+/// omp.loop_nest (built from \p clauseOps) inside it.
+static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop,
+                        const mlir::omp::LoopNestOperands &clauseOps,
+                        bool composite) {
+
+  auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
+  wsloopOp.setComposite(composite);
+  rewriter.createBlock(&wsloopOp.getRegion());
+
+  auto loopNestOp =
+      rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
+
+  // Clone the fir.do_loop region to the front of the loop nest region.
+  rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
+                             loopNestOp.getRegion().begin());
+  Block *clonedBlock = &loopNestOp.getRegion().back();
+  mlir::Operation *terminatorOp = clonedBlock->getTerminator();
+
+  // Erase fir.result op of do loop and create yield op.
+  if (auto resultOp = dyn_cast<fir::ResultOp>(terminatorOp)) {
+    rewriter.setInsertionPoint(terminatorOp);
+    rewriter.create<mlir::omp::YieldOp>(doLoop->getLoc());
+    terminatorOp->erase();
+  }
+}
+
+/// workdistributeDoLower method finds the fir.do_loop unordered
+/// nested in teams {workdistribute{fir.do_loop unordered}} and
+/// lowers it to teams {parallel { distribute {wsloop {loop_nest}}}}.
+///
+/// If fir.do_loop is present inside teams workdistribute
+///
+/// omp.teams {
+///   omp.workdistribute {
+///     fir.do_loop unordered {
+///       ...
+///     }
+///   }
+/// }
+///
+/// Then, it is lowered to
+///
+/// omp.teams {
+///   omp.parallel {
+///     omp.distribute {
+///       omp.wsloop {
+///         omp.loop_nest {
+///           ...
+///         }
+///       }
+///     }
+///   }
+/// }
+///
+/// Returns true if the workdistribute op was lowered (and erased), false if
+/// it does not consist of a single parallelizable fir.do_loop.
+static bool
+workdistributeDoLower(omp::WorkdistributeOp workdistribute,
+                      SetVector<omp::TargetOp> &targetOpsToProcess) {
+  OpBuilder rewriter(workdistribute);
+  auto doLoop = getPerfectlyNested<fir::DoLoopOp>(workdistribute);
+  auto wdLoc = workdistribute->getLoc();
+  if (!doLoop || !shouldParallelize(doLoop))
+    return false;
+  assert(doLoop.getReduceOperands().empty());
+
+  // Record the target ops to process later
+  if (auto teamsOp = dyn_cast<omp::TeamsOp>(workdistribute->getParentOp())) {
+    auto targetOp = dyn_cast<omp::TargetOp>(teamsOp->getParentOp());
+    if (targetOp)
+      targetOpsToProcess.insert(targetOp);
+  }
+  // Generate the nested parallel, distribute, wsloop and loop_nest ops.
+  genParallelOp(wdLoc, rewriter, true);
+  genDistributeOp(wdLoc, rewriter, true);
+  mlir::omp::LoopNestOperands loopNestClauseOps;
+  genLoopNestClauseOps(rewriter, doLoop, loopNestClauseOps);
+  genWsLoopOp(rewriter, doLoop, loopNestClauseOps, true);
+  workdistribute.erase();
+  return true;
+}
+
+/// Check if the enclosed type in fir.ref is fir.box and fir.box encloses array
+static bool isEnclosedTypeRefToBoxArray(Type type) {
+  // Check if it's a reference type
+  if (auto refType = dyn_cast<fir::ReferenceType>(type)) {
+    // Get the referenced type (should be fir.box)
+    auto referencedType = refType.getEleTy();
+    // Check if referenced type is a box
+    if (auto boxType = dyn_cast<fir::BoxType>(referencedType)) {
+      // Get the boxed type and check if it's an array
+      auto boxedType = boxType.getEleTy();
+      // Check if boxed type is a sequence (array)
+      return isa<fir::SequenceType>(boxedType);
+    }
+  }
+  return false;
+}
+
+/// Check if the enclosed type in fir.box is scalar (not array)
+static bool isEnclosedTypeBoxScalar(Type type) {
+  // Check if it's a box type
+  if (auto boxType = dyn_cast<fir::BoxType>(type)) {
+    // Get the boxed type
+    auto boxedType = boxType.getEleTy();
+    // Check if boxed type is NOT a sequence (array)
+    return !isa<fir::SequenceType>(boxedType);
+  }
+  return false;
+}
+
+/// Check if the FortranAAssign call has src as scalar and dest as array.
+/// Operand 0 of the call is the destination, operand 1 the source. Emits an
+/// error and returns false when either operand is not produced by a
+/// fir.convert.
+static bool isFortranAssignSrcScalarAndDestArray(fir::CallOp callOp) {
+  if (callOp.getNumOperands() < 2)
+    return false;
+  auto srcArg = callOp.getOperand(1);
+  auto destArg = callOp.getOperand(0);
+  // Both operands should be fir.convert ops
+  auto srcConvert = srcArg.getDefiningOp<fir::ConvertOp>();
+  auto destConvert = destArg.getDefiningOp<fir::ConvertOp>();
+  if (!srcConvert || !destConvert) {
+    emitError(callOp->getLoc(),
+              "Unimplemented: FortranAssign to OpenMP lowering\n");
+    return false;
+  }
+  // Get the original types before conversion
+  auto srcOrigType = srcConvert.getValue().getType();
+  auto destOrigType = destConvert.getValue().getType();
+
+  // Check if src is scalar and dest is array
+  bool srcIsScalar = isEnclosedTypeBoxScalar(srcOrigType);
+  bool destIsArray = isEnclosedTypeRefToBoxArray(destOrigType);
+  return srcIsScalar && destIsArray;
+}
+
+/// Convert a flat index to multi-dimensional indices for an array box
+/// Example: 2D array with shape (2,4)
+///         Col 1  Col 2  Col 3  Col 4
+/// Row 1:  (1,1)  (1,2)  (1,3)  (1,4)
+/// Row 2:  (2,1)  (2,2)  (2,3)  (2,4)
+///
+/// extents: (2,4)
+///
+/// flatIdx:  0     1     2     3     4     5     6     7
+/// Indices: (1,1) (1,2) (1,3) (1,4) (2,1) (2,2) (2,3) (2,4)
+///
+/// The last dimension varies fastest. The returned indices are one-based.
+static SmallVector<Value> convertFlatToMultiDim(OpBuilder &builder,
+                                                Location loc, Value flatIdx,
+                                                Value arrayBox) {
+  // Get array type and rank
+  auto boxType = cast<fir::BoxType>(arrayBox.getType());
+  auto seqType = cast<fir::SequenceType>(boxType.getEleTy());
+  int rank = seqType.getDimension();
+
+  // Get extents for each dimension (result 1 of fir.box_dims).
+  SmallVector<Value> extents;
+  for (int i = 0; i < rank; ++i) {
+    auto dimIdx = builder.create<arith::ConstantIndexOp>(loc, i);
+    auto boxDims = fir::BoxDimsOp::create(builder, loc, arrayBox, dimIdx);
+    extents.push_back(boxDims.getResult(1));
+  }
+
+  // Convert flat index to multi-dimensional indices
+  SmallVector<Value> indices(rank);
+  Value temp = flatIdx;
+  auto c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+
+  // Work backwards through dimensions (row-major order)
+  // NOTE(review): the op kinds of the remainder/division below were lost in
+  // this copy of the patch; signed variants are assumed - confirm against the
+  // original commit. (Both operands are expected to be non-negative.)
+  for (int i = rank - 1; i >= 0; --i) {
+    Value zeroBasedIdx = builder.create<arith::RemSIOp>(loc, temp, extents[i]);
+    // Convert to one-based index
+    indices[i] = builder.create<arith::AddIOp>(loc, zeroBasedIdx, c1);
+    if (i > 0) {
+      temp = builder.create<arith::DivSIOp>(loc, temp, extents[i]);
+    }
+  }
+
+  return indices;
+}
+
+/// Calculate the total number of elements in the array box
+/// (totalElems = extent(1) * extent(2) * ... * extent(n))
+/// Returns a null Value when the boxed sequence type has no dimensions.
+static Value CalculateTotalElements(OpBuilder &builder, Location loc,
+                                    Value arrayBox) {
+  auto boxType = cast<fir::BoxType>(arrayBox.getType());
+  auto seqType = cast<fir::SequenceType>(boxType.getEleTy());
+  int rank = seqType.getDimension();
+
+  Value totalElems = nullptr;
+  for (int i = 0; i < rank; ++i) {
+    auto dimIdx = builder.create<arith::ConstantIndexOp>(loc, i);
+    auto boxDims = fir::BoxDimsOp::create(builder, loc, arrayBox, dimIdx);
+    Value extent = boxDims.getResult(1);
+    if (i == 0) {
+      totalElems = extent;
+    } else {
+      totalElems = builder.create<arith::MulIOp>(loc, totalElems, extent);
+    }
+  }
+  return totalElems;
+}
+
+/// Replace the FortranAAssign runtime call with an unordered do loop.
+///
+/// The scalar source is loaded once in front of \p teamsOp; a single
+/// unordered fir.do_loop over the flattened destination array is created at
+/// the start of the \p workdistribute block and stores the scalar into each
+/// element. The call itself is not erased here. Emits an error and returns
+/// without creating anything when the destination box is not defined by a
+/// fir.alloca.
+static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc,
+                                       omp::TeamsOp teamsOp,
+                                       omp::WorkdistributeOp workdistribute,
+                                       fir::CallOp callOp) {
+  auto destConvert = callOp.getOperand(0).getDefiningOp<fir::ConvertOp>();
+  auto srcConvert = callOp.getOperand(1).getDefiningOp<fir::ConvertOp>();
+
+  Value destBox = destConvert.getValue();
+  Value srcBox = srcConvert.getValue();
+
+  // get defining alloca op of destBox
+  auto destAlloca = destBox.getDefiningOp<fir::AllocaOp>();
+
+  if (!destAlloca) {
+    emitError(loc, "Unimplemented: FortranAssign to OpenMP lowering\n");
+    return;
+  }
+
+  // get the store op that stores to the alloca (the first fir.store found
+  // among the alloca's users) and use the stored value as destination box
+  for (auto user : destAlloca->getUsers()) {
+    if (auto storeOp = dyn_cast<fir::StoreOp>(user)) {
+      destBox = storeOp.getValue();
+      break;
+    }
+  }
+
+  builder.setInsertionPoint(teamsOp);
+  // Load destination array box (if it's a reference)
+  Value arrayBox = destBox;
+  if (isa<fir::ReferenceType>(destBox.getType()))
+    arrayBox = builder.create<fir::LoadOp>(loc, destBox);
+
+  auto scalarValue = builder.create<fir::BoxAddrOp>(loc, srcBox);
+  Value scalar = builder.create<fir::LoadOp>(loc, scalarValue);
+
+  // Calculate total number of elements (flattened)
+  auto c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
+  auto c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value totalElems = CalculateTotalElements(builder, loc, arrayBox);
+
+  auto *workdistributeBlock = &workdistribute.getRegion().front();
+  builder.setInsertionPointToStart(workdistributeBlock);
+  // Create single unordered loop for flattened array
+  // NOTE(review): if fir.do_loop treats its upper bound as inclusive, this
+  // runs totalElems + 1 iterations; the extra index would wrap to the first
+  // element through the remainder in convertFlatToMultiDim and store the same
+  // scalar again. Confirm whether the bound should be totalElems - 1.
+  auto doLoop = fir::DoLoopOp::create(builder, loc, c0, totalElems, c1, true);
+  Block *loopBlock = &doLoop.getRegion().front();
+  builder.setInsertionPointToStart(doLoop.getBody());
+
+  auto flatIdx = loopBlock->getArgument(0);
+  SmallVector<Value> indices =
+      convertFlatToMultiDim(builder, loc, flatIdx, arrayBox);
+  // Use fir.array_coor to address the element selected by the indices
+  auto elemPtr = fir::ArrayCoorOp::create(
+      builder, loc, fir::ReferenceType::get(scalar.getType()), arrayBox,
+      nullptr, nullptr, ValueRange{indices}, ValueRange{});
+
+  builder.create<fir::StoreOp>(loc, scalar, elemPtr);
+}
+
+/// workdistributeRuntimeCallLower method finds the runtime calls
+/// nested in teams {workdistribute{}} and
+/// lowers FortranAAssign to unordered do loop if src is scalar and dest is
+/// array. Other runtime calls are not handled currently.
+static FailureOr<bool>
+workdistributeRuntimeCallLower(omp::WorkdistributeOp workdistribute,
+                               SetVector<omp::TargetOp> &targetOpsToProcess) {
+  OpBuilder rewriter(workdistribute);
+  auto loc = workdistribute->getLoc();
+  auto teams = dyn_cast<omp::TeamsOp>(workdistribute->getParentOp());
+  if (!teams) {
+    emitError(loc, "workdistribute not nested in teams\n");
+    return failure();
+  }
+  if (workdistribute.getRegion().getBlocks().size() != 1) {
+    emitError(loc, "workdistribute with multiple blocks\n");
+    return failure();
+  }
+  if (teams.getRegion().getBlocks().size() != 1) {
+    emitError(loc, "teams with multiple blocks\n");
+    return failure();
+  }
+  bool changed = false;
+  // Get the target op parent of teams; the rewrite below is only applied
+  // when teams is nested directly in omp.target.
+  omp::TargetOp targetOp = dyn_cast<omp::TargetOp>(teams->getParentOp());
+  SmallVector<Operation *> opsToErase;
+  for (auto &op : workdistribute.getOps()) {
+    if (!isRuntimeCall(&op))
+      continue;
+    rewriter.setInsertionPoint(&op);
+    fir::CallOp runtimeCall = cast<fir::CallOp>(op);
+    auto funcName = runtimeCall.getCallee()->getRootReference().getValue();
+    if (funcName != FortranAssignStr)
+      continue;
+    if (isFortranAssignSrcScalarAndDestArray(runtimeCall) && targetOp) {
+      // Record the target ops to process later
+      targetOpsToProcess.insert(targetOp);
+      replaceWithUnorderedDoLoop(rewriter, loc, teams, workdistribute,
+                                 runtimeCall);
+      // Erasure is deferred so the block is not shrunk while iterating.
+      opsToErase.push_back(&op);
+      changed = true;
+    }
+  }
+  // Erase the runtime calls that have been replaced.
+  for (auto *op : opsToErase) {
+    op->erase();
+  }
+  return changed;
+}
+
+/// teamsWorkdistributeToSingleOp method hoists all the ops inside
+/// teams {workdistribute{}} before teams op.
+///
+/// If A() and B() are present inside teams workdistribute
+///
+/// omp.teams {
+///   omp.workdistribute {
+///     A()
+///     B()
+///   }
+/// }
+///
+/// Then, it is lowered to
+///
+/// A()
+/// B()
+///
+/// If only the terminator remains in teams after hoisting, we erase teams op.
+static bool
+teamsWorkdistributeToSingleOp(omp::TeamsOp teamsOp,
+                              SetVector<omp::TargetOp> &targetOpsToProcess) {
+  auto workdistributeOp = getPerfectlyNested<omp::WorkdistributeOp>(teamsOp);
+  if (!workdistributeOp)
+    return false;
+  // Get the block containing teamsOp (the parent block).
+  Block *parentBlock = teamsOp->getBlock();
+  Block &workdistributeBlock = *workdistributeOp.getRegion().begin();
+  // Record the target op to process later if the workdistribute block holds
+  // at least one parallelizable op. The parent lookup does not depend on the
+  // op being inspected, so it is done once.
+  if (auto targetOp = dyn_cast<omp::TargetOp>(teamsOp->getParentOp())) {
+    if (llvm::any_of(workdistributeBlock.getOperations(),
+                     [](Operation &op) { return shouldParallelize(&op); }))
+      targetOpsToProcess.insert(targetOp);
+  }
+  auto insertPoint = Block::iterator(teamsOp);
+  // Get the range of operations to move (excluding the terminator).
+  auto workdistributeBegin = workdistributeBlock.begin();
+  auto workdistributeEnd = workdistributeBlock.getTerminator()->getIterator();
+  // Move the operations from workdistribute block to before teamsOp.
+  parentBlock->getOperations().splice(insertPoint,
+                                      workdistributeBlock.getOperations(),
+                                      workdistributeBegin, workdistributeEnd);
+  // Erase the now-empty workdistributeOp.
+  workdistributeOp.erase();
+  Block &teamsBlock = *teamsOp.getRegion().begin();
+  // Check if only the terminator remains and erase teams op.
+  if (teamsBlock.getOperations().size() == 1 &&
+      teamsBlock.getTerminator() != nullptr) {
+    teamsOp.erase();
+  }
+  return true;
+}
+
+/// If multiple workdistribute are nested in a target regions, we will need to
+/// split the target region, but we want to preserve the data semantics of the
+/// original data region and avoid unnecessary data movement at each of the
+/// subkernels - we split the target region into a target_data{target}
+/// nest where only the outer one moves the data
+///
+/// Returns the new inner omp.target on success. Fails (with a diagnostic)
+/// when the target has no map operands or a map uses a capture kind other
+/// than ByCopy / ByRef. The original \p targetOp is replaced by the new
+/// omp.target_data.
+FailureOr<omp::TargetOp> splitTargetData(omp::TargetOp targetOp,
+                                         RewriterBase &rewriter) {
+  auto loc = targetOp->getLoc();
+  if (targetOp.getMapVars().empty()) {
+    emitError(loc, "Target region has no data maps\n");
+    return failure();
+  }
+  // Collect all the mapinfo ops
+  SmallVector<omp::MapInfoOp> mapInfos;
+  for (auto opr : targetOp.getMapVars()) {
+    auto mapInfo = cast<omp::MapInfoOp>(opr.getDefiningOp());
+    mapInfos.push_back(mapInfo);
+  }
+
+  rewriter.setInsertionPoint(targetOp);
+  SmallVector<Value> innerMapInfos;
+  SmallVector<Value> outerMapInfos;
+  // Create new mapinfo ops for the inner target region
+  for (auto mapInfo : mapInfos) {
+    auto originalMapType = static_cast<llvm::omp::OpenMPOffloadMappingFlags>(
+        mapInfo.getMapType());
+    auto originalCaptureType = mapInfo.getMapCaptureType();
+    llvm::omp::OpenMPOffloadMappingFlags newMapType;
+    mlir::omp::VariableCaptureKind newCaptureType;
+    // For bycopy, we keep the same map type and capture type
+    // For byref, we change the map type to none and keep the capture type;
+    // the original byref map is moved to the outer target_data.
+    if (originalCaptureType == mlir::omp::VariableCaptureKind::ByCopy) {
+      newMapType = originalMapType;
+      newCaptureType = originalCaptureType;
+    } else if (originalCaptureType == mlir::omp::VariableCaptureKind::ByRef) {
+      newMapType = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE;
+      newCaptureType = originalCaptureType;
+      outerMapInfos.push_back(mapInfo);
+    } else {
+      emitError(targetOp->getLoc(), "Unhandled case");
+      return failure();
+    }
+    auto innerMapInfo = cast<omp::MapInfoOp>(rewriter.clone(*mapInfo));
+    innerMapInfo.setMapTypeAttr(rewriter.getIntegerAttr(
+        rewriter.getIntegerType(64, false),
+        static_cast<
+            std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
+            newMapType)));
+    innerMapInfo.setMapCaptureType(newCaptureType);
+    innerMapInfos.push_back(innerMapInfo.getResult());
+  }
+
+  rewriter.setInsertionPoint(targetOp);
+  auto device = targetOp.getDevice();
+  auto ifExpr = targetOp.getIfExpr();
+  auto deviceAddrVars = targetOp.getHasDeviceAddrVars();
+  auto devicePtrVars = targetOp.getIsDevicePtrVars();
+  // Create the target data op
+  auto targetDataOp = rewriter.create<omp::TargetDataOp>(
+      loc, device, ifExpr, outerMapInfos, deviceAddrVars, devicePtrVars);
+  auto targetDataBlock = rewriter.createBlock(&targetDataOp.getRegion());
+  rewriter.create<mlir::omp::TerminatorOp>(loc);
+  rewriter.setInsertionPointToStart(targetDataBlock);
+  // Create the inner target op
+  auto newTargetOp = rewriter.create<omp::TargetOp>(
+      targetOp.getLoc(), targetOp.getAllocateVars(),
+      targetOp.getAllocatorVars(), targetOp.getBareAttr(),
+      targetOp.getDependKindsAttr(), targetOp.getDependVars(),
+      targetOp.getDevice(), targetOp.getHasDeviceAddrVars(),
+      targetOp.getHostEvalVars(), targetOp.getIfExpr(),
+      targetOp.getInReductionVars(), targetOp.getInReductionByrefAttr(),
+      targetOp.getInReductionSymsAttr(), targetOp.getIsDevicePtrVars(),
+      innerMapInfos, targetOp.getNowaitAttr(), targetOp.getPrivateVars(),
+      targetOp.getPrivateSymsAttr(), targetOp.getPrivateNeedsBarrierAttr(),
+      targetOp.getThreadLimit(), targetOp.getPrivateMapsAttr());
+  // Move the body of the original target into the new inner target.
+  rewriter.inlineRegionBefore(targetOp.getRegion(), newTargetOp.getRegion(),
+                              newTargetOp.getRegion().begin());
+  rewriter.replaceOp(targetOp, targetDataOp);
+  return newTargetOp;
+}
+
+/// getNestedOpToIsolate function is designed to identify a specific teams
+/// parallel op within the body of an omp::TargetOp that should be "isolated."
+/// This returns a tuple of op, if its first op in targetBlock, or if the op is
+/// last op in the target block.
+static std::optional> +getNestedOpToIsolate(omp::TargetOp targetOp) { + if (targetOp.getRegion().empty()) + return std::nullopt; + auto *targetBlock = &targetOp.getRegion().front(); + for (auto &op : *targetBlock) { + bool first = &op == &*targetBlock->begin(); + bool last = op.getNextNode() == targetBlock->getTerminator(); + if (first && last) + return std::nullopt; + + if (isa(&op)) + return {{&op, first, last}}; + } + return std::nullopt; +} + +/// Temporary structure to hold the two mapinfo ops +struct TempOmpVar { + omp::MapInfoOp from, to; +}; + +/// isPtr checks if the type is a pointer or reference type. +static bool isPtr(Type ty) { + return isa(ty) || isa(ty); +} + +/// getPtrTypeForOmp returns an LLVM pointer type for the given type. +static Type getPtrTypeForOmp(Type ty) { + if (isPtr(ty)) + return LLVM::LLVMPointerType::get(ty.getContext()); + else + return fir::ReferenceType::get(ty); +} + +/// allocateTempOmpVar allocates a temporary variable for OpenMP mapping +static TempOmpVar allocateTempOmpVar(Location loc, Type ty, + RewriterBase &rewriter) { + MLIRContext &ctx = *ty.getContext(); + Value alloc; + Type allocType; + auto llvmPtrTy = LLVM::LLVMPointerType::get(&ctx); + // Get the appropriate type for allocation + if (isPtr(ty)) { + Type intTy = rewriter.getI32Type(); + auto one = rewriter.create(loc, intTy, 1); + allocType = llvmPtrTy; + alloc = rewriter.create(loc, llvmPtrTy, allocType, one); + allocType = intTy; + } else { + allocType = ty; + alloc = rewriter.create(loc, allocType); + } + // Lambda to create mapinfo ops + auto getMapInfo = [&](uint64_t mappingFlags, const char *name) { + return rewriter.create( + loc, alloc.getType(), alloc, TypeAttr::get(allocType), + rewriter.getIntegerAttr(rewriter.getIntegerType(64, /*isSigned=*/false), + mappingFlags), + rewriter.getAttr( + omp::VariableCaptureKind::ByRef), + /*varPtrPtr=*/Value{}, + /*members=*/SmallVector{}, + /*member_index=*/mlir::ArrayAttr{}, + /*bounds=*/ValueRange(), + 
/*mapperId=*/mlir::FlatSymbolRefAttr(), + /*name=*/rewriter.getStringAttr(name), rewriter.getBoolAttr(false)); + }; + // Create mapinfo ops. + uint64_t mapFrom = + static_cast>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); + uint64_t mapTo = + static_cast>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); + auto mapInfoFrom = getMapInfo(mapFrom, "__flang_workdistribute_from"); + auto mapInfoTo = getMapInfo(mapTo, "__flang_workdistribute_to"); + return TempOmpVar{mapInfoFrom, mapInfoTo}; +} + +// usedOutsideSplit checks if a value is used outside the split operation. +static bool usedOutsideSplit(Value v, Operation *split) { + if (!split) + return false; + auto targetOp = cast(split->getParentOp()); + auto *targetBlock = &targetOp.getRegion().front(); + for (auto *user : v.getUsers()) { + while (user->getBlock() != targetBlock) { + user = user->getParentOp(); + } + if (!user->isBeforeInBlock(split)) + return true; + } + return false; +} + +/// isRecomputableAfterFission checks if an operation can be recomputed +static bool isRecomputableAfterFission(Operation *op, Operation *splitBefore) { + // If the op has side effects, it cannot be recomputed. + // We consider fir.declare as having no side effects. + return isa(op) || isMemoryEffectFree(op); +} + +/// collectNonRecomputableDeps collects dependencies that cannot be recomputed +static void collectNonRecomputableDeps(Value &v, omp::TargetOp targetOp, + SetVector &nonRecomputable, + SetVector &toCache, + SetVector &toRecompute) { + Operation *op = v.getDefiningOp(); + // If v is a block argument, it must be from the targetOp. + if (!op) { + assert(cast(v).getOwner()->getParentOp() == targetOp); + return; + } + // If the op is in the nonRecomputable set, add it to toCache and return. + if (nonRecomputable.contains(op)) { + toCache.insert(op); + return; + } + // Add the op to toRecompute. 
+ toRecompute.insert(op); + for (auto opr : op->getOperands()) + collectNonRecomputableDeps(opr, targetOp, nonRecomputable, toCache, + toRecompute); +} + +/// createBlockArgsAndMap creates block arguments and maps them +static void createBlockArgsAndMap(Location loc, RewriterBase &rewriter, + omp::TargetOp &targetOp, Block *targetBlock, + Block *newTargetBlock, + SmallVector &hostEvalVars, + SmallVector &mapOperands, + SmallVector &allocs, + IRMapping &irMapping) { + // FIRST: Map `host_eval_vars` to block arguments + unsigned originalHostEvalVarsSize = targetOp.getHostEvalVars().size(); + for (unsigned i = 0; i < hostEvalVars.size(); ++i) { + Value originalValue; + BlockArgument newArg; + if (i < originalHostEvalVarsSize) { + originalValue = targetBlock->getArgument(i); // Host_eval args come first + newArg = newTargetBlock->addArgument(originalValue.getType(), + originalValue.getLoc()); + } else { + originalValue = hostEvalVars[i]; + newArg = newTargetBlock->addArgument(originalValue.getType(), + originalValue.getLoc()); + } + irMapping.map(originalValue, newArg); + } + + // SECOND: Map `map_operands` to block arguments + unsigned originalMapVarsSize = targetOp.getMapVars().size(); + for (unsigned i = 0; i < mapOperands.size(); ++i) { + Value originalValue; + BlockArgument newArg; + // Map the new arguments from the original block. + if (i < originalMapVarsSize) { + originalValue = targetBlock->getArgument(originalHostEvalVarsSize + + i); // Offset by host_eval count + newArg = newTargetBlock->addArgument(originalValue.getType(), + originalValue.getLoc()); + } + // Map the new arguments from the `allocs`. 
+ else { + originalValue = allocs[i - originalMapVarsSize]; + newArg = newTargetBlock->addArgument( + getPtrTypeForOmp(originalValue.getType()), originalValue.getLoc()); + } + irMapping.map(originalValue, newArg); + } + + // THIRD: Map `private_vars` to block arguments (if any) + unsigned originalPrivateVarsSize = targetOp.getPrivateVars().size(); + for (unsigned i = 0; i < originalPrivateVarsSize; ++i) { + auto originalArg = targetBlock->getArgument(originalHostEvalVarsSize + + originalMapVarsSize + i); + auto newArg = newTargetBlock->addArgument(originalArg.getType(), + originalArg.getLoc()); + irMapping.map(originalArg, newArg); + } + return; +} + +/// reloadCacheAndRecompute reloads cached values and recomputes operations +static void reloadCacheAndRecompute( + Location loc, RewriterBase &rewriter, Operation *splitBefore, + omp::TargetOp &targetOp, Block *targetBlock, Block *newTargetBlock, + SmallVector &hostEvalVars, SmallVector &mapOperands, + SmallVector &allocs, SetVector &toRecompute, + IRMapping &irMapping) { + // Handle the load operations for the allocs. + rewriter.setInsertionPointToStart(newTargetBlock); + auto llvmPtrTy = LLVM::LLVMPointerType::get(targetOp.getContext()); + + unsigned originalMapVarsSize = targetOp.getMapVars().size(); + unsigned hostEvalVarsSize = hostEvalVars.size(); + // Create load operations for each allocated variable. + for (unsigned i = 0; i < allocs.size(); ++i) { + Value original = allocs[i]; + // Get the new block argument for this specific allocated value. + Value newArg = + newTargetBlock->getArgument(hostEvalVarsSize + originalMapVarsSize + i); + Value restored; + // If the original value is a pointer or reference, load and convert if + // necessary. 
+ if (isPtr(original.getType())) { + restored = rewriter.create(loc, llvmPtrTy, newArg); + if (!isa(original.getType())) + restored = + rewriter.create(loc, original.getType(), restored); + } else { + restored = rewriter.create(loc, newArg); + } + irMapping.map(original, restored); + } + // Clone the operations if they are in the toRecompute set. + for (auto it = targetBlock->begin(); it != splitBefore->getIterator(); it++) { + if (toRecompute.contains(&*it)) + rewriter.clone(*it, irMapping); + } +} + +/// Given a teamsOp, navigate down the nested structure to find the +/// innermost LoopNestOp. The expected nesting is: +/// teams -> parallel -> distribute -> wsloop -> loop_nest +static mlir::omp::LoopNestOp getLoopNestFromTeams(mlir::omp::TeamsOp teamsOp) { + if (teamsOp.getRegion().empty()) + return nullptr; + // Ensure the teams region has a single block. + if (teamsOp.getRegion().getBlocks().size() != 1) + return nullptr; + // Find parallel op inside teams + mlir::omp::ParallelOp parallelOp = nullptr; + // Look for the parallel op in the teams region + for (auto &op : teamsOp.getRegion().front()) { + if (auto parallel = dyn_cast(op)) { + parallelOp = parallel; + break; + } + } + if (!parallelOp) + return nullptr; + + // Find distribute op inside parallel + mlir::omp::DistributeOp distributeOp = nullptr; + for (auto &op : parallelOp.getRegion().front()) { + if (auto distribute = dyn_cast(op)) { + distributeOp = distribute; + break; + } + } + if (!distributeOp) + return nullptr; + + // Find wsloop op inside distribute + mlir::omp::WsloopOp wsloopOp = nullptr; + for (auto &op : distributeOp.getRegion().front()) { + if (auto wsloop = dyn_cast(op)) { + wsloopOp = wsloop; + break; + } + } + if (!wsloopOp) + return nullptr; + + // Find loop_nest op inside wsloop + for (auto &op : wsloopOp.getRegion().front()) { + if (auto loopNest = dyn_cast(op)) { + return loopNest; + } + } + + return nullptr; +} + +/// Generate LLVM constant operations for i32 and i64 types. 
+static mlir::LLVM::ConstantOp +genI32Constant(mlir::Location loc, mlir::RewriterBase &rewriter, int value) { + mlir::Type i32Ty = rewriter.getI32Type(); + mlir::IntegerAttr attr = rewriter.getI32IntegerAttr(value); + return rewriter.create(loc, i32Ty, attr); +} + +/// Given a box descriptor, extract the base address of the data it describes. +/// If the box descriptor is a reference, load it first. +/// The base address is returned as an i8* pointer. +static Value genDescriptorGetBaseAddress(fir::FirOpBuilder &builder, + Location loc, Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetBaseAddress"); + auto i8Type = builder.getI8Type(); + auto unknownArrayType = + fir::SequenceType::get({fir::SequenceType::getUnknownExtent()}, i8Type); + auto i8BoxType = fir::BoxType::get(unknownArrayType); + auto typedBox = fir::ConvertOp::create(builder, loc, i8BoxType, box); + auto rawAddr = fir::BoxAddrOp::create(builder, loc, typedBox); + return rawAddr; +} + +/// Given a box descriptor, extract the total number of elements in the array it +/// describes. If the box descriptor is a reference, load it first. +/// The total number of elements is returned as an i64 value. +static Value genDescriptorGetTotalElements(fir::FirOpBuilder &builder, + Location loc, Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetTotalElements"); + auto i64Type = builder.getI64Type(); + return fir::BoxTotalElementsOp::create(builder, loc, i64Type, box); +} + +/// Given a box descriptor, extract the size of each element in the array it +/// describes. If the box descriptor is a reference, load it first. +/// The element size is returned as an i64 value. 
+static Value genDescriptorGetEleSize(fir::FirOpBuilder &builder, Location loc, + Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetElementSize"); + auto i64Type = builder.getI64Type(); + return fir::BoxEleSizeOp::create(builder, loc, i64Type, box); +} + +/// Given a box descriptor, compute the total size in bytes of the data it +/// describes. This is done by multiplying the total number of elements by the +/// size of each element. If the box descriptor is a reference, load it first. +/// The total size in bytes is returned as an i64 value. +static Value genDescriptorGetDataSizeInBytes(fir::FirOpBuilder &builder, + Location loc, Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetElementSize"); + Value eleSize = genDescriptorGetEleSize(builder, loc, box); + Value totalElements = genDescriptorGetTotalElements(builder, loc, box); + return mlir::arith::MulIOp::create(builder, loc, totalElements, eleSize); +} + +/// Generate a call to the OpenMP runtime function `omp_get_mapped_ptr` to +/// retrieve the device pointer corresponding to a given host pointer and device +/// number. If no mapping exists, the original host pointer is returned. 
+/// Signature: +/// void *omp_get_mapped_ptr(void *host_ptr, int device_num); +static mlir::Value genOmpGetMappedPtrIfPresent(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value hostPtr, + mlir::Value deviceNum, + mlir::ModuleOp module) { + auto *context = builder.getContext(); + auto voidPtrType = fir::LLVMPointerType::get(context, builder.getI8Type()); + auto i32Type = builder.getI32Type(); + auto funcName = "omp_get_mapped_ptr"; + auto funcOp = module.lookupSymbol(funcName); + + if (!funcOp) { + auto funcType = + mlir::FunctionType::get(context, {voidPtrType, i32Type}, {voidPtrType}); + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(module.getBody()); + + funcOp = mlir::func::FuncOp::create(builder, loc, funcName, funcType); + funcOp.setPrivate(); + } + + llvm::SmallVector args; + args.push_back(fir::ConvertOp::create(builder, loc, voidPtrType, hostPtr)); + args.push_back(fir::ConvertOp::create(builder, loc, i32Type, deviceNum)); + auto callOp = fir::CallOp::create(builder, loc, funcOp, args); + auto mappedPtr = callOp.getResult(0); + auto isNull = builder.genIsNullAddr(loc, mappedPtr); + auto convertedHostPtr = + fir::ConvertOp::create(builder, loc, voidPtrType, hostPtr); + auto result = arith::SelectOp::create(builder, loc, isNull, convertedHostPtr, + mappedPtr); + return result; +} + +/// Generate a call to the OpenMP runtime function `omp_target_memcpy` to +/// perform memory copy between host and device or between devices. 
+/// Signature: +/// int omp_target_memcpy(void *dst, const void *src, size_t length, +/// size_t dst_offset, size_t src_offset, +/// int dst_device, int src_device); +static void genOmpTargetMemcpyCall(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value dst, + mlir::Value src, mlir::Value length, + mlir::Value dstOffset, mlir::Value srcOffset, + mlir::Value device, mlir::ModuleOp module) { + auto *context = builder.getContext(); + auto funcName = "omp_target_memcpy"; + auto voidPtrType = fir::LLVMPointerType::get(context, builder.getI8Type()); + auto sizeTType = builder.getI64Type(); // assuming size_t is 64-bit + auto i32Type = builder.getI32Type(); + auto funcOp = module.lookupSymbol(funcName); + + if (!funcOp) { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(module.getBody()); + llvm::SmallVector argTypes = { + voidPtrType, voidPtrType, sizeTType, sizeTType, + sizeTType, i32Type, i32Type}; + auto funcType = mlir::FunctionType::get(context, argTypes, {i32Type}); + funcOp = mlir::func::FuncOp::create(builder, loc, funcName, funcType); + funcOp.setPrivate(); + } + + llvm::SmallVector args{dst, src, length, dstOffset, + srcOffset, device, device}; + fir::CallOp::create(builder, loc, funcOp, args); + return; +} + +/// Generate code to replace a Fortran array assignment call with OpenMP +/// runtime calls to perform the equivalent operation on the device. +/// This involves extracting the source and destination pointers from the +/// Fortran array descriptors, retrieving their mapped device pointers (if any), +/// and invoking `omp_target_memcpy` to copy the data on the device. 
+static void genFortranAssignOmpReplacement(fir::FirOpBuilder &builder, + mlir::Location loc, + fir::CallOp callOp, + mlir::Value device, + mlir::ModuleOp module) { + assert(callOp.getNumResults() == 0 && + "Expected _FortranAAssign to have no results"); + assert(callOp.getNumOperands() >= 2 && + "Expected _FortranAAssign to have at least two operands"); + + // Extract the source and destination pointers from the call operands. + mlir::Value dest = callOp.getOperand(0); + mlir::Value src = callOp.getOperand(1); + + // Get the base addresses of the source and destination arrays. + mlir::Value srcBase = genDescriptorGetBaseAddress(builder, loc, src); + mlir::Value destBase = genDescriptorGetBaseAddress(builder, loc, dest); + + // Get the total size in bytes of the data to be copied. + mlir::Value srcDataSize = genDescriptorGetDataSizeInBytes(builder, loc, src); + + // Retrieve the mapped device pointers for source and destination. + // If no mapping exists, the original host pointer is used. + Value destPtr = + genOmpGetMappedPtrIfPresent(builder, loc, destBase, device, module); + Value srcPtr = + genOmpGetMappedPtrIfPresent(builder, loc, srcBase, device, module); + Value zero = builder.create(loc, builder.getI64Type(), + builder.getI64IntegerAttr(0)); + + // Generate the call to omp_target_memcpy to perform the data copy on the + // device. + genOmpTargetMemcpyCall(builder, loc, destPtr, srcPtr, srcDataSize, zero, zero, + device, module); +} + +/// Struct to hold the host eval vars corresponding to loop bounds and steps +struct HostEvalVars { + SmallVector lbs; + SmallVector ubs; + SmallVector steps; +}; + +/// moveToHost method clones all the ops from target region outside of it. +/// It hoists runtime function "_FortranAAssign" and replaces it with omp +/// version. 
Also hoists and replaces fir.allocmem with omp.target_allocmem and +/// fir.freemem with omp.target_freemem +static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter, + mlir::ModuleOp module, + struct HostEvalVars &hostEvalVars) { + OpBuilder::InsertionGuard guard(rewriter); + Block *targetBlock = &targetOp.getRegion().front(); + assert(targetBlock == &targetOp.getRegion().back()); + IRMapping mapping; + + // Get the parent target_data op + auto targetDataOp = cast(targetOp->getParentOp()); + if (!targetDataOp) { + emitError(targetOp->getLoc(), + "Expected target op to be inside target_data op"); + return failure(); + } + // create mapping for host_eval_vars + unsigned hostEvalVarCount = targetOp.getHostEvalVars().size(); + for (unsigned i = 0; i < targetOp.getHostEvalVars().size(); ++i) { + Value hostEvalVar = targetOp.getHostEvalVars()[i]; + BlockArgument arg = targetBlock->getArguments()[i]; + mapping.map(arg, hostEvalVar); + } + // create mapping for map_vars + for (unsigned i = 0; i < targetOp.getMapVars().size(); ++i) { + Value mapInfo = targetOp.getMapVars()[i]; + BlockArgument arg = targetBlock->getArguments()[hostEvalVarCount + i]; + Operation *op = mapInfo.getDefiningOp(); + assert(op); + auto mapInfoOp = cast(op); + // map the block argument to the host-side variable pointer + mapping.map(arg, mapInfoOp.getVarPtr()); + } + // create mapping for private_vars + unsigned mapSize = targetOp.getMapVars().size(); + for (unsigned i = 0; i < targetOp.getPrivateVars().size(); ++i) { + Value privateVar = targetOp.getPrivateVars()[i]; + // The mapping should link the device-side variable to the host-side one. + BlockArgument arg = + targetBlock->getArguments()[hostEvalVarCount + mapSize + i]; + // Map the device-side copy (`arg`) to the host-side value (`privateVar`). 
+ mapping.map(arg, privateVar); + } + + rewriter.setInsertionPoint(targetOp); + SmallVector opsToReplace; + Value device = targetOp.getDevice(); + + // If device is not specified, default to device 0. + if (!device) { + device = genI32Constant(targetOp.getLoc(), rewriter, 0); + } + // Clone all operations. + for (auto it = targetBlock->begin(), end = std::prev(targetBlock->end()); + it != end; ++it) { + auto *op = &*it; + Operation *clonedOp = rewriter.clone(*op, mapping); + // Map the results of the original op to the cloned op. + for (unsigned i = 0; i < op->getNumResults(); ++i) { + mapping.map(op->getResult(i), clonedOp->getResult(i)); + } + // fir.declare changes its type when hoisting it out of omp.target to + // omp.target_data Introduce a load, if original declareOp input is not of + // reference type, but cloned delcareOp input is reference type. + if (fir::DeclareOp clonedDeclareOp = dyn_cast(clonedOp)) { + auto originalDeclareOp = cast(op); + Type originalInType = originalDeclareOp.getMemref().getType(); + Type clonedInType = clonedDeclareOp.getMemref().getType(); + + fir::ReferenceType originalRefType = + dyn_cast(originalInType); + fir::ReferenceType clonedRefType = + dyn_cast(clonedInType); + if (!originalRefType && clonedRefType) { + Type clonedEleTy = clonedRefType.getElementType(); + if (clonedEleTy == originalDeclareOp.getType()) { + opsToReplace.push_back(clonedOp); + } + } + } + // Collect the ops to be replaced. + if (isa(clonedOp) || isa(clonedOp)) + opsToReplace.push_back(clonedOp); + // Check for runtime calls to be replaced. + if (isRuntimeCall(clonedOp)) { + fir::CallOp runtimeCall = cast(op); + auto funcName = runtimeCall.getCallee()->getRootReference().getValue(); + if (funcName == FortranAssignStr) { + opsToReplace.push_back(clonedOp); + } else { + emitError(runtimeCall->getLoc(), "Unhandled runtime call hoisting."); + return failure(); + } + } + } + // Replace fir.allocmem with omp.target_allocmem. 
+ for (Operation *op : opsToReplace) { + if (auto allocOp = dyn_cast(op)) { + rewriter.setInsertionPoint(allocOp); + auto ompAllocmemOp = rewriter.create( + allocOp.getLoc(), rewriter.getI64Type(), device, + allocOp.getInTypeAttr(), allocOp.getUniqNameAttr(), + allocOp.getBindcNameAttr(), allocOp.getTypeparams(), + allocOp.getShape()); + auto firConvertOp = rewriter.create( + allocOp.getLoc(), allocOp.getResult().getType(), + ompAllocmemOp.getResult()); + rewriter.replaceOp(allocOp, firConvertOp.getResult()); + } + // Replace fir.freemem with omp.target_freemem. + else if (auto freeOp = dyn_cast(op)) { + rewriter.setInsertionPoint(freeOp); + auto firConvertOp = rewriter.create( + freeOp.getLoc(), rewriter.getI64Type(), freeOp.getHeapref()); + rewriter.create(freeOp.getLoc(), device, + firConvertOp.getResult()); + rewriter.eraseOp(freeOp); + } + // fir.declare changes its type when hoisting it out of omp.target to + // omp.target_data Introduce a load, if original declareOp input is not of + // reference type, but cloned delcareOp input is reference type. + else if (fir::DeclareOp clonedDeclareOp = dyn_cast(op)) { + Type clonedInType = clonedDeclareOp.getMemref().getType(); + fir::ReferenceType clonedRefType = + dyn_cast(clonedInType); + Type clonedEleTy = clonedRefType.getElementType(); + rewriter.setInsertionPoint(op); + Value loadedValue = rewriter.create( + clonedDeclareOp.getLoc(), clonedEleTy, clonedDeclareOp.getMemref()); + clonedDeclareOp.getResult().replaceAllUsesWith(loadedValue); + } + // Replace runtime calls with omp versions. 
+ else if (isRuntimeCall(op)) { + fir::CallOp runtimeCall = cast(op); + auto funcName = runtimeCall.getCallee()->getRootReference().getValue(); + if (funcName == FortranAssignStr) { + rewriter.setInsertionPoint(op); + fir::FirOpBuilder builder{rewriter, op}; + + mlir::Location loc = runtimeCall.getLoc(); + genFortranAssignOmpReplacement(builder, loc, runtimeCall, device, + module); + rewriter.eraseOp(op); + } else { + emitError(runtimeCall->getLoc(), "Unhandled runtime call hoisting."); + return failure(); + } + } else { + emitError(op->getLoc(), "Unhandled op hoisting."); + return failure(); + } + } + + // Update the host_eval_vars to use the mapped values. + for (size_t i = 0; i < hostEvalVars.lbs.size(); ++i) { + hostEvalVars.lbs[i] = mapping.lookup(hostEvalVars.lbs[i]); + hostEvalVars.ubs[i] = mapping.lookup(hostEvalVars.ubs[i]); + hostEvalVars.steps[i] = mapping.lookup(hostEvalVars.steps[i]); + } + // Finally erase the original targetOp. + rewriter.eraseOp(targetOp); + return success(); +} + +/// Result of isolateOp method +struct SplitResult { + omp::TargetOp preTargetOp; + omp::TargetOp isolatedTargetOp; + omp::TargetOp postTargetOp; +}; + +/// computeAllocsCacheRecomputable method computes the allocs needed to cache +/// the values that are used outside the split point. It also computes the ops +/// that need to be cached and the ops that can be recomputed after the split. +static void computeAllocsCacheRecomputable( + omp::TargetOp targetOp, Operation *splitBeforeOp, RewriterBase &rewriter, + SmallVector &preMapOperands, SmallVector &postMapOperands, + SmallVector &allocs, SmallVector &requiredVals, + SetVector &nonRecomputable, SetVector &toCache, + SetVector &toRecompute) { + auto *targetBlock = &targetOp.getRegion().front(); + // Find all values that are used outside the split point. + for (auto it = targetBlock->begin(); it != splitBeforeOp->getIterator(); + it++) { + // Check if any of the results are used outside the split point. 
+ for (auto res : it->getResults()) { + if (usedOutsideSplit(res, splitBeforeOp)) { + requiredVals.push_back(res); + } + } + // If the op is not recomputable, add it to the nonRecomputable set. + if (!isRecomputableAfterFission(&*it, splitBeforeOp)) { + nonRecomputable.insert(&*it); + } + } + // For each required value, collect its dependencies. + for (auto requiredVal : requiredVals) + collectNonRecomputableDeps(requiredVal, targetOp, nonRecomputable, toCache, + toRecompute); + // For each op in toCache, create an alloc and update the pre and post map + // operands. + for (Operation *op : toCache) { + for (auto res : op->getResults()) { + auto alloc = + allocateTempOmpVar(targetOp.getLoc(), res.getType(), rewriter); + allocs.push_back(res); + preMapOperands.push_back(alloc.from); + postMapOperands.push_back(alloc.to); + } + } +} + +/// genPreTargetOp method generates the preTargetOp that contains all the ops +/// before the split point. It also creates the block arguments and maps the +/// values accordingly. It also creates the store operations for the allocs. 
+static omp::TargetOp +genPreTargetOp(omp::TargetOp targetOp, SmallVector &preMapOperands, + SmallVector &allocs, Operation *splitBeforeOp, + RewriterBase &rewriter, struct HostEvalVars &hostEvalVars, + bool isTargetDevice) { + auto loc = targetOp.getLoc(); + auto *targetBlock = &targetOp.getRegion().front(); + SmallVector preHostEvalVars{targetOp.getHostEvalVars()}; + // update the hostEvalVars of preTargetOp + omp::TargetOp preTargetOp = rewriter.create( + targetOp.getLoc(), targetOp.getAllocateVars(), + targetOp.getAllocatorVars(), targetOp.getBareAttr(), + targetOp.getDependKindsAttr(), targetOp.getDependVars(), + targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), preHostEvalVars, + targetOp.getIfExpr(), targetOp.getInReductionVars(), + targetOp.getInReductionByrefAttr(), targetOp.getInReductionSymsAttr(), + targetOp.getIsDevicePtrVars(), preMapOperands, targetOp.getNowaitAttr(), + targetOp.getPrivateVars(), targetOp.getPrivateSymsAttr(), + targetOp.getPrivateNeedsBarrierAttr(), targetOp.getThreadLimit(), + targetOp.getPrivateMapsAttr()); + auto *preTargetBlock = rewriter.createBlock( + &preTargetOp.getRegion(), preTargetOp.getRegion().begin(), {}, {}); + IRMapping preMapping; + // Create block arguments and map the values. + createBlockArgsAndMap(loc, rewriter, targetOp, targetBlock, preTargetBlock, + preHostEvalVars, preMapOperands, allocs, preMapping); + + // Handle the store operations for the allocs. + rewriter.setInsertionPointToStart(preTargetBlock); + auto llvmPtrTy = LLVM::LLVMPointerType::get(targetOp.getContext()); + + // Clone the original operations. + for (auto it = targetBlock->begin(); it != splitBeforeOp->getIterator(); + it++) { + rewriter.clone(*it, preMapping); + } + + unsigned originalHostEvalVarsSize = preHostEvalVars.size(); + unsigned originalMapVarsSize = targetOp.getMapVars().size(); + // Create Stores for allocs. 
+ for (unsigned i = 0; i < allocs.size(); ++i) { + Value originalResult = allocs[i]; + Value toStore = preMapping.lookup(originalResult); + // Get the new block argument for this specific allocated value. + Value newArg = preTargetBlock->getArgument(originalHostEvalVarsSize + + originalMapVarsSize + i); + // Create the store operation. + if (isPtr(originalResult.getType())) { + if (!isa(toStore.getType())) + toStore = rewriter.create(loc, llvmPtrTy, toStore); + rewriter.create(loc, toStore, newArg); + } else { + rewriter.create(loc, toStore, newArg); + } + } + rewriter.create(loc); + + // Update hostEvalVars with the mapped values for the loop bounds if we have + // a loopNestOp and we are not generating code for the target device. + omp::LoopNestOp loopNestOp = + getLoopNestFromTeams(cast(splitBeforeOp)); + if (loopNestOp && !isTargetDevice) { + for (size_t i = 0; i < loopNestOp.getLoopLowerBounds().size(); ++i) { + Value lb = loopNestOp.getLoopLowerBounds()[i]; + Value ub = loopNestOp.getLoopUpperBounds()[i]; + Value step = loopNestOp.getLoopSteps()[i]; + + hostEvalVars.lbs.push_back(preMapping.lookup(lb)); + hostEvalVars.ubs.push_back(preMapping.lookup(ub)); + hostEvalVars.steps.push_back(preMapping.lookup(step)); + } + } + + return preTargetOp; +} + +/// genIsolatedTargetOp method generates the isolatedTargetOp that contains the +/// ops between the split point. It also creates the block arguments and maps +/// the values accordingly. It also creates the load operations for the allocs +/// and recomputes the necessary ops. 
+static omp::TargetOp +genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector &postMapOperands, + Operation *splitBeforeOp, RewriterBase &rewriter, + SmallVector &allocs, + SetVector &toRecompute, + struct HostEvalVars &hostEvalVars, bool isTargetDevice) { + auto loc = targetOp.getLoc(); + auto *targetBlock = &targetOp.getRegion().front(); + SmallVector isolatedHostEvalVars{targetOp.getHostEvalVars()}; + // update the hostEvalVars of isolatedTargetOp + if (!hostEvalVars.lbs.empty() && !isTargetDevice) { + isolatedHostEvalVars.append(hostEvalVars.lbs.begin(), + hostEvalVars.lbs.end()); + isolatedHostEvalVars.append(hostEvalVars.ubs.begin(), + hostEvalVars.ubs.end()); + isolatedHostEvalVars.append(hostEvalVars.steps.begin(), + hostEvalVars.steps.end()); + } + // Create the isolated target op + omp::TargetOp isolatedTargetOp = rewriter.create( + targetOp.getLoc(), targetOp.getAllocateVars(), + targetOp.getAllocatorVars(), targetOp.getBareAttr(), + targetOp.getDependKindsAttr(), targetOp.getDependVars(), + targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), + isolatedHostEvalVars, targetOp.getIfExpr(), targetOp.getInReductionVars(), + targetOp.getInReductionByrefAttr(), targetOp.getInReductionSymsAttr(), + targetOp.getIsDevicePtrVars(), postMapOperands, targetOp.getNowaitAttr(), + targetOp.getPrivateVars(), targetOp.getPrivateSymsAttr(), + targetOp.getPrivateNeedsBarrierAttr(), targetOp.getThreadLimit(), + targetOp.getPrivateMapsAttr()); + auto *isolatedTargetBlock = + rewriter.createBlock(&isolatedTargetOp.getRegion(), + isolatedTargetOp.getRegion().begin(), {}, {}); + IRMapping isolatedMapping; + // Create block arguments and map the values. + createBlockArgsAndMap(loc, rewriter, targetOp, targetBlock, + isolatedTargetBlock, isolatedHostEvalVars, + postMapOperands, allocs, isolatedMapping); + // Handle the load operations for the allocs and recompute ops. 
+ reloadCacheAndRecompute(loc, rewriter, splitBeforeOp, targetOp, targetBlock, + isolatedTargetBlock, isolatedHostEvalVars, + postMapOperands, allocs, toRecompute, + isolatedMapping); + + // Clone the original operations. + rewriter.clone(*splitBeforeOp, isolatedMapping); + rewriter.create(loc); + + // update the loop bounds in the isolatedTargetOp if we have host_eval vars + // and we are not generating code for the target device. + if (!hostEvalVars.lbs.empty() && !isTargetDevice) { + omp::TeamsOp teamsOp; + for (auto &op : *isolatedTargetBlock) { + if (isa(&op)) + teamsOp = cast(&op); + } + assert(teamsOp && "No teamsOp found in isolated target region"); + // Get the loopNestOp inside the teamsOp + auto loopNestOp = getLoopNestFromTeams(teamsOp); + // Get the BlockArgs related to host_eval vars and update loop_nest bounds + // to them + unsigned originalHostEvalVarsSize = targetOp.getHostEvalVars().size(); + unsigned index = originalHostEvalVarsSize; + // Replace loop bounds with the block arguments passed down via host_eval + SmallVector lbs, ubs, steps; + + // Collect new lb/ub/step values from target block args + for (size_t i = 0; i < hostEvalVars.lbs.size(); ++i) + lbs.push_back(isolatedTargetBlock->getArgument(index++)); + + for (size_t i = 0; i < hostEvalVars.ubs.size(); ++i) + ubs.push_back(isolatedTargetBlock->getArgument(index++)); + + for (size_t i = 0; i < hostEvalVars.steps.size(); ++i) + steps.push_back(isolatedTargetBlock->getArgument(index++)); + + // Reset the loop bounds + loopNestOp.getLoopLowerBoundsMutable().assign(lbs); + loopNestOp.getLoopUpperBoundsMutable().assign(ubs); + loopNestOp.getLoopStepsMutable().assign(steps); + } + + return isolatedTargetOp; +} + +/// genPostTargetOp method generates the postTargetOp that contains all the ops +/// after the split point. It also creates the block arguments and maps the +/// values accordingly. It also creates the load operations for the allocs +/// and recomputes the necessary ops. 
+///
+/// \param targetOp        Original omp.target being split. Every clause
+///                        operand except host_eval and map is forwarded from
+///                        it unchanged.
+/// \param splitBeforeOp   Split point; only the ops that follow it in the
+///                        target block are cloned into the new op.
+/// \param postMapOperands Map operands to attach to the new op.
+/// \param rewriter        Rewriter used to create and clone ops; the new op
+///                        is created at its current insertion point.
+/// \param allocs          Values forwarded to createBlockArgsAndMap and
+///                        reloadCacheAndRecompute.
+/// \param toRecompute     Ops forwarded to reloadCacheAndRecompute.
+/// \returns the newly created omp.target.
+static omp::TargetOp genPostTargetOp(omp::TargetOp targetOp,
+                                     Operation *splitBeforeOp,
+                                     SmallVector<Value> &postMapOperands,
+                                     RewriterBase &rewriter,
+                                     SmallVector<Value> &allocs,
+                                     SetVector<Operation *> &toRecompute) {
+  auto loc = targetOp.getLoc();
+  auto *targetBlock = &targetOp.getRegion().front();
+  SmallVector<Value> postHostEvalVars{targetOp.getHostEvalVars()};
+  // Create the post target op
+  omp::TargetOp postTargetOp = rewriter.create<omp::TargetOp>(
+      loc, targetOp.getAllocateVars(), targetOp.getAllocatorVars(),
+      targetOp.getBareAttr(), targetOp.getDependKindsAttr(),
+      targetOp.getDependVars(), targetOp.getDevice(),
+      targetOp.getHasDeviceAddrVars(), postHostEvalVars, targetOp.getIfExpr(),
+      targetOp.getInReductionVars(), targetOp.getInReductionByrefAttr(),
+      targetOp.getInReductionSymsAttr(), targetOp.getIsDevicePtrVars(),
+      postMapOperands, targetOp.getNowaitAttr(), targetOp.getPrivateVars(),
+      targetOp.getPrivateSymsAttr(), targetOp.getPrivateNeedsBarrierAttr(),
+      targetOp.getThreadLimit(), targetOp.getPrivateMapsAttr());
+  // Create the block for postTargetOp
+  auto *postTargetBlock = rewriter.createBlock(
+      &postTargetOp.getRegion(), postTargetOp.getRegion().begin(), {}, {});
+  IRMapping postMapping;
+  // Create block arguments and map the values.
+  createBlockArgsAndMap(loc, rewriter, targetOp, targetBlock, postTargetBlock,
+                        postHostEvalVars, postMapOperands, allocs, postMapping);
+  // Handle the load operations for the allocs and recompute ops.
+  reloadCacheAndRecompute(loc, rewriter, splitBeforeOp, targetOp, targetBlock,
+                          postTargetBlock, postHostEvalVars, postMapOperands,
+                          allocs, toRecompute, postMapping);
+  // The split op is not cloned here, so nothing that follows it may consume
+  // one of its results.
+  assert((splitBeforeOp->getNumResults() == 0 ||
+          llvm::all_of(splitBeforeOp->getResults(),
+                       [](Value result) { return result.use_empty(); })) &&
+         "results of the isolated op must be unused");
+  // Clone the original operations after the split point.
+  for (auto it = std::next(splitBeforeOp->getIterator());
+       it != targetBlock->end(); ++it)
+    rewriter.clone(*it, postMapping);
+  return postTargetOp;
+}
+
+/// isolateOp method rewrites an omp.target_data { omp.target } into
+/// omp.target_data {
+///      // preTargetOp region contains ops before splitBeforeOp.
+///      omp.target {}
+///      // isolatedTargetOp region contains splitBeforeOp,
+///      omp.target {}
+///      // postTargetOp region contains ops after splitBeforeOp.
+///      omp.target {}
+/// }
+/// It also handles the mapping of variables and the caching/recomputing
+/// of values as needed.
+///
+/// \param splitBeforeOp  Op to isolate; its parent must be an omp.target.
+/// \param splitAfter     When true a postTargetOp is generated as well;
+///                       otherwise SplitResult::postTargetOp is left null.
+/// \param rewriter       Rewriter used for all IR creation and erasure.
+/// \param module         Module forwarded to moveToHost.
+/// \param isTargetDevice Forwarded to genPreTargetOp and genIsolatedTargetOp.
+/// \returns the three resulting ops, or failure if moveToHost fails. The
+///          original omp.target is erased on success.
+static FailureOr<SplitResult> isolateOp(Operation *splitBeforeOp,
+                                        bool splitAfter, RewriterBase &rewriter,
+                                        mlir::ModuleOp module,
+                                        bool isTargetDevice) {
+  auto targetOp = cast<omp::TargetOp>(splitBeforeOp->getParentOp());
+  assert(targetOp && "op to isolate must be nested directly in omp.target");
+  rewriter.setInsertionPoint(targetOp);
+
+  // Prepare the map operands for preTargetOp and postTargetOp
+  auto preMapOperands = SmallVector<Value>(targetOp.getMapVars());
+  auto postMapOperands = SmallVector<Value>(targetOp.getMapVars());
+
+  // Vectors to hold analysis results
+  SmallVector<Value> requiredVals;
+  SetVector<Operation *> toCache;
+  SetVector<Operation *> toRecompute;
+  SetVector<Operation *> nonRecomputable;
+  SmallVector<Value> allocs;
+  struct HostEvalVars hostEvalVars;
+
+  // Analyze the ops in target region to determine which ops need to be
+  // cached and which ops need to be recomputed
+  computeAllocsCacheRecomputable(
+      targetOp, splitBeforeOp, rewriter, preMapOperands, postMapOperands,
+      allocs, requiredVals, nonRecomputable, toCache, toRecompute);
+
+  // Each generated op is inserted right before the original targetOp, so the
+  // insertion point is reset before every generation step.
+  rewriter.setInsertionPoint(targetOp);
+
+  // Generate the preTargetOp that contains all the ops before splitBeforeOp.
+  auto preTargetOp =
+      genPreTargetOp(targetOp, preMapOperands, allocs, splitBeforeOp, rewriter,
+                     hostEvalVars, isTargetDevice);
+
+  // Move the ops of preTarget to host.
+  // NOTE(review): moveToHost is not visible here. If it erases preTargetOp,
+  // the handle stored in the returned SplitResult is stale -- confirm.
+  auto res = moveToHost(preTargetOp, rewriter, module, hostEvalVars);
+  if (failed(res))
+    return failure();
+  rewriter.setInsertionPoint(targetOp);
+
+  // Generate the isolatedTargetOp
+  omp::TargetOp isolatedTargetOp =
+      genIsolatedTargetOp(targetOp, postMapOperands, splitBeforeOp, rewriter,
+                          allocs, toRecompute, hostEvalVars, isTargetDevice);
+
+  omp::TargetOp postTargetOp = nullptr;
+  // Generate the postTargetOp that contains all the ops after splitBeforeOp.
+  if (splitAfter) {
+    rewriter.setInsertionPoint(targetOp);
+    postTargetOp = genPostTargetOp(targetOp, splitBeforeOp, postMapOperands,
+                                   rewriter, allocs, toRecompute);
+  }
+  // Finally erase the original targetOp.
+  rewriter.eraseOp(targetOp);
+  return SplitResult{preTargetOp, isolatedTargetOp, postTargetOp};
+}
+
+/// Recursively fission target ops until no more nested ops can be isolated.
+///
+/// getNestedOpToIsolate yields the op to isolate plus two flags; a split
+/// before (after) the op is requested when the first (second) flag is false.
+/// When there is nothing to isolate the whole target op is handed to
+/// moveToHost. After a before-and-after split the function recurses on the
+/// post-target op. A request that does not need a split before the op is
+/// reported as an error.
+static LogicalResult fissionTarget(omp::TargetOp targetOp,
+                                   RewriterBase &rewriter,
+                                   mlir::ModuleOp module, bool isTargetDevice) {
+  auto tuple = getNestedOpToIsolate(targetOp);
+  if (!tuple) {
+    LLVM_DEBUG(llvm::dbgs() << " No op to isolate\n");
+    struct HostEvalVars hostEvalVars;
+    return moveToHost(targetOp, rewriter, module, hostEvalVars);
+  }
+  Operation *toIsolate = std::get<0>(*tuple);
+  bool splitBefore = !std::get<1>(*tuple);
+  bool splitAfter = !std::get<2>(*tuple);
+  // Only splits that isolate the ops before toIsolate are supported.
+  if (!splitBefore) {
+    emitError(toIsolate->getLoc(), "Unhandled case in fissionTarget");
+    return failure();
+  }
+  auto res = isolateOp(toIsolate, splitAfter, rewriter, module, isTargetDevice);
+  if (failed(res))
+    return failure();
+  // Recursively isolate the remaining ops when there is a post-target op.
+  if (splitAfter)
+    return fissionTarget(res->postTargetOp, rewriter, module, isTargetDevice);
+  return success();
+}
+
+/// Pass to lower omp.workdistribute ops.
+class LowerWorkdistributePass + : public flangomp::impl::LowerWorkdistributeBase { +public: + void runOnOperation() override { + MLIRContext &context = getContext(); + auto moduleOp = getOperation(); + bool changed = false; + SetVector targetOpsToProcess; + auto verify = + moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) { + if (failed(verifyTargetTeamsWorkdistribute(workdistribute))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + if (verify.wasInterrupted()) + return signalPassFailure(); + + auto fission = + moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) { + auto res = fissionWorkdistribute(workdistribute); + if (failed(res)) + return WalkResult::interrupt(); + changed |= *res; + return WalkResult::advance(); + }); + if (fission.wasInterrupted()) + return signalPassFailure(); + + auto rtCallLower = + moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) { + auto res = workdistributeRuntimeCallLower(workdistribute, + targetOpsToProcess); + if (failed(res)) + return WalkResult::interrupt(); + changed |= *res; + return WalkResult::advance(); + }); + if (rtCallLower.wasInterrupted()) + return signalPassFailure(); + + moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) { + changed |= workdistributeDoLower(workdistribute, targetOpsToProcess); + }); + + moduleOp->walk([&](mlir::omp::TeamsOp teams) { + changed |= teamsWorkdistributeToSingleOp(teams, targetOpsToProcess); + }); + if (changed) { + bool isTargetDevice = + llvm::cast(*moduleOp) + .getIsTargetDevice(); + IRRewriter rewriter(&context); + for (auto targetOp : targetOpsToProcess) { + auto res = splitTargetData(targetOp, rewriter); + if (failed(res)) + return signalPassFailure(); + if (*res) { + if (failed(fissionTarget(*res, rewriter, moduleOp, isTargetDevice))) + return signalPassFailure(); + } + } + } + } +}; +} // namespace diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 
1253f7b2e9d3d..fe7cbba12eb9b 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -301,8 +301,10 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, addNestedPassToAllTopLevelOperations( pm, hlfir::createInlineHLFIRAssign); pm.addPass(hlfir::createConvertHLFIRtoFIR()); - if (enableOpenMP != EnableOpenMP::None) + if (enableOpenMP != EnableOpenMP::None) { pm.addPass(flangomp::createLowerWorkshare()); + pm.addPass(flangomp::createLowerWorkdistribute()); + } if (enableOpenMP == EnableOpenMP::Simd) pm.addPass(flangomp::createSimdOnlyPass()); } diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 195e5ad7f9dc8..59f6c73ae84ee 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -69,6 +69,7 @@ func.func @_QQmain() { // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: ConvertHLFIRtoFIR // PASSES-NEXT: LowerWorkshare +// PASSES-NEXT: LowerWorkdistribute // PASSES-NEXT: CSE // PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd diff --git a/flang/test/Lower/OpenMP/workdistribute-multiple.f90 b/flang/test/Lower/OpenMP/workdistribute-multiple.f90 new file mode 100644 index 0000000000000..cf1d9dd294cea --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-multiple.f90 @@ -0,0 +1,20 @@ +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - 2>&1 | FileCheck %s + +! CHECK: error: teams has multiple workdistribute ops. +! 
CHECK-LABEL: func @_QPteams_workdistribute_1 +subroutine teams_workdistribute_1() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams + + !$omp workdistribute + y = a * x + y + !$omp end workdistribute + + !$omp workdistribute + y = a * y + x + !$omp end workdistribute + !$omp end teams +end subroutine teams_workdistribute_1 diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-1d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-1d.f90 new file mode 100644 index 0000000000000..b2dbc0f15121e --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-1d.f90 @@ -0,0 +1,39 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + !$omp target teams workdistribute + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! 
CHECK: omp.loop_nest + + !$omp teams workdistribute + y = a * x + y + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-2d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-2d.f90 new file mode 100644 index 0000000000000..09e1211541edb --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-2d.f90 @@ -0,0 +1,45 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + !$omp target teams workdistribute + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + !$omp teams workdistribute + y = a * x + y + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-3d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-3d.f90 new file mode 100644 index 0000000000000..cf5d0234edb39 --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-3d.f90 @@ -0,0 +1,47 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! 
CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute(a, x, y, rows, cols, depth) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols, depth + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols, depth) :: x, y + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + ! CHECK: fir.do_loop + + !$omp target teams workdistribute + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute(a, x, y, rows, cols, depth) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols, depth + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols, depth) :: x, y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + ! CHECK: fir.do_loop + + !$omp teams workdistribute + y = a * x + y + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 new file mode 100644 index 0000000000000..516c4603bd5da --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 @@ -0,0 +1,53 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp target teams workdistribute + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! 
CHECK: omp.loop_nest + + y = a * x + y + + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + y = 2.0_real32 + + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams workdistribute + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + y = a * x + y + + ! CHECK: fir.call @_FortranAAssign + y = 2.0_real32 + + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-two-2d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-two-2d.f90 new file mode 100644 index 0000000000000..4aeb2e89140cc --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-two-2d.f90 @@ -0,0 +1,68 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + !$omp target teams workdistribute + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * x + y + + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * y + x + + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! 
CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + !$omp teams workdistribute + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * x + y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * y + x + + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 b/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 new file mode 100644 index 0000000000000..3062b3598b8ae --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 @@ -0,0 +1,29 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute_scalar_assign +subroutine target_teams_workdistribute_scalar_assign() + integer :: aa(10) + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + !$omp target teams workdistribute + aa = 20 + !$omp end target teams workdistribute + +end subroutine target_teams_workdistribute_scalar_assign + +! CHECK-LABEL: func @_QPteams_workdistribute_scalar_assign +subroutine teams_workdistribute_scalar_assign() + integer :: aa(10) + ! 
CHECK: fir.call @_FortranAAssign + !$omp teams workdistribute + aa = 20 + !$omp end teams workdistribute + +end subroutine teams_workdistribute_scalar_assign diff --git a/flang/test/Lower/OpenMP/workdistribute-target-teams-clauses.f90 b/flang/test/Lower/OpenMP/workdistribute-target-teams-clauses.f90 new file mode 100644 index 0000000000000..4a08e53bc316a --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-target-teams-clauses.f90 @@ -0,0 +1,32 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +! CHECK: omp.target_data map_entries({{.*}}) +! CHECK: omp.target thread_limit({{.*}}) host_eval({{.*}}) map_entries({{.*}}) +! CHECK: omp.teams num_teams({{.*}}) +! CHECK: omp.parallel +! CHECK: omp.distribute +! CHECK: omp.wsloop +! CHECK: omp.loop_nest + +subroutine target_teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + integer :: i + + a = 2.0_real32 + x = [(real(i, real32), i = 1, 10)] + y = [(real(i * 0.5, real32), i = 1, 10)] + + !$omp target teams workdistribute & + !$omp& num_teams(4) & + !$omp& thread_limit(8) & + !$omp& default(shared) & + !$omp& private(i) & + !$omp& map(to: x) & + !$omp& map(tofrom: y) + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-after.f90 b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-after.f90 new file mode 100644 index 0000000000000..f9c5a771f401d --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-after.f90 @@ -0,0 +1,22 @@ +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - 2>&1 | FileCheck %s + +! CHECK: error: teams has omp ops other than workdistribute. Lowering not implemented yet. +! 
CHECK-LABEL: func @_QPteams_workdistribute_1 +subroutine teams_workdistribute_1() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams + + !$omp workdistribute + y = a * x + y + !$omp end workdistribute + + !$omp distribute + do i = 1, 10 + x(i) = real(i, kind=real32) + end do + !$omp end distribute + !$omp end teams +end subroutine teams_workdistribute_1 diff --git a/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-before.f90 b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-before.f90 new file mode 100644 index 0000000000000..3ef7f90087944 --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-before.f90 @@ -0,0 +1,22 @@ +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - 2>&1 | FileCheck %s + +! CHECK: error: teams has omp ops other than workdistribute. Lowering not implemented yet. +! CHECK-LABEL: func @_QPteams_workdistribute_1 +subroutine teams_workdistribute_1() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams + + !$omp distribute + do i = 1, 10 + x(i) = real(i, kind=real32) + end do + !$omp end distribute + + !$omp workdistribute + y = a * x + y + !$omp end workdistribute + !$omp end teams +end subroutine teams_workdistribute_1 diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-doloop.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-doloop.mlir new file mode 100644 index 0000000000000..00d10d6264ec9 --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-doloop.mlir @@ -0,0 +1,33 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s + +// CHECK-LABEL: func.func @x({{.*}}) +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_1:.*]]) : index = (%[[ARG0:.*]]) to (%[[ARG1:.*]]) inclusive step (%[[ARG2:.*]]) { 
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index +// CHECK: fir.store %[[VAL_0]] to %[[ARG4:.*]] : !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } +func.func @x(%lb : index, %ub : index, %step : index, %b : i1, %addr : !fir.ref) { + omp.teams { + omp.workdistribute { + fir.do_loop %iv = %lb to %ub step %step unordered { + %zero = arith.constant 0 : index + fir.store %zero to %addr : !fir.ref + } + omp.terminator + } + omp.terminator + } + return +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir new file mode 100644 index 0000000000000..04e60ca8bbf37 --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir @@ -0,0 +1,117 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s +// Test lowering of workdistribute after fission on host device. 
+ +// CHECK-LABEL: func.func @x( +// CHECK: %[[VAL_0:.*]] = fir.alloca index {bindc_name = "lb"} +// CHECK: fir.store %[[ARG0:.*]] to %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_1:.*]] = fir.alloca index {bindc_name = "ub"} +// CHECK: fir.store %[[ARG1:.*]] to %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_2:.*]] = fir.alloca index {bindc_name = "step"} +// CHECK: fir.store %[[ARG2:.*]] to %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_3:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_6:.*]] = omp.map.info var_ptr(%[[ARG3:.*]] : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: omp.target_data map_entries(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +// CHECK: %[[VAL_11:.*]] = fir.alloca index +// CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_11]] : 
!fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_14:.*]] = fir.alloca index +// CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_17:.*]] = fir.alloca index +// CHECK: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_20:.*]] = fir.alloca !fir.heap +// CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(from) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_22:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(to) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_27:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_25]], %[[VAL_25]] : index +// CHECK: %[[VAL_29:.*]] = omp.target_allocmem %[[VAL_23]] : i32, index, %[[VAL_27]] {uniq_name = "dev_buf"} +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i64) -> !fir.heap +// CHECK: fir.store %[[VAL_24]] to %[[VAL_11]] : !fir.ref +// CHECK: fir.store %[[VAL_25]] to %[[VAL_14]] : !fir.ref +// CHECK: fir.store %[[VAL_26]] to %[[VAL_17]] : !fir.ref +// CHECK: fir.store %[[VAL_30]] to %[[VAL_20]] : 
!fir.ref> +// CHECK: omp.target host_eval(%[[VAL_24]] -> %[[VAL_31:.*]], %[[VAL_25]] -> %[[VAL_32:.*]], %[[VAL_26]] -> %[[VAL_33:.*]] : index, index, index) map_entries(%[[VAL_7]] -> %[[VAL_34:.*]], %[[VAL_8]] -> %[[VAL_35:.*]], %[[VAL_9]] -> %[[VAL_36:.*]], %[[VAL_10]] -> %[[VAL_37:.*]], %[[VAL_13]] -> %[[VAL_38:.*]], %[[VAL_16]] -> %[[VAL_39:.*]], %[[VAL_19]] -> %[[VAL_40:.*]], %[[VAL_22]] -> %[[VAL_41:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref>) { +// CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_38]] : !fir.ref +// CHECK: %[[VAL_43:.*]] = fir.load %[[VAL_39]] : !fir.ref +// CHECK: %[[VAL_44:.*]] = fir.load %[[VAL_40]] : !fir.ref +// CHECK: %[[VAL_45:.*]] = fir.load %[[VAL_41]] : !fir.ref> +// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_43]], %[[VAL_43]] : index +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_47:.*]]) : index = (%[[VAL_31]]) to (%[[VAL_32]]) inclusive step (%[[VAL_33]]) { +// CHECK: fir.store %[[VAL_46]] to %[[VAL_45]] : !fir.heap +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: %[[VAL_48:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_49:.*]] = fir.load %[[VAL_11]] : !fir.ref +// CHECK: %[[VAL_50:.*]] = fir.load %[[VAL_14]] : !fir.ref +// CHECK: %[[VAL_51:.*]] = fir.load %[[VAL_17]] : !fir.ref +// CHECK: %[[VAL_52:.*]] = fir.load %[[VAL_20]] : !fir.ref> +// CHECK: %[[VAL_53:.*]] = arith.addi %[[VAL_50]], %[[VAL_50]] : index +// CHECK: fir.store %[[VAL_49]] to %[[VAL_52]] : !fir.heap +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.heap) -> i64 +// CHECK: omp.target_freemem %[[VAL_48]], %[[VAL_54]] : i32, i64 +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } + +module attributes {llvm.target_triple = 
"x86_64-unknown-linux-gnu", omp.is_gpu = false, omp.is_target_device = false} { +func.func @x(%lb : index, %ub : index, %step : index, %addr : !fir.ref) { + %lb_ref = fir.alloca index {bindc_name = "lb"} + fir.store %lb to %lb_ref : !fir.ref + %ub_ref = fir.alloca index {bindc_name = "ub"} + fir.store %ub to %ub_ref : !fir.ref + %step_ref = fir.alloca index {bindc_name = "step"} + fir.store %step to %step_ref : !fir.ref + + %lb_map = omp.map.info var_ptr(%lb_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} + %ub_map = omp.map.info var_ptr(%ub_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} + %step_map = omp.map.info var_ptr(%step_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} + %addr_map = omp.map.info var_ptr(%addr : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} + + omp.target map_entries(%lb_map -> %ARG0, %ub_map -> %ARG1, %step_map -> %ARG2, %addr_map -> %ARG3 : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { + %lb_val = fir.load %ARG0 : !fir.ref + %ub_val = fir.load %ARG1 : !fir.ref + %step_val = fir.load %ARG2 : !fir.ref + %one = arith.constant 1 : index + + %20 = arith.addi %ub_val, %ub_val : index + omp.teams { + omp.workdistribute { + %dev_mem = fir.allocmem index, %one {uniq_name = "dev_buf"} + fir.do_loop %iv = %lb_val to %ub_val step %step_val unordered { + fir.store %20 to %dev_mem : !fir.heap + } + fir.store %lb_val to %dev_mem : !fir.heap + fir.freemem %dev_mem : !fir.heap + omp.terminator + } + omp.terminator + } + omp.terminator + } + return +} +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir new file mode 100644 index 0000000000000..062eb701b52ef --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir @@ -0,0 +1,118 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s +// Test 
lowering of workdistribute after fission on target device. + +// CHECK-LABEL: func.func @x( +// CHECK: %[[VAL_0:.*]] = fir.alloca index {bindc_name = "lb"} +// CHECK: fir.store %[[ARG0:.*]] to %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_1:.*]] = fir.alloca index {bindc_name = "ub"} +// CHECK: fir.store %[[ARG1:.*]] to %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_2:.*]] = fir.alloca index {bindc_name = "step"} +// CHECK: fir.store %[[ARG2:.*]] to %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_3:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_6:.*]] = omp.map.info var_ptr(%[[ARG3:.*]] : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: omp.target_data map_entries(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +// CHECK: %[[VAL_11:.*]] = fir.alloca index +// CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +//
CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_14:.*]] = fir.alloca index +// CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_17:.*]] = fir.alloca index +// CHECK: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_20:.*]] = fir.alloca !fir.heap +// CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(from) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_22:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(to) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_27:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_25]], %[[VAL_25]] : index +// CHECK: %[[VAL_29:.*]] = omp.target_allocmem %[[VAL_23]] : i32, index, %[[VAL_27]] {uniq_name = "dev_buf"} +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i64) -> !fir.heap +// CHECK: fir.store %[[VAL_24]] to %[[VAL_11]] : !fir.ref +// CHECK: fir.store %[[VAL_25]] to %[[VAL_14]] : !fir.ref +// CHECK: fir.store %[[VAL_26]] to %[[VAL_17]] : 
!fir.ref +// CHECK: fir.store %[[VAL_30]] to %[[VAL_20]] : !fir.ref> +// CHECK: omp.target map_entries(%[[VAL_7]] -> %[[VAL_31:.*]], %[[VAL_8]] -> %[[VAL_32:.*]], %[[VAL_9]] -> %[[VAL_33:.*]], %[[VAL_10]] -> %[[VAL_34:.*]], %[[VAL_13]] -> %[[VAL_35:.*]], %[[VAL_16]] -> %[[VAL_36:.*]], %[[VAL_19]] -> %[[VAL_37:.*]], %[[VAL_22]] -> %[[VAL_38:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref>) { +// CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_35]] : !fir.ref +// CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_36]] : !fir.ref +// CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_37]] : !fir.ref +// CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_38]] : !fir.ref> +// CHECK: %[[VAL_43:.*]] = arith.addi %[[VAL_40]], %[[VAL_40]] : index +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_44:.*]]) : index = (%[[VAL_39]]) to (%[[VAL_40]]) inclusive step (%[[VAL_41]]) { +// CHECK: fir.store %[[VAL_43]] to %[[VAL_42]] : !fir.heap +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: %[[VAL_45:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_46:.*]] = fir.load %[[VAL_11]] : !fir.ref +// CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_14]] : !fir.ref +// CHECK: %[[VAL_48:.*]] = fir.load %[[VAL_17]] : !fir.ref +// CHECK: %[[VAL_49:.*]] = fir.load %[[VAL_20]] : !fir.ref> +// CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_47]], %[[VAL_47]] : index +// CHECK: fir.store %[[VAL_46]] to %[[VAL_49]] : !fir.heap +// CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_49]] : (!fir.heap) -> i64 +// CHECK: omp.target_freemem %[[VAL_45]], %[[VAL_51]] : i32, i64 +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } + + +module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = 
true} { +func.func @x(%lb : index, %ub : index, %step : index, %addr : !fir.ref) { + %lb_ref = fir.alloca index {bindc_name = "lb"} + fir.store %lb to %lb_ref : !fir.ref + %ub_ref = fir.alloca index {bindc_name = "ub"} + fir.store %ub to %ub_ref : !fir.ref + %step_ref = fir.alloca index {bindc_name = "step"} + fir.store %step to %step_ref : !fir.ref + + %lb_map = omp.map.info var_ptr(%lb_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} + %ub_map = omp.map.info var_ptr(%ub_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} + %step_map = omp.map.info var_ptr(%step_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} + %addr_map = omp.map.info var_ptr(%addr : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} + + omp.target map_entries(%lb_map -> %ARG0, %ub_map -> %ARG1, %step_map -> %ARG2, %addr_map -> %ARG3 : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { + %lb_val = fir.load %ARG0 : !fir.ref + %ub_val = fir.load %ARG1 : !fir.ref + %step_val = fir.load %ARG2 : !fir.ref + %one = arith.constant 1 : index + + %20 = arith.addi %ub_val, %ub_val : index + omp.teams { + omp.workdistribute { + %dev_mem = fir.allocmem index, %one {uniq_name = "dev_buf"} + fir.do_loop %iv = %lb_val to %ub_val step %step_val unordered { + fir.store %20 to %dev_mem : !fir.heap + } + fir.store %lb_val to %dev_mem : !fir.heap + fir.freemem %dev_mem : !fir.heap + omp.terminator + } + omp.terminator + } + omp.terminator + } + return +} +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-fission.mlir new file mode 100644 index 0000000000000..c562b7009664d --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission.mlir @@ -0,0 +1,71 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s + +// CHECK-LABEL: func.func @test_fission_workdistribute( +// CHECK: %[[VAL_0:.*]] = arith.constant 0 
: index +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 9 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 5.000000e+00 : f32 +// CHECK: fir.store %[[VAL_3]] to %[[ARG2:.*]] : !fir.ref +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_4:.*]]) : index = (%[[VAL_0]]) to (%[[VAL_2]]) inclusive step (%[[VAL_1]]) { +// CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[VAL_4]] : (!fir.ref>, index) -> !fir.ref +// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref +// CHECK: %[[VAL_7:.*]] = fir.coordinate_of %[[ARG1:.*]], %[[VAL_4]] : (!fir.ref>, index) -> !fir.ref +// CHECK: fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: fir.call @regular_side_effect_func(%[[ARG2:.*]]) : (!fir.ref) -> () +// CHECK: fir.call @my_fir_parallel_runtime_func(%[[ARG3:.*]]) : (!fir.ref) -> () +// CHECK: fir.do_loop %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_1]] { +// CHECK: %[[VAL_9:.*]] = fir.coordinate_of %[[ARG0]], %[[VAL_8]] : (!fir.ref>, index) -> !fir.ref +// CHECK: fir.store %[[VAL_3]] to %[[VAL_9]] : !fir.ref +// CHECK: } +// CHECK: %[[VAL_10:.*]] = fir.load %[[ARG2:.*]] : !fir.ref +// CHECK: fir.store %[[VAL_10]] to %[[ARG3:.*]] : !fir.ref +// CHECK: return +// CHECK: } +module { +func.func @regular_side_effect_func(%arg0: !fir.ref) { + return +} +func.func @my_fir_parallel_runtime_func(%arg0: !fir.ref) attributes {fir.runtime} { + return +} +func.func @test_fission_workdistribute(%arr1: !fir.ref>, %arr2: !fir.ref>, %scalar_ref1: !fir.ref, %scalar_ref2: !fir.ref) { + %c0_idx = arith.constant 0 : index + %c1_idx = arith.constant 1 : index + %c9_idx = arith.constant 9 : index + %float_val = arith.constant 5.0 : f32 + omp.teams { + 
omp.workdistribute { + fir.store %float_val to %scalar_ref1 : !fir.ref + fir.do_loop %iv = %c0_idx to %c9_idx step %c1_idx unordered { + %elem_ptr_arr1 = fir.coordinate_of %arr1, %iv : (!fir.ref>, index) -> !fir.ref + %loaded_val_loop1 = fir.load %elem_ptr_arr1 : !fir.ref + %elem_ptr_arr2 = fir.coordinate_of %arr2, %iv : (!fir.ref>, index) -> !fir.ref + fir.store %loaded_val_loop1 to %elem_ptr_arr2 : !fir.ref + } + fir.call @regular_side_effect_func(%scalar_ref1) : (!fir.ref) -> () + fir.call @my_fir_parallel_runtime_func(%scalar_ref2) : (!fir.ref) -> () + fir.do_loop %jv = %c0_idx to %c9_idx step %c1_idx { + %elem_ptr_ordered_loop = fir.coordinate_of %arr1, %jv : (!fir.ref>, index) -> !fir.ref + fir.store %float_val to %elem_ptr_ordered_loop : !fir.ref + } + %loaded_for_hoist = fir.load %scalar_ref1 : !fir.ref + fir.store %loaded_for_hoist to %scalar_ref2 : !fir.ref + omp.terminator + } + omp.terminator + } + return +} +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-runtime-assign-scalar.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-runtime-assign-scalar.mlir new file mode 100644 index 0000000000000..03d5d71df0a82 --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-runtime-assign-scalar.mlir @@ -0,0 +1,108 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s + +// Test lowering of workdistribute for a scalar assignment within a target teams workdistribute region. +// The test checks that the scalar assignment is correctly lowered to wsloop and loop_nest operations. 
+ +// Example Fortran code: +// !$omp target teams workdistribute +// y = 3.0_real32 +// !$omp end target teams workdistribute + + +// CHECK-LABEL: func.func @x( +// CHECK: omp.target {{.*}} { +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_73:.*]]) : index = (%[[VAL_66:.*]]) to (%[[VAL_72:.*]]) inclusive step (%[[VAL_67:.*]]) { +// CHECK: %[[VAL_74:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_75:.*]]:3 = fir.box_dims %[[VAL_64:.*]], %[[VAL_74]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_76:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_77:.*]]:3 = fir.box_dims %[[VAL_64]], %[[VAL_76]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_78:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_79:.*]] = arith.remsi %[[VAL_73]], %[[VAL_77]]#1 : index +// CHECK: %[[VAL_80:.*]] = arith.addi %[[VAL_79]], %[[VAL_78]] : index +// CHECK: %[[VAL_81:.*]] = arith.divsi %[[VAL_73]], %[[VAL_77]]#1 : index +// CHECK: %[[VAL_82:.*]] = arith.remsi %[[VAL_81]], %[[VAL_75]]#1 : index +// CHECK: %[[VAL_83:.*]] = arith.addi %[[VAL_82]], %[[VAL_78]] : index +// CHECK: %[[VAL_84:.*]] = fir.array_coor %[[VAL_64]] %[[VAL_83]], %[[VAL_80]] : (!fir.box>, index, index) -> !fir.ref +// CHECK: fir.store %[[VAL_65:.*]] to %[[VAL_84]] : !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } +// CHECK: func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) attributes {fir.runtime} + +module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { +func.func @x(%arr : !fir.ref>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c78 = arith.constant 
78 : index + %cst = arith.constant 3.000000e+00 : f32 + %0 = fir.alloca i32 + %1 = fir.alloca i32 + %c10 = arith.constant 10 : index + %c20 = arith.constant 20 : index + %194 = arith.subi %c10, %c1 : index + %195 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%194 : index) extent(%c10 : index) stride(%c1 : index) start_idx(%c1 : index) + %196 = arith.subi %c20, %c1 : index + %197 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%196 : index) extent(%c20 : index) stride(%c1 : index) start_idx(%c1 : index) + %198 = omp.map.info var_ptr(%arr : !fir.ref>, f32) map_clauses(implicit, tofrom) capture(ByRef) bounds(%195, %197) -> !fir.ref> {name = "y"} + %199 = omp.map.info var_ptr(%1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} + %200 = omp.map.info var_ptr(%0 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} + omp.target map_entries(%198 -> %arg5, %199 -> %arg6, %200 -> %arg7 : !fir.ref>, !fir.ref, !fir.ref) { + %c0_0 = arith.constant 0 : index + %201 = fir.load %arg7 : !fir.ref + %202 = fir.load %arg6 : !fir.ref + %203 = fir.convert %202 : (i32) -> i64 + %204 = fir.convert %201 : (i32) -> i64 + %205 = fir.convert %204 : (i64) -> index + %206 = arith.cmpi sgt, %205, %c0_0 : index + %207 = fir.convert %203 : (i64) -> index + %208 = arith.cmpi sgt, %207, %c0_0 : index + %209 = arith.select %208, %207, %c0_0 : index + %210 = arith.select %206, %205, %c0_0 : index + %211 = fir.shape %210, %209 : (index, index) -> !fir.shape<2> + %212 = fir.declare %arg5(%211) {uniq_name = "_QFFaxpy_array_workdistributeEy"} : (!fir.ref>, !fir.shape<2>) -> !fir.ref> + %213 = fir.embox %212(%211) : (!fir.ref>, !fir.shape<2>) -> !fir.box> + omp.teams { + %214 = fir.alloca !fir.box> {pinned} + omp.workdistribute { + %215 = fir.alloca f32 + %216 = fir.embox %215 : (!fir.ref) -> !fir.box + %217 = fir.shape %210, %209 : (index, index) -> !fir.shape<2> + %218 = 
fir.embox %212(%217) : (!fir.ref>, !fir.shape<2>) -> !fir.box> + fir.store %218 to %214 : !fir.ref>> + %219 = fir.address_of(@_QQclXf9c642d28e5bba1f07fa9a090b72f4fc) : !fir.ref> + %c39_i32 = arith.constant 39 : i32 + %220 = fir.convert %214 : (!fir.ref>>) -> !fir.ref> + %221 = fir.convert %216 : (!fir.box) -> !fir.box + %222 = fir.convert %219 : (!fir.ref>) -> !fir.ref + fir.call @_FortranAAssign(%220, %221, %222, %c39_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () + omp.terminator + } + omp.terminator + } + omp.terminator + } + return +} + +func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) attributes {fir.runtime} + +fir.global linkonce @_QQclXf9c642d28e5bba1f07fa9a090b72f4fc constant : !fir.char<1,78> { + %0 = fir.string_lit "File: /work/github/skc7/llvm-project/build_fomp_reldebinfo/saxpy_tests/\00"(78) : !fir.char<1,78> + fir.has_value %0 : !fir.char<1,78> +} +} From 3f1ca97c469d6b1f87e47a754d95407e6ab18b10 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Mon, 13 Oct 2025 19:03:31 -0500 Subject: [PATCH 2/7] move gpurun to offload/utils will work on script changes for aomp, and npsdb after it lands --- offload/utils/gpurun | 672 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 672 insertions(+) create mode 100755 offload/utils/gpurun diff --git a/offload/utils/gpurun b/offload/utils/gpurun new file mode 100755 index 0000000000000..a22c4265bcac7 --- /dev/null +++ b/offload/utils/gpurun @@ -0,0 +1,672 @@ +#!/bin/bash +# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# of the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# gpurun: Process launch utility for GPU applications. This is a wrapper +# to execute application binaries including OpenMPI GPU applications. +# See help message below (gpurun -h) for more information. +# +# Usage Examples: +# gpurun true +# mpirun -np 4 gpurun env | grep ROCR_VISIBLE_DEVICES +# + +# If set to 1, just invoke the rest of the command line without doing anything +# else. +GPURUN_BYPASS=${GPURUN_BYPASS:-0} + +function execOnError() { + exec "$@" +} + +if [ "$GPURUN_BYPASS" = "1" ]; then + execOnError "$@" +fi + +# PROGVERSION string is updated by cmake when component is installed +PROGVERSION=X.Y-Z +function version(){ + echo $0 version $PROGVERSION + exit 0 +} +function usage(){ +/bin/cat 2>&1 <<"EOF" + + gpurun: Application process launch utility for GPUs. + This utility ensures the process will enable either a single + GPU or the number specified with -md (multi-device) option. 
+ It launches the application binary with either the 'taskset' + or 'numactl' utility so the process only runs on CPU cores + in the same NUMA domain as the selected GPUs. + This utility sets environment variable ROCR_VISIBLE_DEVICES + to selected GPUs ONLY if it was not already set by the + callers environment AND the number of GPUs is not 1. + This utility also sets environment variable HSA_CU_MASK + to control which CUs are available to the process. + HSA_CU_MASK is set only when more than one OpenMPI process + (rank) will utilize the same GPU and it is not preset. + Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the + number of CUs available to the process after masking. + + Usage: + gpurun [ ] + mpirun -np gpurun [ ] + + Options: + -h Print this help message and exit + -md Set number of desired devices for multi-device mode, default=1 + -s suppress output, often useful in benchmarking + -q suppress output, quiet, alias of -s, same as GPURUN_VERBOSE=0 + -v Verbose output, same as GPURUN_VERBOSE=1 + -vv Verbose output, same as GPURUN_VERBOSE=2 + -m use numactl membind to CPUs in same NUMA domain. Note: Allocation + fails when not enough memory available on these nodes. + -l use numactl localalloc to CPUs in same NUMA domain. Note: If + memory cannot be allocated, alloc falls back to other nodes. + --version Print version of gpurun and exit + + Optional Input environment variables: + GPURUN_VERBOSE + 0: default for silent operation, no trace printed to stderr + 1: -v prints trace record including process launch cmd to stderr + 2: -vv prints trace and other summary diagnostics + ROCMINFO_BINARY Set location of rocminfo binary + AOMP: location of AOMP or ROCM + GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0. + This only works for single device mode. 
+ GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards + GPURUN_MASK_POLICY : CU mask policy: mutex (default), nomask or preset + ROCR_VISIBLE_DEVICES: See description above + OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi + OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi + This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID + and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK + + Generated (output) Environment Variables: + OMPX_TARGET_TEAM_PROCS - Number of CUs available to process + ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset + HSA_CU_MASK - The CU mask for the device. + LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument + GPU_MAX_HW_QUEUES + LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES + + Limitations: + - Currently, gpurun creates masks that are mutually exclusive of each other. + That is, the MPI processes will not share CUs. If number of ranks is not + perfectly divisible by number of CUs or number of GPUs, some resources + would be unused. + Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization. + - Works with AOMP 19.0-0 or ROCM 6.1 or greater + - cu masking is not available when multiple devices per process are enabled + with -md option (multi-device) mode. + + Notes: + With MPI, this utility distributes GPUs and their CUs across + multiple ranks of an MPI job into mutually exclusive sets of CUs. + It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE + and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and + the mutually exclusive CU mask. + + An rplace (rank place) is a subset of CUs for a rank. + gpurun calculates the number of rplaces needed to contain all + the specified number of ranks for this node. If number of ranks is not + divisible by number of GPUs, then there will be more rplaces than ranks. + The number of CUs in an rplace is calculated by dividing the number of + CUs per GPU by the number of rplaces per GPU. 
This is also the number of + bits set in the CU mask. This is also the number of physical locations + available for an OpenMP team to execute. This utility exports that number + to the environment variable OMPX_TARGET_TEAM_PROCS. This value + could be used by the application or runtime to adjust the number + of desired teams in a target region. If no masking occurs, the entire + GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to + the total number of CUs on the GPU. + + Copyright (c) 2024 ADVANCED MICRO DEVICES, INC. + +EOF + exit 0 +} + +_end_gpurun_opts=0 +_devices_per_mdset=1 +_uses_multi_device=0 +while [ "$_end_gpurun_opts" == "0" ] ; do + case "$1" in + -s) GPURUN_VERBOSE=0;; + -q) GPURUN_VERBOSE=0;; + --quiet) GPURUN_VERBOSE=0;; + -h) usage ;; + -help) usage ;; + --help) usage ;; + -version) version ;; + --version) version ;; + -v) GPURUN_VERBOSE=1;; + -vv) GPURUN_VERBOSE=2;; + -m) _use_numactl_membind=1;; + -md) shift; _devices_per_mdset=$1; _uses_multi_device=1;; + -l) _use_numactl_localalloc=1;; + -nomask) GPURUN_MASK_POLICY="nomask";; + *) _end_gpurun_opts=1; break;; + esac + if [ "$_end_gpurun_opts" == "0" ] ; then + shift + fi +done + +# Default: quiet operation +GPURUN_VERBOSE=${GPURUN_VERBOSE:-0} +# Default: create mutually exclusive sets of CUs when GPU is oversubscribed +GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex} +# switch mask policy to preset if HSA_CU_MASK was preset +[[ ! 
-z "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset +# switch mask policy to nomask for multi-device +[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask +# Offset selected device to avoid some heavily used GPUs +GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0} + +# Get environment variables set by OpenMPI +_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE +_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK +# If not OpenMPI, check for Platform MPI, MVAPICH +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_LOCALNRANKS + _local_rank_num=$MPI_LOCALRANKID +fi +# Also try MPI_COMM_WORLD env vars +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE + _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK +fi +# Check if SLURM was used +if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then + _num_local_ranks=$SLURM_CPUS_ON_NODE + _local_rank_num=$SLURM_LOCALID +fi +# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=1 + _local_rank_num=0 +fi + +# Find location of the rocminfo binary +AOMP=${AOMP:-_AOMP_INSTALL_DIR_} +if [ ! -d $AOMP ] ; then + AOMP="_AOMP_INSTALL_DIR_" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/lib/llvm" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/llvm" +fi +if [ ! -d $AOMP ] ; then + realpath=`realpath $0` + thisdir=`dirname $realpath` + AOMP=$thisdir/.. +fi +if [ ! -d $AOMP ] ; then + >&2 echo "ERROR: AOMP not found at $AOMP" + >&2 echo " Please install AOMP or correctly set env-var AOMP" + execOnError "$@" +fi +ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo} +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo +if [ ! 
-f $ROCMINFO_BINARY ] ; then + >&2 echo "ERROR: Could not find binary for rocminfo," + >&2 echo " Please correct installation of ROCM or AOMP compiler" + execOnError "$@" +fi + +# Use rocminfo to find number number of CUs and gfxids for each GPU. +_tfile="/tmp/rinfo_out$$" +$ROCMINFO_BINARY 2>/dev/null | grep -E " Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile +_tfile_lines=`wc -l $_tfile | cut -d" " -f1` +if [ $_tfile_lines == 0 ] ; then + >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices" + rm $_tfile + execOnError "$@" +fi +# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device +_ri_all_gfxids="" +_ri_gfxids=() +_ri_cucount=() +_ri_bdfids=() +_ri_dev_idx=() +_ri_num_devices=0 +_last_cu_count=0 +_ri_uuid=() +_last_device_type_was_gpu=0 +_device_type_preset=0 +_ri_num_all_devices=0 +[ ! -z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1 +while read _linepair ; do + _fieldvalue=`echo $_linepair | cut -d":" -f2` + _fieldtype=`echo $_linepair | cut -d":" -f1` + if [ $_fieldvalue == "CPU" ] ; then + _last_device_type_was_gpu=0 + elif [ $_fieldvalue == "GPU" ] ; then + _last_device_type_was_gpu=1 + elif [ "$_fieldtype" == "Uuid" ] ; then + _this_uuid=$_fieldvalue + elif [ "$_fieldtype" == "BDFID" ] ; then + if [[ $_last_device_type_was_gpu == 1 ]] ; then + # _domain="$(echo "$_fieldvalue / (2^32)" | bc)" + _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)" + _devfn="$(echo "($_fieldvalue % (2^8))" | bc)" + _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")" + fi + elif [ "$_fieldtype" == "Name" ] ; then + # The device name field is last in rocminfo output, so we can create new _ri_ array entry + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _this_gfxid=`echo $_fieldvalue | cut -d'-' -f5` + ! 
[[ ${_ri_all_gfxids} == *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid" + _is_type_visible=1 + if [ $_device_type_preset == 1 ] ; then + _is_type_visible=0 + if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then + _is_type_visible=1 + fi + fi + if [ $_is_type_visible == 1 ] ; then + _ri_gfxids+=( $_this_gfxid ) + _ri_cucount+=( $_last_cu_count ) + _ri_bdfids+=( $_bdfidstr ) + _ri_dev_idx+=( $_ri_num_all_devices ) + _ri_uuid+=( $_this_uuid ) + _ri_num_devices=$(( $_ri_num_devices + 1 )) + fi + _ri_num_all_devices=$(( $_ri_num_all_devices + 1 )) + fi + else + # else the _fieldvalue was the number of CUs or GCPUs + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _last_cu_count=$_fieldvalue + fi + fi +done < $_tfile +rm $_tfile + +if [ $_ri_num_devices == 0 ] ; then + if [ $_local_rank_num == 0 ] ; then + if [ $_device_type_preset == 1 ] ; then + >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES." + >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" + else + >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY" + fi + if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then + >&2 echo " ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES" + >&2 echo " Consider unset ROCR_VISIBLE_DEVICES and let gpurun set it correctly." + fi + execOnError "$@" + else + execOnError "$@" + fi +fi + +# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per +# device arrays indexed by device num. The arrays are _ss_cpulist _ss_bdfids, +# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information +# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above +# by scanning output from rocminfo. 
+_sysdevdir="/sys/bus/pci/devices" +_ss_num_devices=0 +_ss_cpulist=() +_ss_bdfid=() +_ss_numanode=() +_ss_uuid=() +_ss_gfxid=() +_ss_cucount=() +for _devid in `ls $_sysdevdir` ; do + if [ -f $_sysdevdir/$_devid/device ] ; then + _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'` + if [ ! -z $_driver_name ] ; then + if [ $_driver_name == "DRIVER=amdgpu" ] ; then + _numa_node=`cat $_sysdevdir/$_devid/numa_node` + [ "$_numa_node" == "-1" ] && _numa_node=0 + _this_uuid=0 ; _has_unique_id_file=0 # reset per device so a flag left over from the previous device is never reused + if [ -f $_sysdevdir/$_devid/unique_id ] ; then + _this_uuid=`cat $_sysdevdir/$_devid/unique_id` + if [ -z $_this_uuid ] ; then + _this_uuid=0 + _has_unique_id_file=0 + else + _this_uuid="GPU-$_this_uuid" + _has_unique_id_file=1 + fi + fi + _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist` + _match_uuid_count=0 + for _ri_i in ${!_ri_bdfids[@]} ; do + _ss_value=$_this_uuid + _ri_value=${_ri_uuid[$_ri_i]} + if [ $_ss_value == $_ri_value ] ; then + _match_uuid_count=$(( $_match_uuid_count + 1 )) + fi + done + # Search _ri_ arrays for matching uuids or matching bdfids. + for _ri_i in ${!_ri_bdfids[@]} ; do + if [ "$_has_unique_id_file" == "1" ] ; then + _ss_value=$_this_uuid + _ri_value=${_ri_uuid[$_ri_i]} + elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then + # Under Hyper-V, we may see a zero BDFID. Fall back to UUID. 
+ _ss_value=$_devid + _ri_value=$_devid + else + _ss_value=$_devid + _ri_value="0000:${_ri_bdfids[$_ri_i]}.0" + fi + if [ $_ss_value == $_ri_value ] ; then + if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then + # Some GPUs do not have unique_id or TPX mode creates multiple + # identical uuids, so use device index for RVD + _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} ) + else + _ss_uuid+=( $_this_uuid ) + fi + _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} ) + _ss_cucount+=( ${_ri_cucount[$_ri_i]} ) + _ss_bdfid+=( $_devid ) + _ss_numanode+=( $_numa_node ) + _ss_cpulist+=( $_this_cpulist ) + _ss_num_devices=$(( $_ss_num_devices + 1 )) + fi + done + fi + fi + fi +done + +if [[ $_ss_num_devices -lt 1 ]] ; then + if [ $_device_type_preset == 1 ] ; then + >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES." + >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" + else + >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir." + fi + execOnError "$@" +fi + +# check for taskset or numactl cmd +if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then + _launch_process_cmd_binary=`which numactl` + if [ $? != 0 ] ; then + >&2 echo "ERROR: The -m (membind) or -l (localalloc) require numactl to be installed." + execOnError "$@" + fi +else + _launch_process_cmd_binary=`which taskset` + if [ $? != 0 ] ; then + >&2 echo "ERROR: $0 requires the taskset command to be installed." + execOnError "$@" + fi +fi +if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then + >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored." + _use_numactl_membind=0 +fi + +_utilized_devices=$_ri_num_devices +[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks + +# Calculate number of GPUs to use to evenly spread ranks across GPUs. +# An rplace is a set of CUs that will be used for a rank. 
+# The number of rplaces must be at least the number of ranks. +_uncovered_ranks=$(( $_num_local_ranks % $_utilized_devices )) +_number_of_rplaces_per_GPU=$(( $_num_local_ranks / $_utilized_devices )) +if [ $_uncovered_ranks != 0 ] ; then + # If _num_local_ranks not divisible by number of GPUs, + # then add an extra rplace per GPU to make room for remainder. + _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 )) +fi +if [ $GPURUN_MASK_POLICY == "mutex" ] ; then + # For mutex policy, adjacent ranks are assigned to the same device. + _rplace_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU )) + # Some users want to avoid dev 0 etc, by setting GPURUN_DEVICE_BIAS + _device_num=$(( ( $_rplace_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) +else + # for mask policies nomask or preset, adjacent ranks are assigned to + # different GPUs and oversubscribed ranks are assigned round robin + _device_num=$(( ( $_local_rank_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) +fi + +_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) +if [ $_num_local_ranks -gt $_node_cus ] ; then + >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks " + execOnError "$@" +fi + +if [ $_uses_multi_device == 1 ]; then + # Enforce some rules on the use of -md option + # Note -md forces GPURUN_MASK_POLICY=nomask + if [[ ! 
-z $ROCR_VISIBLE_DEVICES ]] ; then + >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode" + execOnError "$@" + fi + if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then + >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)" + execOnError "$@" + fi + _md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset )) + if [ $_md_total_devices -gt $_ri_num_devices ] && [ $_local_rank_num == 0 ] ; then + printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices > than available devices ($_ri_num_devices)\n Some multi-device sets will overlap.\n" >&2 + fi + _md_device_set_start=$(( ( $_local_rank_num * $_devices_per_mdset ) % $_ri_num_devices)) + _md_device_set_end=$(( $_md_device_set_start + $_devices_per_mdset - 1 )) + + # merge entries for this mdset from per device arrays + _md_bdfs="" + _md_cpus="" + _md_nns="" + _md_uuids="" + _md_dev_idxs="" + _sep="" + for i in `seq $_md_device_set_start $_md_device_set_end` ; do + _dev_index=$i + # handle index wrap around number of devices + [ $i -ge $_ri_num_devices ] && _dev_index=$(( $i % $_ri_num_devices )) + _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]} + _new_nn=${_ss_numanode[$_dev_index]} + SAVEIFS=$IFS + IFS="," + _found=0 + for _existing_nn in $_md_nns ; do + [ $_existing_nn == $_new_nn ] && _found=1 + done + IFS=$SAVEIFS + if [ $_found == 0 ] ; then + # only add new numa node and cpulist, if not already in the md set + _md_nns+=$_sep$_new_nn + _md_cpus+=$_sep${_ss_cpulist[$_dev_index]} + fi + _md_uuids+=$_sep${_ss_uuid[$_dev_index]} + _md_dev_idxs+=$_sep$_dev_index + _sep="," + done + _device_num=$_md_device_set_start +fi + +_available_CUs_per_device=${_ss_cucount[$_device_num]} +_gfxid=${_ss_gfxid[$_device_num]} + +_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) +if [ $_num_local_ranks -gt $_node_cus ] ; then + >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks 
" + execOnError "$@" +fi + +_utilized_CUs_per_device=$_available_CUs_per_device +_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) +# Lower utilized CUs till divisible by number of rplaces per GPU +while [ $_rem2 != 0 ] ; do + _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - 1 )) + _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) +done +_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU )) + +# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0 +if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then + if [ $_uses_multi_device == 0 ] ; then + _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device )) + _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_ri_num_devices )) + _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks )) + _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) + _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace )) + _utilization=$(( ( $_used_cus * 100 ) / $_node_cus )) + if ! 
[ $_ri_num_devices -gt $_num_local_ranks ] ; then + if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then + _extra_diags=true + fi + fi + >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" + >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" + >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" + [ $_extra_diags ] && \ + >&2 echo "-- USED GPUS: $(( $_ri_num_devices - $_wasted_GPUs ))" + [ $_extra_diags ] && \ + >&2 echo "-- UNUSED GPUS: $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) " + [ $_extra_diags ] && echo + >&2 echo "- RPLACEs PER NODE: $_total_GPU_rplaces" + >&2 echo "- RPLACEs PER GPU: $_number_of_rplaces_per_GPU" + [ $_extra_diags ] && \ + >&2 echo "-- USED RPLACEs: $_num_local_ranks (RANKS)" + [ $_extra_diags ] && \ + >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces" ; \ + >&2 echo "- gfxids ${_ss_gfxid[@]}" + >&2 echo "- CUs PER GPU: ${_ss_cucount[@]}" + [ $_extra_diags ] && \ + >&2 echo "-- USED on CUs RANK0: $_utilized_CUs_per_device" + [ $_extra_diags ] && \ + >&2 echo "-- UNUSED CUs RANK0 : $_wasted_CUs_on_each_GPU" + >&2 echo "- CUs per RPLACE RANK0:$_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)" + >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU" + if [[ ! -z "$ROCR_VISIBLE_DEVICES" ]] ; then + >&2 echo "- Preset ROCR_VISIBLE_DEVICES: $ROCR_VISIBLE_DEVICES" + fi + if [[ ! -z "$HSA_CU_MASK" ]] ; then + # node utilization could be incorrect with preset cumask. 
+ >&2 echo "- Preset HSA_CU_MASK: $HSA_CU_MASK" + else + >&2 echo "- NODE UTILIZATION: $_utilization %" + fi + else + >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" + >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" + >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" + >&2 echo "- DEVS PER RANK: $_devices_per_mdset" + >&2 echo "- MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)" + _md_utilization=$(( $_md_total_devices * 100 / $_ri_num_devices )) + >&2 echo "- NODE UTILIZATION: $_md_utilization %" + fi +fi +# --- END OF DIAGNOSTIC BLOCK + +if [ $_CUs_per_rplace != $_available_CUs_per_device ] && [ $GPURUN_MASK_POLICY == "mutex" ] ; then + # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace + _bits_to_set=$_CUs_per_rplace + # This formula keeps adjacent ranks on same GPU which should be preferred + _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set) - ( _device_num * $_utilized_CUs_per_device) )) + # use bc because these values can be very large + _unshifted_bits=`echo "(2 ^ $_bits_to_set) - 1" | bc` + _mask=`echo "obase=16; $_unshifted_bits * (2 ^ $_bits_to_shift)" | bc` + # Calculate the number of leading zeros needed for this mask + _lz=$(( ( $_utilized_CUs_per_device / 4 ) - ${#_mask} + 1 )) + for i in `seq 1 $_lz` ; do + _mask="0$_mask" + done + _mask="0x$_mask" +fi + +_launch_process_cmd="" +if [ $_uses_multi_device == 0 ] ; then + # retrieve scanned info from per device arrays + _bdfidstrc=${_ss_bdfid[$_device_num]} + NUMANODE=${_ss_numanode[$_device_num]} + _list_of_cpu_cores=${_ss_cpulist[$_device_num]} + _this_uuid=${_ss_uuid[$_device_num]} +else + # Use multi-device values + _bdfidstrc=$_md_bdfs + NUMANODE=$_md_nns + _list_of_cpu_cores=$_md_cpus + _this_uuid=$_md_uuids + _launch_process_cmd+="env LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset " +fi +if [ "$_use_numactl_localalloc" == "1" ] ; then + _launch_process_cmd+="$_launch_process_cmd_binary --localalloc --cpunodebind=$NUMANODE" +elif [ "$_use_numactl_membind" == "1" ] 
; then + _launch_process_cmd+="$_launch_process_cmd_binary --membind=$NUMANODE --cpunodebind=$NUMANODE" +else + _launch_process_cmd+="$_launch_process_cmd_binary -c $_list_of_cpu_cores" +fi + +# If gpurun was not given command to execute, then dont run _launch_process_cmd +[ "$*" == "" ] && _launch_process_cmd="" + +# only set ROCR_VISIBLE_DEVICES if not already set +if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then + export ROCR_VISIBLE_DEVICES=$_this_uuid + _log_word="RVD" +else + _log_word="PRESET-RVD" +fi + +export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace + +# - Limit HSA queues when multiple ranks per GPU +if [ $_number_of_rplaces_per_GPU != 1 ] ; then + # Only set these env controls if not set by caller + [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1 + [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1 +fi + +[[ ! -z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] && \ + [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK" + +if [ $_CUs_per_rplace == $_available_CUs_per_device ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then + # --- HSA_CU_MASK is NOT USED in this code block, This code block covers all multi-device execution. 
+ if [ "$GPURUN_VERBOSE" != "0" ] ; then + if [ $_uses_multi_device == 1 ] ; then + printf "RANK:$_local_rank_num D:$_md_dev_idxs NNs:$_md_nns GPUTYPE:$_gfxid $_log_word:$ROCR_VISIBLE_DEVICES\n CMD:$_launch_process_cmd $*\n" >&2 + else + printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d GPUTYPE:$_gfxid $_log_word:%s \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE $ROCR_VISIBLE_DEVICES "$_launch_process_cmd" >&2 + fi + fi + $_launch_process_cmd $* + # --- end code block +else + # --- HSA_CU_MASK is required in this code block, assumes no multi-device + if [[ -z "$HSA_CU_MASK" ]] ; then + # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0: + export HSA_CU_MASK=0:$_mask + else + # use preset mask + _mask=$HSA_CU_MASK + fi + if [ "$GPURUN_VERBOSE" != "0" ] ; then + printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d $_gfxid CUMASK:$_mask $_log_word:$ROCR_VISIBLE_DEVICES \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE "$_launch_process_cmd" >&2 + fi + HSA_CU_MASK=0:$_mask \ + $_launch_process_cmd $* + # --- end code block +fi +exit $? 
From 0ac33444e4e797a35ce32d9eae7e7c725aa0a9fb Mon Sep 17 00:00:00 2001 From: Ethan Stewart Date: Wed, 22 Oct 2025 15:53:48 -0500 Subject: [PATCH 3/7] [offload][utils] - Add cmake install step for gpurun - add_subdirectory(utils) in offload CMakeLists.txt - usage of new macro add_openmp_util to install utils into llvm/bin --- offload/CMakeLists.txt | 2 ++ offload/utils/CMakeLists.txt | 10 ++++++++++ offload/utils/gpurun/CMakeLists.txt | 1 + offload/utils/{ => gpurun}/gpurun | 0 4 files changed, 13 insertions(+) create mode 100644 offload/utils/CMakeLists.txt create mode 100644 offload/utils/gpurun/CMakeLists.txt rename offload/utils/{ => gpurun}/gpurun (100%) diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index 82ebdf84019a5..d972402d03a7d 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -475,3 +475,5 @@ if(OFFLOAD_INCLUDE_TESTS) add_subdirectory(test) add_subdirectory(unittests) endif() + +add_subdirectory(utils) diff --git a/offload/utils/CMakeLists.txt b/offload/utils/CMakeLists.txt new file mode 100644 index 0000000000000..d6f2d6729d18c --- /dev/null +++ b/offload/utils/CMakeLists.txt @@ -0,0 +1,10 @@ +set(OPENMP_UTILS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH + "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')") + +macro(add_openmp_util path) + install(PROGRAMS + ${path} + DESTINATION "${OPENMP_UTILS_INSTALL_DIR}") +endmacro() + +add_subdirectory(gpurun) diff --git a/offload/utils/gpurun/CMakeLists.txt b/offload/utils/gpurun/CMakeLists.txt new file mode 100644 index 0000000000000..0483a5737b830 --- /dev/null +++ b/offload/utils/gpurun/CMakeLists.txt @@ -0,0 +1 @@ +add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun) diff --git a/offload/utils/gpurun b/offload/utils/gpurun/gpurun similarity index 100% rename from offload/utils/gpurun rename to offload/utils/gpurun/gpurun From 7a00afda648e9b433936e360c522793926531115 Mon Sep 17 00:00:00 2001 From: ronlieb Date: Tue, 18 Nov 2025 22:32:36 -0500 Subject: 
[PATCH 4/7] [gpurun] force numactl with rocr_vis_dev or mpi rank (#619) rocm 7.2 changed pci layout/info really messes up xnack=1 performance necessitates forced path to numactl -nr use numactl ROCR_VISIBLE_DEVICES -nm use numactl OMPI_COMM_WORLD_LOCAL_RANK --- offload/utils/gpurun/gpurun | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun index a22c4265bcac7..ab86f491fa7f6 100755 --- a/offload/utils/gpurun/gpurun +++ b/offload/utils/gpurun/gpurun @@ -80,6 +80,8 @@ function usage(){ fails when not enough memory available on these nodes. -l use numactl localalloc to CPUs in same NUMA domain. Note: If memory cannot be allocated, alloc falls back to other nodes. + -nr use numactl ROCR_VISIBLE_DEVICES + -nm use numactl OMPI_COMM_WORLD_LOCAL_RANK --version Print version of gpurun and exit Optional Input environment variables: @@ -161,6 +163,8 @@ while [ "$_end_gpurun_opts" == "0" ] ; do -vv) GPURUN_VERBOSE=2;; -m) _use_numactl_membind=1;; -md) shift; _devices_per_mdset=$1; _uses_multi_device=1;; + -nr) _use_numactl_rocr=1;; + -nm) _use_numactl_ompi=1;; -l) _use_numactl_localalloc=1;; -nomask) GPURUN_MASK_POLICY="nomask";; *) _end_gpurun_opts=1; break;; @@ -199,6 +203,14 @@ if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then _num_local_ranks=$SLURM_CPUS_ON_NODE _local_rank_num=$SLURM_LOCALID fi +if [ "$_use_numactl_rocr" == "1" ] ; then + numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* + exit $? +fi +if [ "$_use_numactl_ompi" == "1" ] ; then + numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* + exit $? 
+fi # If none of the above MPIs, assume gpurun is wrapper for single process on single GPU if [ -z "$_num_local_ranks" ] ; then _num_local_ranks=1 From 49fea23dce2028614cb7d25cfdbdfbe204500874 Mon Sep 17 00:00:00 2001 From: ronlieb Date: Wed, 19 Nov 2025 12:38:28 -0500 Subject: [PATCH 5/7] [gpurun] add numactl check and fallback for -nm and -nr (#625) --- offload/utils/gpurun/gpurun | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun index ab86f491fa7f6..b8a0daf64b913 100755 --- a/offload/utils/gpurun/gpurun +++ b/offload/utils/gpurun/gpurun @@ -203,13 +203,26 @@ if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then _num_local_ranks=$SLURM_CPUS_ON_NODE _local_rank_num=$SLURM_LOCALID fi + if [ "$_use_numactl_rocr" == "1" ] ; then - numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* - exit $? + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* + exit $? + else + $* + exit $? + fi fi if [ "$_use_numactl_ompi" == "1" ] ; then - numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* - exit $? + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* + exit $? + else + $* + exit $? 
+ fi fi # If none of the above MPIs, assume gpurun is wrapper for single process on single GPU if [ -z "$_num_local_ranks" ] ; then From f51cf9a0246048398d467c75468dd836e7974657 Mon Sep 17 00:00:00 2001 From: theRonShark Date: Thu, 20 Nov 2025 06:23:08 -0500 Subject: [PATCH 6/7] [gpurun] enable GPURUN_BYPASS after argument processing (#630) --- offload/utils/gpurun/gpurun | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun index b8a0daf64b913..870bc7a8ccbcd 100755 --- a/offload/utils/gpurun/gpurun +++ b/offload/utils/gpurun/gpurun @@ -36,10 +36,6 @@ function execOnError() { exec "$@" } -if [ "$GPURUN_BYPASS" = "1" ]; then - execOnError "$@" -fi - # PROGVERSION string is updated by cmake when component is installed PROGVERSION=X.Y-Z function version(){ @@ -174,6 +170,10 @@ while [ "$_end_gpurun_opts" == "0" ] ; do fi done +if [ "$GPURUN_BYPASS" = "1" ]; then + execOnError "$@" +fi + # Default: quiet operation GPURUN_VERBOSE=${GPURUN_VERBOSE:-0} # Default: create mutually exclusive sets of CUs when GPU is oversubscribed From 026dd0de1f4b2c600f66f97dba91bb65bba06895 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Thu, 20 Nov 2025 09:28:34 +0900 Subject: [PATCH 7/7] RenameIndependentSubregs: try to only implicit def used subregs (#167486) Attempt to only define used subregisters when creating IMPLICIT_DEF fix ups for live interval subranges. This avoids the appearance at the MIR level of entire (wide) registers becoming live rather than relying only on transient LiveIntervals dead definitions for unused subregisters. 
(cherry picked from commit b1c4b55118131cdf3d6d47ba31578b2e0cd78ec7) --- llvm/lib/CodeGen/RenameIndependentSubregs.cpp | 33 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 315 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 8644 ++++++----------- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 30 +- ...rval-bug-in-rename-independent-subregs.mir | 84 +- ...se-after-free-after-cleanup-failed-vreg.ll | 2 +- 6 files changed, 3115 insertions(+), 5993 deletions(-) diff --git a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp index 83a9c0d738394..796ee8cf857ae 100644 --- a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp @@ -306,6 +306,7 @@ void RenameIndependentSubregs::computeMainRangesFixFlags( const IntEqClasses &Classes, const SmallVectorImpl &SubRangeInfos, const SmallVectorImpl &Intervals) const { + const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); const SlotIndexes &Indexes = *LIS->getSlotIndexes(); for (size_t I = 0, E = Intervals.size(); I < E; ++I) { @@ -314,6 +315,25 @@ void RenameIndependentSubregs::computeMainRangesFixFlags( LI.removeEmptySubRanges(); + // Try to establish a single subregister which covers all uses. + // Note: this is assuming the selected subregister will only be + // used for fixing up live intervals issues created by this pass. + LaneBitmask UsedMask, UnusedMask; + for (LiveInterval::SubRange &SR : LI.subranges()) + UsedMask |= SR.LaneMask; + SmallVector SubRegIdxs; + unsigned Flags = 0; + unsigned SubReg = 0; + // TODO: Handle SubRegIdxs.size() > 1 + if (TRI.getCoveringSubRegIndexes(MRI->getRegClass(Reg), UsedMask, + SubRegIdxs) && + SubRegIdxs.size() == 1) { + SubReg = SubRegIdxs.front(); + Flags = RegState::Undef; + } else { + UnusedMask = MRI->getMaxLaneMaskForVReg(Reg) & ~UsedMask; + } + // There must be a def (or live-in) before every use. 
Splitting vregs may // violate this principle as the splitted vreg may not have a definition on // every path. Fix this by creating IMPLICIT_DEF instruction as necessary. @@ -336,19 +356,18 @@ void RenameIndependentSubregs::computeMainRangesFixFlags( MachineBasicBlock::iterator InsertPos = llvm::findPHICopyInsertPoint(PredMBB, &MBB, Reg); const MCInstrDesc &MCDesc = TII->get(TargetOpcode::IMPLICIT_DEF); - MachineInstrBuilder ImpDef = BuildMI(*PredMBB, InsertPos, - DebugLoc(), MCDesc, Reg); + MachineInstrBuilder ImpDef = + BuildMI(*PredMBB, InsertPos, DebugLoc(), MCDesc) + .addDef(Reg, Flags, SubReg); SlotIndex DefIdx = LIS->InsertMachineInstrInMaps(*ImpDef); SlotIndex RegDefIdx = DefIdx.getRegSlot(); - LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(Reg); for (LiveInterval::SubRange &SR : LI.subranges()) { - Mask = Mask & ~SR.LaneMask; VNInfo *SRVNI = SR.getNextValue(RegDefIdx, Allocator); SR.addSegment(LiveRange::Segment(RegDefIdx, PredEnd, SRVNI)); } - - if (!Mask.none()) { - LiveInterval::SubRange *SR = LI.createSubRange(Allocator, Mask); + if (!UnusedMask.none()) { + LiveInterval::SubRange *SR = + LI.createSubRange(Allocator, UnusedMask); SR->createDeadDef(RegDefIdx, Allocator); } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index b0ca1e8ef3dff..cbf17bd71a69e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -144,43 +144,41 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v21, v0 -; GFX1030-NEXT: v_mov_b32_e32 v22, v1 -; GFX1030-NEXT: 
v_mov_b32_e32 v23, v2 -; GFX1030-NEXT: v_mov_b32_e32 v24, v3 -; GFX1030-NEXT: v_mov_b32_e32 v25, v4 -; GFX1030-NEXT: v_mov_b32_e32 v26, v5 -; GFX1030-NEXT: v_mov_b32_e32 v27, v6 -; GFX1030-NEXT: v_mov_b32_e32 v28, v7 -; GFX1030-NEXT: v_mov_b32_e32 v29, v8 -; GFX1030-NEXT: v_mov_b32_e32 v30, v9 -; GFX1030-NEXT: v_mov_b32_e32 v31, v10 -; GFX1030-NEXT: v_mov_b32_e32 v19, v11 -; GFX1030-NEXT: v_mov_b32_e32 v20, v12 +; GFX1030-NEXT: v_mov_b32_e32 v15, v0 +; GFX1030-NEXT: v_mov_b32_e32 v16, v1 +; GFX1030-NEXT: v_mov_b32_e32 v17, v2 +; GFX1030-NEXT: v_mov_b32_e32 v18, v3 +; GFX1030-NEXT: v_mov_b32_e32 v19, v4 +; GFX1030-NEXT: v_mov_b32_e32 v20, v5 +; GFX1030-NEXT: v_mov_b32_e32 v21, v6 +; GFX1030-NEXT: v_mov_b32_e32 v22, v7 +; GFX1030-NEXT: v_mov_b32_e32 v23, v8 +; GFX1030-NEXT: v_mov_b32_e32 v24, v9 +; GFX1030-NEXT: v_mov_b32_e32 v25, v10 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v19 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v20 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[21:31], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:25], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr11 +; GFX1030-NEXT: ; implicit-def: $vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 +; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: 
$vgpr23 ; GFX1030-NEXT: ; implicit-def: $vgpr24 ; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr26 -; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr29 -; GFX1030-NEXT: ; implicit-def: $vgpr30 -; GFX1030-NEXT: ; implicit-def: $vgpr31 -; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB6_1 ; GFX1030-NEXT: ; %bb.2: @@ -190,22 +188,20 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v19, v11 -; GFX1013-NEXT: v_mov_b32_e32 v20, v12 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v19 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v20 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v12 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v13 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v14 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7] -; GFX1013-NEXT: ; implicit-def: $vgpr19 +; GFX1013-NEXT: ; implicit-def: $vgpr11 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1013-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 @@ -220,31 +216,29 @@ define amdgpu_ps <4 x float> 
@image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v20, v0 :: v_dual_mov_b32 v21, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v19, v1 ; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 -; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v11 -; GFX11-NEXT: v_mov_b32_e32 v19, v12 +; GFX11-NEXT: v_mov_b32_e32 v17, v4 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s4, v18 -; GFX11-NEXT: v_readfirstlane_b32 s5, v19 +; GFX11-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-NEXT: v_readfirstlane_b32 s5, v12 ; GFX11-NEXT: v_readfirstlane_b32 s6, v13 ; GFX11-NEXT: v_readfirstlane_b32 s7, v14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7] +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7] +; GFX11-NEXT: ; implicit-def: $vgpr11 ; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17 ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 ; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 -; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; 
GFX11-NEXT: ; %bb.2: @@ -259,42 +253,40 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v18, v0 -; GFX1030-NEXT: v_mov_b32_e32 v19, v1 +; GFX1030-NEXT: v_mov_b32_e32 v13, v0 +; GFX1030-NEXT: v_mov_b32_e32 v14, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GFX1030-NEXT: v_mov_b32_e32 v20, v2 +; GFX1030-NEXT: v_mov_b32_e32 v15, v2 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 -; GFX1030-NEXT: v_mov_b32_e32 v21, v3 +; GFX1030-NEXT: v_mov_b32_e32 v16, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mov_b32_e32 v22, v4 -; GFX1030-NEXT: v_mov_b32_e32 v16, v9 -; GFX1030-NEXT: v_mov_b32_e32 v17, v10 -; GFX1030-NEXT: v_and_or_b32 v23, 0xffff, v5, v0 -; GFX1030-NEXT: v_and_or_b32 v24, 0xffff, v6, v1 -; GFX1030-NEXT: v_alignbit_b32 v25, v2, v7, 16 +; GFX1030-NEXT: v_mov_b32_e32 v17, v4 +; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: v_and_or_b32 v18, 0xffff, v5, v0 +; GFX1030-NEXT: v_and_or_b32 v19, 0xffff, v6, v1 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v12 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 
s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:25], s[4:7] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr9 +; GFX1030-NEXT: ; implicit-def: $vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr15 ; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 -; GFX1030-NEXT: ; implicit-def: $vgpr21 -; GFX1030-NEXT: ; implicit-def: $vgpr22 -; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr24 -; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1030-NEXT: ; %bb.2: @@ -304,30 +296,28 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v17, v9 -; GFX1013-NEXT: v_mov_b32_e32 v18, v10 -; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX1013-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v9 -; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 +; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v13 +; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: 
v_readfirstlane_b32 s4, v17 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v11 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v12 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[17:18] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16 -; GFX1013-NEXT: ; implicit-def: $vgpr17 +; GFX1013-NEXT: ; implicit-def: $vgpr9 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB7_1 @@ -343,33 +333,32 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX11-NEXT: v_dual_mov_b32 v19, v10 :: v_dual_and_b32 v0, 0xffff, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_and_b32 v0, 0xffff, v7 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v14, v3 -; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v18, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0 ; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302 ; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v18 -; GFX11-NEXT: v_readfirstlane_b32 s5, v19 +; GFX11-NEXT: 
v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16 -; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr9 ; GFX11-NEXT: ; implicit-def: $vgpr16 ; GFX11-NEXT: ; implicit-def: $vgpr17 ; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 -; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: @@ -384,45 +373,43 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v22, v0 -; GFX1030-NEXT: v_mov_b32_e32 v23, v1 -; GFX1030-NEXT: v_mov_b32_e32 v24, v2 -; GFX1030-NEXT: v_mov_b32_e32 v25, v3 -; GFX1030-NEXT: v_mov_b32_e32 v26, v4 -; GFX1030-NEXT: v_mov_b32_e32 v27, v5 -; GFX1030-NEXT: v_mov_b32_e32 v28, v6 -; GFX1030-NEXT: v_mov_b32_e32 v29, v7 -; GFX1030-NEXT: v_mov_b32_e32 v30, v8 -; GFX1030-NEXT: v_mov_b32_e32 v31, v9 -; GFX1030-NEXT: v_mov_b32_e32 v32, v10 -; GFX1030-NEXT: v_mov_b32_e32 v33, v11 -; GFX1030-NEXT: v_mov_b32_e32 v20, v12 -; GFX1030-NEXT: v_mov_b32_e32 
v21, v13 +; GFX1030-NEXT: v_mov_b32_e32 v16, v0 +; GFX1030-NEXT: v_mov_b32_e32 v17, v1 +; GFX1030-NEXT: v_mov_b32_e32 v18, v2 +; GFX1030-NEXT: v_mov_b32_e32 v19, v3 +; GFX1030-NEXT: v_mov_b32_e32 v20, v4 +; GFX1030-NEXT: v_mov_b32_e32 v21, v5 +; GFX1030-NEXT: v_mov_b32_e32 v22, v6 +; GFX1030-NEXT: v_mov_b32_e32 v23, v7 +; GFX1030-NEXT: v_mov_b32_e32 v24, v8 +; GFX1030-NEXT: v_mov_b32_e32 v25, v9 +; GFX1030-NEXT: v_mov_b32_e32 v26, v10 +; GFX1030-NEXT: v_mov_b32_e32 v27, v11 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v20 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v21 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v13 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v14 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v15 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[20:21] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[22:33], s[4:7] +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:27], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr12 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 +; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 +; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: $vgpr23 ; GFX1030-NEXT: ; implicit-def: $vgpr24 ; GFX1030-NEXT: ; implicit-def: $vgpr25 ; GFX1030-NEXT: ; implicit-def: $vgpr26 ; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr29 -; GFX1030-NEXT: ; implicit-def: $vgpr30 -; GFX1030-NEXT: ; implicit-def: $vgpr31 -; GFX1030-NEXT: ; implicit-def: $vgpr32 -; GFX1030-NEXT: ; 
implicit-def: $vgpr33 -; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1030-NEXT: ; %bb.2: @@ -432,22 +419,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v20, v12 -; GFX1013-NEXT: v_mov_b32_e32 v21, v13 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v20 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v21 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v13 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v14 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v15 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[20:21] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7] -; GFX1013-NEXT: ; implicit-def: $vgpr20 +; GFX1013-NEXT: ; implicit-def: $vgpr12 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX1013-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 @@ -465,28 +450,26 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v1 ; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v12 :: v_dual_mov_b32 v5, v13 ; GFX11-NEXT: 
s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s4, v12 +; GFX11-NEXT: v_readfirstlane_b32 s5, v13 ; GFX11-NEXT: v_readfirstlane_b32 s6, v14 ; GFX11-NEXT: v_readfirstlane_b32 s7, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7] -; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr12 ; GFX11-NEXT: ; implicit-def: $vgpr19_vgpr20 ; GFX11-NEXT: ; implicit-def: $vgpr21 ; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11 -; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-NEXT: ; %bb.2: @@ -501,44 +484,42 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v19, v0 -; GFX1030-NEXT: v_mov_b32_e32 v20, v1 +; GFX1030-NEXT: v_mov_b32_e32 v14, v0 +; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, 
v6 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GFX1030-NEXT: v_mov_b32_e32 v21, v2 +; GFX1030-NEXT: v_mov_b32_e32 v16, v2 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GFX1030-NEXT: v_mov_b32_e32 v22, v3 +; GFX1030-NEXT: v_mov_b32_e32 v17, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mov_b32_e32 v23, v4 -; GFX1030-NEXT: v_mov_b32_e32 v24, v5 -; GFX1030-NEXT: v_mov_b32_e32 v17, v10 -; GFX1030-NEXT: v_mov_b32_e32 v18, v11 -; GFX1030-NEXT: v_and_or_b32 v25, 0xffff, v6, v0 -; GFX1030-NEXT: v_and_or_b32 v26, 0xffff, v7, v1 -; GFX1030-NEXT: v_alignbit_b32 v27, v2, v8, 16 +; GFX1030-NEXT: v_mov_b32_e32 v18, v4 +; GFX1030-NEXT: v_mov_b32_e32 v19, v5 +; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16 +; GFX1030-NEXT: v_and_or_b32 v20, 0xffff, v6, v0 +; GFX1030-NEXT: v_and_or_b32 v21, 0xffff, v7, v1 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v17 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[17:18] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[19:27], s[4:7] a16 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:22], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr10 +; GFX1030-NEXT: ; implicit-def: $vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr16 ; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: 
$vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 -; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr24 -; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr26 -; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB9_1 ; GFX1030-NEXT: ; %bb.2: @@ -548,30 +529,28 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v18, v10 -; GFX1013-NEXT: v_mov_b32_e32 v19, v11 -; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8 ; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 -; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v11 +; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 +; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v15 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v18 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v19 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v12 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; 
GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16 -; GFX1013-NEXT: ; implicit-def: $vgpr18 +; GFX1013-NEXT: ; implicit-def: $vgpr10 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB9_1 @@ -591,29 +570,29 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; GFX11-NEXT: v_dual_mov_b32 v19, v2 :: v_dual_mov_b32 v14, v3 ; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v16, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v10 :: v_dual_mov_b32 v5, v11 -; GFX11-NEXT: v_lshl_or_b32 v20, v6, 16, v0 -; GFX11-NEXT: v_perm_b32 v21, v6, v8, 0x7060302 -; GFX11-NEXT: v_lshl_or_b32 v22, v7, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v0 +; GFX11-NEXT: v_perm_b32 v5, v6, v8, 0x7060302 +; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v1 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s4, v10 +; GFX11-NEXT: v_readfirstlane_b32 s5, v11 ; GFX11-NEXT: v_readfirstlane_b32 s6, v12 ; GFX11-NEXT: v_readfirstlane_b32 s7, v13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16 
-; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[4:6]], s[4:7] a16 +; GFX11-NEXT: ; implicit-def: $vgpr10 ; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr19 ; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16 -; GFX11-NEXT: ; implicit-def: $vgpr20_vgpr21_vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index b47c7ecf8de95..f230a14dd0834 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -29546,173 +29546,27 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, 
s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: 
v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 
+; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-NEXT: .LBB19_2: ; %cmp.true @@ -29720,762 +29574,674 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: 
v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; 
GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: 
v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: 
v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: 
v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: 
v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, 
v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 
v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, 
v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: 
v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 
16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: 
v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 
0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 
16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 
v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: 
v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 
v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 
v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; 
GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, 
v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 
v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: 
v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, 
v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: 
v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB19_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 
v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, 
off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 
offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB19_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: 
$vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; 
GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB19_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -34192,324 +33958,81 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-LABEL: bitcast_v64f16_to_v32i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction 
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; 
GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; 
GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: 
s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 ; GFX11-NEXT: .LBB23_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB23_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; 
GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: 
scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: 
$vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; 
implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB23_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -37313,324 +36836,81 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-LABEL: bitcast_v64i16_to_v32i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; 
GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, 
v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: 
v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 ; GFX11-NEXT: .LBB27_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB27_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: 
v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; 
GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: 
scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB27_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; 
implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: 
$vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; 
implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB27_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -67275,173 +66555,27 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 
-; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 
off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; 
GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-NEXT: .LBB43_2: ; %cmp.true @@ -67449,762 +66583,674 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, 
s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 
v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; 
GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: 
v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: 
v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 
v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 
s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: 
v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 
v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 
v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 
+; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 
v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: 
v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: 
v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: 
v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; 
GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: 
v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; 
GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: 
v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; 
GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; 
GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: 
v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: 
v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB43_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 
v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; 
GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 
offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB43_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: 
$vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: 
$vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: 
$vgpr15 ; GFX11-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -71893,324 +70939,81 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-LABEL: bitcast_v64f16_to_v32f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; 
GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 
off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; 
GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-NEXT: .LBB47_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; 
GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] 
+; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB47_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: 
v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f 
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: 
scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB47_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: 
$vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: 
$vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; 
implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB47_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -74968,324 +73771,81 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-LABEL: bitcast_v64i16_to_v32f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 
offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: 
; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, 
v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-NEXT: .LBB51_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: 
v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, 
s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB51_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 
offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; 
GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: 
scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB51_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: 
$vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: 
$vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -102724,173 +101284,27 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; 
GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: 
s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 
.LBB63_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB63_3 ; GFX11-NEXT: .LBB63_2: ; %cmp.true @@ -102898,762 +101312,674 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, 
v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo 
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 
v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: 
v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: 
v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: 
v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; 
GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 
+; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: 
v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-NEXT: 
s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 
v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: 
v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; 
GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: 
v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, 
v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v25 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: 
v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 
v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; 
GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 
v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 
:: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 
v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB63_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; 
GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: 
scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, 
v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB63_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: 
$vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB63_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -107383,324 +105709,81 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-LABEL: bitcast_v64f16_to_v16i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; 
GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; 
GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB67_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: 
v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB67_3 ; GFX11-NEXT: .LBB67_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB67_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; 
GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: 
scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 
:: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB67_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: 
$vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: 
$vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB67_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -110518,324 +108601,81 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-LABEL: bitcast_v64i16_to_v16i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; 
GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; 
GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: 
v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 ; GFX11-NEXT: .LBB71_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, 
v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB71_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: 
scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: 
scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, 
v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB71_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB71_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -138331,173 +136171,27 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB79_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; 
GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB79_3 ; GFX11-NEXT: .LBB79_2: ; %cmp.true @@ -138505,762 +136199,674 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 
+; GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 
0xffff, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; 
GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: 
v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: 
v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: 
v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; 
GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 
0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 
0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 
v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 
v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; 
GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; 
GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 
v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: 
v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, 
v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 
; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, 
vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v25 +; GFX11-NEXT: 
v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: 
v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; 
GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, 
v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: 
v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, 
v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 
1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 
0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, 
v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB79_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: 
scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 
v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB79_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: 
$vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: 
$vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: 
$vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB79_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -142871,324 +140477,81 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-LABEL: bitcast_v64f16_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, 
s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; 
GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 ; GFX11-NEXT: .LBB83_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB83_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: 
scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: 
scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB83_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB83_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -145850,324 +143213,81 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-LABEL: bitcast_v64i16_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: 
v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 ; GFX11-NEXT: .LBB87_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB87_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 
offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; 
GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB87_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB87_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 2b63a8cf69476..28b992ee77b14 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -981,7 +981,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: s_mov_b64 s[8:9], 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_1: ; %Flow @@ -1004,36 +1004,36 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: ; %bb.4: ; %bb2 ; GCN-NEXT: ; in Loop: 
Header=BB5_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v6, v7 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB5_2 ; GCN-NEXT: ; %bb.5: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v6, v7 +; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], s[6:7] ; GCN-NEXT: s_cbranch_execz .LBB5_1 ; GCN-NEXT: ; %bb.6: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: s_branch .LBB5_1 ; GCN-NEXT: .LBB5_7: ; %bb12 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; diff --git 
a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir index eaf669da83ead..9e38919190ea7 100644 --- a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir @@ -11,26 +11,28 @@ body: | ; REG_ALLOC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; REG_ALLOC-NEXT: {{ $}} - ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + 
; REG_ALLOC-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: KILL killed renamable $vgpr4 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr2 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr0 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr3 - ; REG_ALLOC-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec - ; REG_ALLOC-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; REG_ALLOC-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; REG_ALLOC-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr12, implicit $exec ; REG_ALLOC-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; REG_ALLOC-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; REG_ALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; REG_ALLOC-NEXT: S_BRANCH %bb.2 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.1: ; REG_ALLOC-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; REG_ALLOC-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -42,33 +44,33 @@ body: | ; REG_ALLOC-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 - ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; REG_ALLOC-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; REG_ALLOC-NEXT: S_BRANCH %bb.1 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.3: ; REG_ALLOC-NEXT: successors: %bb.5(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; REG_ALLOC-NEXT: {{ $}} - ; REG_ALLOC-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr6, 
implicit $exec ; REG_ALLOC-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; REG_ALLOC-NEXT: S_BRANCH %bb.5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.4: - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; REG_ALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; REG_ALLOC-NEXT: S_ENDPGM 0 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.5: ; REG_ALLOC-NEXT: successors: %bb.4(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: S_BRANCH %bb.4 @@ -78,26 +80,28 @@ body: | ; DEAD_INST_DEL-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, 
$sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; DEAD_INST_DEL-NEXT: {{ $}} - ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr4 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr2 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr0 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr3 - ; DEAD_INST_DEL-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, 
implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; DEAD_INST_DEL-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; DEAD_INST_DEL-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr12, implicit $exec ; DEAD_INST_DEL-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.2 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.1: ; DEAD_INST_DEL-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; 
DEAD_INST_DEL-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -109,33 +113,33 @@ body: | ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 - ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; DEAD_INST_DEL-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.1 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.3: ; DEAD_INST_DEL-NEXT: successors: %bb.5(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; DEAD_INST_DEL-NEXT: {{ $}} - ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr6, implicit $exec ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed 
renamable $sgpr1 ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.4: - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; DEAD_INST_DEL-NEXT: S_ENDPGM 0 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.5: ; DEAD_INST_DEL-NEXT: successors: %bb.4(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.4 diff --git a/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll b/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll index ea127323f3e05..50efc06237d5b 100644 --- a/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll +++ b/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mcpu=gfx1100 
-mtriple=amdgcn-amd-amdhsa -stress-regalloc=4 -filetype=null -verify-machineinstrs %s 2>&1 | FileCheck %s +; RUN: not llc -mcpu=gfx1100 -mtriple=amdgcn-amd-amdhsa -stress-regalloc=4 -amdgpu-enable-rewrite-partial-reg-uses=0 -filetype=null -verify-machineinstrs %s 2>&1 | FileCheck %s ; CHECK: error: :0:0: ran out of registers during register allocation in function 'f' ; CHECK-NOT: Bad machine code