diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 2b39c81178084..8d30f165dd8b6 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -110,6 +110,10 @@ def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> { let summary = "Lower workshare construct"; } +def LowerWorkdistribute : Pass<"lower-workdistribute", "::mlir::ModuleOp"> { + let summary = "Lower workdistribute construct"; +} + def GenericLoopConversionPass : Pass<"omp-generic-loop-conversion", "mlir::func::FuncOp"> { let summary = "Converts OpenMP generic `omp.loop` to semantically " diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 579e47268afea..304333fa8830e 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -9,6 +9,7 @@ add_flang_library(FlangOpenMPTransforms MapsForPrivatizedSymbols.cpp MapInfoFinalization.cpp MarkDeclareTarget.cpp + LowerWorkdistribute.cpp LowerWorkshare.cpp LowerNontemporal.cpp SimdOnly.cpp diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp new file mode 100644 index 0000000000000..cfa39e142907c --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp @@ -0,0 +1,1852 @@ +//===- LowerWorkdistribute.cpp +//-------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the lowering and optimisations of omp.workdistribute. +// +// Fortran array statements are lowered to fir as fir.do_loop unordered. 
+// lower-workdistribute pass works mainly on identifying fir.do_loop unordered +// that is nested in target{teams{workdistribute{fir.do_loop unordered}}} and +// lowers it to target{teams{parallel{distribute{wsloop{loop_nest}}}}}. +// It hoists all the other ops outside target region. +// Replaces heap allocation on target with omp.target_allocmem and +// deallocation with omp.target_freemem from host. Also replaces +// runtime function "Assign" with omp_target_memcpy. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Utils.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Value.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace flangomp { +#define GEN_PASS_DEF_LOWERWORKDISTRIBUTE +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp + +#define DEBUG_TYPE "lower-workdistribute" + +using namespace mlir; + +namespace { + +/// This string is used to identify the Fortran-specific runtime FortranAAssign. +static constexpr llvm::StringRef FortranAssignStr = "_FortranAAssign"; + +/// The isRuntimeCall function is a utility designed to determine +/// if a given operation is a call to a Fortran-specific runtime function.
+static bool isRuntimeCall(Operation *op) { + if (auto callOp = dyn_cast(op)) { + auto callee = callOp.getCallee(); + if (!callee) + return false; + auto *func = op->getParentOfType().lookupSymbol(*callee); + if (func->getAttr(fir::FIROpsDialect::getFirRuntimeAttrName())) + return true; + } + return false; +} + +/// This is the single source of truth about whether we should parallelize an +/// operation nested in an omp.workdistribute region. +/// Parallelize here refers to dividing into units of work. +static bool shouldParallelize(Operation *op) { + // True if the op is a runtime call to Assign + if (isRuntimeCall(op)) { + fir::CallOp runtimeCall = cast(op); + auto funcName = runtimeCall.getCallee()->getRootReference().getValue(); + if (funcName == FortranAssignStr) { + return true; + } + } + // We cannot parallelize ops with side effects. + // Parallelizable operations should not produce + // values that other operations depend on + if (llvm::any_of(op->getResults(), + [](OpResult v) -> bool { return !v.use_empty(); })) + return false; + // We will parallelize unordered loops - these come from array syntax + if (auto loop = dyn_cast(op)) { + auto unordered = loop.getUnordered(); + if (!unordered) + return false; + return *unordered; + } + // We cannot parallelize anything else. + return false; +} + +/// The getPerfectlyNested function is a generic utility for finding +/// a single, "perfectly nested" operation within a parent operation. +template +static T getPerfectlyNested(Operation *op) { + if (op->getNumRegions() != 1) + return nullptr; + auto ®ion = op->getRegion(0); + if (region.getBlocks().size() != 1) + return nullptr; + auto *block = ®ion.front(); + auto *firstOp = &block->front(); + if (auto nested = dyn_cast(firstOp)) + if (firstOp->getNextNode() == block->getTerminator()) + return nested; + return nullptr; +} + +/// verifyTargetTeamsWorkdistribute method verifies that +/// omp.target { teams { workdistribute { ... 
} } } is well formed +/// and fails for function calls that don't have lowering implemented yet. +static LogicalResult +verifyTargetTeamsWorkdistribute(omp::WorkdistributeOp workdistribute) { + OpBuilder rewriter(workdistribute); + auto loc = workdistribute->getLoc(); + auto teams = dyn_cast(workdistribute->getParentOp()); + if (!teams) { + emitError(loc, "workdistribute not nested in teams\n"); + return failure(); + } + if (workdistribute.getRegion().getBlocks().size() != 1) { + emitError(loc, "workdistribute with multiple blocks\n"); + return failure(); + } + if (teams.getRegion().getBlocks().size() != 1) { + emitError(loc, "teams with multiple blocks\n"); + return failure(); + } + + bool foundWorkdistribute = false; + for (auto &op : teams.getOps()) { + if (isa(op)) { + if (foundWorkdistribute) { + emitError(loc, "teams has multiple workdistribute ops.\n"); + return failure(); + } + foundWorkdistribute = true; + continue; + } + // Identify any omp dialect ops present before/after workdistribute. + if (op.getDialect() && isa(op.getDialect()) && + !isa(op)) { + emitError(loc, "teams has omp ops other than workdistribute. Lowering " + "not implemented yet.\n"); + return failure(); + } + } + + omp::TargetOp targetOp = dyn_cast(teams->getParentOp()); + // return if not omp.target + if (!targetOp) + return success(); + + for (auto &op : workdistribute.getOps()) { + if (auto callOp = dyn_cast(op)) { + if (isRuntimeCall(&op)) { + auto funcName = (*callOp.getCallee()).getRootReference().getValue(); + // _FortranAAssign is handled. Other runtime calls are not supported + // in omp.workdistribute yet. 
+ if (funcName == FortranAssignStr) + continue; + else { + emitError(loc, "Runtime call " + funcName + + " lowering not supported for workdistribute yet."); + return failure(); + } + } + } + } + return success(); +} + +/// fissionWorkdistribute method finds the parallelizable ops +/// within teams {workdistribute} region and moves them to their +/// own teams{workdistribute} region. +/// +/// If B() and D() are parallelizable, +/// +/// omp.teams { +/// omp.workdistribute { +/// A() +/// B() +/// C() +/// D() +/// E() +/// } +/// } +/// +/// becomes +/// +/// A() +/// omp.teams { +/// omp.workdistribute { +/// B() +/// } +/// } +/// C() +/// omp.teams { +/// omp.workdistribute { +/// D() +/// } +/// } +/// E() +static FailureOr +fissionWorkdistribute(omp::WorkdistributeOp workdistribute) { + OpBuilder rewriter(workdistribute); + auto loc = workdistribute->getLoc(); + auto teams = dyn_cast(workdistribute->getParentOp()); + auto *teamsBlock = &teams.getRegion().front(); + bool changed = false; + // Move the ops inside teams and before workdistribute outside. 
+ IRMapping irMapping; + llvm::SmallVector teamsHoisted; + for (auto &op : teams.getOps()) { + if (&op == workdistribute) { + break; + } + if (shouldParallelize(&op)) { + emitError(loc, "teams has parallelize ops before first workdistribute\n"); + return failure(); + } else { + rewriter.setInsertionPoint(teams); + rewriter.clone(op, irMapping); + teamsHoisted.push_back(&op); + changed = true; + } + } + for (auto *op : llvm::reverse(teamsHoisted)) { + op->replaceAllUsesWith(irMapping.lookup(op)); + op->erase(); + } + + // While we have unhandled operations in the original workdistribute + auto *workdistributeBlock = &workdistribute.getRegion().front(); + auto *terminator = workdistributeBlock->getTerminator(); + while (&workdistributeBlock->front() != terminator) { + rewriter.setInsertionPoint(teams); + IRMapping mapping; + llvm::SmallVector hoisted; + Operation *parallelize = nullptr; + for (auto &op : workdistribute.getOps()) { + if (&op == terminator) { + break; + } + if (shouldParallelize(&op)) { + parallelize = &op; + break; + } else { + rewriter.clone(op, mapping); + hoisted.push_back(&op); + changed = true; + } + } + + for (auto *op : llvm::reverse(hoisted)) { + op->replaceAllUsesWith(mapping.lookup(op)); + op->erase(); + } + + if (parallelize && hoisted.empty() && + parallelize->getNextNode() == terminator) + break; + if (parallelize) { + auto newTeams = rewriter.cloneWithoutRegions(teams); + auto *newTeamsBlock = rewriter.createBlock( + &newTeams.getRegion(), newTeams.getRegion().begin(), {}, {}); + for (auto arg : teamsBlock->getArguments()) + newTeamsBlock->addArgument(arg.getType(), arg.getLoc()); + auto newWorkdistribute = rewriter.create(loc); + rewriter.create(loc); + rewriter.createBlock(&newWorkdistribute.getRegion(), + newWorkdistribute.getRegion().begin(), {}, {}); + auto *cloned = rewriter.clone(*parallelize); + parallelize->replaceAllUsesWith(cloned); + parallelize->erase(); + rewriter.create(loc); + changed = true; + } + } + return changed; +} 
+ +/// Generate omp.parallel operation with an empty region. +static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) { + auto parallelOp = rewriter.create(loc); + parallelOp.setComposite(composite); + rewriter.createBlock(¶llelOp.getRegion()); + rewriter.setInsertionPoint(rewriter.create(loc)); + return; +} + +/// Generate omp.distribute operation with an empty region. +static void genDistributeOp(Location loc, OpBuilder &rewriter, bool composite) { + mlir::omp::DistributeOperands distributeClauseOps; + auto distributeOp = + rewriter.create(loc, distributeClauseOps); + distributeOp.setComposite(composite); + auto distributeBlock = rewriter.createBlock(&distributeOp.getRegion()); + rewriter.setInsertionPointToStart(distributeBlock); + return; +} + +/// Generate loop nest clause operands from fir.do_loop operation. +static void +genLoopNestClauseOps(OpBuilder &rewriter, fir::DoLoopOp loop, + mlir::omp::LoopNestOperands &loopNestClauseOps) { + assert(loopNestClauseOps.loopLowerBounds.empty() && + "Loop nest bounds were already emitted!"); + loopNestClauseOps.loopLowerBounds.push_back(loop.getLowerBound()); + loopNestClauseOps.loopUpperBounds.push_back(loop.getUpperBound()); + loopNestClauseOps.loopSteps.push_back(loop.getStep()); + loopNestClauseOps.loopInclusive = rewriter.getUnitAttr(); +} + +/// Generate omp.wsloop operation with an empty region and +/// clone the body of fir.do_loop operation inside the loop nest region. +static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop, + const mlir::omp::LoopNestOperands &clauseOps, + bool composite) { + + auto wsloopOp = rewriter.create(doLoop.getLoc()); + wsloopOp.setComposite(composite); + rewriter.createBlock(&wsloopOp.getRegion()); + + auto loopNestOp = + rewriter.create(doLoop.getLoc(), clauseOps); + + // Clone the loop's body inside the loop nest construct using the + // mapped values. 
+ rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(), + loopNestOp.getRegion().begin()); + Block *clonedBlock = &loopNestOp.getRegion().back(); + mlir::Operation *terminatorOp = clonedBlock->getTerminator(); + + // Erase fir.result op of do loop and create yield op. + if (auto resultOp = dyn_cast(terminatorOp)) { + rewriter.setInsertionPoint(terminatorOp); + rewriter.create(doLoop->getLoc()); + terminatorOp->erase(); + } +} + +/// workdistributeDoLower method finds the fir.do_loop unordered +/// nested in teams {workdistribute{fir.do_loop unordered}} and +/// lowers it to teams {parallel { distribute {wsloop {loop_nest}}}}. +/// +/// If fir.do_loop is present inside teams workdistribute +/// +/// omp.teams { +/// omp.workdistribute { +/// fir.do_loop unordered { +/// ... +/// } +/// } +/// } +/// +/// Then, it is lowered to +/// +/// omp.teams { +/// omp.parallel { +/// omp.distribute { +/// omp.wsloop { +/// omp.loop_nest +/// ... +/// } +/// } +/// } +/// } +/// } +static bool +workdistributeDoLower(omp::WorkdistributeOp workdistribute, + SetVector &targetOpsToProcess) { + OpBuilder rewriter(workdistribute); + auto doLoop = getPerfectlyNested(workdistribute); + auto wdLoc = workdistribute->getLoc(); + if (doLoop && shouldParallelize(doLoop)) { + assert(doLoop.getReduceOperands().empty()); + + // Record the target ops to process later + if (auto teamsOp = dyn_cast(workdistribute->getParentOp())) { + auto targetOp = dyn_cast(teamsOp->getParentOp()); + if (targetOp) { + targetOpsToProcess.insert(targetOp); + } + } + // Generate the nested parallel, distribute, wsloop and loop_nest ops.
+ genParallelOp(wdLoc, rewriter, true); + genDistributeOp(wdLoc, rewriter, true); + mlir::omp::LoopNestOperands loopNestClauseOps; + genLoopNestClauseOps(rewriter, doLoop, loopNestClauseOps); + genWsLoopOp(rewriter, doLoop, loopNestClauseOps, true); + workdistribute.erase(); + return true; + } + return false; +} + +/// Check if the enclosed type in fir.ref is fir.box and fir.box encloses array +static bool isEnclosedTypeRefToBoxArray(Type type) { + // Check if it's a reference type + if (auto refType = dyn_cast(type)) { + // Get the referenced type (should be fir.box) + auto referencedType = refType.getEleTy(); + // Check if referenced type is a box + if (auto boxType = dyn_cast(referencedType)) { + // Get the boxed type and check if it's an array + auto boxedType = boxType.getEleTy(); + // Check if boxed type is a sequence (array) + return isa(boxedType); + } + } + return false; +} + +/// Check if the enclosed type in fir.box is scalar (not array) +static bool isEnclosedTypeBoxScalar(Type type) { + // Check if it's a box type + if (auto boxType = dyn_cast(type)) { + // Get the boxed type + auto boxedType = boxType.getEleTy(); + // Check if boxed type is NOT a sequence (array) + return !isa(boxedType); + } + return false; +} + +/// Check if the FortranAAssign call has src as scalar and dest as array +static bool isFortranAssignSrcScalarAndDestArray(fir::CallOp callOp) { + if (callOp.getNumOperands() < 2) + return false; + auto srcArg = callOp.getOperand(1); + auto destArg = callOp.getOperand(0); + // Both operands should be fir.convert ops + auto srcConvert = srcArg.getDefiningOp(); + auto destConvert = destArg.getDefiningOp(); + if (!srcConvert || !destConvert) { + emitError(callOp->getLoc(), + "Unimplemented: FortranAssign to OpenMP lowering\n"); + return false; + } + // Get the original types before conversion + auto srcOrigType = srcConvert.getValue().getType(); + auto destOrigType = destConvert.getValue().getType(); + + // Check if src is scalar and dest is 
array + bool srcIsScalar = isEnclosedTypeBoxScalar(srcOrigType); + bool destIsArray = isEnclosedTypeRefToBoxArray(destOrigType); + return srcIsScalar && destIsArray; +} + +/// Convert a flat index to multi-dimensional indices for an array box +/// Example: 2D array with shape (2,4) +/// Col 1 Col 2 Col 3 Col 4 +/// Row 1: (1,1) (1,2) (1,3) (1,4) +/// Row 2: (2,1) (2,2) (2,3) (2,4) +/// +/// extents: (2,4) +/// +/// flatIdx: 0 1 2 3 4 5 6 7 +/// Indices: (1,1) (1,2) (1,3) (1,4) (2,1) (2,2) (2,3) (2,4) +static SmallVector convertFlatToMultiDim(OpBuilder &builder, + Location loc, Value flatIdx, + Value arrayBox) { + // Get array type and rank + auto boxType = cast(arrayBox.getType()); + auto seqType = cast(boxType.getEleTy()); + int rank = seqType.getDimension(); + + // Get all extents + SmallVector extents; + // Get extents for each dimension + for (int i = 0; i < rank; ++i) { + auto dimIdx = builder.create(loc, i); + auto boxDims = fir::BoxDimsOp::create(builder, loc, arrayBox, dimIdx); + extents.push_back(boxDims.getResult(1)); + } + + // Convert flat index to multi-dimensional indices + SmallVector indices(rank); + Value temp = flatIdx; + auto c1 = builder.create(loc, 1); + + // Work backwards through dimensions (row-major order) + for (int i = rank - 1; i >= 0; --i) { + Value zeroBasedIdx = builder.create(loc, temp, extents[i]); + // Convert to one-based index + indices[i] = builder.create(loc, zeroBasedIdx, c1); + if (i > 0) { + temp = builder.create(loc, temp, extents[i]); + } + } + + return indices; +} + +/// Calculate the total number of elements in the array box +/// (totalElems = extent(1) * extent(2) * ... 
* extent(n)) +static Value CalculateTotalElements(OpBuilder &builder, Location loc, + Value arrayBox) { + auto boxType = cast(arrayBox.getType()); + auto seqType = cast(boxType.getEleTy()); + int rank = seqType.getDimension(); + + Value totalElems = nullptr; + for (int i = 0; i < rank; ++i) { + auto dimIdx = builder.create(loc, i); + auto boxDims = fir::BoxDimsOp::create(builder, loc, arrayBox, dimIdx); + Value extent = boxDims.getResult(1); + if (i == 0) { + totalElems = extent; + } else { + totalElems = builder.create(loc, totalElems, extent); + } + } + return totalElems; +} + +/// Replace the FortranAAssign runtime call with an unordered do loop +static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc, + omp::TeamsOp teamsOp, + omp::WorkdistributeOp workdistribute, + fir::CallOp callOp) { + auto destConvert = callOp.getOperand(0).getDefiningOp(); + auto srcConvert = callOp.getOperand(1).getDefiningOp(); + + Value destBox = destConvert.getValue(); + Value srcBox = srcConvert.getValue(); + + // get defining alloca op of destBox and srcBox + auto destAlloca = destBox.getDefiningOp(); + + if (!destAlloca) { + emitError(loc, "Unimplemented: FortranAssign to OpenMP lowering\n"); + return; + } + + // get the store op that stores to the alloca + for (auto user : destAlloca->getUsers()) { + if (auto storeOp = dyn_cast(user)) { + destBox = storeOp.getValue(); + break; + } + } + + builder.setInsertionPoint(teamsOp); + // Load destination array box (if it's a reference) + Value arrayBox = destBox; + if (isa(destBox.getType())) + arrayBox = builder.create(loc, destBox); + + auto scalarValue = builder.create(loc, srcBox); + Value scalar = builder.create(loc, scalarValue); + + // Calculate total number of elements (flattened) + auto c0 = builder.create(loc, 0); + auto c1 = builder.create(loc, 1); + Value totalElems = CalculateTotalElements(builder, loc, arrayBox); + + auto *workdistributeBlock = &workdistribute.getRegion().front(); + 
builder.setInsertionPointToStart(workdistributeBlock); + // Create single unordered loop for flattened array + auto doLoop = fir::DoLoopOp::create(builder, loc, c0, totalElems, c1, true); + Block *loopBlock = &doLoop.getRegion().front(); + builder.setInsertionPointToStart(doLoop.getBody()); + + auto flatIdx = loopBlock->getArgument(0); + SmallVector indices = + convertFlatToMultiDim(builder, loc, flatIdx, arrayBox); + // Use fir.array_coor for linear addressing + auto elemPtr = fir::ArrayCoorOp::create( + builder, loc, fir::ReferenceType::get(scalar.getType()), arrayBox, + nullptr, nullptr, ValueRange{indices}, ValueRange{}); + + builder.create(loc, scalar, elemPtr); +} + +/// workdistributeRuntimeCallLower method finds the runtime calls +/// nested in teams {workdistribute{}} and +/// lowers FortranAAssign to unordered do loop if src is scalar and dest is +/// array. Other runtime calls are not handled currently. +static FailureOr +workdistributeRuntimeCallLower(omp::WorkdistributeOp workdistribute, + SetVector &targetOpsToProcess) { + OpBuilder rewriter(workdistribute); + auto loc = workdistribute->getLoc(); + auto teams = dyn_cast(workdistribute->getParentOp()); + if (!teams) { + emitError(loc, "workdistribute not nested in teams\n"); + return failure(); + } + if (workdistribute.getRegion().getBlocks().size() != 1) { + emitError(loc, "workdistribute with multiple blocks\n"); + return failure(); + } + if (teams.getRegion().getBlocks().size() != 1) { + emitError(loc, "teams with multiple blocks\n"); + return failure(); + } + bool changed = false; + // Get the target op parent of teams + omp::TargetOp targetOp = dyn_cast(teams->getParentOp()); + SmallVector opsToErase; + for (auto &op : workdistribute.getOps()) { + if (isRuntimeCall(&op)) { + rewriter.setInsertionPoint(&op); + fir::CallOp runtimeCall = cast(op); + auto funcName = runtimeCall.getCallee()->getRootReference().getValue(); + if (funcName == FortranAssignStr) { + if 
(isFortranAssignSrcScalarAndDestArray(runtimeCall) && targetOp) { + // Record the target ops to process later + targetOpsToProcess.insert(targetOp); + replaceWithUnorderedDoLoop(rewriter, loc, teams, workdistribute, + runtimeCall); + opsToErase.push_back(&op); + changed = true; + } + } + } + } + // Erase the runtime calls that have been replaced. + for (auto *op : opsToErase) { + op->erase(); + } + return changed; +} + +/// teamsWorkdistributeToSingleOp method hoists all the ops inside +/// teams {workdistribute{}} before teams op. +/// +/// If A() and B () are present inside teams workdistribute +/// +/// omp.teams { +/// omp.workdistribute { +/// A() +/// B() +/// } +/// } +/// +/// Then, its lowered to +/// +/// A() +/// B() +/// +/// If only the terminator remains in teams after hoisting, we erase teams op. +static bool +teamsWorkdistributeToSingleOp(omp::TeamsOp teamsOp, + SetVector &targetOpsToProcess) { + auto workdistributeOp = getPerfectlyNested(teamsOp); + if (!workdistributeOp) + return false; + // Get the block containing teamsOp (the parent block). + Block *parentBlock = teamsOp->getBlock(); + Block &workdistributeBlock = *workdistributeOp.getRegion().begin(); + // Record the target ops to process later + for (auto &op : workdistributeBlock.getOperations()) { + if (shouldParallelize(&op)) { + auto targetOp = dyn_cast(teamsOp->getParentOp()); + if (targetOp) { + targetOpsToProcess.insert(targetOp); + } + } + } + auto insertPoint = Block::iterator(teamsOp); + // Get the range of operations to move (excluding the terminator). + auto workdistributeBegin = workdistributeBlock.begin(); + auto workdistributeEnd = workdistributeBlock.getTerminator()->getIterator(); + // Move the operations from workdistribute block to before teamsOp. + parentBlock->getOperations().splice(insertPoint, + workdistributeBlock.getOperations(), + workdistributeBegin, workdistributeEnd); + // Erase the now-empty workdistributeOp. 
+ workdistributeOp.erase(); + Block &teamsBlock = *teamsOp.getRegion().begin(); + // Check if only the terminator remains and erase teams op. + if (teamsBlock.getOperations().size() == 1 && + teamsBlock.getTerminator() != nullptr) { + teamsOp.erase(); + } + return true; +} + +/// If multiple workdistribute are nested in a target regions, we will need to +/// split the target region, but we want to preserve the data semantics of the +/// original data region and avoid unnecessary data movement at each of the +/// subkernels - we split the target region into a target_data{target} +/// nest where only the outer one moves the data +FailureOr splitTargetData(omp::TargetOp targetOp, + RewriterBase &rewriter) { + auto loc = targetOp->getLoc(); + if (targetOp.getMapVars().empty()) { + emitError(loc, "Target region has no data maps\n"); + return failure(); + } + // Collect all the mapinfo ops + SmallVector mapInfos; + for (auto opr : targetOp.getMapVars()) { + auto mapInfo = cast(opr.getDefiningOp()); + mapInfos.push_back(mapInfo); + } + + rewriter.setInsertionPoint(targetOp); + SmallVector innerMapInfos; + SmallVector outerMapInfos; + // Create new mapinfo ops for the inner target region + for (auto mapInfo : mapInfos) { + auto originalMapType = + (llvm::omp::OpenMPOffloadMappingFlags)(mapInfo.getMapType()); + auto originalCaptureType = mapInfo.getMapCaptureType(); + llvm::omp::OpenMPOffloadMappingFlags newMapType; + mlir::omp::VariableCaptureKind newCaptureType; + // For bycopy, we keep the same map type and capture type + // For byref, we change the map type to none and keep the capture type + if (originalCaptureType == mlir::omp::VariableCaptureKind::ByCopy) { + newMapType = originalMapType; + newCaptureType = originalCaptureType; + } else if (originalCaptureType == mlir::omp::VariableCaptureKind::ByRef) { + newMapType = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + newCaptureType = originalCaptureType; + outerMapInfos.push_back(mapInfo); + } else { + 
emitError(targetOp->getLoc(), "Unhandled case"); + return failure(); + } + auto innerMapInfo = cast(rewriter.clone(*mapInfo)); + innerMapInfo.setMapTypeAttr(rewriter.getIntegerAttr( + rewriter.getIntegerType(64, false), + static_cast< + std::underlying_type_t>( + newMapType))); + innerMapInfo.setMapCaptureType(newCaptureType); + innerMapInfos.push_back(innerMapInfo.getResult()); + } + + rewriter.setInsertionPoint(targetOp); + auto device = targetOp.getDevice(); + auto ifExpr = targetOp.getIfExpr(); + auto deviceAddrVars = targetOp.getHasDeviceAddrVars(); + auto devicePtrVars = targetOp.getIsDevicePtrVars(); + // Create the target data op + auto targetDataOp = rewriter.create( + loc, device, ifExpr, outerMapInfos, deviceAddrVars, devicePtrVars); + auto taregtDataBlock = rewriter.createBlock(&targetDataOp.getRegion()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(taregtDataBlock); + // Create the inner target op + auto newTargetOp = rewriter.create( + targetOp.getLoc(), targetOp.getAllocateVars(), + targetOp.getAllocatorVars(), targetOp.getBareAttr(), + targetOp.getDependKindsAttr(), targetOp.getDependVars(), + targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), + targetOp.getHostEvalVars(), targetOp.getIfExpr(), + targetOp.getInReductionVars(), targetOp.getInReductionByrefAttr(), + targetOp.getInReductionSymsAttr(), targetOp.getIsDevicePtrVars(), + innerMapInfos, targetOp.getNowaitAttr(), targetOp.getPrivateVars(), + targetOp.getPrivateSymsAttr(), targetOp.getPrivateNeedsBarrierAttr(), + targetOp.getThreadLimit(), targetOp.getPrivateMapsAttr()); + rewriter.inlineRegionBefore(targetOp.getRegion(), newTargetOp.getRegion(), + newTargetOp.getRegion().begin()); + rewriter.replaceOp(targetOp, targetDataOp); + return newTargetOp; +} + +/// getNestedOpToIsolate function is designed to identify a specific teams +/// parallel op within the body of an omp::TargetOp that should be "isolated." 
+/// This returns a tuple of the op, whether it is the first op in targetBlock, +/// and whether it is the last op in the target block. +static std::optional> +getNestedOpToIsolate(omp::TargetOp targetOp) { + if (targetOp.getRegion().empty()) + return std::nullopt; + auto *targetBlock = &targetOp.getRegion().front(); + for (auto &op : *targetBlock) { + bool first = &op == &*targetBlock->begin(); + bool last = op.getNextNode() == targetBlock->getTerminator(); + if (first && last) + return std::nullopt; + + if (isa(&op)) + return {{&op, first, last}}; + } + return std::nullopt; +} + +/// Temporary structure to hold the two mapinfo ops +struct TempOmpVar { + omp::MapInfoOp from, to; +}; + +/// isPtr checks if the type is a pointer or reference type. +static bool isPtr(Type ty) { + return isa(ty) || isa(ty); +} + +/// getPtrTypeForOmp returns an LLVM pointer type when the given type is a +/// pointer or reference type, and a fir.ref of the given type otherwise. +static Type getPtrTypeForOmp(Type ty) { + if (isPtr(ty)) + return LLVM::LLVMPointerType::get(ty.getContext()); + else + return fir::ReferenceType::get(ty); +} + +/// allocateTempOmpVar allocates a temporary variable for OpenMP mapping +static TempOmpVar allocateTempOmpVar(Location loc, Type ty, + RewriterBase &rewriter) { + MLIRContext &ctx = *ty.getContext(); + Value alloc; + Type allocType; + auto llvmPtrTy = LLVM::LLVMPointerType::get(&ctx); + // Get the appropriate type for allocation + if (isPtr(ty)) { + Type intTy = rewriter.getI32Type(); + auto one = rewriter.create(loc, intTy, 1); + allocType = llvmPtrTy; + alloc = rewriter.create(loc, llvmPtrTy, allocType, one); + allocType = intTy; + } else { + allocType = ty; + alloc = rewriter.create(loc, allocType); + } + // Lambda to create mapinfo ops + auto getMapInfo = [&](uint64_t mappingFlags, const char *name) { + return rewriter.create( + loc, alloc.getType(), alloc, TypeAttr::get(allocType), + rewriter.getIntegerAttr(rewriter.getIntegerType(64, /*isSigned=*/false), + mappingFlags), + rewriter.getAttr( + omp::VariableCaptureKind::ByRef), +
/*varPtrPtr=*/Value{}, + /*members=*/SmallVector{}, + /*member_index=*/mlir::ArrayAttr{}, + /*bounds=*/ValueRange(), + /*mapperId=*/mlir::FlatSymbolRefAttr(), + /*name=*/rewriter.getStringAttr(name), rewriter.getBoolAttr(false)); + }; + // Create mapinfo ops. + uint64_t mapFrom = + static_cast>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); + uint64_t mapTo = + static_cast>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); + auto mapInfoFrom = getMapInfo(mapFrom, "__flang_workdistribute_from"); + auto mapInfoTo = getMapInfo(mapTo, "__flang_workdistribute_to"); + return TempOmpVar{mapInfoFrom, mapInfoTo}; +} + +// usedOutsideSplit checks if a value is used outside the split operation. +static bool usedOutsideSplit(Value v, Operation *split) { + if (!split) + return false; + auto targetOp = cast(split->getParentOp()); + auto *targetBlock = &targetOp.getRegion().front(); + for (auto *user : v.getUsers()) { + while (user->getBlock() != targetBlock) { + user = user->getParentOp(); + } + if (!user->isBeforeInBlock(split)) + return true; + } + return false; +} + +/// isRecomputableAfterFission checks if an operation can be recomputed +static bool isRecomputableAfterFission(Operation *op, Operation *splitBefore) { + // If the op has side effects, it cannot be recomputed. + // We consider fir.declare as having no side effects. + return isa(op) || isMemoryEffectFree(op); +} + +/// collectNonRecomputableDeps collects dependencies that cannot be recomputed +static void collectNonRecomputableDeps(Value &v, omp::TargetOp targetOp, + SetVector &nonRecomputable, + SetVector &toCache, + SetVector &toRecompute) { + Operation *op = v.getDefiningOp(); + // If v is a block argument, it must be from the targetOp. + if (!op) { + assert(cast(v).getOwner()->getParentOp() == targetOp); + return; + } + // If the op is in the nonRecomputable set, add it to toCache and return. + if (nonRecomputable.contains(op)) { + toCache.insert(op); + return; + } + // Add the op to toRecompute. 
+ toRecompute.insert(op); + for (auto opr : op->getOperands()) + collectNonRecomputableDeps(opr, targetOp, nonRecomputable, toCache, + toRecompute); +} + +/// createBlockArgsAndMap creates block arguments and maps them +static void createBlockArgsAndMap(Location loc, RewriterBase &rewriter, + omp::TargetOp &targetOp, Block *targetBlock, + Block *newTargetBlock, + SmallVector &hostEvalVars, + SmallVector &mapOperands, + SmallVector &allocs, + IRMapping &irMapping) { + // FIRST: Map `host_eval_vars` to block arguments + unsigned originalHostEvalVarsSize = targetOp.getHostEvalVars().size(); + for (unsigned i = 0; i < hostEvalVars.size(); ++i) { + Value originalValue; + BlockArgument newArg; + if (i < originalHostEvalVarsSize) { + originalValue = targetBlock->getArgument(i); // Host_eval args come first + newArg = newTargetBlock->addArgument(originalValue.getType(), + originalValue.getLoc()); + } else { + originalValue = hostEvalVars[i]; + newArg = newTargetBlock->addArgument(originalValue.getType(), + originalValue.getLoc()); + } + irMapping.map(originalValue, newArg); + } + + // SECOND: Map `map_operands` to block arguments + unsigned originalMapVarsSize = targetOp.getMapVars().size(); + for (unsigned i = 0; i < mapOperands.size(); ++i) { + Value originalValue; + BlockArgument newArg; + // Map the new arguments from the original block. + if (i < originalMapVarsSize) { + originalValue = targetBlock->getArgument(originalHostEvalVarsSize + + i); // Offset by host_eval count + newArg = newTargetBlock->addArgument(originalValue.getType(), + originalValue.getLoc()); + } + // Map the new arguments from the `allocs`. 
+ else { + originalValue = allocs[i - originalMapVarsSize]; + newArg = newTargetBlock->addArgument( + getPtrTypeForOmp(originalValue.getType()), originalValue.getLoc()); + } + irMapping.map(originalValue, newArg); + } + + // THIRD: Map `private_vars` to block arguments (if any) + unsigned originalPrivateVarsSize = targetOp.getPrivateVars().size(); + for (unsigned i = 0; i < originalPrivateVarsSize; ++i) { + auto originalArg = targetBlock->getArgument(originalHostEvalVarsSize + + originalMapVarsSize + i); + auto newArg = newTargetBlock->addArgument(originalArg.getType(), + originalArg.getLoc()); + irMapping.map(originalArg, newArg); + } + return; +} + +/// reloadCacheAndRecompute reloads cached values and recomputes operations +static void reloadCacheAndRecompute( + Location loc, RewriterBase &rewriter, Operation *splitBefore, + omp::TargetOp &targetOp, Block *targetBlock, Block *newTargetBlock, + SmallVector &hostEvalVars, SmallVector &mapOperands, + SmallVector &allocs, SetVector &toRecompute, + IRMapping &irMapping) { + // Handle the load operations for the allocs. + rewriter.setInsertionPointToStart(newTargetBlock); + auto llvmPtrTy = LLVM::LLVMPointerType::get(targetOp.getContext()); + + unsigned originalMapVarsSize = targetOp.getMapVars().size(); + unsigned hostEvalVarsSize = hostEvalVars.size(); + // Create load operations for each allocated variable. + for (unsigned i = 0; i < allocs.size(); ++i) { + Value original = allocs[i]; + // Get the new block argument for this specific allocated value. + Value newArg = + newTargetBlock->getArgument(hostEvalVarsSize + originalMapVarsSize + i); + Value restored; + // If the original value is a pointer or reference, load and convert if + // necessary. 
+ if (isPtr(original.getType())) { + restored = rewriter.create(loc, llvmPtrTy, newArg); + if (!isa(original.getType())) + restored = + rewriter.create(loc, original.getType(), restored); + } else { + restored = rewriter.create(loc, newArg); + } + irMapping.map(original, restored); + } + // Clone the operations if they are in the toRecompute set. + for (auto it = targetBlock->begin(); it != splitBefore->getIterator(); it++) { + if (toRecompute.contains(&*it)) + rewriter.clone(*it, irMapping); + } +} + +/// Given a teamsOp, navigate down the nested structure to find the +/// innermost LoopNestOp. The expected nesting is: +/// teams -> parallel -> distribute -> wsloop -> loop_nest +static mlir::omp::LoopNestOp getLoopNestFromTeams(mlir::omp::TeamsOp teamsOp) { + if (teamsOp.getRegion().empty()) + return nullptr; + // Ensure the teams region has a single block. + if (teamsOp.getRegion().getBlocks().size() != 1) + return nullptr; + // Find parallel op inside teams + mlir::omp::ParallelOp parallelOp = nullptr; + // Look for the parallel op in the teams region + for (auto &op : teamsOp.getRegion().front()) { + if (auto parallel = dyn_cast(op)) { + parallelOp = parallel; + break; + } + } + if (!parallelOp) + return nullptr; + + // Find distribute op inside parallel + mlir::omp::DistributeOp distributeOp = nullptr; + for (auto &op : parallelOp.getRegion().front()) { + if (auto distribute = dyn_cast(op)) { + distributeOp = distribute; + break; + } + } + if (!distributeOp) + return nullptr; + + // Find wsloop op inside distribute + mlir::omp::WsloopOp wsloopOp = nullptr; + for (auto &op : distributeOp.getRegion().front()) { + if (auto wsloop = dyn_cast(op)) { + wsloopOp = wsloop; + break; + } + } + if (!wsloopOp) + return nullptr; + + // Find loop_nest op inside wsloop + for (auto &op : wsloopOp.getRegion().front()) { + if (auto loopNest = dyn_cast(op)) { + return loopNest; + } + } + + return nullptr; +} + +/// Generate LLVM constant operations for i32 and i64 types. 
+static mlir::LLVM::ConstantOp +genI32Constant(mlir::Location loc, mlir::RewriterBase &rewriter, int value) { + mlir::Type i32Ty = rewriter.getI32Type(); + mlir::IntegerAttr attr = rewriter.getI32IntegerAttr(value); + return rewriter.create(loc, i32Ty, attr); +} + +/// Given a box descriptor, extract the base address of the data it describes. +/// If the box descriptor is a reference, load it first. +/// The base address is returned as an i8* pointer. +static Value genDescriptorGetBaseAddress(fir::FirOpBuilder &builder, + Location loc, Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetBaseAddress"); + auto i8Type = builder.getI8Type(); + auto unknownArrayType = + fir::SequenceType::get({fir::SequenceType::getUnknownExtent()}, i8Type); + auto i8BoxType = fir::BoxType::get(unknownArrayType); + auto typedBox = fir::ConvertOp::create(builder, loc, i8BoxType, box); + auto rawAddr = fir::BoxAddrOp::create(builder, loc, typedBox); + return rawAddr; +} + +/// Given a box descriptor, extract the total number of elements in the array it +/// describes. If the box descriptor is a reference, load it first. +/// The total number of elements is returned as an i64 value. +static Value genDescriptorGetTotalElements(fir::FirOpBuilder &builder, + Location loc, Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetTotalElements"); + auto i64Type = builder.getI64Type(); + return fir::BoxTotalElementsOp::create(builder, loc, i64Type, box); +} + +/// Given a box descriptor, extract the size of each element in the array it +/// describes. If the box descriptor is a reference, load it first. +/// The element size is returned as an i64 value. 
+static Value genDescriptorGetEleSize(fir::FirOpBuilder &builder, Location loc, + Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetElementSize"); + auto i64Type = builder.getI64Type(); + return fir::BoxEleSizeOp::create(builder, loc, i64Type, box); +} + +/// Given a box descriptor, compute the total size in bytes of the data it +/// describes. This is done by multiplying the total number of elements by the +/// size of each element. If the box descriptor is a reference, load it first. +/// The total size in bytes is returned as an i64 value. +static Value genDescriptorGetDataSizeInBytes(fir::FirOpBuilder &builder, + Location loc, Value boxDesc) { + Value box = boxDesc; + if (auto refBox = dyn_cast(boxDesc.getType())) { + box = fir::LoadOp::create(builder, loc, boxDesc); + } + assert(isa(box.getType()) && + "Unknown type passed to genDescriptorGetElementSize"); + Value eleSize = genDescriptorGetEleSize(builder, loc, box); + Value totalElements = genDescriptorGetTotalElements(builder, loc, box); + return mlir::arith::MulIOp::create(builder, loc, totalElements, eleSize); +} + +/// Generate a call to the OpenMP runtime function `omp_get_mapped_ptr` to +/// retrieve the device pointer corresponding to a given host pointer and device +/// number. If no mapping exists, the original host pointer is returned. 
+/// Signature: +/// void *omp_get_mapped_ptr(void *host_ptr, int device_num); +static mlir::Value genOmpGetMappedPtrIfPresent(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value hostPtr, + mlir::Value deviceNum, + mlir::ModuleOp module) { + auto *context = builder.getContext(); + auto voidPtrType = fir::LLVMPointerType::get(context, builder.getI8Type()); + auto i32Type = builder.getI32Type(); + auto funcName = "omp_get_mapped_ptr"; + auto funcOp = module.lookupSymbol(funcName); + + if (!funcOp) { + auto funcType = + mlir::FunctionType::get(context, {voidPtrType, i32Type}, {voidPtrType}); + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(module.getBody()); + + funcOp = mlir::func::FuncOp::create(builder, loc, funcName, funcType); + funcOp.setPrivate(); + } + + llvm::SmallVector args; + args.push_back(fir::ConvertOp::create(builder, loc, voidPtrType, hostPtr)); + args.push_back(fir::ConvertOp::create(builder, loc, i32Type, deviceNum)); + auto callOp = fir::CallOp::create(builder, loc, funcOp, args); + auto mappedPtr = callOp.getResult(0); + auto isNull = builder.genIsNullAddr(loc, mappedPtr); + auto convertedHostPtr = + fir::ConvertOp::create(builder, loc, voidPtrType, hostPtr); + auto result = arith::SelectOp::create(builder, loc, isNull, convertedHostPtr, + mappedPtr); + return result; +} + +/// Generate a call to the OpenMP runtime function `omp_target_memcpy` to +/// perform memory copy between host and device or between devices. 
+/// Signature: +/// int omp_target_memcpy(void *dst, const void *src, size_t length, +/// size_t dst_offset, size_t src_offset, +/// int dst_device, int src_device); +static void genOmpTargetMemcpyCall(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value dst, + mlir::Value src, mlir::Value length, + mlir::Value dstOffset, mlir::Value srcOffset, + mlir::Value device, mlir::ModuleOp module) { + auto *context = builder.getContext(); + auto funcName = "omp_target_memcpy"; + auto voidPtrType = fir::LLVMPointerType::get(context, builder.getI8Type()); + auto sizeTType = builder.getI64Type(); // assuming size_t is 64-bit + auto i32Type = builder.getI32Type(); + auto funcOp = module.lookupSymbol(funcName); + + if (!funcOp) { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(module.getBody()); + llvm::SmallVector argTypes = { + voidPtrType, voidPtrType, sizeTType, sizeTType, + sizeTType, i32Type, i32Type}; + auto funcType = mlir::FunctionType::get(context, argTypes, {i32Type}); + funcOp = mlir::func::FuncOp::create(builder, loc, funcName, funcType); + funcOp.setPrivate(); + } + + llvm::SmallVector args{dst, src, length, dstOffset, + srcOffset, device, device}; + fir::CallOp::create(builder, loc, funcOp, args); + return; +} + +/// Generate code to replace a Fortran array assignment call with OpenMP +/// runtime calls to perform the equivalent operation on the device. +/// This involves extracting the source and destination pointers from the +/// Fortran array descriptors, retrieving their mapped device pointers (if any), +/// and invoking `omp_target_memcpy` to copy the data on the device. 
+static void genFortranAssignOmpReplacement(fir::FirOpBuilder &builder, + mlir::Location loc, + fir::CallOp callOp, + mlir::Value device, + mlir::ModuleOp module) { + assert(callOp.getNumResults() == 0 && + "Expected _FortranAAssign to have no results"); + assert(callOp.getNumOperands() >= 2 && + "Expected _FortranAAssign to have at least two operands"); + + // Extract the source and destination pointers from the call operands. + mlir::Value dest = callOp.getOperand(0); + mlir::Value src = callOp.getOperand(1); + + // Get the base addresses of the source and destination arrays. + mlir::Value srcBase = genDescriptorGetBaseAddress(builder, loc, src); + mlir::Value destBase = genDescriptorGetBaseAddress(builder, loc, dest); + + // Get the total size in bytes of the data to be copied. + mlir::Value srcDataSize = genDescriptorGetDataSizeInBytes(builder, loc, src); + + // Retrieve the mapped device pointers for source and destination. + // If no mapping exists, the original host pointer is used. + Value destPtr = + genOmpGetMappedPtrIfPresent(builder, loc, destBase, device, module); + Value srcPtr = + genOmpGetMappedPtrIfPresent(builder, loc, srcBase, device, module); + Value zero = builder.create(loc, builder.getI64Type(), + builder.getI64IntegerAttr(0)); + + // Generate the call to omp_target_memcpy to perform the data copy on the + // device. + genOmpTargetMemcpyCall(builder, loc, destPtr, srcPtr, srcDataSize, zero, zero, + device, module); +} + +/// Struct to hold the host eval vars corresponding to loop bounds and steps +struct HostEvalVars { + SmallVector lbs; + SmallVector ubs; + SmallVector steps; +}; + +/// moveToHost method clones all the ops from target region outside of it. +/// It hoists runtime function "_FortranAAssign" and replaces it with omp +/// version. 
Also hoists and replaces fir.allocmem with omp.target_allocmem and +/// fir.freemem with omp.target_freemem +static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter, + mlir::ModuleOp module, + struct HostEvalVars &hostEvalVars) { + OpBuilder::InsertionGuard guard(rewriter); + Block *targetBlock = &targetOp.getRegion().front(); + assert(targetBlock == &targetOp.getRegion().back()); + IRMapping mapping; + + // Get the parent target_data op + auto targetDataOp = cast(targetOp->getParentOp()); + if (!targetDataOp) { + emitError(targetOp->getLoc(), + "Expected target op to be inside target_data op"); + return failure(); + } + // create mapping for host_eval_vars + unsigned hostEvalVarCount = targetOp.getHostEvalVars().size(); + for (unsigned i = 0; i < targetOp.getHostEvalVars().size(); ++i) { + Value hostEvalVar = targetOp.getHostEvalVars()[i]; + BlockArgument arg = targetBlock->getArguments()[i]; + mapping.map(arg, hostEvalVar); + } + // create mapping for map_vars + for (unsigned i = 0; i < targetOp.getMapVars().size(); ++i) { + Value mapInfo = targetOp.getMapVars()[i]; + BlockArgument arg = targetBlock->getArguments()[hostEvalVarCount + i]; + Operation *op = mapInfo.getDefiningOp(); + assert(op); + auto mapInfoOp = cast(op); + // map the block argument to the host-side variable pointer + mapping.map(arg, mapInfoOp.getVarPtr()); + } + // create mapping for private_vars + unsigned mapSize = targetOp.getMapVars().size(); + for (unsigned i = 0; i < targetOp.getPrivateVars().size(); ++i) { + Value privateVar = targetOp.getPrivateVars()[i]; + // The mapping should link the device-side variable to the host-side one. + BlockArgument arg = + targetBlock->getArguments()[hostEvalVarCount + mapSize + i]; + // Map the device-side copy (`arg`) to the host-side value (`privateVar`). 
+ mapping.map(arg, privateVar); + } + + rewriter.setInsertionPoint(targetOp); + SmallVector opsToReplace; + Value device = targetOp.getDevice(); + + // If device is not specified, default to device 0. + if (!device) { + device = genI32Constant(targetOp.getLoc(), rewriter, 0); + } + // Clone all operations. + for (auto it = targetBlock->begin(), end = std::prev(targetBlock->end()); + it != end; ++it) { + auto *op = &*it; + Operation *clonedOp = rewriter.clone(*op, mapping); + // Map the results of the original op to the cloned op. + for (unsigned i = 0; i < op->getNumResults(); ++i) { + mapping.map(op->getResult(i), clonedOp->getResult(i)); + } + // fir.declare changes its type when hoisting it out of omp.target to + // omp.target_data Introduce a load, if original declareOp input is not of + // reference type, but cloned delcareOp input is reference type. + if (fir::DeclareOp clonedDeclareOp = dyn_cast(clonedOp)) { + auto originalDeclareOp = cast(op); + Type originalInType = originalDeclareOp.getMemref().getType(); + Type clonedInType = clonedDeclareOp.getMemref().getType(); + + fir::ReferenceType originalRefType = + dyn_cast(originalInType); + fir::ReferenceType clonedRefType = + dyn_cast(clonedInType); + if (!originalRefType && clonedRefType) { + Type clonedEleTy = clonedRefType.getElementType(); + if (clonedEleTy == originalDeclareOp.getType()) { + opsToReplace.push_back(clonedOp); + } + } + } + // Collect the ops to be replaced. + if (isa(clonedOp) || isa(clonedOp)) + opsToReplace.push_back(clonedOp); + // Check for runtime calls to be replaced. + if (isRuntimeCall(clonedOp)) { + fir::CallOp runtimeCall = cast(op); + auto funcName = runtimeCall.getCallee()->getRootReference().getValue(); + if (funcName == FortranAssignStr) { + opsToReplace.push_back(clonedOp); + } else { + emitError(runtimeCall->getLoc(), "Unhandled runtime call hoisting."); + return failure(); + } + } + } + // Replace fir.allocmem with omp.target_allocmem. 
+ for (Operation *op : opsToReplace) { + if (auto allocOp = dyn_cast(op)) { + rewriter.setInsertionPoint(allocOp); + auto ompAllocmemOp = rewriter.create( + allocOp.getLoc(), rewriter.getI64Type(), device, + allocOp.getInTypeAttr(), allocOp.getUniqNameAttr(), + allocOp.getBindcNameAttr(), allocOp.getTypeparams(), + allocOp.getShape()); + auto firConvertOp = rewriter.create( + allocOp.getLoc(), allocOp.getResult().getType(), + ompAllocmemOp.getResult()); + rewriter.replaceOp(allocOp, firConvertOp.getResult()); + } + // Replace fir.freemem with omp.target_freemem. + else if (auto freeOp = dyn_cast(op)) { + rewriter.setInsertionPoint(freeOp); + auto firConvertOp = rewriter.create( + freeOp.getLoc(), rewriter.getI64Type(), freeOp.getHeapref()); + rewriter.create(freeOp.getLoc(), device, + firConvertOp.getResult()); + rewriter.eraseOp(freeOp); + } + // fir.declare changes its type when hoisting it out of omp.target to + // omp.target_data Introduce a load, if original declareOp input is not of + // reference type, but cloned delcareOp input is reference type. + else if (fir::DeclareOp clonedDeclareOp = dyn_cast(op)) { + Type clonedInType = clonedDeclareOp.getMemref().getType(); + fir::ReferenceType clonedRefType = + dyn_cast(clonedInType); + Type clonedEleTy = clonedRefType.getElementType(); + rewriter.setInsertionPoint(op); + Value loadedValue = rewriter.create( + clonedDeclareOp.getLoc(), clonedEleTy, clonedDeclareOp.getMemref()); + clonedDeclareOp.getResult().replaceAllUsesWith(loadedValue); + } + // Replace runtime calls with omp versions. 
+ else if (isRuntimeCall(op)) { + fir::CallOp runtimeCall = cast(op); + auto funcName = runtimeCall.getCallee()->getRootReference().getValue(); + if (funcName == FortranAssignStr) { + rewriter.setInsertionPoint(op); + fir::FirOpBuilder builder{rewriter, op}; + + mlir::Location loc = runtimeCall.getLoc(); + genFortranAssignOmpReplacement(builder, loc, runtimeCall, device, + module); + rewriter.eraseOp(op); + } else { + emitError(runtimeCall->getLoc(), "Unhandled runtime call hoisting."); + return failure(); + } + } else { + emitError(op->getLoc(), "Unhandled op hoisting."); + return failure(); + } + } + + // Update the host_eval_vars to use the mapped values. + for (size_t i = 0; i < hostEvalVars.lbs.size(); ++i) { + hostEvalVars.lbs[i] = mapping.lookup(hostEvalVars.lbs[i]); + hostEvalVars.ubs[i] = mapping.lookup(hostEvalVars.ubs[i]); + hostEvalVars.steps[i] = mapping.lookup(hostEvalVars.steps[i]); + } + // Finally erase the original targetOp. + rewriter.eraseOp(targetOp); + return success(); +} + +/// Result of isolateOp method +struct SplitResult { + omp::TargetOp preTargetOp; + omp::TargetOp isolatedTargetOp; + omp::TargetOp postTargetOp; +}; + +/// computeAllocsCacheRecomputable method computes the allocs needed to cache +/// the values that are used outside the split point. It also computes the ops +/// that need to be cached and the ops that can be recomputed after the split. +static void computeAllocsCacheRecomputable( + omp::TargetOp targetOp, Operation *splitBeforeOp, RewriterBase &rewriter, + SmallVector &preMapOperands, SmallVector &postMapOperands, + SmallVector &allocs, SmallVector &requiredVals, + SetVector &nonRecomputable, SetVector &toCache, + SetVector &toRecompute) { + auto *targetBlock = &targetOp.getRegion().front(); + // Find all values that are used outside the split point. + for (auto it = targetBlock->begin(); it != splitBeforeOp->getIterator(); + it++) { + // Check if any of the results are used outside the split point. 
+ for (auto res : it->getResults()) { + if (usedOutsideSplit(res, splitBeforeOp)) { + requiredVals.push_back(res); + } + } + // If the op is not recomputable, add it to the nonRecomputable set. + if (!isRecomputableAfterFission(&*it, splitBeforeOp)) { + nonRecomputable.insert(&*it); + } + } + // For each required value, collect its dependencies. + for (auto requiredVal : requiredVals) + collectNonRecomputableDeps(requiredVal, targetOp, nonRecomputable, toCache, + toRecompute); + // For each op in toCache, create an alloc and update the pre and post map + // operands. + for (Operation *op : toCache) { + for (auto res : op->getResults()) { + auto alloc = + allocateTempOmpVar(targetOp.getLoc(), res.getType(), rewriter); + allocs.push_back(res); + preMapOperands.push_back(alloc.from); + postMapOperands.push_back(alloc.to); + } + } +} + +/// genPreTargetOp method generates the preTargetOp that contains all the ops +/// before the split point. It also creates the block arguments and maps the +/// values accordingly. It also creates the store operations for the allocs. 
+static omp::TargetOp +genPreTargetOp(omp::TargetOp targetOp, SmallVector &preMapOperands, + SmallVector &allocs, Operation *splitBeforeOp, + RewriterBase &rewriter, struct HostEvalVars &hostEvalVars, + bool isTargetDevice) { + auto loc = targetOp.getLoc(); + auto *targetBlock = &targetOp.getRegion().front(); + SmallVector preHostEvalVars{targetOp.getHostEvalVars()}; + // update the hostEvalVars of preTargetOp + omp::TargetOp preTargetOp = rewriter.create( + targetOp.getLoc(), targetOp.getAllocateVars(), + targetOp.getAllocatorVars(), targetOp.getBareAttr(), + targetOp.getDependKindsAttr(), targetOp.getDependVars(), + targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), preHostEvalVars, + targetOp.getIfExpr(), targetOp.getInReductionVars(), + targetOp.getInReductionByrefAttr(), targetOp.getInReductionSymsAttr(), + targetOp.getIsDevicePtrVars(), preMapOperands, targetOp.getNowaitAttr(), + targetOp.getPrivateVars(), targetOp.getPrivateSymsAttr(), + targetOp.getPrivateNeedsBarrierAttr(), targetOp.getThreadLimit(), + targetOp.getPrivateMapsAttr()); + auto *preTargetBlock = rewriter.createBlock( + &preTargetOp.getRegion(), preTargetOp.getRegion().begin(), {}, {}); + IRMapping preMapping; + // Create block arguments and map the values. + createBlockArgsAndMap(loc, rewriter, targetOp, targetBlock, preTargetBlock, + preHostEvalVars, preMapOperands, allocs, preMapping); + + // Handle the store operations for the allocs. + rewriter.setInsertionPointToStart(preTargetBlock); + auto llvmPtrTy = LLVM::LLVMPointerType::get(targetOp.getContext()); + + // Clone the original operations. + for (auto it = targetBlock->begin(); it != splitBeforeOp->getIterator(); + it++) { + rewriter.clone(*it, preMapping); + } + + unsigned originalHostEvalVarsSize = preHostEvalVars.size(); + unsigned originalMapVarsSize = targetOp.getMapVars().size(); + // Create Stores for allocs. 
+ for (unsigned i = 0; i < allocs.size(); ++i) { + Value originalResult = allocs[i]; + Value toStore = preMapping.lookup(originalResult); + // Get the new block argument for this specific allocated value. + Value newArg = preTargetBlock->getArgument(originalHostEvalVarsSize + + originalMapVarsSize + i); + // Create the store operation. + if (isPtr(originalResult.getType())) { + if (!isa(toStore.getType())) + toStore = rewriter.create(loc, llvmPtrTy, toStore); + rewriter.create(loc, toStore, newArg); + } else { + rewriter.create(loc, toStore, newArg); + } + } + rewriter.create(loc); + + // Update hostEvalVars with the mapped values for the loop bounds if we have + // a loopNestOp and we are not generating code for the target device. + omp::LoopNestOp loopNestOp = + getLoopNestFromTeams(cast(splitBeforeOp)); + if (loopNestOp && !isTargetDevice) { + for (size_t i = 0; i < loopNestOp.getLoopLowerBounds().size(); ++i) { + Value lb = loopNestOp.getLoopLowerBounds()[i]; + Value ub = loopNestOp.getLoopUpperBounds()[i]; + Value step = loopNestOp.getLoopSteps()[i]; + + hostEvalVars.lbs.push_back(preMapping.lookup(lb)); + hostEvalVars.ubs.push_back(preMapping.lookup(ub)); + hostEvalVars.steps.push_back(preMapping.lookup(step)); + } + } + + return preTargetOp; +} + +/// genIsolatedTargetOp method generates the isolatedTargetOp that contains the +/// ops between the split point. It also creates the block arguments and maps +/// the values accordingly. It also creates the load operations for the allocs +/// and recomputes the necessary ops. 
+static omp::TargetOp +genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector &postMapOperands, + Operation *splitBeforeOp, RewriterBase &rewriter, + SmallVector &allocs, + SetVector &toRecompute, + struct HostEvalVars &hostEvalVars, bool isTargetDevice) { + auto loc = targetOp.getLoc(); + auto *targetBlock = &targetOp.getRegion().front(); + SmallVector isolatedHostEvalVars{targetOp.getHostEvalVars()}; + // update the hostEvalVars of isolatedTargetOp + if (!hostEvalVars.lbs.empty() && !isTargetDevice) { + isolatedHostEvalVars.append(hostEvalVars.lbs.begin(), + hostEvalVars.lbs.end()); + isolatedHostEvalVars.append(hostEvalVars.ubs.begin(), + hostEvalVars.ubs.end()); + isolatedHostEvalVars.append(hostEvalVars.steps.begin(), + hostEvalVars.steps.end()); + } + // Create the isolated target op + omp::TargetOp isolatedTargetOp = rewriter.create( + targetOp.getLoc(), targetOp.getAllocateVars(), + targetOp.getAllocatorVars(), targetOp.getBareAttr(), + targetOp.getDependKindsAttr(), targetOp.getDependVars(), + targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), + isolatedHostEvalVars, targetOp.getIfExpr(), targetOp.getInReductionVars(), + targetOp.getInReductionByrefAttr(), targetOp.getInReductionSymsAttr(), + targetOp.getIsDevicePtrVars(), postMapOperands, targetOp.getNowaitAttr(), + targetOp.getPrivateVars(), targetOp.getPrivateSymsAttr(), + targetOp.getPrivateNeedsBarrierAttr(), targetOp.getThreadLimit(), + targetOp.getPrivateMapsAttr()); + auto *isolatedTargetBlock = + rewriter.createBlock(&isolatedTargetOp.getRegion(), + isolatedTargetOp.getRegion().begin(), {}, {}); + IRMapping isolatedMapping; + // Create block arguments and map the values. + createBlockArgsAndMap(loc, rewriter, targetOp, targetBlock, + isolatedTargetBlock, isolatedHostEvalVars, + postMapOperands, allocs, isolatedMapping); + // Handle the load operations for the allocs and recompute ops. 
+ reloadCacheAndRecompute(loc, rewriter, splitBeforeOp, targetOp, targetBlock, + isolatedTargetBlock, isolatedHostEvalVars, + postMapOperands, allocs, toRecompute, + isolatedMapping); + + // Clone the original operations. + rewriter.clone(*splitBeforeOp, isolatedMapping); + rewriter.create(loc); + + // update the loop bounds in the isolatedTargetOp if we have host_eval vars + // and we are not generating code for the target device. + if (!hostEvalVars.lbs.empty() && !isTargetDevice) { + omp::TeamsOp teamsOp; + for (auto &op : *isolatedTargetBlock) { + if (isa(&op)) + teamsOp = cast(&op); + } + assert(teamsOp && "No teamsOp found in isolated target region"); + // Get the loopNestOp inside the teamsOp + auto loopNestOp = getLoopNestFromTeams(teamsOp); + // Get the BlockArgs related to host_eval vars and update loop_nest bounds + // to them + unsigned originalHostEvalVarsSize = targetOp.getHostEvalVars().size(); + unsigned index = originalHostEvalVarsSize; + // Replace loop bounds with the block arguments passed down via host_eval + SmallVector lbs, ubs, steps; + + // Collect new lb/ub/step values from target block args + for (size_t i = 0; i < hostEvalVars.lbs.size(); ++i) + lbs.push_back(isolatedTargetBlock->getArgument(index++)); + + for (size_t i = 0; i < hostEvalVars.ubs.size(); ++i) + ubs.push_back(isolatedTargetBlock->getArgument(index++)); + + for (size_t i = 0; i < hostEvalVars.steps.size(); ++i) + steps.push_back(isolatedTargetBlock->getArgument(index++)); + + // Reset the loop bounds + loopNestOp.getLoopLowerBoundsMutable().assign(lbs); + loopNestOp.getLoopUpperBoundsMutable().assign(ubs); + loopNestOp.getLoopStepsMutable().assign(steps); + } + + return isolatedTargetOp; +} + +/// genPostTargetOp method generates the postTargetOp that contains all the ops +/// after the split point. It also creates the block arguments and maps the +/// values accordingly. It also creates the load operations for the allocs +/// and recomputes the necessary ops. 
+static omp::TargetOp genPostTargetOp(omp::TargetOp targetOp, + Operation *splitBeforeOp, + SmallVector &postMapOperands, + RewriterBase &rewriter, + SmallVector &allocs, + SetVector &toRecompute) { + auto loc = targetOp.getLoc(); + auto *targetBlock = &targetOp.getRegion().front(); + SmallVector postHostEvalVars{targetOp.getHostEvalVars()}; + // Create the post target op + omp::TargetOp postTargetOp = rewriter.create( + targetOp.getLoc(), targetOp.getAllocateVars(), + targetOp.getAllocatorVars(), targetOp.getBareAttr(), + targetOp.getDependKindsAttr(), targetOp.getDependVars(), + targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), postHostEvalVars, + targetOp.getIfExpr(), targetOp.getInReductionVars(), + targetOp.getInReductionByrefAttr(), targetOp.getInReductionSymsAttr(), + targetOp.getIsDevicePtrVars(), postMapOperands, targetOp.getNowaitAttr(), + targetOp.getPrivateVars(), targetOp.getPrivateSymsAttr(), + targetOp.getPrivateNeedsBarrierAttr(), targetOp.getThreadLimit(), + targetOp.getPrivateMapsAttr()); + // Create the block for postTargetOp + auto *postTargetBlock = rewriter.createBlock( + &postTargetOp.getRegion(), postTargetOp.getRegion().begin(), {}, {}); + IRMapping postMapping; + // Create block arguments and map the values. + createBlockArgsAndMap(loc, rewriter, targetOp, targetBlock, postTargetBlock, + postHostEvalVars, postMapOperands, allocs, postMapping); + // Handle the load operations for the allocs and recompute ops. + reloadCacheAndRecompute(loc, rewriter, splitBeforeOp, targetOp, targetBlock, + postTargetBlock, postHostEvalVars, postMapOperands, + allocs, toRecompute, postMapping); + assert(splitBeforeOp->getNumResults() == 0 || + llvm::all_of(splitBeforeOp->getResults(), + [](Value result) { return result.use_empty(); })); + // Clone the original operations after the split point. 
+ for (auto it = std::next(splitBeforeOp->getIterator()); + it != targetBlock->end(); it++) + rewriter.clone(*it, postMapping); + return postTargetOp; +} + +/// isolateOp method rewrites a omp.target_data { omp.target } in to +/// omp.target_data { +/// // preTargetOp region contains ops before splitBeforeOp. +/// omp.target {} +/// // isolatedTargetOp region contains splitBeforeOp, +/// omp.target {} +/// // postTargetOp region contains ops after splitBeforeOp. +/// omp.target {} +/// } +/// It also handles the mapping of variables and the caching/recomputing +/// of values as needed. +static FailureOr isolateOp(Operation *splitBeforeOp, + bool splitAfter, RewriterBase &rewriter, + mlir::ModuleOp module, + bool isTargetDevice) { + auto targetOp = cast(splitBeforeOp->getParentOp()); + assert(targetOp); + rewriter.setInsertionPoint(targetOp); + + // Prepare the map operands for preTargetOp and postTargetOp + auto preMapOperands = SmallVector(targetOp.getMapVars()); + auto postMapOperands = SmallVector(targetOp.getMapVars()); + + // Vectors to hold analysis results + SmallVector requiredVals; + SetVector toCache; + SetVector toRecompute; + SetVector nonRecomputable; + SmallVector allocs; + struct HostEvalVars hostEvalVars; + + // Analyze the ops in target region to determine which ops need to be + // cached and which ops need to be recomputed + computeAllocsCacheRecomputable( + targetOp, splitBeforeOp, rewriter, preMapOperands, postMapOperands, + allocs, requiredVals, nonRecomputable, toCache, toRecompute); + + rewriter.setInsertionPoint(targetOp); + + // Generate the preTargetOp that contains all the ops before splitBeforeOp. + auto preTargetOp = + genPreTargetOp(targetOp, preMapOperands, allocs, splitBeforeOp, rewriter, + hostEvalVars, isTargetDevice); + + // Move the ops of preTarget to host. 
+ auto res = moveToHost(preTargetOp, rewriter, module, hostEvalVars); + if (failed(res)) + return failure(); + rewriter.setInsertionPoint(targetOp); + + // Generate the isolatedTargetOp + omp::TargetOp isolatedTargetOp = + genIsolatedTargetOp(targetOp, postMapOperands, splitBeforeOp, rewriter, + allocs, toRecompute, hostEvalVars, isTargetDevice); + + omp::TargetOp postTargetOp = nullptr; + // Generate the postTargetOp that contains all the ops after splitBeforeOp. + if (splitAfter) { + rewriter.setInsertionPoint(targetOp); + postTargetOp = genPostTargetOp(targetOp, splitBeforeOp, postMapOperands, + rewriter, allocs, toRecompute); + } + // Finally erase the original targetOp. + rewriter.eraseOp(targetOp); + return SplitResult{preTargetOp, isolatedTargetOp, postTargetOp}; +} + +/// Recursively fission target ops until no more nested ops can be isolated. +static LogicalResult fissionTarget(omp::TargetOp targetOp, + RewriterBase &rewriter, + mlir::ModuleOp module, bool isTargetDevice) { + auto tuple = getNestedOpToIsolate(targetOp); + if (!tuple) { + LLVM_DEBUG(llvm::dbgs() << " No op to isolate\n"); + struct HostEvalVars hostEvalVars; + return moveToHost(targetOp, rewriter, module, hostEvalVars); + } + Operation *toIsolate = std::get<0>(*tuple); + bool splitBefore = !std::get<1>(*tuple); + bool splitAfter = !std::get<2>(*tuple); + // Recursively isolate the target op. + if (splitBefore && splitAfter) { + auto res = + isolateOp(toIsolate, splitAfter, rewriter, module, isTargetDevice); + if (failed(res)) + return failure(); + return fissionTarget((*res).postTargetOp, rewriter, module, isTargetDevice); + } + // Isolate only before the op. + if (splitBefore) { + auto res = + isolateOp(toIsolate, splitAfter, rewriter, module, isTargetDevice); + if (failed(res)) + return failure(); + } else { + emitError(toIsolate->getLoc(), "Unhandled case in fissionTarget"); + return failure(); + } + return success(); +} + +/// Pass to lower omp.workdistribute ops. 
+///
+/// runOnOperation walks the module in several phases and signals pass failure
+/// as soon as a verification, fission or runtime-call lowering step fails.
+/// The target ops collected in targetOpsToProcess are split and fissioned only
+/// when one of the earlier phases reported a change.
+///
+/// NOTE(review): the template argument lists on `LowerWorkdistributeBase`,
+/// `SetVector` and `llvm::cast` appear to have been lost from this copy of
+/// the patch - restore them from the original before building.
+class LowerWorkdistributePass
+    : public flangomp::impl::LowerWorkdistributeBase {
+public:
+  void runOnOperation() override {
+    MLIRContext &context = getContext();
+    auto moduleOp = getOperation();
+    // Set when any lowering phase below reports that it modified the IR.
+    bool changed = false;
+    // Handed to the lowering helpers below and consumed by the final loop -
+    // presumably the helpers record the enclosing target ops here; confirm.
+    SetVector targetOpsToProcess;
+    // Phase 1: verify every workdistribute op; stop at the first failure.
+    auto verify =
+        moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) {
+          if (failed(verifyTargetTeamsWorkdistribute(workdistribute)))
+            return WalkResult::interrupt();
+          return WalkResult::advance();
+        });
+    if (verify.wasInterrupted())
+      return signalPassFailure();
+
+    // Phase 2: fission each workdistribute op.
+    auto fission =
+        moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) {
+          auto res = fissionWorkdistribute(workdistribute);
+          if (failed(res))
+            return WalkResult::interrupt();
+          changed |= *res;
+          return WalkResult::advance();
+        });
+    if (fission.wasInterrupted())
+      return signalPassFailure();
+
+    // Phase 3: lower runtime calls nested in workdistribute ops.
+    auto rtCallLower =
+        moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) {
+          auto res = workdistributeRuntimeCallLower(workdistribute,
+                                                    targetOpsToProcess);
+          if (failed(res))
+            return WalkResult::interrupt();
+          changed |= *res;
+          return WalkResult::advance();
+        });
+    if (rtCallLower.wasInterrupted())
+      return signalPassFailure();
+
+    // Phase 4: lower the loops in workdistribute ops. This helper has no
+    // failure channel here; its result only feeds `changed`.
+    moduleOp->walk([&](mlir::omp::WorkdistributeOp workdistribute) {
+      changed |= workdistributeDoLower(workdistribute, targetOpsToProcess);
+    });
+
+    // Phase 5: process the enclosing teams ops.
+    moduleOp->walk([&](mlir::omp::TeamsOp teams) {
+      changed |= teamsWorkdistributeToSingleOp(teams, targetOpsToProcess);
+    });
+    // Phase 6: split and fission the collected target ops, only if something
+    // was lowered above.
+    if (changed) {
+      bool isTargetDevice =
+          llvm::cast(*moduleOp)
+              .getIsTargetDevice();
+      IRRewriter rewriter(&context);
+      for (auto targetOp : targetOpsToProcess) {
+        auto res = splitTargetData(targetOp, rewriter);
+        if (failed(res))
+          return signalPassFailure();
+        // A successful but null result means there is nothing to fission.
+        if (*res) {
+          if (failed(fissionTarget(*res, rewriter, moduleOp, isTargetDevice)))
+            return signalPassFailure();
+        }
+      }
+    }
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index
1253f7b2e9d3d..fe7cbba12eb9b 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -301,8 +301,10 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, addNestedPassToAllTopLevelOperations( pm, hlfir::createInlineHLFIRAssign); pm.addPass(hlfir::createConvertHLFIRtoFIR()); - if (enableOpenMP != EnableOpenMP::None) + if (enableOpenMP != EnableOpenMP::None) { pm.addPass(flangomp::createLowerWorkshare()); + pm.addPass(flangomp::createLowerWorkdistribute()); + } if (enableOpenMP == EnableOpenMP::Simd) pm.addPass(flangomp::createSimdOnlyPass()); } diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 195e5ad7f9dc8..59f6c73ae84ee 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -69,6 +69,7 @@ func.func @_QQmain() { // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: ConvertHLFIRtoFIR // PASSES-NEXT: LowerWorkshare +// PASSES-NEXT: LowerWorkdistribute // PASSES-NEXT: CSE // PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd diff --git a/flang/test/Lower/OpenMP/workdistribute-multiple.f90 b/flang/test/Lower/OpenMP/workdistribute-multiple.f90 new file mode 100644 index 0000000000000..cf1d9dd294cea --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-multiple.f90 @@ -0,0 +1,20 @@ +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - 2>&1 | FileCheck %s + +! CHECK: error: teams has multiple workdistribute ops. +! 
CHECK-LABEL: func @_QPteams_workdistribute_1 +subroutine teams_workdistribute_1() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams + + !$omp workdistribute + y = a * x + y + !$omp end workdistribute + + !$omp workdistribute + y = a * y + x + !$omp end workdistribute + !$omp end teams +end subroutine teams_workdistribute_1 diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-1d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-1d.f90 new file mode 100644 index 0000000000000..b2dbc0f15121e --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-1d.f90 @@ -0,0 +1,39 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + !$omp target teams workdistribute + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! 
CHECK: omp.loop_nest + + !$omp teams workdistribute + y = a * x + y + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-2d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-2d.f90 new file mode 100644 index 0000000000000..09e1211541edb --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-2d.f90 @@ -0,0 +1,45 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + !$omp target teams workdistribute + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + !$omp teams workdistribute + y = a * x + y + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-3d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-3d.f90 new file mode 100644 index 0000000000000..cf5d0234edb39 --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-3d.f90 @@ -0,0 +1,47 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! 
CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute(a, x, y, rows, cols, depth) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols, depth + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols, depth) :: x, y + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + ! CHECK: fir.do_loop + + !$omp target teams workdistribute + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute(a, x, y, rows, cols, depth) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols, depth + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols, depth) :: x, y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + ! CHECK: fir.do_loop + + !$omp teams workdistribute + y = a * x + y + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 new file mode 100644 index 0000000000000..516c4603bd5da --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 @@ -0,0 +1,53 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp target teams workdistribute + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! 
CHECK: omp.loop_nest + + y = a * x + y + + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + y = 2.0_real32 + + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams workdistribute + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + y = a * x + y + + ! CHECK: fir.call @_FortranAAssign + y = 2.0_real32 + + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-two-2d.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-two-2d.f90 new file mode 100644 index 0000000000000..4aeb2e89140cc --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-two-2d.f90 @@ -0,0 +1,68 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +subroutine target_teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + !$omp target teams workdistribute + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * x + y + + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * y + x + + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute + +! 
CHECK-LABEL: func @_QPteams_workdistribute +subroutine teams_workdistribute(a, x, y, rows, cols) + use iso_fortran_env + implicit none + + integer, intent(in) :: rows, cols + real(kind=real32) :: a + real(kind=real32), dimension(rows, cols) :: x, y + + !$omp teams workdistribute + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * x + y + + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + ! CHECK: fir.do_loop + + y = a * y + x + + !$omp end teams workdistribute +end subroutine teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 b/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 new file mode 100644 index 0000000000000..3062b3598b8ae --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 @@ -0,0 +1,29 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute_scalar_assign +subroutine target_teams_workdistribute_scalar_assign() + integer :: aa(10) + + ! CHECK: omp.target_data + ! CHECK: omp.target + ! CHECK: omp.teams + ! CHECK: omp.parallel + ! CHECK: omp.distribute + ! CHECK: omp.wsloop + ! CHECK: omp.loop_nest + + !$omp target teams workdistribute + aa = 20 + !$omp end target teams workdistribute + +end subroutine target_teams_workdistribute_scalar_assign + +! CHECK-LABEL: func @_QPteams_workdistribute_scalar_assign +subroutine teams_workdistribute_scalar_assign() + integer :: aa(10) + ! 
CHECK: fir.call @_FortranAAssign + !$omp teams workdistribute + aa = 20 + !$omp end teams workdistribute + +end subroutine teams_workdistribute_scalar_assign diff --git a/flang/test/Lower/OpenMP/workdistribute-target-teams-clauses.f90 b/flang/test/Lower/OpenMP/workdistribute-target-teams-clauses.f90 new file mode 100644 index 0000000000000..4a08e53bc316a --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-target-teams-clauses.f90 @@ -0,0 +1,32 @@ +! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! CHECK-LABEL: func @_QPtarget_teams_workdistribute +! CHECK: omp.target_data map_entries({{.*}}) +! CHECK: omp.target thread_limit({{.*}}) host_eval({{.*}}) map_entries({{.*}}) +! CHECK: omp.teams num_teams({{.*}}) +! CHECK: omp.parallel +! CHECK: omp.distribute +! CHECK: omp.wsloop +! CHECK: omp.loop_nest + +subroutine target_teams_workdistribute() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + integer :: i + + a = 2.0_real32 + x = [(real(i, real32), i = 1, 10)] + y = [(real(i * 0.5, real32), i = 1, 10)] + + !$omp target teams workdistribute & + !$omp& num_teams(4) & + !$omp& thread_limit(8) & + !$omp& default(shared) & + !$omp& private(i) & + !$omp& map(to: x) & + !$omp& map(tofrom: y) + y = a * x + y + !$omp end target teams workdistribute +end subroutine target_teams_workdistribute diff --git a/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-after.f90 b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-after.f90 new file mode 100644 index 0000000000000..f9c5a771f401d --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-after.f90 @@ -0,0 +1,22 @@ +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - 2>&1 | FileCheck %s + +! CHECK: error: teams has omp ops other than workdistribute. Lowering not implemented yet. +! 
CHECK-LABEL: func @_QPteams_workdistribute_1 +subroutine teams_workdistribute_1() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams + + !$omp workdistribute + y = a * x + y + !$omp end workdistribute + + !$omp distribute + do i = 1, 10 + x(i) = real(i, kind=real32) + end do + !$omp end distribute + !$omp end teams +end subroutine teams_workdistribute_1 diff --git a/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-before.f90 b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-before.f90 new file mode 100644 index 0000000000000..3ef7f90087944 --- /dev/null +++ b/flang/test/Lower/OpenMP/workdistribute-teams-unsupported-before.f90 @@ -0,0 +1,22 @@ +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - 2>&1 | FileCheck %s + +! CHECK: error: teams has omp ops other than workdistribute. Lowering not implemented yet. +! CHECK-LABEL: func @_QPteams_workdistribute_1 +subroutine teams_workdistribute_1() + use iso_fortran_env + real(kind=real32) :: a + real(kind=real32), dimension(10) :: x + real(kind=real32), dimension(10) :: y + !$omp teams + + !$omp distribute + do i = 1, 10 + x(i) = real(i, kind=real32) + end do + !$omp end distribute + + !$omp workdistribute + y = a * x + y + !$omp end workdistribute + !$omp end teams +end subroutine teams_workdistribute_1 diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-doloop.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-doloop.mlir new file mode 100644 index 0000000000000..00d10d6264ec9 --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-doloop.mlir @@ -0,0 +1,33 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s + +// CHECK-LABEL: func.func @x({{.*}}) +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_1:.*]]) : index = (%[[ARG0:.*]]) to (%[[ARG1:.*]]) inclusive step (%[[ARG2:.*]]) { 
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index +// CHECK: fir.store %[[VAL_0]] to %[[ARG4:.*]] : !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } +func.func @x(%lb : index, %ub : index, %step : index, %b : i1, %addr : !fir.ref) { + omp.teams { + omp.workdistribute { + fir.do_loop %iv = %lb to %ub step %step unordered { + %zero = arith.constant 0 : index + fir.store %zero to %addr : !fir.ref + } + omp.terminator + } + omp.terminator + } + return +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir new file mode 100644 index 0000000000000..04e60ca8bbf37 --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-host.mlir @@ -0,0 +1,117 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s +// Test lowering of workdistribute after fission on host device. 
+ +// CHECK-LABEL: func.func @x( +// CHECK: %[[VAL_0:.*]] = fir.alloca index {bindc_name = "lb"} +// CHECK: fir.store %[[ARG0:.*]] to %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_1:.*]] = fir.alloca index {bindc_name = "ub"} +// CHECK: fir.store %[[ARG1:.*]] to %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_2:.*]] = fir.alloca index {bindc_name = "step"} +// CHECK: fir.store %[[ARG2:.*]] to %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_3:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_6:.*]] = omp.map.info var_ptr(%[[ARG3:.*]] : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: omp.target_data map_entries(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +// CHECK: %[[VAL_11:.*]] = fir.alloca index +// CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_11]] : 
!fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_14:.*]] = fir.alloca index +// CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_17:.*]] = fir.alloca index +// CHECK: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_20:.*]] = fir.alloca !fir.heap +// CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(from) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_22:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(to) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_27:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_25]], %[[VAL_25]] : index +// CHECK: %[[VAL_29:.*]] = omp.target_allocmem %[[VAL_23]] : i32, index, %[[VAL_27]] {uniq_name = "dev_buf"} +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i64) -> !fir.heap +// CHECK: fir.store %[[VAL_24]] to %[[VAL_11]] : !fir.ref +// CHECK: fir.store %[[VAL_25]] to %[[VAL_14]] : !fir.ref +// CHECK: fir.store %[[VAL_26]] to %[[VAL_17]] : !fir.ref +// CHECK: fir.store %[[VAL_30]] to %[[VAL_20]] : 
!fir.ref> +// CHECK: omp.target host_eval(%[[VAL_24]] -> %[[VAL_31:.*]], %[[VAL_25]] -> %[[VAL_32:.*]], %[[VAL_26]] -> %[[VAL_33:.*]] : index, index, index) map_entries(%[[VAL_7]] -> %[[VAL_34:.*]], %[[VAL_8]] -> %[[VAL_35:.*]], %[[VAL_9]] -> %[[VAL_36:.*]], %[[VAL_10]] -> %[[VAL_37:.*]], %[[VAL_13]] -> %[[VAL_38:.*]], %[[VAL_16]] -> %[[VAL_39:.*]], %[[VAL_19]] -> %[[VAL_40:.*]], %[[VAL_22]] -> %[[VAL_41:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref>) { +// CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_38]] : !fir.ref +// CHECK: %[[VAL_43:.*]] = fir.load %[[VAL_39]] : !fir.ref +// CHECK: %[[VAL_44:.*]] = fir.load %[[VAL_40]] : !fir.ref +// CHECK: %[[VAL_45:.*]] = fir.load %[[VAL_41]] : !fir.ref> +// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_43]], %[[VAL_43]] : index +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_47:.*]]) : index = (%[[VAL_31]]) to (%[[VAL_32]]) inclusive step (%[[VAL_33]]) { +// CHECK: fir.store %[[VAL_46]] to %[[VAL_45]] : !fir.heap +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: %[[VAL_48:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_49:.*]] = fir.load %[[VAL_11]] : !fir.ref +// CHECK: %[[VAL_50:.*]] = fir.load %[[VAL_14]] : !fir.ref +// CHECK: %[[VAL_51:.*]] = fir.load %[[VAL_17]] : !fir.ref +// CHECK: %[[VAL_52:.*]] = fir.load %[[VAL_20]] : !fir.ref> +// CHECK: %[[VAL_53:.*]] = arith.addi %[[VAL_50]], %[[VAL_50]] : index +// CHECK: fir.store %[[VAL_49]] to %[[VAL_52]] : !fir.heap +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.heap) -> i64 +// CHECK: omp.target_freemem %[[VAL_48]], %[[VAL_54]] : i32, i64 +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } + +module attributes {llvm.target_triple = 
"x86_64-unknown-linux-gnu", omp.is_gpu = false, omp.is_target_device = false} { +func.func @x(%lb : index, %ub : index, %step : index, %addr : !fir.ref) { + %lb_ref = fir.alloca index {bindc_name = "lb"} + fir.store %lb to %lb_ref : !fir.ref + %ub_ref = fir.alloca index {bindc_name = "ub"} + fir.store %ub to %ub_ref : !fir.ref + %step_ref = fir.alloca index {bindc_name = "step"} + fir.store %step to %step_ref : !fir.ref + + %lb_map = omp.map.info var_ptr(%lb_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} + %ub_map = omp.map.info var_ptr(%ub_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} + %step_map = omp.map.info var_ptr(%step_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} + %addr_map = omp.map.info var_ptr(%addr : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} + + omp.target map_entries(%lb_map -> %ARG0, %ub_map -> %ARG1, %step_map -> %ARG2, %addr_map -> %ARG3 : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { + %lb_val = fir.load %ARG0 : !fir.ref + %ub_val = fir.load %ARG1 : !fir.ref + %step_val = fir.load %ARG2 : !fir.ref + %one = arith.constant 1 : index + + %20 = arith.addi %ub_val, %ub_val : index + omp.teams { + omp.workdistribute { + %dev_mem = fir.allocmem index, %one {uniq_name = "dev_buf"} + fir.do_loop %iv = %lb_val to %ub_val step %step_val unordered { + fir.store %20 to %dev_mem : !fir.heap + } + fir.store %lb_val to %dev_mem : !fir.heap + fir.freemem %dev_mem : !fir.heap + omp.terminator + } + omp.terminator + } + omp.terminator + } + return +} +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir new file mode 100644 index 0000000000000..062eb701b52ef --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission-target.mlir @@ -0,0 +1,118 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s +// Test 
lowering of workdistribute after fission on target device. + +// CHECK-LABEL: func.func @x( +// CHECK: %[[VAL_0:.*]] = fir.alloca index {bindc_name = "lb"} +// CHECK: fir.store %[[ARG0:.*]] to %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_1:.*]] = fir.alloca index {bindc_name = "ub"} +// CHECK: fir.store %[[ARG1:.*]] to %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_2:.*]] = fir.alloca index {bindc_name = "step"} +// CHECK: fir.store %[[ARG2:.*]] to %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_3:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_6:.*]] = omp.map.info var_ptr(%[[ARG3:.*]] : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "lb"} +// CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "ub"} +// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "step"} +// CHECK: %[[VAL_10:.*]] = omp.map.info var_ptr(%[[ARG3]] : !fir.ref, index) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref {name = "addr"} +// CHECK: omp.target_data map_entries(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +// CHECK: %[[VAL_11:.*]] = fir.alloca index +// CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// 
CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_14:.*]] = fir.alloca index +// CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_14]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_17:.*]] = fir.alloca index +// CHECK: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(from) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[VAL_17]] : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_20:.*]] = fir.alloca !fir.heap +// CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(from) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_from"} +// CHECK: %[[VAL_22:.*]] = omp.map.info var_ptr(%[[VAL_20]] : !fir.ref>, !fir.heap) map_clauses(to) capture(ByRef) -> !fir.ref> {name = "__flang_workdistribute_to"} +// CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_0]] : !fir.ref +// CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_1]] : !fir.ref +// CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_2]] : !fir.ref +// CHECK: %[[VAL_27:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_25]], %[[VAL_25]] : index +// CHECK: %[[VAL_29:.*]] = omp.target_allocmem %[[VAL_23]] : i32, index, %[[VAL_27]] {uniq_name = "dev_buf"} +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i64) -> !fir.heap +// CHECK: fir.store %[[VAL_24]] to %[[VAL_11]] : !fir.ref +// CHECK: fir.store %[[VAL_25]] to %[[VAL_14]] : !fir.ref +// CHECK: fir.store %[[VAL_26]] to %[[VAL_17]] : 
!fir.ref +// CHECK: fir.store %[[VAL_30]] to %[[VAL_20]] : !fir.ref> +// CHECK: omp.target map_entries(%[[VAL_7]] -> %[[VAL_31:.*]], %[[VAL_8]] -> %[[VAL_32:.*]], %[[VAL_9]] -> %[[VAL_33:.*]], %[[VAL_10]] -> %[[VAL_34:.*]], %[[VAL_13]] -> %[[VAL_35:.*]], %[[VAL_16]] -> %[[VAL_36:.*]], %[[VAL_19]] -> %[[VAL_37:.*]], %[[VAL_22]] -> %[[VAL_38:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref>) { +// CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_35]] : !fir.ref +// CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_36]] : !fir.ref +// CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_37]] : !fir.ref +// CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_38]] : !fir.ref> +// CHECK: %[[VAL_43:.*]] = arith.addi %[[VAL_40]], %[[VAL_40]] : index +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_44:.*]]) : index = (%[[VAL_39]]) to (%[[VAL_40]]) inclusive step (%[[VAL_41]]) { +// CHECK: fir.store %[[VAL_43]] to %[[VAL_42]] : !fir.heap +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: %[[VAL_45:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_46:.*]] = fir.load %[[VAL_11]] : !fir.ref +// CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_14]] : !fir.ref +// CHECK: %[[VAL_48:.*]] = fir.load %[[VAL_17]] : !fir.ref +// CHECK: %[[VAL_49:.*]] = fir.load %[[VAL_20]] : !fir.ref> +// CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_47]], %[[VAL_47]] : index +// CHECK: fir.store %[[VAL_46]] to %[[VAL_49]] : !fir.heap +// CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_49]] : (!fir.heap) -> i64 +// CHECK: omp.target_freemem %[[VAL_45]], %[[VAL_51]] : i32, i64 +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } + + +module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = 
true} { +func.func @x(%lb : index, %ub : index, %step : index, %addr : !fir.ref) { + %lb_ref = fir.alloca index {bindc_name = "lb"} + fir.store %lb to %lb_ref : !fir.ref + %ub_ref = fir.alloca index {bindc_name = "ub"} + fir.store %ub to %ub_ref : !fir.ref + %step_ref = fir.alloca index {bindc_name = "step"} + fir.store %step to %step_ref : !fir.ref + + %lb_map = omp.map.info var_ptr(%lb_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "lb"} + %ub_map = omp.map.info var_ptr(%ub_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "ub"} + %step_map = omp.map.info var_ptr(%step_ref : !fir.ref, index) map_clauses(to) capture(ByRef) -> !fir.ref {name = "step"} + %addr_map = omp.map.info var_ptr(%addr : !fir.ref, index) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "addr"} + + omp.target map_entries(%lb_map -> %ARG0, %ub_map -> %ARG1, %step_map -> %ARG2, %addr_map -> %ARG3 : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { + %lb_val = fir.load %ARG0 : !fir.ref + %ub_val = fir.load %ARG1 : !fir.ref + %step_val = fir.load %ARG2 : !fir.ref + %one = arith.constant 1 : index + + %20 = arith.addi %ub_val, %ub_val : index + omp.teams { + omp.workdistribute { + %dev_mem = fir.allocmem index, %one {uniq_name = "dev_buf"} + fir.do_loop %iv = %lb_val to %ub_val step %step_val unordered { + fir.store %20 to %dev_mem : !fir.heap + } + fir.store %lb_val to %dev_mem : !fir.heap + fir.freemem %dev_mem : !fir.heap + omp.terminator + } + omp.terminator + } + omp.terminator + } + return +} +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-fission.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-fission.mlir new file mode 100644 index 0000000000000..c562b7009664d --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-fission.mlir @@ -0,0 +1,71 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s + +// CHECK-LABEL: func.func @test_fission_workdistribute( +// CHECK: %[[VAL_0:.*]] = arith.constant 0 
: index +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 9 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 5.000000e+00 : f32 +// CHECK: fir.store %[[VAL_3]] to %[[ARG2:.*]] : !fir.ref +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_4:.*]]) : index = (%[[VAL_0]]) to (%[[VAL_2]]) inclusive step (%[[VAL_1]]) { +// CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[VAL_4]] : (!fir.ref>, index) -> !fir.ref +// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref +// CHECK: %[[VAL_7:.*]] = fir.coordinate_of %[[ARG1:.*]], %[[VAL_4]] : (!fir.ref>, index) -> !fir.ref +// CHECK: fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: fir.call @regular_side_effect_func(%[[ARG2:.*]]) : (!fir.ref) -> () +// CHECK: fir.call @my_fir_parallel_runtime_func(%[[ARG3:.*]]) : (!fir.ref) -> () +// CHECK: fir.do_loop %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_1]] { +// CHECK: %[[VAL_9:.*]] = fir.coordinate_of %[[ARG0]], %[[VAL_8]] : (!fir.ref>, index) -> !fir.ref +// CHECK: fir.store %[[VAL_3]] to %[[VAL_9]] : !fir.ref +// CHECK: } +// CHECK: %[[VAL_10:.*]] = fir.load %[[ARG2:.*]] : !fir.ref +// CHECK: fir.store %[[VAL_10]] to %[[ARG3:.*]] : !fir.ref +// CHECK: return +// CHECK: } +module { +func.func @regular_side_effect_func(%arg0: !fir.ref) { + return +} +func.func @my_fir_parallel_runtime_func(%arg0: !fir.ref) attributes {fir.runtime} { + return +} +func.func @test_fission_workdistribute(%arr1: !fir.ref>, %arr2: !fir.ref>, %scalar_ref1: !fir.ref, %scalar_ref2: !fir.ref) { + %c0_idx = arith.constant 0 : index + %c1_idx = arith.constant 1 : index + %c9_idx = arith.constant 9 : index + %float_val = arith.constant 5.0 : f32 + omp.teams { + 
omp.workdistribute { + fir.store %float_val to %scalar_ref1 : !fir.ref + fir.do_loop %iv = %c0_idx to %c9_idx step %c1_idx unordered { + %elem_ptr_arr1 = fir.coordinate_of %arr1, %iv : (!fir.ref>, index) -> !fir.ref + %loaded_val_loop1 = fir.load %elem_ptr_arr1 : !fir.ref + %elem_ptr_arr2 = fir.coordinate_of %arr2, %iv : (!fir.ref>, index) -> !fir.ref + fir.store %loaded_val_loop1 to %elem_ptr_arr2 : !fir.ref + } + fir.call @regular_side_effect_func(%scalar_ref1) : (!fir.ref) -> () + fir.call @my_fir_parallel_runtime_func(%scalar_ref2) : (!fir.ref) -> () + fir.do_loop %jv = %c0_idx to %c9_idx step %c1_idx { + %elem_ptr_ordered_loop = fir.coordinate_of %arr1, %jv : (!fir.ref>, index) -> !fir.ref + fir.store %float_val to %elem_ptr_ordered_loop : !fir.ref + } + %loaded_for_hoist = fir.load %scalar_ref1 : !fir.ref + fir.store %loaded_for_hoist to %scalar_ref2 : !fir.ref + omp.terminator + } + omp.terminator + } + return +} +} diff --git a/flang/test/Transforms/OpenMP/lower-workdistribute-runtime-assign-scalar.mlir b/flang/test/Transforms/OpenMP/lower-workdistribute-runtime-assign-scalar.mlir new file mode 100644 index 0000000000000..03d5d71df0a82 --- /dev/null +++ b/flang/test/Transforms/OpenMP/lower-workdistribute-runtime-assign-scalar.mlir @@ -0,0 +1,108 @@ +// RUN: fir-opt --lower-workdistribute %s | FileCheck %s + +// Test lowering of workdistribute for a scalar assignment within a target teams workdistribute region. +// The test checks that the scalar assignment is correctly lowered to wsloop and loop_nest operations. 
+ +// Example Fortran code: +// !$omp target teams workdistribute +// y = 3.0_real32 +// !$omp end target teams workdistribute + + +// CHECK-LABEL: func.func @x( +// CHECK: omp.target {{.*}} { +// CHECK: omp.teams { +// CHECK: omp.parallel { +// CHECK: omp.distribute { +// CHECK: omp.wsloop { +// CHECK: omp.loop_nest (%[[VAL_73:.*]]) : index = (%[[VAL_66:.*]]) to (%[[VAL_72:.*]]) inclusive step (%[[VAL_67:.*]]) { +// CHECK: %[[VAL_74:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_75:.*]]:3 = fir.box_dims %[[VAL_64:.*]], %[[VAL_74]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_76:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_77:.*]]:3 = fir.box_dims %[[VAL_64]], %[[VAL_76]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_78:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_79:.*]] = arith.remsi %[[VAL_73]], %[[VAL_77]]#1 : index +// CHECK: %[[VAL_80:.*]] = arith.addi %[[VAL_79]], %[[VAL_78]] : index +// CHECK: %[[VAL_81:.*]] = arith.divsi %[[VAL_73]], %[[VAL_77]]#1 : index +// CHECK: %[[VAL_82:.*]] = arith.remsi %[[VAL_81]], %[[VAL_75]]#1 : index +// CHECK: %[[VAL_83:.*]] = arith.addi %[[VAL_82]], %[[VAL_78]] : index +// CHECK: %[[VAL_84:.*]] = fir.array_coor %[[VAL_64]] %[[VAL_83]], %[[VAL_80]] : (!fir.box>, index, index) -> !fir.ref +// CHECK: fir.store %[[VAL_65:.*]] to %[[VAL_84]] : !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } {omp.composite} +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } {omp.composite} +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } +// CHECK: func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) attributes {fir.runtime} + +module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { +func.func @x(%arr : !fir.ref>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c78 = arith.constant 
78 : index + %cst = arith.constant 3.000000e+00 : f32 + %0 = fir.alloca i32 + %1 = fir.alloca i32 + %c10 = arith.constant 10 : index + %c20 = arith.constant 20 : index + %194 = arith.subi %c10, %c1 : index + %195 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%194 : index) extent(%c10 : index) stride(%c1 : index) start_idx(%c1 : index) + %196 = arith.subi %c20, %c1 : index + %197 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%196 : index) extent(%c20 : index) stride(%c1 : index) start_idx(%c1 : index) + %198 = omp.map.info var_ptr(%arr : !fir.ref>, f32) map_clauses(implicit, tofrom) capture(ByRef) bounds(%195, %197) -> !fir.ref> {name = "y"} + %199 = omp.map.info var_ptr(%1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} + %200 = omp.map.info var_ptr(%0 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} + omp.target map_entries(%198 -> %arg5, %199 -> %arg6, %200 -> %arg7 : !fir.ref>, !fir.ref, !fir.ref) { + %c0_0 = arith.constant 0 : index + %201 = fir.load %arg7 : !fir.ref + %202 = fir.load %arg6 : !fir.ref + %203 = fir.convert %202 : (i32) -> i64 + %204 = fir.convert %201 : (i32) -> i64 + %205 = fir.convert %204 : (i64) -> index + %206 = arith.cmpi sgt, %205, %c0_0 : index + %207 = fir.convert %203 : (i64) -> index + %208 = arith.cmpi sgt, %207, %c0_0 : index + %209 = arith.select %208, %207, %c0_0 : index + %210 = arith.select %206, %205, %c0_0 : index + %211 = fir.shape %210, %209 : (index, index) -> !fir.shape<2> + %212 = fir.declare %arg5(%211) {uniq_name = "_QFFaxpy_array_workdistributeEy"} : (!fir.ref>, !fir.shape<2>) -> !fir.ref> + %213 = fir.embox %212(%211) : (!fir.ref>, !fir.shape<2>) -> !fir.box> + omp.teams { + %214 = fir.alloca !fir.box> {pinned} + omp.workdistribute { + %215 = fir.alloca f32 + %216 = fir.embox %215 : (!fir.ref) -> !fir.box + %217 = fir.shape %210, %209 : (index, index) -> !fir.shape<2> + %218 = 
fir.embox %212(%217) : (!fir.ref>, !fir.shape<2>) -> !fir.box> + fir.store %218 to %214 : !fir.ref>> + %219 = fir.address_of(@_QQclXf9c642d28e5bba1f07fa9a090b72f4fc) : !fir.ref> + %c39_i32 = arith.constant 39 : i32 + %220 = fir.convert %214 : (!fir.ref>>) -> !fir.ref> + %221 = fir.convert %216 : (!fir.box) -> !fir.box + %222 = fir.convert %219 : (!fir.ref>) -> !fir.ref + fir.call @_FortranAAssign(%220, %221, %222, %c39_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () + omp.terminator + } + omp.terminator + } + omp.terminator + } + return +} + +func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) attributes {fir.runtime} + +fir.global linkonce @_QQclXf9c642d28e5bba1f07fa9a090b72f4fc constant : !fir.char<1,78> { + %0 = fir.string_lit "File: /work/github/skc7/llvm-project/build_fomp_reldebinfo/saxpy_tests/\00"(78) : !fir.char<1,78> + fir.has_value %0 : !fir.char<1,78> +} +} diff --git a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp index 83a9c0d738394..796ee8cf857ae 100644 --- a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp @@ -306,6 +306,7 @@ void RenameIndependentSubregs::computeMainRangesFixFlags( const IntEqClasses &Classes, const SmallVectorImpl &SubRangeInfos, const SmallVectorImpl &Intervals) const { + const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); const SlotIndexes &Indexes = *LIS->getSlotIndexes(); for (size_t I = 0, E = Intervals.size(); I < E; ++I) { @@ -314,6 +315,25 @@ void RenameIndependentSubregs::computeMainRangesFixFlags( LI.removeEmptySubRanges(); + // Try to establish a single subregister which covers all uses. + // Note: this is assuming the selected subregister will only be + // used for fixing up live intervals issues created by this pass. 
+ LaneBitmask UsedMask, UnusedMask; + for (LiveInterval::SubRange &SR : LI.subranges()) + UsedMask |= SR.LaneMask; + SmallVector SubRegIdxs; + unsigned Flags = 0; + unsigned SubReg = 0; + // TODO: Handle SubRegIdxs.size() > 1 + if (TRI.getCoveringSubRegIndexes(MRI->getRegClass(Reg), UsedMask, + SubRegIdxs) && + SubRegIdxs.size() == 1) { + SubReg = SubRegIdxs.front(); + Flags = RegState::Undef; + } else { + UnusedMask = MRI->getMaxLaneMaskForVReg(Reg) & ~UsedMask; + } + // There must be a def (or live-in) before every use. Splitting vregs may // violate this principle as the splitted vreg may not have a definition on // every path. Fix this by creating IMPLICIT_DEF instruction as necessary. @@ -336,19 +356,18 @@ void RenameIndependentSubregs::computeMainRangesFixFlags( MachineBasicBlock::iterator InsertPos = llvm::findPHICopyInsertPoint(PredMBB, &MBB, Reg); const MCInstrDesc &MCDesc = TII->get(TargetOpcode::IMPLICIT_DEF); - MachineInstrBuilder ImpDef = BuildMI(*PredMBB, InsertPos, - DebugLoc(), MCDesc, Reg); + MachineInstrBuilder ImpDef = + BuildMI(*PredMBB, InsertPos, DebugLoc(), MCDesc) + .addDef(Reg, Flags, SubReg); SlotIndex DefIdx = LIS->InsertMachineInstrInMaps(*ImpDef); SlotIndex RegDefIdx = DefIdx.getRegSlot(); - LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(Reg); for (LiveInterval::SubRange &SR : LI.subranges()) { - Mask = Mask & ~SR.LaneMask; VNInfo *SRVNI = SR.getNextValue(RegDefIdx, Allocator); SR.addSegment(LiveRange::Segment(RegDefIdx, PredEnd, SRVNI)); } - - if (!Mask.none()) { - LiveInterval::SubRange *SR = LI.createSubRange(Allocator, Mask); + if (!UnusedMask.none()) { + LiveInterval::SubRange *SR = + LI.createSubRange(Allocator, UnusedMask); SR->createDeadDef(RegDefIdx, Allocator); } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index b0ca1e8ef3dff..cbf17bd71a69e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -144,43 +144,41 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v21, v0 -; GFX1030-NEXT: v_mov_b32_e32 v22, v1 -; GFX1030-NEXT: v_mov_b32_e32 v23, v2 -; GFX1030-NEXT: v_mov_b32_e32 v24, v3 -; GFX1030-NEXT: v_mov_b32_e32 v25, v4 -; GFX1030-NEXT: v_mov_b32_e32 v26, v5 -; GFX1030-NEXT: v_mov_b32_e32 v27, v6 -; GFX1030-NEXT: v_mov_b32_e32 v28, v7 -; GFX1030-NEXT: v_mov_b32_e32 v29, v8 -; GFX1030-NEXT: v_mov_b32_e32 v30, v9 -; GFX1030-NEXT: v_mov_b32_e32 v31, v10 -; GFX1030-NEXT: v_mov_b32_e32 v19, v11 -; GFX1030-NEXT: v_mov_b32_e32 v20, v12 +; GFX1030-NEXT: v_mov_b32_e32 v15, v0 +; GFX1030-NEXT: v_mov_b32_e32 v16, v1 +; GFX1030-NEXT: v_mov_b32_e32 v17, v2 +; GFX1030-NEXT: v_mov_b32_e32 v18, v3 +; GFX1030-NEXT: v_mov_b32_e32 v19, v4 +; GFX1030-NEXT: v_mov_b32_e32 v20, v5 +; GFX1030-NEXT: v_mov_b32_e32 v21, v6 +; GFX1030-NEXT: v_mov_b32_e32 v22, v7 +; GFX1030-NEXT: v_mov_b32_e32 v23, v8 +; GFX1030-NEXT: v_mov_b32_e32 v24, v9 +; GFX1030-NEXT: v_mov_b32_e32 v25, v10 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v19 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v20 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 
-; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[21:31], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:25], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr11 +; GFX1030-NEXT: ; implicit-def: $vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 +; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: $vgpr23 ; GFX1030-NEXT: ; implicit-def: $vgpr24 ; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr26 -; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr29 -; GFX1030-NEXT: ; implicit-def: $vgpr30 -; GFX1030-NEXT: ; implicit-def: $vgpr31 -; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB6_1 ; GFX1030-NEXT: ; %bb.2: @@ -190,22 +188,20 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v19, v11 -; GFX1013-NEXT: v_mov_b32_e32 v20, v12 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v19 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v20 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v12 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v13 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v14 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: 
image_bvh_intersect_ray v[15:18], v[0:10], s[4:7] -; GFX1013-NEXT: ; implicit-def: $vgpr19 +; GFX1013-NEXT: ; implicit-def: $vgpr11 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1013-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 @@ -220,31 +216,29 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v20, v0 :: v_dual_mov_b32 v21, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v19, v1 ; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 -; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v11 -; GFX11-NEXT: v_mov_b32_e32 v19, v12 +; GFX11-NEXT: v_mov_b32_e32 v17, v4 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s4, v18 -; GFX11-NEXT: v_readfirstlane_b32 s5, v19 +; GFX11-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-NEXT: v_readfirstlane_b32 s5, v12 ; GFX11-NEXT: v_readfirstlane_b32 s6, v13 ; GFX11-NEXT: v_readfirstlane_b32 s7, v14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7] +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7] +; GFX11-NEXT: ; 
implicit-def: $vgpr11 ; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17 ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 ; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 -; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: ; %bb.2: @@ -259,42 +253,40 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v18, v0 -; GFX1030-NEXT: v_mov_b32_e32 v19, v1 +; GFX1030-NEXT: v_mov_b32_e32 v13, v0 +; GFX1030-NEXT: v_mov_b32_e32 v14, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GFX1030-NEXT: v_mov_b32_e32 v20, v2 +; GFX1030-NEXT: v_mov_b32_e32 v15, v2 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 -; GFX1030-NEXT: v_mov_b32_e32 v21, v3 +; GFX1030-NEXT: v_mov_b32_e32 v16, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mov_b32_e32 v22, v4 -; GFX1030-NEXT: v_mov_b32_e32 v16, v9 -; GFX1030-NEXT: v_mov_b32_e32 v17, v10 -; GFX1030-NEXT: v_and_or_b32 v23, 0xffff, v5, v0 -; GFX1030-NEXT: v_and_or_b32 v24, 0xffff, v6, v1 -; GFX1030-NEXT: v_alignbit_b32 v25, v2, v7, 16 +; GFX1030-NEXT: v_mov_b32_e32 v17, v4 +; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: v_and_or_b32 v18, 0xffff, v5, v0 +; GFX1030-NEXT: v_and_or_b32 v19, 0xffff, v6, v1 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: 
Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v12 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:25], s[4:7] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr9 +; GFX1030-NEXT: ; implicit-def: $vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr15 ; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 -; GFX1030-NEXT: ; implicit-def: $vgpr21 -; GFX1030-NEXT: ; implicit-def: $vgpr22 -; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr24 -; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1030-NEXT: ; %bb.2: @@ -304,30 +296,28 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v17, v9 -; GFX1013-NEXT: v_mov_b32_e32 v18, v10 -; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX1013-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; 
GFX1013-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v9 -; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 +; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v13 +; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v17 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v11 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v12 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[17:18] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16 -; GFX1013-NEXT: ; implicit-def: $vgpr17 +; GFX1013-NEXT: ; implicit-def: $vgpr9 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB7_1 @@ -343,33 +333,32 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX11-NEXT: v_dual_mov_b32 v19, v10 :: v_dual_and_b32 v0, 0xffff, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_and_b32 v0, 0xffff, v7 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v14, v3 -; GFX11-NEXT: v_dual_mov_b32 
v15, v4 :: v_dual_mov_b32 v18, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0 ; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302 ; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v18 -; GFX11-NEXT: v_readfirstlane_b32 s5, v19 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16 -; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr9 ; GFX11-NEXT: ; implicit-def: $vgpr16 ; GFX11-NEXT: ; implicit-def: $vgpr17 ; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 -; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: @@ -384,45 +373,43 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v22, v0 -; 
GFX1030-NEXT: v_mov_b32_e32 v23, v1 -; GFX1030-NEXT: v_mov_b32_e32 v24, v2 -; GFX1030-NEXT: v_mov_b32_e32 v25, v3 -; GFX1030-NEXT: v_mov_b32_e32 v26, v4 -; GFX1030-NEXT: v_mov_b32_e32 v27, v5 -; GFX1030-NEXT: v_mov_b32_e32 v28, v6 -; GFX1030-NEXT: v_mov_b32_e32 v29, v7 -; GFX1030-NEXT: v_mov_b32_e32 v30, v8 -; GFX1030-NEXT: v_mov_b32_e32 v31, v9 -; GFX1030-NEXT: v_mov_b32_e32 v32, v10 -; GFX1030-NEXT: v_mov_b32_e32 v33, v11 -; GFX1030-NEXT: v_mov_b32_e32 v20, v12 -; GFX1030-NEXT: v_mov_b32_e32 v21, v13 +; GFX1030-NEXT: v_mov_b32_e32 v16, v0 +; GFX1030-NEXT: v_mov_b32_e32 v17, v1 +; GFX1030-NEXT: v_mov_b32_e32 v18, v2 +; GFX1030-NEXT: v_mov_b32_e32 v19, v3 +; GFX1030-NEXT: v_mov_b32_e32 v20, v4 +; GFX1030-NEXT: v_mov_b32_e32 v21, v5 +; GFX1030-NEXT: v_mov_b32_e32 v22, v6 +; GFX1030-NEXT: v_mov_b32_e32 v23, v7 +; GFX1030-NEXT: v_mov_b32_e32 v24, v8 +; GFX1030-NEXT: v_mov_b32_e32 v25, v9 +; GFX1030-NEXT: v_mov_b32_e32 v26, v10 +; GFX1030-NEXT: v_mov_b32_e32 v27, v11 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v20 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v21 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v13 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v14 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v15 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[20:21] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[22:33], s[4:7] +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:27], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr12 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 +; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 +; GFX1030-NEXT: ; 
implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: $vgpr23 ; GFX1030-NEXT: ; implicit-def: $vgpr24 ; GFX1030-NEXT: ; implicit-def: $vgpr25 ; GFX1030-NEXT: ; implicit-def: $vgpr26 ; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr29 -; GFX1030-NEXT: ; implicit-def: $vgpr30 -; GFX1030-NEXT: ; implicit-def: $vgpr31 -; GFX1030-NEXT: ; implicit-def: $vgpr32 -; GFX1030-NEXT: ; implicit-def: $vgpr33 -; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1030-NEXT: ; %bb.2: @@ -432,22 +419,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v20, v12 -; GFX1013-NEXT: v_mov_b32_e32 v21, v13 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v20 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v21 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v13 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v14 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v15 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[20:21] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7] -; GFX1013-NEXT: ; implicit-def: $vgpr20 +; GFX1013-NEXT: ; implicit-def: $vgpr12 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX1013-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX1013-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 @@ -465,28 +450,26 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v1 ; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v12 :: v_dual_mov_b32 v5, v13 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s4, v12 +; GFX11-NEXT: v_readfirstlane_b32 s5, v13 ; GFX11-NEXT: v_readfirstlane_b32 s6, v14 ; GFX11-NEXT: v_readfirstlane_b32 s7, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7] -; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr12 ; GFX11-NEXT: ; implicit-def: $vgpr19_vgpr20 ; GFX11-NEXT: ; implicit-def: $vgpr21 ; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11 -; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-NEXT: ; %bb.2: @@ -501,44 +484,42 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 
%node_ptr define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v19, v0 -; GFX1030-NEXT: v_mov_b32_e32 v20, v1 +; GFX1030-NEXT: v_mov_b32_e32 v14, v0 +; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GFX1030-NEXT: v_mov_b32_e32 v21, v2 +; GFX1030-NEXT: v_mov_b32_e32 v16, v2 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GFX1030-NEXT: v_mov_b32_e32 v22, v3 +; GFX1030-NEXT: v_mov_b32_e32 v17, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mov_b32_e32 v23, v4 -; GFX1030-NEXT: v_mov_b32_e32 v24, v5 -; GFX1030-NEXT: v_mov_b32_e32 v17, v10 -; GFX1030-NEXT: v_mov_b32_e32 v18, v11 -; GFX1030-NEXT: v_and_or_b32 v25, 0xffff, v6, v0 -; GFX1030-NEXT: v_and_or_b32 v26, 0xffff, v7, v1 -; GFX1030-NEXT: v_alignbit_b32 v27, v2, v8, 16 +; GFX1030-NEXT: v_mov_b32_e32 v18, v4 +; GFX1030-NEXT: v_mov_b32_e32 v19, v5 +; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16 +; GFX1030-NEXT: v_and_or_b32 v20, 0xffff, v6, v0 +; GFX1030-NEXT: v_and_or_b32 v21, 0xffff, v7, v1 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v17 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v18 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[17:18] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: 
image_bvh64_intersect_ray v[0:3], v[19:27], s[4:7] a16 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:22], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr10 +; GFX1030-NEXT: ; implicit-def: $vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr16 ; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 -; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr24 -; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr26 -; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB9_1 ; GFX1030-NEXT: ; %bb.2: @@ -548,30 +529,28 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v18, v10 -; GFX1013-NEXT: v_mov_b32_e32 v19, v11 -; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8 ; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 -; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v11 +; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 +; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v15 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; 
GFX1013-NEXT: v_readfirstlane_b32 s4, v18 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v19 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 ; GFX1013-NEXT: v_readfirstlane_b32 s6, v12 ; GFX1013-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] ; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16 -; GFX1013-NEXT: ; implicit-def: $vgpr18 +; GFX1013-NEXT: ; implicit-def: $vgpr10 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB9_1 @@ -591,29 +570,29 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; GFX11-NEXT: v_dual_mov_b32 v19, v2 :: v_dual_mov_b32 v14, v3 ; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v16, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v10 :: v_dual_mov_b32 v5, v11 -; GFX11-NEXT: v_lshl_or_b32 v20, v6, 16, v0 -; GFX11-NEXT: v_perm_b32 v21, v6, v8, 0x7060302 -; GFX11-NEXT: v_lshl_or_b32 v22, v7, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v0 +; GFX11-NEXT: v_perm_b32 v5, v6, v8, 0x7060302 +; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v1 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s4, v10 +; GFX11-NEXT: v_readfirstlane_b32 s5, v11 ; GFX11-NEXT: v_readfirstlane_b32 s6, v12 ; GFX11-NEXT: v_readfirstlane_b32 
s7, v13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16 -; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[4:6]], s[4:7] a16 +; GFX11-NEXT: ; implicit-def: $vgpr10 ; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr19 ; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16 -; GFX11-NEXT: ; implicit-def: $vgpr20_vgpr21_vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index b47c7ecf8de95..f230a14dd0834 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -29546,173 +29546,27 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 
offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; 
GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; 
meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: 
v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-NEXT: .LBB19_2: ; %cmp.true @@ -29720,762 +29574,674 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; 
GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; 
GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: 
v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: 
v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; 
GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; 
GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: 
v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: 
v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: 
v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: 
v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 
0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: 
v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: 
v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 
+; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; 
GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: 
v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 
v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; 
GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 
v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; 
GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB19_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: 
scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 
v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB19_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: 
$vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: 
$vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB19_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -34192,324 +33958,81 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-LABEL: bitcast_v64f16_to_v32i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: 
v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 ; GFX11-NEXT: .LBB23_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 
0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB23_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: 
scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: 
scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB23_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -37313,324 +36836,81 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-LABEL: bitcast_v64i16_to_v32i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: 
v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 ; GFX11-NEXT: .LBB27_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB27_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 
offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; 
GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB27_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB27_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -67275,173 +66555,27 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; 
GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-NEXT: .LBB43_2: ; %cmp.true @@ -67449,762 +66583,674 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 +; 
GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, 
v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; 
GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: 
v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: 
v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: 
v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; 
GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 
0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 
0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 
v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 
+; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 
0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; 
GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; 
GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 
v38, 0x400000, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; 
GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 
+; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; 
GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, 
v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, 
v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB43_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: 
scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 
v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB43_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; 
implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: 
$vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: 
$vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -71893,324 +70939,81 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-LABEL: bitcast_v64f16_to_v32f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 
off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; 
GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 
-; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, 
v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: 
v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-NEXT: .LBB47_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 
v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB47_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 
v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 
offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB47_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB47_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -74968,324 +73771,81 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-LABEL: bitcast_v64i16_to_v32f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: 
v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-NEXT: .LBB51_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB51_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 
offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; 
GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB51_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -102724,173 +101284,27 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB63_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; 
GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB63_3 ; GFX11-NEXT: .LBB63_2: ; %cmp.true @@ -102898,762 +101312,674 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 
+; GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 
0xffff, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; 
GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: 
v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: 
v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: 
v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; 
GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 
0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 
0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 
v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 
v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; 
GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; 
GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 
v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: 
v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, 
v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 
; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, 
vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: 
v_or_b32_e32 v38, 0x400000, v25 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; 
GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 
+; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; 
GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, 
v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, 
v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB63_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: 
scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 
v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB63_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; 
implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: 
$vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: 
$vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB63_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -107383,324 +105709,81 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-LABEL: bitcast_v64f16_to_v16i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, 
s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB67_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; 
GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB67_3 ; GFX11-NEXT: .LBB67_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB67_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: 
scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: 
scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB67_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB67_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -110518,324 +108601,81 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-LABEL: bitcast_v64i16_to_v16i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: 
v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 ; GFX11-NEXT: .LBB71_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB71_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 
offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; 
GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB71_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB71_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -138331,173 +136171,27 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB79_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; 
GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB79_3 ; GFX11-NEXT: .LBB79_2: ; %cmp.true @@ -138505,762 +136199,674 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 ; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000 
+; GFX11-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 
0xffff, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s23, 16 -; 
GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: 
v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: 
v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: 
v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; 
GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v32, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-NEXT: v_or_b32_e32 v22, 
0x400000, v19 -; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v33 :: v_dual_add_nc_u32 v5, v7, v32 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-NEXT: v_or_b32_e32 v27, 
0x400000, v24 -; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v32, v33, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v34, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s3 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_bfe_u32 v33, v34, 16, 1 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v32, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v34 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 ; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31 -; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-NEXT: v_bfe_u32 
v28, v30, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v33, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v4, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v36, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v36, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 
v26, 16, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v32, vcc_lo +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 +; GFX11-NEXT: v_add_f32_e64 v38, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v34 +; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v2, v37, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v33, v35 +; 
GFX11-NEXT: v_add_nc_u32_e32 v33, v36, v38 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v36, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v37, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; 
GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v31, v35, v37 :: v_dual_add_nc_u32 v32, 0x7fff, v32 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v33 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_and_b32_e32 
v32, 0xffff, v32 -; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_bfe_u32 v36, v29, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_add_nc_u32 v32, v34, v35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: 
v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v34, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, 
v32 +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v38 :: v_dual_add_nc_u32 v33, v35, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_cndmask_b32 v28, v33, v37 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v36, v27, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v29, v32, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 
; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v34, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_lshl_or_b32 v28, v32, 16, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v34, v37, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v38, 
vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v32, v37, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_lshlrev_b32 v36, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v25 +; GFX11-NEXT: 
v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_bfe_u32 v35, v24, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v33, v39, v36 +; GFX11-NEXT: v_lshl_or_b32 v26, v32, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v24 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: 
v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v22 ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v25, v32, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v35 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v23 +; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v34 ; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 ; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_dual_cndmask_b32 v23, v32, v34 :: v_dual_add_nc_u32 v34, v35, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_nc_u32 v32, v32, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v37, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v48, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; 
GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v36, v48, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v32, v38, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v32, v39, v35 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v21, v36, v37 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX11-NEXT: v_lshl_or_b32 v22, v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v36 ; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v39, 
v38, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v33, v37, v20 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v33, v35 :: v_dual_and_b32 v33, 0xffff0000, v18 +; GFX11-NEXT: v_bfe_u32 v38, v19, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, v38, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v37 :: 
v_dual_add_nc_u32 v35, v35, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 ; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_nc_u32 v37, v37, v33 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, 
v48 -; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v36, v37, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v17, 0x40c00000, v17 ; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 
1 -; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v38, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v17 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v38, v39, v36 +; GFX11-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v17 +; GFX11-NEXT: v_bfe_u32 v48, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v48, v48, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v39 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v35, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v38, 
0xffff, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v48 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v39, v51, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_lshl_or_b32 v20, v32, 16, v20 +; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v19 +; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v35, v48, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v17, 
v36, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v35 ; GFX11-NEXT: .LBB79_3: ; %end -; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-NEXT: 
scratch_load_b32 v142, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 
v76, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB79_4: -; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 -; GFX11-NEXT: ; implicit-def: 
$vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: 
$vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-NEXT: ; implicit-def: 
$vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB79_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -142871,324 +140477,81 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-LABEL: bitcast_v64f16_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, 
s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; 
GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 ; GFX11-NEXT: .LBB83_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB83_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: 
scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: 
scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB83_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB83_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -145850,324 +143213,81 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-LABEL: bitcast_v64i16_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: ; meta 
instruction -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v136, 
s32 offset:100 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: 
scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: ; meta instruction -; GFX11-NEXT: scratch_store_b32 off, v185, s32 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: 
v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 ; GFX11-NEXT: .LBB87_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB87_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 
offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; 
GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB87_4: -; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 ; GFX11-NEXT: s_branch .LBB87_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 2b63a8cf69476..28b992ee77b14 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -981,7 +981,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: s_mov_b64 s[8:9], 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_1: ; %Flow @@ -1004,36 +1004,36 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: ; %bb.4: ; %bb2 ; GCN-NEXT: ; in Loop: 
Header=BB5_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v6, v7 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB5_2 ; GCN-NEXT: ; %bb.5: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v6, v7 +; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], s[6:7] ; GCN-NEXT: s_cbranch_execz .LBB5_1 ; GCN-NEXT: ; %bb.6: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: s_branch .LBB5_1 ; GCN-NEXT: .LBB5_7: ; %bb12 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; diff --git 
a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir index eaf669da83ead..9e38919190ea7 100644 --- a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir @@ -11,26 +11,28 @@ body: | ; REG_ALLOC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; REG_ALLOC-NEXT: {{ $}} - ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + 
; REG_ALLOC-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: KILL killed renamable $vgpr4 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr2 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr0 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr3 - ; REG_ALLOC-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec - ; REG_ALLOC-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; REG_ALLOC-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; REG_ALLOC-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr12, implicit $exec ; REG_ALLOC-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; REG_ALLOC-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; REG_ALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; REG_ALLOC-NEXT: S_BRANCH %bb.2 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.1: ; REG_ALLOC-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; REG_ALLOC-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -42,33 +44,33 @@ body: | ; REG_ALLOC-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 - ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; REG_ALLOC-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; REG_ALLOC-NEXT: S_BRANCH %bb.1 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.3: ; REG_ALLOC-NEXT: successors: %bb.5(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; REG_ALLOC-NEXT: {{ $}} - ; REG_ALLOC-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr6, 
implicit $exec ; REG_ALLOC-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; REG_ALLOC-NEXT: S_BRANCH %bb.5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.4: - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; REG_ALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; REG_ALLOC-NEXT: S_ENDPGM 0 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.5: ; REG_ALLOC-NEXT: successors: %bb.4(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: S_BRANCH %bb.4 @@ -78,26 +80,28 @@ body: | ; DEAD_INST_DEL-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, 
$sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; DEAD_INST_DEL-NEXT: {{ $}} - ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr4 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr2 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr0 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr3 - ; DEAD_INST_DEL-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, 
implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; DEAD_INST_DEL-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; DEAD_INST_DEL-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr12, implicit $exec ; DEAD_INST_DEL-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.2 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.1: ; DEAD_INST_DEL-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; 
DEAD_INST_DEL-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -109,33 +113,33 @@ body: | ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 - ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; DEAD_INST_DEL-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.1 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.3: ; DEAD_INST_DEL-NEXT: successors: %bb.5(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000003, $vgpr6_vgpr7_vgpr8_vgpr9:0x0000000000000003 ; DEAD_INST_DEL-NEXT: {{ $}} - ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr6, implicit $exec ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed 
renamable $sgpr1 ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.4: - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; DEAD_INST_DEL-NEXT: S_ENDPGM 0 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.5: ; DEAD_INST_DEL-NEXT: successors: %bb.4(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.4 diff --git a/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll b/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll index ea127323f3e05..50efc06237d5b 100644 --- a/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll +++ b/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mcpu=gfx1100 
-mtriple=amdgcn-amd-amdhsa -stress-regalloc=4 -filetype=null -verify-machineinstrs %s 2>&1 | FileCheck %s +; RUN: not llc -mcpu=gfx1100 -mtriple=amdgcn-amd-amdhsa -stress-regalloc=4 -amdgpu-enable-rewrite-partial-reg-uses=0 -filetype=null -verify-machineinstrs %s 2>&1 | FileCheck %s ; CHECK: error: :0:0: ran out of registers during register allocation in function 'f' ; CHECK-NOT: Bad machine code diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index 82ebdf84019a5..d972402d03a7d 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -475,3 +475,5 @@ if(OFFLOAD_INCLUDE_TESTS) add_subdirectory(test) add_subdirectory(unittests) endif() + +add_subdirectory(utils) diff --git a/offload/utils/CMakeLists.txt b/offload/utils/CMakeLists.txt new file mode 100644 index 0000000000000..d6f2d6729d18c --- /dev/null +++ b/offload/utils/CMakeLists.txt @@ -0,0 +1,10 @@ +set(OPENMP_UTILS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH + "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')") + +macro(add_openmp_util path) + install(PROGRAMS + ${path} + DESTINATION "${OPENMP_UTILS_INSTALL_DIR}") +endmacro() + +add_subdirectory(gpurun) diff --git a/offload/utils/gpurun/CMakeLists.txt b/offload/utils/gpurun/CMakeLists.txt new file mode 100644 index 0000000000000..0483a5737b830 --- /dev/null +++ b/offload/utils/gpurun/CMakeLists.txt @@ -0,0 +1 @@ +add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun) diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun new file mode 100755 index 0000000000000..870bc7a8ccbcd --- /dev/null +++ b/offload/utils/gpurun/gpurun @@ -0,0 +1,697 @@ +#!/bin/bash +# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# of the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# gpurun: Process launch utility for GPU applications. This is a wrapper +# to execute application binaries including OpenMPI GPU applications. +# See help message below (gpurun -h) for more information. +# +# Usage Examples: +# gpurun true +# mpirun -np 4 gpurun env | grep ROCR_VISIBLE_DEVICES +# + +# If set to 1, just invoke the rest of the command line without doing anything +# else. +GPURUN_BYPASS=${GPURUN_BYPASS:-0} + +function execOnError() { + exec "$@" +} + +# PROGVERSION string is updated by cmake when component is installed +PROGVERSION=X.Y-Z +function version(){ + echo $0 version $PROGVERSION + exit 0 +} +function usage(){ +/bin/cat 2>&1 <<"EOF" + + gpurun: Application process launch utility for GPUs. + This utility ensures the process will enable either a single + GPU or the number specified with -md (multi-device) option. 
+ It launches the application binary with either the 'taskset' + or 'numactl' utility so the process only runs on CPU cores + in the same NUMA domain as the selected GPUs. + This utility sets environment variable ROCR_VISIBLE_DEVICES + to selected GPUs ONLY if it was not already set by the + caller's environment AND the number of GPUs is not 1. + This utility also sets environment variable HSA_CU_MASK + to control which CUs are available to the process. + HSA_CU_MASK is set only when more than one OpenMPI process + (rank) will utilize the same GPU and it is not preset. + Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the + number of CUs available to the process after masking. + + Usage: + gpurun [ ] + mpirun -np gpurun [ ] + + Options: + -h Print this help message and exit + -md Set number of desired devices for multi-device mode, default=1 + -s suppress output, often useful in benchmarking + -q suppress output, quiet, alias of -s, same as GPURUN_VERBOSE=0 + -v Verbose output, same as GPURUN_VERBOSE=1 + -vv Verbose output, same as GPURUN_VERBOSE=2 + -m use numactl membind to CPUs in same NUMA domain. Note: Allocation + fails when not enough memory available on these nodes. + -l use numactl localalloc to CPUs in same NUMA domain. Note: If + memory cannot be allocated, alloc falls back to other nodes. + -nr use numactl ROCR_VISIBLE_DEVICES + -nm use numactl OMPI_COMM_WORLD_LOCAL_RANK + --version Print version of gpurun and exit + + Optional Input environment variables: + GPURUN_VERBOSE + 0: default for silent operation, no trace printed to stderr + 1: -v prints trace record including process launch cmd to stderr + 2: -vv prints trace and other summary diagnostics + ROCMINFO_BINARY Set location of rocminfo binary + AOMP: location of AOMP or ROCM + GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0. + This only works for single device mode.
+ GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards + GPURUN_MASK_POLICY : useful if machine has different GPU cards + ROCR_VISIBLE_DEVICES: See description above + OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi + OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi + This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID + and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK + + Generated (output) Environment Variables: + OMPX_TARGET_TEAM_PROCS - Number of CUs available to process + ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset + HSA_CU_MASK - The CU mask for the device. + LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument + GPU_MAX_HW_QUEUES + LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES + + Limitations: + - Currently, gpurun creates masks that are mutually exclusive of each other. + That is, the MPI processes will not share CUs. If number of ranks is not + perfectly divisible by number of CUs or number of GPUs, some resources + would be unused. + Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization. + - Works with AOMP 19.0-0 or ROCM 6.1 or greater + - cu masking is not available when multiple devices per process are enabled + with -md option (multi-device) mode. + + Notes: + With MPI, this utility distributes GPUs and their CUs across + multiple ranks of an MPI job into mutually exclusive sets of CUs. + It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE + and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and + the mutually exclusive CU mask. + + An rplace (rank place) is a subset of CUs for a rank. + gpurun calculates the number of rplaces needed to contain all + the specified number of ranks for this node. If number of ranks not + divisible by number of GPUs, then there will be more rplaces than ranks. + The number of CUs in an rplace is calculated by dividing the number of + CUs per GPU by the number of rplaces per GPU.
This is also the number of + bits set in the CU mask. This is also the number of physical locations + available for an OpenMP team to execute. This utility exports that number + to the environment variable OMPX_TARGET_TEAM_PROCS. This value + could be used by the application or runtume to adjust the number + of desired teams in a target region. If no masking occurs, the entire + GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to + the total number of CUs on the GPU. + + Copyright (c) 2024 ADVANCED MICRO DEVICES, INC. + +EOF + exit 0 +} + +_end_gpurun_opts=0 +_devices_per_mdset=1 +_uses_multi_device=0 +while [ "$_end_gpurun_opts" == "0" ] ; do + case "$1" in + -s) GPURUN_VERBOSE=0;; + -q) GPURUN_VERBOSE=0;; + --quiet) GPURUN_VERBOSE=0;; + -h) usage ;; + -help) usage ;; + --help) usage ;; + -version) version ;; + --version) version ;; + -v) GPURUN_VERBOSE=1;; + -vv) GPURUN_VERBOSE=2;; + -m) _use_numactl_membind=1;; + -md) shift; _devices_per_mdset=$1; _uses_multi_device=1;; + -nr) _use_numactl_rocr=1;; + -nm) _use_numactl_ompi=1;; + -l) _use_numactl_localalloc=1;; + -nomask) GPURUN_MASK_POLICY="nomask";; + *) _end_gpurun_opts=1; break;; + esac + if [ "$_end_gpurun_opts" == "0" ] ; then + shift + fi +done + +if [ "$GPURUN_BYPASS" = "1" ]; then + execOnError "$@" +fi + +# Default: quiet operation +GPURUN_VERBOSE=${GPURUN_VERBOSE:-0} +# Default: create mutually exclusive sets of CUs when GPU is oversubscribed +GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex} +# switch mask policy to preset if HSA_CU_MASK was preset +[[ ! 
-z "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset +# switch mask policy to nomask for multi-device +[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask +# Offset selected device to avoid some heavily used GPUs +GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0} + +# Get environment variables set by OpenMPI +_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE +_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK +# If not OpenMPI, check for Platform MPI, MVAPICH +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_LOCALNRANKS + _local_rank_num=$MPI_LOCALRANKID +fi +# Also try MPI_COMM_WORLD env vars +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE + _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK +fi +# Check if SLURM was used +if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then + _num_local_ranks=$SLURM_CPUS_ON_NODE + _local_rank_num=$SLURM_LOCALID +fi + +if [ "$_use_numactl_rocr" == "1" ] ; then + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* + exit $? + else + $* + exit $? + fi +fi +if [ "$_use_numactl_ompi" == "1" ] ; then + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* + exit $? + else + $* + exit $? + fi +fi +# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=1 + _local_rank_num=0 +fi + +# Find location of the rocminfo binary +AOMP=${AOMP:-_AOMP_INSTALL_DIR_} +if [ ! -d $AOMP ] ; then + AOMP="_AOMP_INSTALL_DIR_" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/lib/llvm" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/llvm" +fi +if [ ! -d $AOMP ] ; then + realpath=`realpath $0` + thisdir=`dirname $realpath` + AOMP=$thisdir/.. +fi +if [ ! 
-d $AOMP ] ; then + >&2 echo "ERROR: AOMP not found at $AOMP" + >&2 echo " Please install AOMP or correctly set env-var AOMP" + execOnError "$@" +fi +ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo} +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo +if [ ! -f $ROCMINFO_BINARY ] ; then + >&2 echo "ERROR: Could not find binary for rocminfo," + >&2 echo " Please correct installation of ROCM or AOMP compiler" + execOnError "$@" +fi + +# Use rocminfo to find number number of CUs and gfxids for each GPU. +_tfile="/tmp/rinfo_out$$" +$ROCMINFO_BINARY 2>/dev/null | grep -E " Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile +_tfile_lines=`wc -l $_tfile | cut -d" " -f1` +if [ $_tfile_lines == 0 ] ; then + >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices" + rm $_tfile + execOnError "$@" +fi +# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device +_ri_all_gfxids="" +_ri_gfxids=() +_ri_cucount=() +_ri_bdfids=() +_ri_dev_idx=() +_ri_num_devices=0 +_last_cu_count=0 +_ri_uuid=() +_last_device_type_was_gpu=0 +_device_type_preset=0 +_ri_num_all_devices=0 +[ ! 
-z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1 +while read _linepair ; do + _fieldvalue=`echo $_linepair | cut -d":" -f2` + _fieldtype=`echo $_linepair | cut -d":" -f1` + if [ $_fieldvalue == "CPU" ] ; then + _last_device_type_was_gpu=0 + elif [ $_fieldvalue == "GPU" ] ; then + _last_device_type_was_gpu=1 + elif [ "$_fieldtype" == "Uuid" ] ; then + _this_uuid=$_fieldvalue + elif [ "$_fieldtype" == "BDFID" ] ; then + if [[ $_last_device_type_was_gpu == 1 ]] ; then + # _domain="$(echo "$_fieldvalue / (2^32)" | bc)" + _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)" + _devfn="$(echo "($_fieldvalue % (2^8))" | bc)" + _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")" + fi + elif [ "$_fieldtype" == "Name" ] ; then + # The device name field is last in rocminfo output, so we can create new _ri_ array entry + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _this_gfxid=`echo $_fieldvalue | cut -d'-' -f5` + ! [[ ${_ri_all_gfxids} == *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid" + _is_type_visible=1 + if [ $_device_type_preset == 1 ] ; then + _is_type_visible=0 + if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then + _is_type_visible=1 + fi + fi + if [ $_is_type_visible == 1 ] ; then + _ri_gfxids+=( $_this_gfxid ) + _ri_cucount+=( $_last_cu_count ) + _ri_bdfids+=( $_bdfidstr ) + _ri_dev_idx+=( $_ri_num_all_devices ) + _ri_uuid+=( $_this_uuid ) + _ri_num_devices=$(( $_ri_num_devices + 1 )) + fi + _ri_num_all_devices=$(( $_ri_num_all_devices + 1 )) + fi + else + # else the _fieldvalue was the number of CUs or GCPUs + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _last_cu_count=$_fieldvalue + fi + fi +done < $_tfile +rm $_tfile + +if [ $_ri_num_devices == 0 ] ; then + if [ $_local_rank_num == 0 ] ; then + if [ $_device_type_preset == 1 ] ; then + >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES." 
+ >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" + else + >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY" + fi + if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then + >&2 echo " ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES" + >&2 echo " Consider unset ROCR_VISIBLE_DEVICES and let gpurun set it correctly." + fi + execOnError "$@" + else + execOnError "$@" + fi +fi + +# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per +# device arrays indexed by device num. The arrays are _ss_cpulist _ss_bdfids, +# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information +# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above +# by scanning output from rocminfo. +_sysdevdir="/sys/bus/pci/devices" +_ss_num_devices=0 +_ss_cpulist=() +_ss_bdfid=() +_ss_numanode=() +_ss_uuid=() +_ss_gfxid=() +_ss_cucount=() +for _devid in `ls $_sysdevdir` ; do + if [ -f $_sysdevdir/$_devid/device ] ; then + _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'` + if [ ! -z $_driver_name ] ; then + if [ $_driver_name == "DRIVER=amdgpu" ] ; then + _numa_node=`cat $_sysdevdir/$_devid/numa_node` + [ "$_numa_node" == "-1" ] && _numa_node=0 + _this_uuid=0 + if [ -f $_sysdevdir/$_devid/unique_id ] ; then + _this_uuid=`cat $_sysdevdir/$_devid/unique_id` + if [ -z $_this_uuid ] ; then + _this_uuid=0 + _has_unique_id_file=0 + else + _this_uuid="GPU-$_this_uuid" + _has_unique_id_file=1 + fi + fi + _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist` + _match_uuid_count=0 + for _ri_i in ${!_ri_bdfids[@]} ; do + _ss_value=$_this_uuid + _ri_value=${_ri_uuid[$_ri_i]} + if [ $_ss_value == $_ri_value ] ; then + _match_uuid_count=$(( $_match_uuid_count + 1 )) + fi + done + # Search _ri_ arrays for matching uuids or matching bdfids. 
+ for _ri_i in ${!_ri_bdfids[@]} ; do + if [ "$_has_unique_id_file" == "1" ] ; then + _ss_value=$_this_uuid + _ri_value=${_ri_uuid[$_ri_i]} + elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then + # Under Hyper-V, we may see a zero BDFID. Fall back to UUID. + _ss_value=$_devid + _ri_value=$_devid + else + _ss_value=$_devid + _ri_value="0000:${_ri_bdfids[$_ri_i]}.0" + fi + if [ $_ss_value == $_ri_value ] ; then + if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then + # Some GPUs do not have unique_id or TPX mode creates multiple + # identical uuids, so use device index for RVD + _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} ) + else + _ss_uuid+=( $_this_uuid ) + fi + _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} ) + _ss_cucount+=( ${_ri_cucount[$_ri_i]} ) + _ss_bdfid+=( $_devid ) + _ss_numanode+=( $_numa_node ) + _ss_cpulist+=( $_this_cpulist ) + _ss_num_devices=$(( $_ss_num_devices + 1 )) + fi + done + fi + fi + fi +done + +if [[ $_ss_num_devices -lt 1 ]] ; then + if [ $_device_type_preset == 1 ] ; then + >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES." + >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" + else + >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir." + fi + execOnError "$@" +fi + +# check for taskset or numactl cmd +if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then + _launch_process_cmd_binary=`which numactl` + if [ $? != 0 ] ; then + >&2 echo "ERROR: The -m (membind) or -l (localalloc) require numactl to be installed." + execOnError "$@" + fi +else + _launch_process_cmd_binary=`which taskset` + if [ $? != 0 ] ; then + >&2 echo "ERROR: $0 requires the taskset command to be installed." + execOnError "$@" + fi +fi +if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then + >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored." 
+ _use_numactl_membind=0 +fi + +_utilized_devices=$_ri_num_devices +[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks + +# Calculate number of GPUs to use to evenly spread ranks across GPUs. +# An rplace is a set of CUs that will be used for a rank. +# The number of rplaces must be at least the number of ranks. +_uncovered_ranks=$(( $_num_local_ranks % $_utilized_devices )) +_number_of_rplaces_per_GPU=$(( $_num_local_ranks / $_utilized_devices )) +if [ $_uncovered_ranks != 0 ] ; then + # If _num_local_ranks not divisible by number of GPUs, + # then add an extra rplace per GPU to make room for remainder. + _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 )) +fi +if [ $GPURUN_MASK_POLICY == "mutex" ] ; then + # For mutex policy, adjacent ranks are assigned to the same device. + _rplace_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU )) + # Some users want to avoid dev 0 etc, by setting GPURUN_DEVICE_BIAS + _device_num=$(( ( $_rplace_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) +else + # for mask policies nomask or preset, adjacent ranks are assigned to + # different GPUs and oversubscribed ranks are assigned round robin + _device_num=$(( ( $_local_rank_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) +fi + +_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) +if [ $_num_local_ranks -gt $_node_cus ] ; then + >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks " + execOnError "$@" +fi + +if [ $_uses_multi_device == 1 ]; then + # Enforce some rules on the use of -md option + # Note -md forces GPURUN_MASK_POLICY=nomask + if [[ ! 
-z $ROCR_VISIBLE_DEVICES ]] ; then + >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode" + execOnError "$@" + fi + if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then + >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)" + execOnError "$@" + fi + _md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset )) + if [ $_md_total_devices -gt $_ri_num_devices ] && [ $_local_rank_num == 0 ] ; then + printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices > than available devices ($_ri_num_devices)\n Some multi-device sets will overlap.\n" >&2 + fi + _md_device_set_start=$(( ( $_local_rank_num * $_devices_per_mdset ) % $_ri_num_devices)) + _md_device_set_end=$(( $_md_device_set_start + $_devices_per_mdset - 1 )) + + # merge entries for this mdset from per device arrays + _md_bdfs="" + _md_cpus="" + _md_nns="" + _md_uuids="" + _md_dev_idxs="" + _sep="" + for i in `seq $_md_device_set_start $_md_device_set_end` ; do + _dev_index=$i + # handle index wrap around number of devices + [ $i -ge $_ri_num_devices ] && _dev_index=$(( $i % $_ri_num_devices )) + _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]} + _new_nn=${_ss_numanode[$_dev_index]} + SAVEIFS=$IFS + IFS="," + _found=0 + for _existing_nn in $_md_nns ; do + [ $_existing_nn == $_new_nn ] && _found=1 + done + IFS=$SAVEIFS + if [ $_found == 0 ] ; then + # only add new numa node and cpulist, if not already in the md set + _md_nns+=$_sep$_new_nn + _md_cpus+=$_sep${_ss_cpulist[$_dev_index]} + fi + _md_uuids+=$_sep${_ss_uuid[$_dev_index]} + _md_dev_idxs+=$_sep$_dev_index + _sep="," + done + _device_num=$_md_device_set_start +fi + +_available_CUs_per_device=${_ss_cucount[$_device_num]} +_gfxid=${_ss_gfxid[$_device_num]} + +_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) +if [ $_num_local_ranks -gt $_node_cus ] ; then + >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks 
" + execOnError "$@" +fi + +_utilized_CUs_per_device=$_available_CUs_per_device +_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) +# Lower utilized CUs till divisible by number of rplaces per GPU +while [ $_rem2 != 0 ] ; do + _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - 1 )) + _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) +done +_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU )) + +# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0 +if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then + if [ $_uses_multi_device == 0 ] ; then + _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device )) + _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_ri_num_devices )) + _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks )) + _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) + _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace )) + _utilization=$(( ( $_used_cus * 100 ) / $_node_cus )) + if ! 
[ $_ri_num_devices -gt $_num_local_ranks ] ; then + if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then + _extra_diags=true + fi + fi + >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" + >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" + >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" + [ $_extra_diags ] && \ + >&2 echo "-- USED GPUS: $(( $_ri_num_devices - $_wasted_GPUs ))" + [ $_extra_diags ] && \ + >&2 echo "-- UNUSED GPUS: $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) " + [ $_extra_diags ] && echo + >&2 echo "- RPLACEs PER NODE: $_total_GPU_rplaces" + >&2 echo "- RPLACEs PER GPU: $_number_of_rplaces_per_GPU" + [ $_extra_diags ] && \ + >&2 echo "-- USED RPLACEs: $_num_local_ranks (RANKS)" + [ $_extra_diags ] && \ + >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces" ; \ + >&2 echo "- gfxids ${_ss_gfxid[@]}" + >&2 echo "- CUs PER GPU: ${_ss_cucount[@]}" + [ $_extra_diags ] && \ + >&2 echo "-- USED on CUs RANK0: $_utilized_CUs_per_device" + [ $_extra_diags ] && \ + >&2 echo "-- UNUSED CUs RANK0 : $_wasted_CUs_on_each_GPU" + >&2 echo "- CUs per RPLACE RANK0:$_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)" + >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU" + if [[ ! -z "$ROCR_VISIBLE_DEVICES" ]] ; then + >&2 echo "- Preset ROCR_VISIBLE_DEVICES: $ROCR_VISIBLE_DEVICES" + fi + if [[ ! -z "$HSA_CU_MASK" ]] ; then + # node utilizatino could be incorrect with preset cumask. 
+ >&2 echo "- Preset HSA_CU_MASK: $HSA_CU_MASK" + else + >&2 echo "- NODE UTILIZATION: $_utilization %" + fi + else + >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" + >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" + >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" + >&2 echo "- DEVS PER RANK: $_devices_per_mdset" + >&2 echo "- MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)" + _md_utilization=$(( $_md_total_devices * 100 / $_ri_num_devices )) + >&2 echo "- NODE UTILIZATION: $_md_utilization %" + fi +fi +# --- END OF DIAGNOSTIC BLOCK + +if [ $_CUs_per_rplace != $_available_CUs_per_device ] && [ $GPURUN_MASK_POLICY == "mutex" ] ; then + # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace + _bits_to_set=$_CUs_per_rplace + # This formula keeps adjacent ranks on same GPU which should be preferred + _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set) - ( _device_num * $_utilized_CUs_per_device) )) + # use bc because these values can be very large + _unshifted_bits=`echo "(2 ^ $_bits_to_set) - 1" | bc` + _mask=`echo "obase=16; $_unshifted_bits * (2 ^ $_bits_to_shift)" | bc` + # Calculate the number of leading zeros needed for this mask + _lz=$(( ( $_utilized_CUs_per_device / 4 ) - ${#_mask} + 1 )) + for i in `seq 1 $_lz` ; do + _mask="0$_mask" + done + _mask="0x$_mask" +fi + +_launch_process_cmd="" +if [ $_uses_multi_device == 0 ] ; then + # retrieve scanned info from per device arrays + _bdfidstrc=${_ss_bdfid[$_device_num]} + NUMANODE=${_ss_numanode[$_device_num]} + _list_of_cpu_cores=${_ss_cpulist[$_device_num]} + _this_uuid=${_ss_uuid[$_device_num]} +else + # Use multi-device values + _bdfidstrc=$_md_bdfs + NUMANODE=$_md_nns + _list_of_cpu_cores=$_md_cpus + _this_uuid=$_md_uuids + _launch_process_cmd+="env LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset " +fi +if [ "$_use_numactl_localalloc" == "1" ] ; then + _launch_process_cmd+="$_launch_process_cmd_binary --localalloc --cpunodebind=$NUMANODE" +elif [ "$_use_numactl_membind" == "1" ] 
; then + _launch_process_cmd+="$_launch_process_cmd_binary --membind=$NUMANODE --cpunodebind=$NUMANODE" +else + _launch_process_cmd+="$_launch_process_cmd_binary -c $_list_of_cpu_cores" +fi + +# If gpurun was not given command to execute, then dont run _launch_process_cmd +[ "$*" == "" ] && _launch_process_cmd="" + +# only set ROCR_VISIBLE_DEVICES if not already set +if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then + export ROCR_VISIBLE_DEVICES=$_this_uuid + _log_word="RVD" +else + _log_word="PRESET-RVD" +fi + +export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace + +# - Limit HSA queues when multiple ranks per GPU +if [ $_number_of_rplaces_per_GPU != 1 ] ; then + # Only set these env controls if not set by caller + [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1 + [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1 +fi + +[[ ! -z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] && \ + [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK" + +if [ $_CUs_per_rplace == $_available_CUs_per_device ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then + # --- HSA_CU_MASK is NOT USED in this code block, This code block covers all multi-device execution. 
+ if [ "$GPURUN_VERBOSE" != "0" ] ; then + if [ $_uses_multi_device == 1 ] ; then + printf "RANK:$_local_rank_num D:$_md_dev_idxs NNs:$_md_nns GPUTYPE:$_gfxid $_log_word:$ROCR_VISIBLE_DEVICES\n CMD:$_launch_process_cmd $*\n" >&2 + else + printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d GPUTYPE:$_gfxid $_log_word:%s \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE $ROCR_VISIBLE_DEVICES "$_launch_process_cmd" >&2 + fi + fi + $_launch_process_cmd $* + # --- end code block +else + # --- HSA_CU_MASK is required in this code block, assumes no multi-device + if [[ -z "$HSA_CU_MASK" ]] ; then + # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0: + export HSA_CU_MASK=0:$_mask + else + # use preset mask + _mask=$HSA_CU_MASK + fi + if [ "$GPURUN_VERBOSE" != "0" ] ; then + printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d $_gfxid CUMASK:$_mask $_log_word:$ROCR_VISIBLE_DEVICES \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE "$_launch_process_cmd" >&2 + fi + HSA_CU_MASK=0:$_mask \ + $_launch_process_cmd $* + # --- end code block +fi +exit $?