From 8503c4925af981207c6b76ddcff16a30a2cc2f2e Mon Sep 17 00:00:00 2001 From: koparasy Date: Fri, 14 Nov 2025 17:26:59 -0800 Subject: [PATCH 1/6] Rename variables --- clang/lib/CIR/FrontendAction/CIRGenAction.cpp | 91 +++++++++---------- 1 file changed, 41 insertions(+), 50 deletions(-) diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp index 8b1a2512613d..05cf7b853da8 100644 --- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp +++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp @@ -35,6 +35,7 @@ #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/MultiplexConsumer.h" #include "clang/Lex/Preprocessor.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -47,11 +48,10 @@ #include "llvm/LTO/LTOBackend.h" #include "llvm/Linker/Linker.h" #include "llvm/Pass.h" -#include "llvm/ADT/SmallString.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/Path.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/Timer.h" #include "llvm/Support/ToolOutputFile.h" @@ -112,12 +112,12 @@ class CIRGenConsumer : public clang::ASTConsumer { CIRGenAction::OutputType Action; - CompilerInstance &CompilerInstance; - DiagnosticsEngine &DiagnosticsEngine; - [[maybe_unused]] const HeaderSearchOptions &HeaderSearchOptions; - CodeGenOptions &CodeGenOptions; - [[maybe_unused]] const TargetOptions &TargetOptions; - [[maybe_unused]] const LangOptions &LangOptions; + CompilerInstance &CI; + DiagnosticsEngine &Diags; + [[maybe_unused]] const HeaderSearchOptions &HeaderSearchOpts; + CodeGenOptions &CodeGenOpts; + [[maybe_unused]] const TargetOptions &TargetOpts; + [[maybe_unused]] const LangOptions &LangOpts; const FrontendOptions &FeOptions; std::string InputFileName; @@ -128,25 +128,21 @@ class CIRGenConsumer : public clang::ASTConsumer { std::unique_ptr Gen; public: - CIRGenConsumer(CIRGenAction::OutputType Action, - class CompilerInstance &CompilerInstance, - class DiagnosticsEngine &DiagnosticsEngine, + CIRGenConsumer(CIRGenAction::OutputType Action, class CompilerInstance &CI, + class DiagnosticsEngine &Diags, IntrusiveRefCntPtr VFS, - const class HeaderSearchOptions &HeaderSearchOptions, - class CodeGenOptions &CodeGenOptions, - const class TargetOptions &TargetOptions, - const class LangOptions &LangOptions, + const class HeaderSearchOptions &HeaderSearchOpts, + class CodeGenOptions &CodeGenOpts, + const class TargetOptions &TargetOpts, + const class LangOptions &LangOpts, const FrontendOptions &FeOptions, StringRef InputFile, std::unique_ptr Os) - : Action(Action), CompilerInstance(CompilerInstance), - DiagnosticsEngine(DiagnosticsEngine), - HeaderSearchOptions(HeaderSearchOptions), - CodeGenOptions(CodeGenOptions), TargetOptions(TargetOptions), - LangOptions(LangOptions), FeOptions(FeOptions), - InputFileName(InputFile.str()), - OutputStream(std::move(Os)), FS(VFS), - Gen(std::make_unique(DiagnosticsEngine, std::move(VFS), - CodeGenOptions)) {} + : Action(Action), CI(CI), Diags(Diags), + HeaderSearchOpts(HeaderSearchOpts), CodeGenOpts(CodeGenOpts), + TargetOpts(TargetOpts), LangOpts(LangOpts), FeOptions(FeOptions), + InputFileName(InputFile.str()), OutputStream(std::move(Os)), FS(VFS), + Gen(std::make_unique(Diags, std::move(VFS), + CodeGenOpts)) {} void Initialize(ASTContext &Ctx) override { assert(!AstContext && "initialized multiple times"); @@ -221,12 +217,12 @@ class CIRGenConsumer : public clang::ASTConsumer { FeOptions.ClangIRLifetimeCheck, LifetimeOpts, FeOptions.ClangIRIdiomRecognizer, IdiomRecognizerOpts, FeOptions.ClangIRLibOpt, LibOptOpts, PassOptParsingFailure, - CodeGenOptions.OptimizationLevel > 0, FlattenCir, + CodeGenOpts.OptimizationLevel > 0, FlattenCir, !FeOptions.ClangIRDirectLowering, EnableCcLowering, FeOptions.ClangIREnableMem2Reg) .failed()) { if (!PassOptParsingFailure.empty()) { - auto D = DiagnosticsEngine.Report(diag::err_drv_cir_pass_opt_parsing); + auto D = Diags.Report(diag::err_drv_cir_pass_opt_parsing); D << PassOptParsingFailure; } else llvm::report_fatal_error("CIR codegen: MLIR pass manager fails " @@ -269,24 +265,21 @@ class CIRGenConsumer : public clang::ASTConsumer { } } - bool EmitCIR = LangOptions.EmitCIRToFile || FeOptions.EmitClangIRFile || - !LangOptions.CIRFile.empty() || - !FeOptions.ClangIRFile.empty(); + bool EmitCIR = LangOpts.EmitCIRToFile || FeOptions.EmitClangIRFile || + !LangOpts.CIRFile.empty() || !FeOptions.ClangIRFile.empty(); if (EmitCIR) { std::unique_ptr CIRStream; llvm::SmallString<128> DefaultPath; if (!FeOptions.ClangIRFile.empty()) { - CIRStream = CompilerInstance.createOutputFile( - FeOptions.ClangIRFile, - /*Binary=*/false, - /*RemoveFileOnSignal=*/true, - /*UseTemporary=*/true); - } else if (!LangOptions.CIRFile.empty()) { - CIRStream = CompilerInstance.createOutputFile( - LangOptions.CIRFile, - /*Binary=*/false, - /*RemoveFileOnSignal=*/true, - /*UseTemporary=*/true); + CIRStream = CI.createOutputFile(FeOptions.ClangIRFile, + /*Binary=*/false, + /*RemoveFileOnSignal=*/true, + /*UseTemporary=*/true); + } else if (!LangOpts.CIRFile.empty()) { + CIRStream = CI.createOutputFile(LangOpts.CIRFile, + /*Binary=*/false, + /*RemoveFileOnSignal=*/true, + /*UseTemporary=*/true); } else { if (!FeOptions.OutputFile.empty() && FeOptions.OutputFile != "-") { DefaultPath = FeOptions.OutputFile; @@ -299,11 +292,10 @@ class CIRGenConsumer : public clang::ASTConsumer { DefaultPath = "clangir-output"; } llvm::sys::path::replace_extension(DefaultPath, "cir"); - CIRStream = CompilerInstance.createOutputFile( - DefaultPath, - /*Binary=*/false, - /*RemoveFileOnSignal=*/true, - /*UseTemporary=*/true); + CIRStream = CI.createOutputFile(DefaultPath, + /*Binary=*/false, + /*RemoveFileOnSignal=*/true, + /*UseTemporary=*/true); } if (CIRStream) { @@ -354,7 +346,7 @@ class CIRGenConsumer : public clang::ASTConsumer { case CIRGenAction::OutputType::EmitAssembly: { llvm::LLVMContext LlvmCtx; bool DisableDebugInfo = - CodeGenOptions.getDebugInfo() == llvm::codegenoptions::NoDebugInfo; + CodeGenOpts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo; auto LlvmModule = lowerFromCIRToLLVMIR( FeOptions, MlirMod, std::move(MlirCtx), LlvmCtx, FeOptions.ClangIRDisableCIRVerifier, @@ -362,10 +354,9 @@ class CIRGenConsumer : public clang::ASTConsumer { BackendAction BackendAction = getBackendActionFromOutputType(Action); - emitBackendOutput(CompilerInstance, CodeGenOptions, - C.getTargetInfo().getDataLayoutString(), - LlvmModule.get(), BackendAction, FS, - std::move(OutputStream)); + emitBackendOutput( + CI, CodeGenOpts, C.getTargetInfo().getDataLayoutString(), + LlvmModule.get(), BackendAction, FS, std::move(OutputStream)); break; } case CIRGenAction::OutputType::None: From 529925a13a69ec649c065e9142d0c42f86c6272b Mon Sep 17 00:00:00 2001 From: Yue Huang Date: Tue, 11 Nov 2025 17:48:37 +0000 Subject: [PATCH 2/6] [CIR][CUDA] Register __device__ global variables --- .../clang/CIR/Dialect/IR/CIRDataLayout.h | 4 + .../Dialect/Transforms/LoweringPrepare.cpp | 766 ++++++++++-------- clang/test/CIR/CodeGen/CUDA/registration.cu | 15 + 3 files changed, 447 insertions(+), 338 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h index e6ad589f3a57..cc0c398c329a 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h +++ b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h @@ -129,6 +129,10 @@ class CIRDataLayout { mlir::Type getCharType(mlir::MLIRContext *ctx) const { return typeSizeInfo.getCharType(ctx); } + + mlir::Type getSizeType(mlir::MLIRContext *ctx) const { + return typeSizeInfo.getSizeType(ctx); + } }; /// Used to lazily calculate structure layout information for a target machine, diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index f9b9da07be3b..e87d939eef48 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -127,6 +127,8 @@ struct LoweringPreparePass : public LoweringPrepareBase { // Maps CUDA kernel name to device stub function. llvm::StringMap cudaKernelMap; + // Maps CUDA device-side variable name to host-side (shadow) GlobalOp. + llvm::StringMap cudaVarMap; void buildCUDAModuleCtor(); std::optional buildCUDAModuleDtor(); @@ -135,6 +137,8 @@ struct LoweringPreparePass : public LoweringPrepareBase { void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc); + void buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc); /// /// AST related @@ -1261,8 +1265,7 @@ std::optional LoweringPreparePass::buildCUDARegisterGlobals() { builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock()); buildCUDARegisterGlobalFunctions(builder, regGlobalFunc); - - // TODO(cir): registration for global variables. + buildCUDARegisterVars(builder, regGlobalFunc); ReturnOp::create(builder, loc); return regGlobalFunc; @@ -1407,395 +1410,482 @@ std::optional LoweringPreparePass::buildHIPModuleDtor() { cir::ReturnOp::create(builder, loc); } return dtor; -} -std::optional LoweringPreparePass::buildCUDAModuleDtor() { - if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) - return {}; + void LoweringPreparePass::buildCUDARegisterVars( + cir::CIRBaseBuilderTy & builder, FuncOp regGlobalFunc) { + auto loc = theModule.getLoc(); + auto cudaPrefix = getCUDAPrefix(astCtx); - std::string prefix = getCUDAPrefix(astCtx); + auto voidTy = VoidType::get(&getContext()); + auto voidPtrTy = PointerType::get(voidTy); + auto voidPtrPtrTy = PointerType::get(voidPtrTy); + auto intTy = datalayout->getIntType(&getContext()); + auto charTy = datalayout->getCharType(&getContext()); + auto sizeTy = datalayout->getSizeType(&getContext()); - auto voidTy = VoidType::get(&getContext()); - auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy)); + // Extract the GPU binary handle argument. + mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); - auto loc = theModule.getLoc(); + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(theModule.getBody()); - cir::CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointToStart(theModule.getBody()); + // Declare CUDA internal function: + // void __cudaRegisterVar( + // void **fatbinHandle, + // char *hostVarName, + // char *deviceVarName, + // const char *deviceVarName, + // int isExtern, size_t varSize, + // int isConstant, int zero + // ); + // Similar to the registration of global functions, OG does not care about + // pointer types. They will generate the same IR anyway. + + FuncOp cudaRegisterVar = buildRuntimeFunction( + globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterVar"), loc, + FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, + sizeTy, intTy, intTy}, + voidTy)); + + unsigned int count = 0; + auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { + auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); + + auto tmpString = GlobalOp::create( + globalBuilder, loc, (".str" + str + std::to_string(count++)).str(), + strType, /*isConstant=*/true, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + + // We must make the string zero-terminated. + tmpString.setInitialValueAttr(ConstArrayAttr::get( + strType, StringAttr::get(&getContext(), str + "\0"))); + tmpString.setPrivate(); + return tmpString; + }; + + for (auto &[deviceSideName, global] : cudaVarMap) { + GlobalOp deviceNameStr = makeConstantString(deviceSideName); + mlir::Value deviceNameValue = builder.createBitcast( + builder.createGetGlobal(deviceNameStr), voidPtrTy); + + GlobalOp hostNameStr = makeConstantString(global.getName()); + mlir::Value hostNameValue = builder.createBitcast( + builder.createGetGlobal(hostNameStr), voidPtrTy); + + // Every device variable that has a shadow on host will not be extern. + // See CIRGenModule::emitGlobalVarDefinition. + auto isExtern = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + llvm::TypeSize size = datalayout->getTypeSizeInBits(global.getSymType()); + auto varSize = ConstantOp::create( + builder, loc, IntAttr::get(sizeTy, size.getFixedValue() / 8)); + auto isConstant = ConstantOp::create( + builder, loc, IntAttr::get(intTy, global.getConstant())); + auto zero = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + builder.createCallOp(loc, cudaRegisterVar, + {fatbinHandle, hostNameValue, deviceNameValue, + deviceNameValue, isExtern, varSize, isConstant, + zero}); + } + } - // void __cudaUnregisterFatBinary(void ** handle); - std::string unregisterFuncName = - addUnderscoredPrefix(prefix, "UnregisterFatBinary"); - FuncOp unregisterFunc = buildRuntimeFunction( - builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy)); + std::optional LoweringPreparePass::buildCUDAModuleDtor() { + if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) + return {}; - // void __cuda_module_dtor(); - // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be - // put into globalDtorList. If it were a real dtor, then it would cause double - // free above CUDA 9.2. The way to use it is to manually call atexit() at end - // of module ctor. - std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor"); - FuncOp dtor = - buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy), - GlobalLinkageKind::InternalLinkage); + std::string prefix = getCUDAPrefix(astCtx); - builder.setInsertionPointToStart(dtor.addEntryBlock()); + auto voidTy = VoidType::get(&getContext()); + auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy)); - // For dtor, we only need to call: - // __cudaUnregisterFatBinary(__cuda_gpubin_handle); + auto loc = theModule.getLoc(); - std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle"); - auto gpubinGlobal = cast(theModule.lookupSymbol(gpubinName)); - mlir::Value gpubinAddress = builder.createGetGlobal(gpubinGlobal); - mlir::Value gpubin = builder.createLoad(loc, gpubinAddress); - builder.createCallOp(loc, unregisterFunc, gpubin); - ReturnOp::create(builder, loc); + cir::CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointToStart(theModule.getBody()); - return dtor; -} + // void __cudaUnregisterFatBinary(void ** handle); + std::string unregisterFuncName = + addUnderscoredPrefix(prefix, "UnregisterFatBinary"); + FuncOp unregisterFunc = + buildRuntimeFunction(builder, unregisterFuncName, loc, + FuncType::get({voidPtrPtrTy}, voidTy)); -void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op); + // void __cuda_module_dtor(); + // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be + // put into globalDtorList. If it were a real dtor, then it would cause + // double free above CUDA 9.2. The way to use it is to manually call + // atexit() at end of module ctor. + std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor"); + FuncOp dtor = + buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy), + GlobalLinkageKind::InternalLinkage); + + builder.setInsertionPointToStart(dtor.addEntryBlock()); + + // For dtor, we only need to call: + // __cudaUnregisterFatBinary(__cuda_gpubin_handle); + + std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle"); + auto gpubinGlobal = cast(theModule.lookupSymbol(gpubinName)); + mlir::Value gpubinAddress = builder.createGetGlobal(gpubinGlobal); + mlir::Value gpubin = builder.createLoad(loc, gpubinAddress); + builder.createCallOp(loc, unregisterFunc, gpubin); + ReturnOp::create(builder, loc); + + return dtor; + } - assert(astCtx && "AST context is not available during lowering prepare"); - auto loweredValue = cxxABI->lowerDynamicCast(builder, *astCtx, op); + void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op); - op.replaceAllUsesWith(loweredValue); - op.erase(); -} + assert(astCtx && "AST context is not available during lowering prepare"); + auto loweredValue = cxxABI->lowerDynamicCast(builder, *astCtx, op); -static void lowerArrayDtorCtorIntoLoop(CIRBaseBuilderTy &builder, - mlir::Operation *op, mlir::Type eltTy, - mlir::Value arrayAddr, uint64_t arrayLen, - bool isCtor) { - // Generate loop to call into ctor/dtor for every element. - auto loc = op->getLoc(); + op.replaceAllUsesWith(loweredValue); + op.erase(); + } - // TODO: instead of fixed integer size, create alias for PtrDiffTy and unify - // with CIRGen stuff. - auto ptrDiffTy = - cir::IntType::get(builder.getContext(), 64, /*signed=*/false); - uint64_t endOffset = isCtor ? arrayLen : arrayLen - 1; - mlir::Value endOffsetVal = cir::ConstantOp::create( - builder, loc, ptrDiffTy, cir::IntAttr::get(ptrDiffTy, endOffset)); - - auto begin = cir::CastOp::create(builder, loc, eltTy, - cir::CastKind::array_to_ptrdecay, arrayAddr); - mlir::Value end = - cir::PtrStrideOp::create(builder, loc, eltTy, begin, endOffsetVal); - mlir::Value start = isCtor ? begin : end; - mlir::Value stop = isCtor ? end : begin; - - auto tmpAddr = builder.createAlloca( - loc, /*addr type*/ builder.getPointerTo(eltTy), - /*var type*/ eltTy, "__array_idx", clang::CharUnits::One()); - builder.createStore(loc, start, tmpAddr); - - auto loop = builder.createDoWhile( - loc, - /*condBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); - mlir::Type boolTy = cir::BoolType::get(b.getContext()); - auto cmp = cir::CmpOp::create(builder, loc, boolTy, cir::CmpOpKind::ne, - currentElement, stop); - builder.createCondition(cmp); - }, - /*bodyBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); - - CallOp ctorCall; - op->walk([&](CallOp c) { ctorCall = c; }); - assert(ctorCall && "expected ctor call"); - - cir::ConstantOp stride; - if (isCtor) - stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, - cir::IntAttr::get(ptrDiffTy, 1)); - else - stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, - cir::IntAttr::get(ptrDiffTy, -1)); - - ctorCall->moveBefore(stride); - ctorCall->setOperand(0, currentElement); - - // Advance pointer and store them to temporary variable - auto nextElement = cir::PtrStrideOp::create(builder, loc, eltTy, - currentElement, stride); - builder.createStore(loc, nextElement, tmpAddr); - builder.createYield(loc); - }); - - op->replaceAllUsesWith(loop); - op->erase(); -} + static void lowerArrayDtorCtorIntoLoop( + CIRBaseBuilderTy & builder, mlir::Operation * op, mlir::Type eltTy, + mlir::Value arrayAddr, uint64_t arrayLen, bool isCtor) { + // Generate loop to call into ctor/dtor for every element. + auto loc = op->getLoc(); + + // TODO: instead of fixed integer size, create alias for PtrDiffTy and unify + // with CIRGen stuff. + auto ptrDiffTy = + cir::IntType::get(builder.getContext(), 64, /*signed=*/false); + uint64_t endOffset = isCtor ? arrayLen : arrayLen - 1; + mlir::Value endOffsetVal = cir::ConstantOp::create( + builder, loc, ptrDiffTy, cir::IntAttr::get(ptrDiffTy, endOffset)); + + auto begin = cir::CastOp::create( + builder, loc, eltTy, cir::CastKind::array_to_ptrdecay, arrayAddr); + mlir::Value end = + cir::PtrStrideOp::create(builder, loc, eltTy, begin, endOffsetVal); + mlir::Value start = isCtor ? begin : end; + mlir::Value stop = isCtor ? end : begin; + + auto tmpAddr = builder.createAlloca( + loc, /*addr type*/ builder.getPointerTo(eltTy), + /*var type*/ eltTy, "__array_idx", clang::CharUnits::One()); + builder.createStore(loc, start, tmpAddr); + + auto loop = builder.createDoWhile( + loc, + /*condBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); + mlir::Type boolTy = cir::BoolType::get(b.getContext()); + auto cmp = cir::CmpOp::create( + builder, loc, boolTy, cir::CmpOpKind::ne, currentElement, stop); + builder.createCondition(cmp); + }, + /*bodyBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); + + CallOp ctorCall; + op->walk([&](CallOp c) { ctorCall = c; }); + assert(ctorCall && "expected ctor call"); + + cir::ConstantOp stride; + if (isCtor) + stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, + cir::IntAttr::get(ptrDiffTy, 1)); + else + stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, + cir::IntAttr::get(ptrDiffTy, -1)); + + ctorCall->moveBefore(stride); + ctorCall->setOperand(0, currentElement); + + // Advance pointer and store them to temporary variable + auto nextElement = cir::PtrStrideOp::create(builder, loc, eltTy, + currentElement, stride); + builder.createStore(loc, nextElement, tmpAddr); + builder.createYield(loc); + }); + + op->replaceAllUsesWith(loop); + op->erase(); + } -void LoweringPreparePass::lowerArrayDtor(ArrayDtor op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); + void LoweringPreparePass::lowerArrayDtor(ArrayDtor op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); - auto eltTy = op->getRegion(0).getArgument(0).getType(); - auto arrayLen = - mlir::cast(op.getAddr().getType().getPointee()).getSize(); - lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, false); -} + auto eltTy = op->getRegion(0).getArgument(0).getType(); + auto arrayLen = + mlir::cast(op.getAddr().getType().getPointee()) + .getSize(); + lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, + false); + } -static std::string getGlobalVarNameForConstString(cir::StoreOp op, - uint64_t &cnt) { - llvm::SmallString<64> finalName; - llvm::raw_svector_ostream Out(finalName); + static std::string getGlobalVarNameForConstString(cir::StoreOp op, + uint64_t &cnt) { + llvm::SmallString<64> finalName; + llvm::raw_svector_ostream Out(finalName); - Out << "__const."; - if (auto fnOp = op->getParentOfType()) { - Out << fnOp.getSymNameAttr().getValue() << "."; - } else { - Out << "module."; - } + Out << "__const."; + if (auto fnOp = op->getParentOfType()) { + Out << fnOp.getSymNameAttr().getValue() << "."; + } else { + Out << "module."; + } - auto allocaOp = op.getAddr().getDefiningOp(); - if (allocaOp && !allocaOp.getName().empty()) - Out << allocaOp.getName(); - else - Out << cnt++; - return finalName.c_str(); -} + auto allocaOp = op.getAddr().getDefiningOp(); + if (allocaOp && !allocaOp.getName().empty()) + Out << allocaOp.getName(); + else + Out << cnt++; + return finalName.c_str(); + } -void LoweringPreparePass::lowerToMemCpy(StoreOp op) { - // Now that basic filter is done, do more checks before proceding with the - // transformation. - auto cstOp = op.getValue().getDefiningOp(); - if (!cstOp) - return; + void LoweringPreparePass::lowerToMemCpy(StoreOp op) { + // Now that basic filter is done, do more checks before proceding with the + // transformation. + auto cstOp = op.getValue().getDefiningOp(); + if (!cstOp) + return; - if (!isa(cstOp.getValue())) - return; - CIRBaseBuilderTy builder(getContext()); + if (!isa(cstOp.getValue())) + return; + CIRBaseBuilderTy builder(getContext()); - // Create a global which is initialized with the attribute that is either a - // constant array or record. - assert(!cir::MissingFeatures::unnamedAddr() && "NYI"); - builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); - std::string globalName = - getGlobalVarNameForConstString(op, annonGlobalConstArrayCount); - cir::GlobalOp globalCst = buildRuntimeVariable( - builder, globalName, op.getLoc(), op.getValue().getType(), - cir::GlobalLinkageKind::PrivateLinkage); - globalCst.setInitialValueAttr(cstOp.getValue()); - globalCst.setConstant(true); - - // Transform the store into a cir.copy. - builder.setInsertionPointAfter(op.getOperation()); - cir::CopyOp memCpy = - builder.createCopy(op.getAddr(), builder.createGetGlobal(globalCst)); - op->replaceAllUsesWith(memCpy); - op->erase(); - if (cstOp->getResult(0).getUsers().empty()) - cstOp->erase(); -} + // Create a global which is initialized with the attribute that is either a + // constant array or record. + assert(!cir::MissingFeatures::unnamedAddr() && "NYI"); + builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); + std::string globalName = + getGlobalVarNameForConstString(op, annonGlobalConstArrayCount); + cir::GlobalOp globalCst = buildRuntimeVariable( + builder, globalName, op.getLoc(), op.getValue().getType(), + cir::GlobalLinkageKind::PrivateLinkage); + globalCst.setInitialValueAttr(cstOp.getValue()); + globalCst.setConstant(true); + + // Transform the store into a cir.copy. + builder.setInsertionPointAfter(op.getOperation()); + cir::CopyOp memCpy = + builder.createCopy(op.getAddr(), builder.createGetGlobal(globalCst)); + op->replaceAllUsesWith(memCpy); + op->erase(); + if (cstOp->getResult(0).getUsers().empty()) + cstOp->erase(); + } -void LoweringPreparePass::lowerArrayCtor(ArrayCtor op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); + void LoweringPreparePass::lowerArrayCtor(ArrayCtor op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); - auto eltTy = op->getRegion(0).getArgument(0).getType(); - auto arrayLen = - mlir::cast(op.getAddr().getType().getPointee()).getSize(); - lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, true); -} + auto eltTy = op->getRegion(0).getArgument(0).getType(); + auto arrayLen = + mlir::cast(op.getAddr().getType().getPointee()) + .getSize(); + lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, + true); + } -void LoweringPreparePass::lowerStdFindOp(StdFindOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createCallOp( - op.getLoc(), op.getOriginalFnAttr(), op.getType(), - mlir::ValueRange{op.getOperand(0), op.getOperand(1), op.getOperand(2)}); + void LoweringPreparePass::lowerStdFindOp(StdFindOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createCallOp( + op.getLoc(), op.getOriginalFnAttr(), op.getType(), + mlir::ValueRange{op.getOperand(0), op.getOperand(1), op.getOperand(2)}); - op.replaceAllUsesWith(call); - op.erase(); -} + op.replaceAllUsesWith(call); + op.erase(); + } -void LoweringPreparePass::lowerIterBeginOp(IterBeginOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), - op.getType(), op.getOperand()); + void LoweringPreparePass::lowerIterBeginOp(IterBeginOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), + op.getType(), op.getOperand()); - op.replaceAllUsesWith(call); - op.erase(); -} + op.replaceAllUsesWith(call); + op.erase(); + } -void LoweringPreparePass::lowerIterEndOp(IterEndOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), - op.getType(), op.getOperand()); + void LoweringPreparePass::lowerIterEndOp(IterEndOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), + op.getType(), op.getOperand()); - op.replaceAllUsesWith(call); - op.erase(); -} + op.replaceAllUsesWith(call); + op.erase(); + } -void LoweringPreparePass::lowerThrowOp(ThrowOp op) { - CIRBaseBuilderTy builder(getContext()); + void LoweringPreparePass::lowerThrowOp(ThrowOp op) { + CIRBaseBuilderTy builder(getContext()); - if (op.rethrows()) { - auto voidTy = cir::VoidType::get(builder.getContext()); - auto fnType = cir::FuncType::get({}, voidTy); - auto fnName = "__cxa_rethrow"; + if (op.rethrows()) { + auto voidTy = cir::VoidType::get(builder.getContext()); + auto fnType = cir::FuncType::get({}, voidTy); + auto fnName = "__cxa_rethrow"; - builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); - FuncOp f = buildRuntimeFunction(builder, fnName, op.getLoc(), fnType); + builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); + FuncOp f = buildRuntimeFunction(builder, fnName, op.getLoc(), fnType); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createTryCallOp(op.getLoc(), f, {}); + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createTryCallOp(op.getLoc(), f, {}); - op->replaceAllUsesWith(call); - op->erase(); + op->replaceAllUsesWith(call); + op->erase(); + } } -} -void LoweringPreparePass::lowerTrivialConstructorCall(cir::CallOp op) { - FuncOp funcOp = getCalledFunction(op); - if (!funcOp) - return; - Attribute astAttr = funcOp.getAstAttr(); - if (!astAttr) - return; - auto ctorDecl = dyn_cast(astAttr); - if (!ctorDecl) - return; - if (ctorDecl.isDefaultConstructor()) - return; - - if (ctorDecl.isCopyConstructor()) { - // Additional safety checks: constructor calls should have no return value - if (op.getNumResults() > 0) + void LoweringPreparePass::lowerTrivialConstructorCall(cir::CallOp op) { + FuncOp funcOp = getCalledFunction(op); + if (!funcOp) return; - auto operands = op.getOperands(); - if (operands.size() != 2) + Attribute astAttr = funcOp.getAstAttr(); + if (!astAttr) return; - // Replace the trivial copy constructor call with a copy op - CIRBaseBuilderTy builder(getContext()); - mlir::Value dest = operands[0]; - mlir::Value src = operands[1]; - builder.setInsertionPoint(op); - builder.createCopy(dest, src); - op.erase(); + auto ctorDecl = dyn_cast(astAttr); + if (!ctorDecl) + return; + if (ctorDecl.isDefaultConstructor()) + return; + + if (ctorDecl.isCopyConstructor()) { + // Additional safety checks: constructor calls should have no return value + if (op.getNumResults() > 0) + return; + auto operands = op.getOperands(); + if (operands.size() != 2) + return; + // Replace the trivial copy constructor call with a copy op + CIRBaseBuilderTy builder(getContext()); + mlir::Value dest = operands[0]; + mlir::Value src = operands[1]; + builder.setInsertionPoint(op); + builder.createCopy(dest, src); + op.erase(); + } } -} -void LoweringPreparePass::addGlobalAnnotations(mlir::Operation *op, - mlir::ArrayAttr annotations) { - auto globalValue = cast(op); - mlir::StringAttr globalValueName = globalValue.getNameAttr(); - for (auto &annot : annotations) { - llvm::SmallVector entryArray = {globalValueName, annot}; - globalAnnotations.push_back( - mlir::ArrayAttr::get(theModule.getContext(), entryArray)); + void LoweringPreparePass::addGlobalAnnotations(mlir::Operation * op, + mlir::ArrayAttr annotations) { + auto globalValue = cast(op); + mlir::StringAttr globalValueName = globalValue.getNameAttr(); + for (auto &annot : annotations) { + llvm::SmallVector entryArray = {globalValueName, + annot}; + globalAnnotations.push_back( + mlir::ArrayAttr::get(theModule.getContext(), entryArray)); + } } -} -void LoweringPreparePass::buildGlobalAnnotationValues() { - if (globalAnnotations.empty()) - return; - mlir::ArrayAttr annotationValueArray = - mlir::ArrayAttr::get(theModule.getContext(), globalAnnotations); - theModule->setAttr( - cir::CIRDialect::getGlobalAnnotationsAttrName(), - cir::GlobalAnnotationValuesAttr::get(annotationValueArray)); -} + void LoweringPreparePass::buildGlobalAnnotationValues() { + if (globalAnnotations.empty()) + return; + mlir::ArrayAttr annotationValueArray = + mlir::ArrayAttr::get(theModule.getContext(), globalAnnotations); + theModule->setAttr( + cir::CIRDialect::getGlobalAnnotationsAttrName(), + cir::GlobalAnnotationValuesAttr::get(annotationValueArray)); + } -void LoweringPreparePass::runOnOp(Operation *op) { - if (auto unary = dyn_cast(op)) { - lowerUnaryOp(unary); - } else if (auto bin = dyn_cast(op)) { - lowerBinOp(bin); - } else if (auto cast = dyn_cast(op)) { - lowerCastOp(cast); - } else if (auto complexBin = dyn_cast(op)) { - lowerComplexBinOp(complexBin); - } else if (auto threeWayCmp = dyn_cast(op)) { - lowerThreeWayCmpOp(threeWayCmp); - } else if (auto vaArgOp = dyn_cast(op)) { - lowerVAArgOp(vaArgOp); - } else if (auto deleteArrayOp = dyn_cast(op)) { - lowerDeleteArrayOp(deleteArrayOp); - } else if (auto getGlobal = dyn_cast(op)) { - lowerGlobalOp(getGlobal); - } else if (auto dynamicCast = dyn_cast(op)) { - lowerDynamicCastOp(dynamicCast); - } else if (auto stdFind = dyn_cast(op)) { - lowerStdFindOp(stdFind); - } else if (auto iterBegin = dyn_cast(op)) { - lowerIterBeginOp(iterBegin); - } else if (auto iterEnd = dyn_cast(op)) { - lowerIterEndOp(iterEnd); - } else if (auto arrayCtor = dyn_cast(op)) { - lowerArrayCtor(arrayCtor); - } else if (auto arrayDtor = dyn_cast(op)) { - lowerArrayDtor(arrayDtor); - } else if (auto storeOp = dyn_cast(op)) { - mlir::Type valTy = storeOp.getValue().getType(); - if (isa(valTy) || isa(valTy)) - lowerToMemCpy(storeOp); - } else if (auto fnOp = dyn_cast(op)) { - if (auto globalCtor = fnOp.getGlobalCtorPriority()) { - globalCtorList.emplace_back(fnOp.getName(), globalCtor.value()); - } else if (auto globalDtor = fnOp.getGlobalDtorPriority()) { - globalDtorList.emplace_back(fnOp.getName(), globalDtor.value()); - } - if (auto attr = fnOp.getExtraAttrs().getElements().get( - CUDAKernelNameAttr::getMnemonic())) { - auto cudaBinaryAttr = dyn_cast(attr); - std::string kernelName = cudaBinaryAttr.getKernelName(); - cudaKernelMap[kernelName] = fnOp; + void LoweringPreparePass::runOnOp(Operation * op) { + if (auto unary = dyn_cast(op)) { + lowerUnaryOp(unary); + } else if (auto bin = dyn_cast(op)) { + lowerBinOp(bin); + } else if (auto cast = dyn_cast(op)) { + lowerCastOp(cast); + } else if (auto complexBin = dyn_cast(op)) { + lowerComplexBinOp(complexBin); + } else if (auto threeWayCmp = dyn_cast(op)) { + lowerThreeWayCmpOp(threeWayCmp); + } else if (auto vaArgOp = dyn_cast(op)) { + lowerVAArgOp(vaArgOp); + } else if (auto deleteArrayOp = dyn_cast(op)) { + lowerDeleteArrayOp(deleteArrayOp); + } else if (auto global = dyn_cast(op)) { + lowerGlobalOp(global); + if (auto attr = op->getAttr(cir::CUDAShadowNameAttr::getMnemonic())) { + auto shadowNameAttr = dyn_cast(attr); + std::string deviceSideName = shadowNameAttr.getDeviceSideName(); + cudaVarMap[deviceSideName] = global; + } + } else if (auto dynamicCast = dyn_cast(op)) { + lowerDynamicCastOp(dynamicCast); + } else if (auto stdFind = dyn_cast(op)) { + lowerStdFindOp(stdFind); + } else if (auto iterBegin = dyn_cast(op)) { + lowerIterBeginOp(iterBegin); + } else if (auto iterEnd = dyn_cast(op)) { + lowerIterEndOp(iterEnd); + } else if (auto arrayCtor = dyn_cast(op)) { + lowerArrayCtor(arrayCtor); + } else if (auto arrayDtor = dyn_cast(op)) { + lowerArrayDtor(arrayDtor); + } else if (auto storeOp = dyn_cast(op)) { + mlir::Type valTy = storeOp.getValue().getType(); + if (isa(valTy) || isa(valTy)) + lowerToMemCpy(storeOp); + } else if (auto fnOp = dyn_cast(op)) { + if (auto globalCtor = fnOp.getGlobalCtorPriority()) { + globalCtorList.emplace_back(fnOp.getName(), globalCtor.value()); + } else if (auto globalDtor = fnOp.getGlobalDtorPriority()) { + globalDtorList.emplace_back(fnOp.getName(), globalDtor.value()); + } + if (auto attr = fnOp.getExtraAttrs().getElements().get( + CUDAKernelNameAttr::getMnemonic())) { + auto cudaBinaryAttr = dyn_cast(attr); + std::string kernelName = cudaBinaryAttr.getKernelName(); + cudaKernelMap[kernelName] = fnOp; + } + if (std::optional annotations = fnOp.getAnnotations()) + addGlobalAnnotations(fnOp, annotations.value()); + } else if (auto throwOp = dyn_cast(op)) { + lowerThrowOp(throwOp); + } else if (auto callOp = dyn_cast(op)) { + lowerTrivialConstructorCall(callOp); } - if (std::optional annotations = fnOp.getAnnotations()) - addGlobalAnnotations(fnOp, annotations.value()); - } else if (auto throwOp = dyn_cast(op)) { - lowerThrowOp(throwOp); - } else if (auto callOp = dyn_cast(op)) { - lowerTrivialConstructorCall(callOp); } -} -void LoweringPreparePass::runOnOperation() { - assert(astCtx && "Missing ASTContext, please construct with the right ctor"); - auto *op = getOperation(); - if (isa<::mlir::ModuleOp>(op)) { - theModule = cast<::mlir::ModuleOp>(op); - datalayout.emplace(theModule); - } + void LoweringPreparePass::runOnOperation() { + assert(astCtx && + "Missing ASTContext, please construct with the right ctor"); + auto *op = getOperation(); + if (isa<::mlir::ModuleOp>(op)) { + theModule = cast<::mlir::ModuleOp>(op); + datalayout.emplace(theModule); + } - llvm::SmallVector opsToTransform; + llvm::SmallVector opsToTransform; - op->walk([&](Operation *op) { - if (isa(op)) - opsToTransform.push_back(op); - }); + op->walk([&](Operation *op) { + if (isa(op)) + opsToTransform.push_back(op); + }); - for (auto *o : opsToTransform) - runOnOp(o); + for (auto *o : opsToTransform) + runOnOp(o); - if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) { - buildCUDAModuleCtor(); - } + if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) { + buildCUDAModuleCtor(); + } - buildCXXGlobalInitFunc(); - buildGlobalCtorDtorList(); - buildGlobalAnnotationValues(); -} + buildCXXGlobalInitFunc(); + buildGlobalCtorDtorList(); + buildGlobalAnnotationValues(); + } -std::unique_ptr mlir::createLoweringPreparePass() { - return std::make_unique(); -} + std::unique_ptr mlir::createLoweringPreparePass() { + return std::make_unique(); + } -std::unique_ptr -mlir::createLoweringPreparePass(clang::ASTContext *astCtx) { - auto pass = std::make_unique(); - pass->setASTContext(astCtx); - return std::move(pass); -} + std::unique_ptr mlir::createLoweringPreparePass(clang::ASTContext * + astCtx) { + auto pass = std::make_unique(); + pass->setASTContext(astCtx); + return std::move(pass); + } diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu index 834e45204b77..661f916b0f16 100644 --- a/clang/test/CIR/CodeGen/CUDA/registration.cu +++ b/clang/test/CIR/CodeGen/CUDA/registration.cu @@ -50,6 +50,8 @@ __global__ void fn() {} +__device__ int a; + // CIR-HOST: cir.func internal private @__cuda_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) { // CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr // CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv" @@ -64,6 +66,16 @@ __global__ void fn() {} // CIR-HOST-SAME: %[[#DeviceFn]], // CIR-HOST-SAME: %[[#MinusOne]], // CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) +// CIR-HOST: %[[#T3:]] = cir.get_global @".stra0" +// CIR-HOST: %[[#Device:]] = cir.cast bitcast %7 +// CIR-HOST: %[[#T4:]] = cir.get_global @".stra1" +// CIR-HOST: %[[#Host:]] = cir.cast bitcast %9 +// CIR-HOST: %[[#Ext:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#Sz:]] = cir.const #cir.int<4> +// CIR-HOST: %[[#Const:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#Zero:]] = cir.const #cir.int<0> +// CIR-HOST: cir.call @__cudaRegisterVar(%arg0, %[[#Host]], %[[#Device]], %[[#Device]], +// CIR-HOST-SAME: %[[#Ext]], %[[#Sz]], %[[#Const]], %[[#Zero]]) // CIR-HOST: } // LLVM-HOST: define internal void @__cuda_register_globals(ptr %[[#LLVMFatbin:]]) { @@ -74,6 +86,9 @@ __global__ void fn() {} // LLVM-HOST-SAME: ptr @.str_Z2fnv, // LLVM-HOST-SAME: i32 -1, // LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM-HOST: call void @__cudaRegisterVar( +// LLVM-HOST-SAME: ptr %0, ptr @.stra1, ptr @.stra0, ptr @.stra0, +// LLVM-HOST-SAME: i32 0, i64 4, i32 0, i32 0) // LLVM-HOST: } // The content in const array should be the same as echoed above, From 497a5e8c7dea63a528ee85eaa182d30715d5060f Mon Sep 17 00:00:00 2001 From: koparasy Date: Wed, 12 Nov 2025 21:38:03 -0800 Subject: [PATCH 3/6] Fix hip register global vars --- .../Dialect/Transforms/LoweringPrepare.cpp | 835 +++++++++--------- clang/test/CIR/CodeGen/CUDA/registration.cu | 8 +- clang/test/CIR/CodeGen/HIP/registration.cpp | 46 +- 3 files changed, 462 insertions(+), 427 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index e87d939eef48..5b03b53e8da6 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -1410,482 +1410,475 @@ std::optional LoweringPreparePass::buildHIPModuleDtor() { cir::ReturnOp::create(builder, loc); } return dtor; +} + +void LoweringPreparePass::buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc) { + auto loc = theModule.getLoc(); + auto cudaPrefix = getCUDAPrefix(astCtx); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrTy = PointerType::get(voidTy); + auto voidPtrPtrTy = PointerType::get(voidPtrTy); + auto intTy = datalayout->getIntType(&getContext()); + auto charTy = datalayout->getCharType(&getContext()); + auto sizeTy = datalayout->getSizeType(&getContext()); - void LoweringPreparePass::buildCUDARegisterVars( - cir::CIRBaseBuilderTy & builder, FuncOp regGlobalFunc) { - auto loc = theModule.getLoc(); - auto cudaPrefix = getCUDAPrefix(astCtx); + // Extract the GPU binary handle argument. + mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); - auto voidTy = VoidType::get(&getContext()); - auto voidPtrTy = PointerType::get(voidTy); - auto voidPtrPtrTy = PointerType::get(voidPtrTy); - auto intTy = datalayout->getIntType(&getContext()); - auto charTy = datalayout->getCharType(&getContext()); - auto sizeTy = datalayout->getSizeType(&getContext()); + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(theModule.getBody()); - // Extract the GPU binary handle argument. - mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); + // Declare CUDA internal function: + // void __cudaRegisterVar( + // void **fatbinHandle, + // char *hostVarName, + // char *deviceVarName, + // const char *deviceVarName, + // int isExtern, size_t varSize, + // int isConstant, int zero + // ); + // Similar to the registration of global functions, OG does not care about + // pointer types. They will generate the same IR anyway. + + FuncOp cudaRegisterVar = buildRuntimeFunction( + globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterVar"), loc, + FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, + sizeTy, intTy, intTy}, + voidTy)); - cir::CIRBaseBuilderTy globalBuilder(getContext()); - globalBuilder.setInsertionPointToStart(theModule.getBody()); + unsigned int count = 0; + auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { + auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); - // Declare CUDA internal function: - // void __cudaRegisterVar( - // void **fatbinHandle, - // char *hostVarName, - // char *deviceVarName, - // const char *deviceVarName, - // int isExtern, size_t varSize, - // int isConstant, int zero - // ); - // Similar to the registration of global functions, OG does not care about - // pointer types. They will generate the same IR anyway. - - FuncOp cudaRegisterVar = buildRuntimeFunction( - globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterVar"), loc, - FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, - sizeTy, intTy, intTy}, - voidTy)); - - unsigned int count = 0; - auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { - auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); - - auto tmpString = GlobalOp::create( - globalBuilder, loc, (".str" + str + std::to_string(count++)).str(), - strType, /*isConstant=*/true, - /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); - - // We must make the string zero-terminated. - tmpString.setInitialValueAttr(ConstArrayAttr::get( - strType, StringAttr::get(&getContext(), str + "\0"))); - tmpString.setPrivate(); - return tmpString; - }; - - for (auto &[deviceSideName, global] : cudaVarMap) { - GlobalOp deviceNameStr = makeConstantString(deviceSideName); - mlir::Value deviceNameValue = builder.createBitcast( - builder.createGetGlobal(deviceNameStr), voidPtrTy); - - GlobalOp hostNameStr = makeConstantString(global.getName()); - mlir::Value hostNameValue = builder.createBitcast( - builder.createGetGlobal(hostNameStr), voidPtrTy); - - // Every device variable that has a shadow on host will not be extern. - // See CIRGenModule::emitGlobalVarDefinition. - auto isExtern = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); - llvm::TypeSize size = datalayout->getTypeSizeInBits(global.getSymType()); - auto varSize = ConstantOp::create( - builder, loc, IntAttr::get(sizeTy, size.getFixedValue() / 8)); - auto isConstant = ConstantOp::create( - builder, loc, IntAttr::get(intTy, global.getConstant())); - auto zero = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); - builder.createCallOp(loc, cudaRegisterVar, - {fatbinHandle, hostNameValue, deviceNameValue, - deviceNameValue, isExtern, varSize, isConstant, - zero}); - } + auto tmpString = GlobalOp::create( + globalBuilder, loc, (".str" + str + std::to_string(count++)).str(), + strType, /*isConstant=*/true, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + + // We must make the string zero-terminated. + tmpString.setInitialValueAttr(ConstArrayAttr::get( + strType, StringAttr::get(&getContext(), str + "\0"))); + tmpString.setPrivate(); + return tmpString; + }; + + for (auto &[deviceSideName, global] : cudaVarMap) { + GlobalOp varNameStr = makeConstantString(deviceSideName); + mlir::Value varNameValue = + builder.createBitcast(builder.createGetGlobal(varNameStr), voidPtrTy); + + auto globalVarValue = + builder.createBitcast(builder.createGetGlobal(global), voidPtrTy); + + // Every device variable that has a shadow on host will not be extern. + // See CIRGenModule::emitGlobalVarDefinition. + auto isExtern = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + llvm::TypeSize size = datalayout->getTypeSizeInBits(global.getSymType()); + auto varSize = ConstantOp::create( + builder, loc, IntAttr::get(sizeTy, size.getFixedValue() / 8)); + auto isConstant = ConstantOp::create( + builder, loc, IntAttr::get(intTy, global.getConstant())); + auto zero = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + builder.createCallOp(loc, cudaRegisterVar, + {fatbinHandle, globalVarValue, varNameValue, + varNameValue, isExtern, varSize, isConstant, zero}); } +} - std::optional LoweringPreparePass::buildCUDAModuleDtor() { - if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) - return {}; +std::optional LoweringPreparePass::buildCUDAModuleDtor() { + if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) + return {}; - std::string prefix = getCUDAPrefix(astCtx); + std::string prefix = getCUDAPrefix(astCtx); - auto voidTy = VoidType::get(&getContext()); - auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy)); + auto voidTy = VoidType::get(&getContext()); + auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy)); - auto loc = theModule.getLoc(); + auto loc = theModule.getLoc(); - cir::CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointToStart(theModule.getBody()); + cir::CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointToStart(theModule.getBody()); - // void __cudaUnregisterFatBinary(void ** handle); - std::string unregisterFuncName = - addUnderscoredPrefix(prefix, "UnregisterFatBinary"); - FuncOp unregisterFunc = - buildRuntimeFunction(builder, unregisterFuncName, loc, - FuncType::get({voidPtrPtrTy}, voidTy)); + // void __cudaUnregisterFatBinary(void ** handle); + std::string unregisterFuncName = + addUnderscoredPrefix(prefix, "UnregisterFatBinary"); + FuncOp unregisterFunc = buildRuntimeFunction( + builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy)); - // void __cuda_module_dtor(); - // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be - // put into globalDtorList. If it were a real dtor, then it would cause - // double free above CUDA 9.2. The way to use it is to manually call - // atexit() at end of module ctor. - std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor"); - FuncOp dtor = - buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy), - GlobalLinkageKind::InternalLinkage); - - builder.setInsertionPointToStart(dtor.addEntryBlock()); - - // For dtor, we only need to call: - // __cudaUnregisterFatBinary(__cuda_gpubin_handle); - - std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle"); - auto gpubinGlobal = cast(theModule.lookupSymbol(gpubinName)); - mlir::Value gpubinAddress = builder.createGetGlobal(gpubinGlobal); - mlir::Value gpubin = builder.createLoad(loc, gpubinAddress); - builder.createCallOp(loc, unregisterFunc, gpubin); - ReturnOp::create(builder, loc); - - return dtor; - } + // void __cuda_module_dtor(); + // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be + // put into globalDtorList. If it were a real dtor, then it would cause + // double free above CUDA 9.2. The way to use it is to manually call + // atexit() at end of module ctor. + std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor"); + FuncOp dtor = + buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy), + GlobalLinkageKind::InternalLinkage); - void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op); + builder.setInsertionPointToStart(dtor.addEntryBlock()); - assert(astCtx && "AST context is not available during lowering prepare"); - auto loweredValue = cxxABI->lowerDynamicCast(builder, *astCtx, op); + // For dtor, we only need to call: + // __cudaUnregisterFatBinary(__cuda_gpubin_handle); - op.replaceAllUsesWith(loweredValue); - op.erase(); - } + std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle"); + auto gpubinGlobal = cast(theModule.lookupSymbol(gpubinName)); + mlir::Value gpubinAddress = builder.createGetGlobal(gpubinGlobal); + mlir::Value gpubin = builder.createLoad(loc, gpubinAddress); + builder.createCallOp(loc, unregisterFunc, gpubin); + ReturnOp::create(builder, loc); - static void lowerArrayDtorCtorIntoLoop( - CIRBaseBuilderTy & builder, mlir::Operation * op, mlir::Type eltTy, - mlir::Value arrayAddr, uint64_t arrayLen, bool isCtor) { - // Generate loop to call into ctor/dtor for every element. - auto loc = op->getLoc(); - - // TODO: instead of fixed integer size, create alias for PtrDiffTy and unify - // with CIRGen stuff. - auto ptrDiffTy = - cir::IntType::get(builder.getContext(), 64, /*signed=*/false); - uint64_t endOffset = isCtor ? arrayLen : arrayLen - 1; - mlir::Value endOffsetVal = cir::ConstantOp::create( - builder, loc, ptrDiffTy, cir::IntAttr::get(ptrDiffTy, endOffset)); - - auto begin = cir::CastOp::create( - builder, loc, eltTy, cir::CastKind::array_to_ptrdecay, arrayAddr); - mlir::Value end = - cir::PtrStrideOp::create(builder, loc, eltTy, begin, endOffsetVal); - mlir::Value start = isCtor ? begin : end; - mlir::Value stop = isCtor ? end : begin; - - auto tmpAddr = builder.createAlloca( - loc, /*addr type*/ builder.getPointerTo(eltTy), - /*var type*/ eltTy, "__array_idx", clang::CharUnits::One()); - builder.createStore(loc, start, tmpAddr); - - auto loop = builder.createDoWhile( - loc, - /*condBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); - mlir::Type boolTy = cir::BoolType::get(b.getContext()); - auto cmp = cir::CmpOp::create( - builder, loc, boolTy, cir::CmpOpKind::ne, currentElement, stop); - builder.createCondition(cmp); - }, - /*bodyBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); - - CallOp ctorCall; - op->walk([&](CallOp c) { ctorCall = c; }); - assert(ctorCall && "expected ctor call"); - - cir::ConstantOp stride; - if (isCtor) - stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, - cir::IntAttr::get(ptrDiffTy, 1)); - else - stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, - cir::IntAttr::get(ptrDiffTy, -1)); - - ctorCall->moveBefore(stride); - ctorCall->setOperand(0, currentElement); - - // Advance pointer and store them to temporary variable - auto nextElement = cir::PtrStrideOp::create(builder, loc, eltTy, - currentElement, stride); - builder.createStore(loc, nextElement, tmpAddr); - builder.createYield(loc); - }); - - op->replaceAllUsesWith(loop); - op->erase(); - } + return dtor; +} - void LoweringPreparePass::lowerArrayDtor(ArrayDtor op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); +void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op); - auto eltTy = op->getRegion(0).getArgument(0).getType(); - auto arrayLen = - mlir::cast(op.getAddr().getType().getPointee()) - .getSize(); - lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, - false); - } + assert(astCtx && "AST context is not available during lowering prepare"); + auto loweredValue = cxxABI->lowerDynamicCast(builder, *astCtx, op); - static std::string getGlobalVarNameForConstString(cir::StoreOp op, - uint64_t &cnt) { - llvm::SmallString<64> finalName; - llvm::raw_svector_ostream Out(finalName); + op.replaceAllUsesWith(loweredValue); + op.erase(); +} - Out << "__const."; - if (auto fnOp = op->getParentOfType()) { - Out << fnOp.getSymNameAttr().getValue() << "."; - } else { - Out << "module."; - } +static void lowerArrayDtorCtorIntoLoop(CIRBaseBuilderTy &builder, + mlir::Operation *op, mlir::Type eltTy, + mlir::Value arrayAddr, uint64_t arrayLen, + bool isCtor) { + // Generate loop to call into ctor/dtor for every element. + auto loc = op->getLoc(); - auto allocaOp = op.getAddr().getDefiningOp(); - if (allocaOp && !allocaOp.getName().empty()) - Out << allocaOp.getName(); - else - Out << cnt++; - return finalName.c_str(); - } + // TODO: instead of fixed integer size, create alias for PtrDiffTy and unify + // with CIRGen stuff. + auto ptrDiffTy = + cir::IntType::get(builder.getContext(), 64, /*signed=*/false); + uint64_t endOffset = isCtor ? arrayLen : arrayLen - 1; + mlir::Value endOffsetVal = cir::ConstantOp::create( + builder, loc, ptrDiffTy, cir::IntAttr::get(ptrDiffTy, endOffset)); + + auto begin = cir::CastOp::create(builder, loc, eltTy, + cir::CastKind::array_to_ptrdecay, arrayAddr); + mlir::Value end = + cir::PtrStrideOp::create(builder, loc, eltTy, begin, endOffsetVal); + mlir::Value start = isCtor ? begin : end; + mlir::Value stop = isCtor ? end : begin; + + auto tmpAddr = builder.createAlloca( + loc, /*addr type*/ builder.getPointerTo(eltTy), + /*var type*/ eltTy, "__array_idx", clang::CharUnits::One()); + builder.createStore(loc, start, tmpAddr); + + auto loop = builder.createDoWhile( + loc, + /*condBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); + mlir::Type boolTy = cir::BoolType::get(b.getContext()); + auto cmp = cir::CmpOp::create(builder, loc, boolTy, cir::CmpOpKind::ne, + currentElement, stop); + builder.createCondition(cmp); + }, + /*bodyBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); + + CallOp ctorCall; + op->walk([&](CallOp c) { ctorCall = c; }); + assert(ctorCall && "expected ctor call"); + + cir::ConstantOp stride; + if (isCtor) + stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, + cir::IntAttr::get(ptrDiffTy, 1)); + else + stride = cir::ConstantOp::create(builder, loc, ptrDiffTy, + cir::IntAttr::get(ptrDiffTy, -1)); + + ctorCall->moveBefore(stride); + ctorCall->setOperand(0, currentElement); + + // Advance pointer and store them to temporary variable + auto nextElement = cir::PtrStrideOp::create(builder, loc, eltTy, + currentElement, stride); + builder.createStore(loc, nextElement, tmpAddr); + builder.createYield(loc); + }); + + op->replaceAllUsesWith(loop); + op->erase(); +} - void LoweringPreparePass::lowerToMemCpy(StoreOp op) { - // Now that basic filter is done, do more checks before proceding with the - // transformation. - auto cstOp = op.getValue().getDefiningOp(); - if (!cstOp) - return; +void LoweringPreparePass::lowerArrayDtor(ArrayDtor op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); - if (!isa(cstOp.getValue())) - return; - CIRBaseBuilderTy builder(getContext()); + auto eltTy = op->getRegion(0).getArgument(0).getType(); + auto arrayLen = + mlir::cast(op.getAddr().getType().getPointee()).getSize(); + lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, false); +} - // Create a global which is initialized with the attribute that is either a - // constant array or record. - assert(!cir::MissingFeatures::unnamedAddr() && "NYI"); - builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); - std::string globalName = - getGlobalVarNameForConstString(op, annonGlobalConstArrayCount); - cir::GlobalOp globalCst = buildRuntimeVariable( - builder, globalName, op.getLoc(), op.getValue().getType(), - cir::GlobalLinkageKind::PrivateLinkage); - globalCst.setInitialValueAttr(cstOp.getValue()); - globalCst.setConstant(true); - - // Transform the store into a cir.copy. - builder.setInsertionPointAfter(op.getOperation()); - cir::CopyOp memCpy = - builder.createCopy(op.getAddr(), builder.createGetGlobal(globalCst)); - op->replaceAllUsesWith(memCpy); - op->erase(); - if (cstOp->getResult(0).getUsers().empty()) - cstOp->erase(); +static std::string getGlobalVarNameForConstString(cir::StoreOp op, + uint64_t &cnt) { + llvm::SmallString<64> finalName; + llvm::raw_svector_ostream Out(finalName); + + Out << "__const."; + if (auto fnOp = op->getParentOfType()) { + Out << fnOp.getSymNameAttr().getValue() << "."; + } else { + Out << "module."; } - void LoweringPreparePass::lowerArrayCtor(ArrayCtor op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); + auto allocaOp = op.getAddr().getDefiningOp(); + if (allocaOp && !allocaOp.getName().empty()) + Out << allocaOp.getName(); + else + Out << cnt++; + return finalName.c_str(); +} - auto eltTy = op->getRegion(0).getArgument(0).getType(); - auto arrayLen = - mlir::cast(op.getAddr().getType().getPointee()) - .getSize(); - lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, - true); - } +void LoweringPreparePass::lowerToMemCpy(StoreOp op) { + // Now that basic filter is done, do more checks before proceding with the + // transformation. + auto cstOp = op.getValue().getDefiningOp(); + if (!cstOp) + return; - void LoweringPreparePass::lowerStdFindOp(StdFindOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createCallOp( - op.getLoc(), op.getOriginalFnAttr(), op.getType(), - mlir::ValueRange{op.getOperand(0), op.getOperand(1), op.getOperand(2)}); + if (!isa(cstOp.getValue())) + return; + CIRBaseBuilderTy builder(getContext()); - op.replaceAllUsesWith(call); - op.erase(); - } + // Create a global which is initialized with the attribute that is either a + // constant array or record. + assert(!cir::MissingFeatures::unnamedAddr() && "NYI"); + builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); + std::string globalName = + getGlobalVarNameForConstString(op, annonGlobalConstArrayCount); + cir::GlobalOp globalCst = buildRuntimeVariable( + builder, globalName, op.getLoc(), op.getValue().getType(), + cir::GlobalLinkageKind::PrivateLinkage); + globalCst.setInitialValueAttr(cstOp.getValue()); + globalCst.setConstant(true); + + // Transform the store into a cir.copy. + builder.setInsertionPointAfter(op.getOperation()); + cir::CopyOp memCpy = + builder.createCopy(op.getAddr(), builder.createGetGlobal(globalCst)); + op->replaceAllUsesWith(memCpy); + op->erase(); + if (cstOp->getResult(0).getUsers().empty()) + cstOp->erase(); +} - void LoweringPreparePass::lowerIterBeginOp(IterBeginOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), - op.getType(), op.getOperand()); +void LoweringPreparePass::lowerArrayCtor(ArrayCtor op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); - op.replaceAllUsesWith(call); - op.erase(); - } + auto eltTy = op->getRegion(0).getArgument(0).getType(); + auto arrayLen = + mlir::cast(op.getAddr().getType().getPointee()).getSize(); + lowerArrayDtorCtorIntoLoop(builder, op, eltTy, op.getAddr(), arrayLen, true); +} - void LoweringPreparePass::lowerIterEndOp(IterEndOp op) { - CIRBaseBuilderTy builder(getContext()); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), - op.getType(), op.getOperand()); +void LoweringPreparePass::lowerStdFindOp(StdFindOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createCallOp( + op.getLoc(), op.getOriginalFnAttr(), op.getType(), + mlir::ValueRange{op.getOperand(0), op.getOperand(1), op.getOperand(2)}); - op.replaceAllUsesWith(call); - op.erase(); - } + op.replaceAllUsesWith(call); + op.erase(); +} - void LoweringPreparePass::lowerThrowOp(ThrowOp op) { - CIRBaseBuilderTy builder(getContext()); +void LoweringPreparePass::lowerIterBeginOp(IterBeginOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), + op.getType(), op.getOperand()); - if (op.rethrows()) { - auto voidTy = cir::VoidType::get(builder.getContext()); - auto fnType = cir::FuncType::get({}, voidTy); - auto fnName = "__cxa_rethrow"; + op.replaceAllUsesWith(call); + op.erase(); +} - builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); - FuncOp f = buildRuntimeFunction(builder, fnName, op.getLoc(), fnType); +void LoweringPreparePass::lowerIterEndOp(IterEndOp op) { + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createCallOp(op.getLoc(), op.getOriginalFnAttr(), + op.getType(), op.getOperand()); - builder.setInsertionPointAfter(op.getOperation()); - auto call = builder.createTryCallOp(op.getLoc(), f, {}); + op.replaceAllUsesWith(call); + op.erase(); +} - op->replaceAllUsesWith(call); - op->erase(); - } - } +void LoweringPreparePass::lowerThrowOp(ThrowOp op) { + CIRBaseBuilderTy builder(getContext()); - void LoweringPreparePass::lowerTrivialConstructorCall(cir::CallOp op) { - FuncOp funcOp = getCalledFunction(op); - if (!funcOp) - return; - Attribute astAttr = funcOp.getAstAttr(); - if (!astAttr) - return; - auto ctorDecl = dyn_cast(astAttr); - if (!ctorDecl) - return; - if (ctorDecl.isDefaultConstructor()) - return; + if (op.rethrows()) { + auto voidTy = cir::VoidType::get(builder.getContext()); + auto fnType = cir::FuncType::get({}, voidTy); + auto fnName = "__cxa_rethrow"; - if (ctorDecl.isCopyConstructor()) { - // Additional safety checks: constructor calls should have no return value - if (op.getNumResults() > 0) - return; - auto operands = op.getOperands(); - if (operands.size() != 2) - return; - // Replace the trivial copy constructor call with a copy op - CIRBaseBuilderTy builder(getContext()); - mlir::Value dest = operands[0]; - mlir::Value src = operands[1]; - builder.setInsertionPoint(op); - builder.createCopy(dest, src); - op.erase(); - } - } + builder.setInsertionPointToStart(&theModule.getBodyRegion().front()); + FuncOp f = buildRuntimeFunction(builder, fnName, op.getLoc(), fnType); - void LoweringPreparePass::addGlobalAnnotations(mlir::Operation * op, - mlir::ArrayAttr annotations) { - auto globalValue = cast(op); - mlir::StringAttr globalValueName = globalValue.getNameAttr(); - for (auto &annot : annotations) { - llvm::SmallVector entryArray = {globalValueName, - annot}; - globalAnnotations.push_back( - mlir::ArrayAttr::get(theModule.getContext(), entryArray)); - } + builder.setInsertionPointAfter(op.getOperation()); + auto call = builder.createTryCallOp(op.getLoc(), f, {}); + + op->replaceAllUsesWith(call); + op->erase(); } +} - void LoweringPreparePass::buildGlobalAnnotationValues() { - if (globalAnnotations.empty()) +void LoweringPreparePass::lowerTrivialConstructorCall(cir::CallOp op) { + FuncOp funcOp = getCalledFunction(op); + if (!funcOp) + return; + Attribute astAttr = funcOp.getAstAttr(); + if (!astAttr) + return; + auto ctorDecl = dyn_cast(astAttr); + if (!ctorDecl) + return; + if (ctorDecl.isDefaultConstructor()) + return; + + if (ctorDecl.isCopyConstructor()) { + // Additional safety checks: constructor calls should have no return value + if (op.getNumResults() > 0) return; - mlir::ArrayAttr annotationValueArray = - mlir::ArrayAttr::get(theModule.getContext(), globalAnnotations); - theModule->setAttr( - cir::CIRDialect::getGlobalAnnotationsAttrName(), - cir::GlobalAnnotationValuesAttr::get(annotationValueArray)); + auto operands = op.getOperands(); + if (operands.size() != 2) + return; + // Replace the trivial copy constructor call with a copy op + CIRBaseBuilderTy builder(getContext()); + mlir::Value dest = operands[0]; + mlir::Value src = operands[1]; + builder.setInsertionPoint(op); + builder.createCopy(dest, src); + op.erase(); } +} - void LoweringPreparePass::runOnOp(Operation * op) { - if (auto unary = dyn_cast(op)) { - lowerUnaryOp(unary); - } else if (auto bin = dyn_cast(op)) { - lowerBinOp(bin); - } else if (auto cast = dyn_cast(op)) { - lowerCastOp(cast); - } else if (auto complexBin = dyn_cast(op)) { - lowerComplexBinOp(complexBin); - } else if (auto threeWayCmp = dyn_cast(op)) { - lowerThreeWayCmpOp(threeWayCmp); - } else if (auto vaArgOp = dyn_cast(op)) { - lowerVAArgOp(vaArgOp); - } else if (auto deleteArrayOp = dyn_cast(op)) { - lowerDeleteArrayOp(deleteArrayOp); - } else if (auto global = dyn_cast(op)) { - lowerGlobalOp(global); - if (auto attr = op->getAttr(cir::CUDAShadowNameAttr::getMnemonic())) { - auto shadowNameAttr = dyn_cast(attr); - std::string deviceSideName = shadowNameAttr.getDeviceSideName(); - cudaVarMap[deviceSideName] = global; - } - } else if (auto dynamicCast = dyn_cast(op)) { - lowerDynamicCastOp(dynamicCast); - } else if (auto stdFind = dyn_cast(op)) { - lowerStdFindOp(stdFind); - } else if (auto iterBegin = dyn_cast(op)) { - lowerIterBeginOp(iterBegin); - } else if (auto iterEnd = dyn_cast(op)) { - lowerIterEndOp(iterEnd); - } else if (auto arrayCtor = dyn_cast(op)) { - lowerArrayCtor(arrayCtor); - } else if (auto arrayDtor = dyn_cast(op)) { - lowerArrayDtor(arrayDtor); - } else if (auto storeOp = dyn_cast(op)) { - mlir::Type valTy = storeOp.getValue().getType(); - if (isa(valTy) || isa(valTy)) - lowerToMemCpy(storeOp); - } else if (auto fnOp = dyn_cast(op)) { - if (auto globalCtor = fnOp.getGlobalCtorPriority()) { - globalCtorList.emplace_back(fnOp.getName(), globalCtor.value()); - } else if (auto globalDtor = fnOp.getGlobalDtorPriority()) { - globalDtorList.emplace_back(fnOp.getName(), globalDtor.value()); - } - if (auto attr = fnOp.getExtraAttrs().getElements().get( - CUDAKernelNameAttr::getMnemonic())) { - auto cudaBinaryAttr = dyn_cast(attr); - std::string kernelName = cudaBinaryAttr.getKernelName(); - cudaKernelMap[kernelName] = fnOp; - } - if (std::optional annotations = fnOp.getAnnotations()) - addGlobalAnnotations(fnOp, annotations.value()); - } else if (auto throwOp = dyn_cast(op)) { - lowerThrowOp(throwOp); - } else if (auto callOp = dyn_cast(op)) { - lowerTrivialConstructorCall(callOp); - } +void LoweringPreparePass::addGlobalAnnotations(mlir::Operation *op, + mlir::ArrayAttr annotations) { + auto globalValue = cast(op); + mlir::StringAttr globalValueName = globalValue.getNameAttr(); + for (auto &annot : annotations) { + llvm::SmallVector entryArray = {globalValueName, annot}; + globalAnnotations.push_back( + mlir::ArrayAttr::get(theModule.getContext(), entryArray)); } +} + +void LoweringPreparePass::buildGlobalAnnotationValues() { + if (globalAnnotations.empty()) + return; + mlir::ArrayAttr annotationValueArray = + mlir::ArrayAttr::get(theModule.getContext(), globalAnnotations); + theModule->setAttr( + cir::CIRDialect::getGlobalAnnotationsAttrName(), + cir::GlobalAnnotationValuesAttr::get(annotationValueArray)); +} - void LoweringPreparePass::runOnOperation() { - assert(astCtx && - "Missing ASTContext, please construct with the right ctor"); - auto *op = getOperation(); - if (isa<::mlir::ModuleOp>(op)) { - theModule = cast<::mlir::ModuleOp>(op); - datalayout.emplace(theModule); +void LoweringPreparePass::runOnOp(Operation *op) { + if (auto unary = dyn_cast(op)) { + lowerUnaryOp(unary); + } else if (auto bin = dyn_cast(op)) { + lowerBinOp(bin); + } else if (auto cast = dyn_cast(op)) { + lowerCastOp(cast); + } else if (auto complexBin = dyn_cast(op)) { + lowerComplexBinOp(complexBin); + } else if (auto threeWayCmp = dyn_cast(op)) { + lowerThreeWayCmpOp(threeWayCmp); + } else if (auto vaArgOp = dyn_cast(op)) { + lowerVAArgOp(vaArgOp); + } else if (auto deleteArrayOp = dyn_cast(op)) { + lowerDeleteArrayOp(deleteArrayOp); + } else if (auto global = dyn_cast(op)) { + lowerGlobalOp(global); + if (auto attr = op->getAttr(cir::CUDAShadowNameAttr::getMnemonic())) { + auto shadowNameAttr = dyn_cast(attr); + std::string deviceSideName = shadowNameAttr.getDeviceSideName(); + cudaVarMap[deviceSideName] = global; + } + } else if (auto dynamicCast = dyn_cast(op)) { + lowerDynamicCastOp(dynamicCast); + } else if (auto stdFind = dyn_cast(op)) { + lowerStdFindOp(stdFind); + } else if (auto iterBegin = dyn_cast(op)) { + lowerIterBeginOp(iterBegin); + } else if (auto iterEnd = dyn_cast(op)) { + lowerIterEndOp(iterEnd); + } else if (auto arrayCtor = dyn_cast(op)) { + lowerArrayCtor(arrayCtor); + } else if (auto arrayDtor = dyn_cast(op)) { + lowerArrayDtor(arrayDtor); + } else if (auto storeOp = dyn_cast(op)) { + mlir::Type valTy = storeOp.getValue().getType(); + if (isa(valTy) || isa(valTy)) + lowerToMemCpy(storeOp); + } else if (auto fnOp = dyn_cast(op)) { + if (auto globalCtor = fnOp.getGlobalCtorPriority()) { + globalCtorList.emplace_back(fnOp.getName(), globalCtor.value()); + } else if (auto globalDtor = fnOp.getGlobalDtorPriority()) { + globalDtorList.emplace_back(fnOp.getName(), globalDtor.value()); } + if (auto attr = fnOp.getExtraAttrs().getElements().get( + CUDAKernelNameAttr::getMnemonic())) { + auto cudaBinaryAttr = dyn_cast(attr); + std::string kernelName = cudaBinaryAttr.getKernelName(); + cudaKernelMap[kernelName] = fnOp; + } + if (std::optional annotations = fnOp.getAnnotations()) + addGlobalAnnotations(fnOp, annotations.value()); + } else if (auto throwOp = dyn_cast(op)) { + lowerThrowOp(throwOp); + } else if (auto callOp = dyn_cast(op)) { + lowerTrivialConstructorCall(callOp); + } +} - llvm::SmallVector opsToTransform; +void LoweringPreparePass::runOnOperation() { + assert(astCtx && "Missing ASTContext, please construct with the right ctor"); + auto *op = getOperation(); + if (isa<::mlir::ModuleOp>(op)) { + theModule = cast<::mlir::ModuleOp>(op); + datalayout.emplace(theModule); + } - op->walk([&](Operation *op) { - if (isa(op)) - opsToTransform.push_back(op); - }); + llvm::SmallVector opsToTransform; - for (auto *o : opsToTransform) - runOnOp(o); + op->walk([&](Operation *op) { + if (isa(op)) + opsToTransform.push_back(op); + }); - if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) { - buildCUDAModuleCtor(); - } + for (auto *o : opsToTransform) + runOnOp(o); - buildCXXGlobalInitFunc(); - buildGlobalCtorDtorList(); - buildGlobalAnnotationValues(); + if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) { + buildCUDAModuleCtor(); } - std::unique_ptr mlir::createLoweringPreparePass() { - return std::make_unique(); - } + buildCXXGlobalInitFunc(); + buildGlobalCtorDtorList(); + buildGlobalAnnotationValues(); +} - std::unique_ptr mlir::createLoweringPreparePass(clang::ASTContext * - astCtx) { - auto pass = std::make_unique(); - pass->setASTContext(astCtx); - return std::move(pass); - } +std::unique_ptr mlir::createLoweringPreparePass() { + return std::make_unique(); +} + +std::unique_ptr +mlir::createLoweringPreparePass(clang::ASTContext *astCtx) { + auto pass = std::make_unique(); + pass->setASTContext(astCtx); + return std::move(pass); +} diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu index 661f916b0f16..04c71ca0f84e 100644 --- a/clang/test/CIR/CodeGen/CUDA/registration.cu +++ b/clang/test/CIR/CodeGen/CUDA/registration.cu @@ -67,9 +67,9 @@ __device__ int a; // CIR-HOST-SAME: %[[#MinusOne]], // CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) // CIR-HOST: %[[#T3:]] = cir.get_global @".stra0" -// CIR-HOST: %[[#Device:]] = cir.cast bitcast %7 -// CIR-HOST: %[[#T4:]] = cir.get_global @".stra1" -// CIR-HOST: %[[#Host:]] = cir.cast bitcast %9 +// CIR-HOST: %[[#Device:]] = cir.cast bitcast %[[#T3]] +// CIR-HOST: %[[#T4:]] = cir.get_global @a +// CIR-HOST: %[[#Host:]] = cir.cast bitcast %[[#T4]] // CIR-HOST: %[[#Ext:]] = cir.const #cir.int<0> // CIR-HOST: %[[#Sz:]] = cir.const #cir.int<4> // CIR-HOST: %[[#Const:]] = cir.const #cir.int<0> @@ -87,7 +87,7 @@ __device__ int a; // LLVM-HOST-SAME: i32 -1, // LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) // LLVM-HOST: call void @__cudaRegisterVar( -// LLVM-HOST-SAME: ptr %0, ptr @.stra1, ptr @.stra0, ptr @.stra0, +// LLVM-HOST-SAME: ptr %0, ptr @a, ptr @.stra0, ptr @.stra0, // LLVM-HOST-SAME: i32 0, i64 4, i32 0, i32 0) // LLVM-HOST: } diff --git a/clang/test/CIR/CodeGen/HIP/registration.cpp b/clang/test/CIR/CodeGen/HIP/registration.cpp index a8294e972909..815fae6afbfa 100644 --- a/clang/test/CIR/CodeGen/HIP/registration.cpp +++ b/clang/test/CIR/CodeGen/HIP/registration.cpp @@ -26,12 +26,14 @@ // CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__hip_module_ctor", {{[0-9]+}}>] // CIR-HOST: } +// LLVM-HOST: @.stra0 = private constant [2 x i8] c"a\00" // LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00" // LLVM-HOST: @__hip_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin" // LLVM-HOST: @__hip_fatbin_wrapper = internal constant { // LLVM-HOST: i32 1212764230, i32 1, ptr @__hip_fatbin_str, ptr null // LLVM-HOST: }, section ".hipFatBinSegment" // LLVM-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8 +// LLVM-HOST: @a = global i32 undef, align 4 // LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__hip_module_ctor // CIR-HOST: cir.func internal private @__hip_module_dtor() { @@ -66,6 +68,9 @@ __global__ void fn() {} + +__device__ int a; + // CIR-HOST: cir.func internal private @__hip_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) { // CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr // CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv" @@ -80,6 +85,23 @@ __global__ void fn() {} // CIR-HOST-SAME: %[[#DeviceFn]], // CIR-HOST-SAME: %[[#MinusOne]], // CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) +// CIR-HOST: %[[#GVARNAME:]] = cir.get_global @".stra0" +// CIR-HOST: %[[#GVARNAMEPTR:]] = cir.cast bitcast %[[#GVARNAME]] +// CIR-HOST: %[[#GVAR:]] = cir.get_global @a +// CIR-HOST: %[[#GVARPTR:]] = cir.cast bitcast %[[#GVAR]] +// CIR-HOST: %[[#ZERO:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#FOUR:]] = cir.const #cir.int<4> +// CIR-HOST: %[[#ZERON:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#ZERONN:]] = cir.const #cir.int<0> +// CIR-HOST: cir.call @__hipRegisterVar(%[[FatbinHandle]], +// CIR-HOST-SAME: %[[#GVARPTR]], +// CIR-HOST-SAME: %[[#GVARNAMEPTR]], +// CIR-HOST-SAME: %[[#GVARNAMEPTR]], +// CIR-HOST-SAME: %[[#ZERO]], +// CIR-HOST-SAME: %[[#FOUR:]], +// CIR-HOST-SAME: %[[#ZERON]], +// CIR-HOST-SAME: %[[#ZERONN]]) +// CIR-HOST: cir.return loc(#loc) // CIR-HOST: } // LLVM-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) { @@ -90,6 +112,15 @@ __global__ void fn() {} // LLVM-HOST-SAME: ptr @.str_Z2fnv, // LLVM-HOST-SAME: i32 -1, // LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM-HOST: call void @__hipRegisterVar( +// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]], +// LLVM-HOST-SAME: ptr @a, +// LLVM-HOST-SAME: ptr @.stra0, +// LLVM-HOST-SAME: ptr @.stra0, +// LLVM-HOST-SAME: i32 0, +// LLVM-HOST-SAME: i64 4, +// LLVM-HOST-SAME: i32 0, +// LLVM-HOST-SAME: i32 0) // LLVM-HOST: } // The content in const array should be the same as echoed above, @@ -145,9 +176,11 @@ __global__ void fn() {} // LLVM-HOST: ret void // OGCG-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8 +// OGCG-HOST: @a = internal global i32 undef, align 4 // OGCG-HOST: @0 = private unnamed_addr constant [7 x i8] c"_Z2fnv\00", align 1 -// OGCG-HOST: @1 = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin", align 4096 -// OGCG-HOST: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @1, ptr null }, section ".hipFatBinSegment", align 8 +// OGCG-HOST: @1 = private unnamed_addr constant [2 x i8] c"a\00", align 1 +// OGCG-HOST: @2 = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin", align 4096 +// OGCG-HOST: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @2, ptr null }, section ".hipFatBinSegment", align 8 // OGCG-HOST: @__hip_gpubin_handle = internal global ptr null, align 8 // OGCG-HOST: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__hip_module_ctor, ptr null }] @@ -160,6 +193,15 @@ __global__ void fn() {} // OGCG-HOST-SAME: ptr @0, // OGCG-HOST-SAME: i32 -1, // OGCG-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// OGCG-HOST: call void @__hipRegisterVar( +// OGCG-HOST-SAME: ptr %[[#LLVMFatbin]], +// OGCG-HOST-SAME: ptr @a, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: i32 0, +// OGCG-HOST-SAME: i64 4, +// OGCG-HOST-SAME: i32 0, i32 0) +// OGCG-HOST: ret void // OGCG-HOST: } // OGCG-HOST: define internal void @__hip_module_ctor() { From 76693e50dca4e2523ed7bd39ce6f016f57a2ba57 Mon Sep 17 00:00:00 2001 From: koparasy Date: Fri, 14 Nov 2025 17:59:24 -0800 Subject: [PATCH 4/6] Add tests --- clang/test/CIR/CodeGen/CUDA/registration.cu | 55 +++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu index 04c71ca0f84e..ea0e82873fad 100644 --- a/clang/test/CIR/CodeGen/CUDA/registration.cu +++ b/clang/test/CIR/CodeGen/CUDA/registration.cu @@ -13,6 +13,13 @@ // RUN: %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: -fcuda-include-gpubinary %t.fatbin \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s + + // CIR-HOST: module @"{{.*}}" attributes { // CIR-HOST: cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>, // CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>] @@ -125,3 +132,51 @@ __device__ int a; // LLVM-HOST: call void @__cudaRegisterFatBinaryEnd // LLVM-HOST: call i32 @atexit(ptr @__cuda_module_dtor) // LLVM-HOST: } + +// OGCG-HOST: @a = internal global i32 undef, align 4 +// OGCG-HOST: @0 = private unnamed_addr constant [7 x i8] c"_Z2fnv\00", align 1 +// OGCG-HOST: @1 = private unnamed_addr constant [2 x i8] c"a\00", align 1 +// OGCG-HOST: @2 = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin", align 8 +// OGCG-HOST: @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1180844977, i32 1, ptr @2, ptr null }, section ".nvFatBinSegment", align 8 +// OGCG-HOST: @__cuda_gpubin_handle = internal global ptr null, align 8 +// OGCG-HOST: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__cuda_module_ctor, ptr null }] + +// OGCG-HOST: define internal void @__cuda_register_globals(ptr %[[#HANDLE:]]) { +// OGCG-HOST: entry: +// OGCG-HOST: %1 = call i32 @__cudaRegisterFunction(ptr %[[#HANDLE]], +// OGCG-HOST-SAME: ptr @_Z17__device_stub__fnv, +// OGCG-HOST-SAME: ptr @0, +// OGCG-HOST-SAME: ptr @0, +// OGCG-HOST-SAME: i32 -1, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null) +// OGCG-HOST: call void @__cudaRegisterVar(ptr %[[#HANDLE]], +// OGCG-HOST-SAME: ptr @a, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: i32 0, +// OGCG-HOST-SAME: i64 4, +// OGCG-HOST-SAME: i32 0, +// OGCG-HOST-SAME: i32 0) +// OGCG-HOST: ret void +// OGCG-HOST: } + +// OGCG-HOST: define internal void @__cuda_module_ctor() { +// OGCG-HOST: entry: +// OGCG-HOST: %[[#WRAPADDR:]] = call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper) +// OGCG-HOST: store ptr %[[#WRAPADDR]], ptr @__cuda_gpubin_handle, align 8 +// OGCG-HOST: call void @__cuda_register_globals(ptr %[[#WRAPADDR]]) +// OGCG-HOST: call void @__cudaRegisterFatBinaryEnd(ptr %[[#WRAPADDR]]) +// OGCG-HOST: %1 = call i32 @atexit(ptr @__cuda_module_dtor) +// OGCG-HOST: ret void +// OGCG-HOST: } + +// OGCG-HOST: define internal void @__cuda_module_dtor() { +// OGCG-HOST: entry: +// OGCG-HOST: %[[#HANDLE:]] = load ptr, ptr @__cuda_gpubin_handle, align 8 +// OGCG-HOST: call void @__cudaUnregisterFatBinary(ptr %[[#HANDLE]]) +// OGCG-HOST: ret void +// OGCG-HOST: } From a3cec1ef01194b810b3b92e4843fa1c0280d59a8 Mon Sep 17 00:00:00 2001 From: koparasy Date: Mon, 17 Nov 2025 10:46:07 -0800 Subject: [PATCH 5/6] ci: trigger pipeline From 8f2dc72d0c960e8e948abf59710c8c4efb5408a1 Mon Sep 17 00:00:00 2001 From: koparasy Date: Mon, 17 Nov 2025 10:50:20 -0800 Subject: [PATCH 6/6] ci: trigger pipeline