diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h index e6ad589f3a57..cc0c398c329a 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h +++ b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h @@ -129,6 +129,10 @@ class CIRDataLayout { mlir::Type getCharType(mlir::MLIRContext *ctx) const { return typeSizeInfo.getCharType(ctx); } + + mlir::Type getSizeType(mlir::MLIRContext *ctx) const { + return typeSizeInfo.getSizeType(ctx); + } }; /// Used to lazily calculate structure layout information for a target machine, diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index f9b9da07be3b..5b03b53e8da6 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -127,6 +127,8 @@ struct LoweringPreparePass : public LoweringPrepareBase { // Maps CUDA kernel name to device stub function. llvm::StringMap cudaKernelMap; + // Maps CUDA device-side variable name to host-side (shadow) GlobalOp. + llvm::StringMap cudaVarMap; void buildCUDAModuleCtor(); std::optional buildCUDAModuleDtor(); @@ -135,6 +137,8 @@ struct LoweringPreparePass : public LoweringPrepareBase { void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc); + void buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc); /// /// AST related @@ -1261,8 +1265,7 @@ std::optional LoweringPreparePass::buildCUDARegisterGlobals() { builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock()); buildCUDARegisterGlobalFunctions(builder, regGlobalFunc); - - // TODO(cir): registration for global variables. + buildCUDARegisterVars(builder, regGlobalFunc); ReturnOp::create(builder, loc); return regGlobalFunc; @@ -1409,6 +1412,81 @@ std::optional LoweringPreparePass::buildHIPModuleDtor() { return dtor; } +void LoweringPreparePass::buildCUDARegisterVars(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc) { + auto loc = theModule.getLoc(); + auto cudaPrefix = getCUDAPrefix(astCtx); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrTy = PointerType::get(voidTy); + auto voidPtrPtrTy = PointerType::get(voidPtrTy); + auto intTy = datalayout->getIntType(&getContext()); + auto charTy = datalayout->getCharType(&getContext()); + auto sizeTy = datalayout->getSizeType(&getContext()); + + // Extract the GPU binary handle argument. + mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); + + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(theModule.getBody()); + + // Declare CUDA internal function: + // void __cudaRegisterVar( + // void **fatbinHandle, + // char *hostVarName, + // char *deviceVarName, + // const char *deviceVarName, + // int isExtern, size_t varSize, + // int isConstant, int zero + // ); + // Similar to the registration of global functions, OG does not care about + // pointer types. They will generate the same IR anyway. + + FuncOp cudaRegisterVar = buildRuntimeFunction( + globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterVar"), loc, + FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, + sizeTy, intTy, intTy}, + voidTy)); + + unsigned int count = 0; + auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { + auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); + + auto tmpString = GlobalOp::create( + globalBuilder, loc, (".str" + str + std::to_string(count++)).str(), + strType, /*isConstant=*/true, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + + // We must make the string zero-terminated. + tmpString.setInitialValueAttr(ConstArrayAttr::get( + strType, StringAttr::get(&getContext(), str + "\0"))); + tmpString.setPrivate(); + return tmpString; + }; + + for (auto &[deviceSideName, global] : cudaVarMap) { + GlobalOp varNameStr = makeConstantString(deviceSideName); + mlir::Value varNameValue = + builder.createBitcast(builder.createGetGlobal(varNameStr), voidPtrTy); + + auto globalVarValue = + builder.createBitcast(builder.createGetGlobal(global), voidPtrTy); + + // Every device variable that has a shadow on host will not be extern. + // See CIRGenModule::emitGlobalVarDefinition. + auto isExtern = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + llvm::TypeSize size = datalayout->getTypeSizeInBits(global.getSymType()); + auto varSize = ConstantOp::create( + builder, loc, IntAttr::get(sizeTy, size.getFixedValue() / 8)); + auto isConstant = ConstantOp::create( + builder, loc, IntAttr::get(intTy, global.getConstant())); + auto zero = ConstantOp::create(builder, loc, IntAttr::get(intTy, 0)); + builder.createCallOp(loc, cudaRegisterVar, + {fatbinHandle, globalVarValue, varNameValue, + varNameValue, isExtern, varSize, isConstant, zero}); + } +} + std::optional LoweringPreparePass::buildCUDAModuleDtor() { if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) return {}; @@ -1431,9 +1509,9 @@ std::optional LoweringPreparePass::buildCUDAModuleDtor() { // void __cuda_module_dtor(); // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be - // put into globalDtorList. If it were a real dtor, then it would cause double - // free above CUDA 9.2. The way to use it is to manually call atexit() at end - // of module ctor. + // put into globalDtorList. If it were a real dtor, then it would cause + // double free above CUDA 9.2. The way to use it is to manually call + // atexit() at end of module ctor. std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor"); FuncOp dtor = buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy), @@ -1721,8 +1799,13 @@ void LoweringPreparePass::runOnOp(Operation *op) { lowerVAArgOp(vaArgOp); } else if (auto deleteArrayOp = dyn_cast(op)) { lowerDeleteArrayOp(deleteArrayOp); - } else if (auto getGlobal = dyn_cast(op)) { - lowerGlobalOp(getGlobal); + } else if (auto global = dyn_cast(op)) { + lowerGlobalOp(global); + if (auto attr = op->getAttr(cir::CUDAShadowNameAttr::getMnemonic())) { + auto shadowNameAttr = dyn_cast(attr); + std::string deviceSideName = shadowNameAttr.getDeviceSideName(); + cudaVarMap[deviceSideName] = global; + } } else if (auto dynamicCast = dyn_cast(op)) { lowerDynamicCastOp(dynamicCast); } else if (auto stdFind = dyn_cast(op)) { diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp index 8b1a2512613d..05cf7b853da8 100644 --- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp +++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp @@ -35,6 +35,7 @@ #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/MultiplexConsumer.h" #include "clang/Lex/Preprocessor.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -47,11 +48,10 @@ #include "llvm/LTO/LTOBackend.h" #include "llvm/Linker/Linker.h" #include "llvm/Pass.h" -#include "llvm/ADT/SmallString.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/Path.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/Timer.h" #include "llvm/Support/ToolOutputFile.h" @@ -112,12 +112,12 @@ class CIRGenConsumer : public clang::ASTConsumer { CIRGenAction::OutputType Action; - CompilerInstance &CompilerInstance; - DiagnosticsEngine &DiagnosticsEngine; - [[maybe_unused]] const HeaderSearchOptions &HeaderSearchOptions; - CodeGenOptions &CodeGenOptions; - [[maybe_unused]] const TargetOptions &TargetOptions; - [[maybe_unused]] const LangOptions &LangOptions; + CompilerInstance &CI; + DiagnosticsEngine &Diags; + [[maybe_unused]] const HeaderSearchOptions &HeaderSearchOpts; + CodeGenOptions &CodeGenOpts; + [[maybe_unused]] const TargetOptions &TargetOpts; + [[maybe_unused]] const LangOptions &LangOpts; const FrontendOptions &FeOptions; std::string InputFileName; @@ -128,25 +128,21 @@ class CIRGenConsumer : public clang::ASTConsumer { std::unique_ptr Gen; public: - CIRGenConsumer(CIRGenAction::OutputType Action, - class CompilerInstance &CompilerInstance, - class DiagnosticsEngine &DiagnosticsEngine, + CIRGenConsumer(CIRGenAction::OutputType Action, class CompilerInstance &CI, + class DiagnosticsEngine &Diags, IntrusiveRefCntPtr VFS, - const class HeaderSearchOptions &HeaderSearchOptions, - class CodeGenOptions &CodeGenOptions, - const class TargetOptions &TargetOptions, - const class LangOptions &LangOptions, + const class HeaderSearchOptions &HeaderSearchOpts, + class CodeGenOptions &CodeGenOpts, + const class TargetOptions &TargetOpts, + const class LangOptions &LangOpts, const FrontendOptions &FeOptions, StringRef InputFile, std::unique_ptr Os) - : Action(Action), CompilerInstance(CompilerInstance), - DiagnosticsEngine(DiagnosticsEngine), - HeaderSearchOptions(HeaderSearchOptions), - CodeGenOptions(CodeGenOptions), TargetOptions(TargetOptions), - LangOptions(LangOptions), FeOptions(FeOptions), - InputFileName(InputFile.str()), - OutputStream(std::move(Os)), FS(VFS), - Gen(std::make_unique(DiagnosticsEngine, std::move(VFS), - CodeGenOptions)) {} + : Action(Action), CI(CI), Diags(Diags), + HeaderSearchOpts(HeaderSearchOpts), CodeGenOpts(CodeGenOpts), + TargetOpts(TargetOpts), LangOpts(LangOpts), FeOptions(FeOptions), + InputFileName(InputFile.str()), OutputStream(std::move(Os)), FS(VFS), + Gen(std::make_unique(Diags, std::move(VFS), + CodeGenOpts)) {} void Initialize(ASTContext &Ctx) override { assert(!AstContext && "initialized multiple times"); @@ -221,12 +217,12 @@ class CIRGenConsumer : public clang::ASTConsumer { FeOptions.ClangIRLifetimeCheck, LifetimeOpts, FeOptions.ClangIRIdiomRecognizer, IdiomRecognizerOpts, FeOptions.ClangIRLibOpt, LibOptOpts, PassOptParsingFailure, - CodeGenOptions.OptimizationLevel > 0, FlattenCir, + CodeGenOpts.OptimizationLevel > 0, FlattenCir, !FeOptions.ClangIRDirectLowering, EnableCcLowering, FeOptions.ClangIREnableMem2Reg) .failed()) { if (!PassOptParsingFailure.empty()) { - auto D = DiagnosticsEngine.Report(diag::err_drv_cir_pass_opt_parsing); + auto D = Diags.Report(diag::err_drv_cir_pass_opt_parsing); D << PassOptParsingFailure; } else llvm::report_fatal_error("CIR codegen: MLIR pass manager fails " @@ -269,24 +265,21 @@ class CIRGenConsumer : public clang::ASTConsumer { } } - bool EmitCIR = LangOptions.EmitCIRToFile || FeOptions.EmitClangIRFile || - !LangOptions.CIRFile.empty() || - !FeOptions.ClangIRFile.empty(); + bool EmitCIR = LangOpts.EmitCIRToFile || FeOptions.EmitClangIRFile || + !LangOpts.CIRFile.empty() || !FeOptions.ClangIRFile.empty(); if (EmitCIR) { std::unique_ptr CIRStream; llvm::SmallString<128> DefaultPath; if (!FeOptions.ClangIRFile.empty()) { - CIRStream = CompilerInstance.createOutputFile( - FeOptions.ClangIRFile, - /*Binary=*/false, - /*RemoveFileOnSignal=*/true, - /*UseTemporary=*/true); - } else if (!LangOptions.CIRFile.empty()) { - CIRStream = CompilerInstance.createOutputFile( - LangOptions.CIRFile, - /*Binary=*/false, - /*RemoveFileOnSignal=*/true, - /*UseTemporary=*/true); + CIRStream = CI.createOutputFile(FeOptions.ClangIRFile, + /*Binary=*/false, + /*RemoveFileOnSignal=*/true, + /*UseTemporary=*/true); + } else if (!LangOpts.CIRFile.empty()) { + CIRStream = CI.createOutputFile(LangOpts.CIRFile, + /*Binary=*/false, + /*RemoveFileOnSignal=*/true, + /*UseTemporary=*/true); } else { if (!FeOptions.OutputFile.empty() && FeOptions.OutputFile != "-") { DefaultPath = FeOptions.OutputFile; @@ -299,11 +292,10 @@ class CIRGenConsumer : public clang::ASTConsumer { DefaultPath = "clangir-output"; } llvm::sys::path::replace_extension(DefaultPath, "cir"); - CIRStream = CompilerInstance.createOutputFile( - DefaultPath, - /*Binary=*/false, - /*RemoveFileOnSignal=*/true, - /*UseTemporary=*/true); + CIRStream = CI.createOutputFile(DefaultPath, + /*Binary=*/false, + /*RemoveFileOnSignal=*/true, + /*UseTemporary=*/true); } if (CIRStream) { @@ -354,7 +346,7 @@ class CIRGenConsumer : public clang::ASTConsumer { case CIRGenAction::OutputType::EmitAssembly: { llvm::LLVMContext LlvmCtx; bool DisableDebugInfo = - CodeGenOptions.getDebugInfo() == llvm::codegenoptions::NoDebugInfo; + CodeGenOpts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo; auto LlvmModule = lowerFromCIRToLLVMIR( FeOptions, MlirMod, std::move(MlirCtx), LlvmCtx, FeOptions.ClangIRDisableCIRVerifier, @@ -362,10 +354,9 @@ class CIRGenConsumer : public clang::ASTConsumer { BackendAction BackendAction = getBackendActionFromOutputType(Action); - emitBackendOutput(CompilerInstance, CodeGenOptions, - C.getTargetInfo().getDataLayoutString(), - LlvmModule.get(), BackendAction, FS, - std::move(OutputStream)); + emitBackendOutput( + CI, CodeGenOpts, C.getTargetInfo().getDataLayoutString(), + LlvmModule.get(), BackendAction, FS, std::move(OutputStream)); break; } case CIRGenAction::OutputType::None: diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu index 834e45204b77..ea0e82873fad 100644 --- a/clang/test/CIR/CodeGen/CUDA/registration.cu +++ b/clang/test/CIR/CodeGen/CUDA/registration.cu @@ -13,6 +13,13 @@ // RUN: %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: -fcuda-include-gpubinary %t.fatbin \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s + + // CIR-HOST: module @"{{.*}}" attributes { // CIR-HOST: cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>, // CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>] @@ -50,6 +57,8 @@ __global__ void fn() {} +__device__ int a; + // CIR-HOST: cir.func internal private @__cuda_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) { // CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr // CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv" @@ -64,6 +73,16 @@ __global__ void fn() {} // CIR-HOST-SAME: %[[#DeviceFn]], // CIR-HOST-SAME: %[[#MinusOne]], // CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) +// CIR-HOST: %[[#T3:]] = cir.get_global @".stra0" +// CIR-HOST: %[[#Device:]] = cir.cast bitcast %[[#T3]] +// CIR-HOST: %[[#T4:]] = cir.get_global @a +// CIR-HOST: %[[#Host:]] = cir.cast bitcast %[[#T4]] +// CIR-HOST: %[[#Ext:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#Sz:]] = cir.const #cir.int<4> +// CIR-HOST: %[[#Const:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#Zero:]] = cir.const #cir.int<0> +// CIR-HOST: cir.call @__cudaRegisterVar(%arg0, %[[#Host]], %[[#Device]], %[[#Device]], +// CIR-HOST-SAME: %[[#Ext]], %[[#Sz]], %[[#Const]], %[[#Zero]]) // CIR-HOST: } // LLVM-HOST: define internal void @__cuda_register_globals(ptr %[[#LLVMFatbin:]]) { @@ -74,6 +93,9 @@ __global__ void fn() {} // LLVM-HOST-SAME: ptr @.str_Z2fnv, // LLVM-HOST-SAME: i32 -1, // LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM-HOST: call void @__cudaRegisterVar( +// LLVM-HOST-SAME: ptr %0, ptr @a, ptr @.stra0, ptr @.stra0, +// LLVM-HOST-SAME: i32 0, i64 4, i32 0, i32 0) // LLVM-HOST: } // The content in const array should be the same as echoed above, @@ -110,3 +132,51 @@ __global__ void fn() {} // LLVM-HOST: call void @__cudaRegisterFatBinaryEnd // LLVM-HOST: call i32 @atexit(ptr @__cuda_module_dtor) // LLVM-HOST: } + +// OGCG-HOST: @a = internal global i32 undef, align 4 +// OGCG-HOST: @0 = private unnamed_addr constant [7 x i8] c"_Z2fnv\00", align 1 +// OGCG-HOST: @1 = private unnamed_addr constant [2 x i8] c"a\00", align 1 +// OGCG-HOST: @2 = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin", align 8 +// OGCG-HOST: @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1180844977, i32 1, ptr @2, ptr null }, section ".nvFatBinSegment", align 8 +// OGCG-HOST: @__cuda_gpubin_handle = internal global ptr null, align 8 +// OGCG-HOST: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__cuda_module_ctor, ptr null }] + +// OGCG-HOST: define internal void @__cuda_register_globals(ptr %[[#HANDLE:]]) { +// OGCG-HOST: entry: +// OGCG-HOST: %1 = call i32 @__cudaRegisterFunction(ptr %[[#HANDLE]], +// OGCG-HOST-SAME: ptr @_Z17__device_stub__fnv, +// OGCG-HOST-SAME: ptr @0, +// OGCG-HOST-SAME: ptr @0, +// OGCG-HOST-SAME: i32 -1, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null, +// OGCG-HOST-SAME: ptr null) +// OGCG-HOST: call void @__cudaRegisterVar(ptr %[[#HANDLE]], +// OGCG-HOST-SAME: ptr @a, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: i32 0, +// OGCG-HOST-SAME: i64 4, +// OGCG-HOST-SAME: i32 0, +// OGCG-HOST-SAME: i32 0) +// OGCG-HOST: ret void +// OGCG-HOST: } + +// OGCG-HOST: define internal void @__cuda_module_ctor() { +// OGCG-HOST: entry: +// OGCG-HOST: %[[#WRAPADDR:]] = call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper) +// OGCG-HOST: store ptr %[[#WRAPADDR]], ptr @__cuda_gpubin_handle, align 8 +// OGCG-HOST: call void @__cuda_register_globals(ptr %[[#WRAPADDR]]) +// OGCG-HOST: call void @__cudaRegisterFatBinaryEnd(ptr %[[#WRAPADDR]]) +// OGCG-HOST: %1 = call i32 @atexit(ptr @__cuda_module_dtor) +// OGCG-HOST: ret void +// OGCG-HOST: } + +// OGCG-HOST: define internal void @__cuda_module_dtor() { +// OGCG-HOST: entry: +// OGCG-HOST: %[[#HANDLE:]] = load ptr, ptr @__cuda_gpubin_handle, align 8 +// OGCG-HOST: call void @__cudaUnregisterFatBinary(ptr %[[#HANDLE]]) +// OGCG-HOST: ret void +// OGCG-HOST: } diff --git a/clang/test/CIR/CodeGen/HIP/registration.cpp b/clang/test/CIR/CodeGen/HIP/registration.cpp index a8294e972909..815fae6afbfa 100644 --- a/clang/test/CIR/CodeGen/HIP/registration.cpp +++ b/clang/test/CIR/CodeGen/HIP/registration.cpp @@ -26,12 +26,14 @@ // CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__hip_module_ctor", {{[0-9]+}}>] // CIR-HOST: } +// LLVM-HOST: @.stra0 = private constant [2 x i8] c"a\00" // LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00" // LLVM-HOST: @__hip_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin" // LLVM-HOST: @__hip_fatbin_wrapper = internal constant { // LLVM-HOST: i32 1212764230, i32 1, ptr @__hip_fatbin_str, ptr null // LLVM-HOST: }, section ".hipFatBinSegment" // LLVM-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8 +// LLVM-HOST: @a = global i32 undef, align 4 // LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__hip_module_ctor // CIR-HOST: cir.func internal private @__hip_module_dtor() { @@ -66,6 +68,9 @@ __global__ void fn() {} + +__device__ int a; + // CIR-HOST: cir.func internal private @__hip_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) { // CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr // CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv" @@ -80,6 +85,23 @@ __global__ void fn() {} // CIR-HOST-SAME: %[[#DeviceFn]], // CIR-HOST-SAME: %[[#MinusOne]], // CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) +// CIR-HOST: %[[#GVARNAME:]] = cir.get_global @".stra0" +// CIR-HOST: %[[#GVARNAMEPTR:]] = cir.cast bitcast %[[#GVARNAME]] +// CIR-HOST: %[[#GVAR:]] = cir.get_global @a +// CIR-HOST: %[[#GVARPTR:]] = cir.cast bitcast %[[#GVAR]] +// CIR-HOST: %[[#ZERO:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#FOUR:]] = cir.const #cir.int<4> +// CIR-HOST: %[[#ZERON:]] = cir.const #cir.int<0> +// CIR-HOST: %[[#ZERONN:]] = cir.const #cir.int<0> +// CIR-HOST: cir.call @__hipRegisterVar(%[[FatbinHandle]], +// CIR-HOST-SAME: %[[#GVARPTR]], +// CIR-HOST-SAME: %[[#GVARNAMEPTR]], +// CIR-HOST-SAME: %[[#GVARNAMEPTR]], +// CIR-HOST-SAME: %[[#ZERO]], +// CIR-HOST-SAME: %[[#FOUR:]], +// CIR-HOST-SAME: %[[#ZERON]], +// CIR-HOST-SAME: %[[#ZERONN]]) +// CIR-HOST: cir.return loc(#loc) // CIR-HOST: } // LLVM-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) { @@ -90,6 +112,15 @@ __global__ void fn() {} // LLVM-HOST-SAME: ptr @.str_Z2fnv, // LLVM-HOST-SAME: i32 -1, // LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM-HOST: call void @__hipRegisterVar( +// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]], +// LLVM-HOST-SAME: ptr @a, +// LLVM-HOST-SAME: ptr @.stra0, +// LLVM-HOST-SAME: ptr @.stra0, +// LLVM-HOST-SAME: i32 0, +// LLVM-HOST-SAME: i64 4, +// LLVM-HOST-SAME: i32 0, +// LLVM-HOST-SAME: i32 0) // LLVM-HOST: } // The content in const array should be the same as echoed above, @@ -145,9 +176,11 @@ __global__ void fn() {} // LLVM-HOST: ret void // OGCG-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8 +// OGCG-HOST: @a = internal global i32 undef, align 4 // OGCG-HOST: @0 = private unnamed_addr constant [7 x i8] c"_Z2fnv\00", align 1 -// OGCG-HOST: @1 = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin", align 4096 -// OGCG-HOST: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @1, ptr null }, section ".hipFatBinSegment", align 8 +// OGCG-HOST: @1 = private unnamed_addr constant [2 x i8] c"a\00", align 1 +// OGCG-HOST: @2 = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin", align 4096 +// OGCG-HOST: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @2, ptr null }, section ".hipFatBinSegment", align 8 // OGCG-HOST: @__hip_gpubin_handle = internal global ptr null, align 8 // OGCG-HOST: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__hip_module_ctor, ptr null }] @@ -160,6 +193,15 @@ __global__ void fn() {} // OGCG-HOST-SAME: ptr @0, // OGCG-HOST-SAME: i32 -1, // OGCG-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// OGCG-HOST: call void @__hipRegisterVar( +// OGCG-HOST-SAME: ptr %[[#LLVMFatbin]], +// OGCG-HOST-SAME: ptr @a, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: ptr @1, +// OGCG-HOST-SAME: i32 0, +// OGCG-HOST-SAME: i64 4, +// OGCG-HOST-SAME: i32 0, i32 0) +// OGCG-HOST: ret void // OGCG-HOST: } // OGCG-HOST: define internal void @__hip_module_ctor() {