Skip to content

Commit 2a50df8

Browse files
authored
[CIR][HIP] Register fatbin in host hip runtime system (#1977)
1 parent e9bdd49 commit 2a50df8

File tree

2 files changed

+342
-15
lines changed

2 files changed

+342
-15
lines changed

clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp

Lines changed: 151 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
130130

131131
void buildCUDAModuleCtor();
132132
std::optional<FuncOp> buildCUDAModuleDtor();
133+
std::optional<FuncOp> buildHIPModuleDtor();
133134
std::optional<FuncOp> buildCUDARegisterGlobals();
134135

135136
void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder,
@@ -1046,8 +1047,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
10461047
std::move(cudaGPUBinaryOrErr.get());
10471048

10481049
// The section names are different for MAC OS X.
1049-
llvm::StringRef fatbinConstName = ".nv_fatbin";
1050-
llvm::StringRef fatbinSectionName = ".nvFatBinSegment";
1050+
llvm::StringRef fatbinConstName =
1051+
astCtx->getLangOpts().HIP ? ".hip_fatbin" : ".nv_fatbin";
1052+
1053+
llvm::StringRef fatbinSectionName =
1054+
astCtx->getLangOpts().HIP ? ".hipFatBinSegment" : ".nvFatBinSegment";
10511055

10521056
// Create a global variable with the contents of GPU binary.
10531057
auto fatbinType =
@@ -1119,7 +1123,66 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
11191123
globalCtorList.emplace_back(moduleCtorName,
11201124
cir::DefaultGlobalCtorDtorPriority);
11211125
builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
1126+
if (astCtx->getLangOpts().HIP) {
1127+
auto *entryBlock = builder.getInsertionBlock();
1128+
auto *parent = builder.getInsertionBlock()->getParent();
1129+
auto *ifBlock = builder.createBlock(parent);
1130+
auto *exitBlock = builder.createBlock(parent);
1131+
{
1132+
mlir::OpBuilder::InsertionGuard guard(builder);
1133+
builder.setInsertionPointToEnd(entryBlock);
1134+
mlir::Value handle =
1135+
builder.createLoad(loc, builder.createGetGlobal(gpubinHandle));
1136+
auto handlePtrTy = llvm::cast<cir::PointerType>(handle.getType());
1137+
mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc);
1138+
auto isNull =
1139+
builder.createCompare(loc, cir::CmpOpKind::eq, handle, nullPtr);
1140+
1141+
builder.create<cir::BrCondOp>(loc, isNull, ifBlock, exitBlock);
1142+
}
1143+
{
1144+
// When handle is null we need to load the fatbin and register it
1145+
mlir::OpBuilder::InsertionGuard guard(builder);
1146+
builder.setInsertionPointToStart(ifBlock);
1147+
auto wrapper = builder.createGetGlobal(fatbinWrapper);
1148+
auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
1149+
auto gpuBinaryHandleCall =
1150+
builder.createCallOp(loc, regFunc, fatbinVoidPtr);
1151+
auto gpuBinaryHandle = gpuBinaryHandleCall.getResult();
1152+
// Store the value back to the global `__cuda_gpubin_handle`.
1153+
auto gpuBinaryHandleGlobal = builder.createGetGlobal(gpubinHandle);
1154+
builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
1155+
builder.create<cir::BrOp>(loc, exitBlock);
1156+
}
1157+
{
1158+
// Exit block
1159+
mlir::OpBuilder::InsertionGuard guard(builder);
1160+
builder.setInsertionPointToStart(exitBlock);
1161+
mlir::Value gHandle =
1162+
builder.createLoad(loc, builder.createGetGlobal(gpubinHandle));
1163+
1164+
std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals();
1165+
if (regGlobal) {
1166+
builder.createCallOp(loc, *regGlobal, gHandle);
1167+
}
11221168

1169+
if (auto dtor = buildHIPModuleDtor()) {
1170+
cir::CIRBaseBuilderTy globalBuilder(getContext());
1171+
globalBuilder.setInsertionPointToStart(theModule.getBody());
1172+
FuncOp atexit = buildRuntimeFunction(
1173+
globalBuilder, "atexit", loc,
1174+
FuncType::get(PointerType::get(dtor->getFunctionType()), intTy));
1175+
1176+
mlir::Value dtorFunc = GetGlobalOp::create(
1177+
builder, loc, PointerType::get(dtor->getFunctionType()),
1178+
mlir::FlatSymbolRefAttr::get(dtor->getSymNameAttr()));
1179+
builder.createCallOp(loc, atexit, dtorFunc);
1180+
}
1181+
cir::ReturnOp::create(builder, loc);
1182+
}
1183+
return;
1184+
}
1185+
// CUDA CTOR-DTOR generations
11231186
// Register binary with CUDA runtime. This is substantially different in
11241187
// default mode vs. separate compilation.
11251188
// Corresponding code:
@@ -1243,9 +1306,10 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
12431306
auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp {
12441307
auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size());
12451308

1246-
auto tmpString = GlobalOp::create(
1247-
globalBuilder, loc, (".str" + str).str(), strType, /*isConstant=*/true,
1248-
/*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
1309+
auto tmpString =
1310+
GlobalOp::create(globalBuilder, loc, (".str" + str).str(), strType,
1311+
/*isConstant=*/true,
1312+
/*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
12491313

12501314
// We must make the string zero-terminated.
12511315
tmpString.setInitialValueAttr(ConstArrayAttr::get(
@@ -1260,19 +1324,91 @@ void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
12601324
GlobalOp deviceFuncStr = makeConstantString(kernelName);
12611325
mlir::Value deviceFunc = builder.createBitcast(
12621326
builder.createGetGlobal(deviceFuncStr), voidPtrTy);
1263-
mlir::Value hostFunc = builder.createBitcast(
1264-
GetGlobalOp::create(
1265-
builder, loc, PointerType::get(deviceStub.getFunctionType()),
1266-
mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
1267-
voidPtrTy);
1268-
builder.createCallOp(
1269-
loc, cudaRegisterFunction,
1270-
{fatbinHandle, hostFunc, deviceFunc, deviceFunc,
1271-
ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)), cirNullPtr,
1272-
cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
1327+
if (astCtx->getLangOpts().HIP) {
1328+
auto funcHandle = cast<GlobalOp>(theModule.lookupSymbol(kernelName));
1329+
mlir::Value hostFunc =
1330+
builder.createBitcast(builder.createGetGlobal(funcHandle), voidPtrTy);
1331+
builder.createCallOp(
1332+
loc, cudaRegisterFunction,
1333+
{fatbinHandle, hostFunc, deviceFunc, deviceFunc,
1334+
ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)),
1335+
cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
1336+
1337+
} else {
1338+
mlir::Value hostFunc = builder.createBitcast(
1339+
GetGlobalOp::create(
1340+
builder, loc, PointerType::get(deviceStub.getFunctionType()),
1341+
mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
1342+
voidPtrTy);
1343+
builder.createCallOp(
1344+
loc, cudaRegisterFunction,
1345+
{fatbinHandle, hostFunc, deviceFunc, deviceFunc,
1346+
ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)),
1347+
cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
1348+
}
12731349
}
12741350
}
12751351

1352+
std::optional<FuncOp> LoweringPreparePass::buildHIPModuleDtor() {
1353+
if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()))
1354+
return {};
1355+
1356+
std::string prefix = getCUDAPrefix(astCtx);
1357+
1358+
auto voidTy = VoidType::get(&getContext());
1359+
auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy));
1360+
1361+
auto loc = theModule.getLoc();
1362+
1363+
cir::CIRBaseBuilderTy builder(getContext());
1364+
builder.setInsertionPointToStart(theModule.getBody());
1365+
1366+
// void __hipUnregisterFatBinary(void ** andle);
1367+
std::string unregisterFuncName =
1368+
addUnderscoredPrefix(prefix, "UnregisterFatBinary");
1369+
FuncOp unregisterFunc = buildRuntimeFunction(
1370+
builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy));
1371+
1372+
// void __hip_module_dtor();
1373+
// Despite the name, OG doesn't treat it as a destructor, so it shouldn't be
1374+
// put into globalDtorList. If it were a real dtor, then it would cause
1375+
// double free. The way to use it is to manually call
1376+
// atexit() at end of module ctor.
1377+
std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor");
1378+
FuncOp dtor =
1379+
buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy),
1380+
GlobalLinkageKind::InternalLinkage);
1381+
1382+
std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle");
1383+
auto gpuBinGlobal = cast<GlobalOp>(theModule.lookupSymbol(gpubinName));
1384+
auto *entryBlock = dtor.addEntryBlock();
1385+
auto *ifBlock = builder.createBlock(&dtor.getBody());
1386+
auto *exitBlock = builder.createBlock(&dtor.getBody());
1387+
mlir::OpBuilder::InsertionGuard guard(builder);
1388+
builder.setInsertionPointToEnd(entryBlock);
1389+
mlir::Value handle =
1390+
builder.createLoad(loc, builder.createGetGlobal(gpuBinGlobal));
1391+
auto handlePtrTy = llvm::cast<cir::PointerType>(handle.getType());
1392+
mlir::Value nullPtr = builder.getNullPtr(handlePtrTy, loc);
1393+
auto isNull = builder.createCompare(loc, cir::CmpOpKind::ne, handle, nullPtr);
1394+
builder.create<cir::BrCondOp>(loc, isNull, ifBlock, exitBlock);
1395+
{
1396+
// When handle is not null we need to unregister it and store null to handle
1397+
mlir::OpBuilder::InsertionGuard guard(builder);
1398+
builder.setInsertionPointToStart(ifBlock);
1399+
builder.createCallOp(loc, unregisterFunc, handle);
1400+
builder.createStore(loc, nullPtr, builder.createGetGlobal(gpuBinGlobal));
1401+
builder.create<cir::BrOp>(loc, exitBlock);
1402+
}
1403+
{
1404+
// Exit block
1405+
mlir::OpBuilder::InsertionGuard guard(builder);
1406+
builder.setInsertionPointToStart(exitBlock);
1407+
cir::ReturnOp::create(builder, loc);
1408+
}
1409+
return dtor;
1410+
}
1411+
12761412
std::optional<FuncOp> LoweringPreparePass::buildCUDAModuleDtor() {
12771413
if (!theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()))
12781414
return {};
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#include "cuda.h"
2+
3+
// RUN: echo "sample fatbin" > %t.fatbin
4+
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
5+
// RUN: -x hip -emit-cir -fhip-new-launch-api -I%S/../Inputs/ \
6+
// RUN: -fcuda-include-gpubinary %t.fatbin \
7+
// RUN: %s -o %t.cir
8+
// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s
9+
10+
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
11+
// RUN: -x hip -emit-llvm -fhip-new-launch-api -I%S/../Inputs/ \
12+
// RUN: -fcuda-include-gpubinary %t.fatbin \
13+
// RUN: %s -o %t.ll
14+
// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s
15+
16+
// OGCG emits LLVM IR in different order than clangir, we add at the end the order of OGCG.
17+
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
18+
// RUN: -x hip -emit-llvm -fhip-new-launch-api -I%S/../Inputs/ \
19+
// RUN: -fcuda-include-gpubinary %t.fatbin \
20+
// RUN: %s -o %t.ll
21+
// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s
22+
23+
24+
// CIR-HOST: module @"{{.*}}" attributes {
25+
// CIR-HOST: cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>,
26+
// CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__hip_module_ctor", {{[0-9]+}}>]
27+
// CIR-HOST: }
28+
29+
// LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00"
30+
// LLVM-HOST: @__hip_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin"
31+
// LLVM-HOST: @__hip_fatbin_wrapper = internal constant {
32+
// LLVM-HOST: i32 1212764230, i32 1, ptr @__hip_fatbin_str, ptr null
33+
// LLVM-HOST: }, section ".hipFatBinSegment"
34+
// LLVM-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8
35+
// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__hip_module_ctor
36+
37+
// CIR-HOST: cir.func internal private @__hip_module_dtor() {
38+
// CIR-HOST: %[[#HandleGlobal:]] = cir.get_global @__hip_gpubin_handle
39+
// CIR-HOST: %[[#HandleAddr:]] = cir.load %[[#HandleGlobal]] : !cir.ptr<!cir.ptr<!cir.ptr<!void>>>, !cir.ptr<!cir.ptr<!void>> loc(#loc)
40+
// CIR-HOST: %[[#NullVal:]] = cir.const #cir.ptr<null> : !cir.ptr<!cir.ptr<!void>> loc(#loc)
41+
// CIR-HOST: %3 = cir.cmp(ne, %[[#HandleAddr]], %[[#NullVal]]) : !cir.ptr<!cir.ptr<!void>>, !cir.bool loc(#loc)
42+
// CIR-HOST: cir.brcond %3 ^bb1, ^bb2 loc(#loc)
43+
// CIR-HOST: ^bb1:
44+
// CIR-HOST: cir.call @__hipUnregisterFatBinary(%[[#HandleAddr]]) : (!cir.ptr<!cir.ptr<!void>>) -> () loc(#loc)
45+
// CIR-HOST: %[[#HandleAddr:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
46+
// CIR-HOST: cir.store %[[#NullVal]], %[[#HandleAddr]] : !cir.ptr<!cir.ptr<!void>>, !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
47+
// CIR-HOST: cir.br ^bb2 loc(#loc)
48+
// CIR-HOST: ^bb2: // 2 preds: ^bb0, ^bb1
49+
// CIR-HOST: cir.return loc(#loc)
50+
// CIR-HOST: } loc(#loc)
51+
52+
// LLVM-HOST: define internal void @__hip_module_dtor() {
53+
// LLVM-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
54+
// LLVM-HOST: %[[#ICMP:]] = icmp ne ptr %[[#LLVMHandleVar]], null
55+
// LLVM-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
56+
// LLVM-HOST: [[IFBLOCK]]: ; preds = %0
57+
// LLVM-HOST: call void @__hipUnregisterFatBinary(ptr %[[#LLVMHandleVar]])
58+
// LLVM-HOST: store ptr null, ptr @__hip_gpubin_handle, align 8
59+
// LLVM-HOST: br label %[[EXITBLOCK]]
60+
// LLVM-HOST: [[EXITBLOCK]]: ; preds = %[[IFBLOCK]], %0
61+
// LLVM-HOST: ret void
62+
// LLVM-HOST: }
63+
64+
// CIR-HOST: cir.global "private" constant cir_private @".str_Z2fnv" =
65+
// CIR-HOST-SAME: #cir.const_array<"_Z2fnv", trailing_zeros>
66+
67+
__global__ void fn() {}
68+
69+
// CIR-HOST: cir.func internal private @__hip_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) {
70+
// CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr<null>
71+
// CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv"
72+
// CIR-HOST: %[[#DeviceFn:]] = cir.cast bitcast %[[#T1]]
73+
// CIR-HOST: %[[#T2:]] = cir.get_global @_Z2fnv
74+
// CIR-HOST: %[[#HostFnHandle:]] = cir.cast bitcast %[[#T2]]
75+
// CIR-HOST: %[[#MinusOne:]] = cir.const #cir.int<-1>
76+
// CIR-HOST: cir.call @__hipRegisterFunction(
77+
// CIR-HOST-SAME: %[[FatbinHandle]],
78+
// CIR-HOST-SAME: %[[#HostFnHandle]],
79+
// CIR-HOST-SAME: %[[#DeviceFn]],
80+
// CIR-HOST-SAME: %[[#DeviceFn]],
81+
// CIR-HOST-SAME: %[[#MinusOne]],
82+
// CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]])
83+
// CIR-HOST: }
84+
85+
// LLVM-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) {
86+
// LLVM-HOST: call i32 @__hipRegisterFunction(
87+
// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]],
88+
// LLVM-HOST-SAME: ptr @_Z2fnv,
89+
// LLVM-HOST-SAME: ptr @.str_Z2fnv,
90+
// LLVM-HOST-SAME: ptr @.str_Z2fnv,
91+
// LLVM-HOST-SAME: i32 -1,
92+
// LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null)
93+
// LLVM-HOST: }
94+
95+
// The content in const array should be the same as echoed above,
96+
// with a trailing line break ('\n', 0x0A).
97+
// CIR-HOST: cir.global "private" constant cir_private @__hip_fatbin_str =
98+
// CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A">
99+
// CIR-HOST-SAME: {{.*}}section = ".hip_fatbin"
100+
101+
// The first value is HIP file head magic number.
102+
// CIR-HOST: cir.global "private" constant internal @__hip_fatbin_wrapper
103+
// CIR-HOST: = #cir.const_record<{
104+
// CIR-HOST: #cir.int<1212764230> : !s32i,
105+
// CIR-HOST: #cir.int<1> : !s32i,
106+
// CIR-HOST: #cir.global_view<@__hip_fatbin_str> : !cir.ptr<!void>,
107+
// CIR-HOST: #cir.ptr<null> : !cir.ptr<!void>
108+
// CIR-HOST: }>
109+
// CIR-HOST-SAME: {{.*}}section = ".hipFatBinSegment"
110+
111+
// CIR-HOST: cir.func internal private @__hip_module_ctor() {
112+
// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
113+
// CIR-HOST: %[[#HandleAddr:]] = cir.load %[[#HandleGlobalVar]] : !cir.ptr<!cir.ptr<!cir.ptr<!void>>>, !cir.ptr<!cir.ptr<!void>> loc(#loc)
114+
// CIR-HOST: %[[#NullVal:]] = cir.const #cir.ptr<null> : !cir.ptr<!cir.ptr<!void>> loc(#loc)
115+
// CIR-HOST: %[[#ICMP:]] = cir.cmp(eq, %[[#HandleAddr]], %[[#NullVal]]) : !cir.ptr<!cir.ptr<!void>>, !cir.bool loc(#loc)
116+
// CIR-HOST: cir.brcond %[[#ICMP]] ^bb1, ^bb2 loc(#loc)
117+
// CIR-HOST: ^bb1:
118+
// CIR-HOST: %[[#FatBinWrapper:]] = cir.get_global @__hip_fatbin_wrapper : !cir.ptr<!rec_anon_struct> loc(#loc)
119+
// CIR-HOST: %[[#CastGlobalFatBin:]] = cir.cast bitcast %[[#FatBinWrapper]] : !cir.ptr<!rec_anon_struct> -> !cir.ptr<!void> loc(#loc)
120+
// CIR-HOST: %[[#RTVal:]] = cir.call @__hipRegisterFatBinary(%[[#CastGlobalFatBin]]) : (!cir.ptr<!void>) -> !cir.ptr<!cir.ptr<!void>> loc(#loc)
121+
// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
122+
// CIR-HOST: cir.store %[[#RTVal]], %[[#HandleGlobalVar]] : !cir.ptr<!cir.ptr<!void>>, !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
123+
// CIR-HOST: cir.br ^bb2 loc(#loc)
124+
// CIR-HOST: ^bb2:
125+
// CIR-HOST: %[[#HandleGlobalVar:]] = cir.get_global @__hip_gpubin_handle : !cir.ptr<!cir.ptr<!cir.ptr<!void>>> loc(#loc)
126+
// CIR-HOST: %[[#HandleVal:]] = cir.load %8 : !cir.ptr<!cir.ptr<!cir.ptr<!void>>>, !cir.ptr<!cir.ptr<!void>> loc(#loc)
127+
// CIR-HOST: cir.call @__hip_register_globals(%[[#HandleVal]]) : (!cir.ptr<!cir.ptr<!void>>) -> () loc(#loc)
128+
// CIR-HOST: %[[#DTOR:]] = cir.get_global @__hip_module_dtor : !cir.ptr<!cir.func<()>> loc(#loc)
129+
// CIR-HOST: %11 = cir.call @atexit(%[[#DTOR]]) : (!cir.ptr<!cir.func<()>>) -> !s32i loc(#loc)
130+
// CIR-HOST: cir.return loc(#loc)
131+
// CIR-HOST: } loc(#loc)
132+
133+
// LLVM-HOST: define internal void @__hip_module_ctor() {
134+
// LLVM-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
135+
// LLVM-HOST: %[[#ICMP:]] = icmp eq ptr %[[#LLVMHandleVar]], null
136+
// LLVM-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
137+
// LLVM-HOST: [[IFBLOCK]]:
138+
// LLVM-HOST: %[[#Value:]] = call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper)
139+
// LLVM-HOST: store ptr %[[#Value]], ptr @__hip_gpubin_handle, align 8
140+
// LLVM-HOST: br label %[[EXITBLOCK]]
141+
// LLVM-HOST: [[EXITBLOCK]]:
142+
// LLVM-HOST: %[[#HandleValue:]] = load ptr, ptr @__hip_gpubin_handle, align 8
143+
// LLVM-HOST: call void @__hip_register_globals(ptr %[[#HandleValue]])
144+
// LLVM-HOST: call i32 @atexit(ptr @__hip_module_dtor)
145+
// LLVM-HOST: ret void
146+
147+
// OGCG-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8
148+
// OGCG-HOST: @0 = private unnamed_addr constant [7 x i8] c"_Z2fnv\00", align 1
149+
// OGCG-HOST: @1 = private constant [14 x i8] c"sample fatbin\0A", section ".hip_fatbin", align 4096
150+
// OGCG-HOST: @__hip_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1212764230, i32 1, ptr @1, ptr null }, section ".hipFatBinSegment", align 8
151+
// OGCG-HOST: @__hip_gpubin_handle = internal global ptr null, align 8
152+
// OGCG-HOST: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__hip_module_ctor, ptr null }]
153+
154+
// OGCG-HOST: define internal void @__hip_register_globals(ptr %[[#LLVMFatbin:]]) {
155+
// OGCG-HOST: entry:
156+
// OGCG-HOST: call i32 @__hipRegisterFunction(
157+
// OGCG-HOST-SAME: ptr %[[#LLVMFatbin]],
158+
// OGCG-HOST-SAME: ptr @_Z2fnv,
159+
// OGCG-HOST-SAME: ptr @0,
160+
// OGCG-HOST-SAME: ptr @0,
161+
// OGCG-HOST-SAME: i32 -1,
162+
// OGCG-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null)
163+
// OGCG-HOST: }
164+
165+
// OGCG-HOST: define internal void @__hip_module_ctor() {
166+
// OGCG-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
167+
// OGCG-HOST: %[[#ICMP:]] = icmp eq ptr %[[#LLVMHandleVar]], null
168+
// OGCG-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
169+
// OGCG-HOST: [[IFBLOCK]]:
170+
// OGCG-HOST: %[[#Value:]] = call ptr @__hipRegisterFatBinary(ptr @__hip_fatbin_wrapper)
171+
// OGCG-HOST: store ptr %[[#Value]], ptr @__hip_gpubin_handle, align 8
172+
// OGCG-HOST: br label %[[EXITBLOCK]]
173+
// OGCG-HOST: [[EXITBLOCK]]:
174+
// OGCG-HOST: %[[#HandleValue:]] = load ptr, ptr @__hip_gpubin_handle, align 8
175+
// OGCG-HOST: call void @__hip_register_globals(ptr %[[#HandleValue]])
176+
// OGCG-HOST: call i32 @atexit(ptr @__hip_module_dtor)
177+
// OGCG-HOST: ret void
178+
179+
// OGCG-HOST: define internal void @__hip_module_dtor() {
180+
// OGCG-HOST: entry:
181+
// OGCG-HOST: %[[#LLVMHandleVar:]] = load ptr, ptr @__hip_gpubin_handle, align 8
182+
// OGCG-HOST: %[[#ICMP:]] = icmp ne ptr %[[#LLVMHandleVar]], null
183+
// OGCG-HOST: br i1 %[[#ICMP]], label %[[IFBLOCK:[^,]+]], label %[[EXITBLOCK:[^,]+]]
184+
// OGCG-HOST: [[IFBLOCK]]: ; preds = %entry
185+
// OGCG-HOST: call void @__hipUnregisterFatBinary(ptr %[[#LLVMHandleVar]])
186+
// OGCG-HOST: store ptr null, ptr @__hip_gpubin_handle, align 8
187+
// OGCG-HOST: br label %[[EXITBLOCK]]
188+
// OGCG-HOST: [[EXITBLOCK]]: ; preds = %[[IFBLOCK]], %entry
189+
// OGCG-HOST: ret void
190+
// OGCG-HOST: }
191+

0 commit comments

Comments
 (0)