Skip to content

Commit a1f4915

Browse files
committed
Fix race condition in C API init
1 parent fc9547c commit a1f4915

File tree

4 files changed

+153
-91
lines changed

4 files changed

+153
-91
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextBuiltins.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1887,10 +1887,18 @@ private static PythonManagedClass lookupBuiltinTypeWithName(PythonContext contex
18871887
} else {
18881888
String module = name.substring(0, index);
18891889
name = name.substring(index + 1);
1890-
Object moduleObject = core.lookupBuiltinModule(toTruffleStringUncached(module));
1890+
TruffleString tsModule = toTruffleStringUncached(module);
1891+
Object moduleObject = core.lookupBuiltinModule(tsModule);
18911892
if (moduleObject == null) {
1892-
moduleObject = AbstractImportNode.importModule(toTruffleStringUncached(module));
1893+
moduleObject = AbstractImportNode.lookupImportedModule(context, tsModule);
1894+
if (moduleObject == null) {
1895+
throw CompilerDirectives.shouldNotReachHere(String.format(
1896+
"Module '%s' is needed during C API initialization, but was not imported prior to the initialization in ensureCapiWasLoaded. This is an internal error in GraalPy.",
1897+
module));
1898+
}
18931899
}
1900+
// Assumption: builtin modules' tp_getattro is well-behaved and just reads the
1901+
// attribute, there is no locking or blocking inside
18941902
Object attribute = PyObjectGetAttr.getUncached().execute(null, moduleObject, toTruffleStringUncached(name));
18951903
if (attribute != PNone.NO_VALUE) {
18961904
if (attribute instanceof PythonBuiltinClassType builtinType) {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/capi/CApiContext.java

Lines changed: 117 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
import java.util.concurrent.ConcurrentHashMap;
6767
import java.util.concurrent.atomic.AtomicInteger;
6868
import java.util.concurrent.atomic.AtomicLong;
69+
import java.util.concurrent.locks.ReentrantLock;
6970
import java.util.logging.Level;
7071

7172
import org.graalvm.collections.Pair;
@@ -113,6 +114,7 @@
113114
import com.oracle.graal.python.nodes.ErrorMessages;
114115
import com.oracle.graal.python.nodes.PRaiseNode;
115116
import com.oracle.graal.python.nodes.object.GetClassNode;
117+
import com.oracle.graal.python.nodes.statement.AbstractImportNode;
116118
import com.oracle.graal.python.runtime.GilNode;
117119
import com.oracle.graal.python.runtime.PosixConstants;
118120
import com.oracle.graal.python.runtime.PythonContext;
@@ -905,106 +907,133 @@ public static CApiContext ensureCapiWasLoaded(Node node, PythonContext context,
905907
}
906908

907909
@TruffleBoundary
910+
@SuppressWarnings("try")
908911
public static CApiContext ensureCapiWasLoaded(Node node, PythonContext context, TruffleString name, TruffleString path, String reason) throws IOException, ImportException, ApiInitException {
909912
assert PythonContext.get(null).ownsGil(); // unsafe lazy initialization
910-
if (!context.hasCApiContext()) {
911-
Env env = context.getEnv();
912-
InteropLibrary U = InteropLibrary.getUncached();
913-
914-
TruffleFile homePath = env.getInternalTruffleFile(context.getCAPIHome().toJavaStringUncached());
915-
// e.g. "libpython-native.so"
916-
String libName = PythonContext.getSupportLibName("python-native");
917-
final TruffleFile capiFile = homePath.resolve(libName).getCanonicalFile();
913+
// The initialization may run Python code (e.g., module import in
914+
// GraalPyPrivate_InitBuiltinTypesAndStructs), so just holding the GIL is not enough
915+
if (!context.isCApiInitialized()) {
916+
917+
// We import those modules ahead of the initialization without the initialization lock
918+
// to avoid deadlocks. We would have imported them in the initialization anyway, this
919+
// way we can just simply look up already imported modules during the initialization
920+
// without running the complex import machinery and risking a deadlock
921+
AbstractImportNode.importModule(toTruffleStringUncached("datetime"));
922+
AbstractImportNode.importModule(toTruffleStringUncached("types"));
923+
924+
ReentrantLock initLock = context.getcApiInitializationLock();
925+
try (var ignored = GilNode.uncachedAcquire()) {
926+
TruffleSafepoint.setBlockedThreadInterruptible(node, ReentrantLock::lockInterruptibly, initLock);
927+
}
918928
try {
919-
SourceBuilder capiSrcBuilder;
920-
boolean useNative = true;
921-
boolean isolateNative = PythonOptions.IsolateNativeModules.getValue(env.getOptions());
922-
final NativeLibraryLocator loc;
923-
if (!isolateNative) {
924-
useNative = nativeCAPILoaded.compareAndSet(NO_NATIVE_CONTEXT, GLOBAL_NATIVE_CONTEXT);
925-
} else {
926-
useNative = nativeCAPILoaded.compareAndSet(NO_NATIVE_CONTEXT, ISOLATED_NATIVE_CONTEXT) || nativeCAPILoaded.get() == ISOLATED_NATIVE_CONTEXT;
927-
}
928-
if (!useNative) {
929-
String actualReason = "initialize native extensions support";
930-
if (reason != null) {
931-
actualReason = reason;
932-
} else if (name != null && path != null) {
933-
actualReason = String.format("load a native module '%s' from path '%s'", name.toJavaStringUncached(), path.toJavaStringUncached());
934-
}
935-
throw new ApiInitException(toTruffleStringUncached(
936-
String.format("Option python.IsolateNativeModules is set to 'false' and a second GraalPy context attempted to %s. " +
937-
"At least one context in this process runs with 'IsolateNativeModules' set to false. " +
938-
"Depending on the order of context creation, this means some contexts in the process " +
939-
"cannot use native module.", actualReason)));
940-
}
941-
loc = new NativeLibraryLocator(context, capiFile, isolateNative);
942-
context.ensureNFILanguage(node, "allowNativeAccess", "true");
943-
String dlopenFlags = isolateNative ? "RTLD_LOCAL" : "RTLD_GLOBAL";
944-
capiSrcBuilder = Source.newBuilder(J_NFI_LANGUAGE, String.format("load(%s) \"%s\"", dlopenFlags, loc.getCapiLibrary()), "<libpython>");
945-
LOGGER.config(() -> "loading CAPI from " + loc.getCapiLibrary() + " as native");
946-
if (!context.getLanguage().getEngineOption(PythonOptions.ExposeInternalSources)) {
947-
capiSrcBuilder.internal(true);
948-
}
949-
CallTarget capiLibraryCallTarget = context.getEnv().parseInternal(capiSrcBuilder.build());
950-
951-
Object capiLibrary = capiLibraryCallTarget.call();
952-
Object initFunction = U.readMember(capiLibrary, "initialize_graal_capi");
953-
CApiContext cApiContext = new CApiContext(context, capiLibrary, loc);
954-
context.setCApiContext(cApiContext);
955-
956-
try (BuiltinArrayWrapper builtinArrayWrapper = new BuiltinArrayWrapper()) {
957-
/*
958-
* The GC state needs to be created before the first managed object is sent to
959-
* native. This is because the native object stub could take part in GC and will
960-
* then already require the GC state.
961-
*/
962-
Object gcState = cApiContext.createGCState();
963-
Object signature = env.parseInternal(Source.newBuilder(J_NFI_LANGUAGE, "(ENV,POINTER,POINTER):VOID", "exec").build()).call();
964-
initFunction = SignatureLibrary.getUncached().bind(signature, initFunction);
965-
U.execute(initFunction, builtinArrayWrapper, gcState);
929+
if (!context.isCApiInitialized()) {
930+
// loadCApi must set C API context half-way through its execution so that it can
931+
// run code that needs C API context
932+
loadCApi(node, context, name, path, reason);
933+
context.setCApiInitialized(); // volatile write
966934
}
935+
} finally {
936+
initLock.unlock();
937+
}
938+
}
939+
return context.getCApiContext();
940+
}
967941

968-
assert PythonCApiAssertions.assertBuiltins(capiLibrary);
969-
cApiContext.pyDateTimeCAPICapsule = PyDateTimeCAPIWrapper.initWrapper(context, cApiContext);
970-
context.runCApiHooks();
942+
private static CApiContext loadCApi(Node node, PythonContext context, TruffleString name, TruffleString path, String reason) throws IOException, ImportException, ApiInitException {
943+
Env env = context.getEnv();
944+
InteropLibrary U = InteropLibrary.getUncached();
971945

972-
/*
973-
* C++ libraries sometimes declare global objects that have destructors that call
974-
* Py_DECREF. Those destructors are then called during native shutdown, which is
975-
* after the JVM/SVM shut down and the upcall would segfault. This finalizer code
976-
* rebinds reference operations to native no-ops that don't upcall. In normal
977-
* scenarios we call it during context exit, but when the VM is terminated by a
978-
* signal, the context exit is skipped. For that case we set up the shutdown hook.
979-
*/
980-
Object finalizeFunction = U.readMember(capiLibrary, "GraalPyPrivate_GetFinalizeCApiPointer");
981-
Object finalizeSignature = env.parseInternal(Source.newBuilder(J_NFI_LANGUAGE, "():POINTER", "exec").build()).call();
982-
Object finalizingPointer = SignatureLibrary.getUncached().call(finalizeSignature, finalizeFunction);
983-
try {
984-
cApiContext.addNativeFinalizer(context, finalizingPointer);
985-
cApiContext.runBackgroundGCTask(context);
986-
} catch (RuntimeException e) {
987-
// This can happen when other languages restrict multithreading
988-
LOGGER.warning(() -> "didn't register a native finalizer due to: " + e.getMessage());
946+
TruffleFile homePath = env.getInternalTruffleFile(context.getCAPIHome().toJavaStringUncached());
947+
// e.g. "libpython-native.so"
948+
String libName = PythonContext.getSupportLibName("python-native");
949+
final TruffleFile capiFile = homePath.resolve(libName).getCanonicalFile();
950+
try {
951+
SourceBuilder capiSrcBuilder;
952+
boolean useNative = true;
953+
boolean isolateNative = PythonOptions.IsolateNativeModules.getValue(env.getOptions());
954+
final NativeLibraryLocator loc;
955+
if (!isolateNative) {
956+
useNative = nativeCAPILoaded.compareAndSet(NO_NATIVE_CONTEXT, GLOBAL_NATIVE_CONTEXT);
957+
} else {
958+
useNative = nativeCAPILoaded.compareAndSet(NO_NATIVE_CONTEXT, ISOLATED_NATIVE_CONTEXT) || nativeCAPILoaded.get() == ISOLATED_NATIVE_CONTEXT;
959+
}
960+
if (!useNative) {
961+
String actualReason = "initialize native extensions support";
962+
if (reason != null) {
963+
actualReason = reason;
964+
} else if (name != null && path != null) {
965+
actualReason = String.format("load a native module '%s' from path '%s'", name.toJavaStringUncached(), path.toJavaStringUncached());
989966
}
967+
throw new ApiInitException(toTruffleStringUncached(
968+
String.format("Option python.IsolateNativeModules is set to 'false' and a second GraalPy context attempted to %s. " +
969+
"At least one context in this process runs with 'IsolateNativeModules' set to false. " +
970+
"Depending on the order of context creation, this means some contexts in the process " +
971+
"cannot use native module.", actualReason)));
972+
}
973+
loc = new NativeLibraryLocator(context, capiFile, isolateNative);
974+
context.ensureNFILanguage(node, "allowNativeAccess", "true");
975+
String dlopenFlags = isolateNative ? "RTLD_LOCAL" : "RTLD_GLOBAL";
976+
capiSrcBuilder = Source.newBuilder(J_NFI_LANGUAGE, String.format("load(%s) \"%s\"", dlopenFlags, loc.getCapiLibrary()), "<libpython>");
977+
LOGGER.config(() -> "loading CAPI from " + loc.getCapiLibrary() + " as native");
978+
if (!context.getLanguage().getEngineOption(PythonOptions.ExposeInternalSources)) {
979+
capiSrcBuilder.internal(true);
980+
}
981+
CallTarget capiLibraryCallTarget = context.getEnv().parseInternal(capiSrcBuilder.build());
982+
983+
Object capiLibrary = capiLibraryCallTarget.call();
984+
Object initFunction = U.readMember(capiLibrary, "initialize_graal_capi");
985+
CApiContext cApiContext = new CApiContext(context, capiLibrary, loc);
986+
context.setCApiContext(cApiContext);
990987

991-
return cApiContext;
992-
} catch (PException e) {
988+
try (BuiltinArrayWrapper builtinArrayWrapper = new BuiltinArrayWrapper()) {
993989
/*
994-
* Python exceptions that occur during the C API initialization are just passed
995-
* through
990+
* The GC state needs to be created before the first managed object is sent to
991+
* native. This is because the native object stub could take part in GC and will
992+
* then already require the GC state.
996993
*/
997-
throw e;
998-
} catch (RuntimeException | UnsupportedMessageException | ArityException | UnknownIdentifierException | UnsupportedTypeException e) {
999-
// we cannot really check if we truly need native access, so
1000-
// when the abi contains "managed" we assume we do not
1001-
if (!libName.contains("managed") && !context.isNativeAccessAllowed()) {
1002-
throw new ImportException(null, name, path, ErrorMessages.NATIVE_ACCESS_NOT_ALLOWED);
1003-
}
1004-
throw new ApiInitException(e);
994+
Object gcState = cApiContext.createGCState();
995+
Object signature = env.parseInternal(Source.newBuilder(J_NFI_LANGUAGE, "(ENV,POINTER,POINTER):VOID", "exec").build()).call();
996+
initFunction = SignatureLibrary.getUncached().bind(signature, initFunction);
997+
U.execute(initFunction, builtinArrayWrapper, gcState);
998+
}
999+
1000+
assert PythonCApiAssertions.assertBuiltins(capiLibrary);
1001+
cApiContext.pyDateTimeCAPICapsule = PyDateTimeCAPIWrapper.initWrapper(context, cApiContext);
1002+
context.runCApiHooks();
1003+
1004+
/*
1005+
* C++ libraries sometimes declare global objects that have destructors that call
1006+
* Py_DECREF. Those destructors are then called during native shutdown, which is after
1007+
* the JVM/SVM shut down and the upcall would segfault. This finalizer code rebinds
1008+
* reference operations to native no-ops that don't upcall. In normal scenarios we call
1009+
* it during context exit, but when the VM is terminated by a signal, the context exit
1010+
* is skipped. For that case we set up the shutdown hook.
1011+
*/
1012+
Object finalizeFunction = U.readMember(capiLibrary, "GraalPyPrivate_GetFinalizeCApiPointer");
1013+
Object finalizeSignature = env.parseInternal(Source.newBuilder(J_NFI_LANGUAGE, "():POINTER", "exec").build()).call();
1014+
Object finalizingPointer = SignatureLibrary.getUncached().call(finalizeSignature, finalizeFunction);
1015+
try {
1016+
cApiContext.addNativeFinalizer(context, finalizingPointer);
1017+
cApiContext.runBackgroundGCTask(context);
1018+
} catch (RuntimeException e) {
1019+
// This can happen when other languages restrict multithreading
1020+
LOGGER.warning(() -> "didn't register a native finalizer due to: " + e.getMessage());
1021+
}
1022+
1023+
return cApiContext;
1024+
} catch (PException e) {
1025+
/*
1026+
* Python exceptions that occur during the C API initialization are just passed through
1027+
*/
1028+
throw e;
1029+
} catch (RuntimeException | UnsupportedMessageException | ArityException | UnknownIdentifierException | UnsupportedTypeException e) {
1030+
// we cannot really check if we truly need native access, so
1031+
// when the abi contains "managed" we assume we do not
1032+
if (!libName.contains("managed") && !context.isNativeAccessAllowed()) {
1033+
throw new ImportException(null, name, path, ErrorMessages.NATIVE_ACCESS_NOT_ALLOWED);
10051034
}
1035+
throw new ApiInitException(e);
10061036
}
1007-
return context.getCApiContext();
10081037
}
10091038

10101039
private static final Set<String> C_EXT_SUPPORTED_LIST = Set.of(

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/statement/AbstractImportNode.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ public static PythonModule importModule(TruffleString name) {
118118
}
119119
Object fromList = PFactory.createTuple(context.getLanguage(), PythonUtils.EMPTY_TRUFFLESTRING_ARRAY);
120120
CallNode.executeUncached(builtinImport, name, PNone.NONE, PNone.NONE, fromList, 0);
121+
return lookupImportedModule(context, name);
122+
}
123+
124+
@TruffleBoundary
125+
public static PythonModule lookupImportedModule(PythonContext context, TruffleString name) {
121126
PythonModule sysModule = context.lookupBuiltinModule(T_SYS);
122127
Object modules = sysModule.getAttribute(T_MODULES);
123128
if (modules == PNone.NO_VALUE) {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PythonContext.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@
9898
import java.util.concurrent.locks.ReentrantLock;
9999
import java.util.logging.Level;
100100

101-
import com.oracle.graal.python.nodes.attributes.ReadAttributeFromModuleNode;
102101
import org.graalvm.options.OptionKey;
103102

104103
import com.oracle.graal.python.PythonLanguage;
@@ -146,6 +145,7 @@
146145
import com.oracle.graal.python.nodes.SpecialAttributeNames;
147146
import com.oracle.graal.python.nodes.SpecialMethodNames;
148147
import com.oracle.graal.python.nodes.WriteUnraisableNode;
148+
import com.oracle.graal.python.nodes.attributes.ReadAttributeFromModuleNode;
149149
import com.oracle.graal.python.nodes.attributes.ReadAttributeFromObjectNode;
150150
import com.oracle.graal.python.nodes.bytecode_dsl.PBytecodeDSLRootNode;
151151
import com.oracle.graal.python.nodes.call.CallNode;
@@ -769,6 +769,8 @@ PythonThreadState getThreadState(Node n) {
769769
private OutputStream out;
770770
private OutputStream err;
771771
private InputStream in;
772+
private final ReentrantLock cApiInitializationLock = new ReentrantLock(false);
773+
private volatile boolean cApiWasInitialized = false;
772774
@CompilationFinal private CApiContext cApiContext;
773775
@CompilationFinal private boolean nativeAccessAllowed;
774776

@@ -2640,13 +2642,31 @@ private static void releaseSentinelLock(WeakReference<PLock> sentinelLockWeakref
26402642
}
26412643

26422644
public boolean hasCApiContext() {
2645+
// This may be called during C API initialization, we have a context so that we can finish
2646+
// the initialization, but the C API is not fully initialized yet
2647+
assert (cApiContext != null) || !cApiWasInitialized;
26432648
return cApiContext != null;
26442649
}
26452650

2651+
public boolean isCApiInitialized() {
2652+
assert (cApiContext != null) || !cApiWasInitialized;
2653+
return cApiWasInitialized;
2654+
}
2655+
2656+
public void setCApiInitialized() {
2657+
assert cApiContext != null;
2658+
cApiWasInitialized = true;
2659+
}
2660+
26462661
public CApiContext getCApiContext() {
2662+
assert (cApiContext != null) || !cApiWasInitialized;
26472663
return cApiContext;
26482664
}
26492665

2666+
public ReentrantLock getcApiInitializationLock() {
2667+
return cApiInitializationLock;
2668+
}
2669+
26502670
public void setCApiContext(CApiContext capiContext) {
26512671
assert this.cApiContext == null : "tried to create new C API context but it was already created";
26522672
this.cApiContext = capiContext;

0 commit comments

Comments
 (0)