Skip to content

Commit 313ae8d

Browse files
committed
Actually interrupt blocking activities
Contributes to #185 Signed-off-by: Sylvain Hellegouarch <sh@defuze.org>
1 parent 4c268b3 commit 313ae8d

File tree

6 files changed

+131
-8
lines changed

6 files changed

+131
-8
lines changed

CHANGELOG.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,22 @@
22

33
## [Unreleased][]
44

5-
[Unreleased]: https://github.com/chaostoolkit/chaostoolkit-lib/compare/1.14.1...HEAD
5+
[Unreleased]: https://github.com/chaostoolkit/chaostoolkit-lib/compare/1.14.2...HEAD
6+
7+
### Added
8+
9+
- Raise the `chaoslib.exceptions.InterruptExecution` on `SIGTERM`
10+
- New exception `chaoslib.exceptions.ExperimentExitedException` that can only
11+
be injected into blocking background activities when we received the SIGUSR2
12+
signal
13+
- We now inject `chaoslib.exceptions.ExperimentExitedException` into blocking
14+
background activities when we received the SIGUSR2 signal. This is relying
15+
on https://docs.python.org/3/c-api/init.html#c.PyThreadState_SetAsyncExc
16+
There is only so much we can do to interrupt blocking calls and the limit is now reached
17+
because we have no control over any call that is out of the Python VM (just
18+
calling `time.sleep()` will get you in that situation). This is a constraint
19+
we have to live with and extension authors must keep this in mind when
20+
they create their blocking calls. [#185][185]
621

722
## [1.14.1][] - 2020-09-10
823

chaoslib/exceptions.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,16 @@ class InterruptExecution(ChaosException):
4444

4545
class InvalidControl(ChaosException):
4646
pass
47+
48+
49+
class ExperimentExitedException(ChaosException):
50+
"""
51+
Only raised when the process received a SIGUSR2 signal.
52+
53+
Raised into the blocking background activities of the method only.
54+
55+
If you catch it, this means you can clean up your activity but you should
56+
really raise another exception again to let the Chaos Toolkit
57+
quickly terminate.
58+
"""
59+
pass

chaoslib/exit.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def my_probe():
4040
4141
WARNING: Only available on Unix/Linux systems.
4242
"""
43+
from chaoslib.exceptions import InterruptExecution
4344
from contextlib import contextmanager
4445
import inspect
4546
import os
@@ -158,3 +159,4 @@ def _terminate_now(signum: int, frame: FrameType = None) -> None:
158159
"""
159160
if signum == signal.SIGTERM:
160161
logger.warning("Caught SIGTERM signal, interrupting experiment now")
162+
raise InterruptExecution("SIGTERM signal received")

chaoslib/run.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# -*- coding: utf-8 -*-
22
from concurrent.futures import Future, ThreadPoolExecutor, TimeoutError
3+
try:
4+
import ctypes
5+
HAS_CTYPES = True
6+
except ImportError:
7+
HAS_CTYPES = False
38
from datetime import datetime
49
import platform
510
import threading
@@ -12,7 +17,8 @@
1217
from chaoslib.activity import run_activities
1318
from chaoslib.control import initialize_controls, controls, cleanup_controls, \
1419
Control, initialize_global_controls, cleanup_global_controls
15-
from chaoslib.exceptions import ChaosException, InterruptExecution
20+
from chaoslib.exceptions import ChaosException, ExperimentExitedException, \
21+
InterruptExecution
1622
from chaoslib.exit import exit_signals
1723
from chaoslib.configuration import load_configuration
1824
from chaoslib.hypothesis import run_steady_state_hypothesis
@@ -754,10 +760,11 @@ def apply_activities(experiment: Experiment, configuration: Configuration,
754760
logger.debug("Waiting for background activities to complete")
755761
pool.shutdown(wait=True)
756762
elif pool:
763+
harshly_terminate_pending_background_activities(pool)
757764
logger.debug(
758765
"Do not wait for the background activities to finish "
759766
"as per signal")
760-
background_activity_timeout = 0.1
767+
background_activity_timeout = 0.2
761768
pool.shutdown(wait=False)
762769

763770
for index, run in enumerate(runs):
@@ -828,3 +835,39 @@ def has_steady_state_hypothesis_with_probes(experiment: Experiment) -> bool:
828835
if probes:
829836
return len(probes) > 0
830837
return False
838+
839+
840+
def harshly_terminate_pending_background_activities(
        pool: ThreadPoolExecutor) -> None:
    """
    Ugly hack to try to force background activities to terminate now.

    Asks the interpreter, through `PyThreadState_SetAsyncExc`, to raise
    `ExperimentExitedException` inside each worker thread of the given pool.

    This can only have an impact over functions that are still in the Python
    land. Any code outside of the Python VM (say calling a C function, even
    time.sleep()) will not be impacted and therefore will continue hanging
    until it does complete of its own accord.

    This could have really bizarre side effects so it's only applied when
    a SIGUSR2 signal was received.

    :param pool: the executor whose worker threads should be interrupted
    """
    if not HAS_CTYPES:
        logger.debug(
            "Your Python implementation does not provide the `ctypes` "
            "module and we cannot terminate very harshly running background "
            "activities.")
        return

    logger.debug(
        "Harshly trying to interrupt remaining background activities still "
        "running")

    set_async_exc = ctypes.pythonapi.PyThreadState_SetAsyncExc

    # oh and of course we use private properties... might as well when trying
    # to be ugly. We iterate over a snapshot so the underlying set cannot
    # change size under our feet.
    for thread in list(pool._threads):
        if thread.ident is None:
            # the thread was never started, there is nothing to interrupt
            continue

        # the thread id parameter is an unsigned long since Python 3.7
        tid = ctypes.c_ulong(thread.ident)

        # acquire outside of the try block so that the finally clause never
        # refers to an unbound `gil` should the acquisition itself fail
        gil = ctypes.pythonapi.PyGILState_Ensure()
        try:
            modified = set_async_exc(
                tid, ctypes.py_object(ExperimentExitedException))
            if modified > 1:
                # more than one thread state was affected, which should
                # never happen: revert the request by passing NULL
                set_async_exc(tid, None)
                logger.debug(
                    "Failed to interrupt background activity thread cleanly")
        finally:
            ctypes.pythonapi.PyGILState_Release(gil)

tests/fixtures/experiments.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,3 +969,37 @@
969969
}
970970
]
971971
}
972+
973+
974+
SimpleExperimentWithBackgroundActivity = {
975+
"version": "1.0.0",
976+
"title": "Hello world!",
977+
"description": "Say hello world.",
978+
"method": [
979+
{
980+
"type": "action",
981+
"name": "pretend-we-do-stuff",
982+
"background": True,
983+
"provider": {
984+
"type": "python",
985+
"module": "fixtures.longpythonfunc",
986+
"func": "be_long",
987+
"arguments": {
988+
"howlong": 3
989+
}
990+
}
991+
},
992+
{
993+
"type": "action",
994+
"name": "pretend-we-do-stuff-again",
995+
"provider": {
996+
"type": "python",
997+
"module": "fixtures.longpythonfunc",
998+
"func": "be_long",
999+
"arguments": {
1000+
"howlong": 4
1001+
}
1002+
}
1003+
}
1004+
]
1005+
}

tests/test_exit.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import time
44
from wsgiref.simple_server import WSGIServer, WSGIRequestHandler
55

6+
from chaoslib.exit import exit_gracefully, exit_ungracefully
67
from chaoslib.run import Runner
78
from chaoslib.types import Strategy
89

@@ -116,14 +117,29 @@ def test_wait_for_background_activity_on_graceful_exit():
116117

117118

118119
def test_do_not_wait_for_background_activity_on_ungraceful_exit():
119-
server = threading.Thread(target=run_http_server_in_background)
120-
server.start()
120+
def _exit_soon():
121+
time.sleep(1.5)
122+
exit_ungracefully()
123+
t = threading.Thread(target=_exit_soon)
121124

122-
x = deepcopy(experiments.ExperimentUngracefulExitLongHTTPCall)
125+
x = deepcopy(experiments.SimpleExperimentWithBackgroundActivity)
123126
with Runner(Strategy.DEFAULT) as runner:
127+
t.start()
124128
journal = runner.run(x)
125-
126129
assert journal["status"] == "interrupted"
127130
assert journal["run"][0]["status"] == "failed"
131+
assert "ExperimentExitedException" in journal["run"][0]["exception"][-1]
128132

129-
server.join()
133+
134+
def test_wait_for_background_activity_to_finish_on_graceful_exit():
135+
def _exit_soon():
136+
time.sleep(1.5)
137+
exit_gracefully()
138+
t = threading.Thread(target=_exit_soon)
139+
140+
x = deepcopy(experiments.SimpleExperimentWithBackgroundActivity)
141+
with Runner(Strategy.DEFAULT) as runner:
142+
t.start()
143+
journal = runner.run(x)
144+
assert journal["status"] == "interrupted"
145+
assert journal["run"][0]["status"] == "succeeded"

0 commit comments

Comments
 (0)