Skip to content

Commit 2b3ec5b

Browse files
authored
Merge pull request #186 from chaostoolkit/exit-shortcut
Add exit functions
2 parents ec23bba + d7f0438 commit 2b3ec5b

File tree

8 files changed

+654
-32
lines changed

8 files changed

+654
-32
lines changed

CHANGELOG.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,22 @@
44

55
[Unreleased]: https://github.com/chaostoolkit/chaostoolkit-lib/compare/1.13.1...HEAD
66

7+
### Added
8+
9+
- Add two functions to programmatically exit the experiment as soon as feasible
10+
by the Python VM [#185][185]:
11+
* `chaoslib.exit.exit_gracefully`: terminates but abides by the rollback strategy
12+
* `chaoslib.exit.exit_ungracefully`: terminates but bypasses rollbacks entirely
13+
and does not wait for background actions/probes still running
14+
This should mostly be useful to have a harsh way to interrupt an execution
15+
and is therefore an advanced concept with undesirable side effects (though
16+
Chaos Toolkit tries to do as right as it can).
17+
18+
CAVEAT: Only works on Unix/Linux systems implementing SIGUSR1/SIGUSR2 signals
19+
20+
[185]: https://github.com/chaostoolkit/chaostoolkit-lib/issues/185
21+
22+
723
## [1.13.1][] - 2020-09-07
824

925
[1.13.1]: https://github.com/chaostoolkit/chaostoolkit-lib/compare/1.13.0...1.13.1

chaoslib/activity.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ def run_activity(activity: Activity, configuration: Configuration,
219219
This is an internal function and should probably avoid being called
220220
outside this package.
221221
"""
222+
result = None
222223
try:
223224
provider = activity["provider"]
224225
activity_type = provider["type"]

chaoslib/exit.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
"""
2+
This module is advanced usage and mostly interesting to users who need to
3+
be able to terminate an experiment's execution as fast as possible and,
4+
potentially, without much care for cleaning up afterwards.
5+
6+
If you want to interrupt an execution but can afford to wait for graceful
7+
completion (current activity, rollbacks...) it's probably best to rely on
8+
the control interface.
9+
10+
If you need this utility here, you can simply do as follows:
11+
12+
13+
```python
14+
from chaoslib.exit import exit_ungracefully
15+
16+
def my_probe():
17+
# whatever condition comes up that shows you need to terminate asap
18+
exit_ungracefully()
19+
```
20+
21+
Then in your experiment
22+
23+
```json
24+
"method": [
25+
{
26+
"type": "probe",
27+
"name": "interrupt-when-system-is-unhappy",
28+
"background": true,
29+
"provider": {
30+
"type": "python",
31+
"module": "mymod",
32+
"func": "my_probe"
33+
}
34+
}
35+
...
36+
]
37+
```
38+
39+
This will start your probe in the background.
40+
41+
WARNING: Only available on Unix/Linux systems.
42+
"""
43+
from contextlib import contextmanager
44+
import inspect
45+
import os
46+
import platform
47+
import signal
48+
from types import FrameType
49+
50+
from logzero import logger
51+
52+
__all__ = ["exit_gracefully", "exit_ungracefully", "exit_signals"]
53+
54+
55+
@contextmanager
def exit_signals():
    """
    Install `_leave_now` as the handler for SIGUSR1 and SIGUSR2 for the
    duration of the `with` block, then put the previous handlers back.

    Both signals are routed to the same handler, which raises `SystemExit`
    with a code that depends on the signal received:

    * SIGUSR1: exit code 20
    * SIGUSR2: exit code 30

    When the `signal` module does not expose both signals, nothing is
    registered: a debug message is logged and the block runs unprotected.

    WARNING: Only available on Unix/Linux systems.
    """
    supported = hasattr(signal, "SIGUSR1") and hasattr(signal, "SIGUSR2")
    if not supported:
        # nothing to register on this platform, just run the wrapped block
        logger.debug(
            "System '{}' does not expose SIGUSR signals".format(
                platform.platform()))
        yield
        return

    # signal.signal() returns the handler it replaces: remember both so
    # they can be restored once the wrapped block is done
    previous_usr1 = signal.signal(signal.SIGUSR1, _leave_now)
    previous_usr2 = signal.signal(signal.SIGUSR2, _leave_now)
    try:
        yield
    finally:
        signal.signal(signal.SIGUSR1, previous_usr1)
        signal.signal(signal.SIGUSR2, previous_usr2)
87+
88+
89+
def exit_gracefully():
    """
    Send SIGUSR1 to the current process to request an immediate, but
    graceful, termination of the execution.

    When the `signal` module does not expose SIGUSR1, no signal is sent:
    an error naming the caller's file and line is logged and the function
    returns `None`.

    WARNING: Only available on Unix/Linux systems.
    """
    if not hasattr(signal, "SIGUSR1"):
        # index 0 is this very function, index 1 is whoever called us
        frames = inspect.getouterframes(inspect.currentframe())
        info = frames[1]
        logger.error(
            "Cannot call 'chaoslib.exit.exit_gracefully()' [{} - line {}] "
            "as it relies on the SIGUSR1 signal which is not available on "
            "your platform".format(info.filename, info.lineno))
        return

    os.kill(os.getpid(), signal.SIGUSR1)
106+
107+
108+
def exit_ungracefully():
    """
    Send SIGUSR2 to the current process to request an immediate, and not
    graceful, termination of the execution.

    When the `signal` module does not expose SIGUSR2, no signal is sent:
    an error naming the caller's file and line is logged instead.

    WARNING: Only available on Unix/Linux systems.
    """
    if hasattr(signal, "SIGUSR2"):
        os.kill(os.getpid(), signal.SIGUSR2)
        return

    # index 0 is this very function, index 1 is whoever called us
    caller = inspect.getouterframes(inspect.currentframe())[1]
    message = (
        "Cannot call 'chaoslib.exit.exit_ungracefully() [{} - line {}] "
        "as it relies on the SIGUSR2 signal which is not available on "
        "your platform")
    logger.error(message.format(caller.filename, caller.lineno))
128+
129+
130+
###############################################################################
131+
# Internals
132+
###############################################################################
133+
def _leave_now(signum: int, frame: FrameType = None) -> None:
    """
    Signal handler translating SIGUSR1/SIGUSR2 into a `SystemExit`.

    The signal number is always logged as a warning. SIGUSR1 raises
    `SystemExit(20)` and SIGUSR2 raises `SystemExit(30)`; any other signal
    number is logged and otherwise ignored. `frame` is unused.
    """
    logger.warning("Caught signal num: '{}'".format(signum))

    exit_code = None
    if signum == signal.SIGUSR1:
        exit_code = 20
    elif signum == signal.SIGUSR2:
        exit_code = 30

    if exit_code is not None:
        raise SystemExit(exit_code)

chaoslib/run.py

Lines changed: 82 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# -*- coding: utf-8 -*-
2-
from concurrent.futures import Future, ThreadPoolExecutor
2+
from concurrent.futures import Future, ThreadPoolExecutor, TimeoutError
33
from datetime import datetime
44
import platform
55
import threading
@@ -13,6 +13,7 @@
1313
from chaoslib.control import initialize_controls, controls, cleanup_controls, \
1414
Control, initialize_global_controls, cleanup_global_controls
1515
from chaoslib.exceptions import ChaosException, InterruptExecution
16+
from chaoslib.exit import exit_signals
1617
from chaoslib.configuration import load_configuration
1718
from chaoslib.hypothesis import run_steady_state_hypothesis
1819
from chaoslib.rollback import run_rollbacks
@@ -296,10 +297,11 @@ def run(self, experiment: Experiment,
296297
journal: Journal = None) -> Journal:
297298

298299
self.configure(experiment, settings)
299-
journal = self._run(
300-
self.strategy, self.schedule, experiment, journal,
301-
self.config, self.secrets, self.settings, experiment_vars,
302-
self.event_registry)
300+
with exit_signals():
301+
journal = self._run(
302+
self.strategy, self.schedule, experiment, journal,
303+
self.config, self.secrets, self.settings, experiment_vars,
304+
self.event_registry)
303305
return journal
304306

305307
def _run(self, strategy: Strategy, schedule: Schedule, # noqa: C901
@@ -334,6 +336,7 @@ def _run(self, strategy: Strategy, schedule: Schedule, # noqa: C901
334336
"rollbacks", {}).get("strategy", "default")
335337
logger.info("Rollbacks strategy: {}".format(rollback_strategy))
336338

339+
exit_gracefully_with_rollbacks = True
337340
with_ssh = has_steady_state_hypothesis_with_probes(experiment)
338341
if not with_ssh:
339342
logger.info(
@@ -374,17 +377,27 @@ def _run(self, strategy: Strategy, schedule: Schedule, # noqa: C901
374377
journal["status"] = "interrupted"
375378
logger.fatal(str(i))
376379
event_registry.interrupted(experiment, journal)
377-
except (KeyboardInterrupt, SystemExit):
380+
except KeyboardInterrupt:
378381
journal["status"] = "interrupted"
379-
logger.warning("Received an exit signal, "
380-
"leaving without applying rollbacks.")
382+
logger.warning("Received a termination signal (Ctrl-C)...")
383+
event_registry.signal_exit()
384+
except SystemExit as x:
385+
journal["status"] = "interrupted"
386+
logger.warning("Received the exit signal: {}".format(x.code))
387+
388+
exit_gracefully_with_rollbacks = x.code != 30
389+
if not exit_gracefully_with_rollbacks:
390+
logger.warning("Ignoring rollbacks as per signal")
381391
event_registry.signal_exit()
382392
finally:
383393
hypo_pool.shutdown(wait=True)
384394

385-
run_rollback(
386-
rollback_strategy, rollback_pool, experiment, journal,
387-
configuration, secrets, event_registry, dry)
395+
# just in case a signal overrode everything else to tell us not to
396+
# play them anyway (see the exit.py module)
397+
if exit_gracefully_with_rollbacks:
398+
run_rollback(
399+
rollback_strategy, rollback_pool, experiment, journal,
400+
configuration, secrets, event_registry, dry)
388401

389402
journal["end"] = datetime.utcnow().isoformat()
390403
journal["duration"] = time.time() - started_at
@@ -533,9 +546,8 @@ def run_method(strategy: Strategy, activity_pool: ThreadPoolExecutor,
533546
try:
534547
state = apply_activities(
535548
experiment, configuration, secrets, activity_pool, journal, dry)
536-
journal["run"] = state
537549
event_registry.method_completed(experiment, state)
538-
return journal["run"]
550+
return state
539551
except InterruptExecution:
540552
event_registry.method_completed(experiment)
541553
raise
@@ -717,27 +729,66 @@ def apply_activities(experiment: Experiment, configuration: Configuration,
717729
journal: Journal, dry: bool = False) -> List[Run]:
718730
with controls(level="method", experiment=experiment, context=experiment,
719731
configuration=configuration, secrets=secrets) as control:
732+
result = []
720733
runs = []
721-
for run in run_activities(
722-
experiment, configuration, secrets, pool, dry):
723-
runs.append(run)
724-
if journal["status"] in ["aborted", "failed", "interrupted"]:
725-
break
734+
method = experiment.get("method", [])
735+
wait_for_background_activities = True
726736

727-
if pool:
728-
logger.debug("Waiting for background activities to complete...")
729-
pool.shutdown(wait=True)
730-
731-
result = []
732-
for run in runs:
733-
if not run:
734-
continue
735-
if isinstance(run, dict):
736-
result.append(run)
737-
else:
738-
result.append(run.result())
737+
try:
738+
for run in run_activities(
739+
experiment, configuration, secrets, pool, dry):
740+
runs.append(run)
741+
if journal["status"] in ["aborted", "failed", "interrupted"]:
742+
break
743+
except SystemExit as x:
744+
# when we got a signal for an ungraceful exit, we can decide
745+
# not to wait for background activities. Their statuses will
746+
# remain failed.
747+
wait_for_background_activities = x.code != 30 # see exit.py
748+
raise
749+
finally:
750+
background_activity_timeout = None
739751

740-
control.with_state(result)
752+
if wait_for_background_activities and pool:
753+
logger.debug("Waiting for background activities to complete")
754+
pool.shutdown(wait=True)
755+
elif pool:
756+
logger.debug(
757+
"Do not wait for the background activities to finish "
758+
"as per signal")
759+
background_activity_timeout = 0.1
760+
pool.shutdown(wait=False)
761+
762+
for index, run in enumerate(runs):
763+
if not run:
764+
continue
765+
766+
if isinstance(run, dict):
767+
result.append(run)
768+
else:
769+
try:
770+
# background activities
771+
result.append(
772+
run.result(timeout=background_activity_timeout))
773+
except TimeoutError:
774+
# we want an entry for the background activity in our
775+
# results anyway, we won't have anything meaningful
776+
# to say about it
777+
result.append({
778+
"activity": method[index],
779+
"status": "failed",
780+
"output": None,
781+
"duration": None,
782+
"start": None,
783+
"end": None,
784+
"exception": None
785+
})
786+
787+
# now let's ensure the journal has all activities in their correct
788+
# order (background ones included)
789+
journal["run"] = result
790+
791+
control.with_state(result)
741792

742793
return result
743794

0 commit comments

Comments
 (0)