Skip to content

Commit a24ef90

Browse files
authored
Merge pull request #125 from man-group/delete-gridfs-too
Ensure that the delete function actually deletes the gridfs blobs (#122)
2 parents bedfe06 + 6a97130 commit a24ef90

File tree

3 files changed

+150
-47
lines changed

3 files changed

+150
-47
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
------------------
33

44
* Bugfix: Small bugfix for synchronous report execution
5+
* Improvement: Delete functionality in mongo now also deletes files from GridFS
56

67

78
0.4.5 (2022-09-29)

notebooker/serialization/mongo.py

Lines changed: 80 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,66 @@ def _add_deleted_status_to_filter(base_filter):
2727
return base_filter
2828

2929

30+
def ignore_missing_files(f):
31+
def _ignore_missing_files(path, *args, **kwargs):
32+
try:
33+
return f(path, *args, **kwargs)
34+
except NoFile:
35+
logger.error("Could not find file %s", path)
36+
return ""
37+
38+
return _ignore_missing_files
39+
40+
41+
@ignore_missing_files
42+
def read_file(result_data_store, path, is_json=False):
43+
r = result_data_store.get_last_version(path).read()
44+
try:
45+
return "" if not r else json.loads(r) if is_json else r.decode("utf8")
46+
except UnicodeDecodeError:
47+
return r
48+
49+
50+
@ignore_missing_files
51+
def read_bytes_file(result_data_store, path):
52+
return result_data_store.get_last_version(path).read()
53+
54+
55+
def load_files_from_gridfs(result_data_store: gridfs.GridFS, result: Dict, do_read=True) -> List[str]:
56+
57+
gridfs_filenames = []
58+
all_html_output_paths = result.get("raw_html_resources", {}).get("outputs", [])
59+
gridfs_filenames.extend(all_html_output_paths)
60+
if do_read:
61+
outputs = {path: read_file(result_data_store, path) for path in all_html_output_paths}
62+
result["raw_html_resources"]["outputs"] = outputs
63+
if result.get("generate_pdf_output"):
64+
pdf_filename = _pdf_filename(result["job_id"])
65+
if do_read:
66+
result["pdf"] = read_bytes_file(result_data_store, pdf_filename)
67+
gridfs_filenames.append(pdf_filename)
68+
if not result.get("raw_ipynb_json"):
69+
json_filename = _raw_json_filename(result["job_id"])
70+
if do_read:
71+
result["raw_ipynb_json"] = read_file(result_data_store, json_filename, is_json=True)
72+
gridfs_filenames.append(json_filename)
73+
if not result.get("raw_html"):
74+
html_filename = _raw_html_filename(result["job_id"])
75+
if do_read:
76+
result["raw_html"] = read_file(result_data_store, html_filename)
77+
gridfs_filenames.append(html_filename)
78+
if not result.get("email_html"):
79+
email_filename = _raw_email_html_filename(result["job_id"])
80+
result["email_html"] = read_file(result_data_store, email_filename)
81+
gridfs_filenames.append(email_filename)
82+
if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"):
83+
css_inlining_filename = _css_inlining_filename(result["job_id"])
84+
if do_read:
85+
result["raw_html_resources"]["inlining"] = read_file(result_data_store, css_inlining_filename, is_json=True)
86+
gridfs_filenames.append(css_inlining_filename)
87+
return gridfs_filenames
88+
89+
3090
class MongoResultSerializer(ABC):
3191
# This class is the interface between Mongo and the rest of the application
3292

@@ -212,47 +272,12 @@ def _convert_result(
212272
if cls is None:
213273
return None
214274

215-
def ignore_missing_files(f):
216-
def _ignore_missing_files(path, *args, **kwargs):
217-
try:
218-
return f(path, *args, **kwargs)
219-
except NoFile:
220-
logger.error("Could not find file %s in %s", path, self.result_data_store)
221-
return ""
222-
return _ignore_missing_files
223-
224-
@ignore_missing_files
225-
def read_file(path, is_json=False):
226-
r = self.result_data_store.get_last_version(path).read()
227-
try:
228-
return "" if not r else json.loads(r) if is_json else r.decode("utf8")
229-
except UnicodeDecodeError:
230-
return r
231-
232-
@ignore_missing_files
233-
def read_bytes_file(path):
234-
return self.result_data_store.get_last_version(path).read()
235-
236275
if not load_payload:
237276
result.pop("stdout", None)
238277

239278
if cls == NotebookResultComplete:
240279
if load_payload:
241-
outputs = {path: read_file(path) for path in result.get("raw_html_resources", {}).get("outputs", [])}
242-
result["raw_html_resources"]["outputs"] = outputs
243-
if result.get("generate_pdf_output"):
244-
pdf_filename = _pdf_filename(result["job_id"])
245-
result["pdf"] = read_bytes_file(pdf_filename)
246-
if not result.get("raw_ipynb_json"):
247-
result["raw_ipynb_json"] = read_file(_raw_json_filename(result["job_id"]), is_json=True)
248-
if not result.get("raw_html"):
249-
result["raw_html"] = read_file(_raw_html_filename(result["job_id"]))
250-
if not result.get("email_html"):
251-
result["email_html"] = read_file(_raw_email_html_filename(result["job_id"]))
252-
if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"):
253-
result["raw_html_resources"]["inlining"] = read_file(
254-
_css_inlining_filename(result["job_id"]), is_json=True
255-
)
280+
load_files_from_gridfs(self.result_data_store, result, do_read=True)
256281
else:
257282
result.pop("raw_html", None)
258283
result.pop("raw_ipynb_json", None)
@@ -298,7 +323,7 @@ def read_bytes_file(path):
298323
elif cls == NotebookResultError:
299324
if load_payload:
300325
if not result.get("error_info"):
301-
result["error_info"] = read_file(_error_info_filename(result["job_id"]))
326+
result["error_info"] = read_file(self.result_data_store, _error_info_filename(result["job_id"]))
302327
else:
303328
result.pop("error_info", None)
304329
return NotebookResultError(
@@ -319,11 +344,14 @@ def read_bytes_file(path):
319344
else:
320345
raise ValueError("Could not deserialise {} into result object.".format(result))
321346

347+
def _get_raw_check_result(self, job_id: str):
348+
return self.library.find_one({"job_id": job_id}, {"_id": 0})
349+
322350
def get_check_result(
323-
self, job_id: AnyStr
351+
self, job_id: AnyStr, load_payload: bool = True
324352
) -> Optional[Union[NotebookResultError, NotebookResultComplete, NotebookResultPending]]:
325-
result = self.library.find_one({"job_id": job_id}, {"_id": 0})
326-
return self._convert_result(result)
353+
result = self._get_raw_check_result(job_id)
354+
return self._convert_result(result, load_payload=load_payload)
327355

328356
def _get_raw_results(self, base_filter, projection, limit):
329357
base_filter = _add_deleted_status_to_filter(base_filter)
@@ -462,8 +490,19 @@ def get_latest_successful_job_ids_for_name_all_params(self, report_name: str) ->
462490
def n_all_results_for_report_name(self, report_name: str) -> int:
463491
return self._get_result_count({"report_name": report_name})
464492

465-
def delete_result(self, job_id: AnyStr) -> None:
493+
def delete_result(self, job_id: AnyStr) -> Dict[str, Any]:
494+
result = self._get_raw_check_result(job_id)
495+
status = JobStatus.from_string(result["status"])
496+
gridfs_filenames = []
497+
if status == JobStatus.DONE:
498+
gridfs_filenames = load_files_from_gridfs(self.result_data_store, result, do_read=False)
499+
elif status in (JobStatus.ERROR, JobStatus.TIMEOUT, JobStatus.CANCELLED):
500+
gridfs_filenames = [_error_info_filename(job_id)]
466501
self.update_check_status(job_id, JobStatus.DELETED)
502+
for filename in gridfs_filenames:
503+
logger.info(f"Deleting {filename}")
504+
self.result_data_store.delete(filename)
505+
return {"deleted_result_document": result, "gridfs_filenames": gridfs_filenames}
467506

468507

469508
def _pdf_filename(job_id: str) -> str:

tests/integration/test_mongo.py

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
import uuid
33

44
import pytest
5+
from gridfs.errors import NoFile
56

6-
from notebooker.constants import JobStatus, NotebookResultComplete
7+
from notebooker.constants import JobStatus, NotebookResultComplete, NotebookResultError
78
from notebooker.serialization.serialization import initialize_serializer_from_config
89
from notebooker.utils.filesystem import initialise_base_dirs
910

@@ -23,11 +24,11 @@ def test_mongo_saving_ipynb_json_to_gridfs(bson_library, webapp_config):
2324
update_time=datetime.datetime(2018, 1, 12, 2, 32),
2425
job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
2526
job_finish_time=datetime.datetime(2018, 1, 12, 2, 58),
26-
raw_ipynb_json="x" * 32 * (2 ** 20), # 16MB document max
27-
raw_html="x" * 32 * (2 ** 20), # 16MB document max
28-
email_html="x" * 32 * (2 ** 20), # 16MB document max
29-
pdf=b"x" * 32 * (2 ** 20), # 16MB document max
30-
raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2 ** 20)}},
27+
raw_ipynb_json="x" * 32 * (2**20), # 16MB document max
28+
raw_html="x" * 32 * (2**20), # 16MB document max
29+
email_html="x" * 32 * (2**20), # 16MB document max
30+
pdf=b"x" * 32 * (2**20), # 16MB document max
31+
raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}},
3132
)
3233
)
3334
result = serializer.get_check_result(job_id)
@@ -52,3 +53,65 @@ def test_cant_serialise_done_job_via_update(bson_library, webapp_config):
5253
raw_html="",
5354
email_html="",
5455
)
56+
57+
58+
def test_delete(bson_library, webapp_config):
59+
initialise_base_dirs(webapp_config=webapp_config)
60+
serializer = initialize_serializer_from_config(webapp_config)
61+
62+
job_id = str(uuid.uuid4())
63+
report_name = str(uuid.uuid4())
64+
raw_html = "x" * 32 * (2**20)
65+
serializer.save_check_result(
66+
NotebookResultComplete(
67+
job_id=job_id,
68+
report_name=report_name,
69+
report_title=report_name,
70+
status=JobStatus.DONE,
71+
update_time=datetime.datetime(2018, 1, 12, 2, 32),
72+
job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
73+
job_finish_time=datetime.datetime(2018, 1, 12, 2, 58),
74+
raw_ipynb_json="x" * 32 * (2**20), # 16MB document max
75+
raw_html=raw_html, # 16MB document max
76+
email_html="x" * 32 * (2**20), # 16MB document max
77+
pdf=b"x" * 32 * (2**20), # 16MB document max
78+
raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}, "other_stuff": "Yep"},
79+
)
80+
)
81+
assert bson_library.find_one({"job_id": job_id}) is not None
82+
result = serializer.get_check_result(job_id)
83+
assert result is not None
84+
assert result.raw_html == raw_html
85+
deleted_stuff = serializer.delete_result(job_id)
86+
for filename in deleted_stuff["gridfs_filenames"]:
87+
with pytest.raises(NoFile):
88+
serializer.result_data_store.get(filename)
89+
assert serializer.get_check_result(job_id) is None
90+
91+
92+
@pytest.mark.parametrize("job_status", [JobStatus.CANCELLED, JobStatus.TIMEOUT, JobStatus.ERROR])
93+
def test_delete_error(bson_library, webapp_config, job_status):
94+
initialise_base_dirs(webapp_config=webapp_config)
95+
serializer = initialize_serializer_from_config(webapp_config)
96+
97+
job_id = str(uuid.uuid4())
98+
report_name = str(uuid.uuid4())
99+
serializer.save_check_result(
100+
NotebookResultError(
101+
job_id=job_id,
102+
report_name=report_name,
103+
report_title=report_name,
104+
status=job_status,
105+
update_time=datetime.datetime(2018, 1, 12, 2, 32),
106+
job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
107+
error_info="Oh no!",
108+
)
109+
)
110+
assert bson_library.find_one({"job_id": job_id}) is not None
111+
result = serializer.get_check_result(job_id)
112+
assert result is not None
113+
deleted_stuff = serializer.delete_result(job_id)
114+
for filename in deleted_stuff["gridfs_filenames"]:
115+
with pytest.raises(NoFile):
116+
serializer.result_data_store.get(filename)
117+
assert serializer.get_check_result(job_id) is None

0 commit comments

Comments
 (0)