Merge pull request #125 from man-group/delete-gridfs-too

jonbannister · web-flow · commit a24ef90f4fdd · 2022-10-28T15:07:27.000+01:00
Ensure that the delete function actually deletes the gridfs blobs (#122)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 ------------------
 
 * Bugfix: Small bugfix for synchronous report execution
+* Improvement: Delete functionality in mongo now also deletes files from GridFS
 
 
 0.4.5 (2022-09-29)
diff --git a/notebooker/serialization/mongo.py b/notebooker/serialization/mongo.py
@@ -27,6 +27,66 @@ def _add_deleted_status_to_filter(base_filter):
     return base_filter
 
 
+def ignore_missing_files(f):
+    def _ignore_missing_files(path, *args, **kwargs):
+        try:
+            return f(path, *args, **kwargs)
+        except NoFile:
+            logger.error("Could not find file %s", path)
+            return ""
+
+    return _ignore_missing_files
+
+
+@ignore_missing_files
+def read_file(result_data_store, path, is_json=False):
+    r = result_data_store.get_last_version(path).read()
+    try:
+        return "" if not r else json.loads(r) if is_json else r.decode("utf8")
+    except UnicodeDecodeError:
+        return r
+
+
+@ignore_missing_files
+def read_bytes_file(result_data_store, path):
+    return result_data_store.get_last_version(path).read()
+
+
+def load_files_from_gridfs(result_data_store: gridfs.GridFS, result: Dict, do_read=True) -> List[str]:
+
+    gridfs_filenames = []
+    all_html_output_paths = result.get("raw_html_resources", {}).get("outputs", [])
+    gridfs_filenames.extend(all_html_output_paths)
+    if do_read:
+        outputs = {path: read_file(result_data_store, path) for path in all_html_output_paths}
+        result["raw_html_resources"]["outputs"] = outputs
+    if result.get("generate_pdf_output"):
+        pdf_filename = _pdf_filename(result["job_id"])
+        if do_read:
+            result["pdf"] = read_bytes_file(result_data_store, pdf_filename)
+        gridfs_filenames.append(pdf_filename)
+    if not result.get("raw_ipynb_json"):
+        json_filename = _raw_json_filename(result["job_id"])
+        if do_read:
+            result["raw_ipynb_json"] = read_file(result_data_store, json_filename, is_json=True)
+        gridfs_filenames.append(json_filename)
+    if not result.get("raw_html"):
+        html_filename = _raw_html_filename(result["job_id"])
+        if do_read:
+            result["raw_html"] = read_file(result_data_store, html_filename)
+        gridfs_filenames.append(html_filename)
+    if not result.get("email_html"):
+        email_filename = _raw_email_html_filename(result["job_id"])
+        result["email_html"] = read_file(result_data_store, email_filename)
+        gridfs_filenames.append(email_filename)
+    if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"):
+        css_inlining_filename = _css_inlining_filename(result["job_id"])
+        if do_read:
+            result["raw_html_resources"]["inlining"] = read_file(result_data_store, css_inlining_filename, is_json=True)
+        gridfs_filenames.append(css_inlining_filename)
+    return gridfs_filenames
+
+
 class MongoResultSerializer(ABC):
     # This class is the interface between Mongo and the rest of the application
 
@@ -212,47 +272,12 @@ def _convert_result(
         if cls is None:
             return None
 
-        def ignore_missing_files(f):
-            def _ignore_missing_files(path, *args, **kwargs):
-                try:
-                    return f(path, *args, **kwargs)
-                except NoFile:
-                    logger.error("Could not find file %s in %s", path, self.result_data_store)
-                    return ""
-            return _ignore_missing_files
-
-        @ignore_missing_files
-        def read_file(path, is_json=False):
-            r = self.result_data_store.get_last_version(path).read()
-            try:
-                return "" if not r else json.loads(r) if is_json else r.decode("utf8")
-            except UnicodeDecodeError:
-                return r
-
-        @ignore_missing_files
-        def read_bytes_file(path):
-            return self.result_data_store.get_last_version(path).read()
-
         if not load_payload:
             result.pop("stdout", None)
 
         if cls == NotebookResultComplete:
             if load_payload:
-                outputs = {path: read_file(path) for path in result.get("raw_html_resources", {}).get("outputs", [])}
-                result["raw_html_resources"]["outputs"] = outputs
-                if result.get("generate_pdf_output"):
-                    pdf_filename = _pdf_filename(result["job_id"])
-                    result["pdf"] = read_bytes_file(pdf_filename)
-                if not result.get("raw_ipynb_json"):
-                    result["raw_ipynb_json"] = read_file(_raw_json_filename(result["job_id"]), is_json=True)
-                if not result.get("raw_html"):
-                    result["raw_html"] = read_file(_raw_html_filename(result["job_id"]))
-                if not result.get("email_html"):
-                    result["email_html"] = read_file(_raw_email_html_filename(result["job_id"]))
-                if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"):
-                    result["raw_html_resources"]["inlining"] = read_file(
-                        _css_inlining_filename(result["job_id"]), is_json=True
-                    )
+                load_files_from_gridfs(self.result_data_store, result, do_read=True)
             else:
                 result.pop("raw_html", None)
                 result.pop("raw_ipynb_json", None)
@@ -298,7 +323,7 @@ def read_bytes_file(path):
         elif cls == NotebookResultError:
             if load_payload:
                 if not result.get("error_info"):
-                    result["error_info"] = read_file(_error_info_filename(result["job_id"]))
+                    result["error_info"] = read_file(self.result_data_store, _error_info_filename(result["job_id"]))
             else:
                 result.pop("error_info", None)
             return NotebookResultError(
@@ -319,11 +344,14 @@ def read_bytes_file(path):
         else:
             raise ValueError("Could not deserialise {} into result object.".format(result))
 
+    def _get_raw_check_result(self, job_id: str):
+        return self.library.find_one({"job_id": job_id}, {"_id": 0})
+
     def get_check_result(
-        self, job_id: AnyStr
+        self, job_id: AnyStr, load_payload: bool = True
     ) -> Optional[Union[NotebookResultError, NotebookResultComplete, NotebookResultPending]]:
-        result = self.library.find_one({"job_id": job_id}, {"_id": 0})
-        return self._convert_result(result)
+        result = self._get_raw_check_result(job_id)
+        return self._convert_result(result, load_payload=load_payload)
 
     def _get_raw_results(self, base_filter, projection, limit):
         base_filter = _add_deleted_status_to_filter(base_filter)
@@ -462,8 +490,19 @@ def get_latest_successful_job_ids_for_name_all_params(self, report_name: str) ->
     def n_all_results_for_report_name(self, report_name: str) -> int:
         return self._get_result_count({"report_name": report_name})
 
-    def delete_result(self, job_id: AnyStr) -> None:
+    def delete_result(self, job_id: AnyStr) -> Dict[str, Any]:
+        result = self._get_raw_check_result(job_id)
+        status = JobStatus.from_string(result["status"])
+        gridfs_filenames = []
+        if status == JobStatus.DONE:
+            gridfs_filenames = load_files_from_gridfs(self.result_data_store, result, do_read=False)
+        elif status in (JobStatus.ERROR, JobStatus.TIMEOUT, JobStatus.CANCELLED):
+            gridfs_filenames = [_error_info_filename(job_id)]
         self.update_check_status(job_id, JobStatus.DELETED)
+        for filename in gridfs_filenames:
+            logger.info(f"Deleting {filename}")
+            self.result_data_store.delete(filename)
+        return {"deleted_result_document": result, "gridfs_filenames": gridfs_filenames}
 
 
 def _pdf_filename(job_id: str) -> str:
diff --git a/tests/integration/test_mongo.py b/tests/integration/test_mongo.py
@@ -2,8 +2,9 @@
 import uuid
 
 import pytest
+from gridfs.errors import NoFile
 
-from notebooker.constants import JobStatus, NotebookResultComplete
+from notebooker.constants import JobStatus, NotebookResultComplete, NotebookResultError
 from notebooker.serialization.serialization import initialize_serializer_from_config
 from notebooker.utils.filesystem import initialise_base_dirs
 
@@ -23,11 +24,11 @@ def test_mongo_saving_ipynb_json_to_gridfs(bson_library, webapp_config):
             update_time=datetime.datetime(2018, 1, 12, 2, 32),
             job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
             job_finish_time=datetime.datetime(2018, 1, 12, 2, 58),
-            raw_ipynb_json="x" * 32 * (2 ** 20),  # 16MB document max
-            raw_html="x" * 32 * (2 ** 20),  # 16MB document max
-            email_html="x" * 32 * (2 ** 20),  # 16MB document max
-            pdf=b"x" * 32 * (2 ** 20),  # 16MB document max
-            raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2 ** 20)}},
+            raw_ipynb_json="x" * 32 * (2**20),  # 16MB document max
+            raw_html="x" * 32 * (2**20),  # 16MB document max
+            email_html="x" * 32 * (2**20),  # 16MB document max
+            pdf=b"x" * 32 * (2**20),  # 16MB document max
+            raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}},
         )
     )
     result = serializer.get_check_result(job_id)
@@ -52,3 +53,65 @@ def test_cant_serialise_done_job_via_update(bson_library, webapp_config):
             raw_html="",
             email_html="",
         )
+
+
+def test_delete(bson_library, webapp_config):
+    initialise_base_dirs(webapp_config=webapp_config)
+    serializer = initialize_serializer_from_config(webapp_config)
+
+    job_id = str(uuid.uuid4())
+    report_name = str(uuid.uuid4())
+    raw_html = "x" * 32 * (2**20)
+    serializer.save_check_result(
+        NotebookResultComplete(
+            job_id=job_id,
+            report_name=report_name,
+            report_title=report_name,
+            status=JobStatus.DONE,
+            update_time=datetime.datetime(2018, 1, 12, 2, 32),
+            job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
+            job_finish_time=datetime.datetime(2018, 1, 12, 2, 58),
+            raw_ipynb_json="x" * 32 * (2**20),  # 16MB document max
+            raw_html=raw_html,  # 16MB document max
+            email_html="x" * 32 * (2**20),  # 16MB document max
+            pdf=b"x" * 32 * (2**20),  # 16MB document max
+            raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}, "other_stuff": "Yep"},
+        )
+    )
+    assert bson_library.find_one({"job_id": job_id}) is not None
+    result = serializer.get_check_result(job_id)
+    assert result is not None
+    assert result.raw_html == raw_html
+    deleted_stuff = serializer.delete_result(job_id)
+    for filename in deleted_stuff["gridfs_filenames"]:
+        with pytest.raises(NoFile):
+            serializer.result_data_store.get(filename)
+    assert serializer.get_check_result(job_id) is None
+
+
+@pytest.mark.parametrize("job_status", [JobStatus.CANCELLED, JobStatus.TIMEOUT, JobStatus.ERROR])
+def test_delete_error(bson_library, webapp_config, job_status):
+    initialise_base_dirs(webapp_config=webapp_config)
+    serializer = initialize_serializer_from_config(webapp_config)
+
+    job_id = str(uuid.uuid4())
+    report_name = str(uuid.uuid4())
+    serializer.save_check_result(
+        NotebookResultError(
+            job_id=job_id,
+            report_name=report_name,
+            report_title=report_name,
+            status=job_status,
+            update_time=datetime.datetime(2018, 1, 12, 2, 32),
+            job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
+            error_info="Oh no!",
+        )
+    )
+    assert bson_library.find_one({"job_id": job_id}) is not None
+    result = serializer.get_check_result(job_id)
+    assert result is not None
+    deleted_stuff = serializer.delete_result(job_id)
+    for filename in deleted_stuff["gridfs_filenames"]:
+        with pytest.raises(NoFile):
+            serializer.result_data_store.get(filename)
+    assert serializer.get_check_result(job_id) is None