Commit fad8e5c

attempt at decoupling bg

1 parent 49e712b

File tree: 1 file changed, +151 −28 lines
flarestack/core/results.py (151 additions, 28 deletions)

@@ -37,6 +37,113 @@ class OverfluctuationError(Exception):
     pass


+class PickleCache:
+    def __init__(self, pickle_path: Path, background_only: bool = False):
+        self.path = pickle_path
+        # self.pickle_path.mkdir(parents=True, exist_ok=True)
+        self.merged_path = self.path / "merged"
+        self.merged_path.mkdir(parents=True, exist_ok=True)
+        self.background_only = background_only
+
+    def clean_merged_data(self):
+        """Function to clear cache of all data"""
+        # remove all files inside merged_path using pathlib
+        if self.merged_path.exists():
+            for file in self.merged_path.iterdir():
+                if file.is_file():
+                    file.unlink()
+            logger.debug(f"Removed all files from {self.merged_path}")
+
+    def get_subdirs(self):
+        return [
+            x
+            for x in os.listdir(self.path)
+            if x[0] != "." and x != "merged"
+        ]
+
+    def merge_datadict(self, merged: dict[str, list | dict], pending_data: dict[str, list | dict]):
+        """Merge the content of pending_data into merged."""
+        for key, info in pending_data.items():
+            if isinstance(info, list):
+                # Append the list to the existing one.
+                merged[key] += info
+            elif isinstance(info, dict):
+                for param_name, params in info.items():
+                    try:
+                        merged[key][param_name] += params
+                    except KeyError as m:
+                        logger.warning(
+                            f"Keys [{key}][{param_name}] not found in \n {merged}"
+                        )
+                        raise KeyError(m)
+            else:
+                raise TypeError(
+                    f"Unexpected type for key {key}: {type(info)}. Expected list or dict."
+                )
+
+    def merge_and_load_subdir(self, subdir_name):
+        """Merge and load data from a single subdirectory."""
+        subdir = os.path.join(self.path, subdir_name)
+
+        files = os.listdir(subdir)
+
+        # Map one dir to one pickle.
+        merged_file_path = os.path.join(self.merged_path, subdir_name + ".pkl")
+        # Load previously merged data, if it exists.
+        if os.path.isfile(merged_file_path):
+            logger.debug(f"loading merged data from {merged_file_path}")
+            with open(merged_file_path, "rb") as mp:
+                merged_data = Pickle.load(mp)
+        else:
+            merged_data = {}
+
+        for filename in files:
+            pending_file = os.path.join(subdir, filename)
+
+            try:
+                with open(pending_file, "rb") as f:
+                    data = Pickle.load(f)
+            except (EOFError, IsADirectoryError):
+                logger.warning("Failed loading: {0}".format(pending_file))
+                continue
+            # Remove the file immediately. This can lead to undesired results: if the program crashes or is terminated here, files are removed before the merged data is written. However, delaying the removal would create the opposite problem: the file is merged but not removed, and potentially merged a second time on the next run.
+            os.remove(pending_file)
+
+            if merged_data == {}:
+                merged_data = data
+            else:
+                self.merge_datadict(merged_data, data)
+
+        # Save merged data.
+        with open(merged_file_path, "wb") as mp:
+            Pickle.dump(merged_data, mp)
+
+        return merged_data
+
+    def merge_and_load(self, output_dict: dict):
+        # Loop over all injection scale subdirectories.
+        scales_subdirs = self.get_subdirs()
+
+        background_label = scale_shortener(0.0)
+
+        for subdir_name in scales_subdirs:
+            scale_label = scale_shortener(float(subdir_name))
+
+            if self.background_only and scale_label != background_label:
+                # Skip non-background trials.
+                continue
+
+            pending_data = self.merge_and_load_subdir(subdir_name)
+
+            if pending_data:
+                if scale_label == background_label and background_label in output_dict:
+                    self.merge_datadict(output_dict[background_label], pending_data)
+                else:
+                    output_dict[scale_label] = pending_data
+
+
 class ResultsHandler(object):
     def __init__(
         self,
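For orientation, a minimal usage sketch of the new class, assuming the pickle layout it expects (one subdirectory per injection scale, one pickle per trial batch). The paths here are hypothetical:

from pathlib import Path

results: dict = {}

# Cache over this analysis' own trial pickles (hypothetical path).
cache = PickleCache(Path("/path/to/pickles/my_analysis"))

# Cache over background-only trials, possibly written by a different analysis.
bg_cache = PickleCache(Path("/path/to/pickles/shared_background"), background_only=True)

# Merge per-trial pickles into one pickle per scale, then load them.
cache.merge_and_load(output_dict=results)

# With background_only=True, only the scale-0 (background) subdirectory is processed.
bg_cache.merge_and_load(output_dict=results)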
@@ -45,10 +152,17 @@ def __init__(
         do_disc=True,
         bias_error="std",
         sigma_thresholds=[3.0, 5.0],
+        background_from=None,
     ):
         self.sources = load_catalogue(rh_dict["catalogue"])

         self.name = rh_dict["name"]
+
+        if background_from is not None:
+            self.background_from = background_from
+        else:
+            self.background_from = rh_dict["name"]
+
         self.mh_name = rh_dict["mh_name"]

         self._inj_dict = rh_dict["inj_dict"]
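A sketch of the intended call pattern for the new keyword (the configuration dicts and the "shared_bg" analysis name are hypothetical): several handlers can reuse the background trials recorded under one shared name, while omitting background_from preserves the old behaviour.

# Both handlers reuse background trials produced under the name "shared_bg".
rh_a = ResultsHandler(rh_dict_a, background_from="shared_bg")
rh_b = ResultsHandler(rh_dict_b, background_from="shared_bg")

# Default: background_from=None falls back to the analysis' own name.
rh_c = ResultsHandler(rh_dict_c)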
@@ -59,11 +173,14 @@ def __init__(
         self.maxfev = rh_dict.get("maxfev", 800)

         self.results = dict()
+
         self.pickle_output_dir = name_pickle_output_dir(self.name)
+        self.pickle_output_dir_bg = name_pickle_output_dir(self.background_from)

-        self.plot_path = Path(plot_output_dir(self.name))
+        self.pickle_cache = PickleCache(Path(self.pickle_output_dir))
+        self.pickle_cache_bg = PickleCache(Path(self.pickle_output_dir_bg))

-        self.merged_dir = os.path.join(self.pickle_output_dir, "merged")
+        self.plot_path = Path(plot_output_dir(self.name))

         self.allow_extrapolation = rh_dict.get("allow_extrapolated_sensitivity", True)

@@ -101,7 +218,8 @@ def __init__(
             "extrapolated": False,
         }

-        # Load injection ladder values
+        # Load injection ladder values.
+        # Builds a dictionary mapping each injection scale to the content of its trials.
         try:
             self.inj = self.load_injection_values()
         except FileNotFoundError as err:
@@ -112,17 +230,17 @@ def __init__(
             self.valid = False
             return

-        # Load and merge the trial results
         try:
-            self.merge_pickle_data()
+            self.pickle_cache.merge_and_load(output_dict=self.results)
+            # Load the background trials, merging them into any background entry already loaded.
+            self.pickle_cache_bg.merge_and_load(output_dict=self.results)
         except FileNotFoundError:
             logger.warning(f"No files found at {self.pickle_output_dir}")

         # auxiliary parameters
-        # self.sorted_scales = sorted(self.results.keys())
         self.scale_values = sorted(
             [float(j) for j in self.results.keys()]
-        )  # replaces self.scales_float
+        )
         self.scale_labels = [scale_shortener(i) for i in self.scale_values]

         logger.info(f"Injection scales: {self.scale_values}")
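The merge rule used here is the one implemented by PickleCache.merge_datadict: list values are concatenated, and dict values are merged key by key. A small worked example (the key names "TS", "Parameters" and "n_s" are illustrative):

from pathlib import Path

# Instantiating PickleCache creates <path>/merged as a side effect; the path is hypothetical.
cache = PickleCache(Path("/tmp/flarestack_example"))

merged = {"TS": [1.2, 3.4], "Parameters": {"n_s": [5.0]}}
pending = {"TS": [0.7], "Parameters": {"n_s": [4.1]}}

cache.merge_datadict(merged, pending)

# Top-level lists are extended in place, nested parameter lists per key.
assert merged == {"TS": [1.2, 3.4, 0.7], "Parameters": {"n_s": [5.0, 4.1]}}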
@@ -131,17 +249,14 @@ def __init__(
         # Determine the injection scales
         try:
             self.find_ns_scale()
+            self.plot_bias()
         except ValueError as e:
             logger.warning(f"RuntimeError for ns scale factor: \n {e}")
         except IndexError as e:
             logger.warning(
                 f"IndexError for ns scale factor. Only background trials? \n {e}"
             )

-        # Create fit bias plots
-        # this expects flux_to_ns to be set
-        self.plot_bias()
-
         if do_sens:
             try:
                 self.find_sensitivity()
@@ -278,12 +393,9 @@ def nu_astronomy(self, flux, e_pdf_dict):
         return calculate_astronomy(flux, e_pdf_dict)

     def clean_merged_data(self):
-        """Function to clear cache of all data"""
-        try:
-            for f in os.listdir(self.merged_dir):
-                os.remove(self.merged_dir + f)
-        except OSError:
-            pass
+        """Clean the merged data of the main analysis' pickle cache. Do not touch the background cache."""
+        self.pickle_cache.clean_merged_data()
+

     def load_injection_values(self):
@@ -311,25 +423,33 @@ def load_injection_values(self):

         return inj_values

-    def merge_pickle_data(self):
+
+    def merge_and_load_pickle_data(self):
+        # NOTE:
+        # self.pickle_output_path
+        # self.merged_dir = self.pickle_output_path / "merged"
+
+
+        # Loop over all subdirectories, one per injection scale, each containing one pickle per trial.
         all_sub_dirs = [
             x
-            for x in os.listdir(self.pickle_output_dir)
+            for x in os.listdir(self.path)
             if x[0] != "." and x != "merged"
         ]
-
+        # Create a "merged" directory, which will contain a single pickle (holding many trials) per injection scale.
         try:
             os.makedirs(self.merged_dir)
         except OSError:
             pass

         for sub_dir_name in all_sub_dirs:
-            sub_dir = os.path.join(self.pickle_output_dir, sub_dir_name)
+            sub_dir = os.path.join(self.path, sub_dir_name)

             files = os.listdir(sub_dir)

+            # Map one dir to one pickle.
             merged_path = os.path.join(self.merged_dir, sub_dir_name + ".pkl")
-
+            # Load previously merged data, if it exists.
             if os.path.isfile(merged_path):
                 logger.debug(f"loading merged data from {merged_path}")
                 with open(merged_path, "rb") as mp:
@@ -338,15 +458,16 @@ def merge_pickle_data(self):
                 merged_data = {}

             for filename in files:
-                path = os.path.join(sub_dir, filename)
+                pending_file = os.path.join(sub_dir, filename)

                 try:
-                    with open(path, "rb") as f:
+                    with open(pending_file, "rb") as f:
                         data = Pickle.load(f)
                 except (EOFError, IsADirectoryError):
-                    logger.warning("Failed loading: {0}".format(path))
+                    logger.warning("Failed loading: {0}".format(pending_file))
                     continue
-                os.remove(path)
+                # This can be "dangerous" because if the program crashes or gets terminated, we will have removed files before writing the merged data.
+                os.remove(pending_file)

                 if merged_data == {}:
                     merged_data = data
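The "dangerous" window flagged in the comment above sits between os.remove and the later Pickle.dump of the merged data. A conventional mitigation, sketched below as an alternative and not what this commit does, is to dump to a temporary file, swap it in atomically with os.replace, and delete the inputs only afterwards; a crash then means some trials are merged twice on the next run rather than lost. The helper name and merge callback are hypothetical:

import os
import pickle
import tempfile

def merge_subdir_atomic(subdir: str, merged_file_path: str, merge) -> dict:
    """Sketch: merge every trial pickle in subdir into merged_file_path,
    removing inputs only once the merged file is safely on disk."""
    merged_data: dict = {}
    if os.path.isfile(merged_file_path):
        with open(merged_file_path, "rb") as f:
            merged_data = pickle.load(f)

    pending = [os.path.join(subdir, name) for name in os.listdir(subdir)]
    for path in pending:
        with open(path, "rb") as f:
            data = pickle.load(f)
        if not merged_data:
            merged_data = data
        else:
            merge(merged_data, data)  # e.g. PickleCache.merge_datadict

    # Dump next to the target, then rename: os.replace is atomic on POSIX,
    # so a crash leaves either the old or the new file, never a torn one.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(merged_file_path))
    with os.fdopen(fd, "wb") as f:
        pickle.dump(merged_data, f)
    os.replace(tmp_path, merged_file_path)

    # Deleting inputs last means a crash above only causes re-merging
    # (duplicate trials on the next run), never data loss.
    for path in pending:
        os.remove(path)
    return merged_data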
@@ -364,13 +485,15 @@
                         )
                         raise KeyError(m)

+            # Save merged data.
             with open(merged_path, "wb") as mp:
                 Pickle.dump(merged_data, mp)

-            if len(list(merged_data.keys())) > 0:
+            # Load merged data into results.
+            if merged_data:
                 self.results[scale_shortener(float(sub_dir_name))] = merged_data

-        if len(list(self.results.keys())) == 0:
+        if not self.results:
             logger.warning("No data was found by ResultsHandler object! \n")
             logger.warning(
                 "Tried root directory: \n {0} \n ".format(self.pickle_output_dir)