[FEATURE] Integrate prototypes into pipeline

TinasheMTapera · TinasheMTapera · commit 920f69714ac4 · 2025-04-04T20:55:36.000-04:00
Fixes #6
diff --git a/conf/aggregation/aggregation.yaml b/conf/aggregation/aggregation.yaml
@@ -0,0 +1,9 @@
+daily: 
+  function: "numpy.mean"
+  string: mean
+
+monthly: 
+  function: "numpy.mean"
+  string: mean
+
+variable: ['t2m', 'd2m']
diff --git a/conf/config.yaml b/conf/config.yaml
@@ -1,6 +1,7 @@
 defaults:
   - _self_
   - datapaths: datapaths
+  - aggregation: aggregation
 
 development_mode: false
 
@@ -20,7 +21,7 @@ query:
   # check precipitation
   # variable: ["2m_dewpoint_temperature", "2m_temperature", "skin_temperature", "total_precipitation"]
   variable: ["2m_dewpoint_temperature", "2m_temperature"]
-  year: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
+  year: [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
   month: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
   day: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
   time: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
diff --git a/notes/00_core.ipynb b/notes/00_core.ipynb
@@ -45,7 +45,8 @@
     "from pydrive2.auth import GoogleAuth\n",
     "from pydrive2.drive import GoogleDrive\n",
     "from omegaconf import DictConfig, OmegaConf\n",
-    "from pyprojroot import here"
+    "from pyprojroot import here\n",
+    "from importlib import import_module\n"
    ]
   },
   {
@@ -98,6 +99,20 @@
     "    return path"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exporti\n",
+    "def _get_callable(func_path):\n",
+    "    \"\"\"Dynamically import a callable from a string path.\"\"\"\n",
+    "    module_name, func_name = func_path.rsplit(\".\", 1)\n",
+    "    module = import_module(module_name)\n",
+    "    return getattr(module, func_name)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/notes/02_aggregate.ipynb b/notes/02_aggregate.ipynb
diff --git a/snakefile b/snakefile
@@ -14,15 +14,17 @@ with hydra.initialize(config_path="conf", version_base=None):
     cfg = hydra.compose(config_name="config", overrides=[])
     print(OmegaConf.to_yaml(cfg))
 
-# read list of years
+# read the lists of values to parallelize over
 years_cfg = OmegaConf.to_container(cfg.query.year, resolve=True)
 months_cfg = OmegaConf.to_container(cfg.query.month, resolve=True)
-#day_cfg = OmegaConf.to_container(cfg.query.day, resolve=True)
-#time_cfg = OmegaConf.to_container(cfg.query.time, resolve=True)
+variable_cfg = OmegaConf.to_container(cfg.query.variable, resolve=True)
+agg_variable_cfg = OmegaConf.to_container(cfg.aggregation.variable, resolve=True)
 
 rule all:
     input:
-        expand(data_dir / "input/{year}_{month}.nc", year=years_cfg, month=months_cfg)
+        # raw monthly input files, one per (year, month) combination
+        expand(data_dir / "input/{year}_{month}.nc", year=years_cfg, month=months_cfg),
+        # healthshed aggregates, one parquet file per (variable, year, month) combination
+        expand(data_dir / "intermediate/environmental_exposure-era5_healthshed_{variable}_{year}_{month}.parquet", 
+               variable=agg_variable_cfg, year=years_cfg, month=months_cfg)
 
 rule test_api:
     output:
@@ -37,3 +39,13 @@ rule download_raw_era5:
         """
         python src/era5_sandbox/download.py "++query.year={wildcards.year}" "++query.month={wildcards.month}"
         """
+
+# Aggregate one raw monthly input file to healthsheds for a single variable.
+# The output suffix is ".parquet": that is what `rule all` requests and what
+# src/era5_sandbox/aggregate.py writes (via `to_parquet`). Declaring ".nc" here
+# would leave the `rule all` targets without any producing rule.
+rule spatial_aggregate_raw_era5:
+    input:
+        data_dir / "input/{year}_{month}.nc"
+    output:
+        data_dir / "intermediate/environmental_exposure-era5_healthshed_{variable}_{year}_{month}.parquet"
+    params:
+        variable="{variable}"
+    script:
+        "src/era5_sandbox/aggregate.py"
diff --git a/src/era5_sandbox/__pycache__/__init__.cpython-311.pyc b/src/era5_sandbox/__pycache__/__init__.cpython-311.pyc
diff --git a/src/era5_sandbox/__pycache__/core.cpython-311.pyc b/src/era5_sandbox/__pycache__/core.cpython-311.pyc
diff --git a/src/era5_sandbox/_modidx.py b/src/era5_sandbox/_modidx.py
@@ -12,6 +12,11 @@
                                                                                     'era5_sandbox/aggregate.py'),
                                         'era5_sandbox.aggregate.RasterFile.shape': ( 'aggregate.html#rasterfile.shape',
                                                                                      'era5_sandbox/aggregate.py'),
+                                        'era5_sandbox.aggregate.aggregate_data': ( 'aggregate.html#aggregate_data',
+                                                                                   'era5_sandbox/aggregate.py'),
+                                        'era5_sandbox.aggregate.aggregate_to_healthsheds': ( 'aggregate.html#aggregate_to_healthsheds',
+                                                                                             'era5_sandbox/aggregate.py'),
+                                        'era5_sandbox.aggregate.main': ('aggregate.html#main', 'era5_sandbox/aggregate.py'),
                                         'era5_sandbox.aggregate.netcdf_to_tiff': ( 'aggregate.html#netcdf_to_tiff',
                                                                                    'era5_sandbox/aggregate.py'),
                                         'era5_sandbox.aggregate.polygon_to_raster_cells': ( 'aggregate.html#polygon_to_raster_cells',
@@ -28,6 +33,7 @@
                                    'era5_sandbox.core._create_directory_structure': ( 'core.html#_create_directory_structure',
                                                                                       'era5_sandbox/core.py'),
                                    'era5_sandbox.core._expand_path': ('core.html#_expand_path', 'era5_sandbox/core.py'),
+                                   'era5_sandbox.core._get_callable': ('core.html#_get_callable', 'era5_sandbox/core.py'),
                                    'era5_sandbox.core.describe': ('core.html#describe', 'era5_sandbox/core.py'),
                                    'era5_sandbox.core.main': ('core.html#main', 'era5_sandbox/core.py'),
                                    'era5_sandbox.core.testAPI': ('core.html#testapi', 'era5_sandbox/core.py')},
diff --git a/src/era5_sandbox/aggregate.py b/src/era5_sandbox/aggregate.py
@@ -1,28 +1,33 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../notes/02_aggregate.ipynb.
 
 # %% auto 0
-__all__ = ['resample_netcdf', 'RasterFile', 'netcdf_to_tiff', 'polygon_to_raster_cells']
+__all__ = ['resample_netcdf', 'RasterFile', 'netcdf_to_tiff', 'polygon_to_raster_cells', 'aggregate_to_healthsheds',
+           'aggregate_data', 'main']
 
 # %% ../../notes/02_aggregate.ipynb 4
 import tempfile
 import rasterio
+import hydra
+import argparse
+
+import pandas as pd
+import geopandas as gpd
 import numpy as np
 import xarray as xr
 import matplotlib.pyplot as plt
-import cartopy.crs as ccrs
-import cartopy.feature as cfeature
+
 from dataclasses import dataclass, field
 from typing import Optional, Tuple
 from pyprojroot import here
 from hydra import initialize, compose
-from omegaconf import OmegaConf
+from omegaconf import OmegaConf, DictConfig
 from tqdm import tqdm
 from math import ceil, floor
 from rasterstats.io import Raster
 from rasterstats.utils import boxify_points, rasterize_geom
 
-try: from era5_sandbox.core import GoogleDriver
-except: from core import GoogleDriver
+try: from era5_sandbox.core import GoogleDriver, _get_callable, describe
+except: from core import GoogleDriver, _get_callable, describe
 
 # %% ../../notes/02_aggregate.ipynb 8
 def resample_netcdf(
@@ -58,6 +63,7 @@ class RasterFile:
     data: Optional[np.ndarray] = field(default=None, init=False)
     transform: Optional[rasterio.Affine] = field(default=None, init=False)
     crs: Optional[str] = field(default=None, init=False)
+    nodata: Optional[float] = field(default=None, init=False)
     bounds: Optional[Tuple[float, float, float, float]] = field(default=None, init=False)
 
     def load(self):
@@ -66,6 +72,7 @@ def load(self):
             self.data = src.read(1)  # first band
             self.transform = src.transform
             self.crs = src.crs
+            self.nodata = src.nodata
             self.bounds = src.bounds
         return self
 
@@ -180,3 +187,121 @@ def polygon_to_raster_cells(
             cell_map.append(indices)
 
         return cell_map
+
+# %% ../../notes/02_aggregate.ipynb 25
+def aggregate_to_healthsheds(
+    res_poly2cell: list, # the result of polygon_to_raster_cells
+    raster: RasterFile, # the raster data
+    shapes: gpd.GeoDataFrame, # the shapes of the health sheds
+    names_column: str = "fs_uid", # the unique identifier column name of the health sheds
+    aggregation_func: callable = np.nanmean, # the aggregation function
+    aggregation_name: str = "mean" # the name of the aggregation function
+    ) -> gpd.GeoDataFrame:
+    """
+    Aggregate the raster data to the health sheds.
+
+    For every entry of `res_poly2cell` the matching cells are read from
+    `raster.data` and reduced with `aggregation_func`. An entry with no cells,
+    or whose cells are all NaN, yields NaN.
+
+    Returns a new GeoDataFrame with a `healthshed` column (taken from
+    `shapes[names_column]`), a column named `aggregation_name` holding the
+    statistic, and the geometry and CRS of `shapes`. Rows are matched by
+    position, so `res_poly2cell` must be in the same order as `shapes`;
+    `shapes` itself is not modified.
+    """
+
+    stats = []
+
+    for indices in res_poly2cell:
+        # each entry is indexable into raster.data; an empty first index
+        # array means no raster cell was matched to this polygon
+        if len(indices[0]) == 0:
+            stats.append(np.nan)
+            continue
+
+        cells = raster.data[indices]
+        # NOTE(review): only NaN is treated as missing here; cells equal to
+        # raster.nodata are not masked -- confirm nodata is NaN upstream.
+        if np.isnan(cells).all():
+            # no valid cells found for this polygon
+            stats.append(np.nan)
+            continue
+
+        stats.append(aggregation_func(cells))
+
+    # Build the table from positional arrays rather than index-aligned Series,
+    # so a `shapes` frame with a non-default index cannot shift the values.
+    df = pd.DataFrame(
+            {"healthshed": shapes[names_column].to_numpy(), aggregation_name: stats}
+        )
+    return gpd.GeoDataFrame(df, geometry=shapes.geometry.values, crs=shapes.crs)
+
+
+# %% ../../notes/02_aggregate.ipynb 35
+def aggregate_data(
+        cfg: DictConfig,  # hydra configuration file
+        input_file: str, # path to the input file
+        output_file: str, # path to the output file
+        exposure_variable: str # the variable to aggregate
+    )->None:
+    '''
+    Run the aggregation step of the pipeline.
+
+    The input file is resampled with the function named by
+    `cfg.aggregation.daily.function`, the layer for `exposure_variable` is
+    converted to a raster, and that raster is aggregated over the healthshed
+    polygons with the same function. The result is written to `output_file`
+    as parquet.
+
+    When `cfg.development_mode` is true the configuration is only described
+    and nothing is read or written.
+
+    Note, this function is the second step in the snakemake pipeline; the
+    caller is expected to supply `input_file`, `output_file` and
+    `exposure_variable` from the values snakemake injects into the runtime.
+    '''
+
+    if cfg.development_mode:
+        describe(cfg)
+        return None
+    
+    # get the healthshed shapefile
+    driver = GoogleDriver(json_key_path=here() / cfg.GOOGLE_DRIVE_AUTH_JSON.path)
+    healthsheds = driver.read_healthsheds(cfg.GOOGLE_DRIVE_AUTH_JSON.healthsheds_id)
+
+    # resolve the configured aggregation function (a dotted path such as "numpy.mean");
+    # it is used both for the resampling and for the per-healthshed statistic
+    agg_func = _get_callable(cfg.aggregation.daily.function)
+    
+    resampled_nc_file = resample_netcdf(input_file, agg_func=agg_func)
+
+    resampled_tiff = netcdf_to_tiff(
+        ds=resampled_nc_file,
+        variable=exposure_variable,
+        crs="EPSG:4326"
+    )
+
+    # run the polygon to raster cell function
+    result_poly2cell=polygon_to_raster_cells(
+        vectors = healthsheds.geometry.values, # the geometries of the shapefile of the regions
+        raster=resampled_tiff.data, # the raster data above
+        band=1, # the raster band to use
+        nodata=resampled_tiff.nodata, # the raster's nodata value (may be None)
+        affine=resampled_tiff.transform, # the raster's affine transform
+        all_touched=True, 
+        verbose=True
+    )
+
+    result = aggregate_to_healthsheds(
+        res_poly2cell=result_poly2cell,
+        raster=resampled_tiff,
+        shapes=healthsheds,
+        names_column="fs_uid",
+        aggregation_func=agg_func,
+        aggregation_name=exposure_variable
+    )
+
+    # Save the result to a file
+    result.to_parquet(output_file)
+
+# %% ../../notes/02_aggregate.ipynb 36
+@hydra.main(version_base=None, config_path="../../conf", config_name="config")
+def main(cfg: DictConfig) -> None:
+    """
+    Run `aggregate_data` for the job described by the `snakemake` object.
+
+    `snakemake` is not defined in this module; it is expected to be injected
+    into the runtime by snakemake. The first input, the first output and the
+    `variable` param are each converted to `str` before being passed on.
+    """
+    aggregate_data(
+        cfg,
+        input_file=str(snakemake.input[0]),
+        output_file=str(snakemake.output[0]),
+        exposure_variable=str(snakemake.params.variable),
+    )
+
+# %% ../../notes/02_aggregate.ipynb 37
+#| eval: false
+# IN_NOTEBOOK falls back to False when nbdev cannot be imported. Only the
+# import failure is caught, so KeyboardInterrupt/SystemExit and unrelated
+# errors are not silently swallowed.
+try: from nbdev.imports import IN_NOTEBOOK
+except ImportError: IN_NOTEBOOK=False
+
+# run the entry point only when executed as a script outside a notebook
+if __name__ == "__main__" and not IN_NOTEBOOK:
+    main()
diff --git a/src/era5_sandbox/core.py b/src/era5_sandbox/core.py
@@ -16,6 +16,8 @@
 from pydrive2.drive import GoogleDrive
 from omegaconf import DictConfig, OmegaConf
 from pyprojroot import here
+from importlib import import_module
+
 
 # %% ../../notes/00_core.ipynb 5
 def describe(
@@ -45,6 +47,13 @@ def _expand_path(
     return path
 
 # %% ../../notes/00_core.ipynb 7
+def _get_callable(func_path):
+    """Dynamically import a callable from a string path.
+
+    `func_path` is a dotted name such as ``"numpy.mean"``: everything before
+    the last dot is imported as a module and the final component is looked up
+    on that module.
+
+    Raises `ValueError` if `func_path` has no module part, `ImportError` if
+    the module cannot be imported, and `AttributeError` if the module has no
+    such attribute.
+    """
+    module_name, sep, func_name = func_path.rpartition(".")
+    if not sep or not module_name:
+        # fail with an explicit message instead of an opaque unpacking error
+        raise ValueError(
+            f"Expected a dotted path like 'package.module.func', got {func_path!r}"
+        )
+    module = import_module(module_name)
+    return getattr(module, func_name)
+
+# %% ../../notes/00_core.ipynb 8
 def _create_directory_structure(
         base_path: str,  # The base directory where the structure will be created
         structure: dict  # A dictionary representing the directory structure
@@ -65,7 +74,7 @@ def _create_directory_structure(
         if isinstance(substructure, dict):
             _create_directory_structure(current_path, substructure)
 
-# %% ../../notes/00_core.ipynb 9
+# %% ../../notes/00_core.ipynb 10
 class GoogleDriver:
     """
     A class to handle Google Drive authentication and file management.
@@ -96,10 +105,10 @@ def _authenticate(self):
     def get_drive(self):
         return self.drive
 
-# %% ../../notes/00_core.ipynb 18
+# %% ../../notes/00_core.ipynb 19
 from fastcore.basics import patch
 
-# %% ../../notes/00_core.ipynb 19
+# %% ../../notes/00_core.ipynb 20
 @patch
 def read_healthsheds(self:GoogleDriver, healthshed_zip_name):
 
@@ -122,7 +131,7 @@ def read_healthsheds(self:GoogleDriver, healthshed_zip_name):
         
         return gdf
 
-# %% ../../notes/00_core.ipynb 23
+# %% ../../notes/00_core.ipynb 24
 def testAPI(
     cfg: DictConfig=None,
     dataset:str="reanalysis-era5-pressure-levels"
@@ -167,7 +176,7 @@ def testAPI(
         print("Error: {}".format(e))
         return False
 
-# %% ../../notes/00_core.ipynb 27
+# %% ../../notes/00_core.ipynb 28
 @hydra.main(version_base=None, config_path="../../conf", config_name="config")
 def main(cfg: DictConfig) -> None:
 
@@ -177,7 +186,7 @@ def main(cfg: DictConfig) -> None:
     # test the api
     testAPI(cfg=cfg)
 
-# %% ../../notes/00_core.ipynb 28
+# %% ../../notes/00_core.ipynb 29
 #| eval: false
 try: from nbdev.imports import IN_NOTEBOOK
 except: IN_NOTEBOOK=False