Commit 6174908

Merge pull request #10 from NSAPH-Data-Processing/develop
Add Nepal to Pipeline
2 parents: 7db187f + 5a3e0f9

30 files changed: +7686 −1486 lines

.gitignore

Lines changed: 2 additions & 1 deletion
```diff
@@ -3,4 +3,5 @@ logs/*
 .snakemake/*
 .DS_Store
 sandbox
-slurm*
+slurm*
+data
```

.renvignore

Lines changed: 6 additions & 0 deletions
```diff
@@ -0,0 +1,6 @@
+^logs/
+^data/
+^slurm-.*\.out
+^_.github/
+sandbox
+logs
```

README.md

Lines changed: 27 additions & 2 deletions
```diff
@@ -1,3 +1,28 @@
-This file will be overwritten by `index.ipynb`
+# ERA5 Exposure Aggregation Pipeline
 
-In the meantime, see `notes/index.ipynb` for the notes..
+This repository contains a pipeline for aggregating ERA5 environmental exposure data to a 0.1 degree grid. The pipeline is designed to be run on FASRC. We developed
+this pipeline using `nbdev`, which means that we can create modules and scripts from notebooks.
+Hence, all of the documentation for how the pipeline was developed and validated is
+available in `notes/index.ipynb` and the associated notebooks.
+
+## How to Review a PR
+
+To review a PR on this repository, follow these steps:
+
+0. Obtain an API key for the ERA5 datastore from [here](https://cds.climate.copernicus.eu/how-to-api), and ask Tinashe for access to the Golden Lab `googledriver` API key
+
+1. Clone this repository to your workspace on FASRC
+
+2. Create a conda environment with `conda create -n era5_sandbox python=3.10` and install all of the necessary dependencies for the package with `pip install -e .`
+
+3. Run the `core` module to test your API key and set up the data
+directory structure
+
+`python src/era5_sandbox/core.py`
+
+4. Symlink your local data directory to the original work
+`ln -s [YOUR WORKING DIRECTORY]/data /n/dominici_lab/lab/data_processing/csph-era5_sandbox/data`
+
+5. Dry run by removing a file from data: `snakemake --dry-run`
+
+6. Run the pipeline: `sbatch snakemake.sbatch`
```

conf/aggregation/aggregation.yaml

Lines changed: 42 additions & 7 deletions
```diff
@@ -1,9 +1,44 @@
-daily:
-  function: "numpy.mean"
-  string: mean
+aggregation:
+  t2m:
+    hourly_to_daily:
+      - name: mean
+        function: "numpy.nanmean"
+      - name: min
+        function: "numpy.nanmin"
+      - name: max
+        function: "numpy.nanmax"
+    daily_to_healthshed:
+      - name: mean
+        function: "numpy.nanmean"
 
-monthly:
-  function: "numpy.mean"
-  string: mean
+  d2m:
+    hourly_to_daily:
+      - name: mean
+        function: "numpy.nanmean"
+      - name: min
+        function: "numpy.nanmin"
+      - name: max
+        function: "numpy.nanmax"
+    daily_to_healthshed:
+      - name: mean
+        function: "numpy.nanmean"
 
-variable: ['t2m', 'd2m']
+  tp:
+    hourly_to_daily:
+      - name: total
+        function: "numpy.nansum"
+    daily_to_healthshed:
+      - name: mean
+        function: "numpy.nanmean"
+
+  swvl1:
+    hourly_to_daily:
+      - name: mean
+        function: "numpy.nanmean"
+      - name: min
+        function: "numpy.nanmin"
+      - name: max
+        function: "numpy.nanmax"
+    daily_to_healthshed:
+      - name: mean
+        function: "numpy.nanmean"
```
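The dotted `function` strings in this config can be resolved to NumPy callables at runtime. A minimal sketch of that pattern (the `resolve_function` helper and the toy data are illustrative, not the pipeline's actual code):

```python
import importlib

import numpy as np

def resolve_function(path: str):
    """Resolve a dotted string such as "numpy.nanmean" to a callable."""
    module_name, _, attr = path.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)

# The hourly_to_daily list for t2m, as in conf/aggregation/aggregation.yaml.
t2m_aggs = [
    {"name": "mean", "function": "numpy.nanmean"},
    {"name": "min", "function": "numpy.nanmin"},
    {"name": "max", "function": "numpy.nanmax"},
]

hourly = np.array([10.0, 12.0, np.nan, 14.0])  # toy hourly values for one day
daily = {a["name"]: float(resolve_function(a["function"])(hourly)) for a in t2m_aggs}
print(daily)  # {'mean': 12.0, 'min': 10.0, 'max': 14.0}
```

Using the `nan*` variants (as the new config does) means grid cells with missing hours still aggregate instead of propagating NaN.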

conf/config.yaml

Lines changed: 4 additions & 9 deletions
```diff
@@ -2,6 +2,7 @@ defaults:
   - _self_
   - datapaths: datapaths
   - aggregation: aggregation
+  - geographies: geographies
 
 development_mode: false
 
@@ -17,20 +18,14 @@ mdg_shapefile: "https://data.humdata.org/dataset/26fa506b-0727-4d9d-a590-d2abee2
 dataset: "reanalysis-era5-single-levels"
 
 query:
+  geography: ["madagascar", "nepal"]
+
   product_type: reanalysis
-  # check precipitation
-  # variable: ["2m_dewpoint_temperature", "2m_temperature", "skin_temperature", "total_precipitation"]
-  variable: ["2m_dewpoint_temperature", "2m_temperature"]
+  variable: ["2m_dewpoint_temperature", "2m_temperature", "total_precipitation", "volumetric_soil_water_layer_1"]
   year: [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
   month: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
   day: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
   time: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
-
-  # this may have to be added for the
-  #levtype: pl
-  # in the current workflow we can test with a small number of healthsheds
-  # this bounding box will need to be expanded by ~ 50km (in G's dataset it is 50) or even up to 70 or 08
-  # we can also experiment with a buffer that follows the coastline precisely by 100KM
 
 area: [0, 360, -90, 90]
 data_format: netcdf
```
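The `query` section maps naturally onto a Climate Data Store request. A sketch of how one month's request might be assembled from it (`build_request` is a hypothetical helper, and the zero-padding convention is an assumption about how the pipeline formats fields; the actual download code lives in `src/era5_sandbox`):

```python
# Mirror of the query section in conf/config.yaml, as a plain dict.
query = {
    "product_type": "reanalysis",
    "variable": ["2m_dewpoint_temperature", "2m_temperature",
                 "total_precipitation", "volumetric_soil_water_layer_1"],
    "year": list(range(2009, 2025)),
    "month": list(range(1, 13)),
    "day": list(range(1, 32)),
    "time": list(range(0, 24)),
}

def build_request(query: dict, year: int, month: int) -> dict:
    """Build one month's CDS request; string fields are zero-padded."""
    return {
        "product_type": query["product_type"],
        "variable": query["variable"],
        "year": str(year),
        "month": f"{month:02d}",
        "day": [f"{d:02d}" for d in query["day"]],
        "time": [f"{t:02d}:00" for t in query["time"]],
        "area": [0, 360, -90, 90],
        "data_format": "netcdf",
    }

request = build_request(query, 2024, 1)
print(request["month"], request["time"][0])  # 01 00:00
```

Splitting the 2009–2024 range into per-month requests keeps each download within CDS size limits.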

conf/geographies/geographies.yaml

Lines changed: 9 additions & 0 deletions
```diff
@@ -0,0 +1,9 @@
+madagascar:
+  shapefile: "https://data.humdata.org/dataset/26fa506b-0727-4d9d-a590-d2abee21ee22/resource/ed94d52e-349e-41be-80cb-62dc0435bd34/download/mdg_adm_bngrc_ocha_20181031_shp.zip"
+  healthsheds: "healthsheds2022.zip"
+  unique_id: "fs_uid"
+
+nepal:
+  shapefile: "https://data.humdata.org/dataset/07db728a-4f0f-4e98-8eb0-8fa9df61f01c/resource/2eb4c47f-fd6e-425d-b623-d35be1a7640e/download/npl_adm_nd_20240314_ab_shp.zip"
+  healthsheds: "Nepal_Healthsheds2024.zip"
+  unique_id: "fid"
```
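Each geography entry carries the identifier column used when joining grid cells to healthsheds. A small sketch of that lookup, with the YAML mirrored as a plain dict (shapefile URLs omitted for brevity; the `healthshed_key` helper is hypothetical, not project code):

```python
# Per-geography settings mirroring conf/geographies/geographies.yaml.
GEOGRAPHIES = {
    "madagascar": {"healthsheds": "healthsheds2022.zip", "unique_id": "fs_uid"},
    "nepal": {"healthsheds": "Nepal_Healthsheds2024.zip", "unique_id": "fid"},
}

def healthshed_key(geography: str) -> str:
    """Column identifying healthsheds for the daily_to_healthshed step."""
    return GEOGRAPHIES[geography]["unique_id"]

print(healthshed_key("nepal"))  # fid
```

Keeping the join key in config rather than code is what lets the Nepal geography slot in without touching the aggregation logic.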

data/.gitignore

Lines changed: 0 additions & 4 deletions
This file was deleted.
