11# AUTOGENERATED! DO NOT EDIT! File to edit: ../../notes/02_aggregate.ipynb.
22
33# %% auto 0
4- __all__ = ['resample_netcdf' , 'RasterFile' , 'netcdf_to_tiff' , 'polygon_to_raster_cells' ]
4+ __all__ = ['resample_netcdf' , 'RasterFile' , 'netcdf_to_tiff' , 'polygon_to_raster_cells' , 'aggregate_to_healthsheds' ,
5+ 'aggregate_data' , 'main' ]
56
67# %% ../../notes/02_aggregate.ipynb 4
78import tempfile
89import rasterio
10+ import hydra
11+ import argparse
12+
13+ import pandas as pd
14+ import geopandas as gpd
915import numpy as np
1016import xarray as xr
1117import matplotlib .pyplot as plt
12- import cartopy .crs as ccrs
13- import cartopy .feature as cfeature
18+
1419from dataclasses import dataclass , field
1520from typing import Optional , Tuple
1621from pyprojroot import here
1722from hydra import initialize , compose
18- from omegaconf import OmegaConf
23+ from omegaconf import OmegaConf , DictConfig
1924from tqdm import tqdm
2025from math import ceil , floor
2126from rasterstats .io import Raster
2227from rasterstats .utils import boxify_points , rasterize_geom
2328
24- try : from era5_sandbox .core import GoogleDriver
25- except : from core import GoogleDriver
29+ try : from era5_sandbox .core import GoogleDriver , _get_callable , describe
30+ except : from core import GoogleDriver , _get_callable , describe
2631
2732# %% ../../notes/02_aggregate.ipynb 8
2833def resample_netcdf (
@@ -58,6 +63,7 @@ class RasterFile:
5863 data : Optional [np .ndarray ] = field (default = None , init = False )
5964 transform : Optional [rasterio .Affine ] = field (default = None , init = False )
6065 crs : Optional [str ] = field (default = None , init = False )
66+ nodata : Optional [float ] = field (default = None , init = False )
6167 bounds : Optional [Tuple [float , float , float , float ]] = field (default = None , init = False )
6268
6369 def load (self ):
@@ -66,6 +72,7 @@ def load(self):
6672 self .data = src .read (1 ) # first band
6773 self .transform = src .transform
6874 self .crs = src .crs
75+ self .nodata = src .nodata
6976 self .bounds = src .bounds
7077 return self
7178
@@ -180,3 +187,121 @@ def polygon_to_raster_cells(
180187 cell_map .append (indices )
181188
182189 return cell_map
190+
191+ # %% ../../notes/02_aggregate.ipynb 25
192+ def aggregate_to_healthsheds (
193+ res_poly2cell : list , # the result of polygon_to_raster_cells
194+ raster : RasterFile , # the raster data
195+ shapes : gpd .GeoDataFrame , # the shapes of the health sheds
196+ names_column : str = "fs_uid" , # the unique identifier column name of the health sheds
197+ aggregation_func : callable = np .nanmean , # the aggregation function
198+ aggregation_name : str = "mean" # the name of the aggregation function
199+ ) -> gpd .GeoDataFrame :
200+ """
201+ Aggregate the raster data to the health sheds.
202+ """
203+
204+ stats = []
205+
206+ for indices in res_poly2cell :
207+ if len (indices [0 ]) == 0 :
208+ # no cells found for this polygon
209+ stats .append (np .nan )
210+ else :
211+ cells = raster .data [indices ]
212+ if sum (~ np .isnan (cells )) == 0 :
213+ # no valid cells found for this polygon
214+ stats .append (np .nan )
215+ continue
216+ else :
217+ # compute MEAN of valid cells
218+ # but this stat can be ANYTHING
219+ stats .append (aggregation_func (cells ))
220+
221+ # clean up the result into a dataframe
222+ stats = pd .Series (stats )
223+ shapes [aggregation_name ] = stats
224+ df = pd .DataFrame (
225+ {"healthshed" : shapes [names_column ], aggregation_name : stats }
226+ )
227+ gdf = gpd .GeoDataFrame (df , geometry = shapes .geometry .values , crs = shapes .crs )
228+ return gdf
229+
230+
231+ # %% ../../notes/02_aggregate.ipynb 35
232+ def aggregate_data (
233+ cfg : DictConfig , # hydra configuration file
234+ input_file : str , # path to the input file
235+ output_file : str , # path to the output file
236+ exposure_variable : str # the variable to aggregate
237+ )-> None :
238+ '''
239+ Run the agggregation step of the pipeline.
240+
241+ Note, this function is the second step in the snakemake
242+ pipeline. This means that in order to define the input
243+ file, we use the snakemake.input and snakemake.output variables
244+ injected into the runtime by snakemake.
245+ '''
246+
247+ if cfg .development_mode :
248+ describe (cfg )
249+ return None
250+
251+ # get the healthshed shapefile
252+ driver = GoogleDriver (json_key_path = here () / cfg .GOOGLE_DRIVE_AUTH_JSON .path )
253+ drive = driver .get_drive ()
254+ healthsheds = driver .read_healthsheds (cfg .GOOGLE_DRIVE_AUTH_JSON .healthsheds_id )
255+
256+ # get the aggregation configuration
257+ # exposure_variable = cfg.aggregation.variable
258+ agg_func = _get_callable (cfg .aggregation .daily .function )
259+
260+ resampled_nc_file = resample_netcdf (input_file , agg_func = agg_func )
261+
262+ resampled_tiff = netcdf_to_tiff (
263+ ds = resampled_nc_file ,
264+ variable = exposure_variable ,
265+ crs = "EPSG:4326"
266+ )
267+
268+ # run the polygon to raster cell function
269+ result_poly2cell = polygon_to_raster_cells (
270+ vectors = healthsheds .geometry .values , # the geometries of the shapefile of the regions
271+ raster = resampled_tiff .data , # the raster data above
272+ band = 1 , # the value of the day that we're using
273+ nodata = resampled_tiff .nodata , # any intersections with no data, may have to be np.nan
274+ affine = resampled_tiff .transform , # some math thing need to revise
275+ all_touched = True ,
276+ verbose = True
277+ )
278+
279+ result = aggregate_to_healthsheds (
280+ res_poly2cell = result_poly2cell ,
281+ raster = resampled_tiff ,
282+ shapes = healthsheds ,
283+ names_column = "fs_uid" ,
284+ aggregation_func = agg_func ,
285+ aggregation_name = exposure_variable
286+ )
287+
288+ # Save the result to a file
289+ result .to_parquet (output_file )
290+
291+ # %% ../../notes/02_aggregate.ipynb 36
292+ @hydra .main (version_base = None , config_path = "../../conf" , config_name = "config" )
293+ def main (cfg : DictConfig ) -> None :
294+ # Parse command-line arguments
295+ input_file = str (snakemake .input [0 ]) # First input file
296+ output_file = str (snakemake .output [0 ])
297+ aggregation_variable = str (snakemake .params .variable )
298+
299+ aggregate_data (cfg , input_file = input_file , output_file = output_file , exposure_variable = aggregation_variable )
300+
301+ # %% ../../notes/02_aggregate.ipynb 37
302+ #| eval: false
303+ try : from nbdev .imports import IN_NOTEBOOK
304+ except : IN_NOTEBOOK = False
305+
306+ if __name__ == "__main__" and not IN_NOTEBOOK :
307+ main ()
0 commit comments