Commit abb8f4e: Misc documentation tweaks.
1 parent: 20cf328

4 files changed: +35 -36 lines changed

pyrasterframes/src/main/python/docs/aggregation.pymd (10 additions, 12 deletions)

@@ -20,11 +20,10 @@ We can illustrate aggregate differences by computing an aggregate mean. First, w
 ```python, sql_dataframe
 import pyspark.sql.functions as F

-rf = spark.sql("""
-SELECT 1 as id, rf_make_ones_tile(5, 5, 'float32') as tile
-UNION
-SELECT 2 as id, rf_local_multiply(rf_make_ones_tile(5, 5, 'float32'), 3) as tile
-""")
+df1 = spark.range(1).select('id', rf_make_ones_tile(5, 5, 'float32').alias('tile'))
+df2 = spark.range(1).select('id', rf_local_multiply(rf_make_ones_tile(5, 5, 'float32'), F.lit(3)).alias('tile'))
+
+rf = df1.union(df2)

 tiles = rf.select("tile").collect()
 print(tiles[0]['tile'].cells)

@@ -93,14 +92,13 @@ stats
 The @ref:[`rf_agg_local_stats`](reference.md#rf-agg-local-stats) function computes the element-wise local aggregate statistical summary as shown below. The DataFrame used in the previous two code blocks has unequal _tile_ dimensions, so a different DataFrame is used in this code block to avoid a runtime error.

 ```python, agg_local_stats
-rf = spark.sql("""
-SELECT 1 as id, rf_make_ones_tile(5, 5, 'float32') as tile
-UNION
-SELECT 2 as id, rf_make_constant_tile(3, 5, 5, 'float32') as tile
-UNION
-SELECT 3 as id, rf_make_constant_tile(5, 5, 5, 'float32') as tile
-""").agg(rf_agg_local_stats('tile').alias('stats'))
+df1 = spark.range(1).select('id', rf_make_ones_tile(5, 5, 'float32').alias('tile'))
+df2 = spark.range(1).select('id', rf_make_constant_tile(3, 5, 5, 'float32').alias('tile'))
+df3 = spark.range(1).select('id', rf_make_constant_tile(5, 5, 5, 'float32').alias('tile'))

+rf = df1.union(df2).union(df3) \
+    .agg(rf_agg_local_stats('tile').alias('stats'))
+
 agg_local_stats = rf.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').collect()

 for r in agg_local_stats:
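
Assembled end to end, the new `agg_local_stats` block runs as below: a minimal sketch, assuming a SparkSession `spark` with RasterFrames enabled. Only the explicit imports are added beyond what the diff shows.

```python
from pyrasterframes.rasterfunctions import (
    rf_make_ones_tile, rf_make_constant_tile, rf_agg_local_stats
)

# One single-row DataFrame per constant-valued 5x5 float32 tile.
df1 = spark.range(1).select('id', rf_make_ones_tile(5, 5, 'float32').alias('tile'))
df2 = spark.range(1).select('id', rf_make_constant_tile(3, 5, 5, 'float32').alias('tile'))
df3 = spark.range(1).select('id', rf_make_constant_tile(5, 5, 5, 'float32').alias('tile'))

# rf_agg_local_stats aggregates cell-wise down the column: each cell of
# the result summarizes the corresponding cell of every input tile.
rf = df1.union(df2).union(df3) \
    .agg(rf_agg_local_stats('tile').alias('stats'))

for r in rf.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').collect():
    print(r)
```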

pyrasterframes/src/main/python/docs/local-algebra.pymd (7 additions, 5 deletions)

@@ -34,13 +34,15 @@ We will apply the @ref:[catalog pattern](raster-catalogs.md) for defining the da
 This form of `(x - y) / (x + y)` is common in remote sensing and is called a normalized difference. It is used with other band pairs to highlight water, snow, and other phenomena.

 ```python, read_rasters
-bands = {4: 'red', 8: 'nir'}
+from pyspark.sql import Row
 uri_pattern = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B0{}.tif'
-catalog_df = pd.DataFrame([
-    {bands[b_num]: uri_pattern.format(b_num) for b_num in bands.keys()}
+catalog_df = spark.createDataFrame([
+    Row(red=uri_pattern.format(4), nir=uri_pattern.format(8))
 ])
-df = spark.read.raster(catalog=catalog_df.to_csv(index=None),
-                       catalog_col_names=list(catalog_df.columns))
+df = spark.read.raster(
+    catalog=catalog_df,
+    catalog_col_names=['red', 'nir']
+)
 df.printSchema()
 ```
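The `(x - y) / (x + y)` form mentioned in the context line can then be computed cell-wise over the `red` and `nir` columns read above. A sketch using RasterFrames' local-algebra functions; the `ndvi` column name is illustrative and not part of the commit:

```python
from pyrasterframes.rasterfunctions import (
    rf_local_add, rf_local_subtract, rf_local_divide
)

# Cell-wise (nir - red) / (nir + red) over each pair of tiles; `df` is
# the catalog read from the block above.
df_nd = df.withColumn('ndvi', rf_local_divide(
    rf_local_subtract(df.nir, df.red),
    rf_local_add(df.nir, df.red)))
df_nd.printSchema()
```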
pyrasterframes/src/main/python/docs/raster-read.pymd (16 additions, 12 deletions)

@@ -95,7 +95,7 @@ modis_catalog = spark.read \
     .option("header", "true") \
     .load(SparkFiles.get(cat_filename)) \
     .withColumn('base_url',
-        F.concat(F.regexp_replace('download_url', 'index.html$', ''), 'gid',)
+        F.concat(F.regexp_replace('download_url', 'index.html$', ''), 'gid')
     ) \
     .drop('download_url') \
     .withColumn('red' , F.concat('base_url', F.lit("_B01.TIF"))) \

@@ -126,15 +126,17 @@ Now that we have prepared our catalog, we simply pass the DataFrame or CSV strin
 ```python, read_catalog
 rf = spark.read.raster(
     catalog=equator,
-    catalog_col_names=['red', 'nir'],
+    catalog_col_names=['red', 'nir']
 )
 rf.printSchema()
 ```

 Observe the schema of the resulting DataFrame has a projected raster struct for each column passed in `catalog_col_names`. For reference, the URI is now in a column appended with `_path`. Taking a quick look at the representation of the data, we see again each row contains an arbitrary portion of the entire scene coverage. We also see that for two-D catalogs, each row contains the same spatial extent for all tiles in that row.

 ```python, cat_read_sample
-sample = rf.select('gid', rf_extent('red'), rf_extent('nir'), rf_tile('red'), rf_tile('nir'))
+sample = rf \
+    .select('gid', rf_extent('red'), rf_extent('nir'), rf_tile('red'), rf_tile('nir')) \
+    .where(~rf_is_no_data_tile('red'))
 sample.limit(3)
 ```

@@ -168,9 +170,10 @@ When reading a multiband raster or a _catalog_ describing multiband rasters, you
 For example, we can read a four-band (red, green, blue, and near-infrared) image as follows. The individual rows of the resulting DataFrame still represent distinct spatial extents, with a projected raster column for each band specified by `band_indexes`.

 ```python, multiband
-mb = spark.read.raster('s3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif',
-                       band_indexes=[0, 1, 2, 3],
-)
+mb = spark.read.raster(
+    's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif',
+    band_indexes=[0, 1, 2, 3],
+)
 mb.printSchema()
 ```

@@ -184,14 +187,15 @@ Here is a trivial example with a _catalog_ over multiband rasters. We specify tw
 import pandas as pd
 mb_cat = pd.DataFrame([
     {'foo': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif',
-     'bar': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif',
+     'bar': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif'
     },
 ])
-mb2 = spark.read.raster(catalog=spark.createDataFrame(mb_cat),
-                        catalog_col_names=['foo', 'bar'],
-                        band_indexes=[0, 1],
-                        tile_dimensions=(64,64)
-)
+mb2 = spark.read.raster(
+    catalog=spark.createDataFrame(mb_cat),
+    catalog_col_names=['foo', 'bar'],
+    band_indexes=[0, 1],
+    tile_dimensions=(64,64)
+)
 mb2.printSchema()
 ```
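Combining the catalog-read and sampling hunks, the full pattern looks roughly like this: a sketch assuming a RasterFrames-enabled `spark` session. The URIs are the public test scenes used on the local-algebra page, and the `red_path` column name follows the `_path` convention described in the prose above.

```python
from pyspark.sql import Row
from pyrasterframes.rasterfunctions import rf_extent, rf_tile, rf_is_no_data_tile

# A one-row, two-column catalog: each cell holds a raster URI.
uri_pattern = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B0{}.tif'
cat = spark.createDataFrame([Row(red=uri_pattern.format(4), nir=uri_pattern.format(8))])

# Each catalog column becomes a projected raster column; the source URI
# travels along in an appended `<name>_path` column.
rf = spark.read.raster(catalog=cat, catalog_col_names=['red', 'nir'])

# Rows cover arbitrary portions of the scenes; skip tiles that are
# entirely no-data before sampling, as the cat_read_sample block does.
sample = rf \
    .select('red_path', rf_extent('red'), rf_tile('red'), rf_tile('nir')) \
    .where(~rf_is_no_data_tile('red'))
sample.show(3)
```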
pyrasterframes/src/main/python/docs/vector-data.pymd (2 additions, 7 deletions)

@@ -86,7 +86,7 @@ As documented in the @ref:[function reference](reference.md), various user-defin
 ```python, native_centroid
 from pyrasterframes.rasterfunctions import st_centroid
 df = df.withColumn('centroid', st_centroid(df.geometry))
-centroids = df.select('name', 'geometry', 'naive_centroid', 'centroid')
+centroids = df.select('geometry', 'name', 'naive_centroid', 'centroid')
 centroids.limit(3)
 ```

@@ -101,14 +101,9 @@ l8 = l8.withColumn('geom', st_geometry(l8.bounds_wgs84))
 l8 = l8.withColumn('paducah', st_point(lit(-88.6275), lit(37.072222)))

 l8_filtered = l8.filter(st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(500000.0))))
+l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct')
 ```

-```python, evaluate=False, echo=False
-# suppressed due to run time.
-l8_filtered.count()
-```
-
-
 [GeoPandas]: http://geopandas.org
 [OGR]: https://gdal.org/drivers/vector/index.html
 [Shapely]: https://shapely.readthedocs.io/en/latest/manual.html
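
For reference, the spatial filter in the second hunk reads end to end as follows: a sketch assuming the Landsat 8 scene index DataFrame `l8` built earlier on that page, and assuming these st_* functions are exposed through pyrasterframes.rasterfunctions the way st_centroid is in the first hunk.

```python
from pyspark.sql.functions import lit
from pyrasterframes.rasterfunctions import (
    st_geometry, st_point, st_bufferPoint, st_intersects
)

# Scene footprints as geometries, plus a fixed query point (Paducah, KY).
l8 = l8.withColumn('geom', st_geometry(l8.bounds_wgs84)) \
       .withColumn('paducah', st_point(lit(-88.6275), lit(37.072222)))

# st_bufferPoint buffers by a geodesic distance in meters, so this keeps
# scenes whose footprint intersects a roughly 500 km radius of the point.
l8_filtered = l8.filter(
    st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(500000.0))))
l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct')
```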
