
Commit 18ed679

Merge pull request #284 from s22s/feature/df_to_markdown

Ability to render pymd 'show' tables to Markdown.

2 parents 1c14e25 + a40be5f · commit 18ed679

File tree

24 files changed: +299 −141 lines changed


core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala

Lines changed: 5 additions & 0 deletions
@@ -280,6 +280,11 @@ trait RasterFunctions {
       udf(F.rasterize(_: Geometry, _: Geometry, _: Int, cols, rows)).apply(geometry, bounds, value)
     )

+  def rf_rasterize(geometry: Column, bounds: Column, value: Column, cols: Column, rows: Column): TypedColumn[Any, Tile] =
+    withTypedAlias("rf_rasterize", geometry)(
+      udf(F.rasterize).apply(geometry, bounds, value, cols, rows)
+    )
+
   /** Reproject a column of geometry from one CRS to another.
     * @param sourceGeom Geometry column to reproject
     * @param srcCRS Native CRS of `sourceGeom` as a literal
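The new overload makes the target dimensions data-driven: `cols` and `rows` are now `Column`s, so each row can rasterize to a different tile size. A hedged sketch of a call site (the DataFrame `geoms` and its column names are hypothetical, and it assumes the pyrasterframes binding forwards Column arguments to this overload):

```python
from pyrasterframes.rasterfunctions import rf_rasterize
from pyspark.sql.functions import col

# Burn each geometry into a tile whose dimensions vary per row.
# `geoms` and its columns (geometry, bounds, value, ncols, nrows)
# are hypothetical stand-ins for illustration.
rasterized = geoms.select(
    rf_rasterize(col('geometry'), col('bounds'), col('value'),
                 col('ncols'), col('nrows')).alias('tile')
)
```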

core/src/main/scala/org/locationtech/rasterframes/expressions/accessors/RealizeTile.scala

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ import org.locationtech.rasterframes.model.TileContext
 case class RealizeTile(child: Expression) extends UnaryRasterOp with CodegenFallback {
   override def dataType: DataType = TileType

-  override def nodeName: String = "rf_realize_tile"
+  override def nodeName: String = "rf_tile"
   implicit val tileSer = TileUDT.tileSerializer

   override protected def eval(tile: Tile, ctx: Option[TileContext]): Any =
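This is purely a rename of the expression's `nodeName`, so realized tiles now show up as `rf_tile` rather than `rf_realize_tile` in query plans and default column labels. A minimal sketch of the user-facing call (assuming a `df` with a `proj_raster` column, as used elsewhere in this PR):

```python
from pyrasterframes.rasterfunctions import rf_tile

# The resulting column is now labeled rf_tile(proj_raster)
# instead of rf_realize_tile(proj_raster).
df.select(rf_tile('proj_raster')).show(3)
```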

core/src/main/scala/org/locationtech/rasterframes/util/package.scala

Lines changed: 22 additions & 0 deletions
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, NamedExpression}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.rf._
 import org.apache.spark.sql.types.StringType
 import org.apache.spark.sql._
@@ -183,6 +184,27 @@ package object util {
     }
   }

+  implicit class DFWithPrettyPrint(val df: Dataset[_]) extends AnyVal {
+    def toMarkdown(numRows: Int = 5, truncate: Boolean = false): String = {
+      import df.sqlContext.implicits._
+      val cols = df.columns
+      val header = cols.mkString("| ", " | ", " |") + "\n" + ("|---" * cols.length) + "|\n"
+      val stringifiers = cols
+        .map(c => s"`$c`")
+        .map(c => df.col(c).cast(StringType))
+        .map(c => if (truncate) substring(c, 1, 40) else c)
+      val cat = concat_ws(" | ", stringifiers: _*)
+      val body = df
+        .select(cat).limit(numRows)
+        .as[String]
+        .collect()
+        .map(_.replaceAll("\\[", "\\\\["))
+        .map(_.replace('\n', '↩'))
+        .mkString("| ", " |\n| ", " |")
+      header + body
+    }
+  }
+
   object Shims {
     // GT 1.2.1 to 2.0.0
     def toArrayTile[T <: CellGrid](tile: T): T =
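This implicit is the heart of the PR: it renders the first `numRows` rows of any `Dataset` as a Markdown table, building a pipe-delimited header, a `|---|` separator row, and string-cast (optionally 40-character-truncated) cells, and escaping `[` and embedded newlines so the output survives Markdown rendering. A rough Python transliteration of the same logic, for illustration only (the Scala implicit above is the actual implementation):

```python
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

def to_markdown(df: DataFrame, num_rows: int = 5, truncate: bool = False) -> str:
    cols = df.columns
    header = "| " + " | ".join(cols) + " |\n" + "|---" * len(cols) + "|\n"
    # Backtick-quote names (guards against dots), then cast to string.
    strs = [F.col("`%s`" % c).cast("string") for c in cols]
    if truncate:
        strs = [F.substring(c, 1, 40) for c in strs]
    rows = df.select(F.concat_ws(" | ", *strs)).limit(num_rows).collect()
    # Escape characters that would break a Markdown table.
    cells = [r[0].replace("[", "\\[").replace("\n", "\u21a9") for r in rows]
    return header + "| " + " |\n| ".join(cells) + " |"
```

Note that `concat_ws` silently skips null cells in both versions, so rows containing nulls will render with fewer columns than the header.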

core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala

Lines changed: 9 additions & 4 deletions
@@ -67,7 +67,7 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu
     // Not sure why implicit resolution isn't handling this properly.
     implicit val enc = Encoders.tuple(crsEncoder, Encoders.STRING, crsEncoder, Encoders.scalaDouble)
     val df = Seq((pe.crs, "fred", pe.crs, 34.0)).toDF("c1", "s", "c2", "n")
-    df.crsColumns.size should be (2)
+    df.crsColumns.size should be(2)
   }

   it("should split TileLayout") {
@@ -92,10 +92,10 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu

   it("should split key") {
     val s1 = SpatialKey(0, 0).subdivide(2)
-    assert(s1 === Seq(SpatialKey(0,0), SpatialKey(1,0), SpatialKey(0,1), SpatialKey(1,1)))
+    assert(s1 === Seq(SpatialKey(0, 0), SpatialKey(1, 0), SpatialKey(0, 1), SpatialKey(1, 1)))

     val s2 = SpatialKey(2, 3).subdivide(3)
-    assert(s2 === Seq(SpatialKey(6,9), SpatialKey(7,9), SpatialKey(8,9), SpatialKey(6,10), SpatialKey(7,10), SpatialKey(8,10), SpatialKey(6,11), SpatialKey(7,11), SpatialKey(8,11)))
+    assert(s2 === Seq(SpatialKey(6, 9), SpatialKey(7, 9), SpatialKey(8, 9), SpatialKey(6, 10), SpatialKey(7, 10), SpatialKey(8, 10), SpatialKey(6, 11), SpatialKey(7, 11), SpatialKey(8, 11)))
   }

   it("should split TileLayerMetadata[SpatialKey]") {
@@ -107,7 +107,12 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu

     val divided = tlm.subdivide(2)

-    assert(divided.tileLayout.tileDimensions === (tileSize/2, tileSize/2))
+    assert(divided.tileLayout.tileDimensions === (tileSize / 2, tileSize / 2))
+  }
+
+  it("should render Markdown") {
+    import org.locationtech.rasterframes.util._
+    rf.toMarkdown().count(_ == '|') shouldBe >=(3 * 5)
   }
  }
}

docs/build.sbt

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@ makePDF := {

   // Get the python source directory configured in the root project.
   val base = (Compile / paradox / sourceDirectories).value.find(_.toString.contains("python")).head
-  println(base)

   // Hard coded lacking any simple way of determining order.
   val files = Seq(

experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogDataSource.scala

Lines changed: 42 additions & 2 deletions
@@ -75,9 +75,49 @@ object MODISCatalogDataSource extends LazyLogging with ResourceCacheSupport {
   final val MCD43A4_BASE = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/"
   override def maxCacheFileAgeHours: Int = Int.MaxValue

-  // List of missing days
+  // List of missing days in PDS
   private val blacklist = Seq[String](
-    //"2018-05-06"
+    "2018-02-27",
+    "2018-02-28",
+    "2018-03-01",
+    "2018-03-02",
+    "2018-03-03",
+    "2018-03-04",
+    "2018-03-05",
+    "2018-03-06",
+    "2018-03-07",
+    "2018-03-08",
+    "2018-03-09",
+    "2018-03-10",
+    "2018-03-11",
+    "2018-03-12",
+    "2018-03-13",
+    "2018-03-14",
+    "2018-05-16",
+    "2018-05-17",
+    "2018-05-18",
+    "2018-05-19",
+    "2018-05-20",
+    "2018-05-21",
+    "2018-06-01",
+    "2018-06-04",
+    "2018-07-29",
+    "2018-08-03",
+    "2018-08-04",
+    "2018-08-05",
+    "2018-10-01",
+    "2018-10-02",
+    "2018-10-03",
+    "2018-10-22",
+    "2018-10-23",
+    "2018-11-12",
+    "2018-12-19",
+    "2018-12-20",
+    "2018-12-21",
+    "2018-12-22",
+    "2018-12-23",
+    "2018-12-24",
+    "2019-03-18"
   )

   private def sceneFiles(start: LocalDate, end: LocalDate, useBlacklist: Boolean) = {

pyrasterframes/src/main/python/docs/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -19,6 +19,9 @@
 #

 import os
+from pyspark.sql import DataFrame
+from pyrasterframes import RFContext
+from pweave import PwebPandocFormatter


 def docs_dir():
@@ -37,3 +40,12 @@ def resource_dir():

 def resource_dir_uri():
     return 'file://' + resource_dir()
+
+
+class PegdownMarkdownFormatter(PwebPandocFormatter):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    # Pegdown doesn't support the width and label options.
+    def make_figure_string(self, figname, width, label, caption=""):
+        return "![%s](%s)" % (caption, figname)

pyrasterframes/src/main/python/docs/aggregation.pymd

Lines changed: 20 additions & 13 deletions
@@ -1,6 +1,7 @@
 # Aggregation

 ```python, setup, echo=False
+from pyrasterframes import rf_ipython
 from docs import *
 from pyrasterframes.utils import create_rf_spark_session
 from pyrasterframes.rasterfunctions import *
@@ -16,7 +17,7 @@ There are 3 types of aggregate functions: _tile_ aggregate, DataFrame aggregate,

 We can illustrate these differences in computing an aggregate mean. First, we create a sample DataFrame of 2 _tiles_ where the first _tile_ is composed of 25 values of 1.0 and the second _tile_ is composed of 25 values of 3.0.

-```python, sql_dataframe
+```python, sql_dataframe, results='raw'
 import pyspark.sql.functions as F

 rf = spark.sql("""
@@ -25,38 +26,39 @@ UNION
 SELECT 2 as id, rf_local_multiply(rf_make_ones_tile(5, 5, 'float32'), 3) as tile
 """)

-rf.select("id", rf_render_matrix("tile")).show(10, False)
+rf.select("id", rf_render_matrix("tile")).show(truncate=False)
 ```

+
 In this code block, we are using the @ref:[`rf_tile_mean`](reference.md#rf-tile-mean) function to compute the _tile_ aggregate mean of cells in each row of column `tile`. The mean of each _tile_ is computed separately, so the first mean is 1.0 and the second mean is 3.0. Notice that the number of rows in the DataFrame is the same before and after the aggregation.

-```python, tile_mean
-rf.select(F.col('id'), rf_tile_mean(F.col('tile'))).show(10, False)
+```python, tile_mean, results='raw'
+rf.select(F.col('id'), rf_tile_mean(F.col('tile'))).show(truncate=False)
 ```

 In this code block, we are using the @ref:[`rf_agg_mean`](reference.md#rf-agg-mean) function to compute the DataFrame aggregate, which averages 25 values of 1.0 and 25 values of 3.0, across the fifty cells in two rows. Note that only a single row is returned since the average is computed over the full DataFrame.

-```python, agg_mean
-rf.agg(rf_agg_mean(F.col('tile'))).show(10, False)
+```python, agg_mean, results='raw'
+rf.agg(rf_agg_mean(F.col('tile'))).show()
 ```

 In this code block, we are using the @ref:[`rf_agg_local_mean`](reference.md#rf-agg-local-mean) function to compute the element-wise local aggregate mean across the two rows. In this example it is computing the mean of one value of 1.0 and one value of 3.0 to arrive at the element-wise mean, but doing so twenty-five times, once for each position in the _tile_.

 To compute an element-wise local aggregate, _tiles_ need to have the same dimensions, as in the example below where both _tiles_ have 5 rows and 5 columns. If we tried to compute an element-wise local aggregate over the DataFrame without equal _tile_ dimensions, we would get a runtime error.

-```python, local_mean
-rf.agg(rf_agg_local_mean(F.col('tile')).alias("local_mean")).select(rf_render_matrix("local_mean")).show(10, False)
+```python, local_mean, results='raw'
+rf.agg(rf_agg_local_mean(F.col('tile')).alias("local_mean")).select(rf_render_matrix("local_mean")).show(truncate=False)
 ```

 ## Cell Counts Example

 We can also count the total number of data and NoData cells over all the _tiles_ in a DataFrame using @ref:[`rf_agg_data_cells`](reference.md#rf-agg-data-cells) and @ref:[`rf_agg_no_data_cells`](reference.md#rf-agg-no-data-cells). There are 3,842,290 data cells and 1,941,734 NoData cells in this DataFrame. See the section on @ref:["NoData" handling](nodata-handling.md) for additional discussion on handling missing data.

-```python, cell_counts
+```python, cell_counts, results='raw'
 rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
 stats = rf.agg(rf_agg_data_cells('proj_raster'), rf_agg_no_data_cells('proj_raster'))

-stats.show(5, False)
+stats.show()
 ```

 ## Statistical Summaries
@@ -70,15 +72,18 @@ rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B0
 stats = rf.select(rf_tile_stats('proj_raster').alias('stats'))

 stats.printSchema()
-stats.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').show(10, False)
+```
+
+```python, show_stats, results='raw'
+stats.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').show(10, truncate=False)
 ```

 The @ref:[`rf_agg_stats`](reference.md#rf-agg-stats) function aggregates over all of the _tiles_ in a DataFrame and returns a statistical summary of all cell values as shown below.

-```python, agg_stats
+```python, agg_stats, results='raw'
 rf.agg(rf_agg_stats('proj_raster').alias('stats')) \
     .select('stats.min', 'stats.max', 'stats.mean', 'stats.variance') \
-    .show(10, False)
+    .show()
 ```

 The @ref:[`rf_agg_local_stats`](reference.md#rf-agg-local-stats) function computes the element-wise local aggregate statistical summary as shown below. The DataFrame used in the previous two code blocks has unequal _tile_ dimensions, so a different DataFrame is used in this code block to avoid a runtime error.
@@ -103,6 +108,7 @@ for r in agg_local_stats:

 The @ref:[`rf_tile_histogram`](reference.md#rf-tile-histogram) function computes a count of cell values within each row of _tile_ and outputs a `bins` array with the schema below. In the graph below, we have plotted `value` on the x-axis and `count` on the y-axis to create the histogram. There are 100 rows of _tile_ in this DataFrame, but this histogram is just computed for the _tile_ in the first row.

+
 ```python, tile_histogram
 import matplotlib.pyplot as plt

@@ -121,6 +127,7 @@ plt.show()

 The @ref:[`rf_agg_approx_histogram`](reference.md#rf-agg-approx-histogram) function computes a count of cell values across all of the rows of _tile_ in a DataFrame or group. In the example below, the range of the y-axis is significantly wider than the range of the y-axis on the previous histogram, since this histogram was computed for all cell values in the DataFrame.

+
 ```python, agg_histogram
 bins_list = rf.agg(
     rf_agg_approx_histogram('proj_raster')['bins'].alias('bins')
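The recurring `results='raw'` chunk option in these pymd changes is what surfaces the new Markdown tables: Pweave weaves a raw chunk's output into the document verbatim, so the Markdown table emitted by `show` (presumably via the `rf_ipython` import in the setup chunk, backed by the Scala `toMarkdown` above) renders as an actual table instead of a preformatted code block. A hedged sketch of the pattern, reusing the `rf` DataFrame from the chunks above (the chunk name `markdown_demo` is invented):

```python, markdown_demo, results='raw'
# With results='raw', the Markdown emitted below is woven into the
# document as a real table rather than preformatted text.
rf.select(rf_tile_mean('proj_raster')).show(truncate=False)
```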

pyrasterframes/src/main/python/docs/getting-started.pymd

Lines changed: 13 additions & 10 deletions
@@ -14,22 +14,29 @@ $ python3 -m pip install pyrasterframes

 Then in a python interpreter of your choice, you can get a [`pyspark` `SparkSession`](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession) using the [`local[*]` master](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls).

-```python
+
+```python, setup, echo=False
+from pyrasterframes import rf_ipython
+```
+
+```python, version
 import pyrasterframes
 spark = pyrasterframes.get_spark_session()
 ```

 Then, you can read a raster and work with it in a Spark DataFrame.

-```python
-from pyrasterframes.rasterfunctions import rf_local_add
+```python, local_add, results='raw'
+from pyrasterframes.rasterfunctions import *
 from pyspark.sql.functions import lit

 # Read a MODIS surface reflectance granule
 df = spark.read.raster('https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF')

 # Add 3 element-wise, show some rows of the DataFrame
-df.select(rf_local_add(df.proj_raster, lit(3))).show(5, False)
+df.withColumn('added', rf_local_add(df.proj_raster, lit(3))) \
+  .select(rf_crs('added'), rf_extent('added'), rf_tile('added')) \
+  .show(3)
 ```

 This example is extended in the [getting started Jupyter notebook](https://nbviewer.jupyter.org/github/locationtech/rasterframes/blob/develop/rf-notebook/src/main/notebooks/Getting%20Started.ipynb).
@@ -75,7 +82,7 @@ The `pyspark` shell command will look something like this.

 Then in the `pyspark` shell, import the module and call `withRasterFrames` on the SparkSession.

-```python, evaluate=False
+```python, shell, evaluate=False
 Welcome to
       ____              __
      / __/__  ___ _____/ /__
@@ -92,10 +99,6 @@ SparkSession available as 'spark'.

 Now you have the configured SparkSession with RasterFrames enabled.

-```python, echo=False
-spark.stop()
-```
-
 ## Installing GDAL

 GDAL provides a wide variety of drivers to read data from many different raster formats. If GDAL is installed in the environment, RasterFrames will be able to @ref:[read](raster-read.md) those formats. If you are using the @ref:[Jupyter Notebook image](getting-started.md#jupyter-notebook), GDAL is already installed for you. Otherwise follow the instructions below. Version 2.4.1 or greater is required.
@@ -130,7 +133,7 @@ To support GeoTIFF and JPEG2000 formats, you should look for the following driv

 Do the following to see if RasterFrames was able to find GDAL:

-```python, evaluate=False
+```python, gdal_version, evaluate=False
 from pyrasterframes.utils import gdal_version
 print(gdal_version())
 ```

pyrasterframes/src/main/python/docs/languages.pymd

Lines changed: 3 additions & 3 deletions
@@ -45,7 +45,7 @@ red_nir_tiles_monthly_2017 = spark.read.raster(

 ### Step 4: Compute aggregates

-```python, step_4_python
+```python, step_4_python, results='raw'
 result = red_nir_tiles_monthly_2017 \
     .where(st_intersects(
         st_reproject(rf_geometry(col('red')), rf_crs(col('red')).crsProj4, rf_mk_crs('EPSG:4326')),
@@ -75,7 +75,7 @@ sql("CREATE OR REPLACE TEMPORARY VIEW modis USING `aws-pds-modis-catalog`")

 ### Step 2: Down-select data by month

-```python, step_2_sql
+```python, step_2_sql, results='raw'
 sql("""
 CREATE OR REPLACE TEMPORARY VIEW red_nir_monthly_2017 AS
 SELECT granule_id, month(acquisition_date) as month, B01 as red, B02 as nir
@@ -101,7 +101,7 @@ OPTIONS (

 ### Step 4: Compute aggregates

-```python, step_4_sql
+```python, step_4_sql, results='raw'
 sql("""
 SELECT month, ndvi_stats.* FROM (
   SELECT month, rf_agg_stats(rf_normalized_difference(nir, red)) as ndvi_stats
