
Commit 18ed679

Merge pull request #284 from s22s/feature/df_to_markdown

Ability to render pymd 'show' tables to Markdown.

2 parents 1c14e25 + a40be5f · commit 18ed679

File tree

24 files changed: +299 −141 lines changed


core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala

Lines changed: 5 additions & 0 deletions
@@ -280,6 +280,11 @@ trait RasterFunctions {
       udf(F.rasterize(_: Geometry, _: Geometry, _: Int, cols, rows)).apply(geometry, bounds, value)
     )

+  def rf_rasterize(geometry: Column, bounds: Column, value: Column, cols: Column, rows: Column): TypedColumn[Any, Tile] =
+    withTypedAlias("rf_rasterize", geometry)(
+      udf(F.rasterize).apply(geometry, bounds, value, cols, rows)
+    )
+
   /** Reproject a column of geometry from one CRS to another.
     * @param sourceGeom Geometry column to reproject
     * @param srcCRS Native CRS of `sourceGeom` as a literal
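The new overload makes the target dimensions data-driven: `cols` and `rows` are now `Column`s, so each row can rasterize to a different tile size. A hedged sketch of a call site (the DataFrame `geoms` and its column names are hypothetical, and it assumes the pyrasterframes binding forwards Column arguments to this overload):

```python
from pyrasterframes.rasterfunctions import rf_rasterize
from pyspark.sql.functions import col

# Burn each geometry into a tile whose dimensions vary per row.
# `geoms` and its columns (geometry, bounds, value, ncols, nrows)
# are hypothetical stand-ins for illustration.
rasterized = geoms.select(
    rf_rasterize(col('geometry'), col('bounds'), col('value'),
                 col('ncols'), col('nrows')).alias('tile')
)
```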

core/src/main/scala/org/locationtech/rasterframes/expressions/accessors/RealizeTile.scala

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ import org.locationtech.rasterframes.model.TileContext
 case class RealizeTile(child: Expression) extends UnaryRasterOp with CodegenFallback {
   override def dataType: DataType = TileType

-  override def nodeName: String = "rf_realize_tile"
+  override def nodeName: String = "rf_tile"
   implicit val tileSer = TileUDT.tileSerializer

   override protected def eval(tile: Tile, ctx: Option[TileContext]): Any =
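This is purely a rename of the expression's `nodeName`, so realized tiles now show up as `rf_tile` rather than `rf_realize_tile` in query plans and default column labels. A minimal sketch of the user-facing call (assuming a `df` with a `proj_raster` column, as used elsewhere in this PR):

```python
from pyrasterframes.rasterfunctions import rf_tile

# The resulting column is now labeled rf_tile(proj_raster)
# instead of rf_realize_tile(proj_raster).
df.select(rf_tile('proj_raster')).show(3)
```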

core/src/main/scala/org/locationtech/rasterframes/util/package.scala

Lines changed: 22 additions & 0 deletions
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, NamedExpression}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.rf._
 import org.apache.spark.sql.types.StringType
 import org.apache.spark.sql._
@@ -183,6 +184,27 @@ package object util {
     }
   }

+  implicit class DFWithPrettyPrint(val df: Dataset[_]) extends AnyVal {
+    def toMarkdown(numRows: Int = 5, truncate: Boolean = false): String = {
+      import df.sqlContext.implicits._
+      val cols = df.columns
+      val header = cols.mkString("| ", " | ", " |") + "\n" + ("|---" * cols.length) + "|\n"
+      val stringifiers = cols
+        .map(c => s"`$c`")
+        .map(c => df.col(c).cast(StringType))
+        .map(c => if (truncate) substring(c, 1, 40) else c)
+      val cat = concat_ws(" | ", stringifiers: _*)
+      val body = df
+        .select(cat).limit(numRows)
+        .as[String]
+        .collect()
+        .map(_.replaceAll("\\[", "\\\\["))
+        .map(_.replace('\n', '↩'))
+        .mkString("| ", " |\n| ", " |")
+      header + body
+    }
+  }
+
   object Shims {
     // GT 1.2.1 to 2.0.0
     def toArrayTile[T <: CellGrid](tile: T): T =
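This implicit is the heart of the PR: it renders the first `numRows` rows of any `Dataset` as a Markdown table, building a pipe-delimited header, a `|---|` separator row, and string-cast (optionally 40-character-truncated) cells, and escaping `[` and embedded newlines so the output survives Markdown rendering. A rough Python transliteration of the same logic, for illustration only (the Scala implicit above is the actual implementation):

```python
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

def to_markdown(df: DataFrame, num_rows: int = 5, truncate: bool = False) -> str:
    cols = df.columns
    header = "| " + " | ".join(cols) + " |\n" + "|---" * len(cols) + "|\n"
    # Backtick-quote names (guards against dots), then cast to string.
    strs = [F.col("`%s`" % c).cast("string") for c in cols]
    if truncate:
        strs = [F.substring(c, 1, 40) for c in strs]
    rows = df.select(F.concat_ws(" | ", *strs)).limit(num_rows).collect()
    # Escape characters that would break a Markdown table.
    cells = [r[0].replace("[", "\\[").replace("\n", "\u21a9") for r in rows]
    return header + "| " + " |\n| ".join(cells) + " |"
```

Note that `concat_ws` silently skips null cells in both versions, so rows containing nulls will render with fewer columns than the header.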

core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala

Lines changed: 9 additions & 4 deletions
@@ -67,7 +67,7 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu
     // Not sure why implicit resolution isn't handling this properly.
     implicit val enc = Encoders.tuple(crsEncoder, Encoders.STRING, crsEncoder, Encoders.scalaDouble)
     val df = Seq((pe.crs, "fred", pe.crs, 34.0)).toDF("c1", "s", "c2", "n")
-    df.crsColumns.size should be (2)
+    df.crsColumns.size should be(2)
   }

   it("should split TileLayout") {
@@ -92,10 +92,10 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu

   it("should split key") {
     val s1 = SpatialKey(0, 0).subdivide(2)
-    assert(s1 === Seq(SpatialKey(0,0), SpatialKey(1,0), SpatialKey(0,1), SpatialKey(1,1)))
+    assert(s1 === Seq(SpatialKey(0, 0), SpatialKey(1, 0), SpatialKey(0, 1), SpatialKey(1, 1)))

     val s2 = SpatialKey(2, 3).subdivide(3)
-    assert(s2 === Seq(SpatialKey(6,9), SpatialKey(7,9), SpatialKey(8,9), SpatialKey(6,10), SpatialKey(7,10), SpatialKey(8,10), SpatialKey(6,11), SpatialKey(7,11), SpatialKey(8,11)))
+    assert(s2 === Seq(SpatialKey(6, 9), SpatialKey(7, 9), SpatialKey(8, 9), SpatialKey(6, 10), SpatialKey(7, 10), SpatialKey(8, 10), SpatialKey(6, 11), SpatialKey(7, 11), SpatialKey(8, 11)))
   }

   it("should split TileLayerMetadata[SpatialKey]") {
@@ -107,7 +107,12 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu

     val divided = tlm.subdivide(2)

-    assert(divided.tileLayout.tileDimensions === (tileSize/2, tileSize/2))
+    assert(divided.tileLayout.tileDimensions === (tileSize / 2, tileSize / 2))
+  }
+
+  it("should render Markdown") {
+    import org.locationtech.rasterframes.util._
+    rf.toMarkdown().count(_ == '|') shouldBe >=(3 * 5)
   }
  }
}

docs/build.sbt

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@ makePDF := {

   // Get the python source directory configured in the root project.
   val base = (Compile / paradox / sourceDirectories).value.find(_.toString.contains("python")).head
-  println(base)

   // Hard coded lacking any simple way of determining order.
   val files = Seq(

experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogDataSource.scala

Lines changed: 42 additions & 2 deletions
@@ -75,9 +75,49 @@ object MODISCatalogDataSource extends LazyLogging with ResourceCacheSupport {
   final val MCD43A4_BASE = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/"
   override def maxCacheFileAgeHours: Int = Int.MaxValue

-  // List of missing days
+  // List of missing days in PDS
   private val blacklist = Seq[String](
-    //"2018-05-06"
+    "2018-02-27",
+    "2018-02-28",
+    "2018-03-01",
+    "2018-03-02",
+    "2018-03-03",
+    "2018-03-04",
+    "2018-03-05",
+    "2018-03-06",
+    "2018-03-07",
+    "2018-03-08",
+    "2018-03-09",
+    "2018-03-10",
+    "2018-03-11",
+    "2018-03-12",
+    "2018-03-13",
+    "2018-03-14",
+    "2018-05-16",
+    "2018-05-17",
+    "2018-05-18",
+    "2018-05-19",
+    "2018-05-20",
+    "2018-05-21",
+    "2018-06-01",
+    "2018-06-04",
+    "2018-07-29",
+    "2018-08-03",
+    "2018-08-04",
+    "2018-08-05",
+    "2018-10-01",
+    "2018-10-02",
+    "2018-10-03",
+    "2018-10-22",
+    "2018-10-23",
+    "2018-11-12",
+    "2018-12-19",
+    "2018-12-20",
+    "2018-12-21",
+    "2018-12-22",
+    "2018-12-23",
+    "2018-12-24",
+    "2019-03-18"
   )

   private def sceneFiles(start: LocalDate, end: LocalDate, useBlacklist: Boolean) = {

pyrasterframes/src/main/python/docs/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -19,6 +19,9 @@
 #

 import os
+from pyspark.sql import DataFrame
+from pyrasterframes import RFContext
+from pweave import PwebPandocFormatter


 def docs_dir():
@@ -37,3 +40,12 @@ def resource_dir():

 def resource_dir_uri():
     return 'file://' + resource_dir()
+
+
+class PegdownMarkdownFormatter(PwebPandocFormatter):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    # Pegdown doesn't support the width and label options.
+    def make_figure_string(self, figname, width, label, caption=""):
+        return "![%s](%s)" % (caption, figname)

pyrasterframes/src/main/python/docs/aggregation.pymd

Lines changed: 20 additions & 13 deletions
@@ -1,6 +1,7 @@
 # Aggregation

 ```python, setup, echo=False
+from pyrasterframes import rf_ipython
 from docs import *
 from pyrasterframes.utils import create_rf_spark_session
 from pyrasterframes.rasterfunctions import *
@@ -16,7 +17,7 @@ There are 3 types of aggregate functions: _tile_ aggregate, DataFrame aggregate,

 We can illustrate these differences in computing an aggregate mean. First, we create a sample DataFrame of 2 _tiles_ where the first _tile_ is composed of 25 values of 1.0 and the second _tile_ is composed of 25 values of 3.0.

-```python, sql_dataframe
+```python, sql_dataframe, results='raw'
 import pyspark.sql.functions as F

 rf = spark.sql("""
@@ -25,38 +26,39 @@ UNION
 SELECT 2 as id, rf_local_multiply(rf_make_ones_tile(5, 5, 'float32'), 3) as tile
 """)

-rf.select("id", rf_render_matrix("tile")).show(10, False)
+rf.select("id", rf_render_matrix("tile")).show(truncate=False)
 ```

+
 In this code block, we are using the @ref:[`rf_tile_mean`](reference.md#rf-tile-mean) function to compute the _tile_ aggregate mean of cells in each row of column `tile`. The mean of each _tile_ is computed separately, so the first mean is 1.0 and the second mean is 3.0. Notice that the number of rows in the DataFrame is the same before and after the aggregation.

-```python, tile_mean
-rf.select(F.col('id'), rf_tile_mean(F.col('tile'))).show(10, False)
+```python, tile_mean, results='raw'
+rf.select(F.col('id'), rf_tile_mean(F.col('tile'))).show(truncate=False)
 ```

 In this code block, we are using the @ref:[`rf_agg_mean`](reference.md#rf-agg-mean) function to compute the DataFrame aggregate, which averages 25 values of 1.0 and 25 values of 3.0, across the fifty cells in two rows. Note that only a single row is returned since the average is computed over the full DataFrame.

-```python, agg_mean
-rf.agg(rf_agg_mean(F.col('tile'))).show(10, False)
+```python, agg_mean, results='raw'
+rf.agg(rf_agg_mean(F.col('tile'))).show()
 ```

 In this code block, we are using the @ref:[`rf_agg_local_mean`](reference.md#rf-agg-local-mean) function to compute the element-wise local aggregate mean across the two rows. In this example it is computing the mean of one value of 1.0 and one value of 3.0 to arrive at the element-wise mean, but doing so twenty-five times, once for each position in the _tile_.

 To compute an element-wise local aggregate, _tiles_ need to have the same dimensions, as in the example below where both _tiles_ have 5 rows and 5 columns. If we tried to compute an element-wise local aggregate over the DataFrame without equal _tile_ dimensions, we would get a runtime error.

-```python, local_mean
-rf.agg(rf_agg_local_mean(F.col('tile')).alias("local_mean")).select(rf_render_matrix("local_mean")).show(10, False)
+```python, local_mean, results='raw'
+rf.agg(rf_agg_local_mean(F.col('tile')).alias("local_mean")).select(rf_render_matrix("local_mean")).show(truncate=False)
 ```

 ## Cell Counts Example

 We can also count the total number of data and NoData cells over all the _tiles_ in a DataFrame using @ref:[`rf_agg_data_cells`](reference.md#rf-agg-data-cells) and @ref:[`rf_agg_no_data_cells`](reference.md#rf-agg-no-data-cells). There are 3,842,290 data cells and 1,941,734 NoData cells in this DataFrame. See the section on @ref:["NoData" handling](nodata-handling.md) for additional discussion on handling missing data.

-```python, cell_counts
+```python, cell_counts, results='raw'
 rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
 stats = rf.agg(rf_agg_data_cells('proj_raster'), rf_agg_no_data_cells('proj_raster'))

-stats.show(5, False)
+stats.show()
 ```

 ## Statistical Summaries
@@ -70,15 +72,18 @@ rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B0
 stats = rf.select(rf_tile_stats('proj_raster').alias('stats'))

 stats.printSchema()
-stats.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').show(10, False)
+```
+
+```python, show_stats, results='raw'
+stats.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').show(10, truncate=False)
 ```

 The @ref:[`rf_agg_stats`](reference.md#rf-agg-stats) function aggregates over all of the _tiles_ in a DataFrame and returns a statistical summary of all cell values as shown below.

-```python, agg_stats
+```python, agg_stats, results='raw'
 rf.agg(rf_agg_stats('proj_raster').alias('stats')) \
     .select('stats.min', 'stats.max', 'stats.mean', 'stats.variance') \
-    .show(10, False)
+    .show()
 ```

 The @ref:[`rf_agg_local_stats`](reference.md#rf-agg-local-stats) function computes the element-wise local aggregate statistical summary as shown below. The DataFrame used in the previous two code blocks has unequal _tile_ dimensions, so a different DataFrame is used in this code block to avoid a runtime error.
@@ -103,6 +108,7 @@ for r in agg_local_stats:

 The @ref:[`rf_tile_histogram`](reference.md#rf-tile-histogram) function computes a count of cell values within each row of _tile_ and outputs a `bins` array with the schema below. In the graph below, we have plotted `value` on the x-axis and `count` on the y-axis to create the histogram. There are 100 rows of _tile_ in this DataFrame, but this histogram is just computed for the _tile_ in the first row.

+
 ```python, tile_histogram
 import matplotlib.pyplot as plt

@@ -121,6 +127,7 @@ plt.show()

 The @ref:[`rf_agg_approx_histogram`](reference.md#rf-agg-approx-histogram) function computes a count of cell values across all of the rows of _tile_ in a DataFrame or group. In the example below, the range of the y-axis is significantly wider than the range of the y-axis on the previous histogram, since this histogram was computed for all cell values in the DataFrame.

+
 ```python, agg_histogram
 bins_list = rf.agg(
     rf_agg_approx_histogram('proj_raster')['bins'].alias('bins')
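The recurring `results='raw'` chunk option in these pymd changes is what surfaces the new Markdown tables: Pweave weaves a raw chunk's output into the document verbatim, so the Markdown table emitted by `show` (presumably via the `rf_ipython` import in the setup chunk, backed by the Scala `toMarkdown` above) renders as an actual table instead of a preformatted code block. A hedged sketch of the pattern, reusing the `rf` DataFrame from the chunks above (the chunk name `markdown_demo` is invented):

```python, markdown_demo, results='raw'
# With results='raw', the Markdown emitted below is woven into the
# document as a real table rather than preformatted text.
rf.select(rf_tile_mean('proj_raster')).show(truncate=False)
```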

pyrasterframes/src/main/python/docs/getting-started.pymd

Lines changed: 13 additions & 10 deletions
@@ -14,22 +14,29 @@ $ python3 -m pip install pyrasterframes

 Then in a python interpreter of your choice, you can get a [`pyspark` `SparkSession`](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession) using the [`local[*]` master](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls).

-```python
+
+```python, setup, echo=False
+from pyrasterframes import rf_ipython
+```
+
+```python, version
 import pyrasterframes
 spark = pyrasterframes.get_spark_session()
 ```

 Then, you can read a raster and work with it in a Spark DataFrame.

-```python
-from pyrasterframes.rasterfunctions import rf_local_add
+```python, local_add, results='raw'
+from pyrasterframes.rasterfunctions import *
 from pyspark.sql.functions import lit

 # Read a MODIS surface reflectance granule
 df = spark.read.raster('https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF')

 # Add 3 element-wise, show some rows of the DataFrame
-df.select(rf_local_add(df.proj_raster, lit(3))).show(5, False)
+df.withColumn('added', rf_local_add(df.proj_raster, lit(3))) \
+  .select(rf_crs('added'), rf_extent('added'), rf_tile('added')) \
+  .show(3)
 ```

 This example is extended in the [getting started Jupyter notebook](https://nbviewer.jupyter.org/github/locationtech/rasterframes/blob/develop/rf-notebook/src/main/notebooks/Getting%20Started.ipynb).
@@ -75,7 +82,7 @@ The `pyspark` shell command will look something like this.

 Then in the `pyspark` shell, import the module and call `withRasterFrames` on the SparkSession.

-```python, evaluate=False
+```python, shell, evaluate=False
 Welcome to
       ____              __
      / __/__  ___ _____/ /__
@@ -92,10 +99,6 @@ SparkSession available as 'spark'.

 Now you have the configured SparkSession with RasterFrames enabled.

-```python, echo=False
-spark.stop()
-```
-
 ## Installing GDAL

 GDAL provides a wide variety of drivers to read data from many different raster formats. If GDAL is installed in the environment, RasterFrames will be able to @ref:[read](raster-read.md) those formats. If you are using the @ref:[Jupyter Notebook image](getting-started.md#jupyter-notebook), GDAL is already installed for you. Otherwise follow the instructions below. Version 2.4.1 or greater is required.
@@ -130,7 +133,7 @@ To support GeoTIFF and JPEG2000 formats, you should look for the following driv

 Do the following to see if RasterFrames was able to find GDAL:

-```python, evaluate=False
+```python, gdal_version, evaluate=False
 from pyrasterframes.utils import gdal_version
 print(gdal_version())
 ```

pyrasterframes/src/main/python/docs/languages.pymd

Lines changed: 3 additions & 3 deletions
@@ -45,7 +45,7 @@ red_nir_tiles_monthly_2017 = spark.read.raster(

 ### Step 4: Compute aggregates

-```python, step_4_python
+```python, step_4_python, results='raw'
 result = red_nir_tiles_monthly_2017 \
     .where(st_intersects(
         st_reproject(rf_geometry(col('red')), rf_crs(col('red')).crsProj4, rf_mk_crs('EPSG:4326')),
@@ -75,7 +75,7 @@ sql("CREATE OR REPLACE TEMPORARY VIEW modis USING `aws-pds-modis-catalog`")

 ### Step 2: Down-select data by month

-```python, step_2_sql
+```python, step_2_sql, results='raw'
 sql("""
 CREATE OR REPLACE TEMPORARY VIEW red_nir_monthly_2017 AS
 SELECT granule_id, month(acquisition_date) as month, B01 as red, B02 as nir
@@ -101,7 +101,7 @@ OPTIONS (

 ### Step 4: Compute aggregates

-```python, step_4_sql
+```python, step_4_sql, results='raw'
 sql("""
 SELECT month, ndvi_stats.* FROM (
   SELECT month, rf_agg_stats(rf_normalized_difference(nir, red)) as ndvi_stats
