
Commit 37f6dc6

Incremental backup commit.

1 parent b821445 commit 37f6dc6

21 files changed: +253 -575 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -27,3 +27,4 @@ tour/*.tiff
 scoverage-report*
 
 zz-*
+rf-notebook/src/main/notebooks/.ipython
```

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -62,6 +62,6 @@ Additional, Python sepcific build instruction may be found at [pyrasterframes/sr
 
 ## Copyright and License
 
-RasterFrames is released under the Apache 2.0 License, copyright Astraea, Inc. 2017-2019.
+RasterFrames is released under the Apache 2.0 License, copyright Astraea, Inc. 2017-2020.
 
 
```

datasource/src/it/scala/org/locationtech/rasterframes/datasource/raster/RaterSourceDataSourceIT.scala

Lines changed: 1 addition & 1 deletion

```diff
@@ -31,7 +31,7 @@ class RaterSourceDataSourceIT extends TestEnvironment with TestData {
     // A regression test.
     val rf = spark.read.raster
       .withSpatialIndex()
-      .load("https://s22s-test-geotiffs.s3.amazonaws.com/water_class/seasonality_90W_50N.tif")
+      .load("https://rasterframes.s3.amazonaws.com/samples/water_class/seasonality_90W_50N.tif")
 
     val target_rf =
       rf.select(rf_extent($"proj_raster").alias("extent"), rf_crs($"proj_raster").alias("crs"), rf_tile($"proj_raster").alias("target"))
```

Lines changed: 50 additions & 0 deletions

```diff
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+
+log4j.logger.org.apache=ERROR
+log4j.logger.com.amazonaws=WARN
+log4j.logger.geotrellis=WARN
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark_project.jetty=WARN
+log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+log4j.logger.org.locationtech.rasterframes=DEBUG
+log4j.logger.org.locationtech.rasterframes.ref=DEBUG
+log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=OFF
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator=ERROR
+log4j.logger.org.apache.spark.sql.execution.WholeStageCodegenExec=ERROR
+log4j.logger.geotrellis.raster.gdal=ERROR
```
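The `log4j.logger.*` entries in this file form a hierarchy: a level set on a category such as `org.apache` applies to every descendant logger unless a more specific entry overrides it (as `org.apache.spark.repl.Main=WARN` does here). Python's standard `logging` module resolves effective levels the same way, so the cascade can be sketched like this (logger names borrowed from the config purely for illustration):

```python
import logging

# Mirror two of the log4j categories above in Python's logger hierarchy.
logging.getLogger("org.apache").setLevel(logging.ERROR)
logging.getLogger("org.apache.spark.repl.Main").setLevel(logging.WARNING)

# A child with no level of its own inherits from its nearest configured ancestor.
parquet_level = logging.getLogger("org.apache.parquet").getEffectiveLevel()

# A more specific category overrides the inherited one.
repl_level = logging.getLogger("org.apache.spark.repl.Main").getEffectiveLevel()
```

Here `parquet_level` resolves to `ERROR` (inherited from `org.apache`) while `repl_level` is `WARNING`, matching how log4j applies the categories above.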

docs/src/main/paradox/index.md

Lines changed: 13 additions & 11 deletions

```diff
@@ -29,18 +29,20 @@ The source code can be found on GitHub at [locationtech/rasterframes](https://gi
 
 ## Detailed Contents
 
-@@ toc { depth=4 }
+@@ toc { depth=3 }
 
 @@@ index
-* [Overview](description.md)
-* [Getting Started](getting-started.md)
-* [Concepts](concepts.md)
-* [Raster Data I/O](raster-io.md)
-* [Vector Data](vector-data.md)
-* [Raster Processing](raster-processing.md)
-* [Numpy and Pandas](numpy-pandas.md)
-* [Scala and SQL](languages.md)
-* [Function Reference](reference.md)
-* [Release Notes](release-notes.md)
+* @ref:[Overview](description.md)
+* @ref:[Getting Started](getting-started.md)
+* @ref:[Concepts](concepts.md)
+* @ref:[Raster Data I/O](raster-io.md)
+* @ref:[Vector Data](vector-data.md)
+* @ref:[Raster Processing](raster-processing.md)
+* @ref:[Machine Learning](machine-learning.md)
+* @ref:[Numpy and Pandas](numpy-pandas.md)
+* @ref:[IPython Extensions](ipython.md)
+* @ref:[Scala and SQL](languages.md)
+* @ref:[Function Reference](reference.md)
+* @ref:[Release Notes](release-notes.md)
 @@@
 
```

docs/src/main/paradox/raster-processing.md

Lines changed: 0 additions & 1 deletion

```diff
@@ -9,7 +9,6 @@
 * @ref:[Aggregation](aggregation.md)
 * @ref:[Time Series](time-series.md)
 * @ref:[Raster Join](raster-join.md)
-* @ref:[Machine Learning](machine-learning.md)
 
 @@@
 
```

docs/src/main/paradox/release-notes.md

Lines changed: 2 additions & 1 deletion

```diff
@@ -6,12 +6,13 @@
 
 * Upgraded to Spark 2.4.7
 * Added `pyspark.sql.DataFrame.display(num_rows, truncate)` extension method when `rf_ipython` is imported.
+* Added users' manual section on IPython display enhancements.
 * Added `method_name` parameter to the `rf_resample` method.
 * __BREAKING__: In SQL, the function `rf_resample` now takes 3 arguments. You can use `rf_resample_nearest` with two arguments or refactor to `rf_resample(t, v, "nearest")`.
 * Added resample method parameter to SQL and Python APIs. @ref:[See updated docs](raster-join.md).
 * Upgraded many of the pyrasterframes dependencies, including:
   `descartes`, `fiona`, `folium`, `geopandas`, `matplotlib`, `numpy`, `pandas`, `rasterio`, `shapely`
-
+* Changed `rasterframes.prefer-gdal` configuration parameter to default to `False`, as JVM GeoTIFF performs just as well for COGs as the GDAL one.
 
 ### 0.9.0
 
```
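The release notes above introduce `rf_resample(t, v, "nearest")` for nearest-neighbor resampling; the RasterFrames function reference is the authoritative source for its signature. As a conceptual sketch only — plain numpy, not the library call — nearest-neighbor resampling of a tile by a scale factor looks like:

```python
import numpy as np

def resample_nearest(tile: np.ndarray, factor: float) -> np.ndarray:
    """Rescale a 2-D cell grid by `factor`, picking the nearest source cell."""
    out_rows = int(tile.shape[0] * factor)
    out_cols = int(tile.shape[1] * factor)
    # Map each output index back to its nearest source index.
    rows = (np.arange(out_rows) / factor).astype(int)
    cols = (np.arange(out_cols) / factor).astype(int)
    return tile[np.ix_(rows, cols)]

src = np.array([[1, 2],
                [3, 4]])
up = resample_nearest(src, 2.0)  # each source cell becomes a 2x2 block
```

Upsampling by 2 turns each source cell into a 2x2 block of the same value, which is why "nearest" preserves categorical cell values (a property the other resampling methods do not share).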
pyrasterframes/src/main/python/docs/aggregation.pymd

Lines changed: 3 additions & 3 deletions

````diff
@@ -71,7 +71,7 @@ rf.agg(rf_agg_local_mean('tile')) \
 We can also count the total number of data and NoData cells over all the _tiles_ in a DataFrame using @ref:[`rf_agg_data_cells`](reference.md#rf-agg-data-cells) and @ref:[`rf_agg_no_data_cells`](reference.md#rf-agg-no-data-cells). There are ~3.8 million data cells and ~1.9 million NoData cells in this DataFrame. See the section on @ref:["NoData" handling](nodata-handling.md) for additional discussion on handling missing data.
 
 ```python, cell_counts
-rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
+rf = spark.read.raster('https://rasterframes.s3.amazonaws.com/samples/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
 stats = rf.agg(rf_agg_data_cells('proj_raster'), rf_agg_no_data_cells('proj_raster'))
 stats
 ```
@@ -83,7 +83,7 @@ The statistical summary functions return a summary of cell values: number of dat
 The @ref:[`rf_tile_stats`](reference.md#rf-tile-stats) function computes summary statistics separately for each row in a _tile_ column as shown below.
 
 ```python, tile_stats
-rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif')
+rf = spark.read.raster('https://rasterframes.s3.amazonaws.com/samples/luray_snp/B02.tif')
 stats = rf.select(rf_tile_stats('proj_raster').alias('stats'))
 
 stats.printSchema()
@@ -125,7 +125,7 @@ The @ref:[`rf_tile_histogram`](reference.md#rf-tile-histogram) function computes
 ```python, tile_histogram
 import matplotlib.pyplot as plt
 
-rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
+rf = spark.read.raster('https://rasterframes.s3.amazonaws.com/samples/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
 
 hist_df = rf.select(rf_tile_histogram('proj_raster')['bins'].alias('bins'))
 hist_df.printSchema()
````

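The `rf_tile_histogram` call in the hunk above returns `bins` of cell values with their counts, computed over data cells only. Outside Spark the same bookkeeping can be sketched with numpy — a conceptual analogy, not RasterFrames' implementation; the tile values and NoData sentinel here are made up:

```python
import numpy as np

# A small "tile" with a NoData sentinel, masked out before counting.
nodata = -9999
cells = np.array([[1, 2, 2, nodata],
                  [3, 2, 1, nodata]])
data = np.ma.masked_equal(cells, nodata).compressed()  # data cells only

# Bin as (value, count) pairs, like the histogram's `bins` column.
values, counts = np.unique(data, return_counts=True)
bins = list(zip(values.tolist(), counts.tolist()))
```

With these values, `bins` comes out as `[(1, 2), (2, 3), (3, 1)]` — the two NoData cells contribute to no bin.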
Lines changed: 82 additions & 0 deletions

````diff
@@ -0,0 +1,82 @@
+# IPython/Jupyter Extensions
+
+The `pyrasterframes.rf_ipython` module injects a number of visualization extensions into the IPython environment, enhancing visualization of `DataFrame`s and `Tile`s.
+
+By default, the last expression's result in a IPython cell is passed to the `IPython.display.display` function. This function in turn looks for a [`DisplayFormatter`](https://ipython.readthedocs.io/en/stable/api/generated/IPython.core.formatters.html#IPython.core.formatters.DisplayFormatter) associated with the type, which in turn converts the instance to a display-appropriate representation, based on MIME type. For example, each `DisplayFormatter` may `plain/text` version for the IPython shell, and a `text/html` version for a Jupyter Notebook.
+
+```python imports, echo=False, results='hidden'
+from pyrasterframes.all import *
+from pyspark.sql.functions import col
+spark = create_rf_spark_session()
+```
+
+## Initialize Sample
+
+First we read in a sample image as tiles:
+
+```python raster_read
+uri = 'https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/' \
+      'MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF'
+
+# here we flatten the projected raster structure
+df = spark.read.raster(uri) \
+    .withColumn('tile', rf_tile('proj_raster')) \
+    .withColumn('crs', rf_crs(col('proj_raster'))) \
+    .withColumn('extent', rf_extent(col('proj_raster'))) \
+    .drop('proj_raster')
+```
+
+Print the schema to confirm it's "shape":
+
+```python schema
+df.printSchema()
+```
+
+# Tile Display
+
+Let's look at a single tile. A `pyrasterframes.rf_types.Tile` will automatically render nicely in Jupyter or IPython.
+
+```python single_tile
+tile = df.select(df.tile).first()['tile']
+tile
+```
+
+If you access the tile's `cells` you get the underlying numpy ndarray (more specifically in this case, `numpy.ma.MaskedArray`).
+
+```python cells
+tile.cells
+```
+
+If you just want the string representation of the Tile, use `str`:
+
+```python tile_as_string
+str(tile)
+```
+
+## pyspark.sql.DataFrame Display
+
+There is also a capability for HTML rendering of the spark DataFrame. Rendering work is done on the JVM and the HTML string representation is provided for IPython to display.
+
+```python spark_dataframe
+df.select('tile', 'extent')
+```
+
+### Changing number of rows
+
+Because the `IPython.display.display` function doesn't accept any parameters, we have to provide a different means of passing parameters to the rendering code. Pandas does it with global settings via `set_option`/`get_option`. We take a more functional approach and have the user invoke an explicit `display` method:
+
+```python custom_display
+df.display(num_rows=1, truncate=True)
+```
+
+
+## pandas.DataFrame Display
+
+The same thing works for Pandas DataFrame if it contains a column of `Tile`s.
+
+```python pandas_dataframe
+# Limit copy of data from Spark to a few tiles.
+pandas_df = df.limit(4).toPandas()
+pandas_df.drop(['proj_raster_path'], axis=1)
+```
````

pyrasterframes/src/main/python/docs/local-algebra.pymd

Lines changed: 1 addition & 1 deletion

````diff
@@ -35,7 +35,7 @@ This form of `(x - y) / (x + y)` is common in remote sensing and is called a nor
 
 ```python, read_rasters
 from pyspark.sql import Row
-uri_pattern = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B0{}.tif'
+uri_pattern = 'https://rasterframes.s3.amazonaws.com/samples/luray_snp/B0{}.tif'
 catalog_df = spark.createDataFrame([
     Row(red=uri_pattern.format(4), nir=uri_pattern.format(8))
 ])
````
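The `(x - y) / (x + y)` form mentioned in the hunk header above is the normalized difference behind indices like NDVI, which this page computes from the red and NIR bands in the catalog. The arithmetic itself can be sketched with plain numpy — hypothetical band values, independent of the rasters referenced above:

```python
import numpy as np

def normalized_difference(nir, red):
    """Local algebra (x - y) / (x + y); with NIR and red bands this is NDVI."""
    nir = np.asarray(nir, dtype=float)
    red = np.asarray(red, dtype=float)
    return (nir - red) / (nir + red)

# Values near +1 suggest vegetation; near 0 or below, bare ground or water.
ndvi = normalized_difference([0.5, 0.6], [0.1, 0.2])
```

By construction the result is bounded in [-1, 1] whenever both bands are non-negative, which is what makes the normalized form comparable across scenes.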
