Commit 41fc07f

Modified files to build for Databricks runtime 11.3 LTS compliant versions
Modified the build and dependency files to target Databricks runtime 11.3 LTS compliant versions, as earlier runtimes will not be supported beyond March 2025. This allows Python 3.9 / Apache Spark 3.3.0 to be used as the minimum versions and brings in important updates to streaming (avoiding the need for version-specific unit tests in other commits).
1 parent 087bb02 commit 41fc07f
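
As context for the new version floor, here is a minimal sketch of a startup guard enforcing the Python 3.9 / Spark 3.3.0 minimums described above. It is not part of this commit, and the helper name is hypothetical:

    # Sketch only: enforce the DBR 11.3 LTS minimums (Python 3.9, Spark 3.3.0).
    # The helper name check_minimum_runtime is hypothetical, not project code.
    import sys
    import pyspark

    def check_minimum_runtime():
        if sys.version_info < (3, 9):
            raise RuntimeError(f"Python 3.9+ required, found {sys.version.split()[0]}")
        spark_version = tuple(int(p) for p in pyspark.__version__.split(".")[:3])
        if spark_version < (3, 3, 0):
            raise RuntimeError(f"Spark 3.3.0+ required, found {pyspark.__version__}")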

19 files changed (+94, -74 lines)

.github/workflows/push.yml

Lines changed: 2 additions & 2 deletions

@@ -31,10 +31,10 @@ jobs:
           sudo update-alternatives --set java /usr/lib/jvm/temurin-8-jdk-amd64/bin/java
           java -version
 
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9.21
         uses: actions/setup-python@v5
         with:
-          python-version: '3.8.12'
+          python-version: '3.9.21'
           cache: 'pipenv'
 
       - name: Check Python version

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions

@@ -24,10 +24,10 @@ jobs:
           sudo update-alternatives --set java /usr/lib/jvm/temurin-8-jdk-amd64/bin/java
           java -version
 
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9.21
         uses: actions/setup-python@v5
         with:
-          python-version: '3.8.12'
+          python-version: '3.9.21'
           cache: 'pipenv'
 
       - name: Check Python version

CHANGELOG.md

Lines changed: 7 additions & 0 deletions

@@ -8,6 +8,13 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 #### Fixed
 * Updated build scripts to use Ubuntu 22.04 to correspond to environment in Databricks runtime
 
+#### Changed
+* Changed base Databricks runtime version to DBR 11.3 LTS (based on Apache Spark 3.3.0)
+
+#### Added
+* Added support for serialization to/from JSON format
+
+
 ### Version 0.4.0 Hotfix 2
 
 #### Fixed
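
For the "serialization to/from JSON format" entry above, the general shape is a dict-based round trip like the generic sketch below. This is illustrative only and deliberately avoids naming dbldatagen's actual methods:

    # Generic illustration of the to-dict / from-dict JSON round trip that a
    # SerializableToDict-style mixin implies. This is NOT the dbldatagen API.
    import json

    class ExampleSpec:
        def __init__(self, name, minValue=0, maxValue=None):
            self.name, self.minValue, self.maxValue = name, minValue, maxValue

        def to_init_dict(self):
            # capture constructor arguments so the object can be rebuilt later
            return {"name": self.name, "minValue": self.minValue, "maxValue": self.maxValue}

        @classmethod
        def from_init_dict(cls, options):
            return cls(**options)

    spec = ExampleSpec("code1", minValue=1, maxValue=100)
    restored = ExampleSpec.from_init_dict(json.loads(json.dumps(spec.to_init_dict())))
    assert restored.maxValue == 100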

CONTRIBUTING.md

Lines changed: 7 additions & 10 deletions

@@ -19,10 +19,7 @@ Dependent packages are not installed automatically by the `dbldatagen` package.
 
 ## Python compatibility
 
-The code has been tested with Python 3.8.12 and later.
-
-Older releases were tested with Python 3.7.5 but as of this release, it requires the Databricks
-runtime 9.1 LTS or later.
+The code has been tested with Python 3.9.21 and later.
 
 ## Checking your code for common issues
 
@@ -46,7 +43,7 @@ Our recommended mechanism for building the code is to use a `conda` or `pipenv`
 But it can be built with any Python virtualization environment.
 
 ### Spark dependencies
-The builds have been tested against Spark 3.2.1. This requires the OpenJDK 1.8.56 or later version of Java 8.
+The builds have been tested against Spark 3.3.0. This requires the OpenJDK 1.8.56 or later version of Java 8.
 The Databricks runtimes use the Azul Zulu version of OpenJDK 8 and we have used these in local testing.
 These are not installed automatically by the build process, so you will need to install them separately.
 
@@ -75,7 +72,7 @@ To build with `pipenv`, perform the following commands:
 - Run `make dist` from the main project directory
 - The resulting wheel file will be placed in the `dist` subdirectory
 
-The resulting build has been tested against Spark 3.2.1
+The resulting build has been tested against Spark 3.3.0
 
 ## Creating the HTML documentation
 
@@ -161,19 +158,19 @@ See https://legacy.python.org/dev/peps/pep-0008/
 
 # Github expectations
 When running the unit tests on Github, the environment should use the same environment as the latest Databricks
-runtime latest LTS release. While compatibility is preserved on LTS releases from Databricks runtime 10.4 onwards,
+runtime latest LTS release. While compatibility is preserved on LTS releases from Databricks runtime 11.3 onwards,
 unit tests will be run on the environment corresponding to the latest LTS release.
 
-Libraries will use the same versions as the earliest supported LTS release - currently 10.4 LTS
+Libraries will use the same versions as the earliest supported LTS release - currently 11.3 LTS
 
 This means for the current build:
 
 - Use of Ubuntu 22.04 for the test runner
 - Use of Java 8
-- Use of Python 3.11
+- Use of Python 3.9.21 when testing / building the image
 
 See the following resources for more information
 - https://docs.databricks.com/en/release-notes/runtime/15.4lts.html
-- https://docs.databricks.com/en/release-notes/runtime/10.4lts.html
+- https://docs.databricks.com/en/release-notes/runtime/11.3lts.html
 - https://github.com/actions/runner-images/issues/10636

Pipfile

Lines changed: 8 additions & 8 deletions

@@ -10,7 +10,7 @@ sphinx = ">=2.0.0,<3.1.0"
 nbsphinx = "*"
 numpydoc = "==0.8"
 pypandoc = "*"
-ipython = "==7.31.1"
+ipython = "==7.32.0"
 pydata-sphinx-theme = "*"
 recommonmark = "*"
 sphinx-markdown-builder = "*"
 
@@ -19,13 +19,13 @@ prospector = "*"
 
 [packages]
 numpy = "==1.22.0"
-pyspark = "==3.1.3"
-pyarrow = "==4.0.1"
-wheel = "==0.38.4"
-pandas = "==1.2.4"
-setuptools = "==65.6.3"
-pyparsing = "==2.4.7"
+pyspark = "==3.3.0"
+pyarrow = "==7.0.0"
+wheel = "==0.37.0"
+pandas = "==1.3.4"
+setuptools = "==58.0.4"
+pyparsing = "==3.0.4"
 jmespath = "==0.10.0"
 
 [requires]
-python_version = "3.8.12"
+python_version = "3.9.21"
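
As a quick sanity check after creating an environment from this Pipfile, a small sketch (not part of the commit, purely illustrative) that verifies the active interpreter and key pins match:

    # Sketch: confirm the active environment matches the pins above.
    import platform
    import pandas
    import pyspark

    assert platform.python_version().startswith("3.9"), platform.python_version()
    assert pyspark.__version__ == "3.3.0", pyspark.__version__
    assert pandas.__version__ == "1.3.4", pandas.__version__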

README.md

Lines changed: 3 additions & 3 deletions

@@ -83,8 +83,8 @@ The documentation [installation notes](https://databrickslabs.github.io/dbldatag
 contains details of installation using alternative mechanisms.
 
 ## Compatibility
-The Databricks Labs Data Generator framework can be used with Pyspark 3.1.2 and Python 3.8 or later. These are
-compatible with the Databricks runtime 10.4 LTS and later releases. For full Unity Catalog support,
+The Databricks Labs Data Generator framework can be used with Pyspark 3.3.0 and Python 3.9.21 or later. These are
+compatible with the Databricks runtime 11.3 LTS and later releases. For full Unity Catalog support,
 we recommend using Databricks runtime 13.2 or later (Databricks 13.3 LTS or above preferred)
 
 For full library compatibility for a specific Databricks Spark release, see the Databricks
 
@@ -155,7 +155,7 @@ The GitHub repository also contains further examples in the examples directory.
 
 ## Spark and Databricks Runtime Compatibility
 The `dbldatagen` package is intended to be compatible with recent LTS versions of the Databricks runtime, including
-older LTS versions at least from 10.4 LTS and later. It also aims to be compatible with Delta Live Table runtimes,
+older LTS versions at least from 11.3 LTS and later. It also aims to be compatible with Delta Live Table runtimes,
 including `current` and `preview`.
 
 While we don't specifically drop support for older runtimes, changes in Pyspark APIs or

dbldatagen/column_generation_spec.py

Lines changed: 11 additions & 7 deletions

@@ -95,7 +95,7 @@ class ColumnGenerationSpec(SerializableToDict):
     # restrict spurious messages from java gateway
     logging.getLogger("py4j").setLevel(logging.WARNING)
 
-    def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix='', random=False,
+    def __init__(self, name, colType=None, *, minValue=0, maxValue=None, step=1, prefix='', random=False,
                  distribution=None, baseColumn=None, randomSeed=None, randomSeedMethod=None,
                  implicit=False, omit=False, nullable=True, debug=False, verbose=False,
                  seedColumnName=DEFAULT_SEED_COLUMN,
 
@@ -529,18 +529,22 @@ def _setup_logger(self):
         else:
             self.logger.setLevel(logging.WARNING)
 
-    def _computeAdjustedRangeForColumn(self, colType, c_min, c_max, c_step, c_begin, c_end, c_interval, c_range,
+    def _computeAdjustedRangeForColumn(self, colType, c_min, c_max, c_step, *, c_begin, c_end, c_interval, c_range,
                                        c_unique):
         """Determine adjusted range for data column
         """
         assert colType is not None, "`colType` must be non-None instance"
 
         if type(colType) is DateType or type(colType) is TimestampType:
-            return self._computeAdjustedDateTimeRangeForColumn(colType, c_begin, c_end, c_interval, c_range, c_unique)
+            return self._computeAdjustedDateTimeRangeForColumn(colType, c_begin, c_end, c_interval,
+                                                               c_range=c_range,
+                                                               c_unique=c_unique)
         else:
-            return self._computeAdjustedNumericRangeForColumn(colType, c_min, c_max, c_step, c_range, c_unique)
+            return self._computeAdjustedNumericRangeForColumn(colType, c_min, c_max, c_step,
+                                                              c_range=c_range,
+                                                              c_unique=c_unique)
 
-    def _computeAdjustedNumericRangeForColumn(self, colType, c_min, c_max, c_step, c_range, c_unique):
+    def _computeAdjustedNumericRangeForColumn(self, colType, c_min, c_max, c_step, *, c_range, c_unique):
         """Determine adjusted range for data column
 
         Rules:
 
@@ -589,7 +593,7 @@ def _computeAdjustedNumericRangeForColumn(self, colType, c_min, c_max, c_step, c
 
         return result
 
-    def _computeAdjustedDateTimeRangeForColumn(self, colType, c_begin, c_end, c_interval, c_range, c_unique):
+    def _computeAdjustedDateTimeRangeForColumn(self, colType, c_begin, c_end, c_interval, *, c_range, c_unique):
         """Determine adjusted range for Date or Timestamp data column
         """
         effective_begin, effective_end, effective_interval = None, None, None
 
@@ -656,7 +660,7 @@ def _getUniformRandomSQLExpression(self, col_name):
         else:
             return "rand()"
 
-    def _getScaledIntSQLExpression(self, col_name, scale, base_columns, base_datatypes=None, compute_method=None,
+    def _getScaledIntSQLExpression(self, col_name, scale, base_columns, *, base_datatypes=None, compute_method=None,
                                    normalize=False):
         """ Get scaled numeric expression
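
The `*` markers added in the signatures above make the trailing parameters keyword-only, so accidental positional calls now fail fast instead of silently binding to the wrong parameter. A standalone sketch of the effect (not dbldatagen code):

    # Standalone illustration of the keyword-only pattern introduced above.
    def make_column(name, colType=None, *, minValue=0, maxValue=None):
        return name, colType, minValue, maxValue

    make_column("code1", "int", minValue=1, maxValue=100)  # OK: options passed by keyword
    # make_column("code1", "int", 1, 100)  # TypeError: too many positional arguments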

dbldatagen/data_analyzer.py

Lines changed: 2 additions & 2 deletions

@@ -92,7 +92,7 @@ def _displayRow(self, row):
 
         return ", ".join(results)
 
-    def _addMeasureToSummary(self, measureName, summaryExpr="''", fieldExprs=None, dfData=None, rowLimit=1,
+    def _addMeasureToSummary(self, measureName, *, summaryExpr="''", fieldExprs=None, dfData=None, rowLimit=1,
                              dfSummary=None):
         """ Add a measure to the summary dataframe
 
@@ -340,7 +340,7 @@ def _generatorDefaultAttributesFromType(cls, sqlType, colName=None, dataSummary=
         return result
 
     @classmethod
-    def _scriptDataGeneratorCode(cls, schema, dataSummary=None, sourceDf=None, suppressOutput=False, name=None):
+    def _scriptDataGeneratorCode(cls, schema, *, dataSummary=None, sourceDf=None, suppressOutput=False, name=None):
         """
         Generate outline data generator code from an existing dataframe

dbldatagen/data_generator.py

Lines changed: 5 additions & 5 deletions

@@ -76,7 +76,7 @@ class DataGenerator(SerializableToDict):
 
     # logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.NOTSET)
 
-    def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
+    def __init__(self, sparkSession=None, name=None, *, randomSeedMethod=None,
                  rows=1000000, startingId=0, randomSeed=None, partitions=None, verbose=False,
                  batchSize=None, debug=False, seedColumnName=DEFAULT_SEED_COLUMN,
                  random=False,
 
@@ -782,7 +782,7 @@ def _checkColumnOrColumnList(self, columns, allowId=False):
                             f" column `{columns}` must refer to defined column")
         return True
 
-    def withColumnSpec(self, colName, minValue=None, maxValue=None, step=1, prefix=None,
+    def withColumnSpec(self, colName, *, minValue=None, maxValue=None, step=1, prefix=None,
                        random=None, distribution=None,
                        implicit=False, dataRange=None, omit=False, baseColumn=None, **kwargs):
         """ add a column specification for an existing column
 
@@ -842,7 +842,7 @@ def hasColumnSpec(self, colName):
         """
         return colName in self._columnSpecsByName
 
-    def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None, step=1,
+    def withColumn(self, colName, colType=StringType(), *, minValue=None, maxValue=None, step=1,
                    dataRange=None, prefix=None, random=None, distribution=None,
                    baseColumn=None, nullable=True,
                    omit=False, implicit=False, noWarn=False,
 
@@ -1058,7 +1058,7 @@ def withStructColumn(self, colName, fields=None, asJson=False, **kwargs):
 
         return newDf
 
-    def _generateColumnDefinition(self, colName, colType=None, baseColumn=None,
+    def _generateColumnDefinition(self, colName, colType=None, baseColumn=None, *,
                                   implicit=False, omit=False, nullable=True, **kwargs):
         """ generate field definition and column spec
 
@@ -1591,7 +1591,7 @@ def scriptTable(self, name=None, location=None, tableFormat="delta", asHtml=Fals
 
         return results
 
-    def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, joinExpr=None, timeExpr=None,
+    def scriptMerge(self, tgtName=None, srcName=None, *, updateExpr=None, delExpr=None, joinExpr=None, timeExpr=None,
                     insertExpr=None,
                     useExplicitNames=True,
                     updateColumns=None, updateColumnExprs=None,
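
After this change, option arguments to the public builder methods must be passed by keyword. A usage sketch in the style of the README examples (assumes an ambient `spark` session; the column options shown are illustrative):

    # Usage sketch: options such as minValue/maxValue are now keyword-only.
    import dbldatagen as dg

    ds = (
        dg.DataGenerator(sparkSession=spark, name="test_data", rows=1000, partitions=4)
        .withColumn("code1", "integer", minValue=100, maxValue=200)
        .withColumn("code2", "string", values=["a", "b", "c"], random=True)
    )
    df = ds.build()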

dbldatagen/text_generator_plugins.py

Lines changed: 2 additions & 2 deletions

@@ -69,7 +69,7 @@ class _FnCallContext:
     def __init__(self, txtGen):
         self.textGenerator = txtGen
 
-    def __init__(self, fn, init=None, initPerBatch=False, name=None, rootProperty=None):
+    def __init__(self, fn, *, init=None, initPerBatch=False, name=None, rootProperty=None):
         super().__init__()
         assert fn is not None or callable(fn), "Function must be provided with signature fn(context, oldValue)"
         assert init is None or callable(init), "Init function must be a callable function or lambda if passed"
 
@@ -284,7 +284,7 @@ class FakerTextFactory(PyfuncTextFactory):
 
     _defaultFakerTextFactory = None
 
-    def __init__(self, locale=None, providers=None, name="FakerText", lib=None,
+    def __init__(self, *, locale=None, providers=None, name="FakerText", lib=None,
                  rootClass=None):
 
         super().__init__(name)
