Feature html formatting (#208)

ronanstokes-db · web-flow · commit 80ca02b29849 · 2023-04-20T16:14:46.000-07:00
* wip

* wip

* wip

* wip

* wip
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,12 +3,19 @@
 ## Change History
 All notable changes to the Databricks Labs Data Generator will be documented in this file.
 
+### Unreleased
+
+#### Changed
+* Added optional formatting of generated code as HTML for the script methods (`scriptTable` and `scriptMerge`) via the new `asHtml` parameter
+
+
 ### Version 0.3.4 Post 2
 
 ### Fixed
 * Fix for use of values in columns of type array, map and struct 
 * Fix for generation of arrays via `numFeatures` and `structType` attributes when numFeatures has value of 1
 
+
 ### Version 0.3.4 Post 1
 
 ### Fixed
diff --git a/Pipfile b/Pipfile
@@ -25,6 +25,7 @@ wheel = "==0.38.4"
 pandas = "==1.2.4"
 setuptools = "==65.6.3"
 pyparsing = "==2.4.7"
+jmespath = "==0.10.0"
 
 [requires]
 python_version = ">=3.8.10"
diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py
@@ -27,7 +27,8 @@
 from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
                                RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION
 from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
-    deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins
+    deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
+    json_value_from_path, system_time_millis
 from ._version import __version__
 from .column_generation_spec import ColumnGenerationSpec
 from .column_spec_options import ColumnSpecOptions
@@ -40,11 +41,12 @@
 from .spark_singleton import SparkSingleton
 from .text_generators import TemplateGenerator, ILText, TextGenerator
 from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
+from .html_utils import HtmlUtils
 
 __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
            "column_generation_spec", "utils", "function_builder",
            "spark_singleton", "text_generators", "datarange", "datagen_constants",
-           "text_generator_plugins"
+           "text_generator_plugins", "html_utils"
            ]
 
 
diff --git a/dbldatagen/data_generator.py b/dbldatagen/data_generator.py
@@ -17,6 +17,7 @@
                                OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD
 
 from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition
+from .html_utils import HtmlUtils
 from . _version import _get_spark_version
 from .schema_parser import SchemaParser
 
@@ -1187,13 +1188,14 @@ def _mkInsertOrUpdateStatement(self, columns, srcAlias, substitutions, isUpdate=
 
         return ", ".join(results)
 
-    def scriptTable(self, name=None, location=None, tableFormat="delta"):
+    def scriptTable(self, name=None, location=None, tableFormat="delta", asHtml=False):
         """ generate create table script suitable for format of test data set
 
         :param name: name of table to use in generated script
         :param location: path to location of data. If specified (default is None), will generate
                          an external table definition.
         :param tableFormat: table format for table
+        :param asHtml: if true, generate output suitable for use with `displayHTML` method in notebook environment
         :returns: SQL string for scripted table
         """
         assert name is not None, "`name` must be specified"
@@ -1219,14 +1221,21 @@ def scriptTable(self, name=None, location=None, tableFormat="delta"):
         if location is not None:
             results.append(f"location '{location}'")
 
-        return "\n".join(results)
+        results = "\n".join(results)
+
+        if asHtml:
+            results = HtmlUtils.formatCodeAsHtml(results)
+
+        return results
 
     def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, joinExpr=None, timeExpr=None,
                     insertExpr=None,
                     useExplicitNames=True,
                     updateColumns=None, updateColumnExprs=None,
                     insertColumns=None, insertColumnExprs=None,
-                    srcAlias="src", tgtAlias="tgt"):
+                    srcAlias="src", tgtAlias="tgt",
+                    asHtml=False
+                    ):
         """ generate merge table script suitable for format of test data set
 
         :param tgtName: name of target table to use in generated script
@@ -1253,6 +1262,7 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
             By default, will use src column as update value for
             target table. This should have the form [ ("update_column_name", "update column expr"), ...]
         :param useExplicitNames: If True, generate explicit column names in insert and update statements
+        :param asHtml: if true, generate output suitable for use with `displayHTML` method in notebook environment
         :returns: SQL string for scripted merge statement
         """
         assert tgtName is not None, "you must specify a target table"
@@ -1327,4 +1337,9 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
 
         results.append(ins_clause)
 
-        return "\n".join(results)
+        result = "\n".join(results)
+        # wrap the joined SQL text (not the list of statement fragments) when Html output is requested
+        if asHtml:
+            result = HtmlUtils.formatCodeAsHtml(result)
+
+        return result
diff --git a/dbldatagen/html_utils.py b/dbldatagen/html_utils.py
@@ -0,0 +1,118 @@
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+This file defines the `HtmlUtils` classes and utility functions
+"""
+
+import html
+
+from .utils import system_time_millis
+
+
+class HtmlUtils:
+    """ Utility class for formatting code as HTML and other notebook related formatting
+
+    All formatting methods are class methods, so no instance is needed to use them.
+    """
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def formatCodeAsHtml(cls, codeText):
+        """ Formats supplied code as Html suitable for use with notebook ``displayHTML``
+
+        :param codeText: Code to be wrapped in html section
+        :return: Html string
+
+        This will wrap the code with a html section using html ``pre`` and ``code`` tags.
+
+        The code is HTML escaped before it is embedded, so characters such as ``<``, ``>`` and ``&``
+        (common in SQL expressions) are shown and copied literally instead of being parsed as markup.
+
+        It adds a copy text to clipboard button to enable users to easily copy the code to the clipboard.
+
+        It does not reformat code so supplied code should be preformatted into lines.
+
+        .. note::
+            As the notebook environment uses IFrames in rendering html within ``displayHTML``, it cannot use
+            the newer ``navigator`` based functionality as this is blocked for cross domain IFrames by default.
+
+        """
+        ts = system_time_millis()
+
+        # quote=False - the text only appears as element content, never inside an attribute value
+        escapedCode = html.escape(str(codeText), quote=False)
+
+        formattedCode = f"""
+            <h3>Generated Code</h3>
+            <div style="outline: 1px dashed blue;"><p ><pre><code id="generated_code_{ts}"> 
+              {escapedCode}
+            </code></pre></p></br>
+            </div>
+            <p><button type="button" onclick="dbldatagen_copy_code_to_clipboard()">Copy code to clipboard!</button> </p>
+            <script>
+            function dbldatagen_copy_code_to_clipboard() {{
+               try {{
+                 var r = document.createRange();
+                 r.selectNode(document.getElementById("generated_code_{ts}"));
+                 window.getSelection().removeAllRanges();
+                 window.getSelection().addRange(r);
+                 document.execCommand('copy');
+                 window.getSelection().removeAllRanges();
+               }}
+               catch {{
+                 console.error("copy to clipboard failed")
+               }}
+            }}
+        </script>
+        """
+
+        return formattedCode
+
+    @classmethod
+    def formatTextAsHtml(cls, textContent, title="Output"):
+        """ Formats supplied text as Html suitable for use with notebook ``displayHTML``
+
+        :param textContent: Text to be wrapped in html section
+        :param title: Title text to be used
+        :return: Html string
+
+        This will wrap the text content with Html formatting, using a html ``pre`` tag.
+
+        Both the text content and the title are HTML escaped before they are embedded.
+
+        """
+        ts = system_time_millis()
+
+        # quote=False - the values only appear as element content, never inside an attribute value
+        escapedTitle = html.escape(str(title), quote=False)
+        escapedContent = html.escape(str(textContent), quote=False)
+
+        formattedContent = f"""
+            <h3>{escapedTitle}</h3>
+            <div style="outline: 1px dashed blue;"><p ><pre id="generated_content_{ts}"> 
+              {escapedContent}
+            </pre></p></br>
+            </div>
+            <p><button type="button" onclick="dbldatagen_copy_to_clipboard()">Copy output to clipboard!</button></p>
+            <script>
+            function dbldatagen_copy_to_clipboard() {{
+               try {{
+                 var r = document.createRange();
+                 r.selectNode(document.getElementById("generated_content_{ts}"));
+                 window.getSelection().removeAllRanges();
+                 window.getSelection().addRange(r);
+                 document.execCommand('copy');
+                 window.getSelection().removeAllRanges();
+               }}
+               catch {{
+                 console.error("copy to clipboard failed")
+               }}
+            }}
+        </script>
+        """
+
+        return formattedContent
diff --git a/dbldatagen/utils.py b/dbldatagen/utils.py
@@ -12,6 +12,9 @@
 import warnings
 from datetime import timedelta
 import re
+import json
+import time
+import jmespath
 
 
 def deprecated(message=""):
@@ -321,3 +324,36 @@ def match_condition(matchList, matchFn):
 
     # filter out empty lists
     return [el for el in retval if el != []]
+
+
+def json_value_from_path(searchPath, jsonData, defaultValue):
+    """ Extract a value from JSON text using a `jmespath` search path
+
+    The JSON text is parsed with `json.loads` and the resulting structure is queried
+    with `jmespath.search` (see https://jmespath.org/ for the path syntax).
+
+    :param searchPath: Non-empty `jmespath` compatible search path
+    :param jsonData: Non-empty string representation of the JSON data to search
+    :param defaultValue: Value to return when the search yields `None`
+    :return: The value found at the search path, otherwise the default value
+    """
+    assert searchPath is not None and len(searchPath) > 0, "search path cannot be empty"
+    assert jsonData is not None and len(jsonData) > 0, "JSON data cannot be empty"
+
+    # parse the JSON text, then query the parsed structure
+    parsedData = json.loads(jsonData)
+    matchedValue = jmespath.search(searchPath, parsedData)
+
+    # a search result of `None` is treated as "not found"
+    if matchedValue is None:
+        return defaultValue
+    return matchedValue
+
+
+def system_time_millis():
+    """ return system time as milliseconds since start of epoch
+
+    :return: system time in milliseconds as an integer (``time.time()`` seconds scaled by 1000)
+    """
+    curr_time = round(time.time() * 1000)
+    return curr_time
diff --git a/docs/utils/mk_quick_index.py b/docs/utils/mk_quick_index.py
@@ -43,6 +43,8 @@
                            "grouping": "internal classes"},
     "utils.py": {"briefDesc": "",
                  "grouping": "internal classes"},
+    "html_utils.py": {"briefDesc": "",
+                 "grouping": "internal classes"},
 
     "beta.py": {"briefDesc": "Beta distribution related code",
                 "grouping": "data distribution"},
diff --git a/python/dev_require.txt b/python/dev_require.txt
@@ -9,6 +9,7 @@ pyspark>=3.1.3
 python-dateutil==2.8.1
 six==1.15.0
 pyparsing==2.4.7
+jmespath==0.10.0
 
 # The following packages are required for development only
 wheel==0.36.2
diff --git a/python/require.txt b/python/require.txt
@@ -9,6 +9,7 @@ pyspark>=3.1.3
 python-dateutil==2.8.1
 six==1.15.0
 pyparsing==2.4.7
+jmespath==0.10.0
 
 # The following packages are required for development only
 wheel==0.36.2
@@ -31,3 +32,5 @@ recommonmark
 sphinx-markdown-builder
 rst2pdf==0.98
 Jinja2 < 3.1
+sphinx-copybutton
+
diff --git a/tests/test_html_utils.py b/tests/test_html_utils.py
@@ -0,0 +1,33 @@
+import pytest
+
+from dbldatagen import HtmlUtils, SparkSingleton
+
+spark = SparkSingleton.getLocalInstance("unit tests")
+
+
+class TestHtmlUtils:
+    """ Tests that the `HtmlUtils` formatters embed the supplied content in the generated Html """
+    @pytest.mark.parametrize("content",
+                             ["""
+                                for x in range(10):
+                                    print(x)
+                             """]
+                             )
+    def test_html_format_code(self, content):
+        formattedContent = HtmlUtils.formatCodeAsHtml(content)
+        assert formattedContent is not None
+        assert content in formattedContent
+
+    @pytest.mark.parametrize("content, heading",
+                             [("""
+                                this is a test 
+                                this is another one
+                             """, "testing"
+                               )])
+    def test_html_format_content(self, content, heading):
+        formattedContent = HtmlUtils.formatTextAsHtml(content, title=heading)
+        # the formatter must return a value ...
+        assert formattedContent is not None, "formatted output is None"
+        # ... and that value must contain both the original text and the heading
+        assert content in formattedContent, "original content missing"
+        assert heading in formattedContent, "heading missing from content"
diff --git a/tests/test_quick_tests.py b/tests/test_quick_tests.py
diff --git a/tests/test_utils.py b/tests/test_utils.py