Skip to content

Commit 80ca02b

Browse files
Feature html formatting (#208)
* wip * wip * wip * wip * wip
1 parent 9e58aaa commit 80ca02b

File tree

12 files changed

+322
-123
lines changed

12 files changed

+322
-123
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,19 @@
33
## Change History
44
All notable changes to the Databricks Labs Data Generator will be documented in this file.
55

6+
### Unreleased
7+
8+
#### Changed
9+
* Added optional `asHtml` argument to the `scriptTable` and `scriptMerge` methods to format the generated code as HTML
10+
11+
612
### Version 0.3.4 Post 2
713

814
### Fixed
915
* Fix for use of values in columns of type array, map and struct
1016
* Fix for generation of arrays via `numFeatures` and `structType` attributes when numFeatures has value of 1
1117

18+
1219
### Version 0.3.4 Post 1
1320

1421
### Fixed

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ wheel = "==0.38.4"
2525
pandas = "==1.2.4"
2626
setuptools = "==65.6.3"
2727
pyparsing = "==2.4.7"
28+
jmespath = "==0.10.0"
2829

2930
[requires]
3031
python_version = ">=3.8.10"

dbldatagen/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
2828
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION
2929
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
30-
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins
30+
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
31+
json_value_from_path, system_time_millis
3132
from ._version import __version__
3233
from .column_generation_spec import ColumnGenerationSpec
3334
from .column_spec_options import ColumnSpecOptions
@@ -40,11 +41,12 @@
4041
from .spark_singleton import SparkSingleton
4142
from .text_generators import TemplateGenerator, ILText, TextGenerator
4243
from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
44+
from .html_utils import HtmlUtils
4345

4446
__all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
4547
"column_generation_spec", "utils", "function_builder",
4648
"spark_singleton", "text_generators", "datarange", "datagen_constants",
47-
"text_generator_plugins"
49+
"text_generator_plugins", "html_utils"
4850
]
4951

5052

dbldatagen/data_generator.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD
1818

1919
from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition
20+
from .html_utils import HtmlUtils
2021
from . _version import _get_spark_version
2122
from .schema_parser import SchemaParser
2223

@@ -1187,13 +1188,14 @@ def _mkInsertOrUpdateStatement(self, columns, srcAlias, substitutions, isUpdate=
11871188

11881189
return ", ".join(results)
11891190

1190-
def scriptTable(self, name=None, location=None, tableFormat="delta"):
1191+
def scriptTable(self, name=None, location=None, tableFormat="delta", asHtml=False):
11911192
""" generate create table script suitable for format of test data set
11921193
11931194
:param name: name of table to use in generated script
11941195
:param location: path to location of data. If specified (default is None), will generate
11951196
an external table definition.
11961197
:param tableFormat: table format for table
1198+
:param asHtml: if true, generate output suitable for use with `displayHTML` method in notebook environment
11971199
:returns: SQL string for scripted table
11981200
"""
11991201
assert name is not None, "`name` must be specified"
@@ -1219,14 +1221,21 @@ def scriptTable(self, name=None, location=None, tableFormat="delta"):
12191221
if location is not None:
12201222
results.append(f"location '{location}'")
12211223

1222-
return "\n".join(results)
1224+
results = "\n".join(results)
1225+
1226+
if asHtml:
1227+
results = HtmlUtils.formatCodeAsHtml(results)
1228+
1229+
return results
12231230

12241231
def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, joinExpr=None, timeExpr=None,
12251232
insertExpr=None,
12261233
useExplicitNames=True,
12271234
updateColumns=None, updateColumnExprs=None,
12281235
insertColumns=None, insertColumnExprs=None,
1229-
srcAlias="src", tgtAlias="tgt"):
1236+
srcAlias="src", tgtAlias="tgt",
1237+
asHtml=False
1238+
):
12301239
""" generate merge table script suitable for format of test data set
12311240
12321241
:param tgtName: name of target table to use in generated script
@@ -1253,6 +1262,7 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
12531262
By default, will use src column as update value for
12541263
target table. This should have the form [ ("update_column_name", "update column expr"), ...]
12551264
:param useExplicitNames: If True, generate explicit column names in insert and update statements
1265+
:param asHtml: if true, generate output suitable for use with `displayHTML` method in notebook environment
12561266
:returns: SQL string for scripted merge statement
12571267
"""
12581268
assert tgtName is not None, "you must specify a target table"
@@ -1327,4 +1337,9 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
13271337

13281338
results.append(ins_clause)
13291339

1330-
return "\n".join(results)
1340+
result = "\n".join(results)
1341+
1342+
if asHtml:
1343+
# format the joined script text (`result`), not the list of clauses (`results`)
result = HtmlUtils.formatCodeAsHtml(result)
1344+
1345+
return result

dbldatagen/html_utils.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# See the License for the specific language governing permissions and
2+
# limitations under the License.
3+
#
4+
5+
"""
6+
This file defines the `HtmlUtils` classes and utility functions
7+
"""
8+
9+
import html

from .utils import system_time_millis
10+
11+
12+
class HtmlUtils:
    """ Utility class for formatting code as HTML and other notebook related formatting

    The formatting methods are class methods, so no instance is needed to use them.

    Text supplied by the caller is HTML-escaped before it is embedded in the generated markup, so
    characters such as ``<``, ``>`` and ``&`` are displayed (and copied) literally rather than being
    interpreted as markup.
    """

    def __init__(self):
        pass

    @classmethod
    def _wrapAsHtml(cls, heading, contentHtml, elementId, copyFunction, buttonLabel):
        """ Wrap a content element in the common heading, outline and copy button markup

        :param heading: Heading text, already HTML-escaped
        :param contentHtml: Html for the content element, already escaped. It must contain
                            the element whose id is ``elementId``
        :param elementId: id of the element whose contents the copy button selects
        :param copyFunction: Name of the javascript function generated for the copy button
        :param buttonLabel: Label displayed on the copy button
        :return: Html string
        """
        lines = [
            "",
            f"<h3>{heading}</h3>",
            f'<div style="outline: 1px dashed blue;"><p >{contentHtml}</p></br>',
            "</div>",
            f'<p><button type="button" onclick="{copyFunction}()">{buttonLabel}</button></p>',
            "<script>",
            f"function {copyFunction}() {{",
            "  try {",
            "    var r = document.createRange();",
            f'    r.selectNode(document.getElementById("{elementId}"));',
            "    window.getSelection().removeAllRanges();",
            "    window.getSelection().addRange(r);",
            "    document.execCommand('copy');",
            "    window.getSelection().removeAllRanges();",
            "  }",
            # explicit binding: `catch {` without a parameter is only valid from ES2019 onwards
            "  catch (err) {",
            '    console.error("copy to clipboard failed")',
            "  }",
            "}",
            "</script>",
            "",
        ]
        return "\n".join(lines)

    @classmethod
    def formatCodeAsHtml(cls, codeText):
        """ Formats supplied code as Html suitable for use with notebook ``displayHTML``

        :param codeText: Code to be wrapped in html section
        :return: Html string

        This will wrap the code with a html section using html ``pre`` and ``code`` tags.

        It adds a copy text to clipboard button to enable users to easily copy the code to the clipboard.

        It does not reformat code so supplied code should be preformatted into lines. The code is
        HTML-escaped so that characters such as ``<`` and ``&`` are displayed as written.

        .. note::
           As the notebook environment uses IFrames in rendering html within ``displayHTML``, it cannot use
           the newer ``navigator`` based functionality as this is blocked for cross domain IFrames by default.

        """
        ts = system_time_millis()
        elementId = f"generated_code_{ts}"

        # str() keeps non-string input working as it did when interpolated directly into the f-string;
        # quote=False leaves quote characters untouched as they are harmless in element content
        escapedCode = html.escape(str(codeText), quote=False)
        contentHtml = f'<pre><code id="{elementId}">\n{escapedCode}\n</code></pre>'

        return cls._wrapAsHtml("Generated Code", contentHtml, elementId,
                               "dbldatagen_copy_code_to_clipboard", "Copy code to clipboard!")

    @classmethod
    def formatTextAsHtml(cls, textContent, title="Output"):
        """ Formats supplied text as Html suitable for use with notebook ``displayHTML``

        :param textContent: Text to be wrapped in html section
        :param title: Title text to be used
        :return: Html string

        This will wrap the text content with Html formatting. Both the text content and the title
        are HTML-escaped before being embedded.

        """
        ts = system_time_millis()
        elementId = f"generated_content_{ts}"

        escapedTitle = html.escape(str(title), quote=False)
        escapedContent = html.escape(str(textContent), quote=False)
        contentHtml = f'<pre id="{elementId}">\n{escapedContent}\n</pre>'

        return cls._wrapAsHtml(escapedTitle, contentHtml, elementId,
                               "dbldatagen_copy_to_clipboard", "Copy output to clipboard!")

dbldatagen/utils.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
import warnings
1313
from datetime import timedelta
1414
import re
15+
import json
16+
import time
17+
import jmespath
1518

1619

1720
def deprecated(message=""):
@@ -321,3 +324,36 @@ def match_condition(matchList, matchFn):
321324

322325
# filter out empty lists
323326
return [el for el in retval if el != []]
327+
328+
329+
def json_value_from_path(searchPath, jsonData, defaultValue):
    """ Look up a value in JSON text using a `jmespath` search expression

    The search path uses the syntax supported by the `jmespath` package
    (see https://jmespath.org/)

    :param searchPath: A `jmespath` compatible JSON search path, must be non-empty
    :param jsonData: String representation of the JSON data to search, must be non-empty
    :param defaultValue: Value returned when the search yields `None`
    :return: The value located by the search path, or `defaultValue` if the search result is `None`
    """
    assert searchPath is not None and len(searchPath) > 0, "search path cannot be empty"
    assert jsonData is not None and len(jsonData) > 0, "JSON data cannot be empty"

    # parse the JSON text, then evaluate the search expression against the parsed data
    found = jmespath.search(searchPath, json.loads(jsonData))

    return defaultValue if found is None else found
351+
352+
353+
def system_time_millis():
    """ return system time as milliseconds since start of epoch

    :return: system time millis as an integer
    """
    # time.time() is in (fractional) seconds, so scale up by 1000 to get milliseconds
    curr_time = round(time.time() * 1000)
    return curr_time

docs/utils/mk_quick_index.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
"grouping": "internal classes"},
4444
"utils.py": {"briefDesc": "",
4545
"grouping": "internal classes"},
46+
"html_utils.py": {"briefDesc": "",
47+
"grouping": "internal classes"},
4648

4749
"beta.py": {"briefDesc": "Beta distribution related code",
4850
"grouping": "data distribution"},

python/dev_require.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pyspark>=3.1.3
99
python-dateutil==2.8.1
1010
six==1.15.0
1111
pyparsing==2.4.7
12+
jmespath==0.10.0
1213

1314
# The following packages are required for development only
1415
wheel==0.36.2

python/require.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pyspark>=3.1.3
99
python-dateutil==2.8.1
1010
six==1.15.0
1111
pyparsing==2.4.7
12+
jmespath==0.10.0
1213

1314
# The following packages are required for development only
1415
wheel==0.36.2
@@ -31,3 +32,5 @@ recommonmark
3132
sphinx-markdown-builder
3233
rst2pdf==0.98
3334
Jinja2 < 3.1
35+
sphinx-copybutton
36+

tests/test_html_utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import pytest
2+
3+
from dbldatagen import HtmlUtils, SparkSingleton
4+
5+
spark = SparkSingleton.getLocalInstance("unit tests")
6+
7+
8+
class TestHtmlUtils:
9+
10+
@pytest.mark.parametrize("content",
11+
["""
12+
for x in range(10):
13+
print(x)
14+
"""]
15+
)
16+
def test_html_format_code(self, content):
17+
formattedContent = HtmlUtils.formatCodeAsHtml(content)
18+
assert formattedContent is not None
19+
assert content in formattedContent
20+
21+
@pytest.mark.parametrize("content, heading",
22+
[("""
23+
this is a test
24+
this is another one
25+
""", "testing"
26+
)])
27+
def test_html_format_content(self, content, heading):
28+
formattedContent = HtmlUtils.formatTextAsHtml(content, title=heading)
29+
30+
assert formattedContent is not None, "formatted output is None"
31+
32+
assert content in formattedContent, "original content missing"
33+
assert heading in formattedContent, "heading missing from content"

0 commit comments

Comments
 (0)