Skip to content

Commit 3479083

Browse files
committed
XML Reader and Reader2Doc Improvements (#14691)
* Reader2Doc: new defaults so that a single document is always output
* XMLReader improvements
  - no longer outputs empty text
  - can extract tag attribute values
* Reader2Doc improvements
  - adjusted defaults so that a single large document is always output
  - the join string can be specified with a new parameter
  - adjusted other readers for the new defaults
* Reader2Doc improvements on the Python side
* ReaderAssembler: fix failing test
1 parent 8537c95 commit 3479083

File tree

13 files changed

+518
-63
lines changed

13 files changed

+518
-63
lines changed

python/sparknlp/partition/partition_properties.py

Lines changed: 146 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818

1919
class HasReaderProperties(Params):
20-
2120
inputCol = Param(
2221
Params._dummy(),
2322
"inputCol",
@@ -245,8 +244,8 @@ def setOutputAsDocument(self, value):
245244
"""
246245
return self._set(outputAsDocument=value)
247246

248-
class HasEmailReaderProperties(Params):
249247

248+
class HasEmailReaderProperties(Params):
250249
addAttachmentContent = Param(
251250
Params._dummy(),
252251
"addAttachmentContent",
@@ -278,7 +277,6 @@ def getAddAttachmentContent(self):
278277

279278

280279
class HasExcelReaderProperties(Params):
281-
282280
cellSeparator = Param(
283281
Params._dummy(),
284282
"cellSeparator",
@@ -337,8 +335,8 @@ def getAppendCells(self):
337335
"""
338336
return self.getOrDefault(self.appendCells)
339337

340-
class HasHTMLReaderProperties(Params):
341338

339+
class HasHTMLReaderProperties(Params):
342340
timeout = Param(
343341
Params._dummy(),
344342
"timeout",
@@ -395,8 +393,8 @@ def setOutputFormat(self, value: str):
395393
"""
396394
return self._set(outputFormat=value)
397395

398-
class HasPowerPointProperties(Params):
399396

397+
class HasPowerPointProperties(Params):
400398
includeSlideNotes = Param(
401399
Params._dummy(),
402400
"includeSlideNotes",
@@ -426,8 +424,8 @@ def getIncludeSlideNotes(self):
426424
"""
427425
return self.getOrDefault(self.includeSlideNotes)
428426

429-
class HasTextReaderProperties(Params):
430427

428+
class HasTextReaderProperties(Params):
431429
titleLengthSize = Param(
432430
Params._dummy(),
433431
"titleLengthSize",
@@ -436,9 +434,28 @@ class HasTextReaderProperties(Params):
436434
)
437435

438436
def setTitleLengthSize(self, value):
437+
"""Set the maximum character length used to identify title blocks.
438+
439+
Parameters
440+
----------
441+
value : int
442+
Maximum number of characters a text block can have to be considered a title.
443+
444+
Returns
445+
-------
446+
self
447+
The instance with updated `titleLengthSize` parameter.
448+
"""
439449
return self._set(titleLengthSize=value)
440450

441451
def getTitleLengthSize(self):
452+
"""Get the configured maximum title length.
453+
454+
Returns
455+
-------
456+
int
457+
The maximum character length used to detect title blocks.
458+
"""
442459
return self.getOrDefault(self.titleLengthSize)
443460

444461
groupBrokenParagraphs = Param(
@@ -449,9 +466,28 @@ def getTitleLengthSize(self):
449466
)
450467

451468
def setGroupBrokenParagraphs(self, value):
469+
"""Enable or disable grouping of broken paragraphs.
470+
471+
Parameters
472+
----------
473+
value : bool
474+
True to merge fragmented lines into paragraphs, False to leave lines as-is.
475+
476+
Returns
477+
-------
478+
self
479+
The instance with updated `groupBrokenParagraphs` parameter.
480+
"""
452481
return self._set(groupBrokenParagraphs=value)
453482

454483
def getGroupBrokenParagraphs(self):
484+
"""Get whether broken paragraph grouping is enabled.
485+
486+
Returns
487+
-------
488+
bool
489+
True if grouping of broken paragraphs is enabled, False otherwise.
490+
"""
455491
return self.getOrDefault(self.groupBrokenParagraphs)
456492

457493
paragraphSplit = Param(
@@ -462,9 +498,28 @@ def getGroupBrokenParagraphs(self):
462498
)
463499

464500
def setParagraphSplit(self, value):
501+
"""Set the regex pattern used to split paragraphs when grouping broken paragraphs.
502+
503+
Parameters
504+
----------
505+
value : str
506+
Regular expression string used to detect paragraph boundaries.
507+
508+
Returns
509+
-------
510+
self
511+
The instance with updated `paragraphSplit` parameter.
512+
"""
465513
return self._set(paragraphSplit=value)
466514

467515
def getParagraphSplit(self):
516+
"""Get the paragraph-splitting regex pattern.
517+
518+
Returns
519+
-------
520+
str
521+
The regex pattern used to detect paragraph boundaries.
522+
"""
468523
return self.getOrDefault(self.paragraphSplit)
469524

470525
shortLineWordThreshold = Param(
@@ -475,9 +530,28 @@ def getParagraphSplit(self):
475530
)
476531

477532
def setShortLineWordThreshold(self, value):
533+
"""Set the maximum word count for a line to be considered short.
534+
535+
Parameters
536+
----------
537+
value : int
538+
Number of words under which a line is considered 'short'.
539+
540+
Returns
541+
-------
542+
self
543+
The instance with updated `shortLineWordThreshold` parameter.
544+
"""
478545
return self._set(shortLineWordThreshold=value)
479546

480547
def getShortLineWordThreshold(self):
548+
"""Get the short line word threshold.
549+
550+
Returns
551+
-------
552+
int
553+
Word count threshold for short lines used in paragraph grouping.
554+
"""
481555
return self.getOrDefault(self.shortLineWordThreshold)
482556

483557
maxLineCount = Param(
@@ -488,9 +562,28 @@ def getShortLineWordThreshold(self):
488562
)
489563

490564
def setMaxLineCount(self, value):
565+
"""Set the maximum number of lines to inspect when estimating paragraph layout.
566+
567+
Parameters
568+
----------
569+
value : int
570+
Maximum number of lines to evaluate for layout heuristics.
571+
572+
Returns
573+
-------
574+
self
575+
The instance with updated `maxLineCount` parameter.
576+
"""
491577
return self._set(maxLineCount=value)
492578

493579
def getMaxLineCount(self):
580+
"""Get the maximum number of lines used for layout heuristics.
581+
582+
Returns
583+
-------
584+
int
585+
The configured maximum number of lines to consider.
586+
"""
494587
return self.getOrDefault(self.maxLineCount)
495588

496589
threshold = Param(
@@ -501,11 +594,58 @@ def getMaxLineCount(self):
501594
)
502595

503596
def setThreshold(self, value):
597+
"""Set the empty-line ratio threshold for paragraph grouping decision.
598+
599+
Parameters
600+
----------
601+
value : float
602+
Ratio (0.0-1.0) of empty lines used to switch grouping strategies.
603+
604+
Returns
605+
-------
606+
self
607+
The instance with updated `threshold` parameter.
608+
"""
504609
return self._set(threshold=value)
505610

506611
def getThreshold(self):
612+
"""Get the configured empty-line threshold ratio.
613+
614+
Returns
615+
-------
616+
float
617+
The ratio used to decide paragraph grouping strategy.
618+
"""
507619
return self.getOrDefault(self.threshold)
508620

621+
extractTagAttributes = Param(
622+
Params._dummy(),
623+
"extractTagAttributes",
624+
"Extract attribute values into separate lines when parsing tag-based formats (e.g., HTML or XML).",
625+
typeConverter=TypeConverters.toListString
626+
)
627+
628+
def setExtractTagAttributes(self, attributes: list[str]):
629+
"""
630+
Specify which tag attributes should have their values extracted as text when parsing
631+
tag-based formats (e.g., HTML or XML).
632+
633+
:param attributes: list of attribute names to extract
634+
:return: this instance with the updated `extractTagAttributes` parameter
635+
"""
636+
return self._set(extractTagAttributes=attributes)
637+
638+
def getExtractTagAttributes(self):
639+
"""Get the list of tag attribute names configured to be extracted.
640+
641+
Returns
642+
-------
643+
list[str]
644+
The attribute names whose values will be extracted as text.
645+
"""
646+
return self.getOrDefault(self.extractTagAttributes)
647+
648+
509649
class HasChunkerProperties(Params):
510650

511651
chunkingStrategy = Param(

python/sparknlp/reader/reader2doc.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,19 @@ def setExcludeNonText(self, value):
9191
"""
9292
return self._set(excludeNonText=value)
9393

94+
joinString = Param(
95+
Params._dummy(),
96+
"joinString",
97+
"If outputAsDocument is true, specifies the string used to join elements into a single document.",
98+
typeConverter=TypeConverters.toString
99+
)
100+
101+
def setJoinString(self, value):
102+
"""
103+
If outputAsDocument is true, specifies the string used to join elements into a single document.
104+
"""
105+
return self._set(joinString=value)
106+
94107
@keyword_only
95108
def __init__(self):
96109
super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
@@ -99,8 +112,12 @@ def __init__(self):
99112
explodeDocs=False,
100113
contentType="",
101114
flattenOutput=False,
102-
titleThreshold=18
115+
outputAsDocument=True,
116+
outputFormat="plain-text",
117+
excludeNonText=False,
118+
joinString="\n"
103119
)
120+
104121
@keyword_only
105122
def setParams(self):
106123
kwargs = self._input_kwargs

python/sparknlp/reader/reader2table.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ class Reader2Table(
3535
@keyword_only
3636
def __init__(self):
3737
super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
38-
self._setDefault(outputCol="document")
38+
self._setDefault(outputCol="document", outputFormat="json-table", inferTableStructure=True,
39+
outputAsDocument=False)
3940

4041
@keyword_only
4142
def setParams(self):

0 commit comments

Comments
 (0)