JohnSnowLabs
diff --git a/python/sparknlp/partition/partition_properties.py b/python/sparknlp/partition/partition_properties.py
Lines changed: 146 additions & 6 deletions
diff --git a/python/sparknlp/reader/reader2doc.py b/python/sparknlp/reader/reader2doc.py
Lines changed: 18 additions & 1 deletion
diff --git a/python/sparknlp/reader/reader2table.py b/python/sparknlp/reader/reader2table.py
Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,6 @@
 
 
 class HasReaderProperties(Params):
-
     inputCol = Param(
         Params._dummy(),
         "inputCol",
@@ -245,8 +244,8 @@ def setOutputAsDocument(self, value):
         """
         return self._set(outputAsDocument=value)
 
-class HasEmailReaderProperties(Params):
 
+class HasEmailReaderProperties(Params):
     addAttachmentContent = Param(
         Params._dummy(),
         "addAttachmentContent",
@@ -278,7 +277,6 @@ def getAddAttachmentContent(self):
 
 
 class HasExcelReaderProperties(Params):
-
     cellSeparator = Param(
         Params._dummy(),
         "cellSeparator",
@@ -337,8 +335,8 @@ def getAppendCells(self):
         """
         return self.getOrDefault(self.appendCells)
 
-class HasHTMLReaderProperties(Params):
 
+class HasHTMLReaderProperties(Params):
     timeout = Param(
         Params._dummy(),
         "timeout",
@@ -395,8 +393,8 @@ def setOutputFormat(self, value: str):
         """
         return self._set(outputFormat=value)
 
-class HasPowerPointProperties(Params):
 
+class HasPowerPointProperties(Params):
     includeSlideNotes = Param(
         Params._dummy(),
         "includeSlideNotes",
@@ -426,8 +424,8 @@ def getIncludeSlideNotes(self):
         """
         return self.getOrDefault(self.includeSlideNotes)
 
-class HasTextReaderProperties(Params):
 
+class HasTextReaderProperties(Params):
     titleLengthSize = Param(
         Params._dummy(),
         "titleLengthSize",
@@ -436,9 +434,28 @@ class HasTextReaderProperties(Params):
     )
 
     def setTitleLengthSize(self, value):
+        """Set the maximum character length used to identify title blocks.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of characters a text block can have to be considered a title.
+
+        Returns
+        -------
+        self
+            The instance with updated `titleLengthSize` parameter.
+        """
         return self._set(titleLengthSize=value)
 
     def getTitleLengthSize(self):
+        """Get the configured maximum title length.
+
+        Returns
+        -------
+        int
+            The maximum character length used to detect title blocks.
+        """
         return self.getOrDefault(self.titleLengthSize)
 
     groupBrokenParagraphs = Param(
@@ -449,9 +466,28 @@ def getTitleLengthSize(self):
     )
 
     def setGroupBrokenParagraphs(self, value):
+        """Enable or disable grouping of broken paragraphs.
+
+        Parameters
+        ----------
+        value : bool
+            True to merge fragmented lines into paragraphs, False to leave lines as-is.
+
+        Returns
+        -------
+        self
+            The instance with updated `groupBrokenParagraphs` parameter.
+        """
         return self._set(groupBrokenParagraphs=value)
 
     def getGroupBrokenParagraphs(self):
+        """Get whether broken paragraph grouping is enabled.
+
+        Returns
+        -------
+        bool
+            True if grouping of broken paragraphs is enabled, False otherwise.
+        """
         return self.getOrDefault(self.groupBrokenParagraphs)
 
     paragraphSplit = Param(
@@ -462,9 +498,28 @@ def getGroupBrokenParagraphs(self):
     )
 
     def setParagraphSplit(self, value):
+        """Set the regex pattern used to split paragraphs when grouping broken paragraphs.
+
+        Parameters
+        ----------
+        value : str
+            Regular expression string used to detect paragraph boundaries.
+
+        Returns
+        -------
+        self
+            The instance with updated `paragraphSplit` parameter.
+        """
         return self._set(paragraphSplit=value)
 
     def getParagraphSplit(self):
+        """Get the paragraph-splitting regex pattern.
+
+        Returns
+        -------
+        str
+            The regex pattern used to detect paragraph boundaries.
+        """
         return self.getOrDefault(self.paragraphSplit)
 
     shortLineWordThreshold = Param(
@@ -475,9 +530,28 @@ def getParagraphSplit(self):
     )
 
     def setShortLineWordThreshold(self, value):
+        """Set the maximum word count for a line to be considered short.
+
+        Parameters
+        ----------
+        value : int
+            Number of words under which a line is considered 'short'.
+
+        Returns
+        -------
+        self
+            The instance with updated `shortLineWordThreshold` parameter.
+        """
         return self._set(shortLineWordThreshold=value)
 
     def getShortLineWordThreshold(self):
+        """Get the short line word threshold.
+
+        Returns
+        -------
+        int
+            Word count threshold for short lines used in paragraph grouping.
+        """
         return self.getOrDefault(self.shortLineWordThreshold)
 
     maxLineCount = Param(
@@ -488,9 +562,28 @@ def getShortLineWordThreshold(self):
     )
 
     def setMaxLineCount(self, value):
+        """Set the maximum number of lines to inspect when estimating paragraph layout.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of lines to evaluate for layout heuristics.
+
+        Returns
+        -------
+        self
+            The instance with updated `maxLineCount` parameter.
+        """
         return self._set(maxLineCount=value)
 
     def getMaxLineCount(self):
+        """Get the maximum number of lines used for layout heuristics.
+
+        Returns
+        -------
+        int
+            The configured maximum number of lines to consider.
+        """
         return self.getOrDefault(self.maxLineCount)
 
     threshold = Param(
@@ -501,11 +594,58 @@ def getMaxLineCount(self):
     )
 
     def setThreshold(self, value):
+        """Set the empty-line ratio threshold for paragraph grouping decision.
+
+        Parameters
+        ----------
+        value : float
+            Ratio (0.0-1.0) of empty lines used to switch grouping strategies.
+
+        Returns
+        -------
+        self
+            The instance with updated `threshold` parameter.
+        """
         return self._set(threshold=value)
 
     def getThreshold(self):
+        """Get the configured empty-line threshold ratio.
+
+        Returns
+        -------
+        float
+            The ratio used to decide paragraph grouping strategy.
+        """
         return self.getOrDefault(self.threshold)
 
+    extractTagAttributes = Param(
+        Params._dummy(),
+        "extractTagAttributes",
+        "Extract attribute values into separate lines when parsing tag-based formats (e.g., HTML or XML).",
+        typeConverter=TypeConverters.toListString
+    )
+
+    def setExtractTagAttributes(self, attributes: list[str]):
+        """
+        Specify which tag attributes should have their values extracted as text when parsing
+        tag-based formats (e.g., HTML or XML).
+
+        :param attributes: list of attribute names to extract
+        :return: this instance with the updated `extractTagAttributes` parameter
+        """
+        return self._set(extractTagAttributes=attributes)
+
+    def getExtractTagAttributes(self):
+        """Get the list of tag attribute names configured to be extracted.
+
+        Returns
+        -------
+        list[str]
+            The attribute names whose values will be extracted as text.
+        """
+        return self.getOrDefault(self.extractTagAttributes)
+
+
 class HasChunkerProperties(Params):
 
     chunkingStrategy = Param(
 
@@ -91,6 +91,19 @@ def setExcludeNonText(self, value):
         """
         return self._set(excludeNonText=value)
 
+    joinString = Param(
+        Params._dummy(),
+        "joinString",
+        "If outputAsDocument is true, specifies the string used to join elements into a single document.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setJoinString(self, value):
+        """
+        If outputAsDocument is true, specifies the string used to join elements into a single document.
+        """
+        return self._set(joinString=value)
+
     @keyword_only
     def __init__(self):
         super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
@@ -99,8 +112,12 @@ def __init__(self):
             explodeDocs=False,
             contentType="",
             flattenOutput=False,
-            titleThreshold=18
+            outputAsDocument=True,
+            outputFormat="plain-text",
+            excludeNonText=False,
+            joinString="\n"
         )
+
     @keyword_only
     def setParams(self):
         kwargs = self._input_kwargs
 
@@ -35,7 +35,8 @@ class Reader2Table(
     @keyword_only
     def __init__(self):
         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
-        self._setDefault(outputCol="document")
+        self._setDefault(outputCol="document", outputFormat="json-table", inferTableStructure=True,
+                         outputAsDocument=False)
 
     @keyword_only
     def setParams(self):