1717
1818
1919class HasReaderProperties (Params ):
20-
2120 inputCol = Param (
2221 Params ._dummy (),
2322 "inputCol" ,
@@ -245,8 +244,8 @@ def setOutputAsDocument(self, value):
245244 """
246245 return self ._set (outputAsDocument = value )
247246
248- class HasEmailReaderProperties (Params ):
249247
248+ class HasEmailReaderProperties (Params ):
250249 addAttachmentContent = Param (
251250 Params ._dummy (),
252251 "addAttachmentContent" ,
@@ -278,7 +277,6 @@ def getAddAttachmentContent(self):
278277
279278
280279class HasExcelReaderProperties (Params ):
281-
282280 cellSeparator = Param (
283281 Params ._dummy (),
284282 "cellSeparator" ,
@@ -337,8 +335,8 @@ def getAppendCells(self):
337335 """
338336 return self .getOrDefault (self .appendCells )
339337
340- class HasHTMLReaderProperties (Params ):
341338
339+ class HasHTMLReaderProperties (Params ):
342340 timeout = Param (
343341 Params ._dummy (),
344342 "timeout" ,
@@ -395,8 +393,8 @@ def setOutputFormat(self, value: str):
395393 """
396394 return self ._set (outputFormat = value )
397395
398- class HasPowerPointProperties (Params ):
399396
397+ class HasPowerPointProperties (Params ):
400398 includeSlideNotes = Param (
401399 Params ._dummy (),
402400 "includeSlideNotes" ,
@@ -426,8 +424,8 @@ def getIncludeSlideNotes(self):
426424 """
427425 return self .getOrDefault (self .includeSlideNotes )
428426
429- class HasTextReaderProperties (Params ):
430427
428+ class HasTextReaderProperties (Params ):
431429 titleLengthSize = Param (
432430 Params ._dummy (),
433431 "titleLengthSize" ,
@@ -436,9 +434,28 @@ class HasTextReaderProperties(Params):
436434 )
437435
def setTitleLengthSize(self, value):
    """Configure the character-length cutoff used to recognise title blocks.

    Parameters
    ----------
    value : int
        Largest number of characters a text block may contain and still be
        treated as a title.

    Returns
    -------
    self
        This instance, with `titleLengthSize` updated.
    """
    updated = self._set(titleLengthSize=value)
    return updated
440450
def getTitleLengthSize(self):
    """Return the character-length cutoff used to recognise title blocks.

    Returns
    -------
    int
        Maximum character length a block may have to be detected as a title.
    """
    param = self.titleLengthSize
    return self.getOrDefault(param)
443460
444461 groupBrokenParagraphs = Param (
@@ -449,9 +466,28 @@ def getTitleLengthSize(self):
449466 )
450467
def setGroupBrokenParagraphs(self, value):
    """Turn grouping of broken paragraphs on or off.

    Parameters
    ----------
    value : bool
        True merges fragmented lines into paragraphs; False leaves the lines
        as they are.

    Returns
    -------
    self
        This instance, with `groupBrokenParagraphs` updated.
    """
    updated = self._set(groupBrokenParagraphs=value)
    return updated
453482
def getGroupBrokenParagraphs(self):
    """Report whether broken-paragraph grouping is switched on.

    Returns
    -------
    bool
        True when grouping of broken paragraphs is enabled, False otherwise.
    """
    param = self.groupBrokenParagraphs
    return self.getOrDefault(param)
456492
457493 paragraphSplit = Param (
@@ -462,9 +498,28 @@ def getGroupBrokenParagraphs(self):
462498 )
463499
def setParagraphSplit(self, value):
    """Configure the regex that splits paragraphs when grouping broken paragraphs.

    Parameters
    ----------
    value : str
        Regular expression string that marks paragraph boundaries.

    Returns
    -------
    self
        This instance, with `paragraphSplit` updated.
    """
    updated = self._set(paragraphSplit=value)
    return updated
466514
def getParagraphSplit(self):
    """Return the regex pattern used to split paragraphs.

    Returns
    -------
    str
        The regular expression that detects paragraph boundaries.
    """
    param = self.paragraphSplit
    return self.getOrDefault(param)
469524
470525 shortLineWordThreshold = Param (
@@ -475,9 +530,28 @@ def getParagraphSplit(self):
475530 )
476531
def setShortLineWordThreshold(self, value):
    """Configure the word count below which a line counts as short.

    Parameters
    ----------
    value : int
        Number of words under which a line is considered 'short'.

    Returns
    -------
    self
        This instance, with `shortLineWordThreshold` updated.
    """
    updated = self._set(shortLineWordThreshold=value)
    return updated
479546
def getShortLineWordThreshold(self):
    """Return the short-line word threshold.

    Returns
    -------
    int
        Word-count threshold for short lines, used in paragraph grouping.
    """
    param = self.shortLineWordThreshold
    return self.getOrDefault(param)
482556
483557 maxLineCount = Param (
@@ -488,9 +562,28 @@ def getShortLineWordThreshold(self):
488562 )
489563
def setMaxLineCount(self, value):
    """Configure how many lines are inspected when estimating paragraph layout.

    Parameters
    ----------
    value : int
        Upper bound on the number of lines evaluated by the layout heuristics.

    Returns
    -------
    self
        This instance, with `maxLineCount` updated.
    """
    updated = self._set(maxLineCount=value)
    return updated
492578
def getMaxLineCount(self):
    """Return the line limit used by the layout heuristics.

    Returns
    -------
    int
        The configured maximum number of lines to consider.
    """
    param = self.maxLineCount
    return self.getOrDefault(param)
495588
496589 threshold = Param (
@@ -501,11 +594,58 @@ def getMaxLineCount(self):
501594 )
502595
def setThreshold(self, value):
    """Configure the empty-line ratio that drives the paragraph grouping decision.

    Parameters
    ----------
    value : float
        Ratio (0.0-1.0) of empty lines used to switch grouping strategies.

    Returns
    -------
    self
        This instance, with `threshold` updated.
    """
    updated = self._set(threshold=value)
    return updated
505610
def getThreshold(self):
    """Return the configured empty-line threshold ratio.

    Returns
    -------
    float
        The ratio used to decide the paragraph grouping strategy.
    """
    param = self.threshold
    return self.getOrDefault(param)
508620
# Param holding the attribute names whose values are extracted when parsing
# tag-based formats; values are converted with TypeConverters.toListString.
extractTagAttributes = Param(
    Params._dummy(),
    "extractTagAttributes",
    "Extract attribute values into separate lines when parsing tag-based formats (e.g., HTML or XML).",
    typeConverter=TypeConverters.toListString,
)
627+
def setExtractTagAttributes(self, attributes: "list[str]"):
    """Set which tag attributes have their values extracted as text.

    Applies when parsing tag-based formats (e.g., HTML or XML).

    Parameters
    ----------
    attributes : list[str]
        Names of the attributes whose values should be extracted.

    Returns
    -------
    self
        The instance with updated `extractTagAttributes` parameter.
    """
    # The annotation is quoted so it is not evaluated at definition time:
    # subscripting the builtin `list` raises TypeError on Python < 3.9.
    return self._set(extractTagAttributes=attributes)
637+
def getExtractTagAttributes(self):
    """Return the tag attribute names configured for extraction.

    Returns
    -------
    list[str]
        Attribute names whose values will be extracted as text.
    """
    param = self.extractTagAttributes
    return self.getOrDefault(param)
647+
648+
509649class HasChunkerProperties (Params ):
510650
511651 chunkingStrategy = Param (
0 commit comments