diff --git a/CHANGELOG.md b/CHANGELOG.md index 343a8e67..e26f5240 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Added the `ignore_case` argument to the `filter_by_text_equal` and `filter_by_text_contains` methods of the `ElementList` class in `filtering.py` + ## [0.10.2] - 2022-11-07 ### Changed diff --git a/py_pdf_parser/filtering.py b/py_pdf_parser/filtering.py index 64be537a..0e246a94 100644 --- a/py_pdf_parser/filtering.py +++ b/py_pdf_parser/filtering.py @@ -133,7 +133,9 @@ def filter_by_tags(self, *tags: str) -> "ElementList": ) return ElementList(self.document, new_indexes) - def filter_by_text_equal(self, text: str, stripped: bool = True) -> "ElementList": + def filter_by_text_equal( + self, text: str, stripped: bool = True, ignore_case: bool = False + ) -> "ElementList": """ Filter for elements whose text is exactly the given string. @@ -141,27 +143,47 @@ def filter_by_text_equal(self, text: str, stripped: bool = True) -> "ElementList text (str): The text to filter for. stripped (bool, optional): Whether to strip the text of the element before comparison. Default: True. + ignore_case (bool, optional): Whether to ignore case when matching. Default: False. + Returns: ElementList: The filtered list. """ - new_indexes = set( - element._index for element in self if element.text(stripped) == text - ) + if ignore_case: + new_indexes = set( + element._index + for element in self + if element.text(stripped).casefold() == text.casefold() + ) + else: + new_indexes = set( + element._index for element in self if element.text(stripped) == text + ) return ElementList(self.document, new_indexes) - def filter_by_text_contains(self, text: str) -> "ElementList": + def filter_by_text_contains( + self, text: str, ignore_case: bool = False + ) -> "ElementList": """ Filter for elements whose text contains the given string. 
Args: text (str): The text to filter for. - + ignore_case (bool, optional): Whether to ignore case when matching. Default: False. Returns: ElementList: The filtered list. """ - new_indexes = set(element._index for element in self if text in element.text()) + if ignore_case: + new_indexes = set( + element._index + for element in self + if text.casefold() in element.text().casefold() + ) + else: + new_indexes = set( + element._index for element in self if text in element.text() + ) return ElementList(self.document, new_indexes) def filter_by_regex(