Skip to content

Commit c4c08b8

Browse files
authored
Merge pull request #385 from aiden2480/extra-filters
Add custom filter predicate and header/footer filters
2 parents b5c6a24 + 1bf3661 commit c4c08b8

File tree

3 files changed

+165
-15
lines changed

3 files changed

+165
-15
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
### Added
10+
- Added extra filtering methods for ElementList
11+
912
## [0.12.0] - 2023-11-10
1013

1114
### Added

py_pdf_parser/filtering.py

Lines changed: 105 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import (
22
TYPE_CHECKING,
3+
Callable,
34
FrozenSet,
45
Iterable,
56
Iterator,
@@ -103,6 +104,21 @@ def add_tag_to_elements(self, tag: str) -> None:
103104
for element in self:
104105
element.add_tag(tag)
105106

107+
def filter(self, predicate: Callable[["PDFElement"], bool]) -> "ElementList":
108+
"""
109+
Filter by elements matching a custom predicate. Anything that passes the
110+
predicate is included in the new ElementList.
111+
112+
Args:
113+
predicate (Callable[[PDFElement], bool]): The predicate to filter by.
114+
115+
Returns:
116+
ElementList: The filtered list.
117+
"""
118+
119+
new_indexes = set(element._index for element in self if predicate(element))
120+
return ElementList(self.document, new_indexes)
121+
106122
def filter_by_tag(self, tag: str) -> "ElementList":
107123
"""
108124
Filter for only those elements that have the given tag.
@@ -113,8 +129,8 @@ def filter_by_tag(self, tag: str) -> "ElementList":
113129
Returns:
114130
ElementList: The filtered list.
115131
"""
116-
new_indexes = set(element._index for element in self if tag in element.tags)
117-
return ElementList(self.document, new_indexes)
132+
133+
return self.filter(lambda e: tag in e.tags)
118134

119135
def filter_by_tags(self, *tags: str) -> "ElementList":
120136
"""
@@ -126,12 +142,8 @@ def filter_by_tags(self, *tags: str) -> "ElementList":
126142
Returns:
127143
ElementList: The filtered list.
128144
"""
129-
new_indexes = set(
130-
element._index
131-
for element in self
132-
if any(tag in element.tags for tag in tags)
133-
)
134-
return ElementList(self.document, new_indexes)
145+
146+
return self.filter(lambda e: any(tag in e.tags for tag in tags))
135147

136148
def filter_by_text_equal(self, text: str, stripped: bool = True) -> "ElementList":
137149
"""
@@ -145,11 +157,8 @@ def filter_by_text_equal(self, text: str, stripped: bool = True) -> "ElementList
145157
Returns:
146158
ElementList: The filtered list.
147159
"""
148-
new_indexes = set(
149-
element._index for element in self if element.text(stripped) == text
150-
)
151160

152-
return ElementList(self.document, new_indexes)
161+
return self.filter(lambda e: e.text(stripped) == text)
153162

154163
def filter_by_text_contains(self, text: str) -> "ElementList":
155164
"""
@@ -161,8 +170,8 @@ def filter_by_text_contains(self, text: str) -> "ElementList":
161170
Returns:
162171
ElementList: The filtered list.
163172
"""
164-
new_indexes = set(element._index for element in self if text in element.text())
165-
return ElementList(self.document, new_indexes)
173+
174+
return self.filter(lambda e: text in e.text())
166175

167176
def filter_by_regex(
168177
self,
@@ -216,6 +225,19 @@ def filter_by_fonts(self, *fonts: str) -> "ElementList":
216225
new_indexes = self.indexes & self.document._element_indexes_with_fonts(*fonts)
217226
return ElementList(self.document, new_indexes)
218227

228+
def filter_by_font_size(self, font_size: float) -> "ElementList":
229+
"""
230+
Filter for elements of a particular font size.
231+
232+
Args:
233+
font_size (float): The font size to filter for.
234+
235+
Returns:
236+
ElementList: The filtered list.
237+
"""
238+
239+
return self.filter(lambda e: e.font_size == font_size)
240+
219241
def filter_by_page(self, page_number: int) -> "ElementList":
220242
"""
221243
Filter for elements on the given page.
@@ -786,7 +808,7 @@ def extract_single_element(self) -> "PDFElement":
786808
f"There are {len(self.indexes)} elements in the ElementList"
787809
)
788810

789-
return self.document._element_list[list(self.indexes)[0]]
811+
return self.first()
790812

791813
def add_element(self, element: "PDFElement") -> "ElementList":
792814
"""
@@ -919,6 +941,74 @@ def move_backwards_from(
919941
"""
920942
return self.move_forwards_from(element, count=-count, capped=capped)
921943

944+
def filter_out_header(self, bottom_of_header_y: float) -> "ElementList":
945+
"""
946+
Filter out header elements, as specified by a certain y position. Only elements
947+
completely within the header are discarded. Partially overlapping elements are
948+
kept.
949+
950+
Args:
951+
bottom_of_header_y (float): The Y coordinate of the bottom of the header.
952+
953+
Note:
954+
Y decreases as elements go down the page.
955+
956+
Returns:
957+
ElementList: The filtered list without header elements.
958+
"""
959+
960+
return self.filter(lambda e: e.bounding_box.y0 < bottom_of_header_y)
961+
962+
def filter_out_footer(self, top_of_footer_y: float) -> "ElementList":
963+
"""
964+
Filter out footer elements, as specified by a certain y position. Only elements
965+
completely within the footer are discarded. Partially overlapping elements are
966+
kept.
967+
968+
Args:
969+
top_of_footer_y (float): The Y coordinate of the top of the footer.
970+
971+
Note:
972+
Y decreases as elements go down the page.
973+
974+
Returns:
975+
ElementList: The filtered list without footer elements.
976+
"""
977+
978+
return self.filter(lambda e: e.bounding_box.y1 > top_of_footer_y)
979+
980+
def first(self) -> "PDFElement":
981+
"""
982+
Returns the first element in the ElementList
983+
984+
Unlike extract_single_element, an error is not thrown if there is more
985+
than one element in the ElementList.
986+
987+
Raises:
988+
NoElementFoundError: If there are no elements in the ElementList
989+
"""
990+
991+
if len(self.indexes) == 0:
992+
raise NoElementFoundError("There are no elements in the ElementList")
993+
994+
return self[0]
995+
996+
def last(self) -> "PDFElement":
997+
"""
998+
Returns the last element in the ElementList
999+
1000+
Unlike extract_single_element, an error is not thrown if there is more
1001+
than one element in the ElementList.
1002+
1003+
Raises:
1004+
NoElementFoundError: If there are no elements in the ElementList
1005+
"""
1006+
1007+
if len(self.indexes) == 0:
1008+
raise NoElementFoundError("There are no elements in the ElementList")
1009+
1010+
return self[-1]
1011+
9221012
def __intersect_indexes_with_self(self, new_indexes: Set[int]) -> "ElementList":
9231013
return self & ElementList(self.document, new_indexes)
9241014

tests/test_filtering.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,6 +1211,63 @@ def test_move_backwards_from(self):
12111211
self.elem_list[-1],
12121212
)
12131213

1214+
def test_filter(self):
1215+
elem1 = FakePDFMinerTextElement(text="even")
1216+
elem2 = FakePDFMinerTextElement(text="odd")
1217+
elem3 = FakePDFMinerTextElement(text="even")
1218+
elem4 = FakePDFMinerTextElement(text="odd")
1219+
elem5 = FakePDFMinerTextElement(text="even")
1220+
1221+
doc = create_pdf_document([elem1, elem2, elem3, elem4, elem5])
1222+
even_elems = doc.elements.filter(lambda e: len(e.text()) % 2 == 0)
1223+
odd_elems = doc.elements.filter(lambda e: len(e.text()) % 2 == 1)
1224+
1225+
self.assertEqual(ElementList(doc, {0, 2, 4}), even_elems)
1226+
self.assertEqual(ElementList(doc, {1, 3}), odd_elems)
1227+
1228+
def test_filter_by_font_size(self):
1229+
elem1 = FakePDFMinerTextElement(font_name="foo", font_size=1)
1230+
elem2 = FakePDFMinerTextElement(font_name="bar", font_size=2)
1231+
elem3 = FakePDFMinerTextElement(font_name="bat", font_size=2)
1232+
elem4 = FakePDFMinerTextElement(font_name="baz", font_size=3)
1233+
doc = create_pdf_document([elem1, elem2, elem3, elem4])
1234+
1235+
self.assertEqual(ElementList(doc, {1, 2}), doc.elements.filter_by_font_size(2))
1236+
1237+
def test_filter_out_header(self):
1238+
bbox1 = BoundingBox(20, 30, 75, 80) # Completely within header - discarded
1239+
bbox2 = BoundingBox(20, 30, 25, 75) # Partially within header - kept
1240+
bbox3 = BoundingBox(20, 30, 10, 20) # Completely outside header - kept
1241+
1242+
elems = [FakePDFMinerTextElement(b) for b in (bbox1, bbox2, bbox3)]
1243+
doc = create_pdf_document(elems)
1244+
1245+
self.assertEqual(ElementList(doc, {1, 2}), doc.elements.filter_out_header(50))
1246+
1247+
def test_filter_out_footer(self):
1248+
bbox1 = BoundingBox(20, 30, 75, 80) # Completely outside footer - kept
1249+
bbox2 = BoundingBox(20, 30, 25, 75) # Partially within footer - kept
1250+
bbox3 = BoundingBox(20, 30, 10, 20) # Completely within footer - discarded
1251+
1252+
elems = [FakePDFMinerTextElement(b) for b in (bbox1, bbox2, bbox3)]
1253+
doc = create_pdf_document(elems)
1254+
1255+
self.assertEqual(ElementList(doc, {0, 1}), doc.elements.filter_out_footer(50))
1256+
1257+
def test_first(self):
1258+
actual_first_elem = self.extract_element_from_list(self.elem1, self.elem_list)
1259+
self.assertEqual(self.elem_list.first(), actual_first_elem)
1260+
1261+
with self.assertRaises(NoElementFoundError):
1262+
self.elem_list.filter_by_tag("non_existent_tag").first()
1263+
1264+
def test_last(self):
1265+
actual_last_elem = self.extract_element_from_list(self.elem6, self.elem_list)
1266+
self.assertEqual(self.elem_list.last(), actual_last_elem)
1267+
1268+
with self.assertRaises(NoElementFoundError):
1269+
self.elem_list.filter_by_tag("non_existent_tag").last()
1270+
12141271
def test_repr(self):
12151272
self.assertEqual(repr(self.elem_list), "<ElementList of 6 elements>")
12161273

0 commit comments

Comments
 (0)