11from typing import (
22 TYPE_CHECKING ,
3+ Callable ,
34 FrozenSet ,
45 Iterable ,
56 Iterator ,
@@ -103,6 +104,21 @@ def add_tag_to_elements(self, tag: str) -> None:
103104 for element in self :
104105 element .add_tag (tag )
105106
def filter(self, predicate: Callable[["PDFElement"], bool]) -> "ElementList":
    """
    Filter by elements matching a custom predicate.

    Every element for which the predicate returns a truthy value is included in
    the new ElementList; all other elements are left out.

    Args:
        predicate (Callable[[PDFElement], bool]): The predicate to filter by. It
            is called once for each element in this ElementList.

    Returns:
        ElementList: The filtered list.
    """

    # The new list is built from the indexes of the matching elements.
    new_indexes = {element._index for element in self if predicate(element)}
    return ElementList(self.document, new_indexes)
121+
106122 def filter_by_tag (self , tag : str ) -> "ElementList" :
107123 """
108124 Filter for elements containing only the given tag.
@@ -113,8 +129,8 @@ def filter_by_tag(self, tag: str) -> "ElementList":
113129 Returns:
114130 ElementList: The filtered list.
115131 """
116- new_indexes = set ( element . _index for element in self if tag in element . tags )
117- return ElementList ( self .document , new_indexes )
132+
133+ return self .filter ( lambda e : tag in e . tags )
118134
119135 def filter_by_tags (self , * tags : str ) -> "ElementList" :
120136 """
@@ -126,12 +142,8 @@ def filter_by_tags(self, *tags: str) -> "ElementList":
126142 Returns:
127143 ElementList: The filtered list.
128144 """
129- new_indexes = set (
130- element ._index
131- for element in self
132- if any (tag in element .tags for tag in tags )
133- )
134- return ElementList (self .document , new_indexes )
145+
146+ return self .filter (lambda e : any (tag in e .tags for tag in tags ))
135147
136148 def filter_by_text_equal (self , text : str , stripped : bool = True ) -> "ElementList" :
137149 """
@@ -145,11 +157,8 @@ def filter_by_text_equal(self, text: str, stripped: bool = True) -> "ElementList
145157 Returns:
146158 ElementList: The filtered list.
147159 """
148- new_indexes = set (
149- element ._index for element in self if element .text (stripped ) == text
150- )
151160
152- return ElementList ( self .document , new_indexes )
161+ return self .filter ( lambda e : e . text ( stripped ) == text )
153162
154163 def filter_by_text_contains (self , text : str ) -> "ElementList" :
155164 """
@@ -161,8 +170,8 @@ def filter_by_text_contains(self, text: str) -> "ElementList":
161170 Returns:
162171 ElementList: The filtered list.
163172 """
164- new_indexes = set ( element . _index for element in self if text in element . text ())
165- return ElementList ( self .document , new_indexes )
173+
174+ return self .filter ( lambda e : text in e . text () )
166175
167176 def filter_by_regex (
168177 self ,
@@ -216,6 +225,19 @@ def filter_by_fonts(self, *fonts: str) -> "ElementList":
216225 new_indexes = self .indexes & self .document ._element_indexes_with_fonts (* fonts )
217226 return ElementList (self .document , new_indexes )
218227
def filter_by_font_size(self, font_size: float) -> "ElementList":
    """
    Filter for elements of a particular font size.

    An element is kept only when its `font_size` attribute compares equal (`==`)
    to the given value; no tolerance is applied.

    Args:
        font_size (float): The font size to filter for.

    Returns:
        ElementList: The filtered list.
    """

    return self.filter(lambda e: e.font_size == font_size)
240+
219241 def filter_by_page (self , page_number : int ) -> "ElementList" :
220242 """
221243 Filter for elements on the given page.
@@ -786,7 +808,7 @@ def extract_single_element(self) -> "PDFElement":
786808 f"There are { len (self .indexes )} elements in the ElementList"
787809 )
788810
789- return self .document . _element_list [ list ( self . indexes )[ 0 ]]
811+ return self .first ()
790812
791813 def add_element (self , element : "PDFElement" ) -> "ElementList" :
792814 """
@@ -919,6 +941,74 @@ def move_backwards_from(
919941 """
920942 return self .move_forwards_from (element , count = - count , capped = capped )
921943
def filter_out_header(self, bottom_of_header_y: float) -> "ElementList":
    """
    Filter out header elements, as specified by a certain y position. Only elements
    completely within the header are discarded. Partially overlapping elements are
    kept.

    Concretely, an element is kept when its `bounding_box.y0` is strictly less
    than `bottom_of_header_y`.

    Args:
        bottom_of_header_y (float): The Y coordinate of the bottom of the header.

    Note:
        Y decreases as elements go down the page.

    Returns:
        ElementList: The filtered list without header elements.
    """

    return self.filter(lambda e: e.bounding_box.y0 < bottom_of_header_y)
961+
def filter_out_footer(self, top_of_footer_y: float) -> "ElementList":
    """
    Filter out footer elements, as specified by a certain y position. Only elements
    completely within the footer are discarded. Partially overlapping elements are
    kept.

    Concretely, an element is kept when its `bounding_box.y1` is strictly greater
    than `top_of_footer_y`.

    Args:
        top_of_footer_y (float): The Y coordinate of the top of the footer.

    Note:
        Y decreases as elements go down the page.

    Returns:
        ElementList: The filtered list without footer elements.
    """

    return self.filter(lambda e: e.bounding_box.y1 > top_of_footer_y)
979+
def first(self) -> "PDFElement":
    """
    Returns the first element in the ElementList.

    Unlike extract_single_element, an error is not thrown if there is more
    than one element in the ElementList.

    Returns:
        PDFElement: The element at position 0 of the ElementList.

    Raises:
        NoElementFoundError: If there are no elements in the ElementList
    """

    if not self.indexes:
        raise NoElementFoundError("There are no elements in the ElementList")

    return self[0]
995+
def last(self) -> "PDFElement":
    """
    Returns the last element in the ElementList.

    Unlike extract_single_element, an error is not thrown if there is more
    than one element in the ElementList.

    Returns:
        PDFElement: The element at position -1 of the ElementList.

    Raises:
        NoElementFoundError: If there are no elements in the ElementList
    """

    if not self.indexes:
        raise NoElementFoundError("There are no elements in the ElementList")

    return self[-1]
1011+
def __intersect_indexes_with_self(self, new_indexes: Set[int]) -> "ElementList":
    """
    Intersect this ElementList with the elements at the given indexes.

    Args:
        new_indexes (Set[int]): Indexes used to build a temporary ElementList on
            the same document.

    Returns:
        ElementList: The result of `self & ElementList(self.document, new_indexes)`.
    """
    return self & ElementList(self.document, new_indexes)
9241014
0 commit comments