@@ -1925,17 +1925,20 @@ def print_unique_links_with_status_codes(self):
19251925 soup = self .get_beautiful_soup (self .get_page_source ())
19261926 page_utils ._print_unique_links_with_status_codes (page_url , soup )
19271927
1928- def assert_pdf_text (self , pdf , text , page = None ):
1929- """ Asserts text in a PDF file.
def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
    """ Creates a PyPDF2.PdfFileReader for the given PDF file object.
        @Params
        pdf_file_object - The file object passed to PyPDF2.PdfFileReader.
        strict - Passed through as PdfFileReader's second argument
                 (defaults to False).
        @Returns
        The PdfFileReader object that PyPDF2 created. """
    # Imported inside the function, so PyPDF2 is only loaded when this runs.
    from PyPDF2 import PdfFileReader
    return PdfFileReader(pdf_file_object, strict)
1932+
1933+ def get_pdf_text (self , pdf , page = None ):
1934+ """ Gets text from a PDF file.
19301935 PDF can be either a URL or a file path on the local file system.
19311936 @Params
19321937 pdf - The URL or file path of the PDF file.
1933- text - The expected text to verify in the PDF.
19341938 page - The page number of the PDF to use (optional).
19351939 If a page number is provided, looks only at that page.
19361940 (1 is the first page, 2 is the second page, etc.)
1937- If no page number is provided, looks at all the pages. """
1938- import PyPDF2
1941+ If no page number is provided, returns all PDF text. """
19391942 if not pdf .lower ().endswith ('.pdf' ):
19401943 raise Exception ("%s is not a PDF file! (Expecting a .pdf)" % pdf )
19411944 file_path = None
@@ -1950,25 +1953,41 @@ def assert_pdf_text(self, pdf, text, page=None):
19501953 raise Exception ("%s is not a valid URL or file path!" % pdf )
19511954 file_path = os .path .abspath (pdf )
19521955 pdf_file_object = open (file_path , "rb" )
1953- pdf_reader = PyPDF2 . PdfFileReader (pdf_file_object , strict = False )
1956+ pdf_reader = self . __get_pdf_reader_obj (pdf_file_object , strict = False )
19541957 num_pages = pdf_reader .numPages
1958+ pdf_text = ""
19551959 if type (page ) is int :
19561960 if page > num_pages :
19571961 raise Exception ("Invalid page number for the PDF!" )
19581962 page = page - 1
19591963 page_obj = pdf_reader .getPage (page )
1960- pdf_page_text = page_obj .extractText ()
1961- if text not in pdf_page_text :
1962- raise Exception ("PDF [%s] is missing expected text [%s] on "
1963- "page [%s]!" % (file_path , text , page ))
1964+ pdf_text = page_obj .extractText ()
19641965 else :
19651966 for page_num in range (num_pages ):
19661967 page_obj = pdf_reader .getPage (page_num )
1967- pdf_page_text = page_obj .extractText ()
1968- if text in pdf_page_text :
1969- return
1970- raise Exception ("PDF [%s] is missing expected text [%s]!"
1971- "" % (file_path , text ))
1968+ pdf_text = pdf_text + '\n ' + page_obj .extractText ()
1969+ return pdf_text
1970+
def assert_pdf_text(self, pdf, text, page=None):
    """ Verifies that a PDF file contains the expected text.
        The PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        text - The text that must be present in the PDF.
        page - The page number of the PDF to check (optional).
               If an int is provided, only that page is searched.
               (1 is the first page, 2 is the second page, etc.)
               If no page number is provided, all pages are searched.
        @Returns
        True if the expected text was found.
        @Raises
        Exception if the expected text is missing. """
    searched_text = self.get_pdf_text(pdf, page=page)
    if text in searched_text:
        return True
    # Only an exact int counts as a page number, so only then does the
    # failure message name the page that was searched.
    if type(page) is int:
        raise Exception("PDF [%s] is missing expected text [%s] on "
                        "page [%s]!" % (pdf, text, page))
    raise Exception("PDF [%s] is missing expected text [%s]!"
                    "" % (pdf, text))
19721991
19731992 def create_folder (self , folder ):
19741993 """ Creates a folder of the given name if it doesn't already exist. """
0 commit comments