@@ -1949,50 +1949,84 @@ def print_unique_links_with_status_codes(self):
19491949 soup = self .get_beautiful_soup (self .get_page_source ())
19501950 page_utils ._print_unique_links_with_status_codes (page_url , soup )
19511951
def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
    """ Builds a PyPDF2 reader for an already-opened PDF file.
        @Params
        pdf_file_object - The open file object of the PDF.
        strict - Passed straight through to PyPDF2.PdfFileReader
                 as its second positional argument.
        @Returns
        The PyPDF2.PdfFileReader object for the given file. """
    # PyPDF2 is imported lazily so it is only needed when PDFs are read.
    from PyPDF2 import PdfFileReader
    return PdfFileReader(pdf_file_object, strict)
1956-
1957- def get_pdf_text (self , pdf , page = None ):
def __fix_unicode_conversion(self, text):
    """ Fixing Chinese characters when converting from PDF to HTML.
        Some characters come out of the PDF conversion as code points
        from the Kangxi Radicals block (U+2F00-U+2FDF) instead of the
        ordinary CJK ideographs that look the same. Text comparisons
        against normally-typed Chinese would then fail, so the known
        radicals are swapped for their everyday ideographs.
        @Params
        text - The text to normalize.
        @Returns
        The text with the known radical code points replaced. """
    # (radical code point, equivalent CJK unified ideograph)
    radical_to_ideograph = (
        (u'\u2f8f', u'\u884c'),
        (u'\u2f45', u'\u65b9'),
        (u'\u2f08', u'\u4eba'),
        (u'\u2f70', u'\u793a'),
    )
    # The literals hold only the bare code points (no surrounding
    # whitespace), so a radical is fixed wherever it appears.
    for radical, ideograph in radical_to_ideograph:
        text = text.replace(radical, ideograph)
    return text
1959+
def get_pdf_text(self, pdf, page=None, maxpages=None,
                 password=None, codec='utf-8', wrap=False, nav=False,
                 override=False):
    """ Gets text from a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        page - The page number (or a list of page numbers) of the PDF.
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                Page numbers below 1 are treated as page 1.
                If no page number is provided, returns all PDF text.
        maxpages - Instead of providing a page number, you can provide
                   the number of pages to use from the beginning.
        password - If the PDF is password-protected, enter it here.
        codec - The name of the text codec handed to the PDF parser.
                (The default codec used by this method is 'utf-8'.)
        wrap - Replaces ' \\n' with ' ' so that individual sentences
               from a PDF don't get broken up into separate lines when
               getting converted into text format.
        nav - If PDF is a URL, navigates to the URL in the browser first.
              (Not needed because the PDF will be downloaded anyway.)
        override - If True, the PDF is downloaded again even when a file
                   with the same name already exists in the
                   downloaded_files/ folder. By default, an existing
                   copy is reused instead of downloading it again.
        @Returns
        The extracted text of the requested pages as a string.
        @Raises
        Exception - If "pdf" does not end in ".pdf", or if it is neither
                    a valid URL nor an existing local file path. """
    # pdfminer is imported lazily so it is only needed when PDFs are read.
    from pdfminer.high_level import extract_text
    # extract_text() is given '' / 0 (not None) for "no password" and
    # "no page limit".
    password = password or ''
    maxpages = maxpages or 0
    if not pdf.lower().endswith('.pdf'):
        raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
    if page_utils.is_valid_url(pdf):
        if nav and self.get_current_url() != pdf:
            self.open(pdf)
        file_name = pdf.split('/')[-1]
        file_path = self.get_downloads_folder() + '/' + file_name
        # Reuse an earlier download unless the caller forces a fresh one.
        if override or not os.path.exists(file_path):
            self.download_file(pdf)
    else:
        if not os.path.exists(pdf):
            raise Exception("%s is not a valid URL or file path!" % pdf)
        file_path = os.path.abspath(pdf)
    # Callers count pages from 1, but extract_text() takes zero-based
    # page numbers. Anything below the first page is clamped to it.
    if isinstance(page, (list, tuple)):
        page_search = [max(number - 1, 0) for number in page]
    elif isinstance(page, int) and not isinstance(page, bool):
        page_search = [max(page - 1, 0)]
    else:
        page_search = None  # None means "all pages"
    pdf_text = extract_text(
        file_path, password=password, page_numbers=page_search,
        maxpages=maxpages, caching=False, codec=codec)
    pdf_text = self.__fix_unicode_conversion(pdf_text)
    if wrap:
        pdf_text = pdf_text.replace(' \n', ' ')
    return pdf_text
19942026
1995- def assert_pdf_text (self , pdf , text , page = None ):
2027+ def assert_pdf_text (self , pdf , text , page = None , maxpages = None ,
2028+ password = None , codec = 'utf-8' , wrap = True , nav = False ,
2029+ override = False ):
19962030 """ Asserts text in a PDF file.
19972031 PDF can be either a URL or a file path on the local file system.
19982032 @Params
@@ -2001,8 +2035,26 @@ def assert_pdf_text(self, pdf, text, page=None):
20012035 page - The page number of the PDF to use (optional).
20022036 If a page number is provided, looks only at that page.
20032037 (1 is the first page, 2 is the second page, etc.)
2004- If no page number is provided, looks at all the pages. """
2005- pdf_text = self .get_pdf_text (pdf , page = page )
2038+ If no page number is provided, looks at all the pages.
2039+ maxpages - Instead of providing a page number, you can provide
2040+ the number of pages to use from the beginning.
2041+ password - If the PDF is password-protected, enter it here.
2042+ codec - The name of the text codec handed to the PDF parser.
2043+ (The default codec used by this method is 'utf-8'.)
2044+ wrap - Replaces ' \n ' with ' ' so that individual sentences
2045+ from a PDF don't get broken up into separate lines when
2046+ getting converted into text format.
2047+ nav - If PDF is a URL, navigates to the URL in the browser first.
2048+ (Not needed because the PDF will be downloaded anyway.)
2049+ override - If True, the PDF is downloaded again even when a file
2050+ with the same name already exists in the downloaded_files/
2051+ folder. By default, an existing copy is reused. """
2052+ text = self .__fix_unicode_conversion (text )
2053+ if not codec :
2054+ codec = 'utf-8'
2055+ pdf_text = self .get_pdf_text (
2056+ pdf , page = page , maxpages = maxpages , password = password , codec = codec ,
2057+ wrap = wrap , nav = nav , override = override )
20062058 if type (page ) is int :
20072059 if text not in pdf_text :
20082060 raise Exception ("PDF [%s] is missing expected text [%s] on "
0 commit comments