@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128 return b"" .join (rv )
129129
130130
131- def HTMLInputStream (source , encoding = None , parseMeta = True , chardet = True ):
131+ def HTMLInputStream (source , override_encoding = None , ** kwargs ):
132132 # Work around Python bug #20007: read(0) closes the connection.
133133 # http://bugs.python.org/issue20007
134134 if (isinstance (source , http_client .HTTPResponse ) or
@@ -142,12 +142,12 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142 isUnicode = isinstance (source , text_type )
143143
144144 if isUnicode :
145- if encoding is not None :
146- raise TypeError ("Cannot explicitly set an encoding with a unicode string " )
145+ if override_encoding is not None :
146+ raise TypeError ("Cannot set an override encoding with a unicode input " )
147147
148148 return HTMLUnicodeInputStream (source )
149149 else :
150- return HTMLBinaryInputStream (source , encoding , parseMeta , chardet )
150+ return HTMLBinaryInputStream (source , override_encoding = override_encoding , ** kwargs )
151151
152152
153153class HTMLUnicodeInputStream (object ):
@@ -173,8 +173,6 @@ def __init__(self, source):
173173 regardless of any BOM or later declaration (such as in a meta
174174 element)
175175
176- parseMeta - Look for a <meta> element containing encoding information
177-
178176 """
179177
180178 if not utils .supports_lone_surrogates :
@@ -390,7 +388,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390388
391389 """
392390
393- def __init__ (self , source , encoding = None , parseMeta = True , chardet = True ):
391+ def __init__ (self , source , override_encoding = None , transport_encoding = None ,
392+ same_origin_parent_encoding = None , likely_encoding = None ,
393+ default_encoding = "windows-1252" , useChardet = True ):
394394 """Initialises the HTMLInputStream.
395395
396396 HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +403,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403403 regardless of any BOM or later declaration (such as in a meta
404404 element)
405405
406- parseMeta - Look for a <meta> element containing encoding information
407-
408406 """
409407 # Raw Stream - for unicode objects this will encode to utf-8 and set
410408 # self.charEncoding as appropriate
411409 self .rawStream = self .openStream (source )
412410
413411 HTMLUnicodeInputStream .__init__ (self , self .rawStream )
414412
415- self .charEncoding = (lookupEncoding (encoding ), "certain" )
416-
417413 # Encoding Information
418414 # Number of bytes to use when looking for a meta element with
419415 # encoding information
420416 self .numBytesMeta = 1024
421417 # Number of bytes to use when using detecting encoding using chardet
422418 self .numBytesChardet = 100
423- # Encoding to use if no other information can be found
424- self .defaultEncoding = "windows-1252"
419+ # Things from args
420+ self .override_encoding = override_encoding
421+ self .transport_encoding = transport_encoding
422+ self .same_origin_parent_encoding = same_origin_parent_encoding
423+ self .likely_encoding = likely_encoding
424+ self .default_encoding = default_encoding
425425
426- # Detect encoding iff no explicit "transport level" encoding is supplied
427- if (self .charEncoding [0 ] is None ):
428- self .charEncoding = self .detectEncoding (parseMeta , chardet )
429- assert self .charEncoding [0 ] is not None
426+ # Determine encoding
427+ self .charEncoding = self .determineEncoding (useChardet )
428+ assert self .charEncoding [0 ] is not None
430429
431430 # Call superclass
432431 self .reset ()
@@ -454,21 +453,45 @@ def openStream(self, source):
454453
455454 return stream
456455
457- def detectEncoding (self , parseMeta = True , chardet = True ):
458- # First look for a BOM
456+ def determineEncoding (self , chardet = True ):
457+ # BOMs take precedence over everything
459458 # This will also read past the BOM if present
460- encoding = self .detectBOM ()
461- confidence = "certain"
462- # If there is no BOM need to look for meta elements with encoding
463- # information
464- if encoding is None and parseMeta :
465- encoding = self .detectEncodingMeta ()
466- confidence = "tentative"
459+ charEncoding = self .detectBOM (), "certain"
460+ if charEncoding [0 ] is not None :
461+ return charEncoding
462+
463+ # If we've been overridden, we've been overridden
464+ charEncoding = lookupEncoding (self .override_encoding ), "certain"
465+ if charEncoding [0 ] is not None :
466+ return charEncoding
467+
468+ # Now check the transport layer
469+ charEncoding = lookupEncoding (self .transport_encoding ), "certain"
470+ if charEncoding [0 ] is not None :
471+ return charEncoding
472+
473+ # Look for meta elements with encoding information
474+ charEncoding = self .detectEncodingMeta (), "tentative"
475+ if charEncoding [0 ] is not None :
476+ return charEncoding
477+
478+ # Parent document encoding
479+ charEncoding = lookupEncoding (self .same_origin_parent_encoding ), "tentative"
480+ if charEncoding [0 ] is not None and not charEncoding [0 ].name .startswith ("utf-16" ):
481+ return charEncoding
482+
483+ # "likely" encoding
484+ charEncoding = lookupEncoding (self .likely_encoding ), "tentative"
485+ if charEncoding [0 ] is not None :
486+ return charEncoding
487+
467488 # Guess with chardet, if available
468- if encoding is None and chardet :
469- confidence = "tentative"
489+ if chardet :
470490 try :
471491 from chardet .universaldetector import UniversalDetector
492+ except ImportError :
493+ pass
494+ else :
472495 buffers = []
473496 detector = UniversalDetector ()
474497 while not detector .done :
@@ -481,14 +504,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481504 detector .close ()
482505 encoding = lookupEncoding (detector .result ['encoding' ])
483506 self .rawStream .seek (0 )
484- except ImportError :
485- pass
486- # If all else fails use the default encoding
487- if encoding is None :
488- confidence = "tentative"
489- encoding = lookupEncoding (self .defaultEncoding )
507+ if encoding is not None :
508+ return encoding , "tentative"
509+
510+ # Try the default encoding
511+ charEncoding = lookupEncoding (self .default_encoding ), "tentative"
512+ if charEncoding [0 ] is not None :
513+ return charEncoding
490514
491- return encoding , confidence
515+ # Fall back to html5lib's default if even that hasn't worked
516+ return lookupEncoding ("windows-1252" ), "tentative"
492517
493518 def changeEncoding (self , newEncoding ):
494519 assert self .charEncoding [1 ] != "certain"
0 commit comments