@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128 return b"" .join (rv )
129129
130130
131- def HTMLInputStream (source , encoding = None , parseMeta = True , chardet = True ):
131+ def HTMLInputStream (source , ** kwargs ):
132132 # Work around Python bug #20007: read(0) closes the connection.
133133 # http://bugs.python.org/issue20007
134134 if (isinstance (source , http_client .HTTPResponse ) or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142 isUnicode = isinstance (source , text_type )
143143
144144 if isUnicode :
145- if encoding is not None :
146- raise TypeError ("Cannot explicitly set an encoding with a unicode string" )
145+ encodings = [x for x in kwargs if x .endswith ("_encoding" )]
146+ if encodings :
147+ raise TypeError ("Cannot set an encoding with a unicode input, set %r" % encodings )
147148
148- return HTMLUnicodeInputStream (source )
149+ return HTMLUnicodeInputStream (source , ** kwargs )
149150 else :
150- return HTMLBinaryInputStream (source , encoding , parseMeta , chardet )
151+ return HTMLBinaryInputStream (source , ** kwargs )
151152
152153
153154class HTMLUnicodeInputStream (object ):
@@ -173,8 +174,6 @@ def __init__(self, source):
173174 regardless of any BOM or later declaration (such as in a meta
174175 element)
175176
176- parseMeta - Look for a <meta> element containing encoding information
177-
178177 """
179178
180179 if not utils .supports_lone_surrogates :
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390389
391390 """
392391
393- def __init__ (self , source , encoding = None , parseMeta = True , chardet = True ):
392+ def __init__ (self , source , override_encoding = None , transport_encoding = None ,
393+ same_origin_parent_encoding = None , likely_encoding = None ,
394+ default_encoding = "windows-1252" , useChardet = True ):
394395 """Initialises the HTMLInputStream.
395396
396397 HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403404 regardless of any BOM or later declaration (such as in a meta
404405 element)
405406
406- parseMeta - Look for a <meta> element containing encoding information
407-
408407 """
409408 # Raw Stream - for unicode objects this will encode to utf-8 and set
410409 # self.charEncoding as appropriate
411410 self .rawStream = self .openStream (source )
412411
413412 HTMLUnicodeInputStream .__init__ (self , self .rawStream )
414413
415- self .charEncoding = (lookupEncoding (encoding ), "certain" )
416-
417414 # Encoding Information
418415 # Number of bytes to use when looking for a meta element with
419416 # encoding information
420417 self .numBytesMeta = 1024
421418 # Number of bytes to use when detecting encoding using chardet
422419 self .numBytesChardet = 100
423- # Encoding to use if no other information can be found
424- self .defaultEncoding = "windows-1252"
420+ # Things from args
421+ self .override_encoding = override_encoding
422+ self .transport_encoding = transport_encoding
423+ self .same_origin_parent_encoding = same_origin_parent_encoding
424+ self .likely_encoding = likely_encoding
425+ self .default_encoding = default_encoding
425426
426- # Detect encoding iff no explicit "transport level" encoding is supplied
427- if (self .charEncoding [0 ] is None ):
428- self .charEncoding = self .detectEncoding (parseMeta , chardet )
429- assert self .charEncoding [0 ] is not None
427+ # Determine encoding
428+ self .charEncoding = self .determineEncoding (useChardet )
429+ assert self .charEncoding [0 ] is not None
430430
431431 # Call superclass
432432 self .reset ()
@@ -454,21 +454,45 @@ def openStream(self, source):
454454
455455 return stream
456456
457- def detectEncoding (self , parseMeta = True , chardet = True ):
458- # First look for a BOM
457+ def determineEncoding (self , chardet = True ):
458+ # BOMs take precedence over everything
459459 # This will also read past the BOM if present
460- encoding = self .detectBOM ()
461- confidence = "certain"
462- # If there is no BOM need to look for meta elements with encoding
463- # information
464- if encoding is None and parseMeta :
465- encoding = self .detectEncodingMeta ()
466- confidence = "tentative"
460+ charEncoding = self .detectBOM (), "certain"
461+ if charEncoding [0 ] is not None :
462+ return charEncoding
463+
464+ # If we've been overridden, we've been overridden
465+ charEncoding = lookupEncoding (self .override_encoding ), "certain"
466+ if charEncoding [0 ] is not None :
467+ return charEncoding
468+
469+ # Now check the transport layer
470+ charEncoding = lookupEncoding (self .transport_encoding ), "certain"
471+ if charEncoding [0 ] is not None :
472+ return charEncoding
473+
474+ # Look for meta elements with encoding information
475+ charEncoding = self .detectEncodingMeta (), "tentative"
476+ if charEncoding [0 ] is not None :
477+ return charEncoding
478+
479+ # Parent document encoding
480+ charEncoding = lookupEncoding (self .same_origin_parent_encoding ), "tentative"
481+ if charEncoding [0 ] is not None and not charEncoding [0 ].name .startswith ("utf-16" ):
482+ return charEncoding
483+
484+ # "likely" encoding
485+ charEncoding = lookupEncoding (self .likely_encoding ), "tentative"
486+ if charEncoding [0 ] is not None :
487+ return charEncoding
488+
467489 # Guess with chardet, if available
468- if encoding is None and chardet :
469- confidence = "tentative"
490+ if chardet :
470491 try :
471492 from chardet .universaldetector import UniversalDetector
493+ except ImportError :
494+ pass
495+ else :
472496 buffers = []
473497 detector = UniversalDetector ()
474498 while not detector .done :
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481505 detector .close ()
482506 encoding = lookupEncoding (detector .result ['encoding' ])
483507 self .rawStream .seek (0 )
484- except ImportError :
485- pass
486- # If all else fails use the default encoding
487- if encoding is None :
488- confidence = "tentative"
489- encoding = lookupEncoding (self .defaultEncoding )
508+ if encoding is not None :
509+ return encoding , "tentative"
510+
511+ # Try the default encoding
512+ charEncoding = lookupEncoding (self .default_encoding ), "tentative"
513+ if charEncoding [0 ] is not None :
514+ return charEncoding
490515
491- return encoding , confidence
516+ # Fall back to html5lib's default if even that hasn't worked
517+ return lookupEncoding ("windows-1252" ), "tentative"
492518
493519 def changeEncoding (self , newEncoding ):
494520 assert self .charEncoding [1 ] != "certain"
0 commit comments