@@ -84,6 +84,7 @@ def initialize(httpv, code, msg) #:nodoc: internal use only
8484 @read = false
8585 @uri = nil
8686 @decode_content = false
87+ @body_encoding = false
8788 end
8889
8990 # The HTTP version supported by the server.
@@ -106,6 +107,18 @@ def initialize(httpv, code, msg) #:nodoc: internal use only
106107 # Accept-Encoding header from the user.
107108 attr_accessor :decode_content
108109
# The encoding to apply to the response body. When set to an Encoding, the
# body is forced to that encoding. When set to any other truthy value, the
# encoding is detected from the response and then applied. When false or
# nil, the body's encoding is left untouched.
113+ attr_reader :body_encoding
114+
# Assigns the encoding to apply to the response body. A String is resolved
# to its Encoding with Encoding.find; any other value is stored unchanged.
def body_encoding=(value)
  @body_encoding =
    if value.is_a?(String)
      Encoding.find(value)
    else
      value
    end
end
121+
# Returns a short summary of the response: its class, status code, status
# message and whether the body has been read, in Ruby's usual "#<...>" form
# (no stray space before the closing bracket).
def inspect
  "#<#{self.class} #{@code} #{@message} readbody=#{@read}>"
end
@@ -214,6 +227,17 @@ def read_body(dest = nil, &block)
214227 end
215228 @read = true
216229
230+ case enc = @body_encoding
231+ when Encoding , false , nil
232+ # Encoding: force given encoding
233+ # false/nil: do not force encoding
234+ else
235+ # other value: detect encoding from body
236+ enc = detect_encoding ( @body )
237+ end
238+
239+ @body . force_encoding ( enc ) if enc
240+
217241 @body
218242 end
219243
@@ -245,6 +269,141 @@ def body=(value)
245269
246270 private
247271
# :nodoc:
# Determines the character encoding of the body +str+.
#
# Sources are consulted in order and the first hit wins:
# 1. the explicit +encoding+ argument,
# 2. the +charset+ entry of type_params,
# 3. a byte order mark at the start of +str+ (check_bom),
# 4. the media type returned by content_type:
#    * XML types: the encoding named in a leading XML declaration,
#      falling back to UTF-8 when there is none,
#    * text/html: whatever sniff_encoding returns.
#
# Returns an Encoding, an encoding name String, or nil when nothing could
# be determined.
def detect_encoding(str, encoding = nil)
  encoding ||= type_params['charset'] || check_bom(str)
  return encoding if encoding

  case content_type&.downcase
  when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
    # An XML declaration opens with "<?xml" (XML 1.0, production XMLDecl).
    # Whitespace inside the character classes is literal even under /x.
    declaration = /\A<\?xml[ \t\r\n]+
      version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
      encoding[ \t\r\n]*=[ \t\r\n]*
      (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x.match(str)
    (declaration && (declaration[1] || declaration[2])) || Encoding::UTF_8
  when %r{text/html.*}
    sniff_encoding(str)
  end
end
291+
# :nodoc:
# Guesses the encoding of an HTML body, loosely following the encoding
# sniffing algorithm:
# http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
#
# Returns the encoding found by scanning_meta if there is one, otherwise
# US-ASCII for a 7-bit body, otherwise UTF-8 when the bytes form valid
# UTF-8, otherwise nil. The +encoding+ argument is accepted but unused.
def sniff_encoding(str, encoding = nil)
  declared = scanning_meta(str)
  return declared if declared
  # Steps 6 (last visited page or something) and 7 (frequency) of the
  # algorithm are not implemented.
  return Encoding::US_ASCII if str.ascii_only?

  # Validate on a copy so the caller's string keeps its encoding.
  utf8_copy = str.dup
  utf8_copy.force_encoding(Encoding::UTF_8)
  # Step 8 (implementation-defined or user-specified) is left to the
  # caller: nil is returned when nothing fits.
  Encoding::UTF_8 if utf8_copy.valid_encoding?
end
307+
# :nodoc:
# Looks for a byte order mark at the start of +str+.
#
# Returns Encoding::UTF_8, Encoding::UTF_16BE or Encoding::UTF_16LE for the
# matching BOM, or nil when +str+ does not start with one.
#
# The comparison is done on byte values rather than on String literals:
# String equality is false for strings with non-ASCII bytes whose encodings
# differ, so a literal such as "\xFE\xFF" would never equal the slice of a
# binary-tagged body.
def check_bom(str)
  head = str.byteslice(0, 3).bytes
  return Encoding::UTF_8 if head == [0xEF, 0xBB, 0xBF]

  case head.first(2)
  when [0xFE, 0xFF] then Encoding::UTF_16BE
  when [0xFF, 0xFE] then Encoding::UTF_16LE
  end
end
321+
# :nodoc:
# Scans +str+ for <meta> elements declaring a character encoding, modelled
# on the HTML "prescan a byte stream to determine its encoding" algorithm.
#
# Every <meta> tag is examined in document order (the tag name is matched
# case-insensitively and must be followed by whitespace or "/"). A tag
# declares an encoding either through a +charset+ attribute, or through a
# +content+ attribute containing "charset=..." together with
# http-equiv="content-type". Tags that declare nothing, or that name an
# encoding Ruby does not know, are skipped.
#
# Returns the declared Encoding (UTF-16 variants are reported as UTF-8,
# because a declaration that could be read byte-wise cannot be UTF-16), or
# nil when no usable declaration exists.
def scanning_meta(str)
  require 'strscan' # loaded lazily: only needed when sniffing HTML
  ss = StringScanner.new(str)

  while ss.scan_until(/<meta[\t\n\f\r \/]+/i)
    seen = {} # attribute names already processed for this tag
    got_pragma = false
    need_pragma = nil
    charset = nil

    # step: Attributes
    while (attr = get_attribute(ss))
      name, value = attr
      next if seen[name] # only the first occurrence of an attribute counts
      seen[name] = true
      case name
      when 'http-equiv'
        got_pragma = true if value == 'content-type'
      when 'content'
        encoding = extracting_encodings_from_meta_elements(value)
        # A content attribute only matters when it yields an encoding and
        # no charset is known yet.
        if encoding && charset.nil?
          charset = encoding
          need_pragma = true
        end
      when 'charset'
        charset = value
        need_pragma = false
      end
    end

    # step: Processing
    next if need_pragma.nil?
    next if need_pragma && !got_pragma

    begin
      encoding = Encoding.find(charset)
    rescue ArgumentError
      next # unknown encoding name: try the next <meta>
    end
    next unless encoding

    utf16 = [Encoding::UTF_16, Encoding::UTF_16BE, Encoding::UTF_16LE]
    return utf16.include?(encoding) ? Encoding::UTF_8 : encoding # tentative
  end
  nil
end
363+
# Reads the next attribute of the tag at the current position of the
# StringScanner +ss+.
#
# Returns a two-element Array [name, value], both downcased; value is ''
# for an attribute without a value. Returns nil when there is no further
# attribute: at the closing ">" (which is consumed), at the end of the
# input, or when no attribute name can be read (e.g. a stray "=").
# Malformed or truncated markup never raises.
def get_attribute(ss)
  ss.skip(/[\t\n\f\r \/]*/)
  return nil if ss.eos?

  if ss.peek(1) == '>'
    ss.getch
    return nil
  end

  name = ss.scan(/[^=\t\n\f\r \/>]*/).downcase
  return nil if name.empty?

  ss.skip(/[\t\n\f\r ]*/)
  # Valueless attribute: leave the following character (">" or the start of
  # the next attribute) for the next call instead of swallowing it.
  return [name, ''] unless ss.peek(1) == '='

  ss.getch
  ss.skip(/[\t\n\f\r ]*/)

  value =
    case (quote = ss.peek(1))
    when '"', "'"
      ss.getch
      # "*" rather than "+" so an empty quoted value gives '' and not nil.
      quoted = ss.scan(quote == '"' ? /[^"]*/ : /[^']*/)
      ss.getch # closing quote; harmless at end of input
      quoted
    when '>', ''
      '' # "name=" directly followed by ">" or by the end of the input
    else
      ss.scan(/[^\t\n\f\r >]+/)
    end
  [name, value.downcase]
end
398+
# Extracts the encoding name from the value of a <meta content="...">
# attribute, e.g. "text/html; charset=utf-8" gives "utf-8".
# http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
#
# Whitespace is allowed on both sides of "=". The name may be wrapped in
# single or double quotes; unquoted, it ends at whitespace or ";".
#
# Returns the name as a String (case preserved), or nil when +value+ holds
# no charset, the charset is empty, or a quote is left unterminated.
def extracting_encodings_from_meta_elements(value)
  pattern = /charset[\t\n\f\r ]*=[\t\n\f\r ]*(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i
  found = pattern.match(value)
  found && (found[1] || found[2] || found[3])
end
406+
248407 ##
249408 # Checks for a supported Content-Encoding header and yields an Inflate
250409 # wrapper for this response's socket when zlib is present. If the
0 commit comments