2222from scrapy .responsetypes import responsetypes
2323from scrapy .utils .defer import deferred_from_coro
2424from scrapy .utils .misc import load_object
25+ from scrapy .utils .python import to_unicode
2526from scrapy .utils .reactor import verify_installed_reactor
2627from twisted .internet .defer import Deferred , inlineCallbacks
28+ from w3lib .encoding import html_body_declared_encoding , http_content_type_encoding
2729
2830from scrapy_playwright .headers import use_scrapy_headers
2931from scrapy_playwright .page import PageCoroutine
@@ -220,7 +222,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
220222 pc .result = await method (* pc .args , ** pc .kwargs )
221223 await page .wait_for_load_state (timeout = self .default_navigation_timeout )
222224
223- body = ( await page .content ()). encode ( "utf8" )
225+ body_str = await page .content ()
224226 request .meta ["download_latency" ] = time () - start_time
225227
226228 if request .meta .get ("playwright_include_page" ):
@@ -231,6 +233,8 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
231233
232234 headers = Headers (response .headers )
233235 headers .pop ("Content-Encoding" , None )
236+ encoding = _get_response_encoding (headers , body_str ) or "utf-8"
237+ body = body_str .encode (encoding )
234238 respcls = responsetypes .from_args (headers = headers , url = page .url , body = body )
235239 return respcls (
236240 url = page .url ,
@@ -239,6 +243,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
239243 body = body ,
240244 request = request ,
241245 flags = ["playwright" ],
246+ encoding = encoding ,
242247 )
243248
244249 def _increment_request_stats (self , request : PlaywrightRequest ) -> None :
@@ -285,3 +290,13 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
285290 await route .continue_ (** overrides )
286291
287292 return _request_handler
293+
294+
def _get_response_encoding(headers: Headers, body: str) -> Optional[str]:
    """Determine the character encoding to use for a response.

    The ``Content-Type`` header is consulted first, via
    ``http_content_type_encoding``. When the header is missing or empty, or
    yields no encoding, the result of ``html_body_declared_encoding`` applied
    to the body is returned instead.

    :param headers: response headers, looked up by the ``content-type`` key.
    :param body: the page content as text.
    :return: the encoding reported by whichever lookup produced a value;
        may be ``None`` if neither did, leaving the default to the caller.
    """
    if headers.get("content-type"):
        header_text = to_unicode(headers["content-type"])
        from_header = http_content_type_encoding(header_text)
        if from_header:
            # The header gave a usable encoding; no need to inspect the body.
            return from_header
    # No usable header value: fall back to what the document itself declares.
    return html_body_declared_encoding(body)
0 commit comments