Skip to content

Commit ef75426

Browse files
authored
Fix response encoding detection (#65)
1 parent 4b7134e commit ef75426

File tree

2 files changed

+45
-1
lines changed

2 files changed

+45
-1
lines changed

scrapy_playwright/handler.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222
from scrapy.responsetypes import responsetypes
2323
from scrapy.utils.defer import deferred_from_coro
2424
from scrapy.utils.misc import load_object
25+
from scrapy.utils.python import to_unicode
2526
from scrapy.utils.reactor import verify_installed_reactor
2627
from twisted.internet.defer import Deferred, inlineCallbacks
28+
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
2729

2830
from scrapy_playwright.headers import use_scrapy_headers
2931
from scrapy_playwright.page import PageCoroutine
@@ -220,7 +222,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
220222
pc.result = await method(*pc.args, **pc.kwargs)
221223
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
222224

223-
body = (await page.content()).encode("utf8")
225+
body_str = await page.content()
224226
request.meta["download_latency"] = time() - start_time
225227

226228
if request.meta.get("playwright_include_page"):
@@ -231,6 +233,8 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
231233

232234
headers = Headers(response.headers)
233235
headers.pop("Content-Encoding", None)
236+
encoding = _get_response_encoding(headers, body_str) or "utf-8"
237+
body = body_str.encode(encoding)
234238
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
235239
return respcls(
236240
url=page.url,
@@ -239,6 +243,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
239243
body=body,
240244
request=request,
241245
flags=["playwright"],
246+
encoding=encoding,
242247
)
243248

244249
def _increment_request_stats(self, request: PlaywrightRequest) -> None:
@@ -285,3 +290,13 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
285290
await route.continue_(**overrides)
286291

287292
return _request_handler
293+
294+
295+
def _get_response_encoding(headers: Headers, body: str) -> Optional[str]:
    """Return the encoding declared for a response, or None if there is none.

    The Content-Type header is checked first. When the header is missing, or
    no encoding can be obtained from it, the encoding declared inside the
    body markup is returned instead (which may itself be None).
    """
    if headers.get("content-type"):
        # Header values may be bytes; convert to text before parsing.
        header_value = to_unicode(headers["content-type"])
        from_header = http_content_type_encoding(header_value)
        if from_header:
            return from_header
    # Fall back to whatever the document itself declares.
    return html_body_declared_encoding(body)

tests/test_misc.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import pytest
2+
from scrapy.http.headers import Headers
3+
4+
from scrapy_playwright.handler import _get_response_encoding
5+
6+
7+
@pytest.mark.asyncio
async def test_get_response_encoding():
    """Check encoding detection from the header, from the body, and from neither."""
    # Case 1: the encoding is declared in the Content-Type header.
    from_header = _get_response_encoding(
        headers=Headers({"content-type": "text/html; charset=UTF-8"}),
        body="",
    )
    assert from_header == "utf-8"

    # Case 2: no header, so the <meta charset> declaration in the body is used.
    # The test expects the declared "gb2312" to be reported as "gb18030".
    html = """<!doctype html>
<html lang="cn">
<head>
<meta charset="gb2312">
</head>
</html>
"""
    from_body = _get_response_encoding(headers=Headers(), body=html)
    assert from_body == "gb18030"

    # Case 3: nothing declares an encoding.
    undeclared = _get_response_encoding(headers=Headers(), body="")
    assert undeclared is None

0 commit comments

Comments
 (0)