Skip to content

Commit d08a944

Browse files
authored
Avoid UnicodeEncodeError: try multiple body encodings (#72)
1 parent d79aa5b commit d08a944

File tree

3 files changed

+78
-38
lines changed

3 files changed

+78
-38
lines changed

scrapy_playwright/handler.py

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
6 6
from inspect import isawaitable
7 7
from ipaddress import ip_address
8 8
from time import time
9-
from typing import Callable, Dict, Optional, Type, TypeVar
9+
from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar
10 10

11 11
from playwright.async_api import (
12 12
BrowserContext,
@@ -283,8 +283,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
283 283

284 284
headers = Headers(response.headers)
285 285
headers.pop("Content-Encoding", None)
286-
encoding = _get_response_encoding(headers, body_str) or "utf-8"
287-
body = body_str.encode(encoding)
286+
body, encoding = _encode_body(headers=headers, text=body_str)
288 287
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
289 288
return respcls(
290 289
url=page.url,
@@ -352,11 +351,19 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
352 351
return _request_handler
353 352

354 353

355-
def _get_response_encoding(headers: Headers, body: str) -> Optional[str]:
356-
encoding = None
354+
def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
357 355
if headers.get("content-type"):
358 356
content_type = to_unicode(headers["content-type"])
359-
encoding = http_content_type_encoding(content_type)
360-
if not encoding:
361-
encoding = html_body_declared_encoding(body)
362-
return encoding
357+
yield http_content_type_encoding(content_type)
358+
yield html_body_declared_encoding(text)
359+
360+
361+
def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
362+
for encoding in filter(None, _possible_encodings(headers, text)):
363+
try:
364+
body = text.encode(encoding)
365+
except UnicodeEncodeError:
366+
pass
367+
else:
368+
return body, encoding
369+
return text.encode("utf-8"), "utf-8" # fallback

tests/test_encoding.py

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,62 @@
1+
import pytest
2+
from scrapy.http.headers import Headers
3+
4+
from scrapy_playwright.handler import _encode_body
5+
6+
7+
def body_str(charset: str, content: str = "áéíóú") -> str:
8+
return f"""
9+
<!doctype html>
10+
<html>
11+
<head>
12+
<meta charset="{charset}">
13+
</head>
14+
<body>
15+
<p>{content}</p>
16+
</body>
17+
</html>
18+
""".strip()
19+
20+
21+
@pytest.mark.asyncio
22+
async def test_encode_from_headers():
23+
"""Charset declared in headers takes precedence"""
24+
text = body_str("gb2312")
25+
body, encoding = _encode_body(
26+
headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
27+
text=text,
28+
)
29+
assert encoding == "cp1252"
30+
assert body == text.encode(encoding)
31+
32+
33+
@pytest.mark.asyncio
34+
async def test_encode_from_body():
35+
"""No charset declared in headers, use the one declared in the body"""
36+
text = body_str("gb2312")
37+
body, encoding = _encode_body(headers=Headers({}), text=text)
38+
assert encoding == "gb18030"
39+
assert body == text.encode(encoding)
40+
41+
42+
@pytest.mark.asyncio
43+
async def test_encode_fallback():
44+
"""No charset declared, use utf-8 as fallback"""
45+
text = "<html>áéíóú</html>"
46+
body, encoding = _encode_body(headers=Headers(), text=text)
47+
assert encoding == "utf-8"
48+
assert body == text.encode(encoding)
49+
50+
51+
@pytest.mark.asyncio
52+
async def test_encode_mismatch():
53+
"""Charset declared in headers and body do not match, and the headers
54+
one fails to encode: use the one in the body (first one that works)
55+
"""
56+
text = body_str("gb2312", content="空手道")
57+
body, encoding = _encode_body(
58+
headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
59+
text=text,
60+
)
61+
assert encoding == "gb18030"
62+
assert body == text.encode(encoding)

tests/test_misc.py

Lines changed: 0 additions & 29 deletions
This file was deleted.

0 commit comments

Comments
 (0)