|
6 | 6 | from inspect import isawaitable |
7 | 7 | from ipaddress import ip_address |
8 | 8 | from time import time |
9 | | -from typing import Callable, Dict, Optional, Type, TypeVar |
| 9 | +from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar |
10 | 10 |
|
11 | 11 | from playwright.async_api import ( |
12 | 12 | BrowserContext, |
@@ -283,8 +283,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res |
283 | 283 |
|
284 | 284 | headers = Headers(response.headers) |
285 | 285 | headers.pop("Content-Encoding", None) |
286 | | - encoding = _get_response_encoding(headers, body_str) or "utf-8" |
287 | | - body = body_str.encode(encoding) |
| 286 | + body, encoding = _encode_body(headers=headers, text=body_str) |
288 | 287 | respcls = responsetypes.from_args(headers=headers, url=page.url, body=body) |
289 | 288 | return respcls( |
290 | 289 | url=page.url, |
@@ -352,11 +351,19 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) |
352 | 351 | return _request_handler |
353 | 352 |
|
354 | 353 |
|
355 | | -def _get_response_encoding(headers: Headers, body: str) -> Optional[str]: |
356 | | - encoding = None |
| 354 | +def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]: |
357 | 355 | if headers.get("content-type"): |
358 | 356 | content_type = to_unicode(headers["content-type"]) |
359 | | - encoding = http_content_type_encoding(content_type) |
360 | | - if not encoding: |
361 | | - encoding = html_body_declared_encoding(body) |
362 | | - return encoding |
| 357 | + yield http_content_type_encoding(content_type) |
| 358 | + yield html_body_declared_encoding(text) |
| 359 | + |
| 360 | + |
| 361 | +def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]: |
| 362 | + for encoding in filter(None, _possible_encodings(headers, text)): |
| 363 | + try: |
| 364 | + body = text.encode(encoding) |
| 365 | + except UnicodeEncodeError: |
| 366 | + pass |
| 367 | + else: |
| 368 | + return body, encoding |
| 369 | + return text.encode("utf-8"), "utf-8" # fallback |
0 commit comments