Skip to content

Commit fa7d5f1

Browse files
authored
Accept sync functions to process headers (#87)
1 parent afef146 commit fa7d5f1

File tree

2 files changed

+43
-39
lines changed

2 files changed

+43
-39
lines changed

README.md

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ $ pip install scrapy-playwright
3838
Please see the [changelog.md](changelog.md) file.
3939

4040

41-
## Configuration
41+
## Activation
4242

43-
Replace the default `http` and `https` Download Handlers through
43+
Replace the default `http` and/or `https` Download Handlers through
4444
[`DOWNLOAD_HANDLERS`](https://docs.scrapy.org/en/latest/topics/settings.html):
4545

4646
```python
@@ -60,7 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
6060
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
6161
```
6262

63-
### Settings
63+
## Settings
6464

6565
`scrapy-playwright` accepts the following settings:
6666

@@ -99,9 +99,10 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
9999

100100
* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)
101101

102-
The path to a coroutine function (`async def`) that processes headers for a given request
102+
A function (or the path to a function) that processes headers for a given request
103103
and returns a dictionary with the headers to be used (note that, depending on the browser,
104-
additional default headers will be sent as well).
104+
additional default headers will be sent as well). Both regular functions and coroutine
105+
functions (`async def`) are supported.
105106

106107
The function must return a `dict` object, and receives the following keyword arguments:
107108

@@ -156,13 +157,14 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
156157

157158
```
158159

159-
Please note:
160+
Please note that all requests will appear in the DEBUG level logs; however, there will
161+
be no corresponding response log lines for aborted requests. Aborted requests
162+
are counted in the `playwright/request_count/aborted` job stats item.
160163

161-
* All requests will appear in the DEBUG level logs, however there will
162-
be no corresponding response log lines for aborted requests. Aborted requests
163-
are counted in the `playwright/request_count/aborted` job stats item.
164-
* Passing callable objects is only supported when using Scrapy>=2.4. With prior
165-
versions, only strings containing object paths are supported.
164+
### General note about settings
165+
166+
For the settings which accept object paths as strings, passing callable objects is
167+
only supported when using Scrapy>=2.4. With prior versions, only strings are supported.
166168

167169

168170
## Basic usage

scrapy_playwright/handler.py

Lines changed: 30 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -41,26 +41,6 @@
4141
logger = logging.getLogger("scrapy-playwright")
4242

4343

44-
def _make_request_logger(context_name: str) -> Callable:
45-
def _log_request(request: PlaywrightRequest) -> None:
46-
logger.debug(
47-
f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
48-
f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
49-
)
50-
51-
return _log_request
52-
53-
54-
def _make_response_logger(context_name: str) -> Callable:
55-
def _log_request(response: PlaywrightResponse) -> None:
56-
logger.debug(
57-
f"[Context={context_name}] Response: <{response.status} {response.url}> "
58-
f"(referrer: {response.headers.get('referer')})"
59-
)
60-
61-
return _log_request
62-
63-
6444
class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
6545
def __init__(self, crawler: Crawler) -> None:
6646
super().__init__(settings=crawler.settings, crawler=crawler)
@@ -289,9 +269,7 @@ async def _apply_page_methods(self, page: Page, request: Request) -> None:
289269
except AttributeError:
290270
logger.warning(f"Ignoring {repr(pm)}: could not find method")
291271
else:
292-
pm.result = method(*pm.args, **pm.kwargs)
293-
if isinstance(pm.result, Awaitable):
294-
pm.result = await pm.result
272+
pm.result = await _await_if_necessary(method(*pm.args, **pm.kwargs))
295273
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
296274
else:
297275
logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}")
@@ -333,16 +311,14 @@ def _make_request_handler(
333311
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
334312
"""Override request headers, method and body."""
335313
if self.abort_request:
336-
should_abort = self.abort_request(playwright_request)
337-
if isinstance(should_abort, Awaitable):
338-
should_abort = await should_abort
314+
should_abort = await _await_if_necessary(self.abort_request(playwright_request))
339315
if should_abort:
340316
await route.abort()
341317
self.stats.inc_value("playwright/request_count/aborted")
342318
return None
343319

344-
processed_headers = await self.process_request_headers(
345-
self.browser_type, playwright_request, scrapy_headers
320+
processed_headers = await _await_if_necessary(
321+
self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
346322
)
347323

348324
# the request that reaches the callback should contain the headers that were sent
@@ -368,6 +344,32 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
368344
return _request_handler
369345

370346

347+
async def _await_if_necessary(obj):
    """Resolve ``obj`` to a plain value.

    Objects that pass an ``isinstance`` check against ``Awaitable`` are
    awaited and their result is returned; anything else is handed back
    unchanged. This allows the result of either a regular function or a
    coroutine function to be consumed through a single ``await``.
    """
    return await obj if isinstance(obj, Awaitable) else obj
351+
352+
353+
def _make_request_logger(context_name: str) -> Callable:
    """Build a callback that logs a Playwright request at DEBUG level.

    Args:
        context_name: Name of the browser context; included in every message.

    Returns:
        A function that takes a ``PlaywrightRequest`` and returns ``None``.
    """

    def _log_request(request: PlaywrightRequest) -> None:
        # Hand the values to the logger as arguments instead of pre-building
        # the string, so the message is only formatted when DEBUG is enabled.
        logger.debug(
            "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)",
            context_name,
            request.method.upper(),
            request.url,
            request.resource_type,
            request.headers.get("referer"),
        )

    return _log_request
361+
362+
363+
def _make_response_logger(context_name: str) -> Callable:
    """Build a callback that logs a Playwright response at DEBUG level.

    Args:
        context_name: Name of the browser context; included in every message.

    Returns:
        A function that takes a ``PlaywrightResponse`` and returns ``None``.
    """

    def _log_response(response: PlaywrightResponse) -> None:
        # Hand the values to the logger as arguments instead of pre-building
        # the string, so the message is only formatted when DEBUG is enabled.
        logger.debug(
            "[Context=%s] Response: <%s %s> (referrer: %s)",
            context_name,
            response.status,
            response.url,
            response.headers.get("referer"),
        )

    return _log_response
371+
372+
371373
def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
372374
if headers.get("content-type"):
373375
content_type = to_unicode(headers["content-type"])

0 commit comments

Comments
 (0)