Skip to content

Commit f50e7df

Browse files
elacuesta and Gallaecio
authored
Process request headers (#38)
* Process request headers * Typing fix * Rename private request handler * Additional typing fix * Clarifying comment * scrapy_playwright/handler.py: fix typo: reches → reaches * Add test/docstring * Docs for PLAYWRIGHT_PROCESS_REQUEST_HEADERS * Update readme Co-authored-by: Adrián Chaves <adrian@chaves.io>
1 parent 2ca67f3 commit f50e7df

File tree

4 files changed

+140
-21
lines changed

4 files changed

+140
-21
lines changed

README.md

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
103103
the default value will be used (30000 ms at the time of writing this).
104104
See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser_contextset_default_navigation_timeouttimeout).
105105

106+
* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str`, default `scrapy_playwright.headers.use_scrapy_headers`)
107+
108+
The path to a coroutine function (`async def`) that processes headers for a given request
109+
and returns a dictionary with the headers to be used (note that, depending on the browser,
110+
additional default headers will be sent as well).
111+
112+
The function must return a `dict` object, and receives the following keyword arguments:
113+
114+
```python
115+
browser_type: str, playwright_request: playwright.async_api.Request, scrapy_headers: scrapy.http.headers.Headers
116+
```
117+
118+
The default value (`scrapy_playwright.headers.use_scrapy_headers`) tries to emulate Scrapy's
119+
behaviour for navigation requests, i.e. overriding headers with their values from the Scrapy request.
120+
For non-navigation requests (e.g. images, stylesheets, scripts, etc), only the `User-Agent` header
121+
is overridden, for consistency.
122+
123+
There is another function available: `scrapy_playwright.headers.use_playwright_headers`,
124+
which will return the headers from the Playwright request without any changes.
106125

107126
## Basic usage
108127

@@ -135,8 +154,8 @@ class AwesomeSpider(scrapy.Spider):
135154
By default, outgoing requests include the `User-Agent` set by Scrapy (either with the
136155
`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute).
137156
This could cause some sites to react in unexpected ways, for instance if the user agent
138-
does not match the Browser being used. If you prefer to send the `User-Agent` from the Browser,
139-
set the Scrapy user agent to `None`.
157+
does not match the running Browser. If you prefer the `User-Agent` sent by
158+
default by the specific browser you're using, set the Scrapy user agent to `None`.
140159

141160

142161
## Receiving the Page object in the callback

scrapy_playwright/handler.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from contextlib import suppress
66
from time import time
77
from typing import Callable, Dict, Optional, Type, TypeVar
8-
from urllib.parse import urlparse
98

109
from playwright.async_api import (
1110
BrowserContext,
@@ -22,9 +21,11 @@
2221
from scrapy.http.headers import Headers
2322
from scrapy.responsetypes import responsetypes
2423
from scrapy.utils.defer import deferred_from_coro
24+
from scrapy.utils.misc import load_object
2525
from scrapy.utils.reactor import verify_installed_reactor
2626
from twisted.internet.defer import Deferred, inlineCallbacks
2727

28+
from scrapy_playwright.headers import use_scrapy_headers
2829
from scrapy_playwright.page import PageCoroutine
2930

3031

@@ -66,13 +67,21 @@ def __init__(self, crawler: Crawler) -> None:
6667

6768
self.browser_type: str = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE") or "chromium"
6869
self.launch_options: dict = crawler.settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
70+
6971
self.default_navigation_timeout: Optional[float] = None
7072
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
7173
with suppress(TypeError, ValueError):
7274
self.default_navigation_timeout = float(
7375
crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
7476
)
7577

78+
if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
79+
self.process_request_headers = load_object(
80+
crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
81+
)
82+
else:
83+
self.process_request_headers = use_scrapy_headers
84+
7685
default_context_kwargs: dict = {}
7786
if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings:
7887
default_context_kwargs = crawler.settings.getdict("PLAYWRIGHT_CONTEXT_ARGS")
@@ -180,9 +189,8 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
180189
await page.route(
181190
"**",
182191
self._make_request_handler(
183-
url=request.url,
184192
method=request.method,
185-
headers=request.headers.to_unicode_dict(),
193+
scrapy_headers=request.headers,
186194
body=request.body,
187195
encoding=getattr(request, "encoding", None),
188196
),
@@ -249,23 +257,24 @@ def close_browser_context_callback() -> None:
249257
return close_browser_context_callback
250258

251259
def _make_request_handler(
252-
self, url: str, method: str, headers: dict, body: Optional[bytes], encoding: str = "utf8"
260+
self, method: str, scrapy_headers: Headers, body: Optional[bytes], encoding: str = "utf8"
253261
) -> Callable:
254-
def request_handler(route: Route, pw_request: PlaywrightRequest) -> None:
262+
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
255263
"""Override request headers, method and body."""
256-
headers.setdefault("user-agent", pw_request.headers.get("user-agent"))
257-
if pw_request.url == url:
258-
overrides: dict = {"method": method, "headers": headers}
264+
processed_headers = await self.process_request_headers(
265+
self.browser_type, playwright_request, scrapy_headers
266+
)
267+
268+
# the request that reaches the callback should contain the headers that were sent
269+
scrapy_headers.clear()
270+
scrapy_headers.update(processed_headers)
271+
272+
overrides: dict = {"headers": processed_headers}
273+
if playwright_request.is_navigation_request():
274+
overrides["method"] = method
259275
if body is not None:
260276
overrides["post_data"] = body.decode(encoding)
261-
if self.browser_type == "firefox":
262-
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
263-
overrides["headers"]["host"] = urlparse(pw_request.url).netloc
264-
else:
265-
overrides = {"headers": pw_request.headers.copy()}
266-
# override user agent, for consistency with other requests
267-
if headers.get("user-agent"):
268-
overrides["headers"]["user-agent"] = headers["user-agent"]
269-
asyncio.create_task(route.continue_(**overrides))
270-
271-
return request_handler
277+
278+
await route.continue_(**overrides)
279+
280+
return _request_handler

scrapy_playwright/headers.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from urllib.parse import urlparse
2+
3+
from playwright.async_api import Request as PlaywrightRequest
4+
from scrapy.http.headers import Headers
5+
6+
7+
"""
8+
This module includes functions to process request headers.
9+
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
10+
"""
11+
12+
13+
async def use_scrapy_headers(
14+
browser_type: str,
15+
playwright_request: PlaywrightRequest,
16+
scrapy_headers: Headers,
17+
) -> dict:
18+
"""Scrapy headers take precedence over Playwright headers for navigation requests.
19+
For non-navigation requests, only User-Agent is taken from the Scrapy headers."""
20+
21+
headers = scrapy_headers.to_unicode_dict()
22+
23+
# Scrapy's user agent has priority over Playwright's
24+
headers.setdefault("user-agent", playwright_request.headers.get("user-agent"))
25+
26+
if playwright_request.is_navigation_request():
27+
if browser_type == "firefox":
28+
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
29+
headers["host"] = urlparse(playwright_request.url).netloc
30+
return headers
31+
else:
32+
# override user agent, for consistency with other requests
33+
if headers.get("user-agent"):
34+
playwright_request.headers["user-agent"] = headers["user-agent"]
35+
return playwright_request.headers
36+
37+
38+
async def use_playwright_headers(
39+
browser_type: str,
40+
playwright_request: PlaywrightRequest,
41+
scrapy_headers: Headers,
42+
) -> dict:
43+
"""Return headers from the Playwright request, unaltered"""
44+
return playwright_request.headers

tests/test_playwright_requests.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,53 @@ async def test_user_agent(self):
259259
headers = {key.lower(): value for key, value in headers.items()}
260260
assert headers["user-agent"] == "foobar"
261261

262+
@pytest.mark.asyncio
263+
async def test_use_playwright_headers(self):
264+
"""Ignore Scrapy headers"""
265+
settings_dict = {
266+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
267+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
268+
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers", # noqa: E501
269+
}
270+
async with make_handler(settings_dict) as handler:
271+
with MockServer() as server:
272+
req = Request(
273+
url=server.urljoin("/headers"),
274+
meta={"playwright": True},
275+
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
276+
)
277+
resp = await handler._download_request(req, Spider("foo"))
278+
headers = json.loads(resp.css("pre::text").get())
279+
headers = {key.lower(): value for key, value in headers.items()}
280+
assert headers["user-agent"] == self.browser_type
281+
assert "asdf" not in headers
282+
283+
@pytest.mark.asyncio
284+
async def test_use_custom_headers(self):
285+
"""Custom header processing function"""
286+
287+
async def important_headers(*args, **kwargs) -> dict:
288+
return {"foo": "bar"}
289+
290+
settings_dict = {
291+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
292+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
293+
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
294+
}
295+
async with make_handler(settings_dict) as handler:
296+
with MockServer() as server:
297+
req = Request(
298+
url=server.urljoin("/headers"),
299+
meta={"playwright": True},
300+
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
301+
)
302+
resp = await handler._download_request(req, Spider("foo"))
303+
headers = json.loads(resp.css("pre::text").get())
304+
headers = {key.lower(): value for key, value in headers.items()}
305+
assert headers["foo"] == "bar"
306+
assert headers.get("user-agent") not in (self.browser_type, "foobar")
307+
assert "asdf" not in headers
308+
262309
@pytest.mark.asyncio
263310
async def test_event_handler_dialog_callable(self):
264311
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:

0 commit comments

Comments
 (0)