Skip to content

Commit b36a79f

Browse files
authored
Use new headers API from Playwright 1.15 (#93)
1 parent a632118 commit b36a79f

File tree

8 files changed

+237
-125
lines changed

8 files changed

+237
-125
lines changed

README.md

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ to integrate `asyncio`-based projects such as `Playwright`.
2424

2525
* Python >= 3.7
2626
* Scrapy >= 2.0 (!= 2.4.0)
27-
* Playwright >= 1.8.0a1
27+
* Playwright >= 1.15
2828

2929

3030
## Installation
@@ -97,13 +97,17 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
9797
the default value will be used (30000 ms at the time of writing this).
9898
See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-set-default-navigation-timeout).
9999

100-
* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)
100+
* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Optional[Union[Callable, str]]`, default `scrapy_playwright.headers.use_scrapy_headers`)
101101

102102
A function (or the path to a function) that processes headers for a given request
103103
and returns a dictionary with the headers to be used (note that, depending on the browser,
104-
additional default headers will be sent as well). Coroutine functions (`async def`) are
104+
additional default headers could be sent as well). Coroutine functions (`async def`) are
105105
supported.
106106

107+
This will be called at least once for each Scrapy request (receiving said request and the
108+
corresponding Playwright request), but it could be called additional times if the given
109+
resource generates more requests (e.g. to retrieve assets like images or scripts).
110+
107111
The function must return a `dict` object, and receives the following keyword arguments:
108112

109113
```python
@@ -117,10 +121,11 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
117121
For non-navigation requests (e.g. images, stylesheets, scripts, etc), only the `User-Agent` header
118122
is overridden, for consistency.
119123

120-
There is another built-in function available: `scrapy_playwright.headers.use_playwright_headers`,
121-
which will return the headers from the Playwright request unmodified.
122-
When using this alternative, please keep in mind that headers passed via the `Request.headers`
123-
attribute or set by Scrapy components are ignored (including cookies set via the `Request.cookies`
124+
Setting `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` will give complete control of the headers to
125+
Playwright, i.e. headers from Scrapy requests will be ignored and only headers set by
126+
Playwright will be sent.
127+
When doing this, please keep in mind that headers passed via the `Request.headers` attribute
128+
or set by Scrapy components are ignored (including cookies set via the `Request.cookies`
124129
attribute).
125130

126131
* `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT` (type `int`, defaults to the value of Scrapy's `CONCURRENT_REQUESTS` setting)
@@ -562,6 +567,12 @@ for more information about deprecations and removals.
562567

563568
### Currently deprecated features
564569

570+
* `scrapy_playwright.headers.use_playwright_headers` function
571+
572+
Deprecated since
573+
[`v0.0.16`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16),
574+
set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead
575+
565576
* `scrapy_playwright.page.PageCoroutine` class
566577

567578
Deprecated since

changelog.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# scrapy-playwright changelog
22

3+
4+
### [v0.0.16](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16) (2022-NN-NN)
5+
6+
* Use new headers API introduced in Playwright 1.15 (bump required Playwright version)
7+
* Deprecate `scrapy_playwright.headers.use_playwright_headers`, set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead
8+
9+
310
### [v0.0.15](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.15) (2022-05-08)
411

512
* Remove deprecated `PLAYWRIGHT_CONTEXT_ARGS` setting

scrapy_playwright/handler.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from twisted.internet.defer import Deferred, inlineCallbacks
2929
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
3030

31-
from scrapy_playwright.headers import use_scrapy_headers
31+
from scrapy_playwright.headers import use_scrapy_headers, use_playwright_headers
3232
from scrapy_playwright.page import PageMethod
3333

3434

@@ -61,10 +61,22 @@ def __init__(self, crawler: Crawler) -> None:
6161
crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
6262
)
6363

64-
if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
65-
self.process_request_headers = load_object(
66-
crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
67-
)
64+
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
65+
if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
66+
self.process_request_headers = None # use headers from the Playwright request
67+
else:
68+
self.process_request_headers = load_object(
69+
crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
70+
)
71+
if self.process_request_headers is use_playwright_headers:
72+
warnings.warn(
73+
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
74+
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
75+
" instead.",
76+
category=ScrapyDeprecationWarning,
77+
stacklevel=1,
78+
)
79+
self.process_request_headers = None
6880
else:
6981
self.process_request_headers = use_scrapy_headers
7082

@@ -233,7 +245,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
233245
with suppress(AttributeError):
234246
request.meta["playwright_security_details"] = await response.security_details()
235247

236-
headers = Headers(response.headers)
248+
headers = Headers(await response.all_headers())
237249
headers.pop("Content-Encoding", None)
238250
body, encoding = _encode_body(headers=headers, text=body_str)
239251
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
@@ -317,15 +329,18 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
317329
self.stats.inc_value("playwright/request_count/aborted")
318330
return None
319331

320-
processed_headers = await _await_if_necessary(
321-
self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
322-
)
332+
overrides: dict = {}
323333

324-
# the request that reaches the callback should contain the headers that were sent
325-
scrapy_headers.clear()
326-
scrapy_headers.update(processed_headers)
334+
if self.process_request_headers is not None:
335+
overrides["headers"] = await _await_if_necessary(
336+
self.process_request_headers(
337+
self.browser_type, playwright_request, scrapy_headers
338+
)
339+
)
340+
# the request that reaches the callback should contain the final headers
341+
scrapy_headers.clear()
342+
scrapy_headers.update(overrides["headers"])
327343

328-
overrides: dict = {"headers": processed_headers}
329344
if playwright_request.is_navigation_request():
330345
overrides["method"] = method
331346
if body is not None:
@@ -351,20 +366,22 @@ async def _await_if_necessary(obj):
351366

352367

353368
def _make_request_logger(context_name: str) -> Callable:
354-
def _log_request(request: PlaywrightRequest) -> None:
369+
async def _log_request(request: PlaywrightRequest) -> None:
370+
referrer = await request.header_value("referer")
355371
logger.debug(
356372
f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
357-
f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
373+
f"(resource type: {request.resource_type}, referrer: {referrer})"
358374
)
359375

360376
return _log_request
361377

362378

363379
def _make_response_logger(context_name: str) -> Callable:
364-
def _log_request(response: PlaywrightResponse) -> None:
380+
async def _log_request(response: PlaywrightResponse) -> None:
381+
referrer = await response.header_value("referer")
365382
logger.debug(
366383
f"[Context={context_name}] Response: <{response.status} {response.url}> "
367-
f"(referrer: {response.headers.get('referer')})"
384+
f"(referrer: {referrer})"
368385
)
369386

370387
return _log_request

scrapy_playwright/headers.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
This module includes functions to process request headers.
33
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
44
"""
5-
5+
import warnings
66
from urllib.parse import urlparse
77

88
from playwright.async_api import Request as PlaywrightRequest
9+
from scrapy.exceptions import ScrapyDeprecationWarning
910
from scrapy.http.headers import Headers
1011

1112

@@ -17,30 +18,34 @@ async def use_scrapy_headers(
1718
"""Scrapy headers take precedence over Playwright headers for navigation requests.
1819
For non-navigation requests, only User-Agent is taken from the Scrapy headers."""
1920

20-
headers = scrapy_headers.to_unicode_dict()
21+
scrapy_headers_str = scrapy_headers.to_unicode_dict()
22+
playwright_headers = await playwright_request.all_headers()
2123

2224
# Scrapy's user agent has priority over Playwright's
23-
headers.setdefault("user-agent", playwright_request.headers.get("user-agent"))
25+
scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent"))
2426

2527
if playwright_request.is_navigation_request():
2628
if browser_type == "firefox":
2729
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
28-
headers["host"] = urlparse(playwright_request.url).netloc
29-
return headers
30+
scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc
31+
return scrapy_headers_str
3032

3133
# override user agent, for consistency with other requests
32-
if headers.get("user-agent"):
33-
return {
34-
**playwright_request.headers,
35-
"user-agent": headers["user-agent"],
36-
}
37-
return playwright_request.headers
34+
if scrapy_headers_str.get("user-agent"):
35+
playwright_headers["user-agent"] = scrapy_headers_str["user-agent"]
36+
return playwright_headers
3837

3938

4039
async def use_playwright_headers(
4140
browser_type: str,
4241
playwright_request: PlaywrightRequest,
4342
scrapy_headers: Headers,
4443
) -> dict:
45-
"""Return headers from the Playwright request, unaltered"""
46-
return playwright_request.headers
44+
warnings.warn(
45+
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
46+
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
47+
" instead.",
48+
category=ScrapyDeprecationWarning,
49+
stacklevel=1,
50+
)
51+
return await playwright_request.all_headers()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,6 @@
3535
python_requires=">=3.7",
3636
install_requires=[
3737
"scrapy>=2.0,!=2.4.0",
38-
"playwright>=1.8.0a1",
38+
"playwright>=1.15",
3939
],
4040
)

tests/test_headers.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import json
2+
import platform
3+
import sys
4+
import warnings
5+
6+
import pytest
7+
from scrapy import Spider, Request
8+
from scrapy.http.headers import Headers
9+
10+
from tests import make_handler
11+
from tests.mockserver import MockServer
12+
13+
from scrapy_playwright.headers import use_playwright_headers
14+
15+
16+
@pytest.mark.skipif(sys.version_info < (3, 8), reason="AsyncMock was added on Python 3.8")
17+
@pytest.mark.asyncio
18+
async def test_use_playwright_headers_deprecated():
19+
from unittest.mock import AsyncMock
20+
21+
headers = {"foo": "bar", "a": "b"}
22+
playwright_request = AsyncMock()
23+
playwright_request.all_headers.return_value = headers
24+
with warnings.catch_warnings(record=True) as warning_list:
25+
processed_headers = await use_playwright_headers("foobar", playwright_request, Headers({}))
26+
assert processed_headers == headers
27+
assert str(warning_list[0].message) == (
28+
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
29+
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
30+
" instead."
31+
)
32+
33+
34+
class MixinProcessHeadersTestCase:
35+
@pytest.mark.asyncio
36+
async def test_user_agent(self):
37+
settings_dict = {
38+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
39+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
40+
"USER_AGENT": None,
41+
}
42+
async with make_handler(settings_dict) as handler:
43+
with MockServer() as server:
44+
# if Scrapy's user agent is None, use the one from the Browser
45+
req = Request(
46+
url=server.urljoin("/headers"),
47+
meta={"playwright": True},
48+
)
49+
resp = await handler._download_request(req, Spider("foo"))
50+
headers = json.loads(resp.css("pre::text").get())
51+
headers = {key.lower(): value for key, value in headers.items()}
52+
assert headers["user-agent"] == self.browser_type
53+
54+
# if Scrapy's user agent is set to some value, use it
55+
req = Request(
56+
url=server.urljoin("/headers"),
57+
meta={"playwright": True},
58+
headers={"User-Agent": "foobar"},
59+
)
60+
resp = await handler._download_request(req, Spider("foo"))
61+
headers = json.loads(resp.css("pre::text").get())
62+
headers = {key.lower(): value for key, value in headers.items()}
63+
assert headers["user-agent"] == "foobar"
64+
65+
@pytest.mark.asyncio
66+
async def test_use_playwright_headers(self):
67+
"""Ignore Scrapy headers"""
68+
settings_dict = {
69+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
70+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
71+
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
72+
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000,
73+
}
74+
async with make_handler(settings_dict) as handler:
75+
with MockServer() as server:
76+
req = Request(
77+
url=server.urljoin("/headers"),
78+
meta={"playwright": True},
79+
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
80+
)
81+
resp = await handler._download_request(req, Spider("foo"))
82+
headers = json.loads(resp.css("pre::text").get())
83+
headers = {key.lower(): value for key, value in headers.items()}
84+
assert headers["user-agent"] == self.browser_type
85+
assert "asdf" not in headers
86+
87+
@pytest.mark.asyncio
88+
async def test_use_playwright_headers_deprecated(self):
89+
"""Ignore Scrapy headers"""
90+
settings_dict = {
91+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
92+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
93+
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": use_playwright_headers,
94+
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000,
95+
}
96+
with warnings.catch_warnings(record=True) as warning_list:
97+
async with make_handler(settings_dict) as handler:
98+
with MockServer() as server:
99+
req = Request(
100+
url=server.urljoin("/headers"),
101+
meta={"playwright": True},
102+
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
103+
)
104+
resp = await handler._download_request(req, Spider("foo"))
105+
headers = json.loads(resp.css("pre::text").get())
106+
headers = {key.lower(): value for key, value in headers.items()}
107+
assert headers["user-agent"] == self.browser_type
108+
assert "asdf" not in headers
109+
110+
assert str(warning_list[0].message) == (
111+
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
112+
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
113+
" instead."
114+
)
115+
116+
@pytest.mark.asyncio
117+
async def test_use_custom_headers(self):
118+
"""Custom header processing function"""
119+
120+
async def important_headers(*args, **kwargs) -> dict:
121+
return {"foo": "bar"}
122+
123+
settings_dict = {
124+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
125+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
126+
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
127+
}
128+
async with make_handler(settings_dict) as handler:
129+
with MockServer() as server:
130+
req = Request(
131+
url=server.urljoin("/headers"),
132+
meta={"playwright": True},
133+
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
134+
)
135+
resp = await handler._download_request(req, Spider("foo"))
136+
headers = json.loads(resp.css("pre::text").get())
137+
headers = {key.lower(): value for key, value in headers.items()}
138+
assert headers["foo"] == "bar"
139+
assert headers.get("user-agent") not in (self.browser_type, "foobar")
140+
assert "asdf" not in headers
141+
142+
143+
class TestProcessHeadersChromium(MixinProcessHeadersTestCase):
144+
browser_type = "chromium"
145+
146+
147+
class TestProcessHeadersFirefox(MixinProcessHeadersTestCase):
148+
browser_type = "firefox"
149+
150+
151+
@pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin")
152+
class TestProcessHeadersWebkit(MixinProcessHeadersTestCase):
153+
browser_type = "webkit"

0 commit comments

Comments
 (0)