Skip to content

Commit f50e7df

Browse files
elacuesta and Gallaecio
authored
Process request headers (#38)
* Process request headers * Typing fix * Rename private request handler * Additional typing fix * Clarifying comment * scrapy_playwright/handler.py: fix typo: reches → reaches * Add test/docstring * Docs for PLAYWRIGHT_PROCESS_REQUEST_HEADERS * Update readme Co-authored-by: Adrián Chaves <adrian@chaves.io>
1 parent 2ca67f3 commit f50e7df

File tree

4 files changed

+140
-21
lines changed

4 files changed

+140
-21
lines changed

README.md

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
103103
the default value will be used (30000 ms at the time of writing this).
104104
See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser_contextset_default_navigation_timeouttimeout).
105105

106+
* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str`, default `scrapy_playwright.headers.use_scrapy_headers`)
107+
108+
The path to a coroutine function (`async def`) that processes headers for a given request
109+
and returns a dictionary with the headers to be used (note that, depending on the browser,
110+
additional default headers will be sent as well).
111+
112+
The function must return a `dict` object, and receives the following keyword arguments:
113+
114+
```python
115+
browser_type: str, playwright_request: playwright.async_api.Request, scrapy_headers: scrapy.http.headers.Headers
116+
```
117+
118+
The default value (`scrapy_playwright.headers.use_scrapy_headers`) tries to emulate Scrapy's
119+
behaviour for navigation requests, i.e. overriding headers with their values from the Scrapy request.
120+
For non-navigation requests (e.g. images, stylesheets, scripts, etc), only the `User-Agent` header
121+
is overridden, for consistency.
122+
123+
There is another function available: `scrapy_playwright.headers.use_playwright_headers`,
124+
which will return the headers from the Playwright request without any changes.
106125

107126
## Basic usage
108127

@@ -135,8 +154,8 @@ class AwesomeSpider(scrapy.Spider):
135154
By default, outgoing requests include the `User-Agent` set by Scrapy (either with the
136155
`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute).
137156
This could cause some sites to react in unexpected ways, for instance if the user agent
138-
does not match the Browser being used. If you prefer to send the `User-Agent` from the Browser,
139-
set the Scrapy user agent to `None`.
157+
does not match the running Browser. If you prefer the `User-Agent` sent by
158+
default by the specific browser you're using, set the Scrapy user agent to `None`.
140159

141160

142161
## Receiving the Page object in the callback

scrapy_playwright/handler.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from contextlib import suppress
66
from time import time
77
from typing import Callable, Dict, Optional, Type, TypeVar
8-
from urllib.parse import urlparse
98

109
from playwright.async_api import (
1110
BrowserContext,
@@ -22,9 +21,11 @@
2221
from scrapy.http.headers import Headers
2322
from scrapy.responsetypes import responsetypes
2423
from scrapy.utils.defer import deferred_from_coro
24+
from scrapy.utils.misc import load_object
2525
from scrapy.utils.reactor import verify_installed_reactor
2626
from twisted.internet.defer import Deferred, inlineCallbacks
2727

28+
from scrapy_playwright.headers import use_scrapy_headers
2829
from scrapy_playwright.page import PageCoroutine
2930

3031

@@ -66,13 +67,21 @@ def __init__(self, crawler: Crawler) -> None:
6667

6768
self.browser_type: str = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE") or "chromium"
6869
self.launch_options: dict = crawler.settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
70+
6971
self.default_navigation_timeout: Optional[float] = None
7072
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
7173
with suppress(TypeError, ValueError):
7274
self.default_navigation_timeout = float(
7375
crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
7476
)
7577

78+
if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
79+
self.process_request_headers = load_object(
80+
crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
81+
)
82+
else:
83+
self.process_request_headers = use_scrapy_headers
84+
7685
default_context_kwargs: dict = {}
7786
if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings:
7887
default_context_kwargs = crawler.settings.getdict("PLAYWRIGHT_CONTEXT_ARGS")
@@ -180,9 +189,8 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
180189
await page.route(
181190
"**",
182191
self._make_request_handler(
183-
url=request.url,
184192
method=request.method,
185-
headers=request.headers.to_unicode_dict(),
193+
scrapy_headers=request.headers,
186194
body=request.body,
187195
encoding=getattr(request, "encoding", None),
188196
),
@@ -249,23 +257,24 @@ def close_browser_context_callback() -> None:
249257
return close_browser_context_callback
250258

251259
def _make_request_handler(
252-
self, url: str, method: str, headers: dict, body: Optional[bytes], encoding: str = "utf8"
260+
self, method: str, scrapy_headers: Headers, body: Optional[bytes], encoding: str = "utf8"
253261
) -> Callable:
254-
def request_handler(route: Route, pw_request: PlaywrightRequest) -> None:
262+
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
255263
"""Override request headers, method and body."""
256-
headers.setdefault("user-agent", pw_request.headers.get("user-agent"))
257-
if pw_request.url == url:
258-
overrides: dict = {"method": method, "headers": headers}
264+
processed_headers = await self.process_request_headers(
265+
self.browser_type, playwright_request, scrapy_headers
266+
)
267+
268+
# the request that reaches the callback should contain the headers that were sent
269+
scrapy_headers.clear()
270+
scrapy_headers.update(processed_headers)
271+
272+
overrides: dict = {"headers": processed_headers}
273+
if playwright_request.is_navigation_request():
274+
overrides["method"] = method
259275
if body is not None:
260276
overrides["post_data"] = body.decode(encoding)
261-
if self.browser_type == "firefox":
262-
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
263-
overrides["headers"]["host"] = urlparse(pw_request.url).netloc
264-
else:
265-
overrides = {"headers": pw_request.headers.copy()}
266-
# override user agent, for consistency with other requests
267-
if headers.get("user-agent"):
268-
overrides["headers"]["user-agent"] = headers["user-agent"]
269-
asyncio.create_task(route.continue_(**overrides))
270-
271-
return request_handler
277+
278+
await route.continue_(**overrides)
279+
280+
return _request_handler

scrapy_playwright/headers.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from urllib.parse import urlparse
2+
3+
from playwright.async_api import Request as PlaywrightRequest
4+
from scrapy.http.headers import Headers
5+
6+
7+
"""
8+
This module includes functions to process request headers.
9+
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
10+
"""
11+
12+
13+
async def use_scrapy_headers(
14+
browser_type: str,
15+
playwright_request: PlaywrightRequest,
16+
scrapy_headers: Headers,
17+
) -> dict:
18+
"""Scrapy headers take precedence over Playwright headers for navigation requests.
19+
For non-navigation requests, only User-Agent is taken from the Scrapy headers."""
20+
21+
headers = scrapy_headers.to_unicode_dict()
22+
23+
# Scrapy's user agent has priority over Playwright's
24+
headers.setdefault("user-agent", playwright_request.headers.get("user-agent"))
25+
26+
if playwright_request.is_navigation_request():
27+
if browser_type == "firefox":
28+
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
29+
headers["host"] = urlparse(playwright_request.url).netloc
30+
return headers
31+
else:
32+
# override user agent, for consistency with other requests
33+
if headers.get("user-agent"):
34+
playwright_request.headers["user-agent"] = headers["user-agent"]
35+
return playwright_request.headers
36+
37+
38+
async def use_playwright_headers(
39+
browser_type: str,
40+
playwright_request: PlaywrightRequest,
41+
scrapy_headers: Headers,
42+
) -> dict:
43+
"""Return headers from the Playwright request, unaltered"""
44+
return playwright_request.headers

tests/test_playwright_requests.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,53 @@ async def test_user_agent(self):
259259
headers = {key.lower(): value for key, value in headers.items()}
260260
assert headers["user-agent"] == "foobar"
261261

262+
@pytest.mark.asyncio
263+
async def test_use_playwright_headers(self):
264+
"""Ignore Scrapy headers"""
265+
settings_dict = {
266+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
267+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
268+
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers", # noqa: E501
269+
}
270+
async with make_handler(settings_dict) as handler:
271+
with MockServer() as server:
272+
req = Request(
273+
url=server.urljoin("/headers"),
274+
meta={"playwright": True},
275+
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
276+
)
277+
resp = await handler._download_request(req, Spider("foo"))
278+
headers = json.loads(resp.css("pre::text").get())
279+
headers = {key.lower(): value for key, value in headers.items()}
280+
assert headers["user-agent"] == self.browser_type
281+
assert "asdf" not in headers
282+
283+
@pytest.mark.asyncio
284+
async def test_use_custom_headers(self):
285+
"""Custom header processing function"""
286+
287+
async def important_headers(*args, **kwargs) -> dict:
288+
return {"foo": "bar"}
289+
290+
settings_dict = {
291+
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
292+
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
293+
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
294+
}
295+
async with make_handler(settings_dict) as handler:
296+
with MockServer() as server:
297+
req = Request(
298+
url=server.urljoin("/headers"),
299+
meta={"playwright": True},
300+
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
301+
)
302+
resp = await handler._download_request(req, Spider("foo"))
303+
headers = json.loads(resp.css("pre::text").get())
304+
headers = {key.lower(): value for key, value in headers.items()}
305+
assert headers["foo"] == "bar"
306+
assert headers.get("user-agent") not in (self.browser_type, "foobar")
307+
assert "asdf" not in headers
308+
262309
@pytest.mark.asyncio
263310
async def test_event_handler_dialog_callable(self):
264311
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:

0 commit comments

Comments
 (0)