Skip to content

Commit 8c6d9bb

Browse files
authored
Connect to remote browser using BrowserType.connect (#283)
* Connect to remote browser with BrowserType.connect * Docs for PLAYWRIGHT_CONNECT_URL & PLAYWRIGHT_CONNECT_KWARGS * Test error when setting PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL * Rename method, simplify variables * Test PLAYWRIGHT_CONNECT_URL * Setup node for tests * Test connect on Windows * Match python & nodejs playwright versions * Adjustments * Update tox.ini * Double slash
1 parent 55528dc commit 8c6d9bb

File tree

8 files changed

+172
-23
lines changed

8 files changed

+172
-23
lines changed

.github/workflows/tests.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ jobs:
2525
with:
2626
python-version: ${{ matrix.python-version }}
2727

28+
- name: Set up node
29+
uses: actions/setup-node@v4
30+
with:
31+
node-version: 18
32+
2833
- name: Install tox
2934
run: pip install tox
3035

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,8 @@ coverage.xml
1717
coverage-*.xml
1818
coverage-asyncio/
1919
coverage-twisted/
20+
21+
# nodejs stuff
22+
node_modules/
23+
package-lock.json
24+
package.json

README.md

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,14 +168,17 @@ Type `Optional[str]`, default `None`
168168
The endpoint of a remote Chromium browser to connect using the
169169
[Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/),
170170
via [`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp).
171+
172+
```python
173+
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
174+
```
175+
171176
If this setting is used:
172177
* all non-persistent contexts will be created on the connected remote browser
173178
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored
174179
* the `PLAYWRIGHT_BROWSER_TYPE` setting must not be set to a value different than "chromium"
175180

176-
```python
177-
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
178-
```
181+
**This setting CANNOT be used at the same time as `PLAYWRIGHT_CONNECT_URL`**
179182

180183
### `PLAYWRIGHT_CDP_KWARGS`
181184
Type `dict[str, Any]`, default `{}`
@@ -192,6 +195,41 @@ PLAYWRIGHT_CDP_KWARGS = {
192195
}
193196
```
194197

198+
### `PLAYWRIGHT_CONNECT_URL`
199+
Type `Optional[str]`, default `None`
200+
201+
URL of a remote Playwright browser instance to connect using
202+
[`BrowserType.connect`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect).
203+
204+
From the upstream Playwright docs:
205+
> When connecting to another browser launched via
206+
> [`BrowserType.launchServer`](https://playwright.dev/docs/api/class-browsertype#browser-type-launch-server)
207+
> in Node.js, the major and minor version needs to match the client version (1.2.3 → is compatible with 1.2.x).
208+
209+
```python
210+
PLAYWRIGHT_CONNECT_URL = "ws://localhost:35477/ae1fa0bc325adcfd9600d9f712e9c733"
211+
```
212+
213+
If this setting is used:
214+
* all non-persistent contexts will be created on the connected remote browser
215+
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored
216+
217+
**This setting CANNOT be used at the same time as `PLAYWRIGHT_CDP_URL`**
218+
219+
### `PLAYWRIGHT_CONNECT_KWARGS`
220+
Type `dict[str, Any]`, default `{}`
221+
222+
Additional keyword arguments to be passed to
223+
[`BrowserType.connect`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect)
224+
when using `PLAYWRIGHT_CONNECT_URL`. The `ws_endpoint` key is always ignored;
225+
`PLAYWRIGHT_CONNECT_URL` is used instead.
226+
227+
```python
228+
PLAYWRIGHT_CONNECT_KWARGS = {
229+
"slow_mo": 1000,
230+
"timeout": 10 * 1000
231+
}
232+
```
195233

196234
### `PLAYWRIGHT_CONTEXTS`
197235
Type `dict[str, dict]`, default `{}`

scrapy_playwright/handler.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from scrapy import Spider, signals
2323
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
2424
from scrapy.crawler import Crawler
25+
from scrapy.exceptions import NotSupported
2526
from scrapy.http import Request, Response
2627
from scrapy.http.headers import Headers
2728
from scrapy.responsetypes import responsetypes
@@ -69,6 +70,8 @@ class BrowserContextWrapper:
6970
class Config:
7071
cdp_url: Optional[str]
7172
cdp_kwargs: dict
73+
connect_url: Optional[str]
74+
connect_kwargs: dict
7275
browser_type_name: str
7376
launch_options: dict
7477
max_pages_per_context: int
@@ -78,9 +81,15 @@ class Config:
7881

7982
@classmethod
8083
def from_settings(cls, settings: Settings) -> "Config":
84+
if settings.get("PLAYWRIGHT_CDP_URL") and settings.get("PLAYWRIGHT_CONNECT_URL"):
85+
msg = "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported"
86+
logger.error(msg)
87+
raise NotSupported(msg)
8188
cfg = cls(
8289
cdp_url=settings.get("PLAYWRIGHT_CDP_URL"),
8390
cdp_kwargs=settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {},
91+
connect_url=settings.get("PLAYWRIGHT_CONNECT_URL"),
92+
connect_kwargs=settings.getdict("PLAYWRIGHT_CONNECT_KWARGS") or {},
8493
browser_type_name=settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE,
8594
launch_options=settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {},
8695
max_pages_per_context=settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"),
@@ -91,10 +100,11 @@ def from_settings(cls, settings: Settings) -> "Config":
91100
),
92101
)
93102
cfg.cdp_kwargs.pop("endpoint_url", None)
103+
cfg.connect_kwargs.pop("ws_endpoint", None)
94104
if not cfg.max_pages_per_context:
95105
cfg.max_pages_per_context = settings.getint("CONCURRENT_REQUESTS")
96-
if cfg.cdp_url and cfg.launch_options:
97-
logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
106+
if (cfg.cdp_url or cfg.connect_url) and cfg.launch_options:
107+
logger.warning("Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
98108
return cfg
99109

100110

@@ -166,7 +176,7 @@ async def _maybe_launch_browser(self) -> None:
166176
self.browser = await self.browser_type.launch(**self.config.launch_options)
167177
logger.info("Browser %s launched", self.browser_type.name)
168178

169-
async def _maybe_connect_devtools(self) -> None:
179+
async def _maybe_connect_remote_devtools(self) -> None:
170180
async with self.browser_launch_lock:
171181
if not hasattr(self, "browser"):
172182
logger.info("Connecting using CDP: %s", self.config.cdp_url)
@@ -175,6 +185,15 @@ async def _maybe_connect_devtools(self) -> None:
175185
)
176186
logger.info("Connected using CDP: %s", self.config.cdp_url)
177187

188+
async def _maybe_connect_remote(self) -> None:
189+
async with self.browser_launch_lock:
190+
if not hasattr(self, "browser"):
191+
logger.info("Connecting to remote Playwright")
192+
self.browser = await self.browser_type.connect(
193+
self.config.connect_url, **self.config.connect_kwargs
194+
)
195+
logger.info("Connected to remote Playwright")
196+
178197
async def _create_browser_context(
179198
self,
180199
name: str,
@@ -187,20 +206,21 @@ async def _create_browser_context(
187206
if hasattr(self, "context_semaphore"):
188207
await self.context_semaphore.acquire()
189208
context_kwargs = context_kwargs or {}
209+
persistent = remote = False
190210
if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
191211
context = await self.browser_type.launch_persistent_context(**context_kwargs)
192212
persistent = True
193-
remote = False
194213
elif self.config.cdp_url:
195-
await self._maybe_connect_devtools()
214+
await self._maybe_connect_remote_devtools()
215+
context = await self.browser.new_context(**context_kwargs)
216+
remote = True
217+
elif self.config.connect_url:
218+
await self._maybe_connect_remote()
196219
context = await self.browser.new_context(**context_kwargs)
197-
persistent = False
198220
remote = True
199221
else:
200222
await self._maybe_launch_browser()
201223
context = await self.browser.new_context(**context_kwargs)
202-
persistent = False
203-
remote = False
204224

205225
context.on(
206226
"close", self._make_close_browser_context_callback(name, persistent, remote, spider)

tests/launch_browser_server.js

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// used to start a browser server to test the PLAYWRIGHT_CONNECT_URL setting
2+
// usage:
3+
// node launch_browser_server.js PORT WS_PATH
4+
5+
const { chromium } = require('playwright'); // Or 'webkit' or 'firefox'.
6+
7+
(async () => {
8+
const browserServer = await chromium.launchServer({
9+
host: 'localhost',
10+
port: process.argv[2],
11+
wsPath: process.argv[3]
12+
});
13+
})();

tests/tests_asyncio/test_remote.py

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1+
import asyncio
12
import logging
3+
import random
24
import re
35
import subprocess
46
import time
7+
import uuid
58
from contextlib import asynccontextmanager
9+
from pathlib import Path
610
from typing import Tuple
711
from unittest import IsolatedAsyncioTestCase
812

@@ -14,8 +18,8 @@
1418
from tests.mockserver import StaticMockServer
1519

1620

17-
async def _run_chromium() -> Tuple[subprocess.Popen, str]:
18-
"""Run a Croumium instance in a separate process, return the process
21+
async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]:
22+
"""Run a Chromium instance in a separate process, return the process
1923
object and a string with its devtools endpoint.
2024
"""
2125
async with async_playwright() as playwright:
@@ -38,32 +42,50 @@ async def _run_chromium() -> Tuple[subprocess.Popen, str]:
3842
return proc, devtools_url
3943

4044

45+
def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]:
46+
"""Start a Playwright server in a separate process, return the process
47+
object and a string with its websocket endpoint.
48+
Pass fixed port and ws path as arguments instead of allowing Playwright
49+
to choose, for some reason I was unable to capture stdout/stderr :shrug:
50+
"""
51+
port = str(random.randint(60_000, 63_000))
52+
ws_path = str(uuid.uuid4())
53+
launch_server_script_path = str(Path(__file__).parent.parent / "launch_browser_server.js")
54+
command = ["node", launch_server_script_path, port, ws_path]
55+
proc = subprocess.Popen(command) # pylint: disable=consider-using-with
56+
return proc, f"ws://localhost:{port}/{ws_path}"
57+
58+
4159
@asynccontextmanager
42-
async def remote_chromium():
43-
"""Launch a Chromium instance with remote debugging enabled."""
44-
proc = None
45-
devtools_url = None
60+
async def remote_browser(is_chrome_devtools_protocol: bool = True):
61+
"""Launch a remote browser that lasts while in the context."""
62+
proc = url = None
4663
try:
47-
proc, devtools_url = await _run_chromium()
64+
if is_chrome_devtools_protocol:
65+
proc, url = await _run_chromium_devtools()
66+
else:
67+
proc, url = _run_playwright_browser_server()
68+
await asyncio.sleep(1) # allow some time for the browser to start
4869
except Exception:
4970
pass
5071
else:
51-
yield devtools_url
72+
print(f"Browser URL: {url}")
73+
yield url
5274
finally:
5375
if proc:
5476
proc.kill()
5577
proc.communicate()
5678

5779

58-
class TestRemoteDevtools(IsolatedAsyncioTestCase):
80+
class TestRemote(IsolatedAsyncioTestCase):
5981
@pytest.fixture(autouse=True)
6082
def inject_fixtures(self, caplog):
6183
caplog.set_level(logging.DEBUG)
6284
self._caplog = caplog
6385

6486
@allow_windows
65-
async def test_devtools(self):
66-
async with remote_chromium() as devtools_url:
87+
async def test_connect_devtools(self):
88+
async with remote_browser(is_chrome_devtools_protocol=True) as devtools_url:
6789
settings_dict = {
6890
"PLAYWRIGHT_CDP_URL": devtools_url,
6991
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
@@ -76,5 +98,33 @@ async def test_devtools(self):
7698
assert (
7799
"scrapy-playwright",
78100
logging.WARNING,
79-
"PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
101+
"Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
102+
) in self._caplog.record_tuples
103+
104+
@allow_windows
105+
async def test_connect(self):
106+
async with remote_browser(is_chrome_devtools_protocol=False) as browser_url:
107+
settings_dict = {
108+
"PLAYWRIGHT_CONNECT_URL": browser_url,
109+
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
110+
}
111+
async with make_handler(settings_dict) as handler:
112+
with StaticMockServer() as server:
113+
req = Request(server.urljoin("/index.html"), meta={"playwright": True})
114+
resp = await handler._download_request(req, Spider("foo"))
115+
assert_correct_response(resp, req)
116+
assert (
117+
"scrapy-playwright",
118+
logging.INFO,
119+
"Connecting to remote Playwright",
120+
) in self._caplog.record_tuples
121+
assert (
122+
"scrapy-playwright",
123+
logging.INFO,
124+
"Connected to remote Playwright",
125+
) in self._caplog.record_tuples
126+
assert (
127+
"scrapy-playwright",
128+
logging.WARNING,
129+
"Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
80130
) in self._caplog.record_tuples

tests/tests_asyncio/test_settings.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from unittest import IsolatedAsyncioTestCase
22

3+
import pytest
4+
from scrapy.exceptions import NotSupported
35
from scrapy.settings import Settings
46

57
from scrapy_playwright.handler import Config
@@ -31,6 +33,16 @@ async def test_max_pages_per_context(self):
3133
config = Config.from_settings(Settings({"CONCURRENT_REQUESTS": 9876}))
3234
assert config.max_pages_per_context == 9876
3335

36+
async def test_connect_remote_urls(self):
37+
with pytest.raises(NotSupported) as exc_info:
38+
Config.from_settings(
39+
Settings({"PLAYWRIGHT_CONNECT_URL": "asdf", "PLAYWRIGHT_CDP_URL": "qwerty"})
40+
)
41+
assert (
42+
str(exc_info.value)
43+
== "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported"
44+
)
45+
3446
@allow_windows
3547
async def test_max_contexts(self):
3648
async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as handler:

tox.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@ deps =
77
pytest_cov==4.1.0
88
pytest_twisted==1.14
99
psutil==5.9.7
10+
playwright==1.44 # version must match the one installed with npm below
11+
allowlist_externals =
12+
npm
13+
npx
1014
commands =
1115
playwright install
16+
npm install playwright@1.44
17+
npx playwright install chromium
1218
py.test -vv --reactor=asyncio \
1319
--cov-report=term-missing \
1420
--cov-report=xml:coverage-asyncio.xml \

0 commit comments

Comments
 (0)