Skip to content

Commit 317adde

Browse files
authored
Retry to create page on browser crash (#305)
* Retry to create page on browser crash * Add psutil to pylint tox env * Skip browser crash tests on Windows * Test browser crash only in Linux * Revert "Test browser crash only in Linux" This reverts commit a1d12b0. * Update browser crash test for macOS
1 parent 84ba393 commit 317adde

File tree

3 files changed

+132
-11
lines changed

3 files changed

+132
-11
lines changed

scrapy_playwright/handler.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from time import time
1010
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union
1111

12+
from playwright._impl._errors import TargetClosedError
1213
from playwright.async_api import (
1314
BrowserContext,
1415
BrowserType,
@@ -94,7 +95,8 @@ class Config:
9495
startup_context_kwargs: dict
9596
navigation_timeout: Optional[float]
9697
restart_disconnected_browser: bool
97-
use_threaded_loop: bool
98+
target_closed_max_retries: int = 3
99+
use_threaded_loop: bool = False
98100

99101
@classmethod
100102
def from_settings(cls, settings: Settings) -> "Config":
@@ -202,6 +204,7 @@ async def _maybe_launch_browser(self) -> None:
202204
logger.info("Launching browser %s", self.browser_type.name)
203205
self.browser = await self.browser_type.launch(**self.config.launch_options)
204206
logger.info("Browser %s launched", self.browser_type.name)
207+
self.stats.inc_value("playwright/browser_count")
205208
self.browser.on("disconnected", self._browser_disconnected_callback)
206209

207210
async def _maybe_connect_remote_devtools(self) -> None:
@@ -212,6 +215,7 @@ async def _maybe_connect_remote_devtools(self) -> None:
212215
self.config.cdp_url, **self.config.cdp_kwargs
213216
)
214217
logger.info("Connected using CDP: %s", self.config.cdp_url)
218+
self.stats.inc_value("playwright/browser_count")
215219
self.browser.on("disconnected", self._browser_disconnected_callback)
216220

217221
async def _maybe_connect_remote(self) -> None:
@@ -222,6 +226,7 @@ async def _maybe_connect_remote(self) -> None:
222226
self.config.connect_url, **self.config.connect_kwargs
223227
)
224228
logger.info("Connected to remote Playwright")
229+
self.stats.inc_value("playwright/browser_count")
225230
self.browser.on("disconnected", self._browser_disconnected_callback)
226231

227232
async def _create_browser_context(
@@ -350,7 +355,8 @@ def close(self) -> Deferred:
350355
_ThreadedLoopAdapter.stop(id(self))
351356

352357
async def _close(self) -> None:
353-
await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()])
358+
with suppress(TargetClosedError):
359+
await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()])
354360
self.context_wrappers.clear()
355361
if hasattr(self, "browser"):
356362
logger.info("Closing browser")
@@ -366,8 +372,28 @@ def download_request(self, request: Request, spider: Spider) -> Deferred:
366372
return super().download_request(request, spider)
367373

368374
async def _download_request(self, request: Request, spider: Spider) -> Response:
375+
counter = 0
376+
while True:
377+
try:
378+
return await self._download_request_with_retry(request=request, spider=spider)
379+
except TargetClosedError as ex:
380+
counter += 1
381+
if counter > self.config.target_closed_max_retries:
382+
raise ex
383+
logger.debug(
384+
"Target closed, retrying to create page for %s",
385+
request,
386+
extra={
387+
"spider": spider,
388+
"scrapy_request_url": request.url,
389+
"scrapy_request_method": request.method,
390+
"exception": ex,
391+
},
392+
)
393+
394+
async def _download_request_with_retry(self, request: Request, spider: Spider) -> Response:
369395
page = request.meta.get("playwright_page")
370-
if not isinstance(page, Page):
396+
if not isinstance(page, Page) or page.is_closed():
371397
page = await self._create_page(request=request, spider=spider)
372398
context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)
373399

@@ -626,9 +652,12 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None:
626652
self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}")
627653

628654
async def _browser_disconnected_callback(self) -> None:
629-
await asyncio.gather(
630-
*[ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()]
631-
)
655+
close_context_coros = [
656+
ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()
657+
]
658+
self.context_wrappers.clear()
659+
with suppress(TargetClosedError):
660+
await asyncio.gather(*close_context_coros)
632661
logger.debug("Browser disconnected")
633662
if self.config.restart_disconnected_browser:
634663
del self.browser

tests/tests_asyncio/test_browser.py

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
11
import asyncio
22
import logging
3+
import os
4+
import platform
35
import random
46
import re
7+
import signal
58
import subprocess
69
import time
710
import uuid
811
from contextlib import asynccontextmanager
912
from pathlib import Path
13+
from threading import Thread
1014
from typing import Tuple
1115
from unittest import IsolatedAsyncioTestCase
1216

17+
import psutil
1318
import pytest
19+
from playwright._impl._errors import TargetClosedError
1420
from playwright.async_api import async_playwright
1521
from scrapy import Request, Spider
1622

@@ -77,7 +83,7 @@ async def remote_chromium(with_devtools_protocol: bool = True):
7783
proc.communicate()
7884

7985

80-
class TestRemoteBrowser(IsolatedAsyncioTestCase):
86+
class TestBrowserRemoteChromium(IsolatedAsyncioTestCase):
8187
@pytest.fixture(autouse=True)
8288
def inject_fixtures(self, caplog):
8389
caplog.set_level(logging.DEBUG)
@@ -87,6 +93,7 @@ def inject_fixtures(self, caplog):
8793
async def test_connect_devtools(self):
8894
async with remote_chromium(with_devtools_protocol=True) as devtools_url:
8995
settings_dict = {
96+
"PLAYWRIGHT_BROWSER_TYPE": "chromium",
9097
"PLAYWRIGHT_CDP_URL": devtools_url,
9198
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
9299
}
@@ -105,6 +112,7 @@ async def test_connect_devtools(self):
105112
async def test_connect(self):
106113
async with remote_chromium(with_devtools_protocol=False) as browser_url:
107114
settings_dict = {
115+
"PLAYWRIGHT_BROWSER_TYPE": "chromium",
108116
"PLAYWRIGHT_CONNECT_URL": browser_url,
109117
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
110118
}
@@ -130,16 +138,22 @@ async def test_connect(self):
130138
) in self._caplog.record_tuples
131139

132140

133-
class TestBrowserReconnect(IsolatedAsyncioTestCase):
141+
class TestBrowserReconnectChromium(IsolatedAsyncioTestCase):
134142
@pytest.fixture(autouse=True)
135143
def inject_fixtures(self, caplog):
136144
caplog.set_level(logging.DEBUG)
137145
self._caplog = caplog
138146

147+
@staticmethod
148+
def kill_chrome():
149+
for proc in psutil.process_iter(["pid", "name"]):
150+
if proc.info["name"].lower() in ("chrome", "chromium"):
151+
os.kill(proc.info["pid"], signal.SIGKILL)
152+
139153
@allow_windows
140-
async def test_restart_browser(self):
154+
async def test_browser_closed_restart(self):
141155
spider = Spider("foo")
142-
async with make_handler() as handler:
156+
async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler:
143157
with StaticMockServer() as server:
144158
req1 = Request(
145159
server.urljoin("/index.html"),
@@ -172,3 +186,80 @@ async def test_restart_browser(self):
172186
)
173187
== 2 # one at the beginning, one after calling Browser.close() manually
174188
)
189+
190+
@pytest.mark.skipif(
191+
platform.system() == "Windows",
192+
reason="os.kill does not work as expected on Windows",
193+
)
194+
async def test_browser_crashed_restart(self):
195+
spider = Spider("foo")
196+
async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler:
197+
with StaticMockServer() as server:
198+
req1 = Request(
199+
server.urljoin("/index.html"),
200+
meta={"playwright": True, "playwright_include_page": True},
201+
)
202+
resp1 = await handler._download_request(req1, spider)
203+
thread = Thread(target=self.kill_chrome, daemon=True)
204+
thread.start()
205+
req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
206+
req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True})
207+
req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True})
208+
resp2 = await handler._download_request(req2, spider)
209+
resp3 = await handler._download_request(req3, spider)
210+
resp4 = await handler._download_request(req4, spider)
211+
thread.join()
212+
assert_correct_response(resp1, req1)
213+
assert_correct_response(resp2, req2)
214+
assert_correct_response(resp3, req3)
215+
assert_correct_response(resp4, req4)
216+
assert (
217+
self._caplog.record_tuples.count(
218+
(
219+
"scrapy-playwright",
220+
logging.DEBUG,
221+
"Browser disconnected",
222+
)
223+
)
224+
== 2 # one mid-crawl after killing the browser process, one at the end
225+
)
226+
assert (
227+
self._caplog.record_tuples.count(
228+
(
229+
"scrapy-playwright",
230+
logging.INFO,
231+
"Launching browser chromium",
232+
)
233+
)
234+
== 2 # one at the beginning, one after killing the broser process
235+
)
236+
237+
@pytest.mark.skipif(
238+
platform.system() == "Windows",
239+
reason="os.kill does not work as expected on Windows",
240+
)
241+
async def test_browser_crashed_do_not_restart(self):
242+
spider = Spider("foo")
243+
settings_dict = {
244+
"PLAYWRIGHT_BROWSER_TYPE": "chromium",
245+
"PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER": False,
246+
}
247+
async with make_handler(settings_dict=settings_dict) as handler:
248+
with StaticMockServer() as server:
249+
await asyncio.sleep(1) # allow time for the browser to fully launch
250+
req1 = Request(
251+
server.urljoin("/index.html"),
252+
meta={"playwright": True, "playwright_include_page": True},
253+
)
254+
resp1 = await handler._download_request(req1, spider)
255+
assert_correct_response(resp1, req1)
256+
thread = Thread(target=self.kill_chrome, daemon=True)
257+
thread.start()
258+
req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
259+
req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True})
260+
req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True})
261+
with pytest.raises(TargetClosedError):
262+
await handler._download_request(req2, spider)
263+
await handler._download_request(req3, spider)
264+
await handler._download_request(req4, spider)
265+
thread.join()

tox.ini

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,9 @@ commands =
6363

6464
[testenv:pylint]
6565
deps =
66-
pytest==7.4.0
66+
psutil==5.9.7
6767
pylint==3.2.2
68+
pytest==7.4.0
6869
commands =
6970
pip install -e .
7071
pylint {posargs: scrapy_playwright setup.py tests}

0 commit comments

Comments
 (0)