Skip to content

Commit 6853d94

Browse files
authored
Set redirect meta (#142)
1 parent 14da796 commit 6853d94

File tree

3 files changed: +55 -7 lines changed

scrapy_playwright/handler.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
303303
)
304304
headers = Headers()
305305
else:
306+
await self._set_redirect_meta(request=request, response=response)
306307
headers = Headers(await response.all_headers())
307308
headers.pop("Content-Encoding", None)
308309
await self._apply_page_methods(page, request)
@@ -334,6 +335,23 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
334335
ip_address=server_ip_address,
335336
)
336337

338+
async def _set_redirect_meta(self, request: Request, response: PlaywrightResponse) -> None:
    """Record the redirect chain that led to ``response`` in ``request.meta``.

    Follows the ``redirected_from`` links backwards, starting at
    ``response.request``. When at least one hop is found, stores:

    * ``redirect_times``: the number of hops,
    * ``redirect_urls``: the URL of each hop, oldest first,
    * ``redirect_reasons``: the status of each hop's response, oldest first
      (``None`` for a hop whose ``response()`` resolves to ``None``).

    ``request.meta`` is left untouched when there is no hop.
    """
    hop_urls: list = []
    hop_statuses: list = []
    hop = response.request.redirected_from
    while hop is not None:
        # The chain is walked newest-to-oldest; prepend to end up oldest-first.
        hop_urls.insert(0, hop.url)
        hop_response = await hop.response()
        hop_statuses.insert(0, hop_response.status if hop_response is not None else None)
        hop = hop.redirected_from
    if not hop_urls:
        return
    request.meta["redirect_times"] = len(hop_urls)
    request.meta["redirect_urls"] = hop_urls
    request.meta["redirect_reasons"] = hop_statuses
354+
337355
async def _apply_page_methods(self, page: Page, request: Request) -> None:
338356
page_methods = request.meta.get("playwright_page_methods") or ()
339357
if isinstance(page_methods, dict):

tests/mockserver.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def urljoin(self, url):
4040

4141

4242
class _RequestHandler(BaseHTTPRequestHandler):
43-
def do_POST(self):
43+
def do_POST(self) -> None:
4444
"""Echo back the request body"""
4545
content_length = int(self.headers["Content-Length"])
4646
body = self.rfile.read(content_length)
@@ -49,21 +49,33 @@ def do_POST(self):
4949
self.wfile.write(b"Request body: ")
5050
self.wfile.write(body)
5151

52-
def do_GET(self) -> None:
    """Serve the GET endpoints of the mock server.

    * ``/headers``: echo the request headers back as JSON.
    * ``/redirect2``: respond 302 with ``Location: /redirect``.
    * ``/redirect``: respond 301 with ``Location: /headers``.
    * ``/delay/<seconds>``: sleep that many seconds, then return the delay
      as JSON.
    * anything else: JSON error payload with status 400.
    """
    # Redirecting paths mapped to their (status code, Location header) pair.
    redirects = {
        "/redirect2": (302, "/redirect"),
        "/redirect": (301, "/headers"),
    }

    if self.path == "/headers":
        self._send_json(dict(self.headers))
        return

    if self.path in redirects:
        status, location = redirects[self.path]
        self.send_response(status)
        self.send_header("Location", location)
        self.end_headers()
        return

    delay_match = re.match(r"^/delay/(\d+)$", self.path)
    if delay_match is None:
        self._send_json({"error": "unknown path"}, status=400)
        return

    delay = int(delay_match.group(1))
    print(f"Sleeping {delay} seconds...")
    time.sleep(delay)
    self._send_json({"delay": delay})
72+
73+
def _send_json(self, body: dict, status: int = 200) -> None:
    """Send ``body`` as an indented JSON response.

    Emits the status line given by ``status`` (200 by default) and a
    ``Content-Type: application/json`` header, then writes the UTF-8 encoded
    JSON document to the response stream.
    """
    self.send_response(status)
    self.send_header("Content-Type", "application/json")
    self.end_headers()
    # Serialization happens only after the headers have been emitted.
    self.wfile.write(json.dumps(body, indent=4).encode("utf8"))
6779

6880

6981
class MockServer:

tests/test_playwright_requests.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,24 @@ async def init_page(page, request, unused_arg):
341341
assert f"[Context=default] Page init callback exception for {req!r}" in log_entry[2]
342342
assert "init_page() missing 1 required positional argument: 'unused_arg'" in log_entry[2]
343343

344+
@pytest.mark.asyncio
async def test_redirect(self):
    """Check that a two-hop redirect chain is reported through the response meta.

    Requests ``/redirect2`` on the mock server and verifies the final URL,
    the hop count, the per-hop status codes and the per-hop URLs.
    """
    settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}
    async with make_handler(settings) as handler:
        with MockServer() as server:
            start_url = server.urljoin("/redirect2")
            req = Request(url=start_url, meta={"playwright": True})
            response = await handler._download_request(req, Spider("spider_name"))

            meta = response.meta
            assert response.url == server.urljoin("/headers")
            assert meta["redirect_times"] == 2
            assert meta["redirect_reasons"] == [302, 301]
            # Hops are expected oldest first, starting with the requested URL.
            expected_hops = [server.urljoin("/redirect2"), server.urljoin("/redirect")]
            assert meta["redirect_urls"] == expected_hops
361+
344362

345363
class TestCaseChromium(MixinTestCase):
346364
browser_type = "chromium"

0 commit comments

Comments
 (0)