
Commit 9b43905

preserve proxy response headers across multiple requests
1 parent: e88153a

3 files changed (+26 -2 lines)

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scrapy-proxy-headers"
-version = "0.1.3"
+version = "0.1.4"
 authors = [
   { name="ProxyMesh", email="support@proxymesh.com" },
 ]

scrapy_proxy_headers/agent.py

Lines changed: 2 additions & 0 deletions
@@ -124,4 +124,6 @@ def _cb_bodydone(self, result, request, url: str):
         proxy_response_headers = getattr(self._agent._endpoint, '_proxy_response_headers', None)
         if proxy_response_headers:
             r.headers.update(proxy_response_headers)
+            # save this for the download handler
+            r._proxy_response_headers = proxy_response_headers
         return r
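
With this change, the proxy's tunnel response headers are both merged into the Scrapy response and stashed on `_proxy_response_headers` so the download handler below can cache them. In a spider they then read like ordinary response headers; a minimal sketch, assuming the proxy returns an `X-ProxyMesh-IP` header (the header name and spider are illustrative, not part of this commit):

    import scrapy

    class ProxyHeadersSpider(scrapy.Spider):
        name = "proxy_headers_example"  # hypothetical spider for illustration
        start_urls = ["https://example.com/"]

        def parse(self, response):
            # The tunnel headers were merged into response.headers by the
            # agent, so they can be read like any other response header.
            proxy_ip = response.headers.get("X-ProxyMesh-IP")  # assumed header name
            self.logger.info("Proxy IP header: %r", proxy_ip)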

scrapy_proxy_headers/download_handler.py

Lines changed: 23 additions & 1 deletion
@@ -2,6 +2,10 @@
 from scrapy_proxy_headers.agent import ScrapyProxyHeadersAgent
 
 class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._proxy_headers_by_proxy = {}
+
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
         agent = ScrapyProxyHeadersAgent(
@@ -12,4 +16,22 @@ def download_request(self, request, spider):
             fail_on_dataloss=self._fail_on_dataloss,
             crawler=self._crawler,
         )
-        return agent.download_request(request)
+        response = agent.download_request(request)
+        proxy = request.meta.get("proxy")
+
+        if proxy:
+            # Proxy tunnels can get re-used, and when that happens the proxy
+            # headers are not available in subsequent responses. So we save
+            # the proxy headers per proxy, from the first tunnel response,
+            # and add them to subsequent responses.
+            def callback(response):
+                if hasattr(response, '_proxy_response_headers'):
+                    self._proxy_headers_by_proxy[proxy] = response._proxy_response_headers
+
+                if proxy in self._proxy_headers_by_proxy:
+                    response.headers.update(self._proxy_headers_by_proxy[proxy])
+
+                return response
+
+            response.addCallback(callback)
+        return response
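
To use the handler, point Scrapy's DOWNLOAD_HANDLERS setting at it and route requests through a proxy via request.meta. A minimal sketch of the settings wiring; the module path follows this commit's layout:

    # settings.py -- a minimal sketch; the handler class path assumes the
    # package layout shown in this commit
    DOWNLOAD_HANDLERS = {
        "https": "scrapy_proxy_headers.download_handler.HTTP11ProxyDownloadHandler",
    }

Caching is keyed on the exact URL in request.meta["proxy"], so requests without a proxy skip the new callback entirely, and each distinct proxy URL keeps its own saved tunnel headers.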
