
Commit 9b43905

preserve proxy response headers across multiple requests
1 parent: e88153a

3 files changed (+26 -2 lines)

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scrapy-proxy-headers"
-version = "0.1.3"
+version = "0.1.4"
 authors = [
   { name="ProxyMesh", email="support@proxymesh.com" },
 ]

scrapy_proxy_headers/agent.py

Lines changed: 2 additions & 0 deletions
@@ -124,4 +124,6 @@ def _cb_bodydone(self, result, request, url: str):
         proxy_response_headers = getattr(self._agent._endpoint, '_proxy_response_headers', None)
         if proxy_response_headers:
             r.headers.update(proxy_response_headers)
+            # save this for the download handler
+            r._proxy_response_headers = proxy_response_headers
         return r
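
With this change, the proxy's tunnel response headers are both merged into the Scrapy response and stashed on `_proxy_response_headers` so the download handler below can cache them. In a spider they then read like ordinary response headers; a minimal sketch, assuming the proxy returns an `X-ProxyMesh-IP` header (the header name and spider are illustrative, not part of this commit):

    import scrapy

    class ProxyHeadersSpider(scrapy.Spider):
        name = "proxy_headers_example"  # hypothetical spider for illustration
        start_urls = ["https://example.com/"]

        def parse(self, response):
            # The tunnel headers were merged into response.headers by the
            # agent, so they can be read like any other response header.
            proxy_ip = response.headers.get("X-ProxyMesh-IP")  # assumed header name
            self.logger.info("Proxy IP header: %r", proxy_ip)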

scrapy_proxy_headers/download_handler.py

Lines changed: 23 additions & 1 deletion
@@ -2,6 +2,10 @@
 from scrapy_proxy_headers.agent import ScrapyProxyHeadersAgent
 
 class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._proxy_headers_by_proxy = {}
+
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
         agent = ScrapyProxyHeadersAgent(
@@ -12,4 +16,22 @@ def download_request(self, request, spider):
             fail_on_dataloss=self._fail_on_dataloss,
             crawler=self._crawler,
         )
-        return agent.download_request(request)
+        response = agent.download_request(request)
+        proxy = request.meta.get("proxy")
+
+        if proxy:
+            # Proxy tunnels can get re-used, and when that happens the proxy
+            # headers are not available in subsequent responses. So we save
+            # the proxy headers per proxy, from the first tunnel response,
+            # and add them to subsequent responses.
+            def callback(response):
+                if hasattr(response, '_proxy_response_headers'):
+                    self._proxy_headers_by_proxy[proxy] = response._proxy_response_headers
+
+                if proxy in self._proxy_headers_by_proxy:
+                    response.headers.update(self._proxy_headers_by_proxy[proxy])
+
+                return response
+
+            response.addCallback(callback)
+        return response
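
To use the handler, point Scrapy's DOWNLOAD_HANDLERS setting at it and route requests through a proxy via request.meta. A minimal sketch of the settings wiring; the module path follows this commit's layout:

    # settings.py -- a minimal sketch; the handler class path assumes the
    # package layout shown in this commit
    DOWNLOAD_HANDLERS = {
        "https": "scrapy_proxy_headers.download_handler.HTTP11ProxyDownloadHandler",
    }

Caching is keyed on the exact URL in request.meta["proxy"], so requests without a proxy skip the new callback entirely, and each distinct proxy URL keeps its own saved tunnel headers.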
