From 68acbe9c32456284b1ba2136a4ca5cf7d99391f1 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 13:29:02 +0530 Subject: [PATCH 1/9] added SplashHtmlResponse --- scrapy_splash/response.py | 9 ++++++++- scrapy_splash/responsetypes.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index d15f678..c1bf0de 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -5,7 +5,7 @@ import base64 import re -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy import Selector from scrapy_splash.utils import headers_to_scrapy @@ -70,6 +70,13 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) +class SplashHtmlResponse(_SplashResponseMixin, HtmlResponse): + """ + This HtmlResponse subclass sets response.url to the URL of a remote website + instead of an URL of Splash server. "Real" response URL is still available + as ``response.real_url``. + """ + class SplashJsonResponse(SplashResponse): """ Splash Response with JSON data. It provides a convenient way to access diff --git a/scrapy_splash/responsetypes.py b/scrapy_splash/responsetypes.py index 04e9264..a510185 100644 --- a/scrapy_splash/responsetypes.py +++ b/scrapy_splash/responsetypes.py @@ -9,7 +9,7 @@ class SplashResponseTypes(ResponseTypes): CLASSES = { - 'text/html': 'scrapy_splash.response.SplashTextResponse', + 'text/html': 'scrapy_splash.response.SplashHtmlResponse', 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', From 4b97884c4d4000b2bc41df05119c11ddc3e47631 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 14:28:11 +0530 Subject: [PATCH 2/9] fixed test_middleware --- tests/test_middleware.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 5ac52bf..b19b8c8 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -7,7 +7,7 @@ import scrapy from scrapy.core.engine import ExecutionEngine from scrapy.utils.test import get_crawler -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware import scrapy_splash @@ -82,14 +82,14 @@ def test_splash_request(): assert json.loads(to_native_str(req2.body)) == expected_body # check response post-processing - response = TextResponse("http://127.0.0.1:8050/render.html", + response = HtmlResponse("http://127.0.0.1:8050/render.html", # Scrapy doesn't pass request to constructor # request=req2, headers={b'Content-Type': b'text/html'}, body=b"Hello") response2 = mw.process_response(req2, response, None) response2 = cookie_mw.process_response(req2, response2, None) - assert isinstance(response2, scrapy_splash.SplashTextResponse) + assert isinstance(response2, scrapy_splash.SplashHtmlResponse) assert response2 is not response assert response2.real_url == req2.url assert response2.url == req.url From 2d87834547464ea32fb5abc0c9b1f2ad9ccf4f6a Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 14:54:32 +0530 Subject: [PATCH 3/9] fixed test_middleware1 --- tests/test_middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index b19b8c8..511c8ab 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -89,7 +89,7 @@ def test_splash_request(): body=b"Hello") response2 = mw.process_response(req2, response, None) response2 = cookie_mw.process_response(req2, response2, None) - assert isinstance(response2, scrapy_splash.SplashHtmlResponse) + assert isinstance(response2, scrapy_splash.SplashTextResponse) assert response2 is not response assert response2.real_url == req2.url assert response2.url == req.url From 044e6e93c993dff7ba1fa7be3434bbe6d7612d54 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 15:16:08 +0530 Subject: [PATCH 4/9] fixed response --- scrapy_splash/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index c1bf0de..c9dbafb 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -70,7 +70,7 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) -class SplashHtmlResponse(_SplashResponseMixin, HtmlResponse): +class SplashHtmlResponse(_SplashResponseMixin, SplashTextResponse): """ This HtmlResponse subclass sets response.url to the URL of a remote website instead of an URL of Splash server. "Real" response URL is still available From 62e50d6d74c58cd01420f9e095f6cccbf8a3afc7 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 15:21:21 +0530 Subject: [PATCH 5/9] fixed response1 --- tests/test_middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 511c8ab..5579e18 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -82,7 +82,7 @@ def test_splash_request(): assert json.loads(to_native_str(req2.body)) == expected_body # check response post-processing - response = HtmlResponse("http://127.0.0.1:8050/render.html", + response = TextResponse("http://127.0.0.1:8050/render.html", # Scrapy doesn't pass request to constructor # request=req2, headers={b'Content-Type': b'text/html'}, From b688e628e57faf35f07cbf6c738c8a6b39f710c3 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 15:26:36 +0530 Subject: [PATCH 6/9] fixed response2 --- scrapy_splash/response.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index c9dbafb..27cf957 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -70,12 +70,13 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) -class SplashHtmlResponse(_SplashResponseMixin, SplashTextResponse): +class SplashHtmlResponse(SplashTextResponse): """ This HtmlResponse subclass sets response.url to the URL of a remote website instead of an URL of Splash server. "Real" response URL is still available as ``response.real_url``. """ + pass class SplashJsonResponse(SplashResponse): """ From d5554e692078dac77a91b128ae4d34fb530b1d38 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Tue, 28 Mar 2017 09:26:06 +0530 Subject: [PATCH 7/9] added as subclass of HtmlResponse --- scrapy_splash/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index 27cf957..3a011f1 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -70,7 +70,7 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) -class SplashHtmlResponse(SplashTextResponse): +class SplashHtmlResponse(SplashTextResponse,HtmlResponse): """ This HtmlResponse subclass sets response.url to the URL of a remote website instead of an URL of Splash server. "Real" response URL is still available From b043b1fc3b9900e48f7eaac546312238046e313f Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Tue, 28 Mar 2017 09:30:22 +0530 Subject: [PATCH 8/9] now support application/xhtml+xml and application/vnd.wap.xhtml+xml --- scrapy_splash/responsetypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_splash/responsetypes.py b/scrapy_splash/responsetypes.py index a510185..71fb242 100644 --- a/scrapy_splash/responsetypes.py +++ b/scrapy_splash/responsetypes.py @@ -13,8 +13,8 @@ class SplashResponseTypes(ResponseTypes): 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse', + 'application/xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', + 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', 'application/xml': 'scrapy_splash.response.SplashTextResponse', 'application/json': 'scrapy_splash.response.SplashJsonResponse', 'application/x-json': 'scrapy_splash.response.SplashJsonResponse', From 6725240f1ccd70970f87bea90da44643fbd3c9a6 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Tue, 28 Mar 2017 09:37:29 +0530 Subject: [PATCH 9/9] removed unused import --- tests/test_middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 5579e18..5ac52bf 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -7,7 +7,7 @@ import scrapy from scrapy.core.engine import ExecutionEngine from scrapy.utils.test import get_crawler -from scrapy.http import Response, TextResponse, HtmlResponse +from scrapy.http import Response, TextResponse from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware import scrapy_splash