diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index d15f678..3a011f1 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -5,7 +5,7 @@ import base64 import re -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy import Selector from scrapy_splash.utils import headers_to_scrapy @@ -70,6 +70,14 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) +class SplashHtmlResponse(SplashTextResponse,HtmlResponse): + """ + This HtmlResponse subclass sets response.url to the URL of a remote website + instead of the URL of the Splash server. The "real" response URL is still available + as ``response.real_url``. + """ + pass + class SplashJsonResponse(SplashResponse): """ Splash Response with JSON data. It provides a convenient way to access diff --git a/scrapy_splash/responsetypes.py b/scrapy_splash/responsetypes.py index 04e9264..71fb242 100644 --- a/scrapy_splash/responsetypes.py +++ b/scrapy_splash/responsetypes.py @@ -9,12 +9,12 @@ class SplashResponseTypes(ResponseTypes): CLASSES = { - 'text/html': 'scrapy_splash.response.SplashTextResponse', + 'text/html': 'scrapy_splash.response.SplashHtmlResponse', 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse', + 'application/xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', + 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', 'application/xml': 'scrapy_splash.response.SplashTextResponse', 'application/json': 'scrapy_splash.response.SplashJsonResponse', 'application/x-json': 'scrapy_splash.response.SplashJsonResponse',