Merge pull request #1085 from vioao/common-downloader-error-process

sutra · web-flow · commit db9c92edf5ac · 2022-10-01T17:44:43.000+08:00
Common downloader error process
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
@@ -26,7 +26,7 @@ public Html download(String url) {
     /**
      * A simple method to download a url.
      *
-     * @param url url
+     * @param url     url
      * @param charset charset
      * @return html
      */
@@ -38,7 +38,7 @@ public Html download(String url, String charset) {
     protected void onSuccess(Request request) {
     }
 
-    protected void onError(Request request) {
+    protected void onError(Request request, Throwable e) {
     }
 
 }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -87,7 +87,7 @@ public Page download(Request request, Task task) {
             return page;
         } catch (IOException e) {
             logger.warn("download page {} error", request.getUrl(), e);
-            onError(request);
+            onError(request, e);
             return page;
         } finally {
             if (httpResponse != null) {
@@ -110,7 +110,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
         String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
         Page page = new Page();
         page.setBytes(bytes);
-        if (!request.isBinaryContent()){
+        if (!request.isBinaryContent()) {
             if (charset == null) {
                 charset = getHtmlCharset(contentType, bytes);
             }
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
@@ -16,135 +16,112 @@
  * @version 0.5.3
  */
 public class PhantomJSDownloader extends AbstractDownloader {
-
-    private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
+    private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
     private static String crawlJsPath;
     private static String phantomJsCommand = "phantomjs"; // default
 
-    private int retryNum;
-    private int threadNum;
-
     public PhantomJSDownloader() {
         this.initPhantomjsCrawlPath();
     }
-    
+
     /**
      * 添加新的构造函数，支持phantomjs自定义命令
-     * 
-     * example: 
-     *    phantomjs.exe 支持windows环境
-     *    phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
-     *    /usr/local/bin/phantomjs 命令的绝对路径，避免因系统环境变量引起的IOException
-     *   
+     * <p>
+     * example:
+     * phantomjs.exe 支持windows环境
+     * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
+     * /usr/local/bin/phantomjs 命令的绝对路径，避免因系统环境变量引起的IOException
+     *
      * @param phantomJsCommand phantomJsCommand
      */
     public PhantomJSDownloader(String phantomJsCommand) {
         this.initPhantomjsCrawlPath();
         PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
     }
-    
+
     /**
     * 新增构造函数，支持crawl.js路径自定义，因为当其他项目依赖此jar包时，runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js
      * <pre>
      * crawl.js start --
-     * 
+     *
      *   var system = require('system');
      *   var url = system.args[1];
-     *   
+     *
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *   
+     *
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *   
+     *
      *       page.close();
      *       phantom.exit();
      *   });
-     *   
+     *
      * -- crawl.js end
      * </pre>
      * 具体项目时可以将以上js代码复制下来使用
-     *   
+     * <p>
      * example:
-     *    new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
-     * 
+     * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
+     *
      * @param phantomJsCommand phantomJsCommand
-     * @param crawlJsPath crawlJsPath
+     * @param crawlJsPath      crawlJsPath
      */
     public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
-      PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
-      PhantomJSDownloader.crawlJsPath = crawlJsPath;
+        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
+        PhantomJSDownloader.crawlJsPath = crawlJsPath;
     }
-    
+
     private void initPhantomjsCrawlPath() {
-        PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
+        PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
+                + System.getProperty("file.separator") + "crawl.js ";
     }
 
     @Override
     public Page download(Request request, Task task) {
         if (logger.isInfoEnabled()) {
             logger.info("downloading page: " + request.getUrl());
         }
-        String content = getPage(request);
-        if (content.contains("HTTP request failed")) {
-            for (int i = 1; i <= getRetryNum(); i++) {
-                content = getPage(request);
-                if (!content.contains("HTTP request failed")) {
-                    break;
-                }
-            }
-            if (content.contains("HTTP request failed")) {
-                //when failed
-                Page page = new Page();
+
+        Page page = Page.fail();
+        try {
+            String content = getPage(request);
+            if (!content.contains("HTTP request failed")) {
+                page.setDownloadSuccess(true);
+                page.setRawText(content);
+                page.setUrl(new PlainText(request.getUrl()));
                 page.setRequest(request);
-                return page;
+                page.setStatusCode(200);
             }
+            onSuccess(request);
+        } catch (Exception e) {
+            onError(request, e);
+            logger.warn("download page {} error", request.getUrl(), e);
         }
-
-        Page page = new Page();
-        page.setRawText(content);
-        page.setUrl(new PlainText(request.getUrl()));
-        page.setRequest(request);
-        page.setStatusCode(200);
         return page;
     }
 
     @Override
     public void setThread(int threadNum) {
-        this.threadNum = threadNum;
+        // ignore
     }
 
-    protected String getPage(Request request) {
-        try {
-            String url = request.getUrl();
-            Runtime runtime = Runtime.getRuntime();
-            Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
-            InputStream is = process.getInputStream();
-            BufferedReader br = new BufferedReader(new InputStreamReader(is));
-            StringBuffer stringBuffer = new StringBuffer();
-            String line;
-            while ((line = br.readLine()) != null) {
-                stringBuffer.append(line).append("\n");
-            }
-            return stringBuffer.toString();
-        } catch (IOException e) {
-            e.printStackTrace();
+    protected String getPage(Request request) throws Exception {
+        String url = request.getUrl();
+        Runtime runtime = Runtime.getRuntime();
+        Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
+        InputStream is = process.getInputStream();
+        BufferedReader br = new BufferedReader(new InputStreamReader(is));
+        StringBuilder builder = new StringBuilder();
+        String line;
+        while ((line = br.readLine()) != null) {
+            builder.append(line).append("\n");
         }
-
-        return null;
-    }
-
-    public int getRetryNum() {
-        return retryNum;
-    }
-
-    public PhantomJSDownloader setRetryNum(int retryNum) {
-        this.retryNum = retryNum;
-        return this;
+        return builder.toString();
     }
 }
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
@@ -36,7 +36,7 @@ public Site getSite() {
     }
 
     public static void main(String[] args) throws Exception {
-        PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
+        PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
 
         CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
 
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ public Html download(String url) {`
`26`	`26`	`/**`
`27`	`27`	`* A simple method to download a url.`
`28`	`28`	`*`
`29`		`- * @param url url`
	`29`	`+ * @param url url`
`30`	`30`	`* @param charset charset`
`31`	`31`	`* @return html`
`32`	`32`	`*/`
`@@ -38,7 +38,7 @@ public Html download(String url, String charset) {`
`38`	`38`	`protected void onSuccess(Request request) {`
`39`	`39`	`}`
`40`	`40`
`41`		`- protected void onError(Request request) {`
	`41`	`+ protected void onError(Request request, Throwable e) {`
`42`	`42`	`}`
`43`	`43`
`44`	`44`	`}`
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ public Site getSite() {`
`36`	`36`	`}`
`37`	`37`
`38`	`38`	`public static void main(String[] args) throws Exception {`
`39`		`- PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);`
	`39`	`+ PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();`
`40`	`40`
`41`	`41`	`CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();`
`42`	`42`