Skip to content

Commit db9c92e

Browse files
authored
Merge pull request #1085 from vioao/common-downloader-error-process
Common downloader error process
2 parents a342884 + 5751681 commit db9c92e

File tree

5 files changed

+169
-184
lines changed

5 files changed

+169
-184
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public Html download(String url) {
2626
/**
2727
* A simple method to download a url.
2828
*
29-
* @param url url
29+
* @param url url
3030
* @param charset charset
3131
* @return html
3232
*/
@@ -38,7 +38,7 @@ public Html download(String url, String charset) {
3838
protected void onSuccess(Request request) {
3939
}
4040

41-
protected void onError(Request request) {
41+
protected void onError(Request request, Throwable e) {
4242
}
4343

4444
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ public Page download(Request request, Task task) {
8787
return page;
8888
} catch (IOException e) {
8989
logger.warn("download page {} error", request.getUrl(), e);
90-
onError(request);
90+
onError(request, e);
9191
return page;
9292
} finally {
9393
if (httpResponse != null) {
@@ -110,7 +110,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
110110
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
111111
Page page = new Page();
112112
page.setBytes(bytes);
113-
if (!request.isBinaryContent()){
113+
if (!request.isBinaryContent()) {
114114
if (charset == null) {
115115
charset = getHtmlCharset(contentType, bytes);
116116
}

webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java

Lines changed: 48 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -16,135 +16,112 @@
1616
* @version 0.5.3
1717
*/
1818
public class PhantomJSDownloader extends AbstractDownloader {
19-
20-
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
19+
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
2120
private static String crawlJsPath;
2221
private static String phantomJsCommand = "phantomjs"; // default
2322

24-
private int retryNum;
25-
private int threadNum;
26-
2723
public PhantomJSDownloader() {
2824
this.initPhantomjsCrawlPath();
2925
}
30-
26+
3127
/**
3228
* 添加新的构造函数,支持phantomjs自定义命令
33-
*
34-
* example:
35-
* phantomjs.exe 支持windows环境
36-
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
37-
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
38-
*
29+
* <p>
30+
* example:
31+
* phantomjs.exe 支持windows环境
32+
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
33+
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
34+
*
3935
* @param phantomJsCommand phantomJsCommand
4036
*/
4137
public PhantomJSDownloader(String phantomJsCommand) {
4238
this.initPhantomjsCrawlPath();
4339
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
4440
}
45-
41+
4642
/**
4743
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js
4844
* <pre>
4945
* crawl.js start --
50-
*
46+
*
5147
* var system = require('system');
5248
* var url = system.args[1];
53-
*
49+
*
5450
* var page = require('webpage').create();
5551
* page.settings.loadImages = false;
5652
* page.settings.resourceTimeout = 5000;
57-
*
53+
*
5854
* page.open(url, function (status) {
5955
* if (status != 'success') {
6056
* console.log("HTTP request failed!");
6157
* } else {
6258
* console.log(page.content);
6359
* }
64-
*
60+
*
6561
* page.close();
6662
* phantom.exit();
6763
* });
68-
*
64+
*
6965
* -- crawl.js end
7066
* </pre>
7167
* 具体项目时可以将以上js代码复制下来使用
72-
*
68+
* <p>
7369
* example:
74-
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
75-
*
70+
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
71+
*
7672
* @param phantomJsCommand phantomJsCommand
77-
* @param crawlJsPath crawlJsPath
73+
* @param crawlJsPath crawlJsPath
7874
*/
7975
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
80-
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
81-
PhantomJSDownloader.crawlJsPath = crawlJsPath;
76+
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
77+
PhantomJSDownloader.crawlJsPath = crawlJsPath;
8278
}
83-
79+
8480
private void initPhantomjsCrawlPath() {
85-
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
81+
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
82+
+ System.getProperty("file.separator") + "crawl.js ";
8683
}
8784

8885
@Override
8986
public Page download(Request request, Task task) {
9087
if (logger.isInfoEnabled()) {
9188
logger.info("downloading page: " + request.getUrl());
9289
}
93-
String content = getPage(request);
94-
if (content.contains("HTTP request failed")) {
95-
for (int i = 1; i <= getRetryNum(); i++) {
96-
content = getPage(request);
97-
if (!content.contains("HTTP request failed")) {
98-
break;
99-
}
100-
}
101-
if (content.contains("HTTP request failed")) {
102-
//when failed
103-
Page page = new Page();
90+
91+
Page page = Page.fail();
92+
try {
93+
String content = getPage(request);
94+
if (!content.contains("HTTP request failed")) {
95+
page.setDownloadSuccess(true);
96+
page.setRawText(content);
97+
page.setUrl(new PlainText(request.getUrl()));
10498
page.setRequest(request);
105-
return page;
99+
page.setStatusCode(200);
106100
}
101+
onSuccess(request);
102+
} catch (Exception e) {
103+
onError(request, e);
104+
logger.warn("download page {} error", request.getUrl(), e);
107105
}
108-
109-
Page page = new Page();
110-
page.setRawText(content);
111-
page.setUrl(new PlainText(request.getUrl()));
112-
page.setRequest(request);
113-
page.setStatusCode(200);
114106
return page;
115107
}
116108

117109
@Override
118110
public void setThread(int threadNum) {
119-
this.threadNum = threadNum;
111+
// ignore
120112
}
121113

122-
protected String getPage(Request request) {
123-
try {
124-
String url = request.getUrl();
125-
Runtime runtime = Runtime.getRuntime();
126-
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
127-
InputStream is = process.getInputStream();
128-
BufferedReader br = new BufferedReader(new InputStreamReader(is));
129-
StringBuffer stringBuffer = new StringBuffer();
130-
String line;
131-
while ((line = br.readLine()) != null) {
132-
stringBuffer.append(line).append("\n");
133-
}
134-
return stringBuffer.toString();
135-
} catch (IOException e) {
136-
e.printStackTrace();
114+
protected String getPage(Request request) throws Exception {
115+
String url = request.getUrl();
116+
Runtime runtime = Runtime.getRuntime();
117+
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
118+
InputStream is = process.getInputStream();
119+
BufferedReader br = new BufferedReader(new InputStreamReader(is));
120+
StringBuilder builder = new StringBuilder();
121+
String line;
122+
while ((line = br.readLine()) != null) {
123+
builder.append(line).append("\n");
137124
}
138-
139-
return null;
140-
}
141-
142-
public int getRetryNum() {
143-
return retryNum;
144-
}
145-
146-
public PhantomJSDownloader setRetryNum(int retryNum) {
147-
this.retryNum = retryNum;
148-
return this;
125+
return builder.toString();
149126
}
150127
}

webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public Site getSite() {
3636
}
3737

3838
public static void main(String[] args) throws Exception {
39-
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
39+
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
4040

4141
CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
4242

0 commit comments

Comments
 (0)