Skip to content

Commit 37cb43b

Browse files
committed
Merge pull request #176 from lavenderx/master
add PhantomJSDownloader
2 parents 2400ff7 + 7628dc6 commit 37cb43b

File tree

4 files changed

+182
-0
lines changed

4 files changed

+182
-0
lines changed
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package us.codecraft.webmagic.downloader;
2+
3+
import org.apache.http.annotation.ThreadSafe;
4+
import org.slf4j.Logger;
5+
import org.slf4j.LoggerFactory;
6+
import us.codecraft.webmagic.Page;
7+
import us.codecraft.webmagic.Request;
8+
import us.codecraft.webmagic.Task;
9+
import us.codecraft.webmagic.selector.PlainText;
10+
11+
import java.io.*;
12+
13+
/**
14+
* this downloader is used to download pages which need to render the javascript
15+
*
16+
* @author dolphineor@gmail.com
17+
* @version 0.5.3
18+
*/
19+
@ThreadSafe
20+
public class PhantomJSDownloader extends AbstractDownloader {
21+
22+
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
23+
private static String phantomJSPath;
24+
25+
private int retryNum;
26+
private int threadNum;
27+
28+
public PhantomJSDownloader() {
29+
PhantomJSDownloader.phantomJSPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
30+
}
31+
32+
@Override
33+
public Page download(Request request, Task task) {
34+
if (logger.isInfoEnabled()) {
35+
logger.info("downloading page: " + request.getUrl());
36+
}
37+
String content = getPage(request);
38+
if (content.contains("HTTP request failed")) {
39+
for (int i = 1; i <= getRetryNum(); i++) {
40+
content = getPage(request);
41+
if (!content.contains("HTTP request failed")) {
42+
break;
43+
}
44+
}
45+
if (content.contains("HTTP request failed")) {
46+
//when failed
47+
Page page = new Page();
48+
page.setRequest(request);
49+
return page;
50+
}
51+
}
52+
53+
Page page = new Page();
54+
page.setRawText(content);
55+
page.setUrl(new PlainText(request.getUrl()));
56+
page.setRequest(request);
57+
page.setStatusCode(200);
58+
return page;
59+
}
60+
61+
@Override
62+
public void setThread(int threadNum) {
63+
this.threadNum = threadNum;
64+
}
65+
66+
protected String getPage(Request request) {
67+
try {
68+
String url = request.getUrl();
69+
Runtime runtime = Runtime.getRuntime();
70+
Process process = runtime.exec("phantomjs " + phantomJSPath + url);
71+
InputStream is = process.getInputStream();
72+
BufferedReader br = new BufferedReader(new InputStreamReader(is));
73+
StringBuffer stringBuffer = new StringBuffer();
74+
String line;
75+
while ((line = br.readLine()) != null) {
76+
stringBuffer.append(line).append("\n");
77+
}
78+
return stringBuffer.toString();
79+
} catch (IOException e) {
80+
e.printStackTrace();
81+
}
82+
83+
return null;
84+
}
85+
86+
public int getRetryNum() {
87+
return retryNum;
88+
}
89+
90+
public PhantomJSDownloader setRetryNum(int retryNum) {
91+
this.retryNum = retryNum;
92+
return this;
93+
}
94+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// PhantomJS script: loads the URL given as the first command-line argument and
// prints either the resulting page markup or a failure marker to the console.
var system = require('system');
var targetUrl = system.args[1];

var page = require('webpage').create();
// Do not load images; resourceTimeout is 5000 (milliseconds per the PhantomJS docs).
page.settings.loadImages = false;
page.settings.resourceTimeout = 5000;

// Reports the outcome of the load, then releases the page and ends the process.
function reportAndExit(status) {
    var failed = status != 'success';
    console.log(failed ? "HTTP request failed!" : page.content);

    page.close();
    phantom.exit();
}

page.open(targetUrl, reportAndExit);
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package us.codecraft.webmagic.samples;
2+
3+
import us.codecraft.webmagic.Page;
4+
import us.codecraft.webmagic.ResultItems;
5+
import us.codecraft.webmagic.Site;
6+
import us.codecraft.webmagic.Spider;
7+
import us.codecraft.webmagic.downloader.PhantomJSDownloader;
8+
import us.codecraft.webmagic.pipeline.CollectorPipeline;
9+
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
10+
import us.codecraft.webmagic.processor.PageProcessor;
11+
12+
import java.util.List;
13+
14+
/**
15+
* Created by dolphineor on 2014-11-21.
16+
* <p/>
17+
* 以淘宝为例, 搜索冬装的相关结果
18+
*/
19+
public class PhantomJSPageProcessor implements PageProcessor {
20+
21+
private Site site = Site.me()
22+
.setDomain("s.taobao.com")
23+
.setCharset("GBK")
24+
.addHeader("Referer", "http://www.taobao.com/")
25+
.setRetryTimes(3).setSleepTime(1000);
26+
27+
@Override
28+
public void process(Page page) {
29+
if (page.getRawText() != null)
30+
page.putField("html", page.getRawText());
31+
}
32+
33+
@Override
34+
public Site getSite() {
35+
return site;
36+
}
37+
38+
public static void main(String[] args) throws Exception {
39+
PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
40+
41+
CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
42+
43+
Spider.create(new PhantomJSPageProcessor())
44+
.addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc") //%B6%AC%D7%B0为冬装的GBK编码
45+
.setDownloader(phantomDownloader)
46+
.addPipeline(collectorPipeline)
47+
.thread((Runtime.getRuntime().availableProcessors() - 1) << 1)
48+
.run();
49+
50+
List<ResultItems> resultItemsList = collectorPipeline.getCollected();
51+
System.out.println(resultItemsList.get(0).get("html").toString());
52+
}
53+
54+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// PhantomJS script: opens the URL passed as the first command-line argument and
// writes the page markup, or a failure marker, to the console before exiting.
var system = require('system');
var requestedUrl = system.args[1];

var page = require('webpage').create();
var settings = page.settings;
// Images are not loaded; resourceTimeout is 5000 (milliseconds per the PhantomJS docs).
settings.loadImages = false;
settings.resourceTimeout = 5000;

page.open(requestedUrl, function (status) {
    var output;
    if (status != 'success') {
        output = "HTTP request failed!";
    } else {
        output = page.content;
    }
    console.log(output);

    // Always release the page and terminate, whatever the outcome was.
    page.close();
    phantom.exit();
});

0 commit comments

Comments
 (0)