|
16 | 16 | * @version 0.5.3 |
17 | 17 | */ |
18 | 18 | public class PhantomJSDownloader extends AbstractDownloader { |
19 | | - |
20 | | - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); |
| 19 | + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); |
21 | 20 | private static String crawlJsPath; |
22 | 21 | private static String phantomJsCommand = "phantomjs"; // default |
23 | 22 |
|
24 | | - private int retryNum; |
25 | | - private int threadNum; |
26 | | - |
27 | 23 | public PhantomJSDownloader() { |
28 | 24 | this.initPhantomjsCrawlPath(); |
29 | 25 | } |
30 | | - |
| 26 | + |
31 | 27 | /** |
32 | 28 | * 添加新的构造函数,支持phantomjs自定义命令 |
33 | | - * |
34 | | - * example: |
35 | | - * phantomjs.exe 支持windows环境 |
36 | | - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 |
37 | | - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException |
38 | | - * |
| 29 | + * <p> |
| 30 | + * example: |
| 31 | + * phantomjs.exe 支持windows环境 |
| 32 | + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 |
| 33 | + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException |
| 34 | + * |
39 | 35 | * @param phantomJsCommand phantomJsCommand |
40 | 36 | */ |
41 | 37 | public PhantomJSDownloader(String phantomJsCommand) { |
42 | 38 | this.initPhantomjsCrawlPath(); |
43 | 39 | PhantomJSDownloader.phantomJsCommand = phantomJsCommand; |
44 | 40 | } |
45 | | - |
| 41 | + |
46 | 42 | /** |
47 | 43 | * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无法使用jar包中的crawl.js |
48 | 44 | * <pre> |
49 | 45 | * crawl.js start -- |
50 | | - * |
| 46 | + * |
51 | 47 | * var system = require('system'); |
52 | 48 | * var url = system.args[1]; |
53 | | - * |
| 49 | + * |
54 | 50 | * var page = require('webpage').create(); |
55 | 51 | * page.settings.loadImages = false; |
56 | 52 | * page.settings.resourceTimeout = 5000; |
57 | | - * |
| 53 | + * |
58 | 54 | * page.open(url, function (status) { |
59 | 55 | * if (status != 'success') { |
60 | 56 | * console.log("HTTP request failed!"); |
61 | 57 | * } else { |
62 | 58 | * console.log(page.content); |
63 | 59 | * } |
64 | | - * |
| 60 | + * |
65 | 61 | * page.close(); |
66 | 62 | * phantom.exit(); |
67 | 63 | * }); |
68 | | - * |
| 64 | + * |
69 | 65 | * -- crawl.js end |
70 | 66 | * </pre> |
71 | 67 | * 具体项目时可以将以上js代码复制下来使用 |
72 | | - * |
| 68 | + * <p> |
73 | 69 | * example: |
74 | | - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); |
75 | | - * |
| 70 | + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); |
| 71 | + * |
76 | 72 | * @param phantomJsCommand phantomJsCommand |
77 | | - * @param crawlJsPath crawlJsPath |
| 73 | + * @param crawlJsPath crawlJsPath |
78 | 74 | */ |
79 | 75 | public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { |
80 | | - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; |
81 | | - PhantomJSDownloader.crawlJsPath = crawlJsPath; |
| 76 | + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; |
| 77 | + PhantomJSDownloader.crawlJsPath = crawlJsPath; |
82 | 78 | } |
83 | | - |
| 79 | + |
84 | 80 | private void initPhantomjsCrawlPath() { |
85 | | - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; |
| 81 | + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() |
| 82 | + + System.getProperty("file.separator") + "crawl.js "; |
86 | 83 | } |
87 | 84 |
|
88 | 85 | @Override |
89 | 86 | public Page download(Request request, Task task) { |
90 | 87 | if (logger.isInfoEnabled()) { |
91 | 88 | logger.info("downloading page: " + request.getUrl()); |
92 | 89 | } |
93 | | - String content = getPage(request); |
94 | | - if (content.contains("HTTP request failed")) { |
95 | | - for (int i = 1; i <= getRetryNum(); i++) { |
96 | | - content = getPage(request); |
97 | | - if (!content.contains("HTTP request failed")) { |
98 | | - break; |
99 | | - } |
100 | | - } |
101 | | - if (content.contains("HTTP request failed")) { |
102 | | - //when failed |
103 | | - Page page = new Page(); |
| 90 | + |
| 91 | + Page page = Page.fail(); |
| 92 | + try { |
| 93 | + String content = getPage(request); |
| 94 | + if (!content.contains("HTTP request failed")) { |
| 95 | + page.setDownloadSuccess(true); |
| 96 | + page.setRawText(content); |
| 97 | + page.setUrl(new PlainText(request.getUrl())); |
104 | 98 | page.setRequest(request); |
105 | | - return page; |
| 99 | + page.setStatusCode(200); |
106 | 100 | } |
| 101 | + onSuccess(request); |
| 102 | + } catch (Exception e) { |
| 103 | + onError(request, e); |
| 104 | + logger.warn("download page {} error", request.getUrl(), e); |
107 | 105 | } |
108 | | - |
109 | | - Page page = new Page(); |
110 | | - page.setRawText(content); |
111 | | - page.setUrl(new PlainText(request.getUrl())); |
112 | | - page.setRequest(request); |
113 | | - page.setStatusCode(200); |
114 | 106 | return page; |
115 | 107 | } |
116 | 108 |
|
117 | 109 | @Override |
118 | 110 | public void setThread(int threadNum) { |
119 | | - this.threadNum = threadNum; |
| 111 | + // ignore |
120 | 112 | } |
121 | 113 |
|
122 | | - protected String getPage(Request request) { |
123 | | - try { |
124 | | - String url = request.getUrl(); |
125 | | - Runtime runtime = Runtime.getRuntime(); |
126 | | - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); |
127 | | - InputStream is = process.getInputStream(); |
128 | | - BufferedReader br = new BufferedReader(new InputStreamReader(is)); |
129 | | - StringBuffer stringBuffer = new StringBuffer(); |
130 | | - String line; |
131 | | - while ((line = br.readLine()) != null) { |
132 | | - stringBuffer.append(line).append("\n"); |
133 | | - } |
134 | | - return stringBuffer.toString(); |
135 | | - } catch (IOException e) { |
136 | | - e.printStackTrace(); |
| 114 | + protected String getPage(Request request) throws Exception { |
| 115 | + String url = request.getUrl(); |
| 116 | + Runtime runtime = Runtime.getRuntime(); |
| 117 | + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); |
| 118 | + InputStream is = process.getInputStream(); |
| 119 | + BufferedReader br = new BufferedReader(new InputStreamReader(is)); |
| 120 | + StringBuilder builder = new StringBuilder(); |
| 121 | + String line; |
| 122 | + while ((line = br.readLine()) != null) { |
| 123 | + builder.append(line).append("\n"); |
137 | 124 | } |
138 | | - |
139 | | - return null; |
140 | | - } |
141 | | - |
142 | | - public int getRetryNum() { |
143 | | - return retryNum; |
144 | | - } |
145 | | - |
146 | | - public PhantomJSDownloader setRetryNum(int retryNum) { |
147 | | - this.retryNum = retryNum; |
148 | | - return this; |
| 125 | + return builder.toString(); |
149 | 126 | } |
150 | 127 | } |
0 commit comments