|
1 | 1 | package us.codecraft.webmagic.proxy; |
2 | 2 |
|
3 | 3 | import org.apache.http.HttpHost; |
4 | | -import org.slf4j.Logger; |
5 | | -import org.slf4j.LoggerFactory; |
6 | | -import us.codecraft.webmagic.utils.FilePersistentBase; |
7 | | -import us.codecraft.webmagic.utils.ProxyUtils; |
8 | | - |
9 | | -import java.io.*; |
10 | | -import java.net.InetAddress; |
11 | | -import java.net.UnknownHostException; |
12 | | -import java.util.*; |
13 | | -import java.util.Map.Entry; |
14 | | -import java.util.concurrent.BlockingQueue; |
15 | | -import java.util.concurrent.ConcurrentHashMap; |
16 | | -import java.util.concurrent.DelayQueue; |
17 | 4 |
|
18 | 5 | /** |
19 | | - * Pooled Proxy Object |
20 | | - * |
21 | | - * @author yxssfxwzy@sina.com <br> |
22 | | - * @see Proxy |
23 | | - * @since 0.5.1 |
| 6 | + * Created by edwardsbean on 15-2-28. |
24 | 7 | */ |
25 | | -public class ProxyPool { |
26 | | - |
27 | | - private Logger logger = LoggerFactory.getLogger(getClass()); |
28 | | - |
29 | | - private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>(); |
30 | | - private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>(); |
31 | | - |
32 | | - private int reuseInterval = 1500;// ms |
33 | | - private int reviveTime = 2 * 60 * 60 * 1000;// ms |
34 | | - private int saveProxyInterval = 10 * 60 * 1000;// ms |
35 | | - |
36 | | - private boolean isEnable = false; |
37 | | - private boolean validateWhenInit = false; |
38 | | - // private boolean isUseLastProxy = true; |
39 | | - private String proxyFilePath = "/data/webmagic/lastUse.proxy"; |
40 | | - |
41 | | - private FilePersistentBase fBase = new FilePersistentBase(); |
42 | | - |
43 | | - private Timer timer = new Timer(true); |
44 | | - private TimerTask saveProxyTask = new TimerTask() { |
45 | | - |
46 | | - @Override |
47 | | - public void run() { |
48 | | - saveProxyList(); |
49 | | - logger.info(allProxyStatus()); |
50 | | - } |
51 | | - }; |
52 | | - |
53 | | - public ProxyPool() { |
54 | | - this(null, true); |
55 | | - } |
56 | | - |
57 | | - public ProxyPool(List<String[]> httpProxyList) { |
58 | | - this(httpProxyList, true); |
59 | | - } |
60 | | - |
61 | | - public ProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) { |
62 | | - if (httpProxyList != null) { |
63 | | - addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); |
64 | | - } |
65 | | - if (isUseLastProxy) { |
66 | | - if (!new File(proxyFilePath).exists()) { |
67 | | - setFilePath(); |
68 | | - } |
69 | | - readProxyList(); |
70 | | - timer.schedule(saveProxyTask, 0, saveProxyInterval); |
71 | | - } |
72 | | - } |
73 | | - |
74 | | - private void setFilePath() { |
75 | | - String tmpDir = System.getProperty("java.io.tmpdir"); |
76 | | - String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy"; |
77 | | - if (tmpDir != null && new File(tmpDir).isDirectory()) { |
78 | | - fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic"); |
79 | | - File f = fBase.getFile(path); |
80 | | - if (!f.exists()) { |
81 | | - try { |
82 | | - f.createNewFile(); |
83 | | - |
84 | | - } catch (IOException e) { |
85 | | - logger.error("proxy file create error", e); |
86 | | - } |
87 | | - } |
88 | | - |
89 | | - } else { |
90 | | - logger.error("java tmp dir not exists"); |
91 | | - } |
92 | | - this.proxyFilePath = path; |
93 | | - } |
94 | | - |
95 | | - private void saveProxyList() { |
96 | | - if (allProxy.size() == 0) { |
97 | | - return; |
98 | | - } |
99 | | - try { |
100 | | - ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath))); |
101 | | - os.writeObject(prepareForSaving()); |
102 | | - os.close(); |
103 | | - logger.info("save proxy"); |
104 | | - } catch (FileNotFoundException e) { |
105 | | - logger.error("proxy file not found", e); |
106 | | - } catch (IOException e) { |
107 | | - e.printStackTrace(); |
108 | | - } |
109 | | - } |
110 | | - |
111 | | - private Map<String, Proxy> prepareForSaving() { |
112 | | - Map<String, Proxy> tmp = new HashMap<String, Proxy>(); |
113 | | - for (Entry<String, Proxy> e : allProxy.entrySet()) { |
114 | | - Proxy p = e.getValue(); |
115 | | - p.setFailedNum(0); |
116 | | - tmp.put(e.getKey(), p); |
117 | | - } |
118 | | - return tmp; |
119 | | - } |
120 | | - |
121 | | - private void readProxyList() { |
122 | | - try { |
123 | | - ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath))); |
124 | | - addProxy((Map<String, Proxy>) is.readObject()); |
125 | | - is.close(); |
126 | | - } catch (FileNotFoundException e) { |
127 | | - logger.info("last use proxy file not found", e); |
128 | | - } catch (IOException e) { |
129 | | - // e.printStackTrace(); |
130 | | - } catch (ClassNotFoundException e) { |
131 | | - // e.printStackTrace(); |
132 | | - } |
133 | | - } |
134 | | - |
135 | | - private void addProxy(Map<String, Proxy> httpProxyMap) { |
136 | | - isEnable = true; |
137 | | - for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) { |
138 | | - try { |
139 | | - if (allProxy.containsKey(entry.getKey())) { |
140 | | - continue; |
141 | | - } |
142 | | - if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { |
143 | | - entry.getValue().setFailedNum(0); |
144 | | - entry.getValue().setReuseTimeInterval(reuseInterval); |
145 | | - proxyQueue.add(entry.getValue()); |
146 | | - allProxy.put(entry.getKey(), entry.getValue()); |
147 | | - } |
148 | | - } catch (NumberFormatException e) { |
149 | | - logger.error("HttpHost init error:", e); |
150 | | - } |
151 | | - } |
152 | | - logger.info("proxy pool size>>>>" + allProxy.size()); |
153 | | - } |
154 | | - |
155 | | - public void addProxy(String[]... httpProxyList) { |
156 | | - isEnable = true; |
157 | | - for (String[] s : httpProxyList) { |
158 | | - try { |
159 | | - if (allProxy.containsKey(s[2])) { |
160 | | - continue; |
161 | | - } |
162 | | - HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3])); |
163 | | - if (!validateWhenInit || ProxyUtils.validateProxy(item)) { |
164 | | - Proxy p = new Proxy(item, reuseInterval, s[0], s[1]); |
165 | | - proxyQueue.add(p); |
166 | | - allProxy.put(s[2], p); |
167 | | - } |
168 | | - } catch (NumberFormatException e) { |
169 | | - logger.error("HttpHost init error:", e); |
170 | | - } catch (UnknownHostException e) { |
171 | | - logger.error("HttpHost init error:", e); |
172 | | - } |
173 | | - } |
174 | | - logger.info("proxy pool size>>>>" + allProxy.size()); |
175 | | - } |
176 | | - |
177 | | - public Proxy getProxy() { |
178 | | - Proxy proxy = null; |
179 | | - try { |
180 | | - Long time = System.currentTimeMillis(); |
181 | | - proxy = proxyQueue.take(); |
182 | | - double costTime = (System.currentTimeMillis() - time) / 1000.0; |
183 | | - if (costTime > reuseInterval) { |
184 | | - logger.info("get proxy time >>>> " + costTime); |
185 | | - } |
186 | | - Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); |
187 | | - p.setLastBorrowTime(System.currentTimeMillis()); |
188 | | - p.borrowNumIncrement(1); |
189 | | - } catch (InterruptedException e) { |
190 | | - logger.error("get proxy error", e); |
191 | | - } |
192 | | - if (proxy == null) { |
193 | | - throw new NoSuchElementException(); |
194 | | - } |
195 | | - return proxy; |
196 | | - } |
197 | | - |
198 | | - public void returnProxy(HttpHost host, int statusCode) { |
199 | | - Proxy p = allProxy.get(host.getAddress().getHostAddress()); |
200 | | - if (p == null) { |
201 | | - return; |
202 | | - } |
203 | | - switch (statusCode) { |
204 | | - case Proxy.SUCCESS: |
205 | | - p.setReuseTimeInterval(reuseInterval); |
206 | | - p.setFailedNum(0); |
207 | | - p.setFailedErrorType(new ArrayList<Integer>()); |
208 | | - p.recordResponse(); |
209 | | - p.successNumIncrement(1); |
210 | | - break; |
211 | | - case Proxy.ERROR_403: |
212 | | - // banned,try longer interval |
213 | | - p.fail(Proxy.ERROR_403); |
214 | | - p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); |
215 | | - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); |
216 | | - break; |
217 | | - case Proxy.ERROR_BANNED: |
218 | | - p.fail(Proxy.ERROR_BANNED); |
219 | | - p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); |
220 | | - logger.warn("this proxy is banned >>>> " + p.getHttpHost()); |
221 | | - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); |
222 | | - break; |
223 | | - case Proxy.ERROR_404: |
224 | | - // p.fail(Proxy.ERROR_404); |
225 | | - // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); |
226 | | - break; |
227 | | - default: |
228 | | - p.fail(statusCode); |
229 | | - break; |
230 | | - } |
231 | | - if (p.getFailedNum() > 20) { |
232 | | - p.setReuseTimeInterval(reviveTime); |
233 | | - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); |
234 | | - return; |
235 | | - } |
236 | | - if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { |
237 | | - if (!ProxyUtils.validateProxy(host)) { |
238 | | - p.setReuseTimeInterval(reviveTime); |
239 | | - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); |
240 | | - return; |
241 | | - } |
242 | | - } |
243 | | - try { |
244 | | - proxyQueue.put(p); |
245 | | - } catch (InterruptedException e) { |
246 | | - logger.warn("proxyQueue return proxy error", e); |
247 | | - } |
248 | | - } |
249 | | - |
250 | | - public String allProxyStatus() { |
251 | | - String re = "all proxy info >>>> \n"; |
252 | | - for (Entry<String, Proxy> entry : allProxy.entrySet()) { |
253 | | - re += entry.getValue().toString() + "\n"; |
254 | | - } |
255 | | - return re; |
256 | | - } |
257 | | - |
258 | | - public int getIdleNum() { |
259 | | - return proxyQueue.size(); |
260 | | - } |
261 | | - |
262 | | - public int getReuseInterval() { |
263 | | - return reuseInterval; |
264 | | - } |
265 | | - |
266 | | - public void setReuseInterval(int reuseInterval) { |
267 | | - this.reuseInterval = reuseInterval; |
268 | | - } |
269 | | - |
270 | | - public void enable(boolean isEnable) { |
271 | | - this.isEnable = isEnable; |
272 | | - } |
273 | | - |
274 | | - public boolean isEnable() { |
275 | | - return isEnable; |
276 | | - } |
277 | | - |
278 | | - public int getReviveTime() { |
279 | | - return reviveTime; |
280 | | - } |
281 | | - |
282 | | - public void setReviveTime(int reviveTime) { |
283 | | - this.reviveTime = reviveTime; |
284 | | - } |
285 | | - |
286 | | - public boolean isValidateWhenInit() { |
287 | | - return validateWhenInit; |
288 | | - } |
289 | | - |
290 | | - public void validateWhenInit(boolean validateWhenInit) { |
291 | | - this.validateWhenInit = validateWhenInit; |
292 | | - } |
293 | | - |
294 | | - public int getSaveProxyInterval() { |
295 | | - return saveProxyInterval; |
296 | | - } |
297 | | - |
298 | | - public void setSaveProxyInterval(int saveProxyInterval) { |
299 | | - this.saveProxyInterval = saveProxyInterval; |
300 | | - } |
301 | | - |
302 | | - public String getProxyFilePath() { |
303 | | - return proxyFilePath; |
304 | | - } |
305 | | - |
306 | | - public void setProxyFilePath(String proxyFilePath) { |
307 | | - this.proxyFilePath = proxyFilePath; |
308 | | - } |
309 | | - |
| 8 | +public interface ProxyPool { |
| 9 | + public void returnProxy(HttpHost host, int statusCode); |
| 10 | + public Proxy getProxy(); |
| 11 | + public boolean isEnable(); |
310 | 12 | } |
0 commit comments