Skip to content

Commit 25821c3

Browse files
committed
Merge branch 'EdwardsBean-proxy-strategy'
2 parents b541336 + 61c28a0 commit 25821c3

File tree

5 files changed: +322 −314 lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/Site.java

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import org.apache.http.HttpHost;
66

77
import us.codecraft.webmagic.proxy.Proxy;
8+
import us.codecraft.webmagic.proxy.SimpleProxyPool;
89
import us.codecraft.webmagic.proxy.ProxyPool;
910
import us.codecraft.webmagic.utils.UrlUtils;
1011

@@ -470,18 +471,18 @@ public String toString() {
470471
* @param httpProxyList httpProxyList
471472
* @return this
472473
*/
473-
public Site setHttpProxyPool(List<String[]> httpProxyList) {
474-
this.httpProxyPool=new ProxyPool(httpProxyList);
474+
public Site setHttpProxyPool(ProxyPool proxyPool) {
475+
this.httpProxyPool = proxyPool;
475476
return this;
476477
}
477478

478479
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
479-
this.httpProxyPool=new ProxyPool(httpProxyList, isUseLastProxy);
480+
this.httpProxyPool=new SimpleProxyPool(httpProxyList, isUseLastProxy);
480481
return this;
481482
}
482483

483484
public Site enableHttpProxyPool() {
484-
this.httpProxyPool=new ProxyPool();
485+
this.httpProxyPool=new SimpleProxyPool();
485486
return this;
486487
}
487488

@@ -497,9 +498,4 @@ public void returnHttpProxyToPool(HttpHost proxy,int statusCode) {
497498
httpProxyPool.returnProxy(proxy,statusCode);
498499
}
499500

500-
public Site setProxyReuseInterval(int reuseInterval) {
501-
this.httpProxyPool.setReuseInterval(reuseInterval);
502-
return this;
503-
}
504-
505501
}

webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@
5151
*
5252
* @author yxssfxwzy@sina.com <br>
5353
* @since 0.5.1
54-
* @see ProxyPool
54+
* @see SimpleProxyPool
5555
*/
5656

5757
public class Proxy implements Delayed, Serializable {
webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java

Lines changed: 5 additions & 303 deletions
Original file line number | Diff line number | Diff line change
@@ -1,310 +1,12 @@
11
package us.codecraft.webmagic.proxy;
22

33
import org.apache.http.HttpHost;
4-
import org.slf4j.Logger;
5-
import org.slf4j.LoggerFactory;
6-
import us.codecraft.webmagic.utils.FilePersistentBase;
7-
import us.codecraft.webmagic.utils.ProxyUtils;
8-
9-
import java.io.*;
10-
import java.net.InetAddress;
11-
import java.net.UnknownHostException;
12-
import java.util.*;
13-
import java.util.Map.Entry;
14-
import java.util.concurrent.BlockingQueue;
15-
import java.util.concurrent.ConcurrentHashMap;
16-
import java.util.concurrent.DelayQueue;
174

185
/**
19-
* Pooled Proxy Object
20-
*
21-
* @author yxssfxwzy@sina.com <br>
22-
* @see Proxy
23-
* @since 0.5.1
6+
* Created by edwardsbean on 15-2-28.
247
*/
25-
public class ProxyPool {
26-
27-
private Logger logger = LoggerFactory.getLogger(getClass());
28-
29-
private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>();
30-
private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>();
31-
32-
private int reuseInterval = 1500;// ms
33-
private int reviveTime = 2 * 60 * 60 * 1000;// ms
34-
private int saveProxyInterval = 10 * 60 * 1000;// ms
35-
36-
private boolean isEnable = false;
37-
private boolean validateWhenInit = false;
38-
// private boolean isUseLastProxy = true;
39-
private String proxyFilePath = "/data/webmagic/lastUse.proxy";
40-
41-
private FilePersistentBase fBase = new FilePersistentBase();
42-
43-
private Timer timer = new Timer(true);
44-
private TimerTask saveProxyTask = new TimerTask() {
45-
46-
@Override
47-
public void run() {
48-
saveProxyList();
49-
logger.info(allProxyStatus());
50-
}
51-
};
52-
53-
public ProxyPool() {
54-
this(null, true);
55-
}
56-
57-
public ProxyPool(List<String[]> httpProxyList) {
58-
this(httpProxyList, true);
59-
}
60-
61-
public ProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
62-
if (httpProxyList != null) {
63-
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
64-
}
65-
if (isUseLastProxy) {
66-
if (!new File(proxyFilePath).exists()) {
67-
setFilePath();
68-
}
69-
readProxyList();
70-
timer.schedule(saveProxyTask, 0, saveProxyInterval);
71-
}
72-
}
73-
74-
private void setFilePath() {
75-
String tmpDir = System.getProperty("java.io.tmpdir");
76-
String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy";
77-
if (tmpDir != null && new File(tmpDir).isDirectory()) {
78-
fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic");
79-
File f = fBase.getFile(path);
80-
if (!f.exists()) {
81-
try {
82-
f.createNewFile();
83-
84-
} catch (IOException e) {
85-
logger.error("proxy file create error", e);
86-
}
87-
}
88-
89-
} else {
90-
logger.error("java tmp dir not exists");
91-
}
92-
this.proxyFilePath = path;
93-
}
94-
95-
private void saveProxyList() {
96-
if (allProxy.size() == 0) {
97-
return;
98-
}
99-
try {
100-
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
101-
os.writeObject(prepareForSaving());
102-
os.close();
103-
logger.info("save proxy");
104-
} catch (FileNotFoundException e) {
105-
logger.error("proxy file not found", e);
106-
} catch (IOException e) {
107-
e.printStackTrace();
108-
}
109-
}
110-
111-
private Map<String, Proxy> prepareForSaving() {
112-
Map<String, Proxy> tmp = new HashMap<String, Proxy>();
113-
for (Entry<String, Proxy> e : allProxy.entrySet()) {
114-
Proxy p = e.getValue();
115-
p.setFailedNum(0);
116-
tmp.put(e.getKey(), p);
117-
}
118-
return tmp;
119-
}
120-
121-
private void readProxyList() {
122-
try {
123-
ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
124-
addProxy((Map<String, Proxy>) is.readObject());
125-
is.close();
126-
} catch (FileNotFoundException e) {
127-
logger.info("last use proxy file not found", e);
128-
} catch (IOException e) {
129-
// e.printStackTrace();
130-
} catch (ClassNotFoundException e) {
131-
// e.printStackTrace();
132-
}
133-
}
134-
135-
private void addProxy(Map<String, Proxy> httpProxyMap) {
136-
isEnable = true;
137-
for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
138-
try {
139-
if (allProxy.containsKey(entry.getKey())) {
140-
continue;
141-
}
142-
if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
143-
entry.getValue().setFailedNum(0);
144-
entry.getValue().setReuseTimeInterval(reuseInterval);
145-
proxyQueue.add(entry.getValue());
146-
allProxy.put(entry.getKey(), entry.getValue());
147-
}
148-
} catch (NumberFormatException e) {
149-
logger.error("HttpHost init error:", e);
150-
}
151-
}
152-
logger.info("proxy pool size>>>>" + allProxy.size());
153-
}
154-
155-
public void addProxy(String[]... httpProxyList) {
156-
isEnable = true;
157-
for (String[] s : httpProxyList) {
158-
try {
159-
if (allProxy.containsKey(s[2])) {
160-
continue;
161-
}
162-
HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3]));
163-
if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
164-
Proxy p = new Proxy(item, reuseInterval, s[0], s[1]);
165-
proxyQueue.add(p);
166-
allProxy.put(s[2], p);
167-
}
168-
} catch (NumberFormatException e) {
169-
logger.error("HttpHost init error:", e);
170-
} catch (UnknownHostException e) {
171-
logger.error("HttpHost init error:", e);
172-
}
173-
}
174-
logger.info("proxy pool size>>>>" + allProxy.size());
175-
}
176-
177-
public Proxy getProxy() {
178-
Proxy proxy = null;
179-
try {
180-
Long time = System.currentTimeMillis();
181-
proxy = proxyQueue.take();
182-
double costTime = (System.currentTimeMillis() - time) / 1000.0;
183-
if (costTime > reuseInterval) {
184-
logger.info("get proxy time >>>> " + costTime);
185-
}
186-
Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress());
187-
p.setLastBorrowTime(System.currentTimeMillis());
188-
p.borrowNumIncrement(1);
189-
} catch (InterruptedException e) {
190-
logger.error("get proxy error", e);
191-
}
192-
if (proxy == null) {
193-
throw new NoSuchElementException();
194-
}
195-
return proxy;
196-
}
197-
198-
public void returnProxy(HttpHost host, int statusCode) {
199-
Proxy p = allProxy.get(host.getAddress().getHostAddress());
200-
if (p == null) {
201-
return;
202-
}
203-
switch (statusCode) {
204-
case Proxy.SUCCESS:
205-
p.setReuseTimeInterval(reuseInterval);
206-
p.setFailedNum(0);
207-
p.setFailedErrorType(new ArrayList<Integer>());
208-
p.recordResponse();
209-
p.successNumIncrement(1);
210-
break;
211-
case Proxy.ERROR_403:
212-
// banned,try longer interval
213-
p.fail(Proxy.ERROR_403);
214-
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
215-
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
216-
break;
217-
case Proxy.ERROR_BANNED:
218-
p.fail(Proxy.ERROR_BANNED);
219-
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
220-
logger.warn("this proxy is banned >>>> " + p.getHttpHost());
221-
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
222-
break;
223-
case Proxy.ERROR_404:
224-
// p.fail(Proxy.ERROR_404);
225-
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
226-
break;
227-
default:
228-
p.fail(statusCode);
229-
break;
230-
}
231-
if (p.getFailedNum() > 20) {
232-
p.setReuseTimeInterval(reviveTime);
233-
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
234-
return;
235-
}
236-
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
237-
if (!ProxyUtils.validateProxy(host)) {
238-
p.setReuseTimeInterval(reviveTime);
239-
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
240-
return;
241-
}
242-
}
243-
try {
244-
proxyQueue.put(p);
245-
} catch (InterruptedException e) {
246-
logger.warn("proxyQueue return proxy error", e);
247-
}
248-
}
249-
250-
public String allProxyStatus() {
251-
String re = "all proxy info >>>> \n";
252-
for (Entry<String, Proxy> entry : allProxy.entrySet()) {
253-
re += entry.getValue().toString() + "\n";
254-
}
255-
return re;
256-
}
257-
258-
public int getIdleNum() {
259-
return proxyQueue.size();
260-
}
261-
262-
public int getReuseInterval() {
263-
return reuseInterval;
264-
}
265-
266-
public void setReuseInterval(int reuseInterval) {
267-
this.reuseInterval = reuseInterval;
268-
}
269-
270-
public void enable(boolean isEnable) {
271-
this.isEnable = isEnable;
272-
}
273-
274-
public boolean isEnable() {
275-
return isEnable;
276-
}
277-
278-
public int getReviveTime() {
279-
return reviveTime;
280-
}
281-
282-
public void setReviveTime(int reviveTime) {
283-
this.reviveTime = reviveTime;
284-
}
285-
286-
public boolean isValidateWhenInit() {
287-
return validateWhenInit;
288-
}
289-
290-
public void validateWhenInit(boolean validateWhenInit) {
291-
this.validateWhenInit = validateWhenInit;
292-
}
293-
294-
public int getSaveProxyInterval() {
295-
return saveProxyInterval;
296-
}
297-
298-
public void setSaveProxyInterval(int saveProxyInterval) {
299-
this.saveProxyInterval = saveProxyInterval;
300-
}
301-
302-
public String getProxyFilePath() {
303-
return proxyFilePath;
304-
}
305-
306-
public void setProxyFilePath(String proxyFilePath) {
307-
this.proxyFilePath = proxyFilePath;
308-
}
309-
8+
public interface ProxyPool {
9+
public void returnProxy(HttpHost host, int statusCode);
10+
public Proxy getProxy();
11+
public boolean isEnable();
31012
}

0 commit comments

Comments
 (0)