Skip to content

Commit 19474e4

Browse files
committed
add SimpleProxyPool and IProxyPool
1 parent 05a1f39 commit 19474e4

File tree

4 files changed

+136
-12
lines changed

4 files changed

+136
-12
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/Site.java

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import com.google.common.collect.Table;
55
import org.apache.http.HttpHost;
66

7-
import us.codecraft.webmagic.proxy.ProxyPool;
7+
import us.codecraft.webmagic.proxy.IProxyPool;
8+
import us.codecraft.webmagic.proxy.SimpleProxyPool;
89
import us.codecraft.webmagic.utils.UrlUtils;
910

1011
import java.util.*;
@@ -51,7 +52,7 @@ public class Site {
5152

5253
private HttpHost httpProxy;
5354

54-
private ProxyPool httpProxyPool;
55+
private IProxyPool httpProxyPool;
5556

5657
private boolean useGzip = true;
5758

@@ -464,17 +465,17 @@ public String toString() {
464465
*
465466
* @return this
466467
*/
467-
public Site setHttpProxyPool(List<String[]> httpProxyList) {
468-
this.httpProxyPool=new ProxyPool(httpProxyList);
468+
public Site setHttpProxyPool(IProxyPool proxyPool) {
469+
this.httpProxyPool = proxyPool;
469470
return this;
470471
}
471472

472473
public Site enableHttpProxyPool() {
473-
this.httpProxyPool=new ProxyPool();
474+
this.httpProxyPool=new SimpleProxyPool();
474475
return this;
475476
}
476477

477-
public ProxyPool getHttpProxyPool() {
478+
public IProxyPool getHttpProxyPool() {
478479
return httpProxyPool;
479480
}
480481

@@ -486,9 +487,4 @@ public void returnHttpProxyToPool(HttpHost proxy,int statusCode) {
486487
httpProxyPool.returnProxy(proxy,statusCode);
487488
}
488489

489-
public Site setProxyReuseInterval(int reuseInterval) {
490-
this.httpProxyPool.setReuseInterval(reuseInterval);
491-
return this;
492-
}
493-
494490
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package us.codecraft.webmagic.proxy;
2+
3+
import org.apache.http.HttpHost;
4+
5+
/**
6+
* Created by edwardsbean on 15-2-28.
7+
*/
8+
public interface IProxyPool {
9+
public void returnProxy(HttpHost host, int statusCode);
10+
public HttpHost getProxy();
11+
public boolean isEnable();
12+
}

webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
* @see Proxy
2323
* @since 0.5.1
2424
*/
25-
public class ProxyPool {
25+
public class ProxyPool implements IProxyPool{
2626

2727
private Logger logger = LoggerFactory.getLogger(getClass());
2828

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
package us.codecraft.webmagic.proxy;
2+
3+
import org.apache.http.HttpHost;
4+
import org.slf4j.Logger;
5+
import org.slf4j.LoggerFactory;
6+
7+
import java.net.InetAddress;
8+
import java.net.UnknownHostException;
9+
import java.util.ArrayList;
10+
import java.util.List;
11+
import java.util.Map;
12+
import java.util.NoSuchElementException;
13+
import java.util.concurrent.BlockingQueue;
14+
import java.util.concurrent.ConcurrentHashMap;
15+
import java.util.concurrent.DelayQueue;
16+
17+
/**
18+
* Created by edwardsbean on 15-2-28.
19+
*/
20+
public class SimpleProxyPool implements IProxyPool{
21+
private Logger logger = LoggerFactory.getLogger(getClass());
22+
23+
private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>();
24+
private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>();
25+
private boolean isEnable = false;
26+
private int reuseInterval = 1500;// ms
27+
private int reviveTime = 2 * 60 * 60 * 1000;// ms
28+
29+
public SimpleProxyPool() {
30+
this(null);
31+
}
32+
33+
public SimpleProxyPool(List<String[]> httpProxyList) {
34+
if (httpProxyList != null) {
35+
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
36+
}
37+
}
38+
39+
public void addProxy(String[]... httpProxyList) {
40+
isEnable = true;
41+
for (String[] s : httpProxyList) {
42+
try {
43+
if (allProxy.containsKey(s[0])) {
44+
continue;
45+
}
46+
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
47+
Proxy p = new Proxy(item, reuseInterval);
48+
proxyQueue.add(p);
49+
allProxy.put(s[0], p);
50+
} catch (NumberFormatException e) {
51+
logger.error("HttpHost init error:", e);
52+
} catch (UnknownHostException e) {
53+
logger.error("HttpHost init error:", e);
54+
}
55+
}
56+
logger.info("proxy pool size>>>>" + allProxy.size());
57+
}
58+
59+
public void returnProxy(HttpHost host, int statusCode) {
60+
Proxy p = allProxy.get(host.getAddress().getHostAddress());
61+
if (p == null) {
62+
return;
63+
}
64+
switch (statusCode) {
65+
case Proxy.SUCCESS:
66+
p.setFailedNum(0);
67+
p.setFailedErrorType(new ArrayList<Integer>());
68+
p.recordResponse();
69+
p.successNumIncrement(1);
70+
break;
71+
case Proxy.ERROR_403:
72+
// banned,try longer interval
73+
p.fail(Proxy.ERROR_403);
74+
break;
75+
case Proxy.ERROR_BANNED:
76+
p.fail(Proxy.ERROR_BANNED);
77+
logger.warn("this proxy is banned >>>> " + p.getHttpHost());
78+
break;
79+
case Proxy.ERROR_404:
80+
// p.fail(Proxy.ERROR_404);
81+
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
82+
break;
83+
default:
84+
p.fail(statusCode);
85+
break;
86+
}
87+
if (p.getFailedNum() > 3) {
88+
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
89+
return;
90+
}
91+
try {
92+
proxyQueue.put(p);
93+
} catch (InterruptedException e) {
94+
logger.warn("proxyQueue return proxy error", e);
95+
}
96+
}
97+
98+
@Override
99+
public HttpHost getProxy() {
100+
Proxy proxy = null;
101+
try {
102+
proxy = proxyQueue.take();
103+
} catch (InterruptedException e) {
104+
logger.error("get proxy error", e);
105+
}
106+
if (proxy == null) {
107+
throw new NoSuchElementException();
108+
}
109+
return proxy.getHttpHost();
110+
}
111+
112+
@Override
113+
public boolean isEnable() {
114+
return isEnable;
115+
}
116+
}

0 commit comments

Comments
 (0)