Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.google.common.collect.Table;
import org.apache.http.HttpHost;

import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.utils.UrlUtils;

Expand Down Expand Up @@ -469,6 +470,11 @@ public Site setHttpProxyPool(List<String[]> httpProxyList) {
return this;
}

/**
 * Replaces this site's HTTP proxy pool with a new {@link ProxyPool} built from the given arguments.
 *
 * @param httpProxyList  proxy entries, handed unchanged to the {@link ProxyPool} constructor
 * @param isUseLastProxy flag handed unchanged to the {@link ProxyPool} constructor
 * @return this site, so that calls can be chained
 */
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
    ProxyPool pool = new ProxyPool(httpProxyList, isUseLastProxy);
    this.httpProxyPool = pool;
    return this;
}

public Site enableHttpProxyPool() {
this.httpProxyPool=new ProxyPool();
return this;
Expand All @@ -478,7 +484,7 @@ public ProxyPool getHttpProxyPool() {
return httpProxyPool;
}

public HttpHost getHttpProxyFromPool() {
/**
 * Asks the configured proxy pool for a proxy.
 *
 * @return whatever {@code httpProxyPool.getProxy()} returns
 */
public Proxy getHttpProxyFromPool() {
    Proxy pooled = httpProxyPool.getProxy();
    return pooled;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
Expand All @@ -50,17 +51,17 @@ public class HttpClientDownloader extends AbstractDownloader {

private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

private CloseableHttpClient getHttpClient(Site site) {
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
if (site == null) {
return httpClientGenerator.getClient(null);
return httpClientGenerator.getClient(null, proxy);
}
String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) {
synchronized (this) {
httpClient = httpClients.get(domain);
if (httpClient == null) {
httpClient = httpClientGenerator.getClient(site);
httpClient = httpClientGenerator.getClient(site, proxy);
httpClients.put(domain, httpClient);
}
}
Expand Down Expand Up @@ -88,8 +89,15 @@ public Page download(Request request, Task task) {
CloseableHttpResponse httpResponse = null;
int statusCode=0;
try {
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
httpResponse = getHttpClient(site).execute(httpUriRequest);
HttpHost proxyHost = null;
Proxy proxy = null; //TODO
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
proxy = site.getHttpProxyFromPool();
proxyHost = proxy.getHttpHost();
}

HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);// the proxy host is now passed in here
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);// getHttpClient now also sets up proxy authentication
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
Expand Down Expand Up @@ -129,7 +137,7 @@ protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
return acceptStatCode.contains(statusCode);
}

protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers,HttpHost proxy) {
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
Expand All @@ -141,10 +149,9 @@ protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<Strin
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
HttpHost host = site.getHttpProxyFromPool();
requestConfigBuilder.setProxy(host);
request.putExtra(Request.PROXY, host);
if (proxy !=null) {
requestConfigBuilder.setProxy(proxy);
request.putExtra(Request.PROXY, proxy);
}
requestBuilder.setConfig(requestConfigBuilder.build());
return requestBuilder.build();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
package us.codecraft.webmagic.downloader;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
Expand All @@ -15,6 +19,7 @@
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;

import java.io.IOException;
import java.util.Map;
Expand All @@ -41,12 +46,24 @@ public HttpClientGenerator setPoolSize(int poolSize) {
return this;
}

public CloseableHttpClient getClient(Site site) {
return generateClient(site);
/**
 * Produces an HTTP client for the given site and proxy by delegating to {@code generateClient}.
 *
 * @param site  site forwarded to {@code generateClient}
 * @param proxy proxy forwarded to {@code generateClient}
 * @return the client produced by {@code generateClient}
 */
public CloseableHttpClient getClient(Site site, Proxy proxy) {
    CloseableHttpClient client = generateClient(site, proxy);
    return client;
}

private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(connectionManager);
private CloseableHttpClient generateClient(Site site, Proxy proxy) {
CredentialsProvider credsProvider = null;
HttpClientBuilder httpClientBuilder = HttpClients.custom();

if(proxy!=null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword()))
{
credsProvider= new BasicCredentialsProvider();
credsProvider.setCredentials(
new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()),
new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword()));
httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
}

httpClientBuilder.setConnectionManager(connectionManager);
if (site != null && site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
Expand All @@ -61,7 +78,6 @@ public void process(
if (!request.containsHeader("Accept-Encoding")) {
request.addHeader("Accept-Encoding", "gzip");
}

}
});
}
Expand Down
26 changes: 22 additions & 4 deletions webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package us.codecraft.webmagic.proxy;

import org.apache.http.HttpHost;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;

import org.apache.http.HttpHost;

/**
* >>>> Proxy lifecycle

Expand Down Expand Up @@ -64,6 +64,9 @@ public class Proxy implements Delayed, Serializable {
public static final int SUCCESS = 200;

private final HttpHost httpHost;
private String user;
private String password;


private int reuseTimeInterval = 1500;// ms
private Long canReuseTime = 0L;
Expand All @@ -76,13 +79,17 @@ public class Proxy implements Delayed, Serializable {

private List<Integer> failedErrorType = new ArrayList<Integer>();

Proxy(HttpHost httpHost) {
Proxy(HttpHost httpHost, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
}

Proxy(HttpHost httpHost, int reuseInterval) {
Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
}

Expand Down Expand Up @@ -170,6 +177,17 @@ public String toString() {
return re;

}

/**
 * @return the value of the {@code user} field (the proxy user name)
 */
public String getUser() {
    return user;
}
/**
 * @return the value of the {@code password} field (the proxy password)
 */
public String getPassword() {
    return password;
}

public void borrowNumIncrement(int increment) {
this.borrowNum += increment;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,14 @@ public void addProxy(String[]... httpProxyList) {
isEnable = true;
for (String[] s : httpProxyList) {
try {
if (allProxy.containsKey(s[0])) {
if (allProxy.containsKey(s[2])) {
continue;
}
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3]));
if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
Proxy p = new Proxy(item, reuseInterval);
Proxy p = new Proxy(item, reuseInterval, s[0], s[1]);
proxyQueue.add(p);
allProxy.put(s[0], p);
allProxy.put(s[2], p);
}
} catch (NumberFormatException e) {
logger.error("HttpHost init error:", e);
Expand All @@ -174,7 +174,7 @@ public void addProxy(String[]... httpProxyList) {
logger.info("proxy pool size>>>>" + allProxy.size());
}

public HttpHost getProxy() {
public Proxy getProxy() {
Proxy proxy = null;
try {
Long time = System.currentTimeMillis();
Expand All @@ -192,7 +192,7 @@ public HttpHost getProxy() {
if (proxy == null) {
throw new NoSuchElementException();
}
return proxy.getHttpHost();
return proxy;
}

public void returnProxy(HttpHost host, int statusCode) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,12 @@ public void run() {
private String getCharsetByUrl(String url) {
HttpClientDownloader downloader = new HttpClientDownloader();
Site site = Site.me();
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null);
// encoding in http header Content-Type
Request requestGBK = new Request(url);
CloseableHttpResponse httpResponse = null;
try {
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null));
} catch (IOException e) {
e.printStackTrace();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ public class ProxyTest {
public static void before() {
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
// "0.0.0.4:0" };
String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" };
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
for (String line : source) {
httpProxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] });
}
}

Expand All @@ -37,7 +37,8 @@ public void testProxy() {
for (int i = 0; i < 2; i++) {
List<Fetch> fetchList = new ArrayList<Fetch>();
while (proxyPool.getIdleNum() != 0) {
HttpHost httphost = proxyPool.getProxy();
Proxy proxy = proxyPool.getProxy();
HttpHost httphost = proxy.getHttpHost();
// httphostList.add(httphost);
System.out.println(httphost.getHostName() + ":" + httphost.getPort());
Fetch tmp = new Fetch(httphost);
Expand Down Expand Up @@ -69,4 +70,5 @@ public void run() {
}
}
}

}