Skip to content

Commit a266df4

Browse files
committed
Add Site.defaultCharset. closes #1101.
1 parent 80424b0 commit a266df4

File tree

3 files changed

+48
-4
lines changed

3 files changed

+48
-4
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/Site.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ public class Site {
2828

2929
private String charset;
3030

31+
private String defaultCharset;
32+
3133
private int sleepTime = 5000;
3234

3335
private int retryTimes = 0;
@@ -168,6 +170,30 @@ public String getCharset() {
168170
return charset;
169171
}
170172

173+
/**
174+
* Set default charset of page.
175+
*
176+
* When charset detection fails, this default charset is used.
177+
*
178+
* @param defaultCharset the default charset
179+
* @return this
180+
* @since 0.9.0
181+
*/
182+
public Site setDefaultCharset(String defaultCharset) {
183+
this.defaultCharset = defaultCharset;
184+
return this;
185+
}
186+
187+
/**
188+
* The default charset used if charset detection fails.
189+
*
190+
* @return the default charset
191+
* @since 0.9.0
192+
*/
193+
public String getDefaultCharset() {
194+
return defaultCharset;
195+
}
196+
171197
public int getTimeOut() {
172198
return timeOut;
173199
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import java.nio.charset.Charset;
55
import java.util.HashMap;
66
import java.util.Map;
7+
import java.util.Optional;
78

89
import org.apache.commons.io.IOUtils;
910
import org.apache.http.HttpResponse;
@@ -116,7 +117,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
116117
page.setBytes(bytes);
117118
if (!request.isBinaryContent()) {
118119
if (charset == null) {
119-
charset = getHtmlCharset(contentType, bytes);
120+
charset = getHtmlCharset(contentType, bytes, task);
120121
}
121122
page.setCharset(charset);
122123
page.setRawText(new String(bytes, charset));
@@ -131,11 +132,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
131132
return page;
132133
}
133134

134-
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
135+
private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
135136
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
136137
if (charset == null) {
137-
charset = Charset.defaultCharset().name();
138-
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
138+
charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
139+
logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
139140
}
140141
return charset;
141142
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package us.codecraft.webmagic;
2+
3+
import static org.junit.Assert.assertEquals;
4+
5+
import java.nio.charset.StandardCharsets;
6+
7+
import org.junit.Test;
8+
9+
public class SiteTest {
10+
11+
@Test
12+
public void test() {
13+
Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
14+
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
15+
}
16+
17+
}

0 commit comments

Comments
 (0)