Skip to content

Commit f47038d

Browse files
authored
Merge pull request #1107 from hooyantsing/develop
修复 HtmlCleaner 无法正常解析 tr 和 td 标签的问题
2 parents a266df4 + 08f4a40 commit f47038d

File tree

4 files changed

+82
-41
lines changed

4 files changed

+82
-41
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import org.jsoup.Jsoup;
44
import org.jsoup.nodes.Document;
55
import org.jsoup.nodes.Element;
6+
import us.codecraft.webmagic.utils.BaseSelectorUtils;
67

78
import java.util.ArrayList;
89
import java.util.List;
@@ -13,16 +14,9 @@
1314
*/
1415
public abstract class BaseElementSelector implements Selector, ElementSelector {
1516
private Document parse(String text) {
16-
if (text == null) {
17-
return null;
18-
}
19-
2017
// Jsoup could not parse <tr></tr> or <td></td> tag directly
2118
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
22-
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
23-
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
24-
text = "<table>" + text + "</table>";
25-
}
19+
text = BaseSelectorUtils.preParse(text);
2620
return Jsoup.parse(text);
2721
}
2822

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package us.codecraft.webmagic.utils;
2+
3+
/**
 * Helpers for preparing HTML text before it is handed to an HTML parser.
 *
 * @author hooy
 */
public class BaseSelectorUtils {

    /**
     * Wraps a bare table-row or table-cell fragment in a {@code <table>} element.
     * <p>
     * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly, see
     * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
     * <p>
     * A fragment is wrapped only when it starts with {@code <tr>}/{@code <tr } and ends with
     * {@code </tr>}, or starts with {@code <td>}/{@code <td } and ends with {@code </td>}.
     * The match is exact: no trimming and no case folding is applied.
     *
     * @param text the html string; may be {@code null}
     * @return the wrapped fragment when it matches, otherwise {@code text} unchanged
     *         ({@code null} is returned as {@code null})
     */
    public static String preParse(String text) {
        // Callers no longer null-check before delegating here, so tolerate null
        // instead of failing with a NullPointerException inside the helper.
        if (text == null) {
            return null;
        }
        if (isBareElement(text, "tr") || isBareElement(text, "td")) {
            return "<table>" + text + "</table>";
        }
        return text;
    }

    /**
     * Tells whether {@code text} opens with the given tag (with or without attributes)
     * and closes with the matching end tag.
     */
    private static boolean isBareElement(String text, String tag) {
        boolean opensWithTag = text.startsWith("<" + tag + ">") || text.startsWith("<" + tag + " ");
        return opensWithTag && text.endsWith("</" + tag + ">");
    }

}

webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import java.util.concurrent.ConcurrentHashMap;
99

1010
import javax.xml.namespace.NamespaceContext;
11+
import javax.xml.parsers.ParserConfigurationException;
1112
import javax.xml.transform.OutputKeys;
1213
import javax.xml.transform.Transformer;
1314
import javax.xml.transform.TransformerFactory;
@@ -29,13 +30,14 @@
2930

3031
import net.sf.saxon.lib.NamespaceConstant;
3132
import net.sf.saxon.xpath.XPathEvaluator;
33+
import us.codecraft.webmagic.utils.BaseSelectorUtils;
3234

3335
/**
3436
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
3537
*
3638
* @author code4crafter@gmail.com <br>
37-
* Date: 13-4-21
38-
* Time: 上午9:39
39+
* Date: 13-4-21
40+
* Time: 上午9:39
3941
*/
4042
public class Xpath2Selector implements Selector {
4143

@@ -111,14 +113,11 @@ private void init() throws XPathExpressionException {
111113
@Override
112114
public String select(String text) {
113115
try {
114-
HtmlCleaner htmlCleaner = new HtmlCleaner();
115-
TagNode tagNode = htmlCleaner.clean(text);
116-
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
117116
Object result;
118117
try {
119-
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
118+
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
120119
} catch (XPathExpressionException e) {
121-
result = xPathExpression.evaluate(document, XPathConstants.STRING);
120+
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
122121
}
123122
if (result instanceof NodeList) {
124123
NodeList nodeList = (NodeList) result;
@@ -147,14 +146,11 @@ public String select(String text) {
147146
public List<String> selectList(String text) {
148147
List<String> results = new ArrayList<String>();
149148
try {
150-
HtmlCleaner htmlCleaner = new HtmlCleaner();
151-
TagNode tagNode = htmlCleaner.clean(text);
152-
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
153149
Object result;
154150
try {
155-
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
151+
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
156152
} catch (XPathExpressionException e) {
157-
result = xPathExpression.evaluate(document, XPathConstants.STRING);
153+
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
158154
}
159155
if (result instanceof NodeList) {
160156
NodeList nodeList = (NodeList) result;
@@ -179,4 +175,12 @@ public List<String> selectList(String text) {
179175
}
180176
return results;
181177
}
178+
179+
private Document parse(String text) throws ParserConfigurationException {
180+
// HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
181+
text = BaseSelectorUtils.preParse(text);
182+
HtmlCleaner htmlCleaner = new HtmlCleaner();
183+
TagNode tagNode = htmlCleaner.clean(text);
184+
return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
185+
}
182186
}

webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
import org.junit.Ignore;
1212
import org.junit.Test;
1313

14+
import us.codecraft.webmagic.Page;
15+
import us.codecraft.webmagic.Spider;
16+
import us.codecraft.webmagic.processor.PageProcessor;
1417
import us.codecraft.xsoup.XPathEvaluator;
1518
import us.codecraft.xsoup.Xsoup;
1619

@@ -1385,35 +1388,52 @@ public void testXpath2Selector() {
13851388
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
13861389
}
13871390

1391+
@Ignore("test parse <table> <tr> <td> tag")
1392+
@Test
1393+
public void htmlCleanerParseTest() {
1394+
Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run();
1395+
}
1396+
1397+
class RuoxiaPageProcessor implements PageProcessor {
1398+
@Override
1399+
public void process(Page page) {
1400+
List<String> items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(page.getRawText());
1401+
for (String item : items) {
1402+
String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(item);
1403+
System.out.println(name);
1404+
}
1405+
}
1406+
}
1407+
13881408
@Ignore("take long time")
13891409
@Test
13901410
public void performanceTest() {
13911411
Xpath2Selector xpath2Selector = new Xpath2Selector("//a");
1392-
long time =System.currentTimeMillis();
1412+
long time = System.currentTimeMillis();
13931413
for (int i = 0; i < 1000; i++) {
13941414
xpath2Selector.selectList(html);
13951415
}
1396-
System.out.println(System.currentTimeMillis()-time);
1416+
System.out.println(System.currentTimeMillis() - time);
13971417

13981418
XpathSelector xpathSelector = new XpathSelector("//a");
1399-
time =System.currentTimeMillis();
1419+
time = System.currentTimeMillis();
14001420
for (int i = 0; i < 1000; i++) {
14011421
xpathSelector.selectList(html);
14021422
}
1403-
System.out.println(System.currentTimeMillis()-time);
1423+
System.out.println(System.currentTimeMillis() - time);
14041424

1405-
time =System.currentTimeMillis();
1425+
time = System.currentTimeMillis();
14061426
for (int i = 0; i < 1000; i++) {
14071427
xpath2Selector.selectList(html);
14081428
}
14091429
System.out.println(System.currentTimeMillis() - time);
14101430

14111431
CssSelector cssSelector = new CssSelector("a");
1412-
time =System.currentTimeMillis();
1432+
time = System.currentTimeMillis();
14131433
for (int i = 0; i < 1000; i++) {
14141434
cssSelector.selectList(html);
14151435
}
1416-
System.out.println("css "+(System.currentTimeMillis()-time));
1436+
System.out.println("css " + (System.currentTimeMillis() - time));
14171437
}
14181438

14191439
@Ignore("take long time")
@@ -1425,54 +1445,54 @@ public void parserPerformanceTest() throws XPatherException {
14251445
TagNode tagNode = htmlCleaner.clean(html);
14261446
Document document = Jsoup.parse(html);
14271447

1428-
long time =System.currentTimeMillis();
1448+
long time = System.currentTimeMillis();
14291449
for (int i = 0; i < 2000; i++) {
14301450
htmlCleaner.clean(html);
14311451
}
1432-
System.out.println(System.currentTimeMillis()-time);
1452+
System.out.println(System.currentTimeMillis() - time);
14331453

1434-
time =System.currentTimeMillis();
1454+
time = System.currentTimeMillis();
14351455
for (int i = 0; i < 2000; i++) {
14361456
tagNode.evaluateXPath("//a");
14371457
}
1438-
System.out.println(System.currentTimeMillis()-time);
1458+
System.out.println(System.currentTimeMillis() - time);
14391459

14401460
System.out.println("=============");
14411461

1442-
time =System.currentTimeMillis();
1462+
time = System.currentTimeMillis();
14431463
for (int i = 0; i < 2000; i++) {
14441464
Jsoup.parse(html);
14451465
}
1446-
System.out.println(System.currentTimeMillis()-time);
1466+
System.out.println(System.currentTimeMillis() - time);
14471467

1448-
time =System.currentTimeMillis();
1468+
time = System.currentTimeMillis();
14491469
for (int i = 0; i < 2000; i++) {
14501470
document.select("a");
14511471
}
1452-
System.out.println(System.currentTimeMillis()-time);
1472+
System.out.println(System.currentTimeMillis() - time);
14531473

14541474
System.out.println("=============");
14551475

1456-
time =System.currentTimeMillis();
1476+
time = System.currentTimeMillis();
14571477
for (int i = 0; i < 2000; i++) {
14581478
htmlCleaner.clean(html);
14591479
}
1460-
System.out.println(System.currentTimeMillis()-time);
1480+
System.out.println(System.currentTimeMillis() - time);
14611481

1462-
time =System.currentTimeMillis();
1482+
time = System.currentTimeMillis();
14631483
for (int i = 0; i < 2000; i++) {
14641484
tagNode.evaluateXPath("//a");
14651485
}
1466-
System.out.println(System.currentTimeMillis()-time);
1486+
System.out.println(System.currentTimeMillis() - time);
14671487

14681488
System.out.println("=============");
14691489

14701490
XPathEvaluator compile = Xsoup.compile("//a");
1471-
time =System.currentTimeMillis();
1491+
time = System.currentTimeMillis();
14721492
for (int i = 0; i < 2000; i++) {
14731493
compile.evaluate(document);
14741494
}
1475-
System.out.println(System.currentTimeMillis()-time);
1495+
System.out.println(System.currentTimeMillis() - time);
14761496

14771497
}
14781498

0 commit comments

Comments
 (0)