Skip to content

Commit 12ce864

Browse files
committed
BugFix: Jsoup 和 HtmlCleaner 构建 Dom 时,若缺失 table 标签,则无法正常解析 tr 和 td 标签。
1 parent a266df4 commit 12ce864

File tree

4 files changed

+60
-20
lines changed

4 files changed

+60
-20
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import org.jsoup.Jsoup;
44
import org.jsoup.nodes.Document;
55
import org.jsoup.nodes.Element;
6+
import us.codecraft.webmagic.utils.BaseSelectorUtils;
67

78
import java.util.ArrayList;
89
import java.util.List;
@@ -13,16 +14,9 @@
1314
*/
1415
public abstract class BaseElementSelector implements Selector, ElementSelector {
1516
private Document parse(String text) {
16-
if (text == null) {
17-
return null;
18-
}
19-
2017
// Jsoup could not parse <tr></tr> or <td></td> tag directly
2118
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
22-
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
23-
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
24-
text = "<table>" + text + "</table>";
25-
}
19+
text = BaseSelectorUtils.preParse(text);
2620
return Jsoup.parse(text);
2721
}
2822

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package us.codecraft.webmagic.utils;
2+
3+
/**
 * Helpers shared by the selector implementations for preparing raw HTML
 * before it is handed to an HTML parser.
 *
 * @author hooy
 */
public class BaseSelectorUtils {

    /** Tag names that this helper wraps in a {@code <table>} when they form the whole fragment. */
    private static final String[] TABLE_CHILD_TAGS = {"tr", "td"};

    /**
     * Jsoup/HtmlCleaner cannot parse a bare "tr" or "td" fragment directly, see
     * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
     * <p>
     * When the fragment (ignoring surrounding whitespace) opens with a {@code <tr>} or
     * {@code <td>} start tag and closes with the matching end tag, it is wrapped in
     * {@code <table>...</table>}. Tag names are matched case-insensitively and the start
     * tag may carry attributes.
     *
     * @param text - the html string, may be {@code null}
     * @return the wrapped text when it is a bare row/cell fragment, otherwise the text
     *         unchanged; {@code null} when {@code text} is {@code null}
     */
    public static String preParse(String text) {
        if (text == null) {
            // Leave the null decision to the caller instead of failing here with an NPE.
            return null;
        }
        // Only the detection ignores surrounding whitespace; the original text is what gets wrapped.
        String fragment = text.trim();
        for (String tag : TABLE_CHILD_TAGS) {
            if (opensWith(fragment, tag) && closesWith(fragment, tag)) {
                return "<table>" + text + "</table>";
            }
        }
        return text;
    }

    /** Tells whether {@code html} begins with a start tag named {@code tag}, with or without attributes. */
    private static boolean opensWith(String html, String tag) {
        int afterName = tag.length() + 1;
        if (html.length() <= afterName
                || html.charAt(0) != '<'
                || !html.regionMatches(true, 1, tag, 0, tag.length())) {
            return false;
        }
        // The name must end here, otherwise "<track>" would be mistaken for "<tr ...>".
        char next = html.charAt(afterName);
        return next == '>' || Character.isWhitespace(next);
    }

    /** Tells whether {@code html} finishes with the end tag of {@code tag}, ignoring case. */
    private static boolean closesWith(String html, String tag) {
        String endTag = "</" + tag + ">";
        // regionMatches returns false for a negative offset, so short inputs are safe.
        return html.regionMatches(true, html.length() - endTag.length(), endTag, 0, endTag.length());
    }

}

webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import java.util.concurrent.ConcurrentHashMap;
99

1010
import javax.xml.namespace.NamespaceContext;
11+
import javax.xml.parsers.ParserConfigurationException;
1112
import javax.xml.transform.OutputKeys;
1213
import javax.xml.transform.Transformer;
1314
import javax.xml.transform.TransformerFactory;
@@ -29,13 +30,14 @@
2930

3031
import net.sf.saxon.lib.NamespaceConstant;
3132
import net.sf.saxon.xpath.XPathEvaluator;
33+
import us.codecraft.webmagic.utils.BaseSelectorUtils;
3234

3335
/**
3436
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
3537
*
3638
* @author code4crafter@gmail.com <br>
37-
* Date: 13-4-21
38-
* Time: 上午9:39
39+
* Date: 13-4-21
40+
* Time: 上午9:39
3941
*/
4042
public class Xpath2Selector implements Selector {
4143

@@ -111,14 +113,11 @@ private void init() throws XPathExpressionException {
111113
@Override
112114
public String select(String text) {
113115
try {
114-
HtmlCleaner htmlCleaner = new HtmlCleaner();
115-
TagNode tagNode = htmlCleaner.clean(text);
116-
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
117116
Object result;
118117
try {
119-
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
118+
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
120119
} catch (XPathExpressionException e) {
121-
result = xPathExpression.evaluate(document, XPathConstants.STRING);
120+
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
122121
}
123122
if (result instanceof NodeList) {
124123
NodeList nodeList = (NodeList) result;
@@ -147,14 +146,11 @@ public String select(String text) {
147146
public List<String> selectList(String text) {
148147
List<String> results = new ArrayList<String>();
149148
try {
150-
HtmlCleaner htmlCleaner = new HtmlCleaner();
151-
TagNode tagNode = htmlCleaner.clean(text);
152-
Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
153149
Object result;
154150
try {
155-
result = xPathExpression.evaluate(document, XPathConstants.NODESET);
151+
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET);
156152
} catch (XPathExpressionException e) {
157-
result = xPathExpression.evaluate(document, XPathConstants.STRING);
153+
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
158154
}
159155
if (result instanceof NodeList) {
160156
NodeList nodeList = (NodeList) result;
@@ -179,4 +175,12 @@ public List<String> selectList(String text) {
179175
}
180176
return results;
181177
}
178+
179+
/**
 * Builds a DOM document from an html string for XPath evaluation.
 * The text is first passed through BaseSelectorUtils.preParse, then cleaned with a
 * newly created HtmlCleaner and converted by a DomSerializer that uses a newly
 * created CleanerProperties.
 *
 * NOTE(review): select and selectList call this method again in their fallback
 * branch, so the same text is parsed a second time when the first evaluate fails.
 *
 * @param text the html string
 * @return the DOM created from the cleaned tag tree
 * @throws ParserConfigurationException presumably raised by createDOM - confirm against HtmlCleaner
 */
private Document parse(String text) throws ParserConfigurationException {
180+
// HtmlCleaner cannot parse a bare <tr></tr> or <td></td> fragment directly
181+
text = BaseSelectorUtils.preParse(text);
182+
HtmlCleaner htmlCleaner = new HtmlCleaner();
183+
TagNode tagNode = htmlCleaner.clean(text);
184+
return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
185+
}
182186
}

webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
import org.junit.Ignore;
1212
import org.junit.Test;
1313

14+
import us.codecraft.webmagic.Page;
15+
import us.codecraft.webmagic.Spider;
16+
import us.codecraft.webmagic.processor.PageProcessor;
1417
import us.codecraft.xsoup.XPathEvaluator;
1518
import us.codecraft.xsoup.Xsoup;
1619

@@ -1385,6 +1388,22 @@ public void testXpath2Selector() {
13851388
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
13861389
}
13871390

1391+
@Ignore("test parse <table> <tr> <td> tag")
1392+
@Test
1393+
public void htmlCleanerParseTest() {
1394+
Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run();
1395+
}
1396+
class RuoxiaPageProcessor implements PageProcessor {
1397+
@Override
1398+
public void process(Page page) {
1399+
List<Selectable> nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes();
1400+
for (Selectable node:nodes) {
1401+
String name = node.xpath("//td[3]/div/a[1]/text()").get();
1402+
System.out.println(name);
1403+
}
1404+
}
1405+
}
1406+
13881407
@Ignore("take long time")
13891408
@Test
13901409
public void performanceTest() {

0 commit comments

Comments
 (0)