Skip to content

Commit 124c52b

Browse files
committed
Downgrade htmlcleaner from 2.24 back to 2.5 so that the Xpath2Selector test cases pass.
1 parent 683db09 commit 124c52b

File tree

3 files changed

+29
-21
lines changed

3 files changed

+29
-21
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@
171171
<dependency>
172172
<groupId>net.sourceforge.htmlcleaner</groupId>
173173
<artifactId>htmlcleaner</artifactId>
174-
<version>2.24</version>
174+
<version>2.5</version>
175175
</dependency>
176176
<dependency>
177177
<groupId>com.github.detro</groupId>

webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,11 @@
11
package us.codecraft.webmagic.selector;
22

3-
import net.sf.saxon.lib.NamespaceConstant;
4-
import net.sf.saxon.xpath.XPathEvaluator;
5-
import org.htmlcleaner.CleanerProperties;
6-
import org.htmlcleaner.DomSerializer;
7-
import org.htmlcleaner.HtmlCleaner;
8-
import org.htmlcleaner.TagNode;
9-
import org.slf4j.Logger;
10-
import org.slf4j.LoggerFactory;
11-
import org.w3c.dom.Document;
12-
import org.w3c.dom.Node;
13-
import org.w3c.dom.NodeList;
3+
import java.io.StringWriter;
4+
import java.util.ArrayList;
5+
import java.util.Iterator;
6+
import java.util.List;
7+
import java.util.Map;
8+
import java.util.concurrent.ConcurrentHashMap;
149

1510
import javax.xml.namespace.NamespaceContext;
1611
import javax.xml.transform.OutputKeys;
@@ -21,12 +16,19 @@
2116
import javax.xml.xpath.XPathConstants;
2217
import javax.xml.xpath.XPathExpression;
2318
import javax.xml.xpath.XPathExpressionException;
24-
import java.io.StringWriter;
25-
import java.util.ArrayList;
26-
import java.util.Iterator;
27-
import java.util.List;
28-
import java.util.Map;
29-
import java.util.concurrent.ConcurrentHashMap;
19+
20+
import org.htmlcleaner.CleanerProperties;
21+
import org.htmlcleaner.DomSerializer;
22+
import org.htmlcleaner.HtmlCleaner;
23+
import org.htmlcleaner.TagNode;
24+
import org.slf4j.Logger;
25+
import org.slf4j.LoggerFactory;
26+
import org.w3c.dom.Document;
27+
import org.w3c.dom.Node;
28+
import org.w3c.dom.NodeList;
29+
30+
import net.sf.saxon.lib.NamespaceConstant;
31+
import net.sf.saxon.xpath.XPathEvaluator;
3032

3133
/**
3234
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>

webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package us.codecraft.webmagic.selector;
22

3+
import java.util.List;
4+
35
import org.htmlcleaner.HtmlCleaner;
46
import org.htmlcleaner.TagNode;
57
import org.htmlcleaner.XPatherException;
@@ -1368,15 +1370,19 @@ public void testOschina() {
13681370
public void testXPath2() {
13691371
String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
13701372
"<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
1371-
XpathSelector xpathSelector = new XpathSelector("//h1/text()");
1372-
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收 ", xpathSelector.select(text));
1373+
Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
1374+
Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
13731375
}
13741376

13751377
@Test
13761378
public void testXpath2Selector() {
13771379
Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
13781380
String select = xpath2Selector.select(html);
1379-
Assert.assertNotNull(select);
1381+
Assert.assertEquals("http://www.oschina.net/", select);
1382+
1383+
List<String> selectList = xpath2Selector.selectList(html);
1384+
Assert.assertEquals(113, selectList.size());
1385+
Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
13801386
}
13811387

13821388
@Ignore("take long time")

0 commit comments

Comments
 (0)