Skip to content

Commit 69accb6

Browse files
authored
Merge pull request #1086 from vioao/enhance_jsoup_parse_table
Enhance Jsoup parsing so that tr and td tags can be parsed directly
2 parents db9c92e + e7a7fbe commit 69accb6

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package us.codecraft.webmagic.selector;
22

33
import org.jsoup.Jsoup;
4+
import org.jsoup.nodes.Document;
45
import org.jsoup.nodes.Element;
56

67
import java.util.ArrayList;
@@ -11,34 +12,47 @@
1112
* @since 0.3.0
1213
*/
1314
public abstract class BaseElementSelector implements Selector, ElementSelector {
15+
private Document parse(String text) {
16+
if (text == null) {
17+
return null;
18+
}
19+
20+
// Jsoup could not parse <tr></tr> or <td></td> tag directly
21+
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
22+
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
23+
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
24+
text = "<table>" + text + "</table>";
25+
}
26+
return Jsoup.parse(text);
27+
}
1428

1529
/**
 * Selects a single result from raw HTML text.
 *
 * <p>The text is parsed into a document and handed to the overload of
 * {@code select} that accepts the parsed result. In this diff hunk the
 * {@code Jsoup.parse} call (gutter "18-") is the removed line and the
 * {@code parse} call (gutter "32+") is its replacement.
 *
 * @param text HTML source, may be {@code null}
 * @return the selection result, or {@code null} when {@code text} is {@code null}
 */
@Override
1630
public String select(String text) {
1731
if (text != null) {
18-
return select(Jsoup.parse(text));
32+
return select(parse(text));
1933
}
2034
return null;
2135
}
2236

2337
/**
 * Selects all results from raw HTML text.
 *
 * <p>The text is parsed into a document and handed to the overload of
 * {@code selectList} that accepts the parsed result. In this diff hunk the
 * {@code Jsoup.parse} call (gutter "26-") is the removed line and the
 * {@code parse} call (gutter "40+") is its replacement.
 *
 * @param text HTML source, may be {@code null}
 * @return the selection results, or a new empty list when {@code text} is {@code null}
 */
@Override
2438
public List<String> selectList(String text) {
2539
if (text != null) {
26-
return selectList(Jsoup.parse(text));
40+
return selectList(parse(text));
2741
} else {
2842
return new ArrayList<String>();
2943
}
3044
}
3145

3246
/**
 * Selects a single element from raw HTML text.
 *
 * <p>The text is parsed into a document and handed to the overload of
 * {@code selectElement} that accepts the parsed result. In this diff hunk the
 * {@code Jsoup.parse} call (gutter "34-") is the removed line and the
 * {@code parse} call (gutter "48+") is its replacement.
 *
 * @param text HTML source, may be {@code null}
 * @return the selected element, or {@code null} when {@code text} is {@code null}
 */
public Element selectElement(String text) {
3347
if (text != null) {
34-
return selectElement(Jsoup.parse(text));
48+
return selectElement(parse(text));
3549
}
3650
return null;
3751
}
3852

3953
public List<Element> selectElements(String text) {
4054
if (text != null) {
41-
return selectElements(Jsoup.parse(text));
55+
return selectElements(parse(text));
4256
} else {
4357
return new ArrayList<Element>();
4458
}

0 commit comments

Comments
 (0)