|
1 | 1 | package us.codecraft.webmagic.selector; |
2 | 2 |
|
3 | 3 | import org.jsoup.Jsoup; |
| 4 | +import org.jsoup.nodes.Document; |
4 | 5 | import org.jsoup.nodes.Element; |
5 | 6 |
|
6 | 7 | import java.util.ArrayList; |
|
11 | 12 | * @since 0.3.0 |
12 | 13 | */ |
13 | 14 | public abstract class BaseElementSelector implements Selector, ElementSelector { |
| 15 | + private Document parse(String text) { |
| 16 | + if (text == null) { |
| 17 | + return null; |
| 18 | + } |
| 19 | + |
| 20 | + // Jsoup could not parse <tr></tr> or <td></td> tag directly |
| 21 | + // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag |
| 22 | + if ((text.startsWith("<tr>") && text.endsWith("</tr>")) |
| 23 | + || (text.startsWith("<td>") && text.endsWith("</td>"))) { |
| 24 | + text = "<table>" + text + "</table>"; |
| 25 | + } |
| 26 | + return Jsoup.parse(text); |
| 27 | + } |
14 | 28 |
|
15 | 29 | @Override |
16 | 30 | public String select(String text) { |
17 | 31 | if (text != null) { |
18 | | - return select(Jsoup.parse(text)); |
| 32 | + return select(parse(text)); |
19 | 33 | } |
20 | 34 | return null; |
21 | 35 | } |
22 | 36 |
|
23 | 37 | @Override |
24 | 38 | public List<String> selectList(String text) { |
25 | 39 | if (text != null) { |
26 | | - return selectList(Jsoup.parse(text)); |
| 40 | + return selectList(parse(text)); |
27 | 41 | } else { |
28 | 42 | return new ArrayList<String>(); |
29 | 43 | } |
30 | 44 | } |
31 | 45 |
|
32 | 46 | public Element selectElement(String text) { |
33 | 47 | if (text != null) { |
34 | | - return selectElement(Jsoup.parse(text)); |
| 48 | + return selectElement(parse(text)); |
35 | 49 | } |
36 | 50 | return null; |
37 | 51 | } |
38 | 52 |
|
39 | 53 | public List<Element> selectElements(String text) { |
40 | 54 | if (text != null) { |
41 | | - return selectElements(Jsoup.parse(text)); |
| 55 | + return selectElements(parse(text)); |
42 | 56 | } else { |
43 | 57 | return new ArrayList<Element>(); |
44 | 58 | } |
|
0 commit comments