11package us .codecraft .webmagic .selector ;
22
3- import java .io .StringWriter ;
4- import java .util .ArrayList ;
5- import java .util .Iterator ;
6- import java .util .List ;
7- import java .util .Map ;
3+ import java .util .*;
84import java .util .concurrent .ConcurrentHashMap ;
95
106import javax .xml .namespace .NamespaceContext ;
117import javax .xml .parsers .ParserConfigurationException ;
12- import javax .xml .transform .OutputKeys ;
13- import javax .xml .transform .Transformer ;
14- import javax .xml .transform .TransformerFactory ;
15- import javax .xml .transform .dom .DOMSource ;
16- import javax .xml .transform .stream .StreamResult ;
178import javax .xml .xpath .XPathConstants ;
189import javax .xml .xpath .XPathExpression ;
1910import javax .xml .xpath .XPathExpressionException ;
3223import net .sf .saxon .xpath .XPathEvaluator ;
3324import us .codecraft .webmagic .utils .BaseSelectorUtils ;
3425
26+ import static us .codecraft .webmagic .selector .JaxpSelectorUtils .*;
27+
3528/**
3629 * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
3730 *
38- * @author code4crafter@gmail.com <br>
31+ * @author code4crafter@gmail.com, hooy <br>
3932 * Date: 13-4-21
4033 * Time: 上午9:39
4134 */
42- public class Xpath2Selector implements Selector {
35+ public class Xpath2Selector implements Selector , NodeSelector {
4336
44- private String xpathStr ;
37+ private final String xpathStr ;
4538
4639 private XPathExpression xPathExpression ;
4740
48- private Logger logger = LoggerFactory .getLogger (getClass ());
41+ private final Logger logger = LoggerFactory .getLogger (getClass ());
4942
5043 public Xpath2Selector (String xpathStr ) {
5144 this .xpathStr = xpathStr ;
@@ -56,25 +49,25 @@ public Xpath2Selector(String xpathStr) {
5649 }
5750 }
5851
52+ public static Xpath2Selector newInstance (String xpathStr ) {
53+ return new Xpath2Selector (xpathStr );
54+ }
55+
5956 enum XPath2NamespaceContext implements NamespaceContext {
6057
6158 INSTANCE ;
6259
63- private final Map <String , String > prefix2NamespaceMap = new ConcurrentHashMap <String , String >();
60+ private final Map <String , String > prefix2NamespaceMap = new ConcurrentHashMap <>();
6461
65- private final Map <String , List <String >> namespace2PrefixMap = new ConcurrentHashMap <String , List < String > >();
62+ private final Map <String , List <String >> namespace2PrefixMap = new ConcurrentHashMap <>();
6663
6764 private void put (String prefix , String namespaceURI ) {
6865 prefix2NamespaceMap .put (prefix , namespaceURI );
69- List <String > prefixes = namespace2PrefixMap .get (namespaceURI );
70- if (prefixes == null ) {
71- prefixes = new ArrayList <String >();
72- namespace2PrefixMap .put (namespaceURI , prefixes );
73- }
66+ List <String > prefixes = namespace2PrefixMap .computeIfAbsent (namespaceURI , k -> new ArrayList <>());
7467 prefixes .add (prefix );
7568 }
7669
77- private XPath2NamespaceContext () {
70+ XPath2NamespaceContext () {
7871 put ("fn" , NamespaceConstant .FN );
7972 put ("xslt" , NamespaceConstant .XSLT );
8073 put ("xhtml" , NamespaceConstant .XHTML );
@@ -113,29 +106,18 @@ private void init() throws XPathExpressionException {
113106 @ Override
114107 public String select (String text ) {
115108 try {
116- Object result ;
117- try {
118- result = xPathExpression .evaluate (parse (text ), XPathConstants .NODESET );
119- } catch (XPathExpressionException e ) {
120- result = xPathExpression .evaluate (parse (text ), XPathConstants .STRING );
121- }
122- if (result instanceof NodeList ) {
123- NodeList nodeList = (NodeList ) result ;
124- if (nodeList .getLength () == 0 ) {
125- return null ;
126- }
127- Node item = nodeList .item (0 );
128- if (item .getNodeType () == Node .ATTRIBUTE_NODE || item .getNodeType () == Node .TEXT_NODE ) {
129- return item .getTextContent ();
130- } else {
131- StreamResult xmlOutput = new StreamResult (new StringWriter ());
132- Transformer transformer = TransformerFactory .newInstance ().newTransformer ();
133- transformer .setOutputProperty (OutputKeys .OMIT_XML_DECLARATION , "yes" );
134- transformer .transform (new DOMSource (item ), xmlOutput );
135- return xmlOutput .getWriter ().toString ();
136- }
137- }
138- return result .toString ();
109+ Document doc = parse (text );
110+ return select (doc );
111+ } catch (Exception e ) {
112+ logger .error ("select text error! " + xpathStr , e );
113+ }
114+ return null ;
115+ }
116+
117+ @ Override
118+ public String select (Node node ) {
119+ try {
120+ return (String ) xPathExpression .evaluate (node , XPathConstants .STRING );
139121 } catch (Exception e ) {
140122 logger .error ("select text error! " + xpathStr , e );
141123 }
@@ -144,43 +126,72 @@ public String select(String text) {
144126
145127 @ Override
146128 public List <String > selectList (String text ) {
147- List <String > results = new ArrayList <String >();
148129 try {
149- Object result ;
150- try {
151- result = xPathExpression .evaluate (parse (text ), XPathConstants .NODESET );
152- } catch (XPathExpressionException e ) {
153- result = xPathExpression .evaluate (parse (text ), XPathConstants .STRING );
154- }
155- if (result instanceof NodeList ) {
156- NodeList nodeList = (NodeList ) result ;
157- Transformer transformer = TransformerFactory .newInstance ().newTransformer ();
158- StreamResult xmlOutput = new StreamResult ();
159- transformer .setOutputProperty (OutputKeys .OMIT_XML_DECLARATION , "yes" );
160- for (int i = 0 ; i < nodeList .getLength (); i ++) {
161- Node item = nodeList .item (i );
162- if (item .getNodeType () == Node .ATTRIBUTE_NODE || item .getNodeType () == Node .TEXT_NODE ) {
163- results .add (item .getTextContent ());
164- } else {
165- xmlOutput .setWriter (new StringWriter ());
166- transformer .transform (new DOMSource (item ), xmlOutput );
167- results .add (xmlOutput .getWriter ().toString ());
168- }
169- }
170- } else {
171- results .add (result .toString ());
172- }
130+ Document doc = parse (text );
131+ return selectList (doc );
132+ } catch (Exception e ) {
133+ logger .error ("select text error! " + xpathStr , e );
134+ }
135+ return null ;
136+ }
137+
138+ @ Override
139+ public List <String > selectList (Node node ) {
140+ try {
141+ NodeList result = (NodeList ) xPathExpression .evaluate (node , XPathConstants .NODESET );
142+ List <Node > nodes = NodeListToArrayList (result );
143+ return nodesToStrings (nodes );
173144 } catch (Exception e ) {
174145 logger .error ("select text error! " + xpathStr , e );
175146 }
176- return results ;
147+ return null ;
177148 }
178149
179- private Document parse (String text ) throws ParserConfigurationException {
150+ public Node selectNode (String text ) {
151+ try {
152+ Document doc = parse (text );
153+ return selectNode (doc );
154+ } catch (Exception e ) {
155+ logger .error ("select text error! " + xpathStr , e );
156+ }
157+ return null ;
158+ }
159+
160+ public Node selectNode (Node node ) {
161+ try {
162+ return (Node ) xPathExpression .evaluate (node , XPathConstants .NODE );
163+ } catch (Exception e ) {
164+ logger .error ("select text error! " + xpathStr , e );
165+ }
166+ return null ;
167+ }
168+
169+ public List <Node > selectNodes (String text ) {
170+ try {
171+ Document doc = parse (text );
172+ return selectNodes (doc );
173+ } catch (Exception e ) {
174+ logger .error ("select text error! " + xpathStr , e );
175+ }
176+ return null ;
177+ }
178+
179+ public List <Node > selectNodes (Node node ) {
180+ try {
181+ NodeList result = (NodeList ) xPathExpression .evaluate (node , XPathConstants .NODESET );
182+ return NodeListToArrayList (result );
183+ } catch (Exception e ) {
184+ logger .error ("select text error! " + xpathStr , e );
185+ }
186+ return null ;
187+ }
188+
189+ protected static Document parse (String text ) throws ParserConfigurationException {
180190 // HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
181191 text = BaseSelectorUtils .preParse (text );
182192 HtmlCleaner htmlCleaner = new HtmlCleaner ();
183193 TagNode tagNode = htmlCleaner .clean (text );
184194 return new DomSerializer (new CleanerProperties ()).createDOM (tagNode );
185195 }
196+
186197}
0 commit comments