|
11 | 11 |
|
12 | 12 | namespace Symfony\Component\DomCrawler; |
13 | 13 |
|
| 14 | +use Masterminds\HTML5; |
14 | 15 | use Symfony\Component\CssSelector\CssSelectorConverter; |
15 | 16 |
|
16 | 17 | /** |
@@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate |
55 | 56 | private $isHtml = true; |
56 | 57 |
|
57 | 58 | /** |
58 | | - * @param mixed $node A Node to use as the base for the crawling |
59 | | - * @param string $uri The current URI |
60 | | - * @param string $baseHref The base href value |
| 59 | + * @var HTML5|null |
61 | 60 | */ |
62 | | - public function __construct($node = null, string $uri = null, string $baseHref = null) |
| 61 | + private $html5Parser; |
| 62 | + |
/**
 * Creates the crawler, selects the HTML parser and adds the initial node.
 *
 * @param mixed       $node           A Node to use as the base for the crawling
 * @param string|null $uri            The current URI
 * @param string|null $baseHref       The base href value; falls back to $uri when empty
 * @param bool|null   $useHtml5Parser True to require the HTML5 parser, false to force the native DOM parser,
 *                                    null to use the HTML5 parser only when masterminds/html5 is installed
 *
 * @throws \LogicException When the HTML5 parser is explicitly requested but masterminds/html5 is not installed
 */
public function __construct($node = null, ?string $uri = null, ?string $baseHref = null, ?bool $useHtml5Parser = null)
{
    $this->uri = $uri;
    $this->baseHref = $baseHref ?: $uri;

    // false is the only value that opts out of the HTML5 parser; null means "use it if available".
    if (false !== $useHtml5Parser) {
        if (class_exists(HTML5::class)) {
            $this->html5Parser = new HTML5(['disable_html_ns' => true]);
        } elseif (true === $useHtml5Parser) {
            // Only an explicit request is an error when the library is missing.
            throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
        }
    }

    $this->add($node);
}
69 | 84 |
|
@@ -183,29 +198,7 @@ public function addContent($content, $type = null) |
183 | 198 | */ |
184 | 199 | public function addHtmlContent($content, $charset = 'UTF-8') |
185 | 200 | { |
186 | | - $internalErrors = libxml_use_internal_errors(true); |
187 | | - $disableEntities = libxml_disable_entity_loader(true); |
188 | | - |
189 | | - $dom = new \DOMDocument('1.0', $charset); |
190 | | - $dom->validateOnParse = true; |
191 | | - |
192 | | - set_error_handler(function () { throw new \Exception(); }); |
193 | | - |
194 | | - try { |
195 | | - // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML() |
196 | | - $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset); |
197 | | - } catch (\Exception $e) { |
198 | | - } |
199 | | - |
200 | | - restore_error_handler(); |
201 | | - |
202 | | - if ('' !== trim($content)) { |
203 | | - @$dom->loadHTML($content); |
204 | | - } |
205 | | - |
206 | | - libxml_use_internal_errors($internalErrors); |
207 | | - libxml_disable_entity_loader($disableEntities); |
208 | | - |
| 201 | + $dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset); |
209 | 202 | $this->addDocument($dom); |
210 | 203 |
|
211 | 204 | $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']); |
@@ -608,6 +601,15 @@ public function html(/* $default = null */) |
608 | 601 | throw new \InvalidArgumentException('The current node list is empty.'); |
609 | 602 | } |
610 | 603 |
|
| 604 | + if (null !== $this->html5Parser) { |
| 605 | + $html = ''; |
| 606 | + foreach ($this->getNode(0)->childNodes as $child) { |
| 607 | + $html .= $this->html5Parser->saveHTML($child); |
| 608 | + } |
| 609 | + |
| 610 | + return $html; |
| 611 | + } |
| 612 | + |
611 | 613 | $html = ''; |
612 | 614 | foreach ($this->getNode(0)->childNodes as $child) { |
613 | 615 | $html .= $child->ownerDocument->saveHTML($child); |
@@ -1112,6 +1114,53 @@ protected function sibling($node, $siblingDir = 'nextSibling') |
1112 | 1114 | return $nodes; |
1113 | 1115 | } |
1114 | 1116 |
|
/**
 * Parses HTML markup with the masterminds/html5 parser.
 *
 * The markup is passed through convertToHtmlEntities() first, using the given charset.
 *
 * @param string $htmlContent The HTML markup to parse
 * @param string $charset     The charset $htmlContent is encoded in
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    // HTML5::parse() accepts only the input and an options array, so the charset
    // is consumed by the entity conversion and is not forwarded to the parser.
    return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset));
}
| 1121 | + |
/**
 * Parses HTML markup with the native DOM extension (DOMDocument::loadHTML()).
 *
 * The markup is passed through convertToHtmlEntities() first. Blank content
 * yields an empty document. The libxml error and entity-loader settings are
 * restored before returning, even if parsing throws.
 *
 * @param string $htmlContent The HTML markup to parse
 * @param string $charset     The charset $htmlContent is encoded in
 */
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

    $internalErrors = libxml_use_internal_errors(true);
    // libxml >= 2.9.0 does not load external entities by default, and PHP 8.0
    // deprecates libxml_disable_entity_loader(), so only toggle it on older libxml.
    $disableEntities = \LIBXML_VERSION < 20900 ? libxml_disable_entity_loader(true) : null;

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        if ('' !== trim($htmlContent)) {
            @$dom->loadHTML($htmlContent);
        }

        return $dom;
    } finally {
        libxml_use_internal_errors($internalErrors);
        if (null !== $disableEntities) {
            libxml_disable_entity_loader($disableEntities);
        }
    }
}
| 1141 | + |
/**
 * Converts every non-ASCII character to a numeric HTML entity to ensure valid parsing.
 *
 * Conversion is best effort: if the charset is not supported by mbstring, the
 * content is first converted to UTF-8 with iconv; if that fails too, the
 * content is returned unchanged.
 *
 * @param string $htmlContent The markup to convert
 * @param string $charset     The charset $htmlContent is encoded in
 *
 * @return string The markup with characters >= U+0080 replaced by "&#N;" entities
 */
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Numeric entities are used instead of the 'HTML-ENTITIES' pseudo-encoding, which is
    // deprecated since PHP 8.2: the throwing handler below would turn that deprecation
    // into an exception and silently skip the conversion.
    $entityMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    // Turn warnings/notices from mbstring and iconv into exceptions so they can be handled below.
    set_error_handler(static function () { throw new \Exception(); });

    try {
        return mb_encode_numericentity($htmlContent, $entityMap, $charset);
    } catch (\Exception | \ValueError $e) {
        // PHP 8 reports an unknown encoding as \ValueError rather than a warning.
        try {
            $converted = iconv($charset, 'UTF-8', $htmlContent);
            if (false !== $converted) {
                $htmlContent = $converted;
                $htmlContent = mb_encode_numericentity($htmlContent, $entityMap, 'UTF-8');
            }
        } catch (\Exception | \ValueError $e) {
            // Best effort: fall through with whatever could be converted so far.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}
| 1163 | + |
1115 | 1164 | /** |
1116 | 1165 | * @throws \InvalidArgumentException |
1117 | 1166 | */ |
|
0 commit comments