From 5885dbbe78f97c4bfd6ab659fa1608a5c16201c9 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 00:49:46 +0100 Subject: [PATCH 1/4] Remove pointless `stdClass` `DOMNode::$childNodes` always contained `DOMNodeList`. --- src/Readability.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 6fb88ad..432ef01 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1231,11 +1231,6 @@ protected function grabArticle(?\DOMElement $page = null) $parentOfTopCandidate = $topCandidate->parentNode; $siblingNodes = $parentOfTopCandidate->childNodes; - if (0 === $siblingNodes->length) { - $siblingNodes = new \stdClass(); - $siblingNodes->length = 0; - } - for ($s = 0, $sl = $siblingNodes->length; $s < $sl; ++$s) { $siblingNode = $siblingNodes->item($s); $siblingNodeName = $siblingNode->nodeName; From 8b1ef0740165a1520147a6926ff0dbe6a05b43da Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 03:17:21 +0100 Subject: [PATCH 2/4] Extract `for`-iterated items into variables This simplifies the code a bit and will make it slightly easier in case we decide to switch to `foreach` iteration. --- src/Readability.php | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 432ef01..9954c5a 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -586,7 +586,7 @@ public function clean(\DOMElement $e, string $tag): void } // Then check the elements inside this element for the same. - if (preg_match($this->regexps['media'], $targetList->item($y)->getInnerHTML())) { + if (preg_match($this->regexps['media'], $currentItem->getInnerHTML())) { continue; } } @@ -719,8 +719,9 @@ public function cleanHeaders(\DOMElement $e): void $headers = $e->getElementsByTagName('h' . $headerIndex); for ($i = $headers->length - 1; $i >= 0; --$i) { - if ($this->getWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { - $headers->item($i)->parentNode->removeChild($headers->item($i)); + $header = $headers->item($i); + if ($this->getWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) { + $header->parentNode->removeChild($header); } } } @@ -812,12 +813,14 @@ protected function prepDocument(): void // Remove all style tags in head. $styleTags = $this->dom->getElementsByTagName('style'); for ($i = $styleTags->length - 1; $i >= 0; --$i) { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + $styleTag = $styleTags->item($i); + $styleTag->parentNode->removeChild($styleTag); } $linkTags = $this->dom->getElementsByTagName('link'); for ($i = $linkTags->length - 1; $i >= 0; --$i) { - $linkTags->item($i)->parentNode->removeChild($linkTags->item($i)); + $linkTag = $linkTags->item($i); + $linkTag->parentNode->removeChild($linkTag); } } From d454c3a4621d667a87cdd4e8adf0aef26f5f466d Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 03:23:04 +0100 Subject: [PATCH 3/4] Remove dead iteration code This was forgotten in b580cf216d9001f82c866bb9a6c8bcad1cc862d8. --- src/Readability.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 9954c5a..9c08a25 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1079,11 +1079,6 @@ protected function grabArticle(?\DOMElement $page = null) } } - $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement); - - for ($c = $candidates->length - 1; $c >= 0; --$c) { - $node = $candidates->item($c); - } unset($candidates); } From 9a9373de4bcb176226c5668e531d84e3b8772794 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 17 Mar 2024 00:29:39 +0100 Subject: [PATCH 4/4] Iterate node lists with `foreach` `DOMNodeList` implements `Traversable`. There are some `for` loops left but we cannot simply replace those: PHP follows the DOM specification, which requires that `NodeList` objects in the DOM are live. As a result, any operation that removes a node list member node from its parent (such as `removeChild`, `replaceChild` or `appendChild`) will cause the next node in the iterator to be skipped. We could work around that by converting those node lists to static arrays using `iterator_to_array` but not sure if it is worth it. --- src/Readability.php | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 9c08a25..290cc28 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -302,8 +302,7 @@ public function addFootnotes(\DOMElement $articleContent): void $articleLinks = $articleContent->getElementsByTagName('a'); $linkCount = 0; - for ($i = 0; $i < $articleLinks->length; ++$i) { - $articleLink = $articleLinks->item($i); + foreach ($articleLinks as $articleLink) { $footnoteLink = $articleLink->cloneNode(true); $refLink = $this->dom->createElement('a'); $footnote = $this->dom->createElement('li'); @@ -383,8 +382,8 @@ public function prepArticle(\DOMNode $articleContent): void // Remove service data-candidate attribute. $elems = $xpath->query('.//*[@data-candidate]', $articleContent); - for ($i = $elems->length - 1; $i >= 0; --$i) { - $elems->item($i)->removeAttribute('data-candidate'); + foreach ($elems as $elem) { + $elem->removeAttribute('data-candidate'); } // Clean out junk from the article content. @@ -520,11 +519,12 @@ public function getLinkDensity(\DOMElement $e, bool $excludeExternal = false): f $textLength = mb_strlen($this->getInnerText($e, true, true)); $linkLength = 0; - for ($dRe = $this->domainRegExp, $i = 0, $il = $links->length; $i < $il; ++$i) { - if ($excludeExternal && $dRe && !preg_match($dRe, $links->item($i)->getAttribute('href'))) { + $dRe = $this->domainRegExp; + foreach ($links as $link) { + if ($excludeExternal && $dRe && !preg_match($dRe, $link->getAttribute('href'))) { continue; } - $linkLength += mb_strlen($this->getInnerText($links->item($i))); + $linkLength += mb_strlen($this->getInnerText($link)); } if ($textLength > 0 && $linkLength > 0) { @@ -640,15 +640,15 @@ public function cleanConditionally(\DOMElement $e, string $tag): void $embedCount = 0; $embeds = $node->getElementsByTagName('embed'); - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + foreach ($embeds as $embed) { + if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) { ++$embedCount; } } $embeds = $node->getElementsByTagName('iframe'); - for ($ei = 0, $il = $embeds->length; $ei < $il; ++$ei) { - if (preg_match($this->regexps['media'], $embeds->item($ei)->getAttribute('src'))) { + foreach ($embeds as $embed) { + if (preg_match($this->regexps['media'], $embed->getAttribute('src'))) { ++$embedCount; } } @@ -1018,15 +1018,15 @@ protected function grabArticle(?\DOMElement $page = null) * A score is determined by things like number of commas, class names, etc. * Maybe eventually link density. */ - for ($pt = 0, $scored = \count($nodesToScore); $pt < $scored; ++$pt) { - $ancestors = $this->getAncestors($nodesToScore[$pt], 5); + foreach ($nodesToScore as $nodeToScore) { + $ancestors = $this->getAncestors($nodeToScore, 5); // No parent node? Move on... if (0 === \count($ancestors)) { continue; } - $innerText = $this->getInnerText($nodesToScore[$pt]); + $innerText = $this->getInnerText($nodeToScore); // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {