From 7d404122d4a9beee5160a7053a2b5f1ff67fc7e7 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+ndossche@users.noreply.github.com> Date: Thu, 6 Nov 2025 22:26:58 +0100 Subject: [PATCH] Fix GH-20395: \Dom\ParentNode::querySelector and \Dom\ParentNode::querySelectorAll requires elements in $selectors to be lowercase The selector needs to be compared in a lowercase manner. This also almost completely obsoletes the interned string optimization, so get rid of that for simplicity sake. While there is still theoretical benefit, it is only 1-2% in my random tests, not worth it anymore. --- .../lexbor/selectors-adapted/selectors.c | 99 ++++++++++--------- .../lexbor/selectors-adapted/selectors.h | 1 - .../tests/modern/css_selectors/gh20395.phpt | 33 +++++++ 3 files changed, 85 insertions(+), 48 deletions(-) create mode 100644 ext/dom/tests/modern/css_selectors/gh20395.phpt diff --git a/ext/dom/lexbor/lexbor/selectors-adapted/selectors.c b/ext/dom/lexbor/lexbor/selectors-adapted/selectors.c index 4e094f632ef79..2058b2ecf9792 100644 --- a/ext/dom/lexbor/lexbor/selectors-adapted/selectors.c +++ b/ext/dom/lexbor/lexbor/selectors-adapted/selectors.c @@ -35,9 +35,24 @@ static void dom_lxb_str_wrapper_release(dom_lxb_str_wrapper *wrapper) } } -static zend_always_inline bool lxb_selectors_adapted_cmp_local_name_literal(const xmlNode *node, const char *name) +static bool lxb_selectors_str_cmp_loright(const char *lhs, const char *rhs) { - return strcmp((const char *) node->name, name) == 0; + while (true) { + if (*rhs != zend_tolower_ascii(*lhs)) { + return false; + } + if (!*lhs) { + return true; + } + ++rhs; + ++lhs; + } +} + +/* `name` is lowercase */ +static zend_always_inline bool lxb_selectors_cmp_html_name_lit(const xmlNode *node, const char *name) +{ + return lxb_selectors_str_cmp_loright((const char *) node->name, name); } static zend_always_inline bool lxb_selectors_adapted_cmp_ns(const xmlNode *a, const xmlNode *b) @@ -46,16 +61,18 @@ static zend_always_inline bool lxb_selectors_adapted_cmp_ns(const xmlNode *a, co return a->ns == b->ns || (a->ns != NULL && b->ns != NULL && xmlStrEqual(a->ns->href, b->ns->href)); } +/* From https://html.spec.whatwg.org/#case-sensitivity-of-selectors */ static zend_always_inline bool lxb_selectors_adapted_cmp_local_name_id(const xmlNode *node, const lxb_selectors_adapted_id *id) { - uintptr_t ptr = (uintptr_t) node->name; - if (id->interned && (ptr & (ZEND_MM_ALIGNMENT - 1)) != 0) { - /* It cannot be a heap-allocated string because the pointer is not properly aligned for a heap allocation. - * Therefore, it must be interned into the dictionary pool. */ - return node->name == id->name; + ZEND_ASSERT(node->doc != NULL); + if (php_dom_ns_is_html_and_document_is_html(node)) { + /* From https://html.spec.whatwg.org/#case-sensitivity-of-selectors: + * The element name must be compared case sensitively _after_ converting the selector to lowercase. + * E.g. selector "DIV" must match element "div" but not "Div". */ + return lxb_selectors_str_cmp_loright((const char *) id->name, (const char *) node->name); + } else { + return strcmp((const char *) node->name, (const char *) id->name) == 0; } - - return strcmp((const char *) node->name, (const char *) id->name) == 0; } static zend_always_inline const xmlAttr *lxb_selectors_adapted_attr(const xmlNode *node, const lxb_char_t *name) @@ -64,9 +81,8 @@ static zend_always_inline const xmlAttr *lxb_selectors_adapted_attr(const xmlNod ZEND_ASSERT(node->doc != NULL); if (php_dom_ns_is_html_and_document_is_html(node)) { /* No need to handle DTD entities as we're in HTML. */ - size_t name_bound = strlen((const char *) name) + 1; for (const xmlAttr *cur = node->properties; cur != NULL; cur = cur->next) { - if (lexbor_str_data_nlocmp_right(cur->name, name, name_bound)) { + if (lxb_selectors_str_cmp_loright((const char *) name, (const char *) cur->name)) { attr = cur; break; } @@ -154,18 +170,7 @@ static bool lxb_selectors_is_lowercased_html_attrib_name(const lxb_css_selector_ static void lxb_selectors_adapted_set_entry_id_ex(lxb_selectors_entry_t *entry, const lxb_css_selector_t *selector, const xmlNode *node) { entry->id.attr_case_insensitive = lxb_selectors_is_lowercased_html_attrib_name(selector); - - if (node->doc != NULL && node->doc->dict != NULL) { - const xmlChar *interned = xmlDictExists(node->doc->dict, selector->name.data, selector->name.length); - if (interned != NULL) { - entry->id.name = interned; - entry->id.interned = true; - return; - } - } - entry->id.name = selector->name.data; - entry->id.interned = false; } static zend_always_inline void lxb_selectors_adapted_set_entry_id(lxb_selectors_entry_t *entry, const lxb_css_selector_t *selector, const xmlNode *node) @@ -1686,8 +1691,8 @@ lxb_selectors_pseudo_class(const lxb_css_selector_t *selector, case LXB_CSS_SELECTOR_PSEUDO_CLASS_ANY_LINK: /* https://drafts.csswg.org/selectors/#the-any-link-pseudo */ if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) - && (lxb_selectors_adapted_cmp_local_name_literal(node, "a") - || lxb_selectors_adapted_cmp_local_name_literal(node, "area"))) + && (lxb_selectors_cmp_html_name_lit(node, "a") + || lxb_selectors_cmp_html_name_lit(node, "area"))) { return lxb_selectors_adapted_has_attr(node, "href"); } @@ -1705,7 +1710,7 @@ lxb_selectors_pseudo_class(const lxb_css_selector_t *selector, if (!php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token)) { return false; } - if (lxb_selectors_adapted_cmp_local_name_literal(node, "input")) { + if (lxb_selectors_cmp_html_name_lit(node, "input")) { const xmlAttr *dom_attr = lxb_selectors_adapted_attr(node, (const lxb_char_t *) "type"); if (dom_attr == NULL) { return false; @@ -1729,7 +1734,7 @@ lxb_selectors_pseudo_class(const lxb_css_selector_t *selector, return res; } - else if(lxb_selectors_adapted_cmp_local_name_literal(node, "option")) { + else if(lxb_selectors_cmp_html_name_lit(node, "option")) { return lxb_selectors_adapted_has_attr(node, "selected"); } @@ -1802,8 +1807,8 @@ lxb_selectors_pseudo_class(const lxb_css_selector_t *selector, case LXB_CSS_SELECTOR_PSEUDO_CLASS_LINK: /* https://html.spec.whatwg.org/multipage/semantics-other.html#selector-link */ if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) - && (lxb_selectors_adapted_cmp_local_name_literal(node, "a") - || lxb_selectors_adapted_cmp_local_name_literal(node, "area"))) + && (lxb_selectors_cmp_html_name_lit(node, "a") + || lxb_selectors_cmp_html_name_lit(node, "area"))) { return lxb_selectors_adapted_has_attr(node, "href"); } @@ -1823,9 +1828,9 @@ lxb_selectors_pseudo_class(const lxb_css_selector_t *selector, case LXB_CSS_SELECTOR_PSEUDO_CLASS_OPTIONAL: if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) - && (lxb_selectors_adapted_cmp_local_name_literal(node, "input") - || lxb_selectors_adapted_cmp_local_name_literal(node, "select") - || lxb_selectors_adapted_cmp_local_name_literal(node, "textarea"))) + && (lxb_selectors_cmp_html_name_lit(node, "input") + || lxb_selectors_cmp_html_name_lit(node, "select") + || lxb_selectors_cmp_html_name_lit(node, "textarea"))) { return !lxb_selectors_adapted_has_attr(node, "required"); } @@ -1840,8 +1845,8 @@ lxb_selectors_pseudo_class(const lxb_css_selector_t *selector, case LXB_CSS_SELECTOR_PSEUDO_CLASS_PLACEHOLDER_SHOWN: if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) - && (lxb_selectors_adapted_cmp_local_name_literal(node, "input") - || lxb_selectors_adapted_cmp_local_name_literal(node, "textarea"))) + && (lxb_selectors_cmp_html_name_lit(node, "input") + || lxb_selectors_cmp_html_name_lit(node, "textarea"))) { return lxb_selectors_adapted_has_attr(node, "placeholder"); } @@ -1856,9 +1861,9 @@ lxb_selectors_pseudo_class(const lxb_css_selector_t *selector, case LXB_CSS_SELECTOR_PSEUDO_CLASS_REQUIRED: if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) - && (lxb_selectors_adapted_cmp_local_name_literal(node, "input") - || lxb_selectors_adapted_cmp_local_name_literal(node, "select") - || lxb_selectors_adapted_cmp_local_name_literal(node, "textarea"))) + && (lxb_selectors_cmp_html_name_lit(node, "input") + || lxb_selectors_cmp_html_name_lit(node, "select") + || lxb_selectors_cmp_html_name_lit(node, "textarea"))) { return lxb_selectors_adapted_has_attr(node, "required"); } @@ -2104,24 +2109,24 @@ lxb_selectors_pseudo_class_disabled(const xmlNode *node) } if (lxb_selectors_adapted_has_attr(node, "disabled") - && (lxb_selectors_adapted_cmp_local_name_literal(node, "button") - || lxb_selectors_adapted_cmp_local_name_literal(node, "input") - || lxb_selectors_adapted_cmp_local_name_literal(node, "select") - || lxb_selectors_adapted_cmp_local_name_literal(node, "textarea") - || lxb_selectors_adapted_cmp_local_name_literal(node, "optgroup") - || lxb_selectors_adapted_cmp_local_name_literal(node, "fieldset"))) + && (lxb_selectors_cmp_html_name_lit(node, "button") + || lxb_selectors_cmp_html_name_lit(node, "input") + || lxb_selectors_cmp_html_name_lit(node, "select") + || lxb_selectors_cmp_html_name_lit(node, "textarea") + || lxb_selectors_cmp_html_name_lit(node, "optgroup") + || lxb_selectors_cmp_html_name_lit(node, "fieldset"))) { return true; } - if (lxb_selectors_adapted_cmp_local_name_literal(node, "fieldset")) { + if (lxb_selectors_cmp_html_name_lit(node, "fieldset")) { const xmlNode *fieldset = node; node = node->parent; while (node != NULL && CMP_NODE_TYPE(node, XML_ELEMENT_NODE)) { /* node is a disabled fieldset that is an ancestor of fieldset */ if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) - && lxb_selectors_adapted_cmp_local_name_literal(node, "fieldset") + && lxb_selectors_cmp_html_name_lit(node, "fieldset") && lxb_selectors_adapted_has_attr(node, "disabled")) { /* Search first legend child and figure out if fieldset is a descendent from that. */ @@ -2129,7 +2134,7 @@ lxb_selectors_pseudo_class_disabled(const xmlNode *node) do { if (search_current->type == XML_ELEMENT_NODE && php_dom_ns_is_fast(search_current, php_dom_ns_is_html_magic_token) - && lxb_selectors_adapted_cmp_local_name_literal(search_current, "legend")) { + && lxb_selectors_cmp_html_name_lit(search_current, "legend")) { /* search_current is a legend element. */ const xmlNode *inner_search_current = fieldset; @@ -2235,8 +2240,8 @@ static bool lxb_selectors_pseudo_class_read_write(const xmlNode *node) { if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token)) { - if (lxb_selectors_adapted_cmp_local_name_literal(node, "input") - || lxb_selectors_adapted_cmp_local_name_literal(node, "textarea")) { + if (lxb_selectors_cmp_html_name_lit(node, "input") + || lxb_selectors_cmp_html_name_lit(node, "textarea")) { return !lxb_selectors_adapted_has_attr(node, "readonly") && !lxb_selectors_adapted_has_attr(node, "disabled"); } else { const xmlAttr *attr = lxb_selectors_adapted_attr(node, (const lxb_char_t *) "contenteditable"); diff --git a/ext/dom/lexbor/lexbor/selectors-adapted/selectors.h b/ext/dom/lexbor/lexbor/selectors-adapted/selectors.h index c0f76cce3d5cc..b64a9e49ee262 100644 --- a/ext/dom/lexbor/lexbor/selectors-adapted/selectors.h +++ b/ext/dom/lexbor/lexbor/selectors-adapted/selectors.h @@ -77,7 +77,6 @@ typedef lxb_selectors_entry_t * typedef struct { const xmlChar *name; - bool interned; bool attr_case_insensitive; } lxb_selectors_adapted_id; diff --git a/ext/dom/tests/modern/css_selectors/gh20395.phpt b/ext/dom/tests/modern/css_selectors/gh20395.phpt new file mode 100644 index 0000000000000..af04cb1c27a42 --- /dev/null +++ b/ext/dom/tests/modern/css_selectors/gh20395.phpt @@ -0,0 +1,33 @@ +--TEST-- +GH-20395 (\Dom\ParentNode::querySelector and \Dom\ParentNode::querySelectorAll requires elements in $selectors to be lowercase) +--EXTENSIONS-- +dom +--CREDITS-- +DeveloperRob +--FILE-- +'; +$dom = Dom\HtmlDocument::createFromString($html); +var_dump(is_null($dom->querySelector('html'))); +var_dump(is_null($dom->querySelector('Html'))); +var_dump(is_null($dom->querySelector('HTML'))); + +$dom->body->appendChild($dom->createElement('div')); +$dom->body->appendChild($dom->createElementNS('http://www.w3.org/1999/xhtml', 'Div')); + +foreach ($dom->querySelectorAll('div') as $div) { + var_dump($div->localName); +} + +foreach ($dom->querySelectorAll('Div') as $div) { + var_dump($div->localName); +} + +?> +--EXPECT-- +bool(false) +bool(false) +bool(false) +string(3) "div" +string(3) "div"