🐛 FIX: numeric character reference parsing (#272)

chrisjsewell · web-flow · commit 4e6dfd5994bc · 2023-06-02T09:19:23.000+02:00
Fix issue with incorrect determination of a numeric character reference, and subsequent failure to convert to an integer code. From https://github.com/google/oss-fuzz/tree/master/projects/markdown-it-py, fixes issue 55371. This also essentially fixes a bug in upstream, see markdown-it/markdown-it#935.
diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py
@@ -2,7 +2,6 @@
 """
 from __future__ import annotations
 
-import html
 import re
 from typing import Match, TypeVar
 
@@ -52,9 +51,6 @@ def arrayReplaceAt(
     return src[:pos] + newElements + src[pos + 1 :]
 
 
-######################################################################
-
-
 def isValidEntityCode(c: int) -> bool:
     # broken sequence
     if c >= 0xD800 and c <= 0xDFFF:
@@ -89,47 +85,33 @@ def fromCodePoint(c: int) -> str:
     return chr(c)
 
 
-UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
+# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
 # ENTITY_RE_g       = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
 UNESCAPE_ALL_RE = re.compile(
     r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
     re.IGNORECASE,
 )
-DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
+DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
+DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)
 
 
 def replaceEntityPattern(match: str, name: str) -> str:
-    """Convert HTML entity patterns
-
-    ::
-
-        https://www.google.com -> https%3A//www.google.com
-
+    """Convert HTML entity patterns,
+    see https://spec.commonmark.org/0.30/#entity-references
     """
-    code = 0
-
     if name in entities:
         return entities[name]
 
-    if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
-        code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
-        if isValidEntityCode(code):
-            return fromCodePoint(code)
-
-    return match
-
-
-# def replaceEntities(string):
-#   if (string.indexOf('&') < 0):
-#       return string
-#   return string.replace(ENTITY_RE, replaceEntityPattern)
+    code: None | int = None
+    if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
+        code = int(pat.group(1), 10)
+    elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
+        code = int(pat.group(1), 16)
 
+    if code is not None and isValidEntityCode(code):
+        return fromCodePoint(code)
 
-def unescapeMd(string: str) -> str:
-    raise NotImplementedError
-    # if "\\" in string:
-    #     return string
-    # return string.replace(UNESCAPE_MD_RE, "$1")
+    return match
 
 
 def unescapeAll(string: str) -> str:
@@ -154,30 +136,14 @@ def stripEscape(string: str) -> str:
     return ESCAPE_CHAR.sub(r"\1", string)
 
 
-# //////////////////////////////////////////////////////////////////////////////
-
-# TODO This section changed quite a lot, should re-check
-
-# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
-# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
-# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')
-
-
-# def escapeHtml(string: str):
-
-#     if HTML_ESCAPE_REPLACE_RE.search(string):
-
-#         string = UNESCAPE_HTML_RE.sub("&", string)
-#         string = ESCAPE_AND_HTML.sub("&amp;", string)
-#         for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
-#             string = string.replace(k, v)
-
-#     return string
-
-
 def escapeHtml(raw: str) -> str:
-    # return html.escape(html.unescape(raw)).replace("&#x27;", "'")
-    return html.escape(raw).replace("&#x27;", "'")
+    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
+    # like html.escape, but without escaping single quotes
+    raw = raw.replace("&", "&amp;")  # Must be done first!
+    raw = raw.replace("<", "&lt;")
+    raw = raw.replace(">", "&gt;")
+    raw = raw.replace('"', "&quot;")
+    return raw
 
 
 # //////////////////////////////////////////////////////////////////////////////
diff --git a/tests/test_fuzzer.py b/tests/test_fuzzer.py
@@ -10,15 +10,15 @@
 from markdown_it import MarkdownIt
 
 TESTS = {
-    55363: ">```\n>",
-    55367: ">-\n>\n>",
-    # 55371: "[](so&#4»0;!"  TODO this did not fail
-    # 55401: "?c_" * 100_000  TODO this did not fail
+    55363: (">```\n>", "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n"),
+    55367: (">-\n>\n>", "<blockquote>\n<ul>\n<li></li>\n</ul>\n</blockquote>\n"),
+    55371: ("[](so&#4H0;!", "<p>[](so&amp;#4H0;!</p>\n"),
+    # 55401: (("?c_" * 100000) + "c_", ""),  TODO this does not fail, just takes a long time
 }
 
 
-@pytest.mark.parametrize("raw_input", TESTS.values(), ids=TESTS.keys())
-def test_fuzzing(raw_input):
+@pytest.mark.parametrize("raw_input,expected", TESTS.values(), ids=TESTS.keys())
+def test_fuzzing(raw_input, expected):
     md = MarkdownIt()
     md.parse(raw_input)
-    print(md.render(raw_input))
+    assert md.render(raw_input) == expected
diff --git a/tests/test_port/fixtures/issue-fixes.md b/tests/test_port/fixtures/issue-fixes.md
@@ -45,3 +45,12 @@ Fix CVE-2023-26303
 <p><img src="%5B" alt="
 " /></p>
 .
+
+Fix parsing of incorrect numeric character references
+.
+[](&#X22y;) &#X22y;
+[](&#35y;) &#35y;
+.
+<p><a href="&amp;#X22y;"></a> &amp;#X22y;
+<a href="&amp;#35y;"></a> &amp;#35y;</p>
+.