Skip to content

Commit 4e6dfd5

Browse files
authored
🐛 FIX: numeric character reference parsing (#272)
Fix issue with incorrect determination of a numeric character reference, and subsequent failure to convert to an integer code. From https://github.com/google/oss-fuzz/tree/master/projects/markdown-it-py, fixes issue 55371. This also essentially fixes a bug in upstream; see markdown-it/markdown-it#935.
1 parent 36a428b commit 4e6dfd5

File tree

3 files changed

+36
-61
lines changed

3 files changed

+36
-61
lines changed

markdown_it/common/utils.py

Lines changed: 20 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
"""
33
from __future__ import annotations
44

5-
import html
65
import re
76
from typing import Match, TypeVar
87

@@ -52,9 +51,6 @@ def arrayReplaceAt(
5251
return src[:pos] + newElements + src[pos + 1 :]
5352

5453

55-
######################################################################
56-
57-
5854
def isValidEntityCode(c: int) -> bool:
5955
# broken sequence
6056
if c >= 0xD800 and c <= 0xDFFF:
@@ -89,47 +85,33 @@ def fromCodePoint(c: int) -> str:
8985
return chr(c)
9086

9187

92-
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
88+
# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
9389
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
9490
UNESCAPE_ALL_RE = re.compile(
9591
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
9692
re.IGNORECASE,
9793
)
98-
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
94+
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
95+
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)
9996

10097

10198
def replaceEntityPattern(match: str, name: str) -> str:
102-
"""Convert HTML entity patterns
103-
104-
::
105-
106-
https://www.google.com -> https%3A//www.google.com
107-
99+
"""Convert HTML entity patterns,
100+
see https://spec.commonmark.org/0.30/#entity-references
108101
"""
109-
code = 0
110-
111102
if name in entities:
112103
return entities[name]
113104

114-
if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
115-
code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
116-
if isValidEntityCode(code):
117-
return fromCodePoint(code)
118-
119-
return match
120-
121-
122-
# def replaceEntities(string):
123-
# if (string.indexOf('&') < 0):
124-
# return string
125-
# return string.replace(ENTITY_RE, replaceEntityPattern)
105+
code: None | int = None
106+
if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
107+
code = int(pat.group(1), 10)
108+
elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
109+
code = int(pat.group(1), 16)
126110

111+
if code is not None and isValidEntityCode(code):
112+
return fromCodePoint(code)
127113

128-
def unescapeMd(string: str) -> str:
129-
raise NotImplementedError
130-
# if "\\" in string:
131-
# return string
132-
# return string.replace(UNESCAPE_MD_RE, "$1")
114+
return match
133115

134116

135117
def unescapeAll(string: str) -> str:
@@ -154,30 +136,14 @@ def stripEscape(string: str) -> str:
154136
return ESCAPE_CHAR.sub(r"\1", string)
155137

156138

157-
# //////////////////////////////////////////////////////////////////////////////
158-
159-
# TODO This section changed quite a lot, should re-check
160-
161-
# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
162-
# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
163-
# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')
164-
165-
166-
# def escapeHtml(string: str):
167-
168-
# if HTML_ESCAPE_REPLACE_RE.search(string):
169-
170-
# string = UNESCAPE_HTML_RE.sub("&", string)
171-
# string = ESCAPE_AND_HTML.sub("&amp;", string)
172-
# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
173-
# string = string.replace(k, v)
174-
175-
# return string
176-
177-
178139
def escapeHtml(raw: str) -> str:
179-
# return html.escape(html.unescape(raw)).replace("&#x27;", "'")
180-
return html.escape(raw).replace("&#x27;", "'")
140+
"""Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
141+
# like html.escape, but without escaping single quotes
142+
raw = raw.replace("&", "&amp;") # Must be done first!
143+
raw = raw.replace("<", "&lt;")
144+
raw = raw.replace(">", "&gt;")
145+
raw = raw.replace('"', "&quot;")
146+
return raw
181147

182148

183149
# //////////////////////////////////////////////////////////////////////////////

tests/test_fuzzer.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@
1010
from markdown_it import MarkdownIt
1111

1212
TESTS = {
13-
55363: ">```\n>",
14-
55367: ">-\n>\n>",
15-
# 55371: "[](so&#4»0;!" TODO this did not fail
16-
# 55401: "?c_" * 100_000 TODO this did not fail
13+
55363: (">```\n>", "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n"),
14+
55367: (">-\n>\n>", "<blockquote>\n<ul>\n<li></li>\n</ul>\n</blockquote>\n"),
15+
55371: ("[](so&#4H0;!", "<p>[](so&amp;#4H0;!</p>\n"),
16+
# 55401: (("?c_" * 100000) + "c_", ""), TODO this does not fail, just takes a long time
1717
}
1818

1919

20-
@pytest.mark.parametrize("raw_input", TESTS.values(), ids=TESTS.keys())
21-
def test_fuzzing(raw_input):
20+
@pytest.mark.parametrize("raw_input,expected", TESTS.values(), ids=TESTS.keys())
21+
def test_fuzzing(raw_input, expected):
2222
md = MarkdownIt()
2323
md.parse(raw_input)
24-
print(md.render(raw_input))
24+
assert md.render(raw_input) == expected

tests/test_port/fixtures/issue-fixes.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,12 @@ Fix CVE-2023-26303
4545
<p><img src="%5B" alt="
4646
" /></p>
4747
.
48+
49+
Fix parsing of incorrect numeric character references
50+
.
51+
[](&#X22y;) &#X22y;
52+
[](&#35y;) &#35y;
53+
.
54+
<p><a href="&amp;#X22y;"></a> &amp;#X22y;
55+
<a href="&amp;#35y;"></a> &amp;#35y;</p>
56+
.

0 commit comments

Comments
 (0)