Commit 086bf94

Fix inaccurate content type decoding (boostorg#1950) (boostorg#1952)
1 parent 0128074 commit 086bf94

File tree

3 files changed, +60 -6 lines changed

core/boostrenderer.py

Lines changed: 12 additions & 4 deletions
@@ -24,10 +24,18 @@ def extract_file_data(response, s3_key):
     """Extracts the file content, content type, and last modified date from an S3
     response object."""
     file_content = response["Body"].read()
-    detected_encoding = chardet.detect(file_content)["encoding"] or "utf-8"
-    # decoding here stops django debug toolbar erroring on non-utf-8, e.g. preprocessor
-    if detected_encoding != "utf-8":
-        file_content = file_content.decode(detected_encoding).encode("utf-8")
+    # Try UTF-8 first, falling back to chardet detection if that fails. This prevents
+    # double-encoding issues where UTF-8 content is misdetected as Windows-1252
+    try:
+        file_content.decode("utf-8")
+        # Content is valid UTF-8, use as-is
+    except UnicodeDecodeError:
+        # Content is not UTF-8, detect encoding and re-encode
+        # decoding here stops django debug toolbar erroring on non-utf-8, e.g.
+        # the preprocessor library
+        if detected_encoding := chardet.detect(file_content)["encoding"]:
+            file_content = file_content.decode(detected_encoding).encode("utf-8")
+
     content_type = get_content_type(s3_key, response["ContentType"])
     last_modified = response["LastModified"]
     return {
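
For context, a standalone sketch of the failure the new try/except guards against (illustrative, not part of the commit): when chardet labels UTF-8 bytes such as \xc2\xa0 (a non-breaking space) as a single-byte codec like Windows-1252, the old decode/re-encode round trip double-encodes them, while a successful UTF-8 decode leaves the bytes untouched. Whether chardet actually reports Windows-1252 for a given input is heuristic; the sketch hard-codes that codec to make the corruption visible.

# Illustrative only: why misdetecting UTF-8 as Windows-1252 corrupts the bytes.
utf8_nbsp = b"Chapter\xc2\xa01"  # valid UTF-8 containing U+00A0 (non-breaking space)

# Old behaviour if chardet reports a single-byte codec such as Windows-1252:
corrupted = utf8_nbsp.decode("windows-1252").encode("utf-8")
assert corrupted == b"Chapter\xc3\x82\xc2\xa01"  # \xc2\xa0 became \xc3\x82\xc2\xa0

# New behaviour: a successful UTF-8 decode means the original bytes are kept.
try:
    utf8_nbsp.decode("utf-8")
    kept = utf8_nbsp
except UnicodeDecodeError:
    kept = utf8_nbsp.decode("windows-1252").encode("utf-8")  # fallback path only
assert kept == utf8_nbsp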

core/tests/test_renderer.py

Lines changed: 48 additions & 0 deletions
@@ -40,6 +40,54 @@ def test_extract_file_data():
     assert result == expected_result
 
 
+def test_extract_file_data_utf8_not_double_encoded():
+    """Test that UTF-8 content with non-breaking spaces is not double-encoded.
+
+    This test ensures that content containing UTF-8 bytes like \xc2\xa0 (non-breaking
+    space) is not misdetected as Windows-1252 and incorrectly re-encoded, which would
+    cause double-encoding (\xc2\xa0 -> \xc3\x82\xc2\xa0).
+    """
+    # HTML content with UTF-8 encoded non-breaking space (\xc2\xa0)
+    utf8_content = b"<html><title>Chapter\xc2\xa01.\xc2\xa0Boost.Beast</title></html>"
+
+    response = {
+        "Body": BytesIO(utf8_content),
+        "ContentType": "text/html; charset=UTF-8",
+        "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
+    }
+    s3_key = "example.html"
+
+    result = extract_file_data(response, s3_key)
+
+    # Content should remain as UTF-8, not be double-encoded
+    assert result["content"] == utf8_content
+    # Should NOT contain double-encoded sequence \xc3\x82\xc2\xa0
+    assert b"\xc3\x82\xc2\xa0" not in result["content"]
+    # Should contain the original UTF-8 non-breaking space \xc2\xa0
+    assert b"\xc2\xa0" in result["content"]
+
+
+def test_extract_file_data_non_utf8_reencoded():
+    """Test that genuinely non-UTF-8 content is detected and re-encoded to UTF-8."""
+    # Latin-1 content with a character not valid in UTF-8
+    latin1_content = b"<html><title>Test\xe9</title></html>"  # \xe9 is 'é' in Latin-1
+
+    response = {
+        "Body": BytesIO(latin1_content),
+        "ContentType": "text/html",
+        "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
+    }
+    s3_key = "example.html"
+
+    result = extract_file_data(response, s3_key)
+
+    # Content should be re-encoded to UTF-8
+    # 'é' in UTF-8 is \xc3\xa9
+    assert b"\xc3\xa9" in result["content"]
+    # Original Latin-1 byte should not be present
+    assert result["content"] != latin1_content
+
+
 def test_get_body_from_html():
     html_string = (
         "<html><head><title>Test</title></head><body><h1>Test</h1></body></html>"

core/views.py

Lines changed: 0 additions & 2 deletions
@@ -516,8 +516,6 @@ def process_content(self, content: bytes):
         req_uri = self.request.build_absolute_uri()
         canonical_uri = generate_canonical_library_uri(req_uri)
 
-        # this decode is needed for some libraries, e.g. assert
-        content = content.decode(chardet.detect(content)["encoding"])
         soup = BeautifulSoup(content, "html.parser")
 
         # handle libraries that expect no processing
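
The decode removed here appears redundant once extract_file_data normalises content to UTF-8, and BeautifulSoup itself accepts raw bytes and detects the encoding; that rationale is an inference, not stated in the commit. A minimal sketch of the bytes-in behaviour, assuming the bs4 package (illustrative, not part of the commit):

from bs4 import BeautifulSoup

# BeautifulSoup accepts bytes and sniffs the encoding itself, so no manual
# chardet decode is required before parsing.
raw = b"<html><body><h1>Chapter\xc2\xa01</h1></body></html>"  # UTF-8 bytes
soup = BeautifulSoup(raw, "html.parser")
print(soup.h1.get_text())  # "Chapter\xa01" (the non-breaking space survives)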
