Commit 086bf94

Fix inaccurate content type decoding (boostorg#1950) (boostorg#1952)
1 parent 0128074 commit 086bf94

File tree

3 files changed, +60 -6 lines changed

core/boostrenderer.py

Lines changed: 12 additions & 4 deletions
@@ -24,10 +24,18 @@ def extract_file_data(response, s3_key):
     """Extracts the file content, content type, and last modified date from an S3
     response object."""
     file_content = response["Body"].read()
-    detected_encoding = chardet.detect(file_content)["encoding"] or "utf-8"
-    # decoding here stops django debug toolbar erroring on non-utf-8, e.g. preprocessor
-    if detected_encoding != "utf-8":
-        file_content = file_content.decode(detected_encoding).encode("utf-8")
+    # Try UTF-8 first, falling back to chardet detection if that fails. This prevents
+    # double-encoding issues where UTF-8 content is misdetected as Windows-1252
+    try:
+        file_content.decode("utf-8")
+        # Content is valid UTF-8, use as-is
+    except UnicodeDecodeError:
+        # Content is not UTF-8, detect encoding and re-encode
+        # decoding here stops django debug toolbar erroring on non-utf-8, e.g.
+        # the preprocessor library
+        if detected_encoding := chardet.detect(file_content)["encoding"]:
+            file_content = file_content.decode(detected_encoding).encode("utf-8")
+
     content_type = get_content_type(s3_key, response["ContentType"])
     last_modified = response["LastModified"]
     return {
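
For context, a standalone sketch of the failure the new try/except guards against (illustrative, not part of the commit): when chardet labels UTF-8 bytes such as \xc2\xa0 (a non-breaking space) as a single-byte codec like Windows-1252, the old decode/re-encode round trip double-encodes them, while a successful UTF-8 decode leaves the bytes untouched. Whether chardet actually reports Windows-1252 for a given input is heuristic; the sketch hard-codes that codec to make the corruption visible.

# Illustrative only: why misdetecting UTF-8 as Windows-1252 corrupts the bytes.
utf8_nbsp = b"Chapter\xc2\xa01"  # valid UTF-8 containing U+00A0 (non-breaking space)

# Old behaviour if chardet reports a single-byte codec such as Windows-1252:
corrupted = utf8_nbsp.decode("windows-1252").encode("utf-8")
assert corrupted == b"Chapter\xc3\x82\xc2\xa01"  # \xc2\xa0 became \xc3\x82\xc2\xa0

# New behaviour: a successful UTF-8 decode means the original bytes are kept.
try:
    utf8_nbsp.decode("utf-8")
    kept = utf8_nbsp
except UnicodeDecodeError:
    kept = utf8_nbsp.decode("windows-1252").encode("utf-8")  # fallback path only
assert kept == utf8_nbsp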

core/tests/test_renderer.py

Lines changed: 48 additions & 0 deletions
@@ -40,6 +40,54 @@ def test_extract_file_data():
     assert result == expected_result
 
 
+def test_extract_file_data_utf8_not_double_encoded():
+    """Test that UTF-8 content with non-breaking spaces is not double-encoded.
+
+    This test ensures that content containing UTF-8 bytes like \xc2\xa0 (non-breaking
+    space) is not misdetected as Windows-1252 and incorrectly re-encoded, which would
+    cause double-encoding (\xc2\xa0 -> \xc3\x82\xc2\xa0).
+    """
+    # HTML content with UTF-8 encoded non-breaking space (\xc2\xa0)
+    utf8_content = b"<html><title>Chapter\xc2\xa01.\xc2\xa0Boost.Beast</title></html>"
+
+    response = {
+        "Body": BytesIO(utf8_content),
+        "ContentType": "text/html; charset=UTF-8",
+        "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
+    }
+    s3_key = "example.html"
+
+    result = extract_file_data(response, s3_key)
+
+    # Content should remain as UTF-8, not be double-encoded
+    assert result["content"] == utf8_content
+    # Should NOT contain double-encoded sequence \xc3\x82\xc2\xa0
+    assert b"\xc3\x82\xc2\xa0" not in result["content"]
+    # Should contain the original UTF-8 non-breaking space \xc2\xa0
+    assert b"\xc2\xa0" in result["content"]
+
+
+def test_extract_file_data_non_utf8_reencoded():
+    """Test that genuinely non-UTF-8 content is detected and re-encoded to UTF-8."""
+    # Latin-1 content with a character not valid in UTF-8
+    latin1_content = b"<html><title>Test\xe9</title></html>"  # \xe9 is 'é' in Latin-1
+
+    response = {
+        "Body": BytesIO(latin1_content),
+        "ContentType": "text/html",
+        "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
+    }
+    s3_key = "example.html"
+
+    result = extract_file_data(response, s3_key)
+
+    # Content should be re-encoded to UTF-8
+    # 'é' in UTF-8 is \xc3\xa9
+    assert b"\xc3\xa9" in result["content"]
+    # Original Latin-1 byte should not be present
+    assert result["content"] != latin1_content
+
+
 def test_get_body_from_html():
     html_string = (
         "<html><head><title>Test</title></head><body><h1>Test</h1></body></html>"

core/views.py

Lines changed: 0 additions & 2 deletions
@@ -516,8 +516,6 @@ def process_content(self, content: bytes):
         req_uri = self.request.build_absolute_uri()
         canonical_uri = generate_canonical_library_uri(req_uri)
 
-        # this decode is needed for some libraries, e.g. assert
-        content = content.decode(chardet.detect(content)["encoding"])
         soup = BeautifulSoup(content, "html.parser")
 
         # handle libraries that expect no processing
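
The decode removed here appears redundant once extract_file_data normalises content to UTF-8, and BeautifulSoup itself accepts raw bytes and detects the encoding; that rationale is an inference, not stated in the commit. A minimal sketch of the bytes-in behaviour, assuming the bs4 package (illustrative, not part of the commit):

from bs4 import BeautifulSoup

# BeautifulSoup accepts bytes and sniffs the encoding itself, so no manual
# chardet decode is required before parsing.
raw = b"<html><body><h1>Chapter\xc2\xa01</h1></body></html>"  # UTF-8 bytes
soup = BeautifulSoup(raw, "html.parser")
print(soup.h1.get_text())  # "Chapter\xa01" (the non-breaking space survives)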
