@@ -40,6 +40,54 @@ def test_extract_file_data():
4040 assert result == expected_result
4141
4242
def test_extract_file_data_utf8_not_double_encoded():
    r"""Check that UTF-8 content with non-breaking spaces is not double-encoded.

    The fixture contains the UTF-8 byte pair \xc2\xa0 (non-breaking space).
    If such content were misdetected as Windows-1252 and re-encoded, every
    pair would turn into \xc3\x82\xc2\xa0. The assertions below verify that
    the bytes returned by ``extract_file_data`` are the untouched input.
    """
    # HTML whose title uses UTF-8 encoded non-breaking spaces (\xc2\xa0).
    # The two bytes must be adjacent: a gap between them would make the
    # fixture invalid UTF-8 and the test would no longer cover this case.
    utf8_content = (
        b"<html><title>Chapter\xc2\xa01.\xc2\xa0Boost.Beast</title></html>"
    )

    response = {
        "Body": BytesIO(utf8_content),
        "ContentType": "text/html; charset=UTF-8",
        "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
    }
    s3_key = "example.html"

    result = extract_file_data(response, s3_key)

    # Content must come back byte-for-byte identical to the UTF-8 input.
    assert result["content"] == utf8_content
    # The double-encoded form of the non-breaking space must not appear.
    assert b"\xc3\x82\xc2\xa0" not in result["content"]
    # The original UTF-8 non-breaking space must still be present.
    assert b"\xc2\xa0" in result["content"]
68+
69+
def test_extract_file_data_non_utf8_reencoded():
    r"""Check that genuinely non-UTF-8 content is re-encoded to UTF-8.

    The fixture is Latin-1 HTML containing the single byte \xe9 ('é'), which
    is not a valid UTF-8 sequence on its own. After ``extract_file_data`` the
    content is expected to carry the UTF-8 form \xc3\xa9 instead.
    """
    # Latin-1 content: \xe9 is 'é' in Latin-1 and is directly followed by
    # ASCII, so the byte string as a whole is not valid UTF-8.
    latin1_content = b"<html><title>Test\xe9</title></html>"

    response = {
        "Body": BytesIO(latin1_content),
        # No charset parameter is declared in the content type here.
        "ContentType": "text/html",
        "LastModified": datetime.datetime(2023, 6, 8, 12, 0, 0),
    }
    s3_key = "example.html"

    result = extract_file_data(response, s3_key)

    # Content should be re-encoded to UTF-8; 'é' in UTF-8 is \xc3\xa9.
    assert b"\xc3\xa9" in result["content"]
    # The returned bytes must differ from the raw Latin-1 input.
    assert result["content"] != latin1_content
89+
90+
4391def test_get_body_from_html ():
4492 html_string = (
4593 "<html><head><title>Test</title></head><body><h1>Test</h1></body></html>"
0 commit comments