Merge pull request #34 from MathiasKoch/enhancement/correctly-handle-escaped-strings

MathiasKoch · web-flow · commit 8de413eca093 · 2020-12-11T08:12:43.000+01:00
Correctly handle escaped strings when parsing json strings. Fixes #30 & #31
diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -175,10 +175,37 @@ impl<'a> Deserializer<'a> {
         loop {
             match self.peek() {
                 Some(b'"') => {
-                    let end = self.index;
-                    self.eat_char();
-                    return str::from_utf8(&self.slice[start..end])
-                        .map_err(|_| Error::InvalidUnicodeCodePoint);
+                    // Counts the number of backslashes in front of the current index.
+                    //
+                    // "some string with \\\" included."
+                    //                  ^^^^^
+                    //                  |||||
+                    //       loop run:  4321|
+                    //                      |
+                    //                   `index`
+                    //
+                    // Since we only get in this code branch if we found a " starting the string and `index` is greater
+                    // than the start position, we know the loop will end no later than this point.
+                    let leading_backslashes = |index: usize| -> usize {
+                        let mut count = 0;
+                        loop {
+                            if self.slice[index - count - 1] == b'\\' {
+                                count += 1;
+                            } else {
+                                return count;
+                            }
+                        }
+                    };
+
+                    let is_escaped = leading_backslashes(self.index) % 2 == 1;
+                    if is_escaped {
+                        self.eat_char(); // just continue
+                    } else {
+                        let end = self.index;
+                        self.eat_char();
+                        return str::from_utf8(&self.slice[start..end])
+                            .map_err(|_| Error::InvalidUnicodeCodePoint);
+                    }
                 }
                 Some(_) => self.eat_char(),
                 None => return Err(Error::EofWhileParsingString),
@@ -745,6 +772,34 @@ mod tests {
     #[test]
     fn str() {
         assert_eq!(crate::from_str(r#" "hello" "#), Ok("hello"));
+        assert_eq!(crate::from_str(r#" "" "#), Ok(""));
+        assert_eq!(crate::from_str(r#" " " "#), Ok(" "));
+        assert_eq!(crate::from_str(r#" "👏" "#), Ok("👏"));
+
+        // no unescaping is done (as documented as a known issue in lib.rs)
+        assert_eq!(crate::from_str(r#" "hel\tlo" "#), Ok("hel\\tlo"));
+        assert_eq!(crate::from_str(r#" "hello \\" "#), Ok("hello \\\\"));
+
+        // escaped " in the string content
+        assert_eq!(crate::from_str(r#" "foo\"bar" "#), Ok(r#"foo\"bar"#));
+        assert_eq!(crate::from_str(r#" "foo\\\"bar" "#), Ok(r#"foo\\\"bar"#));
+        assert_eq!(crate::from_str(r#" "foo\"\"bar" "#), Ok(r#"foo\"\"bar"#));
+        assert_eq!(crate::from_str(r#" "\"bar" "#), Ok(r#"\"bar"#));
+        assert_eq!(crate::from_str(r#" "foo\"" "#), Ok(r#"foo\""#));
+        assert_eq!(crate::from_str(r#" "\"" "#), Ok(r#"\""#));
+
+        // non-excaped " preceded by backslashes
+        assert_eq!(crate::from_str(r#" "foo bar\\" "#), Ok(r#"foo bar\\"#));
+        assert_eq!(crate::from_str(r#" "foo bar\\\\" "#), Ok(r#"foo bar\\\\"#));
+        assert_eq!(
+            crate::from_str(r#" "foo bar\\\\\\" "#),
+            Ok(r#"foo bar\\\\\\"#)
+        );
+        assert_eq!(
+            crate::from_str(r#" "foo bar\\\\\\\\" "#),
+            Ok(r#"foo bar\\\\\\\\"#)
+        );
+        assert_eq!(crate::from_str(r#" "\\" "#), Ok(r#"\\"#));
     }
 
     #[test]
@@ -1029,28 +1084,28 @@ mod tests {
         assert_eq!(
             crate::from_str::<Thing<'_>>(
                 r#"
-{
-  "type": "thing",
-  "properties": {
-    "temperature": {
-      "type": "number",
-      "unit": "celsius",
-      "description": "An ambient temperature sensor",
-      "href": "/properties/temperature"
-    },
-    "humidity": {
-      "type": "number",
-      "unit": "percent",
-      "href": "/properties/humidity"
-    },
-    "led": {
-      "type": "boolean",
-      "description": "A red LED",
-      "href": "/properties/led"
-    }
-  }
-}
-"#
+                    {
+                    "type": "thing",
+                    "properties": {
+                        "temperature": {
+                        "type": "number",
+                        "unit": "celsius",
+                        "description": "An ambient temperature sensor",
+                        "href": "/properties/temperature"
+                        },
+                        "humidity": {
+                        "type": "number",
+                        "unit": "percent",
+                        "href": "/properties/humidity"
+                        },
+                        "led": {
+                        "type": "boolean",
+                        "description": "A red LED",
+                        "href": "/properties/led"
+                        }
+                    }
+                    }
+                    "#
             ),
             Ok(Thing {
                 properties: Properties {
diff --git a/src/ser/mod.rs b/src/ser/mod.rs
@@ -134,6 +134,20 @@ macro_rules! serialize_fmt {
     }};
 }
 
+/// Upper-case hex for value in 0..16, encoded as ASCII bytes
+fn hex_4bit(c: u8) -> u8 {
+    if c <= 9 {
+        0x30 + c
+    } else {
+        0x41 + (c - 10)
+    }
+}
+
+/// Upper-case hex for value in 0..256, encoded as ASCII bytes
+fn hex(c: u8) -> (u8, u8) {
+    (hex_4bit(c >> 4), hex_4bit(c & 0x0F))
+}
+
 impl<'a, B> ser::Serializer for &'a mut Serializer<B>
 where
     B: heapless::ArrayLength<u8>,
@@ -212,7 +226,66 @@ where
 
     fn serialize_str(self, v: &str) -> Result<Self::Ok> {
         self.buf.push(b'"')?;
-        self.buf.extend_from_slice(v.as_bytes())?;
+
+
+        // Do escaping according to "6. MUST represent all strings (including object member names) in
+        // their minimal-length UTF-8 encoding": https://gibson042.github.io/canonicaljson-spec/
+        //
+        // We don't need to escape lone surrogates because surrogate pairs do not exist in valid UTF-8,
+        // even if they can exist in JSON or JavaScript strings (UCS-2 based). As a result, lone surrogates
+        // cannot exist in a Rust String. If they do, the bug is in the String constructor.
+        // An excellent explanation is available at https://www.youtube.com/watch?v=HhIEDWmQS3w
+
+        // Temporary storage for encoded a single char.
+        // A char is up to 4 bytes long wehn encoded to UTF-8.
+        let mut encoding_tmp = [0u8; 4];
+
+        for c in v.chars() {
+            match c {
+                '\\' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b'\\')?;
+                }
+                '"' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b'"')?;
+                }
+                '\u{0008}' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b'b')?;
+                }
+                '\u{0009}' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b't')?;
+                }
+                '\u{000A}' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b'n')?;
+                }
+                '\u{000C}' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b'f')?;
+                }
+                '\u{000D}' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b'r')?;
+                }
+                '\u{0000}'..='\u{001F}' => {
+                    self.buf.push(b'\\')?;
+                    self.buf.push(b'u')?;
+                    self.buf.push(b'0')?;
+                    self.buf.push(b'0')?;
+                    let (hex1, hex2) = hex(c as u8);
+                    self.buf.push(hex1)?;
+                    self.buf.push(hex2)?;
+                }
+                _ => {
+                    let encoded = c.encode_utf8(&mut encoding_tmp as &mut [u8]);
+                    self.buf.extend_from_slice(encoded.as_bytes())?;
+                }
+            }
+        }
+
         self.buf.push(b'"')?;
         Ok(())
     }
@@ -472,6 +545,33 @@ mod tests {
     #[test]
     fn str() {
         assert_eq!(&*crate::to_string::<N, _>("hello").unwrap(), r#""hello""#);
+        assert_eq!(&*crate::to_string::<N, _>("").unwrap(), r#""""#);
+
+        // Characters unescaped if possible
+        assert_eq!(&*crate::to_string::<N, _>("ä").unwrap(), r#""ä""#);
+        assert_eq!(&*crate::to_string::<N, _>("৬").unwrap(), r#""৬""#);
+        // assert_eq!(&*crate::to_string::<N, _>("\u{A0}").unwrap(), r#"" ""#); // non-breaking space
+        assert_eq!(&*crate::to_string::<N, _>("ℝ").unwrap(), r#""ℝ""#); // 3 byte character
+        assert_eq!(&*crate::to_string::<N, _>("💣").unwrap(), r#""💣""#); // 4 byte character
+
+        // " and \ must be escaped
+        assert_eq!(&*crate::to_string::<N, _>("foo\"bar").unwrap(), r#""foo\"bar""#);
+        assert_eq!(&*crate::to_string::<N, _>("foo\\bar").unwrap(), r#""foo\\bar""#);
+
+        // \b, \t, \n, \f, \r must be escaped in their two-character escaping
+        assert_eq!(&*crate::to_string::<N, _>(" \u{0008} ").unwrap(), r#"" \b ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{0009} ").unwrap(), r#"" \t ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{000A} ").unwrap(), r#"" \n ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{000C} ").unwrap(), r#"" \f ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{000D} ").unwrap(), r#"" \r ""#);
+
+        // U+0000 through U+001F is escaped using six-character \u00xx uppercase hexadecimal escape sequences
+        assert_eq!(&*crate::to_string::<N, _>(" \u{0000} ").unwrap(), r#"" \u0000 ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{0001} ").unwrap(), r#"" \u0001 ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{0007} ").unwrap(), r#"" \u0007 ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{000e} ").unwrap(), r#"" \u000E ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{001D} ").unwrap(), r#"" \u001D ""#);
+        assert_eq!(&*crate::to_string::<N, _>(" \u{001f} ").unwrap(), r#"" \u001F ""#);
     }
 
     #[test]