Skip to content

Commit 1bffd44

Browse files
committed
Correctly escape strings when serializing. Fixes #30
1 parent 0f94e5e commit 1bffd44

File tree

2 files changed

+131
-25
lines changed

2 files changed

+131
-25
lines changed

src/de/mod.rs

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -791,8 +791,14 @@ mod tests {
791791
// non-escaped " preceded by backslashes
792792
assert_eq!(crate::from_str(r#" "foo bar\\" "#), Ok(r#"foo bar\\"#));
793793
assert_eq!(crate::from_str(r#" "foo bar\\\\" "#), Ok(r#"foo bar\\\\"#));
794-
assert_eq!(crate::from_str(r#" "foo bar\\\\\\" "#), Ok(r#"foo bar\\\\\\"#));
795-
assert_eq!(crate::from_str(r#" "foo bar\\\\\\\\" "#), Ok(r#"foo bar\\\\\\\\"#));
794+
assert_eq!(
795+
crate::from_str(r#" "foo bar\\\\\\" "#),
796+
Ok(r#"foo bar\\\\\\"#)
797+
);
798+
assert_eq!(
799+
crate::from_str(r#" "foo bar\\\\\\\\" "#),
800+
Ok(r#"foo bar\\\\\\\\"#)
801+
);
796802
assert_eq!(crate::from_str(r#" "\\" "#), Ok(r#"\\"#));
797803
}
798804

@@ -1078,28 +1084,28 @@ mod tests {
10781084
assert_eq!(
10791085
crate::from_str::<Thing<'_>>(
10801086
r#"
1081-
{
1082-
"type": "thing",
1083-
"properties": {
1084-
"temperature": {
1085-
"type": "number",
1086-
"unit": "celsius",
1087-
"description": "An ambient temperature sensor",
1088-
"href": "/properties/temperature"
1089-
},
1090-
"humidity": {
1091-
"type": "number",
1092-
"unit": "percent",
1093-
"href": "/properties/humidity"
1094-
},
1095-
"led": {
1096-
"type": "boolean",
1097-
"description": "A red LED",
1098-
"href": "/properties/led"
1099-
}
1100-
}
1101-
}
1102-
"#
1087+
{
1088+
"type": "thing",
1089+
"properties": {
1090+
"temperature": {
1091+
"type": "number",
1092+
"unit": "celsius",
1093+
"description": "An ambient temperature sensor",
1094+
"href": "/properties/temperature"
1095+
},
1096+
"humidity": {
1097+
"type": "number",
1098+
"unit": "percent",
1099+
"href": "/properties/humidity"
1100+
},
1101+
"led": {
1102+
"type": "boolean",
1103+
"description": "A red LED",
1104+
"href": "/properties/led"
1105+
}
1106+
}
1107+
}
1108+
"#
11031109
),
11041110
Ok(Thing {
11051111
properties: Properties {

src/ser/mod.rs

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,20 @@ macro_rules! serialize_fmt {
134134
}};
135135
}
136136

137+
/// Returns the upper-case hexadecimal ASCII digit for a 4-bit value.
///
/// Only the low nibble of `c` is considered, so the result is always one of
/// `b'0'..=b'9'` or `b'A'..=b'F'`. For the intended input range `0..16` the
/// output is unchanged from the arithmetic formulation; larger inputs no
/// longer overflow `u8` arithmetic or yield non-hex bytes.
fn hex_4bit(c: u8) -> u8 {
    const DIGITS: &[u8; 16] = b"0123456789ABCDEF";
    // Masking keeps the index within the table for every possible `u8`.
    DIGITS[usize::from(c & 0x0F)]
}
145+
146+
/// Upper-case hex for value in 0..256, encoded as ASCII bytes
147+
fn hex(c: u8) -> (u8, u8) {
148+
(hex_4bit(c >> 4), hex_4bit(c & 0x0F))
149+
}
150+
137151
impl<'a, B> ser::Serializer for &'a mut Serializer<B>
138152
where
139153
B: heapless::ArrayLength<u8>,
@@ -212,7 +226,66 @@ where
212226

213227
fn serialize_str(self, v: &str) -> Result<Self::Ok> {
214228
self.buf.push(b'"')?;
215-
self.buf.extend_from_slice(v.as_bytes())?;
229+
230+
231+
// Do escaping according to "6. MUST represent all strings (including object member names) in
232+
// their minimal-length UTF-8 encoding": https://gibson042.github.io/canonicaljson-spec/
233+
//
234+
// We don't need to escape lone surrogates because surrogate pairs do not exist in valid UTF-8,
235+
// even if they can exist in JSON or JavaScript strings (UCS-2 based). As a result, lone surrogates
236+
// cannot exist in a Rust String. If they do, the bug is in the String constructor.
237+
// An excellent explanation is available at https://www.youtube.com/watch?v=HhIEDWmQS3w
238+
239+
// Temporary storage for encoded a single char.
240+
// A char is up to 4 bytes long wehn encoded to UTF-8.
241+
let mut encoding_tmp = [0u8; 4];
242+
243+
for c in v.chars() {
244+
match c {
245+
'\\' => {
246+
self.buf.push(b'\\')?;
247+
self.buf.push(b'\\')?;
248+
}
249+
'"' => {
250+
self.buf.push(b'\\')?;
251+
self.buf.push(b'"')?;
252+
}
253+
'\u{0008}' => {
254+
self.buf.push(b'\\')?;
255+
self.buf.push(b'b')?;
256+
}
257+
'\u{0009}' => {
258+
self.buf.push(b'\\')?;
259+
self.buf.push(b't')?;
260+
}
261+
'\u{000A}' => {
262+
self.buf.push(b'\\')?;
263+
self.buf.push(b'n')?;
264+
}
265+
'\u{000C}' => {
266+
self.buf.push(b'\\')?;
267+
self.buf.push(b'f')?;
268+
}
269+
'\u{000D}' => {
270+
self.buf.push(b'\\')?;
271+
self.buf.push(b'r')?;
272+
}
273+
'\u{0000}'..='\u{001F}' => {
274+
self.buf.push(b'\\')?;
275+
self.buf.push(b'u')?;
276+
self.buf.push(b'0')?;
277+
self.buf.push(b'0')?;
278+
let (hex1, hex2) = hex(c as u8);
279+
self.buf.push(hex1)?;
280+
self.buf.push(hex2)?;
281+
}
282+
_ => {
283+
let encoded = c.encode_utf8(&mut encoding_tmp as &mut [u8]);
284+
self.buf.extend_from_slice(encoded.as_bytes())?;
285+
}
286+
}
287+
}
288+
216289
self.buf.push(b'"')?;
217290
Ok(())
218291
}
@@ -472,6 +545,33 @@ mod tests {
472545
#[test]
fn str() {
    // (input, expected JSON output) pairs, checked in order.
    let cases: &[(&str, &str)] = &[
        ("hello", r#""hello""#),
        ("", r#""""#),
        // Characters are left unescaped whenever possible.
        ("ä", r#""ä""#),
        ("৬", r#""৬""#),
        // NOTE: the U+00A0 (non-breaking space) case is currently disabled.
        ("ℝ", r#""ℝ""#),   // 3 byte character
        ("💣", r#""💣""#), // 4 byte character
        // `"` and `\` must be escaped.
        ("foo\"bar", r#""foo\"bar""#),
        ("foo\\bar", r#""foo\\bar""#),
        // \b, \t, \n, \f, \r must use their two-character escapes.
        (" \u{0008} ", r#"" \b ""#),
        (" \u{0009} ", r#"" \t ""#),
        (" \u{000A} ", r#"" \n ""#),
        (" \u{000C} ", r#"" \f ""#),
        (" \u{000D} ", r#"" \r ""#),
        // The rest of U+0000 through U+001F uses six-character \u00xx
        // escapes with upper-case hexadecimal digits.
        (" \u{0000} ", r#"" \u0000 ""#),
        (" \u{0001} ", r#"" \u0001 ""#),
        (" \u{0007} ", r#"" \u0007 ""#),
        (" \u{000e} ", r#"" \u000E ""#),
        (" \u{001D} ", r#"" \u001D ""#),
        (" \u{001f} ", r#"" \u001F ""#),
    ];

    for &(input, expected) in cases {
        assert_eq!(&*crate::to_string::<N, _>(input).unwrap(), expected);
    }
}
476576

477577
#[test]

0 commit comments

Comments
 (0)