Skip to content

Commit 8de413e

Browse files
authored
Merge pull request #34 from MathiasKoch/enhancement/correctly-handle-escaped-strings
Correctly handle escaped strings when parsing json strings. Fixes #30 & #31
2 parents 238c8fe + 1bffd44 commit 8de413e

File tree

2 files changed

+182
-27
lines changed

2 files changed

+182
-27
lines changed

src/de/mod.rs

Lines changed: 81 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -175,10 +175,37 @@ impl<'a> Deserializer<'a> {
175175
loop {
176176
match self.peek() {
177177
Some(b'"') => {
178-
let end = self.index;
179-
self.eat_char();
180-
return str::from_utf8(&self.slice[start..end])
181-
.map_err(|_| Error::InvalidUnicodeCodePoint);
178+
// Counts the number of backslashes in front of the current index.
179+
//
180+
// "some string with \\\" included."
181+
// ^^^^^
182+
// |||||
183+
// loop run: 4321|
184+
// |
185+
// `index`
186+
//
187+
// Since we only reach this code branch after a " has started the string and `index` is greater
188+
// than the start position, we know the loop will end no later than this point.
189+
let leading_backslashes = |index: usize| -> usize {
190+
let mut count = 0;
191+
loop {
192+
if self.slice[index - count - 1] == b'\\' {
193+
count += 1;
194+
} else {
195+
return count;
196+
}
197+
}
198+
};
199+
200+
let is_escaped = leading_backslashes(self.index) % 2 == 1;
201+
if is_escaped {
202+
self.eat_char(); // just continue
203+
} else {
204+
let end = self.index;
205+
self.eat_char();
206+
return str::from_utf8(&self.slice[start..end])
207+
.map_err(|_| Error::InvalidUnicodeCodePoint);
208+
}
182209
}
183210
Some(_) => self.eat_char(),
184211
None => return Err(Error::EofWhileParsingString),
@@ -745,6 +772,34 @@ mod tests {
745772
#[test]
746773
fn str() {
747774
assert_eq!(crate::from_str(r#" "hello" "#), Ok("hello"));
775+
assert_eq!(crate::from_str(r#" "" "#), Ok(""));
776+
assert_eq!(crate::from_str(r#" " " "#), Ok(" "));
777+
assert_eq!(crate::from_str(r#" "👏" "#), Ok("👏"));
778+
779+
// no unescaping is done (as documented as a known issue in lib.rs)
780+
assert_eq!(crate::from_str(r#" "hel\tlo" "#), Ok("hel\\tlo"));
781+
assert_eq!(crate::from_str(r#" "hello \\" "#), Ok("hello \\\\"));
782+
783+
// escaped " in the string content
784+
assert_eq!(crate::from_str(r#" "foo\"bar" "#), Ok(r#"foo\"bar"#));
785+
assert_eq!(crate::from_str(r#" "foo\\\"bar" "#), Ok(r#"foo\\\"bar"#));
786+
assert_eq!(crate::from_str(r#" "foo\"\"bar" "#), Ok(r#"foo\"\"bar"#));
787+
assert_eq!(crate::from_str(r#" "\"bar" "#), Ok(r#"\"bar"#));
788+
assert_eq!(crate::from_str(r#" "foo\"" "#), Ok(r#"foo\""#));
789+
assert_eq!(crate::from_str(r#" "\"" "#), Ok(r#"\""#));
790+
791+
// non-escaped " preceded by backslashes
792+
assert_eq!(crate::from_str(r#" "foo bar\\" "#), Ok(r#"foo bar\\"#));
793+
assert_eq!(crate::from_str(r#" "foo bar\\\\" "#), Ok(r#"foo bar\\\\"#));
794+
assert_eq!(
795+
crate::from_str(r#" "foo bar\\\\\\" "#),
796+
Ok(r#"foo bar\\\\\\"#)
797+
);
798+
assert_eq!(
799+
crate::from_str(r#" "foo bar\\\\\\\\" "#),
800+
Ok(r#"foo bar\\\\\\\\"#)
801+
);
802+
assert_eq!(crate::from_str(r#" "\\" "#), Ok(r#"\\"#));
748803
}
749804

750805
#[test]
@@ -1029,28 +1084,28 @@ mod tests {
10291084
assert_eq!(
10301085
crate::from_str::<Thing<'_>>(
10311086
r#"
1032-
{
1033-
"type": "thing",
1034-
"properties": {
1035-
"temperature": {
1036-
"type": "number",
1037-
"unit": "celsius",
1038-
"description": "An ambient temperature sensor",
1039-
"href": "/properties/temperature"
1040-
},
1041-
"humidity": {
1042-
"type": "number",
1043-
"unit": "percent",
1044-
"href": "/properties/humidity"
1045-
},
1046-
"led": {
1047-
"type": "boolean",
1048-
"description": "A red LED",
1049-
"href": "/properties/led"
1050-
}
1051-
}
1052-
}
1053-
"#
1087+
{
1088+
"type": "thing",
1089+
"properties": {
1090+
"temperature": {
1091+
"type": "number",
1092+
"unit": "celsius",
1093+
"description": "An ambient temperature sensor",
1094+
"href": "/properties/temperature"
1095+
},
1096+
"humidity": {
1097+
"type": "number",
1098+
"unit": "percent",
1099+
"href": "/properties/humidity"
1100+
},
1101+
"led": {
1102+
"type": "boolean",
1103+
"description": "A red LED",
1104+
"href": "/properties/led"
1105+
}
1106+
}
1107+
}
1108+
"#
10541109
),
10551110
Ok(Thing {
10561111
properties: Properties {

src/ser/mod.rs

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,20 @@ macro_rules! serialize_fmt {
134134
}};
135135
}
136136

137+
/// Upper-case hex for value in 0..16, encoded as ASCII bytes
138+
fn hex_4bit(c: u8) -> u8 {
139+
if c <= 9 {
140+
0x30 + c
141+
} else {
142+
0x41 + (c - 10)
143+
}
144+
}
145+
146+
/// Upper-case hex for value in 0..256, encoded as ASCII bytes
147+
fn hex(c: u8) -> (u8, u8) {
148+
(hex_4bit(c >> 4), hex_4bit(c & 0x0F))
149+
}
150+
137151
impl<'a, B> ser::Serializer for &'a mut Serializer<B>
138152
where
139153
B: heapless::ArrayLength<u8>,
@@ -212,7 +226,66 @@ where
212226

213227
fn serialize_str(self, v: &str) -> Result<Self::Ok> {
214228
self.buf.push(b'"')?;
215-
self.buf.extend_from_slice(v.as_bytes())?;
229+
230+
231+
// Do escaping according to "6. MUST represent all strings (including object member names) in
232+
// their minimal-length UTF-8 encoding": https://gibson042.github.io/canonicaljson-spec/
233+
//
234+
// We don't need to escape lone surrogates because surrogate pairs do not exist in valid UTF-8,
235+
// even if they can exist in JSON or JavaScript strings (UCS-2 based). As a result, lone surrogates
236+
// cannot exist in a Rust String. If they do, the bug is in the String constructor.
237+
// An excellent explanation is available at https://www.youtube.com/watch?v=HhIEDWmQS3w
238+
239+
// Temporary storage for a single encoded char.
240+
// A char is up to 4 bytes long when encoded to UTF-8.
241+
let mut encoding_tmp = [0u8; 4];
242+
243+
for c in v.chars() {
244+
match c {
245+
'\\' => {
246+
self.buf.push(b'\\')?;
247+
self.buf.push(b'\\')?;
248+
}
249+
'"' => {
250+
self.buf.push(b'\\')?;
251+
self.buf.push(b'"')?;
252+
}
253+
'\u{0008}' => {
254+
self.buf.push(b'\\')?;
255+
self.buf.push(b'b')?;
256+
}
257+
'\u{0009}' => {
258+
self.buf.push(b'\\')?;
259+
self.buf.push(b't')?;
260+
}
261+
'\u{000A}' => {
262+
self.buf.push(b'\\')?;
263+
self.buf.push(b'n')?;
264+
}
265+
'\u{000C}' => {
266+
self.buf.push(b'\\')?;
267+
self.buf.push(b'f')?;
268+
}
269+
'\u{000D}' => {
270+
self.buf.push(b'\\')?;
271+
self.buf.push(b'r')?;
272+
}
273+
'\u{0000}'..='\u{001F}' => {
274+
self.buf.push(b'\\')?;
275+
self.buf.push(b'u')?;
276+
self.buf.push(b'0')?;
277+
self.buf.push(b'0')?;
278+
let (hex1, hex2) = hex(c as u8);
279+
self.buf.push(hex1)?;
280+
self.buf.push(hex2)?;
281+
}
282+
_ => {
283+
let encoded = c.encode_utf8(&mut encoding_tmp as &mut [u8]);
284+
self.buf.extend_from_slice(encoded.as_bytes())?;
285+
}
286+
}
287+
}
288+
216289
self.buf.push(b'"')?;
217290
Ok(())
218291
}
@@ -472,6 +545,33 @@ mod tests {
472545
#[test]
473546
fn str() {
474547
assert_eq!(&*crate::to_string::<N, _>("hello").unwrap(), r#""hello""#);
548+
assert_eq!(&*crate::to_string::<N, _>("").unwrap(), r#""""#);
549+
550+
// Characters unescaped if possible
551+
assert_eq!(&*crate::to_string::<N, _>("ä").unwrap(), r#""ä""#);
552+
assert_eq!(&*crate::to_string::<N, _>("৬").unwrap(), r#""৬""#);
553+
// assert_eq!(&*crate::to_string::<N, _>("\u{A0}").unwrap(), r#"" ""#); // non-breaking space
554+
assert_eq!(&*crate::to_string::<N, _>("ℝ").unwrap(), r#""ℝ""#); // 3 byte character
555+
assert_eq!(&*crate::to_string::<N, _>("💣").unwrap(), r#""💣""#); // 4 byte character
556+
557+
// " and \ must be escaped
558+
assert_eq!(&*crate::to_string::<N, _>("foo\"bar").unwrap(), r#""foo\"bar""#);
559+
assert_eq!(&*crate::to_string::<N, _>("foo\\bar").unwrap(), r#""foo\\bar""#);
560+
561+
// \b, \t, \n, \f, \r must be escaped in their two-character escaping
562+
assert_eq!(&*crate::to_string::<N, _>(" \u{0008} ").unwrap(), r#"" \b ""#);
563+
assert_eq!(&*crate::to_string::<N, _>(" \u{0009} ").unwrap(), r#"" \t ""#);
564+
assert_eq!(&*crate::to_string::<N, _>(" \u{000A} ").unwrap(), r#"" \n ""#);
565+
assert_eq!(&*crate::to_string::<N, _>(" \u{000C} ").unwrap(), r#"" \f ""#);
566+
assert_eq!(&*crate::to_string::<N, _>(" \u{000D} ").unwrap(), r#"" \r ""#);
567+
568+
// U+0000 through U+001F are escaped using six-character \u00xx uppercase hexadecimal escape sequences
569+
assert_eq!(&*crate::to_string::<N, _>(" \u{0000} ").unwrap(), r#"" \u0000 ""#);
570+
assert_eq!(&*crate::to_string::<N, _>(" \u{0001} ").unwrap(), r#"" \u0001 ""#);
571+
assert_eq!(&*crate::to_string::<N, _>(" \u{0007} ").unwrap(), r#"" \u0007 ""#);
572+
assert_eq!(&*crate::to_string::<N, _>(" \u{000e} ").unwrap(), r#"" \u000E ""#);
573+
assert_eq!(&*crate::to_string::<N, _>(" \u{001D} ").unwrap(), r#"" \u001D ""#);
574+
assert_eq!(&*crate::to_string::<N, _>(" \u{001f} ").unwrap(), r#"" \u001F ""#);
475575
}
476576

477577
#[test]

0 commit comments

Comments
 (0)