@@ -10,7 +10,7 @@ use measureme::{
1010 stringtable:: STRING_REF_TAG ,
1111} ;
1212use measureme:: { Addr , StringId } ;
13- use memchr:: memchr;
13+ use memchr:: { memchr, memchr2 } ;
1414use rustc_hash:: FxHashMap ;
1515use std:: borrow:: Cow ;
1616use std:: convert:: TryInto ;
@@ -34,6 +34,10 @@ pub struct StringRef<'st> {
3434// be resolved.
3535const UNKNOWN_STRING : & str = "<unknown>" ;
3636
37+ // This is the text we emit when we encounter string data that does not have a
38+ // proper terminator.
39+ const INVALID_STRING : & str = "<invalid>" ;
40+
3741impl < ' st > StringRef < ' st > {
3842 /// Expands the StringRef into an actual string. This method will
3943 /// avoid allocating a `String` if it can instead return a `&str` pointing
@@ -110,9 +114,18 @@ impl<'st> StringRef<'st> {
110114
111115 pos += STRING_REF_ENCODED_SIZE ;
112116 } else {
113- while let Some ( ( c, len) ) = decode_utf8_char ( & self . table . string_data [ pos..] ) {
114- output. push ( c) ;
117+ // This is a literal UTF-8 string value. Find its end by looking
118+ // for either of the two possible terminator bytes.
119+ let remaining_data = & self . table . string_data [ pos..] ;
120+ if let Some ( len) = memchr2 ( 0xFF , 0xFE , remaining_data) {
121+ let value = String :: from_utf8_lossy ( & remaining_data[ ..len] ) ;
122+ output. push_str ( & value) ;
115123 pos += len;
124+ } else {
125+ // The grammar does not allow unterminated raw strings. We
126+ // have to stop decoding.
127+ output. push_str ( INVALID_STRING ) ;
128+ return ;
116129 }
117130 }
118131 }
@@ -141,58 +154,6 @@ fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
141154 StringId :: new ( id)
142155}
143156
144- // Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
145- // Returns the decoded `char` and its size in bytes if it succeeds.
146- // Returns `None` if `bytes` does not start with a valid UTF-8 codepoint.
147- // See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
148- // encoding.
149- fn decode_utf8_char ( bytes : & [ u8 ] ) -> Option < ( char , usize ) > {
150- use std:: convert:: TryFrom ;
151- let first_byte = bytes[ 0 ] as u32 ;
152- let ( codepoint, len) = if ( first_byte & 0b1000_0000 ) == 0 {
153- // The highest bit is zero, so this is a single-byte char
154- ( first_byte, 1 )
155- } else if ( first_byte & 0b1110_0000 ) == 0b1100_0000 {
156- // This is a two byte character
157- let bits0 = first_byte & 0b0001_1111 ;
158- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
159-
160- ( bits0 << 6 | bits1, 2 )
161- } else if ( first_byte & 0b1111_0000 ) == 0b1110_0000 {
162- // This is a three byte character
163- let bits0 = first_byte & 0b0000_1111 ;
164- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
165- let bits2 = ( bytes[ 2 ] & 0b0011_1111 ) as u32 ;
166-
167- ( ( bits0 << 12 ) | ( bits1 << 6 ) | bits2, 3 )
168- } else if ( first_byte & 0b1111_1000 ) == 0b1111_0000 {
169- // This is a four byte character
170- let bits0 = first_byte & 0b0000_0111 ;
171- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
172- let bits2 = ( bytes[ 2 ] & 0b0011_1111 ) as u32 ;
173- let bits3 = ( bytes[ 3 ] & 0b0011_1111 ) as u32 ;
174-
175- ( ( bits0 << 18 ) | ( bits1 << 12 ) | ( bits2 << 6 ) | bits3, 4 )
176- } else {
177- return None ;
178- } ;
179-
180- match char:: try_from ( codepoint) {
181- Ok ( c) => {
182- debug_assert ! ( {
183- let test_bytes = & mut [ 0u8 ; 8 ] ;
184- c. encode_utf8( test_bytes) ;
185- & test_bytes[ ..len] == & bytes[ ..len]
186- } ) ;
187-
188- Some ( ( c, len) )
189- }
190- Err ( e) => {
191- panic ! ( "StringTable: Encountered invalid UTF8 char: {:?}" , e) ;
192- }
193- }
194- }
195-
196157/// Read-only version of the string table
197158#[ derive( Debug ) ]
198159pub struct StringTable {
@@ -343,20 +304,4 @@ mod tests {
343304 assert_eq ! ( str_ref. to_string( ) , write_to) ;
344305 }
345306 }
346-
347- #[ test]
348- fn utf8_char_decoding ( ) {
349- use std:: convert:: TryFrom ;
350-
351- // Let's just test all possible codepoints because there are not that
352- // many actually.
353- for codepoint in 0 ..=0x10FFFFu32 {
354- if let Ok ( expected_char) = char:: try_from ( codepoint) {
355- let buffer = & mut [ 0 ; 4 ] ;
356- let expected_len = expected_char. encode_utf8 ( buffer) . len ( ) ;
357- let expected = Some ( ( expected_char, expected_len) ) ;
358- assert_eq ! ( expected, decode_utf8_char( & buffer[ ..] ) ) ;
359- }
360- }
361- }
362307}
0 commit comments