11//! See module-level documentation `measureme::stringtable`.
22
3- use measureme:: file_header:: {
4- strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA ,
5- FILE_MAGIC_STRINGTABLE_INDEX ,
3+ use measureme:: stringtable:: { METADATA_STRING_ID , TERMINATOR } ;
4+ use measureme:: {
5+ file_header:: {
6+ strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA ,
7+ FILE_MAGIC_STRINGTABLE_INDEX ,
8+ } ,
9+ stringtable:: STRING_REF_ENCODED_SIZE ,
10+ stringtable:: STRING_REF_TAG ,
611} ;
7- use measureme:: stringtable:: { METADATA_STRING_ID , STRING_ID_MASK , TERMINATOR } ;
812use measureme:: { Addr , StringId } ;
9- use memchr:: memchr;
13+ use memchr:: { memchr, memchr2 } ;
1014use rustc_hash:: FxHashMap ;
1115use std:: borrow:: Cow ;
1216use std:: convert:: TryInto ;
@@ -30,6 +34,10 @@ pub struct StringRef<'st> {
3034// be resolved.
3135const UNKNOWN_STRING : & str = "<unknown>" ;
3236
37+ // This is the text we emit when we encounter string data that does not have a
38+ // proper terminator.
39+ const INVALID_STRING : & str = "<invalid>" ;
40+
3341impl < ' st > StringRef < ' st > {
3442 /// Expands the StringRef into an actual string. This method will
3543 /// avoid allocating a `String` if it can instead return a `&str` pointing
@@ -55,9 +63,8 @@ impl<'st> StringRef<'st> {
5563
5664 // Check if this is a string containing a single StringId component
5765 let first_byte = self . table . string_data [ pos] ;
58- const STRING_ID_SIZE : usize = std:: mem:: size_of :: < StringId > ( ) ;
59- if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte ( first_byte) {
60- let id = decode_string_id_from_data ( & self . table . string_data [ pos..pos + STRING_ID_SIZE ] ) ;
66+ if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
67+ let id = decode_string_ref_from_data ( & self . table . string_data [ pos..] ) ;
6168 return StringRef {
6269 id,
6370 table : self . table ,
@@ -97,19 +104,28 @@ impl<'st> StringRef<'st> {
97104
98105 if byte == TERMINATOR {
99106 return ;
100- } else if is_utf8_continuation_byte ( byte) {
107+ } else if byte == STRING_REF_TAG {
101108 let string_ref = StringRef {
102- id : decode_string_id_from_data ( & self . table . string_data [ pos..pos + 4 ] ) ,
109+ id : decode_string_ref_from_data ( & self . table . string_data [ pos..] ) ,
103110 table : self . table ,
104111 } ;
105112
106113 string_ref. write_to_string ( output) ;
107114
108- pos += 4 ;
115+ pos += STRING_REF_ENCODED_SIZE ;
109116 } else {
110- while let Some ( ( c, len) ) = decode_utf8_char ( & self . table . string_data [ pos..] ) {
111- output. push ( c) ;
117+ // This is a literal UTF-8 string value. Find its end by looking
118+ // for either of the two possible terminator bytes.
119+ let remaining_data = & self . table . string_data [ pos..] ;
120+ if let Some ( len) = memchr2 ( 0xFF , 0xFE , remaining_data) {
121+ let value = String :: from_utf8_lossy ( & remaining_data[ ..len] ) ;
122+ output. push_str ( & value) ;
112123 pos += len;
124+ } else {
125+ // The grammar does not allow unterminated raw strings. We
126+ // have to stop decoding.
127+ output. push_str ( INVALID_STRING ) ;
128+ return ;
113129 }
114130 }
115131 }
@@ -129,71 +145,17 @@ impl<'st> StringRef<'st> {
129145 }
130146}
131147
132- fn is_utf8_continuation_byte ( byte : u8 ) -> bool {
133- // See module-level documentation for more information on the encoding.
134- const UTF8_CONTINUATION_MASK : u8 = 0b1100_0000 ;
135- const UTF8_CONTINUATION_BYTE : u8 = 0b1000_0000 ;
136- ( byte & UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION_BYTE
137- }
138-
139148// String IDs in the table data (following the `STRING_REF_TAG` byte) and string
140149// IDs in the index are both encoded in little endian format.
141- fn decode_string_id_from_data ( bytes : & [ u8 ] ) -> StringId {
142- let id = u32:: from_be_bytes ( bytes[ 0 ..4 ] . try_into ( ) . unwrap ( ) ) ;
143- // Mask off the `0b10` prefix
144- StringId :: new ( id & STRING_ID_MASK )
145- }
146-
147- // Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
148- // Returns the decoded `char` and its size in bytes if it succeeds.
149- // Returns `None` if `bytes` does not start with a valid UTF-8 codepoint.
150- // See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
151- // encoding.
152- fn decode_utf8_char ( bytes : & [ u8 ] ) -> Option < ( char , usize ) > {
153- use std:: convert:: TryFrom ;
154- let first_byte = bytes[ 0 ] as u32 ;
155- let ( codepoint, len) = if ( first_byte & 0b1000_0000 ) == 0 {
156- // The highest bit is zero, so this is a single-byte char
157- ( first_byte, 1 )
158- } else if ( first_byte & 0b1110_0000 ) == 0b1100_0000 {
159- // This is a two byte character
160- let bits0 = first_byte & 0b0001_1111 ;
161- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
162-
163- ( bits0 << 6 | bits1, 2 )
164- } else if ( first_byte & 0b1111_0000 ) == 0b1110_0000 {
165- // This is a three byte character
166- let bits0 = first_byte & 0b0000_1111 ;
167- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
168- let bits2 = ( bytes[ 2 ] & 0b0011_1111 ) as u32 ;
169-
170- ( ( bits0 << 12 ) | ( bits1 << 6 ) | bits2, 3 )
171- } else if ( first_byte & 0b1111_1000 ) == 0b1111_0000 {
172- // This is a four byte character
173- let bits0 = first_byte & 0b0000_0111 ;
174- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
175- let bits2 = ( bytes[ 2 ] & 0b0011_1111 ) as u32 ;
176- let bits3 = ( bytes[ 3 ] & 0b0011_1111 ) as u32 ;
177-
178- ( ( bits0 << 18 ) | ( bits1 << 12 ) | ( bits2 << 6 ) | bits3, 4 )
179- } else {
180- return None ;
181- } ;
182-
183- match char:: try_from ( codepoint) {
184- Ok ( c) => {
185- debug_assert ! ( {
186- let test_bytes = & mut [ 0u8 ; 8 ] ;
187- c. encode_utf8( test_bytes) ;
188- & test_bytes[ ..len] == & bytes[ ..len]
189- } ) ;
190-
191- Some ( ( c, len) )
192- }
193- Err ( e) => {
194- panic ! ( "StringTable: Encountered invalid UTF8 char: {:?}" , e) ;
195- }
196- }
150+ fn decode_string_ref_from_data ( bytes : & [ u8 ] ) -> StringId {
151+ // The code below assumes we use a 5-byte encoding for string
152+ // refs, where the first byte is STRING_REF_TAG and the
153+ // following 4 bytes are a little-endian u32 string ID value.
154+ assert ! ( bytes[ 0 ] == STRING_REF_TAG ) ;
155+ assert ! ( STRING_REF_ENCODED_SIZE == 5 ) ;
156+
157+ let id = u32:: from_le_bytes ( bytes[ 1 ..5 ] . try_into ( ) . unwrap ( ) ) ;
158+ StringId :: new ( id)
197159}
198160
199161/// Read-only version of the string table
@@ -346,20 +308,4 @@ mod tests {
346308 assert_eq ! ( str_ref. to_string( ) , write_to) ;
347309 }
348310 }
349-
350- #[ test]
351- fn utf8_char_decoding ( ) {
352- use std:: convert:: TryFrom ;
353-
354- // Let's just test all possible codepoints because there are not that
355- // many actually.
356- for codepoint in 0 ..=0x10FFFFu32 {
357- if let Ok ( expected_char) = char:: try_from ( codepoint) {
358- let buffer = & mut [ 0 ; 4 ] ;
359- let expected_len = expected_char. encode_utf8 ( buffer) . len ( ) ;
360- let expected = Some ( ( expected_char, expected_len) ) ;
361- assert_eq ! ( expected, decode_utf8_char( & buffer[ ..] ) ) ;
362- }
363- }
364- }
365311}
0 commit comments