@@ -102,6 +102,62 @@ pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
102102 } )
103103}
104104
105+ // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
106+ // classes.
107+
/// Returns `true` if `c` counts as whitespace in the Rust language definition.
///
/// The accepted set is Unicode's `Pattern_White_Space` (see UAX #31). That
/// set is stable (it does not change between Unicode versions), so the code
/// points are hard-coded here.
pub fn is_whitespace(c: char) -> bool {
    matches!(
        c,
        // ASCII controls: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}'
        // ASCII space
        | '\u{0020}'
        // NEXT LINE from latin1
        | '\u{0085}'
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        | '\u{200E}'
        | '\u{200F}'
        // Dedicated Unicode whitespace: LINE SEPARATOR, PARAGRAPH SEPARATOR
        | '\u{2028}'
        | '\u{2029}'
    )
}
138+
139+ /// True if `c` is valid as a first character of an identifier.
140+ pub fn is_id_start ( c : char ) -> bool {
141+ // This is XID_Start OR '_' (which formally is not a XID_Start).
142+ // We also add fast-path for ascii idents
143+ ( 'a' <= c && c <= 'z' )
144+ || ( 'A' <= c && c <= 'Z' )
145+ || c == '_'
146+ || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_start ( c) )
147+ }
148+
149+ /// True if `c` is valid as a non-first character of an identifier.
150+ pub fn is_id_continue ( c : char ) -> bool {
151+ // This is exactly XID_Continue.
152+ // We also add fast-path for ascii idents
153+ ( 'a' <= c && c <= 'z' )
154+ || ( 'A' <= c && c <= 'Z' )
155+ || ( '0' <= c && c <= '9' )
156+ || c == '_'
157+ || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_continue ( c) )
158+ }
159+
160+
105161impl Cursor < ' _ > {
106162 fn advance_token ( & mut self ) -> Token {
107163 let first_char = self . bump ( ) . unwrap ( ) ;
@@ -111,9 +167,9 @@ impl Cursor<'_> {
111167 '*' => self . block_comment ( ) ,
112168 _ => Slash ,
113169 } ,
114- c if character_properties :: is_whitespace ( c) => self . whitespace ( ) ,
170+ c if is_whitespace ( c) => self . whitespace ( ) ,
115171 'r' => match ( self . nth_char ( 0 ) , self . nth_char ( 1 ) ) {
116- ( '#' , c1) if character_properties :: is_id_start ( c1) => self . raw_ident ( ) ,
172+ ( '#' , c1) if is_id_start ( c1) => self . raw_ident ( ) ,
117173 ( '#' , _) | ( '"' , _) => {
118174 let ( n_hashes, started, terminated) = self . raw_double_quoted_string ( ) ;
119175 let suffix_start = self . len_consumed ( ) ;
@@ -158,7 +214,7 @@ impl Cursor<'_> {
158214 }
159215 _ => self . ident ( ) ,
160216 } ,
161- c if character_properties :: is_id_start ( c) => self . ident ( ) ,
217+ c if is_id_start ( c) => self . ident ( ) ,
162218 c @ '0' ..='9' => {
163219 let literal_kind = self . number ( c) ;
164220 let suffix_start = self . len_consumed ( ) ;
@@ -246,8 +302,8 @@ impl Cursor<'_> {
246302 }
247303
248304 fn whitespace ( & mut self ) -> TokenKind {
249- debug_assert ! ( character_properties :: is_whitespace( self . prev( ) ) ) ;
250- while character_properties :: is_whitespace ( self . nth_char ( 0 ) ) {
305+ debug_assert ! ( is_whitespace( self . prev( ) ) ) ;
306+ while is_whitespace ( self . nth_char ( 0 ) ) {
251307 self . bump ( ) ;
252308 }
253309 Whitespace
@@ -257,19 +313,19 @@ impl Cursor<'_> {
257313 debug_assert ! (
258314 self . prev( ) == 'r'
259315 && self . nth_char( 0 ) == '#'
260- && character_properties :: is_id_start( self . nth_char( 1 ) )
316+ && is_id_start( self . nth_char( 1 ) )
261317 ) ;
262318 self . bump ( ) ;
263319 self . bump ( ) ;
264- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
320+ while is_id_continue ( self . nth_char ( 0 ) ) {
265321 self . bump ( ) ;
266322 }
267323 RawIdent
268324 }
269325
270326 fn ident ( & mut self ) -> TokenKind {
271- debug_assert ! ( character_properties :: is_id_start( self . prev( ) ) ) ;
272- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
327+ debug_assert ! ( is_id_start( self . prev( ) ) ) ;
328+ while is_id_continue ( self . nth_char ( 0 ) ) {
273329 self . bump ( ) ;
274330 }
275331 Ident
@@ -314,7 +370,7 @@ impl Cursor<'_> {
314370 // integer literal followed by field/method access or a range pattern
315371 // (`0..2` and `12.foo()`)
316372 '.' if self . nth_char ( 1 ) != '.'
317- && !character_properties :: is_id_start ( self . nth_char ( 1 ) ) =>
373+ && !is_id_start ( self . nth_char ( 1 ) ) =>
318374 {
319375 // might have stuff after the ., and if it does, it needs to start
320376 // with a number
@@ -344,15 +400,15 @@ impl Cursor<'_> {
344400 fn lifetime_or_char ( & mut self ) -> TokenKind {
345401 debug_assert ! ( self . prev( ) == '\'' ) ;
346402 let mut starts_with_number = false ;
347- if ( character_properties :: is_id_start ( self . nth_char ( 0 ) )
403+ if ( is_id_start ( self . nth_char ( 0 ) )
348404 || self . nth_char ( 0 ) . is_digit ( 10 ) && {
349405 starts_with_number = true ;
350406 true
351407 } )
352408 && self . nth_char ( 1 ) != '\''
353409 {
354410 self . bump ( ) ;
355- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
411+ while is_id_continue ( self . nth_char ( 0 ) ) {
356412 self . bump ( ) ;
357413 }
358414
@@ -494,64 +550,13 @@ impl Cursor<'_> {
494550 }
495551
496552 fn eat_literal_suffix ( & mut self ) {
497- if !character_properties :: is_id_start ( self . nth_char ( 0 ) ) {
553+ if !is_id_start ( self . nth_char ( 0 ) ) {
498554 return ;
499555 }
500556 self . bump ( ) ;
501557
502- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
558+ while is_id_continue ( self . nth_char ( 0 ) ) {
503559 self . bump ( ) ;
504560 }
505561 }
506562}
507-
508- pub mod character_properties {
509- // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
510- // classes.
511-
512- // This is Pattern_White_Space.
513- //
514- // Note that this set is stable (ie, it doesn't change with different
515- // Unicode versions), so it's ok to just hard-code the values.
516- pub fn is_whitespace ( c : char ) -> bool {
517- match c {
518- // Usual ASCII suspects
519- | '\u{0009}' // \t
520- | '\u{000A}' // \n
521- | '\u{000B}' // vertical tab
522- | '\u{000C}' // form feed
523- | '\u{000D}' // \r
524- | '\u{0020}' // space
525-
526- // NEXT LINE from latin1
527- | '\u{0085}'
528-
529- // Bidi markers
530- | '\u{200E}' // LEFT-TO-RIGHT MARK
531- | '\u{200F}' // RIGHT-TO-LEFT MARK
532-
533- // Dedicated whitespace characters from Unicode
534- | '\u{2028}' // LINE SEPARATOR
535- | '\u{2029}' // PARAGRAPH SEPARATOR
536- => true ,
537- _ => false ,
538- }
539- }
540-
541- // This is XID_Start OR '_' (which formally is not a XID_Start).
542- pub fn is_id_start ( c : char ) -> bool {
543- ( 'a' <= c && c <= 'z' )
544- || ( 'A' <= c && c <= 'Z' )
545- || c == '_'
546- || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_start ( c) )
547- }
548-
549- // This is XID_Continue.
550- pub fn is_id_continue ( c : char ) -> bool {
551- ( 'a' <= c && c <= 'z' )
552- || ( 'A' <= c && c <= 'Z' )
553- || ( '0' <= c && c <= '9' )
554- || c == '_'
555- || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_continue ( c) )
556- }
557- }
0 commit comments