@@ -206,6 +206,9 @@ pub struct Tokenizer<'a> {
     input: &'a str,
     /// Counted in bytes, not code points. From 0.
     position: usize,
+    /// The position at the start of the current line; but adjusted to
+    /// ensure that computing the column will give the result in units
+    /// of UTF-16 characters.
     current_line_start_position: usize,
     current_line_number: u32,
     var_functions: SeenStatus,
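
With the field adjusted this way, producing a column in UTF-16 code units is a plain subtraction from the byte position. The standalone sketch below is not part of the diff; it just simulates the two fields by hand for a short line (using the same per-character adjustment this patch introduces) and checks the invariant:

```rust
fn main() {
    // Simulate `position` and `current_line_start_position` for the line
    // "aé😀b" and check that their difference counts UTF-16 code units.
    let line = "aé😀b";
    let mut position = 0usize;
    let mut current_line_start_position = 0usize;
    for c in line.chars() {
        position += c.len_utf8();
        // Mirrors the per-character adjustment introduced by this patch.
        current_line_start_position =
            current_line_start_position.wrapping_add(c.len_utf8() - c.len_utf16());
    }
    // 1 + 2 + 4 + 1 = 8 bytes, but 1 + 1 + 2 + 1 = 5 UTF-16 code units.
    assert_eq!(position, 8);
    assert_eq!(position - current_line_start_position, 5);
}
```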
@@ -370,6 +373,9 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     fn consume_4byte_intro(&mut self) {
         debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
+        // This takes two UTF-16 characters to represent, so we
+        // actually have an undercount.
+        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
         self.position += 1;
     }

@@ -378,6 +384,10 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     fn consume_continuation_byte(&mut self) {
         debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
+        // Continuation bytes contribute to column overcount. Note
+        // that due to the special case for the 4-byte sequence intro,
+        // we must use wrapping add here.
+        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
         self.position += 1;
     }

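
To see why the 4-byte intro is an undercount rather than an overcount, work through one 4-byte scalar: the intro byte subtracts 1 and the three continuation bytes each add 1, for a net +2, which is exactly `len_utf8() - len_utf16()` for a character that needs a surrogate pair. An illustrative check, not part of the diff:

```rust
fn main() {
    let c = '😀'; // U+1F600: a 4-byte UTF-8 sequence and a UTF-16 surrogate pair
    assert_eq!(c.len_utf8(), 4);
    assert_eq!(c.len_utf16(), 2);
    // Byte-wise bookkeeping: -1 for the intro byte, +1 for each of the
    // three continuation bytes, so the line start moves by a net +2.
    let net_adjustment: i64 = -1 + 3;
    assert_eq!(net_adjustment as usize, c.len_utf8() - c.len_utf16());

    // 2- and 3-byte scalars need no intro special case: their 1 or 2
    // continuation bytes already equal len_utf8 - len_utf16, since both
    // kinds of scalar fit in a single UTF-16 code unit.
    assert_eq!('é'.len_utf8() - 'é'.len_utf16(), 1);
    assert_eq!('€'.len_utf8() - '€'.len_utf16(), 2);
}
```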
@@ -386,6 +396,16 @@ impl<'a> Tokenizer<'a> {
     fn consume_known_byte(&mut self, byte: u8) {
         debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
         self.position += 1;
+        // Continuation bytes contribute to column overcount.
+        if byte & 0xF0 == 0xF0 {
+            // This takes two UTF-16 characters to represent, so we
+            // actually have an undercount.
+            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
+        } else if byte & 0xC0 == 0x80 {
+            // Note that due to the special case for the 4-byte
+            // sequence intro, we must use wrapping add here.
+            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+        }
     }

     #[inline]
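
The wrapping calls matter when a 4-byte character opens a line: the intro adjustment runs before any continuation bytes have been counted, so the adjusted line start can momentarily dip below zero. A hypothetical worked example of that transient wrap:

```rust
fn main() {
    // A line starts at byte 0 and its first character is a 4-byte sequence.
    // The intro adjustment runs first, before any continuation bytes are seen.
    let line_start = 0usize;
    let after_intro = line_start.wrapping_sub(1); // wraps to usize::MAX
    let after_continuations = after_intro.wrapping_add(3); // three continuation bytes
    assert_eq!(after_continuations, 2); // net +2, back in range

    // After those 4 bytes, position == 4, so the column is 4 - 2 = 2
    // UTF-16 code units: the width of the surrogate pair.
    assert_eq!(4 - after_continuations, 2);
}
```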
@@ -416,7 +436,11 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     fn consume_char(&mut self) -> char {
         let c = self.next_char();
-        self.position += c.len_utf8();
+        let len_utf8 = c.len_utf8();
+        self.position += len_utf8;
+        // Note that due to the special case for the 4-byte sequence
+        // intro, we must use wrapping add here.
+        self.current_line_start_position = self.current_line_start_position.wrapping_add(len_utf8 - c.len_utf16());
         c
     }

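
`consume_char` applies in one step the same net adjustment that the byte-at-a-time paths above accumulate incrementally. The check below is not from the crate; it runs both bookkeeping strategies over the same string and confirms they agree:

```rust
fn main() {
    let s = "x¢€😀"; // 1-, 2-, 3- and 4-byte scalars

    // Character-wise adjustment, as in the new consume_char() code.
    let char_wise: usize = s.chars().map(|c| c.len_utf8() - c.len_utf16()).sum();

    // Byte-wise adjustment, as in consume_4byte_intro(),
    // consume_continuation_byte() and consume_known_byte().
    let mut byte_wise = 0usize;
    for &b in s.as_bytes() {
        if b & 0xF0 == 0xF0 {
            byte_wise = byte_wise.wrapping_sub(1); // 4-byte intro
        } else if b & 0xC0 == 0x80 {
            byte_wise = byte_wise.wrapping_add(1); // continuation byte
        }
    }

    assert_eq!(char_wise, byte_wise); // 0 + 1 + 2 + 2 = 5 either way
}
```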
@@ -498,6 +522,7 @@ pub struct SourceLocation {
     pub line: u32,

     /// The column number within a line, starting at 0 for the first character of the line.
+    /// Column numbers are in units of UTF-16 characters.
     pub column: u32,
 }

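
As a non-incremental reference for what this unit means: the column the tokenizer reports should match re-encoding the consumed prefix of the line as UTF-16 and counting code units. An illustrative check with made-up input:

```rust
fn main() {
    // Re-encode the consumed prefix of the line and count UTF-16 code units;
    // the tokenizer's incremental bookkeeping is meant to match this.
    let line = "a😀é and more";
    let byte_offset = "a😀é".len(); // pretend this much of the line was consumed
    let utf16_column = line[..byte_offset].encode_utf16().count() as u32;
    assert_eq!(utf16_column, 4); // 'a' = 1, '😀' = 2, 'é' = 1
}
```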
@@ -1126,6 +1151,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,

     if newlines > 0 {
         tokenizer.current_line_number += newlines;
+        // No need for wrapping_add here, because there's no possible
+        // way to wrap.
         tokenizer.current_line_start_position = start_position + last_newline + 1;
     }

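
Both operands of that reset are small, non-negative byte offsets into the same input, so the plain addition cannot wrap; and since only ASCII whitespace has been consumed since the last newline, no UTF-16 adjustment is needed either. A hypothetical illustration of the reset arithmetic, with made-up offsets:

```rust
fn main() {
    // `start_position` is where the whitespace run began and `last_newline`
    // is the offset of the last newline within that run, so the next line
    // starts one byte past it.
    let skipped = " \r\n\t ";      // whitespace consumed inside url(...)
    let start_position = 100usize; // made-up byte offset where the run began
    let last_newline = skipped
        .bytes()
        .rposition(|b| b == b'\n' || b == b'\r' || b == b'\x0C')
        .unwrap();
    let new_line_start = start_position + last_newline + 1;
    assert_eq!(new_line_start, 103); // the '\n' sits at offset 2 of the run
}
```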