Store utf8/utf16 position difference instead of counting utf16 positions
nchevobbe committed Apr 16, 2024
1 parent f3affa9 commit 75ecffa
Showing 2 changed files with 32 additions and 29 deletions.
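
For context, here is a minimal standalone sketch of the bookkeeping this commit switches to. It is not the crate's code; the Pos struct and the example values are invented for illustration. Instead of maintaining a separate UTF-16 position counter, the tokenizer accumulates the difference between the UTF-8 byte length and the UTF-16 code-unit length of what it has consumed, and subtracts that difference whenever a UTF-16 position is needed.

// Hypothetical sketch, not rust-cssparser code.
struct Pos {
    position: usize,          // byte offset into the input (UTF-8)
    position_difference: u16, // accumulated (len_utf8 - len_utf16)
}

impl Pos {
    fn consume_char(&mut self, c: char) {
        self.position += c.len_utf8();
        // len_utf8 >= len_utf16 for every scalar value, so the difference
        // never shrinks when counted per char; wrapping_add mirrors the
        // wrapping arithmetic used in the diff below.
        self.position_difference = self
            .position_difference
            .wrapping_add((c.len_utf8() - c.len_utf16()) as u16);
    }

    fn utf16_position(&self) -> usize {
        self.position - self.position_difference as usize
    }
}

fn main() {
    let mut p = Pos { position: 0, position_difference: 0 };
    for c in "é😀a".chars() {
        p.consume_char(c);
    }
    // 'é' is 2 UTF-8 bytes / 1 UTF-16 unit, '😀' is 4 / 2, 'a' is 1 / 1.
    assert_eq!(p.position, 7);
    assert_eq!(p.utf16_position(), 4);
}
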
src/parser.rs: 11 changes (8 additions, 3 deletions)
@@ -18,7 +18,8 @@ use std::ops::Range;
 pub struct ParserState {
     pub(crate) position: usize,
     pub(crate) current_line_start_position: usize,
-    pub(crate) current_position: usize,
+    pub(crate) current_line_start_difference: u16,
+    pub(crate) position_difference: u16,
     pub(crate) current_line_number: u32,
     pub(crate) at_start_of: Option<BlockType>,
 }

@@ -35,14 +36,18 @@ impl ParserState {
     pub fn source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position - self.current_line_start_position -
+                    (self.position_difference - self.current_line_start_difference) as usize +
+                    1
+            ) as u32,
         }
     }

     /// The position from the start of the input, counted in UTF-16 code units
     #[inline]
     pub fn utf16_position(&self) -> u32 {
-        self.current_position as u32
+        (self.position - self.position_difference as usize) as u32
     }
 }

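To make the new column formula concrete, here is a small worked example with a hypothetical helper that is not part of the crate; the parameter names mirror the fields in the diff above. Subtracting the difference accumulated since the start of the current line converts the byte-based column back into UTF-16 code units.

// Hypothetical helper, not rust-cssparser code; mirrors the expression in
// ParserState::source_location above.
fn column(
    position: usize,
    current_line_start_position: usize,
    position_difference: u16,
    current_line_start_difference: u16,
) -> u32 {
    (position - current_line_start_position
        - (position_difference - current_line_start_difference) as usize
        + 1) as u32
}

fn main() {
    // Line "aé😀b", cursor sitting just before 'b':
    //   position = 7 bytes ('a' = 1, 'é' = 2, '😀' = 4), line starts at byte 0,
    //   position_difference = 3 ('é' contributes 1, '😀' contributes 2),
    //   no difference had accumulated at the start of the line.
    // 7 - 0 - (3 - 0) + 1 = 5: 'a', 'é' and '😀' occupy 4 UTF-16 code units,
    // so 'b' sits at 1-based column 5.
    assert_eq!(column(7, 0, 3, 0), 5);
}
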
src/tokenizer.rs: 50 changes (24 additions, 26 deletions)
@@ -214,7 +214,8 @@ pub struct Tokenizer<'a> {
     /// ensure that computing the column will give the result in units
     /// of UTF-16 characters.
     current_line_start_position: usize,
-    current_position: usize,
+    position_difference: u16,
+    current_line_start_difference: u16,
     current_line_number: u32,
     var_or_env_functions: SeenStatus,
     source_map_url: Option<&'a str>,

@@ -235,8 +236,9 @@ impl<'a> Tokenizer<'a> {
             input,
             position: 0,
             current_line_start_position: 0,
-            current_position: 0,
+            current_line_start_difference: 0,
             current_line_number: 0,
+            position_difference: 0,
             var_or_env_functions: SeenStatus::DontCare,
             source_map_url: None,
             source_url: None,

@@ -279,7 +281,12 @@ impl<'a> Tokenizer<'a> {
     pub fn current_source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position -
+                    self.current_line_start_position -
+                    (self.position_difference - self.current_line_start_difference) as usize
+                    + 1
+            ) as u32,
         }
     }

@@ -298,7 +305,8 @@ impl<'a> Tokenizer<'a> {
         ParserState {
             position: self.position,
             current_line_start_position: self.current_line_start_position,
-            current_position: self.current_position,
+            current_line_start_difference: self.current_line_start_difference,
+            position_difference: self.position_difference,
             current_line_number: self.current_line_number,
             at_start_of: None,
         }

@@ -308,7 +316,8 @@ impl<'a> Tokenizer<'a> {
     pub fn reset(&mut self, state: &ParserState) {
         self.position = state.position;
         self.current_line_start_position = state.current_line_start_position;
-        self.current_position = state.current_position;
+        self.current_line_start_difference = state.current_line_start_difference;
+        self.position_difference = state.position_difference;
         self.current_line_number = state.current_line_number;
     }

@@ -374,7 +383,6 @@ impl<'a> Tokenizer<'a> {
                 debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
             }
         }
-        self.current_position = self.current_position.wrapping_add(n);
         self.position += n
     }

@@ -396,8 +404,7 @@ impl<'a> Tokenizer<'a> {
         debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
         // This takes two UTF-16 characters to represent, so we
         // actually have an undercount.
-        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-        self.current_position = self.current_position.wrapping_add(2);
+        self.position_difference = self.position_difference.wrapping_sub(1);
         self.position += 1;
     }

@@ -409,7 +416,7 @@ impl<'a> Tokenizer<'a> {
         // Continuation bytes contribute to column overcount. Note
         // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
-        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+        self.position_difference = self.position_difference.wrapping_add(1);
         self.position += 1;
     }

@@ -422,14 +429,11 @@ impl<'a> Tokenizer<'a> {
         if byte & 0xF0 == 0xF0 {
             // This takes two UTF-16 characters to represent, so we
             // actually have an undercount.
-            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-            self.current_position = self.current_position.wrapping_add(2);
+            self.position_difference = self.position_difference.wrapping_sub(1);
         } else if byte & 0xC0 == 0x80 {
             // Note that due to the special case for the 4-byte
             // sequence intro, we must use wrapping add here.
-            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
-        } else {
-            self.current_position = self.current_position.wrapping_add(1);
+            self.position_difference = self.position_difference.wrapping_add(1);
         }
     }

@@ -448,12 +452,11 @@ impl<'a> Tokenizer<'a> {
         let byte = self.next_byte_unchecked();
         debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
         self.position += 1;
-        self.current_position = self.current_position.wrapping_add(1);
         if byte == b'\r' && self.next_byte() == Some(b'\n') {
             self.position += 1;
-            self.current_position = self.current_position.wrapping_add(1);
         }
         self.current_line_start_position = self.position;
+        self.current_line_start_difference = self.position_difference;
         self.current_line_number += 1;
     }

@@ -467,14 +470,13 @@ impl<'a> Tokenizer<'a> {
     fn consume_char(&mut self) -> char {
         let c = self.next_char();
         let len_utf8 = c.len_utf8();
-        let len_utf16 = c.len_utf16();
         self.position += len_utf8;
         // Note that due to the special case for the 4-byte sequence
         // intro, we must use wrapping add here.
+        let len_utf16 = c.len_utf16();
-        self.current_line_start_position = self
-            .current_line_start_position
-            .wrapping_add(len_utf8 - len_utf16);
-        self.current_position = self.current_position.wrapping_add(len_utf16);
+        self.position_difference = self
+            .position_difference
+            .wrapping_add((len_utf8 - len_utf16) as u16);
         c
     }

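As a sanity check on the byte-level accounting above, this hypothetical snippet (not part of the crate) applies the same byte masks as consume_4byte_intro and consume_continuation_byte and confirms that, per string, they accumulate the same difference as a char-by-char len_utf8() - len_utf16() sum; signed arithmetic stands in for the wrapping u16 updates used in the diff.

// Hypothetical check, not rust-cssparser code.
fn difference_from_bytes(s: &str) -> i32 {
    let mut diff = 0i32;
    for &b in s.as_bytes() {
        if b & 0xF0 == 0xF0 {
            // 4-byte sequence intro: the character needs two UTF-16 units,
            // so the three continuation bytes below overshoot by one.
            diff -= 1;
        } else if b & 0xC0 == 0x80 {
            // Continuation byte: one extra UTF-8 byte per unit of difference.
            diff += 1;
        }
    }
    diff
}

fn difference_from_chars(s: &str) -> i32 {
    s.chars()
        .map(|c| c.len_utf8() as i32 - c.len_utf16() as i32)
        .sum()
}

fn main() {
    for s in ["ascii", "café", "München", "😀🦀", "mixé😀d"] {
        assert_eq!(difference_from_bytes(s), difference_from_chars(s));
    }
}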

@@ -1164,16 +1166,12 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
         };
         match_byte! { b,
-            b' ' | b'\t' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
-            },
+            b' ' | b'\t' => {},
             b'\n' | b'\x0C' => {
                 newlines += 1;
                 last_newline = offset;
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
             }
             b'\r' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
                 if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                     newlines += 1;
                     last_newline = offset;

0 comments on commit 75ecffa
