Store utf8/utf16 position difference instead of counting utf16 positions
nchevobbe committed Apr 16, 2024
1 parent f3affa9 commit 75ecffa
Showing 2 changed files with 32 additions and 29 deletions.
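
For context, here is a minimal standalone sketch of the bookkeeping this commit switches to. It is not the crate's code; the Pos struct and the example values are invented for illustration. Instead of maintaining a separate UTF-16 position counter, the tokenizer accumulates the difference between the UTF-8 byte length and the UTF-16 code-unit length of what it has consumed, and subtracts that difference whenever a UTF-16 position is needed.

// Hypothetical sketch, not rust-cssparser code.
struct Pos {
    position: usize,          // byte offset into the input (UTF-8)
    position_difference: u16, // accumulated (len_utf8 - len_utf16)
}

impl Pos {
    fn consume_char(&mut self, c: char) {
        self.position += c.len_utf8();
        // len_utf8 >= len_utf16 for every scalar value, so the difference
        // never shrinks when counted per char; wrapping_add mirrors the
        // wrapping arithmetic used in the diff below.
        self.position_difference = self
            .position_difference
            .wrapping_add((c.len_utf8() - c.len_utf16()) as u16);
    }

    fn utf16_position(&self) -> usize {
        self.position - self.position_difference as usize
    }
}

fn main() {
    let mut p = Pos { position: 0, position_difference: 0 };
    for c in "é😀a".chars() {
        p.consume_char(c);
    }
    // 'é' is 2 UTF-8 bytes / 1 UTF-16 unit, '😀' is 4 / 2, 'a' is 1 / 1.
    assert_eq!(p.position, 7);
    assert_eq!(p.utf16_position(), 4);
}
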
src/parser.rs: 11 changes (8 additions, 3 deletions)
@@ -18,7 +18,8 @@ use std::ops::Range;
 pub struct ParserState {
     pub(crate) position: usize,
     pub(crate) current_line_start_position: usize,
-    pub(crate) current_position: usize,
+    pub(crate) current_line_start_difference: u16,
+    pub(crate) position_difference: u16,
     pub(crate) current_line_number: u32,
     pub(crate) at_start_of: Option<BlockType>,
 }

@@ -35,14 +36,18 @@ impl ParserState {
     pub fn source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position - self.current_line_start_position -
+                    (self.position_difference - self.current_line_start_difference) as usize +
+                    1
+            ) as u32,
         }
     }

     /// The position from the start of the input, counted in UTF-16 code units
     #[inline]
     pub fn utf16_position(&self) -> u32 {
-        self.current_position as u32
+        (self.position - self.position_difference as usize) as u32
     }
 }

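To make the new column formula concrete, here is a small worked example with a hypothetical helper that is not part of the crate; the parameter names mirror the fields in the diff above. Subtracting the difference accumulated since the start of the current line converts the byte-based column back into UTF-16 code units.

// Hypothetical helper, not rust-cssparser code; mirrors the expression in
// ParserState::source_location above.
fn column(
    position: usize,
    current_line_start_position: usize,
    position_difference: u16,
    current_line_start_difference: u16,
) -> u32 {
    (position - current_line_start_position
        - (position_difference - current_line_start_difference) as usize
        + 1) as u32
}

fn main() {
    // Line "aé😀b", cursor sitting just before 'b':
    //   position = 7 bytes ('a' = 1, 'é' = 2, '😀' = 4), line starts at byte 0,
    //   position_difference = 3 ('é' contributes 1, '😀' contributes 2),
    //   no difference had accumulated at the start of the line.
    // 7 - 0 - (3 - 0) + 1 = 5: 'a', 'é' and '😀' occupy 4 UTF-16 code units,
    // so 'b' sits at 1-based column 5.
    assert_eq!(column(7, 0, 3, 0), 5);
}
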
src/tokenizer.rs: 50 changes (24 additions, 26 deletions)
@@ -214,7 +214,8 @@ pub struct Tokenizer<'a> {
     /// ensure that computing the column will give the result in units
     /// of UTF-16 characters.
     current_line_start_position: usize,
-    current_position: usize,
+    position_difference: u16,
+    current_line_start_difference: u16,
     current_line_number: u32,
     var_or_env_functions: SeenStatus,
     source_map_url: Option<&'a str>,

@@ -235,8 +236,9 @@ impl<'a> Tokenizer<'a> {
             input,
             position: 0,
             current_line_start_position: 0,
-            current_position: 0,
+            current_line_start_difference: 0,
             current_line_number: 0,
+            position_difference: 0,
             var_or_env_functions: SeenStatus::DontCare,
             source_map_url: None,
             source_url: None,

@@ -279,7 +281,12 @@ impl<'a> Tokenizer<'a> {
     pub fn current_source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position -
+                    self.current_line_start_position -
+                    (self.position_difference - self.current_line_start_difference) as usize
+                    + 1
+            ) as u32,
         }
     }

@@ -298,7 +305,8 @@ impl<'a> Tokenizer<'a> {
         ParserState {
             position: self.position,
             current_line_start_position: self.current_line_start_position,
-            current_position: self.current_position,
+            current_line_start_difference: self.current_line_start_difference,
+            position_difference: self.position_difference,
             current_line_number: self.current_line_number,
             at_start_of: None,
         }

@@ -308,7 +316,8 @@ impl<'a> Tokenizer<'a> {
     pub fn reset(&mut self, state: &ParserState) {
         self.position = state.position;
         self.current_line_start_position = state.current_line_start_position;
-        self.current_position = state.current_position;
+        self.current_line_start_difference = state.current_line_start_difference;
+        self.position_difference = state.position_difference;
         self.current_line_number = state.current_line_number;
     }

@@ -374,7 +383,6 @@ impl<'a> Tokenizer<'a> {
                 debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
             }
         }
-        self.current_position = self.current_position.wrapping_add(n);
         self.position += n
     }

@@ -396,8 +404,7 @@ impl<'a> Tokenizer<'a> {
         debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
         // This takes two UTF-16 characters to represent, so we
         // actually have an undercount.
-        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-        self.current_position = self.current_position.wrapping_add(2);
+        self.position_difference = self.position_difference.wrapping_sub(1);
         self.position += 1;
     }

@@ -409,7 +416,7 @@ impl<'a> Tokenizer<'a> {
         // Continuation bytes contribute to column overcount. Note
         // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
-        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+        self.position_difference = self.position_difference.wrapping_add(1);
         self.position += 1;
     }

@@ -422,14 +429,11 @@ impl<'a> Tokenizer<'a> {
         if byte & 0xF0 == 0xF0 {
             // This takes two UTF-16 characters to represent, so we
             // actually have an undercount.
-            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-            self.current_position = self.current_position.wrapping_add(2);
+            self.position_difference = self.position_difference.wrapping_sub(1);
         } else if byte & 0xC0 == 0x80 {
             // Note that due to the special case for the 4-byte
             // sequence intro, we must use wrapping add here.
-            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
-        } else {
-            self.current_position = self.current_position.wrapping_add(1);
+            self.position_difference = self.position_difference.wrapping_add(1);
         }
     }

@@ -448,12 +452,11 @@ impl<'a> Tokenizer<'a> {
         let byte = self.next_byte_unchecked();
         debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
         self.position += 1;
-        self.current_position = self.current_position.wrapping_add(1);
         if byte == b'\r' && self.next_byte() == Some(b'\n') {
             self.position += 1;
-            self.current_position = self.current_position.wrapping_add(1);
         }
         self.current_line_start_position = self.position;
+        self.current_line_start_difference = self.position_difference;
         self.current_line_number += 1;
     }

@@ -467,14 +470,13 @@ impl<'a> Tokenizer<'a> {
     fn consume_char(&mut self) -> char {
         let c = self.next_char();
         let len_utf8 = c.len_utf8();
-        let len_utf16 = c.len_utf16();
         self.position += len_utf8;
         // Note that due to the special case for the 4-byte sequence
         // intro, we must use wrapping add here.
+        let len_utf16 = c.len_utf16();
-        self.current_line_start_position = self
-            .current_line_start_position
-            .wrapping_add(len_utf8 - len_utf16);
-        self.current_position = self.current_position.wrapping_add(len_utf16);
+        self.position_difference = self
+            .position_difference
+            .wrapping_add((len_utf8 - len_utf16) as u16);
         c
     }

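As a sanity check on the byte-level accounting above, this hypothetical snippet (not part of the crate) applies the same byte masks as consume_4byte_intro and consume_continuation_byte and confirms that, per string, they accumulate the same difference as a char-by-char len_utf8() - len_utf16() sum; signed arithmetic stands in for the wrapping u16 updates used in the diff.

// Hypothetical check, not rust-cssparser code.
fn difference_from_bytes(s: &str) -> i32 {
    let mut diff = 0i32;
    for &b in s.as_bytes() {
        if b & 0xF0 == 0xF0 {
            // 4-byte sequence intro: the character needs two UTF-16 units,
            // so the three continuation bytes below overshoot by one.
            diff -= 1;
        } else if b & 0xC0 == 0x80 {
            // Continuation byte: one extra UTF-8 byte per unit of difference.
            diff += 1;
        }
    }
    diff
}

fn difference_from_chars(s: &str) -> i32 {
    s.chars()
        .map(|c| c.len_utf8() as i32 - c.len_utf16() as i32)
        .sum()
}

fn main() {
    for s in ["ascii", "café", "München", "😀🦀", "mixé😀d"] {
        assert_eq!(difference_from_bytes(s), difference_from_chars(s));
    }
}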

@@ -1164,16 +1166,12 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
         };
         match_byte! { b,
-            b' ' | b'\t' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
-            },
+            b' ' | b'\t' => {},
             b'\n' | b'\x0C' => {
                 newlines += 1;
                 last_newline = offset;
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
             }
             b'\r' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
                 if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                     newlines += 1;
                     last_newline = offset;

0 comments on commit 75ecffa
