Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More lexer improvements #102302

Merged
merged 15 commits on Sep 28, 2022
Merged
7 changes: 1 addition & 6 deletions compiler/rustc_ast/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use rustc_span::symbol::{kw, sym};
use rustc_span::symbol::{Ident, Symbol};
use rustc_span::{self, edition::Edition, Span, DUMMY_SP};
use std::borrow::Cow;
use std::{fmt, mem};
use std::fmt;

#[derive(Clone, Copy, PartialEq, Encodable, Decodable, Debug, HashStable_Generic)]
pub enum CommentKind {
Expand Down Expand Up @@ -335,11 +335,6 @@ impl Token {
Token::new(Ident(ident.name, ident.is_raw_guess()), ident.span)
}

/// Return this token by value and leave a dummy token in its place.
pub fn take(&mut self) -> Self {
mem::replace(self, Token::dummy())
}

/// For interpolated tokens, returns a span of the fragment to which the interpolated
/// token refers. For all other tokens this is just a regular span.
/// It is particularly important to use this for identifiers and lifetimes
Expand Down
3 changes: 2 additions & 1 deletion compiler/rustc_errors/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ pub mod translation;
pub use diagnostic_builder::IntoDiagnostic;
pub use snippet::Style;

pub type PResult<'a, T> = Result<T, DiagnosticBuilder<'a, ErrorGuaranteed>>;
pub type PErr<'a> = DiagnosticBuilder<'a, ErrorGuaranteed>;
pub type PResult<'a, T> = Result<T, PErr<'a>>;

// `PResult` is used a lot. Make sure it doesn't unintentionally get bigger.
// (See also the comment on `DiagnosticBuilder`'s `diagnostic` field.)
Expand Down
16 changes: 8 additions & 8 deletions compiler/rustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use std::str::Chars;
///
The next characters can be peeked via the `first` method,
and the position can be shifted forward via the `bump` method.
pub(crate) struct Cursor<'a> {
initial_len: usize,
pub struct Cursor<'a> {
nnethercote marked this conversation as resolved.
Show resolved Hide resolved
len_remaining: usize,
/// Iterator over chars. Slightly faster than a &str.
chars: Chars<'a>,
#[cfg(debug_assertions)]
Expand All @@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> {
pub(crate) const EOF_CHAR: char = '\0';

impl<'a> Cursor<'a> {
pub(crate) fn new(input: &'a str) -> Cursor<'a> {
pub fn new(input: &'a str) -> Cursor<'a> {
Cursor {
initial_len: input.len(),
len_remaining: input.len(),
chars: input.chars(),
#[cfg(debug_assertions)]
prev: EOF_CHAR,
Expand Down Expand Up @@ -61,13 +61,13 @@ impl<'a> Cursor<'a> {
}

/// Returns the number of symbols consumed so far.
pub(crate) fn len_consumed(&self) -> u32 {
(self.initial_len - self.chars.as_str().len()) as u32
pub(crate) fn pos_within_token(&self) -> u32 {
(self.len_remaining - self.chars.as_str().len()) as u32
}

/// Resets the number of bytes consumed to 0.
pub(crate) fn reset_len_consumed(&mut self) {
self.initial_len = self.chars.as_str().len();
pub(crate) fn reset_pos_within_token(&mut self) {
self.len_remaining = self.chars.as_str().len();
}

/// Moves to the next character.
Expand Down
51 changes: 24 additions & 27 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
// We want to be able to build this crate with a stable compiler, so no
// `#![feature]` attributes should be added.

mod cursor;
pub mod cursor;
nnethercote marked this conversation as resolved.
Show resolved Hide resolved
pub mod unescape;

#[cfg(test)]
Expand Down Expand Up @@ -139,6 +139,9 @@ pub enum TokenKind {

/// Unknown token, not expected by the lexer, e.g. "№"
Unknown,

/// End of input.
Eof,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
Expand Down Expand Up @@ -219,13 +222,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
None
}

/// Parses the first token from the provided input string.
#[inline]
pub fn first_token(input: &str) -> Token {
nnethercote marked this conversation as resolved.
Show resolved Hide resolved
debug_assert!(!input.is_empty());
Cursor::new(input).advance_token()
}

/// Validates a raw string literal. Used to get more information about a
/// problem with a `RawStr`/`RawByteStr` whose validity field is `None`.
#[inline]
Expand All @@ -243,12 +239,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
let mut cursor = Cursor::new(input);
std::iter::from_fn(move || {
if cursor.is_eof() {
None
} else {
cursor.reset_len_consumed();
Some(cursor.advance_token())
}
let token = cursor.advance_token();
if token.kind != TokenKind::Eof { Some(token) } else { None }
})
}

Expand Down Expand Up @@ -311,8 +303,11 @@ pub fn is_ident(string: &str) -> bool {

impl Cursor<'_> {
/// Parses a token from the input string.
fn advance_token(&mut self) -> Token {
let first_char = self.bump().unwrap();
pub fn advance_token(&mut self) -> Token {
let first_char = match self.bump() {
Some(c) => c,
None => return Token::new(TokenKind::Eof, 0),
};
let token_kind = match first_char {
// Slash, comment or block comment.
'/' => match self.first() {
Expand All @@ -329,7 +324,7 @@ impl Cursor<'_> {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let res = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if res.is_ok() {
self.eat_literal_suffix();
}
Expand All @@ -344,7 +339,7 @@ impl Cursor<'_> {
('\'', _) => {
self.bump();
let terminated = self.single_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -354,7 +349,7 @@ impl Cursor<'_> {
('"', _) => {
self.bump();
let terminated = self.double_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -364,7 +359,7 @@ impl Cursor<'_> {
('r', '"') | ('r', '#') => {
self.bump();
let res = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if res.is_ok() {
self.eat_literal_suffix();
}
Expand All @@ -381,7 +376,7 @@ impl Cursor<'_> {
// Numeric literal.
c @ '0'..='9' => {
let literal_kind = self.number(c);
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
self.eat_literal_suffix();
TokenKind::Literal { kind: literal_kind, suffix_start }
}
Expand Down Expand Up @@ -420,7 +415,7 @@ impl Cursor<'_> {
// String literal.
'"' => {
let terminated = self.double_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -433,7 +428,9 @@ impl Cursor<'_> {
}
_ => Unknown,
};
Token::new(token_kind, self.len_consumed())
let res = Token::new(token_kind, self.pos_within_token());
self.reset_pos_within_token();
res
}

fn line_comment(&mut self) -> TokenKind {
Expand Down Expand Up @@ -618,7 +615,7 @@ impl Cursor<'_> {

if !can_be_a_lifetime {
let terminated = self.single_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -643,7 +640,7 @@ impl Cursor<'_> {
if self.first() == '\'' {
self.bump();
let kind = Char { terminated: true };
Literal { kind, suffix_start: self.len_consumed() }
Literal { kind, suffix_start: self.pos_within_token() }
} else {
Lifetime { starts_with_number }
}
Expand Down Expand Up @@ -724,7 +721,7 @@ impl Cursor<'_> {

fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
debug_assert!(self.prev() == 'r');
let start_pos = self.len_consumed();
let start_pos = self.pos_within_token();
let mut possible_terminator_offset = None;
let mut max_hashes = 0;

Expand Down Expand Up @@ -778,7 +775,7 @@ impl Cursor<'_> {
// Keep track of possible terminators to give a hint about
// where there might be a missing terminator
possible_terminator_offset =
Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
max_hashes = n_end_hashes;
}
}
Expand Down
Loading