Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More lexer improvements #102302

Merged
merged 15 commits
Sep 28, 2022
Merged
4 changes: 2 additions & 2 deletions compiler/rustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::str::Chars;
///
/// Next characters can be peeked via `first` method,
/// and position can be shifted forward via `bump` method.
pub(crate) struct Cursor<'a> {
pub struct Cursor<'a> {
nnethercote marked this conversation as resolved.
Show resolved Hide resolved
initial_len: usize,
/// Iterator over chars. Slightly faster than a &str.
chars: Chars<'a>,
Expand All @@ -15,7 +15,7 @@ pub(crate) struct Cursor<'a> {
/// Sentinel character returned by the cursor's lookahead methods when there is
/// no more input — presumably callers are expected to check `is_eof` rather
/// than rely on `'\0'` never appearing in real source; TODO confirm.
pub(crate) const EOF_CHAR: char = '\0';

impl<'a> Cursor<'a> {
pub(crate) fn new(input: &'a str) -> Cursor<'a> {
pub fn new(input: &'a str) -> Cursor<'a> {
Cursor {
initial_len: input.len(),
chars: input.chars(),
Expand Down
26 changes: 7 additions & 19 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
// We want to be able to build this crate with a stable compiler, so no
// `#![feature]` attributes should be added.

mod cursor;
pub mod cursor;
nnethercote marked this conversation as resolved.
Show resolved Hide resolved
pub mod unescape;

#[cfg(test)]
Expand Down Expand Up @@ -219,13 +219,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
None
}

/// Parses the first token from the provided input string.
#[inline]
pub fn first_token(input: &str) -> Token {
nnethercote marked this conversation as resolved.
Show resolved Hide resolved
debug_assert!(!input.is_empty());
Cursor::new(input).advance_token()
}

/// Validates a raw string literal. Used for getting more information about a
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
#[inline]
Expand All @@ -242,14 +235,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
/// Creates an iterator that produces tokens from the input string.
///
/// The iterator is lazy: each `next` call lexes exactly one token via
/// `Cursor::advance_token`, and the stream ends (`None`) once the cursor
/// has consumed all of `input`.
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
    let mut cursor = Cursor::new(input);
    // `advance_token` returns `None` at end of input, which is exactly the
    // termination condition `std::iter::from_fn` expects, so no explicit
    // `is_eof` check is needed here.
    std::iter::from_fn(move || cursor.advance_token())
}

/// True if `c` is considered a whitespace according to Rust language definition.
Expand Down Expand Up @@ -311,8 +297,8 @@ pub fn is_ident(string: &str) -> bool {

impl Cursor<'_> {
/// Parses a token from the input string.
fn advance_token(&mut self) -> Token {
let first_char = self.bump().unwrap();
pub fn advance_token(&mut self) -> Option<Token> {
let first_char = self.bump()?;
let token_kind = match first_char {
// Slash, comment or block comment.
'/' => match self.first() {
Expand Down Expand Up @@ -433,7 +419,9 @@ impl Cursor<'_> {
}
_ => Unknown,
};
Token::new(token_kind, self.len_consumed())
let res = Some(Token::new(token_kind, self.len_consumed()));
self.reset_len_consumed();
res
}

fn line_comment(&mut self) -> TokenKind {
Expand Down
23 changes: 13 additions & 10 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
use rustc_ast::tokenstream::TokenStream;
use rustc_ast::util::unicode::contains_text_flow_control_chars;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult};
use rustc_lexer::cursor::Cursor;
use rustc_lexer::unescape::{self, Mode};
use rustc_lexer::{Base, DocStyle, RawStrError};
use rustc_session::lint::builtin::{
Expand Down Expand Up @@ -48,7 +49,9 @@ pub(crate) fn parse_token_trees<'a>(
start_pos = start_pos + BytePos::from_usize(shebang_len);
}

let string_reader = StringReader { sess, start_pos, pos: start_pos, src, override_span };
let cursor = Cursor::new(src);
let string_reader =
StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
tokentrees::TokenTreesReader::parse_token_trees(string_reader)
}

Expand All @@ -60,6 +63,8 @@ struct StringReader<'a> {
pos: BytePos,
/// Source text to tokenize.
src: &'a str,
/// Cursor for getting lexer tokens.
cursor: Cursor<'a>,
override_span: Option<Span>,
}

Expand All @@ -75,15 +80,13 @@ impl<'a> StringReader<'a> {

// Skip trivial (whitespace & comments) tokens
loop {
let start_src_index = self.src_index(self.pos);
let text: &str = &self.src[start_src_index..];

if text.is_empty() {
let span = self.mk_sp(self.pos, self.pos);
return (Token::new(token::Eof, span), preceded_by_whitespace);
}

let token = rustc_lexer::first_token(text);
let token = match self.cursor.advance_token() {
Some(token) => token,
None => {
let span = self.mk_sp(self.pos, self.pos);
return (Token::new(token::Eof, span), preceded_by_whitespace);
}
};

let start = self.pos;
self.pos = self.pos + BytePos(token.len);
Expand Down
9 changes: 4 additions & 5 deletions src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use std::collections::VecDeque;
use std::fmt::{Display, Write};

use rustc_data_structures::fx::FxHashMap;
use rustc_lexer::cursor::Cursor;
use rustc_lexer::{LiteralKind, TokenKind};
use rustc_span::edition::Edition;
use rustc_span::symbol::Symbol;
Expand Down Expand Up @@ -408,15 +409,13 @@ enum Highlight<'a> {

struct TokenIter<'a> {
src: &'a str,
cursor: Cursor<'a>,
}

impl<'a> Iterator for TokenIter<'a> {
type Item = (TokenKind, &'a str);
fn next(&mut self) -> Option<(TokenKind, &'a str)> {
if self.src.is_empty() {
return None;
}
let token = rustc_lexer::first_token(self.src);
let token = self.cursor.advance_token()?;
let (text, rest) = self.src.split_at(token.len as usize);
self.src = rest;
Some((token.kind, text))
Expand Down Expand Up @@ -525,7 +524,7 @@ impl<'a> Classifier<'a> {
/// Takes as argument the source code to HTML-ify, the rust edition to use and the source code
/// file span which will be used later on by the `span_correspondance_map`.
fn new(src: &str, file_span: Span, decoration_info: Option<DecorationInfo>) -> Classifier<'_> {
let tokens = PeekIter::new(TokenIter { src });
let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) });
let decorations = decoration_info.map(Decorations::new);
Classifier {
tokens,
Expand Down