// rustc_lexer/lib.rs

1//! Low-level Rust lexer.
2//!
3//! The idea with `rustc_lexer` is to make a reusable library,
4//! by separating out pure lexing and rustc-specific concerns, like spans,
5//! error reporting, and interning. So, rustc_lexer operates directly on `&str`,
6//! produces simple tokens which are a pair of type-tag and a bit of original text,
7//! and does not report errors, instead storing them as flags on the token.
8//!
9//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
10//! For that see [`rustc_parse::lexer`], which converts this basic token stream
11//! into wide tokens used by actual parser.
12//!
13//! The purpose of this crate is to convert raw sources into a labeled sequence
14//! of well-known token types, so building an actual Rust token stream will
15//! be easier.
16//!
17//! The main entity of this crate is the [`TokenKind`] enum which represents common
18//! lexeme types.
19//!
20//! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html
21
22// tidy-alphabetical-start
23// We want to be able to build this crate with a stable compiler,
24// so no `#![feature]` attributes should be added.
25#![deny(unstable_features)]
26// tidy-alphabetical-end
27
28mod cursor;
29
30#[cfg(test)]
31mod tests;
32
33use LiteralKind::*;
34use TokenKind::*;
35use cursor::EOF_CHAR;
36pub use cursor::{Cursor, FrontmatterAllowed};
37use unicode_properties::UnicodeEmoji;
38pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
39
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
/// only the type of the token and its size.
#[derive(Debug)]
pub struct Token {
    // Syntactic category of the token.
    pub kind: TokenKind,
    // Length of the token in the original source text.
    pub len: u32,
}
48
49impl Token {
50    fn new(kind: TokenKind, len: u32) -> Token {
51        Token { kind, len }
52    }
53}
54
/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A line comment, e.g. `// comment`.
    LineComment {
        doc_style: Option<DocStyle>,
    },

    /// A block comment, e.g. `/* block comment */`.
    ///
    /// Block comments can be recursive, so a sequence like `/* /* */`
    /// will not be considered terminated and will result in a parsing error.
    BlockComment {
        doc_style: Option<DocStyle>,
        terminated: bool,
    },

    /// Any whitespace character sequence.
    Whitespace,

    /// A `---`-delimited frontmatter block at the start of the file.
    /// The flags record recoverable problems for the parser to report later.
    Frontmatter {
        /// The whitespace run preceding the opening dashes did not end with a
        /// newline, i.e. the opening was not at the start of a line.
        has_invalid_preceding_whitespace: bool,
        /// Unexpected content followed the infostring on the opening line.
        invalid_infostring: bool,
    },

    /// An identifier or keyword, e.g. `ident` or `continue`.
    Ident,

    /// An identifier that is invalid because it contains emoji.
    InvalidIdent,

    /// A raw identifier, e.g. "r#ident".
    RawIdent,

    /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
    /// literal prefixes that contain emoji, which are considered "invalid".
    ///
    /// Note that only the
    /// prefix (`foo`) is included in the token, not the separator (which is
    /// lexed as its own distinct token). In Rust 2021 and later, reserved
    /// prefixes are reported as errors; in earlier editions, they result in a
    /// (allowed by default) lint, and are treated as regular identifier
    /// tokens.
    UnknownPrefix,

    /// An unknown prefix in a lifetime, like `'foo#`.
    ///
    /// Like `UnknownPrefix`, only the `'` and prefix are included in the token
    /// and not the separator.
    UnknownPrefixLifetime,

    /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
    /// several tokens: `'r` and `#` and `foo`.
    RawLifetime,

    /// Guarded string literal prefix: `#"` or `##`.
    ///
    /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
    /// Split into the component tokens on older editions.
    GuardedStrPrefix,

    /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
    /// suffix, but may be present here on string and float literals. Users of
    /// this type will need to check for and reject that case.
    ///
    /// See [LiteralKind] for more details.
    Literal {
        kind: LiteralKind,
        // Offset within the token at which the suffix (if any) begins.
        suffix_start: u32,
    },

    /// A lifetime, e.g. `'a`.
    Lifetime {
        starts_with_number: bool,
    },

    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `@`
    At,
    /// `#`
    Pound,
    /// `~`
    Tilde,
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `$`
    Dollar,
    /// `=`
    Eq,
    /// `!`
    Bang,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `-`
    Minus,
    /// `&`
    And,
    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^`
    Caret,
    /// `%`
    Percent,

    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,

    /// End of input.
    Eof,
}
192
/// Whether a doc comment documents the item that follows it (`Outer`, as in
/// `///` / `/**`) or the enclosing item (`Inner`, as in `//!` / `/*!`).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DocStyle {
    /// `///` or `/** */`.
    Outer,
    /// `//!` or `/*! */`.
    Inner,
}
198
/// Enum representing the literal types supported by the lexer.
///
/// Note that the suffix is *not* considered when deciding the `LiteralKind` in
/// this type. This means that float literals like `1f32` are classified by this
/// type as `Int`. (Compare against `rustc_ast::token::LitKind` and
/// `rustc_ast::ast::LitKind`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// `12_u8`, `0o100`, `0b120i99`, `1f32`. `empty_int` is true when a base
    /// prefix has no digits after it, e.g. a bare `0b`.
    Int { base: Base, empty_int: bool },
    /// `12.34f32`, `1e3`, but not `1f32`. `empty_exponent` is true when the
    /// `e`/`E` exponent marker is not followed by a valid exponent.
    Float { base: Base, empty_exponent: bool },
    /// `'a'`, `'\\'`, `'''`, `';`
    Char { terminated: bool },
    /// `b'a'`, `b'\\'`, `b'''`, `b';`
    Byte { terminated: bool },
    /// `"abc"`, `"abc`
    Str { terminated: bool },
    /// `b"abc"`, `b"abc`
    ByteStr { terminated: bool },
    /// `c"abc"`, `c"abc`
    CStr { terminated: bool },
    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
    /// an invalid literal.
    RawStr { n_hashes: Option<u8> },
    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
    /// indicates an invalid literal.
    RawByteStr { n_hashes: Option<u8> },
    /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal.
    RawCStr { n_hashes: Option<u8> },
}
230
/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
///
/// Can capture fewer closing hashes than starting hashes,
/// for more efficient lexing and better backwards diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct GuardedStr {
    // Number of leading `#`s before the opening quote.
    pub n_hashes: u32,
    // Whether a closing quote (with hashes) was found.
    pub terminated: bool,
    // Total length of the guarded-string token.
    pub token_len: u32,
}
241
/// Error encountered while lexing or validating a raw string literal
/// (see [`validate_raw_str`]).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
    InvalidStarter { bad_char: char },
    /// The string was not terminated, e.g. `r###"abcde"##`.
    /// `possible_terminator_offset` is the number of characters after `r` or
    /// `br` where they may have intended to terminate it.
    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
    /// More than 255 `#`s exist.
    TooManyDelimiters { found: u32 },
}
253
/// Base of numeric literal encoding according to its prefix.
///
/// The discriminant of each variant equals its radix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with "0b".
    Binary = 2,
    /// Literal starts with "0o".
    Octal = 8,
    /// Literal doesn't contain a prefix.
    Decimal = 10,
    /// Literal starts with "0x".
    Hexadecimal = 16,
}
266
267/// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun",
268/// but shebang isn't a part of rust syntax.
269pub fn strip_shebang(input: &str) -> Option<usize> {
270    // Shebang must start with `#!` literally, without any preceding whitespace.
271    // For simplicity we consider any line starting with `#!` a shebang,
272    // regardless of restrictions put on shebangs by specific platforms.
273    if let Some(input_tail) = input.strip_prefix("#!") {
274        // Ok, this is a shebang but if the next non-whitespace token is `[`,
275        // then it may be valid Rust code, so consider it Rust code.
276        let next_non_whitespace_token =
277            tokenize(input_tail, FrontmatterAllowed::No).map(|tok| tok.kind).find(|tok| {
278                !matches!(
279                    tok,
280                    TokenKind::Whitespace
281                        | TokenKind::LineComment { doc_style: None }
282                        | TokenKind::BlockComment { doc_style: None, .. }
283                )
284            });
285        if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
286            // No other choice than to consider this a shebang.
287            return Some(2 + input_tail.lines().next().unwrap_or_default().len());
288        }
289    }
290    None
291}
292
293/// Validates a raw string literal. Used for getting more information about a
294/// problem with a `RawStr`/`RawByteStr` with a `None` field.
295#[inline]
296pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
297    debug_assert!(!input.is_empty());
298    let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
299    // Move past the leading `r` or `br`.
300    for _ in 0..prefix_len {
301        cursor.bump().unwrap();
302    }
303    cursor.raw_double_quoted_string(prefix_len).map(|_| ())
304}
305
306/// Creates an iterator that produces tokens from the input string.
307///
308/// When parsing a full Rust document,
309/// first [`strip_shebang`] and then allow frontmatters with [`FrontmatterAllowed::Yes`].
310///
311/// When tokenizing a slice of a document, be sure to disallow frontmatters with [`FrontmatterAllowed::No`]
312pub fn tokenize(
313    input: &str,
314    frontmatter_allowed: FrontmatterAllowed,
315) -> impl Iterator<Item = Token> {
316    let mut cursor = Cursor::new(input, frontmatter_allowed);
317    std::iter::from_fn(move || {
318        let token = cursor.advance_token();
319        if token.kind != TokenKind::Eof { Some(token) } else { None }
320    })
321}
322
/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
    // This is the Pattern_White_Space set. It is stable across Unicode
    // versions, so hard-coding the members here is safe.
    matches!(
        c,
        // The usual ASCII suspects: tab, newline, vertical tab, form feed,
        // carriage return, space.
        '\t' | '\n' | '\u{000B}' | '\u{000C}' | '\r' | ' '
            // NEXT LINE from latin1.
            | '\u{0085}'
            // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK.
            | '\u{200E}' | '\u{200F}'
            // Dedicated whitespace characters from Unicode:
            // LINE SEPARATOR, PARAGRAPH SEPARATOR.
            | '\u{2028}' | '\u{2029}'
    )
}
354
355/// True if `c` is valid as a first character of an identifier.
356/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
357/// a formal definition of valid identifier name.
358pub fn is_id_start(c: char) -> bool {
359    // This is XID_Start OR '_' (which formally is not a XID_Start).
360    c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
361}
362
/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    // Thin wrapper over the XID_Continue property; `_` is already included
    // in XID_Continue, so no special case is needed here.
    unicode_xid::UnicodeXID::is_xid_continue(c)
}
369
370/// The passed string is lexically an identifier.
371pub fn is_ident(string: &str) -> bool {
372    let mut chars = string.chars();
373    if let Some(start) = chars.next() {
374        is_id_start(start) && chars.all(is_id_continue)
375    } else {
376        false
377    }
378}
379
380impl Cursor<'_> {
    /// Parses a token from the input string.
    ///
    /// Consumes exactly one token's worth of input and returns its kind and
    /// length; once the input is exhausted, returns a zero-length `Eof` token.
    pub fn advance_token(&mut self) -> Token {
        let Some(first_char) = self.bump() else {
            return Token::new(TokenKind::Eof, 0);
        };

        let token_kind = match first_char {
            // While frontmatter is still allowed, leading whitespace needs
            // special treatment: a `---` opener that isn't preceded by a
            // newline is invalid and must be merged with the whitespace.
            c if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
                && is_whitespace(c) =>
            {
                let mut last = first_char;
                while is_whitespace(self.first()) {
                    let Some(c) = self.bump() else {
                        break;
                    };
                    last = c;
                }
                // invalid frontmatter opening as whitespace preceding it isn't newline.
                // combine the whitespace and the frontmatter to a single token as we shall
                // error later.
                if last != '\n' && self.as_str().starts_with("---") {
                    self.bump();
                    self.frontmatter(true)
                } else {
                    Whitespace
                }
            }
            '-' if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
                && self.as_str().starts_with("--") =>
            {
                // happy path: `---` at the very start of the remaining input.
                self.frontmatter(false)
            }
            // Slash, comment or block comment.
            '/' => match self.first() {
                '/' => self.line_comment(),
                '*' => self.block_comment(),
                _ => Slash,
            },

            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

            // Raw identifier, raw string literal or identifier.
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let res = self.raw_double_quoted_string(1);
                    let suffix_start = self.pos_within_token();
                    // Only a well-formed raw string can carry a suffix.
                    if res.is_ok() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawStr { n_hashes: res.ok() };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Byte literal, byte string literal, raw byte string literal or identifier.
            'b' => self.c_or_byte_string(
                |terminated| ByteStr { terminated },
                |n_hashes| RawByteStr { n_hashes },
                Some(|terminated| Byte { terminated }),
            ),

            // c-string literal, raw c-string literal or identifier.
            'c' => self.c_or_byte_string(
                |terminated| CStr { terminated },
                |n_hashes| RawCStr { n_hashes },
                None,
            ),

            // Identifier (this should be checked after other variant that can
            // start as identifier).
            c if is_id_start(c) => self.ident_or_unknown_prefix(),

            // Numeric literal.
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                let suffix_start = self.pos_within_token();
                self.eat_literal_suffix();
                TokenKind::Literal { kind: literal_kind, suffix_start }
            }

            // Guarded string literal prefix: `#"` or `##`
            '#' if matches!(self.first(), '"' | '#') => {
                self.bump();
                TokenKind::GuardedStrPrefix
            }

            // One-symbol tokens.
            ';' => Semi,
            ',' => Comma,
            '.' => Dot,
            '(' => OpenParen,
            ')' => CloseParen,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '@' => At,
            '#' => Pound,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '$' => Dollar,
            '=' => Eq,
            '!' => Bang,
            '<' => Lt,
            '>' => Gt,
            '-' => Minus,
            '&' => And,
            '|' => Or,
            '+' => Plus,
            '*' => Star,
            '^' => Caret,
            '%' => Percent,

            // Lifetime or character literal.
            '\'' => self.lifetime_or_char(),

            // String literal.
            '"' => {
                let terminated = self.double_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = Str { terminated };
                Literal { kind, suffix_start }
            }
            // Identifier starting with an emoji. Only lexed for graceful error recovery.
            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
            _ => Unknown,
        };
        if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
            && !matches!(token_kind, Whitespace)
        {
            // stop allowing frontmatters after first non-whitespace token
            self.frontmatter_allowed = FrontmatterAllowed::No;
        }
        let res = Token::new(token_kind, self.pos_within_token());
        self.reset_pos_within_token();
        res
    }
526
    /// Given that one `-` was eaten, eat the rest of the frontmatter.
    ///
    /// `has_invalid_preceding_whitespace` is forwarded unchanged into the
    /// returned `Frontmatter` token. The caller must have ensured that the
    /// opening has at least three dashes in total.
    fn frontmatter(&mut self, has_invalid_preceding_whitespace: bool) -> TokenKind {
        debug_assert_eq!('-', self.prev());

        let pos = self.pos_within_token();
        self.eat_while(|c| c == '-');

        // one `-` is eaten by the caller.
        let length_opening = self.pos_within_token() - pos + 1;

        // must be ensured by the caller
        debug_assert!(length_opening >= 3);

        // whitespace between the opening and the infostring.
        self.eat_while(|ch| ch != '\n' && is_whitespace(ch));

        // copied from `eat_identifier`, but allows `.` in infostring to allow something like
        // `---Cargo.toml` as a valid opener
        if is_id_start(self.first()) {
            self.bump();
            self.eat_while(|c| is_id_continue(c) || c == '.');
        }

        self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
        // Anything but a newline at this point means the opening line carries
        // unexpected trailing content.
        let invalid_infostring = self.first() != '\n';

        // Search for a closing dash run of (at least) the opening's length
        // that sits on its own line, i.e. preceded only by whitespace.
        let mut s = self.as_str();
        let mut found = false;
        let mut size = 0;
        while let Some(closing) = s.find(&"-".repeat(length_opening as usize)) {
            let preceding_chars_start = s[..closing].rfind("\n").map_or(0, |i| i + 1);
            if s[preceding_chars_start..closing].chars().all(is_whitespace) {
                // candidate found
                self.bump_bytes(size + closing);
                // in case like
                // ---cargo
                // --- blahblah
                // or
                // ---cargo
                // ----
                // combine those stuff into this frontmatter token such that it gets detected later.
                self.eat_until(b'\n');
                found = true;
                break;
            } else {
                s = &s[closing + length_opening as usize..];
                size += closing + length_opening as usize;
            }
        }

        if !found {
            // recovery strategy: a closing statement might have preceding whitespace/newline
            // but not have enough dashes to properly close. In this case, we eat until there,
            // and report a mismatch in the parser.
            let mut rest = self.as_str();
            // We can look for a shorter closing (starting with four dashes but closing with three)
            // and other indications that Rust has started and the infostring has ended.
            let mut potential_closing = rest
                .find("\n---")
                // n.b. only in the case where there are dashes, we move the index to the line where
                // the dashes start as we eat to include that line. For other cases those are Rust code
                // and not included in the frontmatter.
                .map(|x| x + 1)
                .or_else(|| rest.find("\nuse "))
                .or_else(|| rest.find("\n//!"))
                .or_else(|| rest.find("\n#!["));

            if potential_closing.is_none() {
                // a less fortunate recovery if all else fails which finds any dashes preceded by whitespace
                // on a standalone line. Might be wrong.
                while let Some(closing) = rest.find("---") {
                    let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1);
                    if rest[preceding_chars_start..closing].chars().all(is_whitespace) {
                        // candidate found
                        potential_closing = Some(closing);
                        break;
                    } else {
                        rest = &rest[closing + 3..];
                    }
                }
            }

            if let Some(potential_closing) = potential_closing {
                // bump to the potential closing, and eat everything on that line.
                self.bump_bytes(potential_closing);
                self.eat_until(b'\n');
            } else {
                // eat everything. this will get reported as an unclosed frontmatter.
                self.eat_while(|_| true);
            }
        }

        Frontmatter { has_invalid_preceding_whitespace, invalid_infostring }
    }
621
    /// Lexes a `//` line comment; the leading `/` has already been consumed.
    /// Classifies the doc style from the third character and eats the rest of
    /// the line.
    fn line_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '/');
        self.bump();

        let doc_style = match self.first() {
            // `//!` is an inner line doc comment.
            '!' => Some(DocStyle::Inner),
            // `////` (more than 3 slashes) is not considered a doc comment.
            '/' if self.second() != '/' => Some(DocStyle::Outer),
            _ => None,
        };

        self.eat_until(b'\n');
        LineComment { doc_style }
    }
637
    /// Lexes a `/* ... */` block comment; the leading `/` has already been
    /// consumed. Block comments nest, so a depth counter is kept;
    /// `terminated` is false when EOF is reached before the depth drops
    /// back to zero.
    fn block_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '*');
        self.bump();

        let doc_style = match self.first() {
            // `/*!` is an inner block doc comment.
            '!' => Some(DocStyle::Inner),
            // `/***` (more than 2 stars) is not considered a doc comment.
            // `/**/` is not considered a doc comment.
            '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
            _ => None,
        };

        let mut depth = 1usize;
        while let Some(c) = self.bump() {
            match c {
                '/' if self.first() == '*' => {
                    self.bump();
                    depth += 1;
                }
                '*' if self.first() == '/' => {
                    self.bump();
                    depth -= 1;
                    if depth == 0 {
                        // This block comment is closed, so for a construction like "/* */ */"
                        // there will be a successfully parsed block comment "/* */"
                        // and " */" will be processed separately.
                        break;
                    }
                }
                _ => (),
            }
        }

        BlockComment { doc_style, terminated: depth == 0 }
    }
674
    /// Eats a maximal run of whitespace characters; the first one has already
    /// been consumed.
    fn whitespace(&mut self) -> TokenKind {
        debug_assert!(is_whitespace(self.prev()));
        self.eat_while(is_whitespace);
        Whitespace
    }
680
    /// Lexes a raw identifier such as `r#ident`. The `r` has already been
    /// consumed and the caller has verified that `#` followed by an
    /// identifier-start character comes next.
    fn raw_ident(&mut self) -> TokenKind {
        debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second()));
        // Eat "#" symbol.
        self.bump();
        // Eat the identifier part of RawIdent.
        self.eat_identifier();
        RawIdent
    }
689
    /// Eats the rest of an identifier and classifies it: followed by `#`, `"`,
    /// or `'` it is an unknown literal prefix; containing emoji it degrades
    /// to an invalid identifier; otherwise it is a plain identifier.
    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
        debug_assert!(is_id_start(self.prev()));
        // Start is already eaten, eat the rest of identifier.
        self.eat_while(is_id_continue);
        // Known prefixes must have been handled earlier. So if
        // we see a prefix here, it is definitely an unknown prefix.
        match self.first() {
            '#' | '"' | '\'' => UnknownPrefix,
            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
            _ => Ident,
        }
    }
702
    /// Error-recovery path: eats an identifier-like run that contains emoji
    /// (including zero-width joiners between them) and returns `InvalidIdent`.
    fn invalid_ident(&mut self) -> TokenKind {
        // Start is already eaten, eat the rest of identifier.
        self.eat_while(|c| {
            const ZERO_WIDTH_JOINER: char = '\u{200d}';
            is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
        });
        // An invalid identifier followed by '#' or '"' or '\'' could be
        // interpreted as an invalid literal prefix. We don't bother doing that
        // because the treatment of invalid identifiers and invalid prefixes
        // would be the same.
        InvalidIdent
    }
715
    /// Lexes the remainder of a literal whose `b` or `c` prefix has already
    /// been consumed, falling back to identifier lexing when no quote follows.
    ///
    /// * `mk_kind` — builds the kind for a plain double-quoted literal.
    /// * `mk_kind_raw` — builds the kind for a raw double-quoted literal.
    /// * `single_quoted` — builds the kind for a single-quoted literal, if
    ///   this prefix supports one (`b'…'` does, `c'…'` does not).
    fn c_or_byte_string(
        &mut self,
        mk_kind: fn(bool) -> LiteralKind,
        mk_kind_raw: fn(Option<u8>) -> LiteralKind,
        single_quoted: Option<fn(bool) -> LiteralKind>,
    ) -> TokenKind {
        match (self.first(), self.second(), single_quoted) {
            ('\'', _, Some(single_quoted)) => {
                self.bump();
                let terminated = self.single_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = single_quoted(terminated);
                Literal { kind, suffix_start }
            }
            ('"', _, _) => {
                self.bump();
                let terminated = self.double_quoted_string();
                let suffix_start = self.pos_within_token();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = mk_kind(terminated);
                Literal { kind, suffix_start }
            }
            ('r', '"', _) | ('r', '#', _) => {
                self.bump();
                let res = self.raw_double_quoted_string(2);
                let suffix_start = self.pos_within_token();
                if res.is_ok() {
                    self.eat_literal_suffix();
                }
                let kind = mk_kind_raw(res.ok());
                Literal { kind, suffix_start }
            }
            _ => self.ident_or_unknown_prefix(),
        }
    }
756
    /// Lexes the digits of an integer or float literal; the first digit has
    /// already been consumed. The literal suffix (e.g. `u8`, `f32`) is left
    /// for the caller to eat.
    ///
    /// Base-prefixed literals consume *decimal* digits here (so `0b120` is one
    /// token); digit validity for the base is diagnosed later.
    fn number(&mut self, first_digit: char) -> LiteralKind {
        debug_assert!('0' <= self.prev() && self.prev() <= '9');
        let mut base = Base::Decimal;
        if first_digit == '0' {
            // Attempt to parse encoding base.
            match self.first() {
                'b' => {
                    base = Base::Binary;
                    self.bump();
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'o' => {
                    base = Base::Octal;
                    self.bump();
                    if !self.eat_decimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                'x' => {
                    base = Base::Hexadecimal;
                    self.bump();
                    if !self.eat_hexadecimal_digits() {
                        return Int { base, empty_int: true };
                    }
                }
                // Not a base prefix; consume additional digits.
                '0'..='9' | '_' => {
                    self.eat_decimal_digits();
                }

                // Also not a base prefix; nothing more to do here.
                '.' | 'e' | 'E' => {}

                // Just a 0.
                _ => return Int { base, empty_int: false },
            }
        } else {
            // No base prefix, parse number in the usual way.
            self.eat_decimal_digits();
        }

        match self.first() {
            // Don't be greedy if this is actually an
            // integer literal followed by field/method access or a range pattern
            // (`0..2` and `12.foo()`)
            '.' if self.second() != '.' && !is_id_start(self.second()) => {
                // might have stuff after the ., and if it does, it needs to start
                // with a number
                self.bump();
                let mut empty_exponent = false;
                if self.first().is_ascii_digit() {
                    self.eat_decimal_digits();
                    match self.first() {
                        'e' | 'E' => {
                            self.bump();
                            empty_exponent = !self.eat_float_exponent();
                        }
                        _ => (),
                    }
                }
                Float { base, empty_exponent }
            }
            'e' | 'E' => {
                self.bump();
                let empty_exponent = !self.eat_float_exponent();
                Float { base, empty_exponent }
            }
            _ => Int { base, empty_int: false },
        }
    }
829
    /// Lexes what follows a consumed `'`: a character literal, a lifetime, a
    /// raw lifetime (`'r#foo`), or an unknown lifetime prefix (`'foo#`).
    fn lifetime_or_char(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '\'');

        let can_be_a_lifetime = if self.second() == '\'' {
            // It's surely not a lifetime.
            false
        } else {
            // If the first symbol is valid for identifier, it can be a lifetime.
            // Also check if it's a number for a better error reporting (so '0 will
            // be reported as invalid lifetime and not as unterminated char literal).
            is_id_start(self.first()) || self.first().is_ascii_digit()
        };

        if !can_be_a_lifetime {
            let terminated = self.single_quoted_string();
            let suffix_start = self.pos_within_token();
            if terminated {
                self.eat_literal_suffix();
            }
            let kind = Char { terminated };
            return Literal { kind, suffix_start };
        }

        if self.first() == 'r' && self.second() == '#' && is_id_start(self.third()) {
            // Eat "r" and `#`, and identifier start characters.
            self.bump();
            self.bump();
            self.bump();
            self.eat_while(is_id_continue);
            return RawLifetime;
        }

        // Either a lifetime or a character literal with
        // length greater than 1.
        let starts_with_number = self.first().is_ascii_digit();

        // Skip the literal contents.
        // First symbol can be a number (which isn't a valid identifier start),
        // so skip it without any checks.
        self.bump();
        self.eat_while(is_id_continue);

        match self.first() {
            // Check if after skipping literal contents we've met a closing
            // single quote (which means that user attempted to create a
            // string with single quotes).
            '\'' => {
                self.bump();
                let kind = Char { terminated: true };
                Literal { kind, suffix_start: self.pos_within_token() }
            }
            '#' if !starts_with_number => UnknownPrefixLifetime,
            _ => Lifetime { starts_with_number },
        }
    }
885
886    fn single_quoted_string(&mut self) -> bool {
887        debug_assert!(self.prev() == '\'');
888        // Check if it's a one-symbol literal.
889        if self.second() == '\'' && self.first() != '\\' {
890            self.bump();
891            self.bump();
892            return true;
893        }
894
895        // Literal has more than one symbol.
896
897        // Parse until either quotes are terminated or error is detected.
898        loop {
899            match self.first() {
900                // Quotes are terminated, finish parsing.
901                '\'' => {
902                    self.bump();
903                    return true;
904                }
905                // Probably beginning of the comment, which we don't want to include
906                // to the error report.
907                '/' => break,
908                // Newline without following '\'' means unclosed quote, stop parsing.
909                '\n' if self.second() != '\'' => break,
910                // End of file, stop parsing.
911                EOF_CHAR if self.is_eof() => break,
912                // Escaped slash is considered one character, so bump twice.
913                '\\' => {
914                    self.bump();
915                    self.bump();
916                }
917                // Skip the character.
918                _ => {
919                    self.bump();
920                }
921            }
922        }
923        // String was not terminated.
924        false
925    }
926
927    /// Eats double-quoted string and returns true
928    /// if string is terminated.
929    fn double_quoted_string(&mut self) -> bool {
930        debug_assert!(self.prev() == '"');
931        while let Some(c) = self.bump() {
932            match c {
933                '"' => {
934                    return true;
935                }
936                '\\' if self.first() == '\\' || self.first() == '"' => {
937                    // Bump again to skip escaped character.
938                    self.bump();
939                }
940                _ => (),
941            }
942        }
943        // End of file reached.
944        false
945    }
946
947    /// Attempt to lex for a guarded string literal.
948    ///
949    /// Used by `rustc_parse::lexer` to lex for guarded strings
950    /// conditionally based on edition.
951    ///
952    /// Note: this will not reset the `Cursor` when a
953    /// guarded string is not found. It is the caller's
954    /// responsibility to do so.
955    pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
956        debug_assert!(self.prev() != '#');
957
958        let mut n_start_hashes: u32 = 0;
959        while self.first() == '#' {
960            n_start_hashes += 1;
961            self.bump();
962        }
963
964        if self.first() != '"' {
965            return None;
966        }
967        self.bump();
968        debug_assert!(self.prev() == '"');
969
970        // Lex the string itself as a normal string literal
971        // so we can recover that for older editions later.
972        let terminated = self.double_quoted_string();
973        if !terminated {
974            let token_len = self.pos_within_token();
975            self.reset_pos_within_token();
976
977            return Some(GuardedStr { n_hashes: n_start_hashes, terminated: false, token_len });
978        }
979
980        // Consume closing '#' symbols.
981        // Note that this will not consume extra trailing `#` characters:
982        // `###"abcde"####` is lexed as a `GuardedStr { n_end_hashes: 3, .. }`
983        // followed by a `#` token.
984        let mut n_end_hashes = 0;
985        while self.first() == '#' && n_end_hashes < n_start_hashes {
986            n_end_hashes += 1;
987            self.bump();
988        }
989
990        // Reserved syntax, always an error, so it doesn't matter if
991        // `n_start_hashes != n_end_hashes`.
992
993        self.eat_literal_suffix();
994
995        let token_len = self.pos_within_token();
996        self.reset_pos_within_token();
997
998        Some(GuardedStr { n_hashes: n_start_hashes, terminated: true, token_len })
999    }
1000
1001    /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
1002    fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
1003        // Wrap the actual function to handle the error with too many hashes.
1004        // This way, it eats the whole raw string.
1005        let n_hashes = self.raw_string_unvalidated(prefix_len)?;
1006        // Only up to 255 `#`s are allowed in raw strings
1007        match u8::try_from(n_hashes) {
1008            Ok(num) => Ok(num),
1009            Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
1010        }
1011    }
1012
    /// Eats a raw string after the leading `r`, returning the number of
    /// leading `#`s on success. Does not enforce the 255-hash limit; see
    /// `raw_double_quoted_string` for the validated wrapper.
    /// `prefix_len` is the length of characters before the `#`s/quote
    /// (e.g. `r` or `br`), used to compute `possible_terminator_offset`.
    fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
        debug_assert!(self.prev() == 'r');
        let start_pos = self.pos_within_token();
        // Best candidate for a missing-terminator hint, updated as we scan.
        let mut possible_terminator_offset = None;
        let mut max_hashes = 0;

        // Count opening '#' symbols.
        let mut eaten = 0;
        while self.first() == '#' {
            eaten += 1;
            self.bump();
        }
        let n_start_hashes = eaten;

        // Check that string is started.
        match self.bump() {
            Some('"') => (),
            c => {
                let c = c.unwrap_or(EOF_CHAR);
                return Err(RawStrError::InvalidStarter { bad_char: c });
            }
        }

        // Skip the string contents and on each '#' character met, check if this is
        // a raw string termination.
        loop {
            self.eat_until(b'"');

            if self.is_eof() {
                return Err(RawStrError::NoTerminator {
                    expected: n_start_hashes,
                    found: max_hashes,
                    possible_terminator_offset,
                });
            }

            // Eat closing double quote.
            self.bump();

            // Check that amount of closing '#' symbols
            // is equal to the amount of opening ones.
            // Note that this will not consume extra trailing `#` characters:
            // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
            // followed by a `#` token.
            let mut n_end_hashes = 0;
            while self.first() == '#' && n_end_hashes < n_start_hashes {
                n_end_hashes += 1;
                self.bump();
            }

            if n_end_hashes == n_start_hashes {
                return Ok(n_start_hashes);
            } else if n_end_hashes > max_hashes {
                // Keep track of possible terminators to give a hint about
                // where there might be a missing terminator
                possible_terminator_offset =
                    Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
                max_hashes = n_end_hashes;
            }
        }
    }
1074
1075    fn eat_decimal_digits(&mut self) -> bool {
1076        let mut has_digits = false;
1077        loop {
1078            match self.first() {
1079                '_' => {
1080                    self.bump();
1081                }
1082                '0'..='9' => {
1083                    has_digits = true;
1084                    self.bump();
1085                }
1086                _ => break,
1087            }
1088        }
1089        has_digits
1090    }
1091
1092    fn eat_hexadecimal_digits(&mut self) -> bool {
1093        let mut has_digits = false;
1094        loop {
1095            match self.first() {
1096                '_' => {
1097                    self.bump();
1098                }
1099                '0'..='9' | 'a'..='f' | 'A'..='F' => {
1100                    has_digits = true;
1101                    self.bump();
1102                }
1103                _ => break,
1104            }
1105        }
1106        has_digits
1107    }
1108
1109    /// Eats the float exponent. Returns true if at least one digit was met,
1110    /// and returns false otherwise.
1111    fn eat_float_exponent(&mut self) -> bool {
1112        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
1113        if self.first() == '-' || self.first() == '+' {
1114            self.bump();
1115        }
1116        self.eat_decimal_digits()
1117    }
1118
1119    // Eats the suffix of the literal, e.g. "u8".
1120    fn eat_literal_suffix(&mut self) {
1121        self.eat_identifier();
1122    }
1123
1124    // Eats the identifier. Note: succeeds on `_`, which isn't a valid
1125    // identifier.
1126    fn eat_identifier(&mut self) {
1127        if !is_id_start(self.first()) {
1128            return;
1129        }
1130        self.bump();
1131
1132        self.eat_while(is_id_continue);
1133    }
1134}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy