rustc_parse/lexer/
mod.rs

1use diagnostics::make_errors_for_mismatched_closing_delims;
2use rustc_ast::ast::{self, AttrStyle};
3use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
4use rustc_ast::tokenstream::TokenStream;
5use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
6use rustc_errors::codes::*;
7use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
8use rustc_lexer::{
9    Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
10};
11use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
12use rustc_session::lint::BuiltinLintDiag;
13use rustc_session::lint::builtin::{
14    RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
15    TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
16};
17use rustc_session::parse::ParseSess;
18use rustc_span::{BytePos, Pos, Span, Symbol, sym};
19use tracing::debug;
20
21use crate::errors;
22use crate::lexer::diagnostics::TokenTreeDiagInfo;
23use crate::lexer::unicode_chars::UNICODE_ARRAY;
24
25mod diagnostics;
26mod tokentrees;
27mod unescape_error_reporting;
28mod unicode_chars;
29
30use unescape_error_reporting::{emit_unescape_error, escaped_char};
31
// This type is used a lot. Make sure it doesn't unintentionally get bigger.
//
// This assertion is in this crate, rather than in `rustc_lexer`, because that
// crate cannot depend on `rustc_data_structures`.
//
// Only checked on 64-bit targets; the size differs on 32-bit pointer widths.
#[cfg(target_pointer_width = "64")]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
38
/// Information about a closing delimiter that could not be matched up with
/// its opening counterpart, used to build delimiter-mismatch diagnostics.
#[derive(Clone, Debug)]
pub(crate) struct UnmatchedDelim {
    /// The closing delimiter that was actually found, if any.
    pub found_delim: Option<Delimiter>,
    /// Where the (unexpected) closing delimiter was found.
    pub found_span: Span,
    /// Span of the opening delimiter left unclosed, when known.
    pub unclosed_span: Option<Span>,
    /// A span that may be a better match for the stray closer — presumably a
    /// candidate opening delimiter suggested by the tree builder (NOTE(review):
    /// exact semantics live in `tokentrees`/`diagnostics`; confirm there).
    pub candidate_span: Option<Span>,
}
46
47pub(crate) fn lex_token_trees<'psess, 'src>(
48    psess: &'psess ParseSess,
49    mut src: &'src str,
50    mut start_pos: BytePos,
51    override_span: Option<Span>,
52) -> Result<TokenStream, Vec<Diag<'psess>>> {
53    // Skip `#!`, if present.
54    if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
55        src = &src[shebang_len..];
56        start_pos = start_pos + BytePos::from_usize(shebang_len);
57    }
58
59    let cursor = Cursor::new(src, FrontmatterAllowed::Yes);
60    let mut lexer = Lexer {
61        psess,
62        start_pos,
63        pos: start_pos,
64        src,
65        cursor,
66        override_span,
67        nbsp_is_whitespace: false,
68        last_lifetime: None,
69        token: Token::dummy(),
70        diag_info: TokenTreeDiagInfo::default(),
71    };
72    let res = lexer.lex_token_trees(/* is_delimited */ false);
73
74    let mut unmatched_closing_delims: Vec<_> =
75        make_errors_for_mismatched_closing_delims(&lexer.diag_info.unmatched_delims, psess);
76
77    match res {
78        Ok((_open_spacing, stream)) => {
79            if unmatched_closing_delims.is_empty() {
80                Ok(stream)
81            } else {
82                // Return error if there are unmatched delimiters or unclosed delimiters.
83                Err(unmatched_closing_delims)
84            }
85        }
86        Err(errs) => {
87            // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
88            // because the delimiter mismatch is more likely to be the root cause of error
89            unmatched_closing_delims.extend(errs);
90            Err(unmatched_closing_delims)
91        }
92    }
93}
94
/// Low-level lexer state: wraps a `rustc_lexer::Cursor` and "cooks" its raw
/// tokens into `rustc_ast` tokens, emitting diagnostics along the way.
struct Lexer<'psess, 'src> {
    /// Parse session, used for diagnostics, lint buffering, and interning.
    psess: &'psess ParseSess,
    /// Initial position, read-only.
    start_pos: BytePos,
    /// The absolute offset within the source_map of the current character.
    pos: BytePos,
    /// Source text to tokenize.
    src: &'src str,
    /// Cursor for getting lexer tokens.
    cursor: Cursor<'src>,
    /// If set, every span produced by `mk_sp` is replaced by this span.
    override_span: Option<Span>,
    /// When an "unknown start of token: \u{a0}" has already been emitted earlier
    /// in this file, it's safe to treat further occurrences of the non-breaking
    /// space character as whitespace.
    nbsp_is_whitespace: bool,

    /// Track the `Span` for the leading `'` of the last lifetime. Used for
    /// diagnostics to detect possible typo where `"` was meant.
    last_lifetime: Option<Span>,

    /// The current token.
    token: Token,

    /// Delimiter bookkeeping used to build mismatch/unclosed diagnostics.
    diag_info: TokenTreeDiagInfo,
}
120
121impl<'psess, 'src> Lexer<'psess, 'src> {
    /// Shorthand for the session's diagnostic context.
    fn dcx(&self) -> DiagCtxtHandle<'psess> {
        self.psess.dcx()
    }
125
126    fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
127        self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
128    }
129
    /// Returns the next non-trivia token, "cooked" from the raw `rustc_lexer`
    /// token into a rich `rustc_ast` token, paired with a bool indicating if
    /// the token was preceded by whitespace (comments and whitespace are
    /// skipped here and count as "preceding whitespace").
    fn next_token_from_cursor(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        // Number of further identical invalid characters to skip without
        // emitting another diagnostic (set below when a repeated unknown
        // token is reported once for the whole run).
        let mut swallow_next_invalid = 0;
        // Skip trivial (whitespace & comments) tokens
        loop {
            // Remember the remaining source before advancing: some arms below
            // rewind the cursor into this string to re-lex a shorter token.
            let str_before = self.cursor.as_str();
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

            if let rustc_lexer::TokenKind::Semi
            | rustc_lexer::TokenKind::LineComment { .. }
            | rustc_lexer::TokenKind::BlockComment { .. }
            | rustc_lexer::TokenKind::CloseParen
            | rustc_lexer::TokenKind::CloseBrace
            | rustc_lexer::TokenKind::CloseBracket = token.kind
            {
                // Heuristic: we assume that it is unlikely we're dealing with an unterminated
                // string surrounded by single quotes.
                self.last_lifetime = None;
            }

            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
            // additional validation.
            let kind = match token.kind {
                rustc_lexer::TokenKind::LineComment { doc_style } => {
                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 is not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content = self.str_from(content_start);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
                }
                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                    if !terminated {
                        self.report_unterminated_block_comment(start, doc_style);
                    }

                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 and closing delimiter of the length 2
                    // are not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
                    let content = self.str_from_to(content_start, content_end);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
                }
                rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
                    // Frontmatter produces no token; validate it and move on.
                    self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Ident => self.ident(start),
                rustc_lexer::TokenKind::RawIdent => {
                    // Skip the `r#` prefix (2 bytes) when interning the name.
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
                    let span = self.mk_sp(start, self.pos);
                    self.psess.symbol_gallery.insert(sym, span);
                    if !sym.can_be_raw() {
                        self.dcx().emit_err(errors::CannotBeRawIdent { span, ident: sym });
                    }
                    self.psess.raw_identifier_spans.push(span);
                    token::Ident(sym, IdentIsRaw::Yes)
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    // Report the unknown prefix, then recover as an identifier.
                    self.report_unknown_prefix(start);
                    self.ident(start)
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
                    self.report_unknown_prefix(start);
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::InvalidIdent
                    // Do not recover an identifier with emoji if the codepoint is a confusable
                    // with a recoverable substitution token, like `➖`.
                    if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
                        let sym = self.str_from(start);
                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
                    }) =>
                {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.psess
                        .bad_unicode_identifiers
                        .borrow_mut()
                        .entry(sym)
                        .or_default()
                        .push(span);
                    token::Ident(sym, IdentIsRaw::No)
                }
                // split up (raw) c string literals to an ident and a string literal when edition <
                // 2021.
                rustc_lexer::TokenKind::Literal {
                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                    suffix_start: _,
                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
                    let prefix_len = match kind {
                        LiteralKind::CStr { .. } => 1,
                        LiteralKind::RawCStr { .. } => 2,
                        _ => unreachable!(),
                    };

                    // reset the state so that only the prefix ("c" or "cr")
                    // was consumed.
                    let lit_start = start + BytePos(prefix_len);
                    self.pos = lit_start;
                    self.cursor = Cursor::new(&str_before[prefix_len as usize..], FrontmatterAllowed::No);
                    self.report_unknown_prefix(start);
                    let prefix_span = self.mk_sp(start, lit_start);
                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    self.maybe_report_guarded_str(start, str_before)
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
                    // Anything after `suffix_start` is a literal suffix; a bare
                    // `_` suffix is reserved and rejected.
                    let suffix = if suffix_start < self.pos {
                        let string = self.str_from(suffix_start);
                        if string == "_" {
                            self.dcx().emit_err(errors::UnderscoreLiteralSuffix {
                                span: self.mk_sp(suffix_start, self.pos),
                            });
                            None
                        } else {
                            Some(Symbol::intern(string))
                        }
                    } else {
                        None
                    };
                    self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
                    token::Literal(token::Lit { kind, symbol, suffix })
                }
                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    if starts_with_number {
                        let span = self.mk_sp(start, self.pos);
                        self.dcx()
                            .struct_err("lifetimes cannot start with a number")
                            .with_span(span)
                            .stash(span, StashKey::LifetimeIsChar);
                    }
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::RawLifetime => {
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));

                    // The name proper starts after the 3-byte `'r#` prefix.
                    let ident_start = start + BytePos(3);
                    let prefix_span = self.mk_sp(start, ident_start);

                    if prefix_span.at_least_rust_2021() {
                        // If the raw lifetime is followed by \' then treat it a normal
                        // lifetime followed by a \', which is to interpret it as a character
                        // literal. In this case, it's always an invalid character literal
                        // since the literal must necessarily have >3 characters (r#...) inside
                        // of it, which is invalid.
                        if self.cursor.as_str().starts_with('\'') {
                            let lit_span = self.mk_sp(start, self.pos + BytePos(1));
                            let contents = self.str_from_to(start + BytePos(1), self.pos);
                            emit_unescape_error(
                                self.dcx(),
                                contents,
                                lit_span,
                                lit_span,
                                Mode::Char,
                                0..contents.len(),
                                EscapeError::MoreThanOneChar,
                            )
                            .expect("expected error");
                        }

                        let span = self.mk_sp(start, self.pos);

                        let lifetime_name_without_tick =
                            Symbol::intern(&self.str_from(ident_start));
                        if !lifetime_name_without_tick.can_be_raw() {
                            self.dcx().emit_err(
                                errors::CannotBeRawLifetime {
                                    span,
                                    ident: lifetime_name_without_tick
                                }
                            );
                        }

                        // Put the `'` back onto the lifetime name.
                        let mut lifetime_name =
                            String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
                        lifetime_name.push('\'');
                        lifetime_name += lifetime_name_without_tick.as_str();
                        let sym = Symbol::intern(&lifetime_name);

                        // Make sure we mark this as a raw identifier.
                        self.psess.raw_identifier_spans.push(span);

                        token::Lifetime(sym, IdentIsRaw::Yes)
                    } else {
                        // Otherwise, this should be parsed like `'r`. Warn about it though.
                        self.psess.buffer_lint(
                            RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                            prefix_span,
                            ast::CRATE_NODE_ID,
                            BuiltinLintDiag::RawPrefix(prefix_span),
                        );

                        // Reset the state so we just lex the `'r`.
                        let lt_start = start + BytePos(2);
                        self.pos = lt_start;
                        self.cursor = Cursor::new(&str_before[2 as usize..], FrontmatterAllowed::No);

                        let lifetime_name = self.str_from(start);
                        let ident = Symbol::intern(lifetime_name);
                        token::Lifetime(ident, IdentIsRaw::No)
                    }
                }
                rustc_lexer::TokenKind::Semi => token::Semi,
                rustc_lexer::TokenKind::Comma => token::Comma,
                rustc_lexer::TokenKind::Dot => token::Dot,
                rustc_lexer::TokenKind::OpenParen => token::OpenParen,
                rustc_lexer::TokenKind::CloseParen => token::CloseParen,
                rustc_lexer::TokenKind::OpenBrace => token::OpenBrace,
                rustc_lexer::TokenKind::CloseBrace => token::CloseBrace,
                rustc_lexer::TokenKind::OpenBracket => token::OpenBracket,
                rustc_lexer::TokenKind::CloseBracket => token::CloseBracket,
                rustc_lexer::TokenKind::At => token::At,
                rustc_lexer::TokenKind::Pound => token::Pound,
                rustc_lexer::TokenKind::Tilde => token::Tilde,
                rustc_lexer::TokenKind::Question => token::Question,
                rustc_lexer::TokenKind::Colon => token::Colon,
                rustc_lexer::TokenKind::Dollar => token::Dollar,
                rustc_lexer::TokenKind::Eq => token::Eq,
                rustc_lexer::TokenKind::Bang => token::Bang,
                rustc_lexer::TokenKind::Lt => token::Lt,
                rustc_lexer::TokenKind::Gt => token::Gt,
                rustc_lexer::TokenKind::Minus => token::Minus,
                rustc_lexer::TokenKind::And => token::And,
                rustc_lexer::TokenKind::Or => token::Or,
                rustc_lexer::TokenKind::Plus => token::Plus,
                rustc_lexer::TokenKind::Star => token::Star,
                rustc_lexer::TokenKind::Slash => token::Slash,
                rustc_lexer::TokenKind::Caret => token::Caret,
                rustc_lexer::TokenKind::Percent => token::Percent,

                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                    // Don't emit diagnostics for sequences of the same invalid token
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // If an error has already been reported on non-breaking
                        // space characters earlier in the file, treat all
                        // subsequent occurrences as whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    // FIXME: the lexer could be used to turn the ASCII version of unicode
                    // homoglyphs, instead of keeping a table in `check_for_substitution`into the
                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
                    // fancier error recovery to it, as there will be less overall work to do this
                    // way.
                    let (token, sugg) =
                        unicode_chars::check_for_substitution(self, start, c, repeats + 1);
                    self.dcx().emit_err(errors::UnknownTokenStart {
                        span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
                        escaped: escaped_char(c),
                        sugg,
                        null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
                        repeat: if repeats > 0 {
                            swallow_next_invalid = repeats;
                            Some(errors::UnknownTokenRepeat { repeats })
                        } else {
                            None
                        },
                    });

                    // If a homoglyph substitution produced a real token, use it;
                    // otherwise drop the character and keep scanning.
                    if let Some(token) = token {
                        token
                    } else {
                        preceded_by_whitespace = true;
                        continue;
                    }
                }
                rustc_lexer::TokenKind::Eof => token::Eof,
            };
            let span = self.mk_sp(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }
456
457    fn ident(&self, start: BytePos) -> TokenKind {
458        let sym = nfc_normalize(self.str_from(start));
459        let span = self.mk_sp(start, self.pos);
460        self.psess.symbol_gallery.insert(sym, span);
461        token::Ident(sym, IdentIsRaw::No)
462    }
463
464    /// Detect usages of Unicode codepoints changing the direction of the text on screen and loudly
465    /// complain about it.
466    fn lint_unicode_text_flow(&self, start: BytePos) {
467        // Opening delimiter of the length 2 is not included into the comment text.
468        let content_start = start + BytePos(2);
469        let content = self.str_from(content_start);
470        if contains_text_flow_control_chars(content) {
471            let span = self.mk_sp(start, self.pos);
472            self.psess.buffer_lint(
473                TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
474                span,
475                ast::CRATE_NODE_ID,
476                BuiltinLintDiag::UnicodeTextFlow(span, content.to_string()),
477            );
478        }
479    }
480
481    fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
482        if contains_text_flow_control_chars(content) {
483            self.report_text_direction_codepoint(
484                content,
485                self.mk_sp(start, self.pos),
486                0,
487                false,
488                "doc comment",
489            );
490        }
491    }
492
493    fn lint_literal_unicode_text_flow(
494        &mut self,
495        text: Symbol,
496        lit_kind: token::LitKind,
497        span: Span,
498        label: &'static str,
499    ) {
500        if !contains_text_flow_control_chars(text.as_str()) {
501            return;
502        }
503        let (padding, point_at_inner_spans) = match lit_kind {
504            // account for `"` or `'`
505            token::LitKind::Str | token::LitKind::Char => (1, true),
506            // account for `c"`
507            token::LitKind::CStr => (2, true),
508            // account for `r###"`
509            token::LitKind::StrRaw(n) => (n as u32 + 2, true),
510            // account for `cr###"`
511            token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
512            // suppress bad literals.
513            token::LitKind::Err(_) => return,
514            // Be conservative just in case new literals do support these.
515            _ => (0, false),
516        };
517        self.report_text_direction_codepoint(
518            text.as_str(),
519            span,
520            padding,
521            point_at_inner_spans,
522            label,
523        );
524    }
525
526    fn report_text_direction_codepoint(
527        &self,
528        text: &str,
529        span: Span,
530        padding: u32,
531        point_at_inner_spans: bool,
532        label: &str,
533    ) {
534        // Obtain the `Span`s for each of the forbidden chars.
535        let spans: Vec<_> = text
536            .char_indices()
537            .filter_map(|(i, c)| {
538                TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
539                    let lo = span.lo() + BytePos(i as u32 + padding);
540                    (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
541                })
542            })
543            .collect();
544
545        let count = spans.len();
546        let labels = point_at_inner_spans.then_some(spans.clone());
547
548        self.psess.buffer_lint(
549            TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
550            span,
551            ast::CRATE_NODE_ID,
552            BuiltinLintDiag::HiddenUnicodeCodepoints {
553                label: label.to_string(),
554                count,
555                span_label: span,
556                labels,
557                escape: point_at_inner_spans && !spans.is_empty(),
558                spans,
559            },
560        );
561    }
562
    /// Validates a frontmatter block (`---`-fenced metadata at the top of a
    /// file) that the cursor has already consumed as a single token, emitting
    /// errors for the various malformed shapes and feature-gating the span.
    fn validate_frontmatter(
        &self,
        start: BytePos,
        has_invalid_preceding_whitespace: bool,
        invalid_infostring: bool,
    ) {
        let s = self.str_from(start);
        // The token may include leading whitespace; locate the actual fence.
        let real_start = s.find("---").unwrap();
        let frontmatter_opening_pos = BytePos(real_start as u32) + start;
        let s_new = &s[real_start..];
        let within = s_new.trim_start_matches('-');
        // Number of `-`s in the opening fence; the closing fence must match.
        let len_opening = s_new.len() - within.len();

        let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
        if has_invalid_preceding_whitespace {
            // Point from the start of the offending line up to the fence end.
            let line_start =
                BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
            let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
            let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
            self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        if invalid_infostring {
            // The infostring runs from the end of the fence to end-of-line.
            let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
            let span = self.mk_sp(
                frontmatter_opening_end_pos,
                frontmatter_opening_pos + BytePos(line_end as u32),
            );
            self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
        }

        // The closing fence, if present, is on the last line of the token.
        let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
        let last_line = &within[last_line_start..];
        let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
        let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);

        let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
        self.psess.gated_spans.gate(sym::frontmatter, frontmatter_span);

        if !last_line_trimmed.starts_with("---") {
            // No closing fence at all: report and skip the remaining checks.
            let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            self.dcx().emit_err(errors::FrontmatterUnclosed {
                span: frontmatter_span,
                note_span: label_span,
            });
            return;
        }

        if last_line_trimmed.len() != last_line.len() {
            // The closing fence is preceded by whitespace on its line.
            let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
            let span = self.mk_sp(last_line_start_pos, line_end);
            let whitespace_end =
                last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
            let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
            self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        let rest = last_line_trimmed.trim_start_matches('-');
        let len_close = last_line_trimmed.len() - rest.len();
        if len_close != len_opening {
            // Opening and closing fences must use the same number of `-`s.
            let span = self.mk_sp(frontmatter_opening_pos, self.pos);
            let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
            let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
            self.dcx().emit_err(errors::FrontmatterLengthMismatch {
                span,
                opening,
                close,
                len_opening,
                len_close,
            });
        }

        if !rest.trim_matches(is_whitespace).is_empty() {
            // Anything other than whitespace after the closing fence is an error.
            let span = self.mk_sp(last_line_start_pos, self.pos);
            self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
        }
    }
647
648    fn cook_doc_comment(
649        &self,
650        content_start: BytePos,
651        content: &str,
652        comment_kind: CommentKind,
653        doc_style: DocStyle,
654    ) -> TokenKind {
655        if content.contains('\r') {
656            for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
657                let span = self.mk_sp(
658                    content_start + BytePos(idx as u32),
659                    content_start + BytePos(idx as u32 + 1),
660                );
661                let block = matches!(comment_kind, CommentKind::Block);
662                self.dcx().emit_err(errors::CrDocComment { span, block });
663            }
664        }
665
666        let attr_style = match doc_style {
667            DocStyle::Outer => AttrStyle::Outer,
668            DocStyle::Inner => AttrStyle::Inner,
669        };
670
671        token::DocComment(comment_kind, attr_style, Symbol::intern(content))
672    }
673
    /// Converts a `rustc_lexer` literal token spanning `start..end` into an
    /// AST-level literal kind plus its interned symbol.
    ///
    /// Unterminated quoted literals are fatal errors. Malformed numeric
    /// literals (no digits, out-of-range digit for the base, empty exponent,
    /// non-decimal float base) produce non-fatal errors and yield
    /// `token::Err` carrying the error guarantee.
    fn cook_lexer_literal(
        &self,
        start: BytePos,
        end: BytePos,
        kind: rustc_lexer::LiteralKind,
    ) -> (token::LitKind, Symbol) {
        match kind {
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    let mut err = self
                        .dcx()
                        .struct_span_fatal(self.mk_sp(start, end), "unterminated character literal")
                        .with_code(E0762);
                    // If the unterminated char literal follows something that
                    // lexed as a lifetime, the user likely meant a string
                    // literal; suggest replacing both quotes.
                    if let Some(lt_sp) = self.last_lifetime {
                        err.multipart_suggestion(
                            "if you meant to write a string literal, use double quotes",
                            vec![
                                (lt_sp, "\"".to_string()),
                                (self.mk_sp(start, start + BytePos(1)), "\"".to_string()),
                            ],
                            Applicability::MaybeIncorrect,
                        );
                    }
                    err.emit()
                }
                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    // `start + 1` skips the `b` prefix in the span.
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated byte constant",
                        )
                        .with_code(E0763)
                        .emit()
                }
                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start, end),
                            "unterminated double quote string",
                        )
                        .with_code(E0765)
                        .emit()
                }
                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated double quote byte string",
                        )
                        .with_code(E0766)
                        .emit()
                }
                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
                // b" "
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated C string",
                        )
                        .with_code(E0767)
                        .emit()
                }
                self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
            }
            // For the raw variants, `n_hashes == None` means the delimiters
            // were malformed; re-validate to produce a precise fatal error.
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::StrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
                // r##" "##
                } else {
                    self.report_raw_str_error(start, 1);
                }
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::ByteStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
                // br##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::CStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
                // cr##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::Int { base, empty_int } => {
                let mut kind = token::Integer;
                if empty_int {
                    // e.g. `0x` with no digits after the base prefix.
                    let span = self.mk_sp(start, end);
                    let guar = self.dcx().emit_err(errors::NoDigitsLiteral { span });
                    kind = token::Err(guar);
                } else if matches!(base, Base::Binary | Base::Octal) {
                    // Validate each digit after the 2-byte `0b`/`0o` prefix,
                    // reporting every digit that is out of range for the base.
                    let base = base as u32;
                    let s = self.str_from_to(start + BytePos(2), end);
                    for (idx, c) in s.char_indices() {
                        let span = self.mk_sp(
                            start + BytePos::from_usize(2 + idx),
                            start + BytePos::from_usize(2 + idx + c.len_utf8()),
                        );
                        // `_` separators are always allowed.
                        if c != '_' && c.to_digit(base).is_none() {
                            let guar =
                                self.dcx().emit_err(errors::InvalidDigitLiteral { span, base });
                            kind = token::Err(guar);
                        }
                    }
                }
                (kind, self.symbol_from_to(start, end))
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                let mut kind = token::Float;
                if empty_exponent {
                    // e.g. `1e` — exponent marker with no digits.
                    let span = self.mk_sp(start, self.pos);
                    let guar = self.dcx().emit_err(errors::EmptyExponentFloat { span });
                    kind = token::Err(guar);
                }
                // Float literals only support decimal notation.
                let base = match base {
                    Base::Hexadecimal => Some("hexadecimal"),
                    Base::Octal => Some("octal"),
                    Base::Binary => Some("binary"),
                    _ => None,
                };
                if let Some(base) = base {
                    let span = self.mk_sp(start, end);
                    let guar =
                        self.dcx().emit_err(errors::FloatLiteralUnsupportedBase { span, base });
                    kind = token::Err(guar)
                }
                (kind, self.symbol_from_to(start, end))
            }
        }
    }
826
827    #[inline]
828    fn src_index(&self, pos: BytePos) -> usize {
829        (pos - self.start_pos).to_usize()
830    }
831
832    /// Slice of the source text from `start` up to but excluding `self.pos`,
833    /// meaning the slice does not include the character `self.ch`.
834    fn str_from(&self, start: BytePos) -> &'src str {
835        self.str_from_to(start, self.pos)
836    }
837
838    /// As symbol_from, with an explicit endpoint.
839    fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
840        debug!("taking an ident from {:?} to {:?}", start, end);
841        Symbol::intern(self.str_from_to(start, end))
842    }
843
844    /// Slice of the source text spanning from `start` up to but excluding `end`.
845    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
846        &self.src[self.src_index(start)..self.src_index(end)]
847    }
848
849    /// Slice of the source text spanning from `start` until the end
850    fn str_from_to_end(&self, start: BytePos) -> &'src str {
851        &self.src[self.src_index(start)..]
852    }
853
    /// Re-validates an invalid raw string starting at `start` to recover the
    /// precise failure mode, then reports it with a fatal error. `prefix_len`
    /// is the length of the literal's prefix (`r` = 1, `br`/`cr` = 2).
    /// Never returns.
    fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
        match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
            Err(RawStrError::InvalidStarter { bad_char }) => {
                self.report_non_started_raw_string(start, bad_char)
            }
            Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
            Err(RawStrError::TooManyDelimiters { found }) => {
                self.report_too_many_hashes(start, found)
            }
            // This method is only called for literals the lexer already
            // rejected, so validation must fail.
            Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
        }
    }
867
868    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
869        self.dcx()
870            .struct_span_fatal(
871                self.mk_sp(start, self.pos),
872                format!(
873                    "found invalid character; only `#` is allowed in raw string delimitation: {}",
874                    escaped_char(bad_char)
875                ),
876            )
877            .emit()
878    }
879
880    fn report_unterminated_raw_string(
881        &self,
882        start: BytePos,
883        n_hashes: u32,
884        possible_offset: Option<u32>,
885        found_terminators: u32,
886    ) -> ! {
887        let mut err =
888            self.dcx().struct_span_fatal(self.mk_sp(start, start), "unterminated raw string");
889        err.code(E0748);
890        err.span_label(self.mk_sp(start, start), "unterminated raw string");
891
892        if n_hashes > 0 {
893            err.note(format!(
894                "this raw string should be terminated with `\"{}`",
895                "#".repeat(n_hashes as usize)
896            ));
897        }
898
899        if let Some(possible_offset) = possible_offset {
900            let lo = start + BytePos(possible_offset);
901            let hi = lo + BytePos(found_terminators);
902            let span = self.mk_sp(lo, hi);
903            err.span_suggestion(
904                span,
905                "consider terminating the string here",
906                "#".repeat(n_hashes as usize),
907                Applicability::MaybeIncorrect,
908            );
909        }
910
911        err.emit()
912    }
913
    /// Reports an unterminated block (doc-)comment starting at `start`.
    ///
    /// Block comments nest in Rust, so when the comment body contains
    /// complete nested `/* ... */` pairs, the error additionally labels the
    /// last such pair — the opener the user most likely forgot to close.
    fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
        let msg = match doc_style {
            Some(_) => "unterminated block doc-comment",
            None => "unterminated block comment",
        };
        let last_bpos = self.pos;
        let mut err = self.dcx().struct_span_fatal(self.mk_sp(start, last_bpos), msg);
        err.code(E0758);
        let mut nested_block_comment_open_idxs = vec![];
        let mut last_nested_block_comment_idxs = None;
        let mut content_chars = self.str_from(start).char_indices().peekable();

        // Scan the comment text for `/*` and `*/` two-char sequences,
        // pairing each close with the most recent unmatched open and
        // remembering the last complete pair.
        while let Some((idx, current_char)) = content_chars.next() {
            match content_chars.peek() {
                Some((_, '*')) if current_char == '/' => {
                    nested_block_comment_open_idxs.push(idx);
                }
                Some((_, '/')) if current_char == '*' => {
                    last_nested_block_comment_idxs =
                        nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
                }
                _ => {}
            };
        }

        if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
            // The `+ 2` spans cover the two-character `/*` / `*/` tokens.
            err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_open_idx as u32),
                        start + BytePos(nested_open_idx as u32 + 2),
                    ),
                    "...as last nested comment starts here, maybe you want to close this instead?",
                )
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_close_idx as u32),
                        start + BytePos(nested_close_idx as u32 + 2),
                    ),
                    "...and last nested comment terminates here.",
                );
        }

        err.emit();
    }
959
    // RFC 3101 introduced the idea of (reserved) prefixes. As of Rust 2021,
    // using an (unknown) prefix is an error. In earlier editions, however,
    // they only result in an (allowed-by-default) lint, and are treated as
    // regular identifier tokens.
    fn report_unknown_prefix(&self, start: BytePos) {
        // The prefix runs from `start` to the current position.
        let prefix_span = self.mk_sp(start, self.pos);
        let prefix = self.str_from_to(start, self.pos);
        let expn_data = prefix_span.ctxt().outer_expn_data();

        if expn_data.edition.at_least_rust_2021() {
            // In Rust 2021, this is a hard error.
            let sugg = if prefix == "rb" {
                // `rb"…"` is a common typo for `br"…"`.
                Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
            } else if prefix == "rc" {
                // Likewise `rc"…"` for `cr"…"`.
                Some(errors::UnknownPrefixSugg::UseCr(prefix_span))
            } else if expn_data.is_root() {
                // A `'` right after the prefix, following a lifetime on the
                // same line, suggests an attempted string in single quotes.
                if self.cursor.first() == '\''
                    && let Some(start) = self.last_lifetime
                    && self.cursor.third() != '\''
                    && let end = self.mk_sp(self.pos, self.pos + BytePos(1))
                    && !self.psess.source_map().is_multiline(start.until(end))
                {
                    // FIXME: An "unclosed `char`" error will be emitted already in some cases,
                    // but it's hard to silence this error while not also silencing important cases
                    // too. We should use the error stashing machinery instead.
                    Some(errors::UnknownPrefixSugg::MeantStr { start, end })
                } else {
                    Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
                }
            } else {
                None
            };
            self.dcx().emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
        } else {
            // Before Rust 2021, only emit a lint for migration.
            self.psess.buffer_lint(
                RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                prefix_span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedPrefix(prefix_span, prefix.to_string()),
            );
        }
    }
1003
    /// Detect guarded string literal syntax
    ///
    /// RFC 3593 reserved this syntax for future use. As of Rust 2024,
    /// using this syntax produces an error. In earlier editions, however, it
    /// only results in an (allowed by default) lint, and is treated as
    /// separate tokens.
    ///
    /// `str_before` is the remaining source text starting at `start` (the
    /// first `#`). On the 2024 edition this consumes the whole guarded
    /// string and returns an error literal token; on earlier editions it
    /// rolls the lexer back to just after the first `#` and returns `Pound`.
    fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
        let span = self.mk_sp(start, self.pos);
        let edition2024 = span.edition().at_least_rust_2024();

        // Zero-width span after the first `#`, used to suggest inserting
        // whitespace between the `#` and the string.
        let space_pos = start + BytePos(1);
        let space_span = self.mk_sp(space_pos, space_pos);

        // Speculatively re-lex from `start` to see whether a guarded string
        // (`#"…"#`-style) actually follows.
        let mut cursor = Cursor::new(str_before, FrontmatterAllowed::No);

        let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
            Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
                let end = start + BytePos(token_len);
                let span = self.mk_sp(start, end);
                let str_start = start + BytePos(n_hashes);

                // Only on 2024+ do we commit to consuming the whole guarded
                // string; earlier editions roll back below.
                if edition2024 {
                    self.cursor = cursor;
                    self.pos = end;
                }

                let unterminated = if terminated { None } else { Some(str_start) };

                (true, span, unterminated)
            }
            None => {
                // We should only get here in the `##+` case.
                debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

                (false, span, None)
            }
        };
        if edition2024 {
            if let Some(str_start) = unterminated {
                // Only a fatal error if string is unterminated.
                self.dcx()
                    .struct_span_fatal(
                        self.mk_sp(str_start, self.pos),
                        "unterminated double quote string",
                    )
                    .with_code(E0765)
                    .emit()
            }

            // Don't suggest adding whitespace inside macro-expanded code.
            let sugg = if span.from_expansion() {
                None
            } else {
                Some(errors::GuardedStringSugg(space_span))
            };

            // In Edition 2024 and later, emit a hard error.
            let err = if is_string {
                self.dcx().emit_err(errors::ReservedString { span, sugg })
            } else {
                self.dcx().emit_err(errors::ReservedMultihash { span, sugg })
            };

            token::Literal(token::Lit {
                kind: token::Err(err),
                symbol: self.symbol_from_to(start, self.pos),
                suffix: None,
            })
        } else {
            // Before Rust 2024, only emit a lint for migration.
            self.psess.buffer_lint(
                RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
                span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedString { is_string, suggestion: space_span },
            );

            // For backwards compatibility, roll back to after just the first `#`
            // and return the `Pound` token.
            self.pos = start + BytePos(1);
            self.cursor = Cursor::new(&str_before[1..], FrontmatterAllowed::No);
            token::Pound
        }
    }
1087
1088    fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
1089        self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
1090    }
1091
    /// Cooks a quoted literal (char/byte/string, raw or not): strips the
    /// `prefix_len`-byte opening and `postfix_len`-byte closing delimiters,
    /// checks the remaining content for escape errors in the given `mode`,
    /// and returns the resulting literal kind plus interned symbol.
    ///
    /// On a fatal escape error the kind becomes `token::Err`, and the symbol
    /// then keeps the surrounding quotes for clearer error messages.
    fn cook_quoted(
        &self,
        mut kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
    ) -> (token::LitKind, Symbol) {
        let content_start = start + BytePos(prefix_len);
        let content_end = end - BytePos(postfix_len);
        let lit_content = self.str_from_to(content_start, content_end);
        check_for_errors(lit_content, mode, |range, err| {
            let span_with_quotes = self.mk_sp(start, end);
            // NB: shadows the outer `start`/`end`; these are byte offsets
            // within the literal content, not absolute positions.
            let (start, end) = (range.start as u32, range.end as u32);
            let lo = content_start + BytePos(start);
            let hi = lo + BytePos(end - start);
            let span = self.mk_sp(lo, hi);
            let is_fatal = err.is_fatal();
            if let Some(guar) = emit_unescape_error(
                self.dcx(),
                lit_content,
                span_with_quotes,
                span,
                mode,
                range,
                err,
            ) {
                assert!(is_fatal);
                kind = token::Err(guar);
            }
        });

        // We normally exclude the quotes for the symbol, but for errors we
        // include it because it results in clearer error messages.
        let sym = if !matches!(kind, token::Err(_)) {
            Symbol::intern(lit_content)
        } else {
            self.symbol_from_to(start, end)
        };
        (kind, sym)
    }
1134}
1135
1136pub fn nfc_normalize(string: &str) -> Symbol {
1137    use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
1138    match is_nfc_quick(string.chars()) {
1139        IsNormalized::Yes => Symbol::intern(string),
1140        _ => {
1141            let normalized_str: String = string.chars().nfc().collect();
1142            Symbol::intern(&normalized_str)
1143        }
1144    }
1145}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy