// rustc_parse/lexer/mod.rs

1use diagnostics::make_unclosed_delims_error;
2use rustc_ast::ast::{self, AttrStyle};
3use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
4use rustc_ast::tokenstream::TokenStream;
5use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
6use rustc_errors::codes::*;
7use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
8use rustc_lexer::{
9    Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
10};
11use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
12use rustc_session::lint::BuiltinLintDiag;
13use rustc_session::lint::builtin::{
14    RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
15    TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
16};
17use rustc_session::parse::ParseSess;
18use rustc_span::{BytePos, Pos, Span, Symbol, sym};
19use tracing::debug;
20
21use crate::errors;
22use crate::lexer::diagnostics::TokenTreeDiagInfo;
23use crate::lexer::unicode_chars::UNICODE_ARRAY;
24
25mod diagnostics;
26mod tokentrees;
27mod unescape_error_reporting;
28mod unicode_chars;
29
30use unescape_error_reporting::{emit_unescape_error, escaped_char};
31
// This type is used a lot. Make sure it doesn't unintentionally get bigger.
//
// This assertion is in this crate, rather than in `rustc_lexer`, because that
// crate cannot depend on `rustc_data_structures`.
//
// Only checked on 64-bit targets, where the expected layout is stable.
#[cfg(target_pointer_width = "64")]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
38
/// A delimiter (paren/brace/bracket) recorded as unmatched during token-tree
/// construction; later converted into a diagnostic by
/// `make_unclosed_delims_error`.
#[derive(Clone, Debug)]
pub(crate) struct UnmatchedDelim {
    /// The closing delimiter that was actually encountered, if any.
    pub found_delim: Option<Delimiter>,
    /// Where that closing delimiter (or the point of mismatch) was found.
    pub found_span: Span,
    /// Span of the opening delimiter left unclosed, if known.
    pub unclosed_span: Option<Span>,
    /// A span that may hold the intended matching delimiter, if known.
    pub candidate_span: Option<Span>,
}
46
47pub(crate) fn lex_token_trees<'psess, 'src>(
48    psess: &'psess ParseSess,
49    mut src: &'src str,
50    mut start_pos: BytePos,
51    override_span: Option<Span>,
52) -> Result<TokenStream, Vec<Diag<'psess>>> {
53    // Skip `#!`, if present.
54    if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
55        src = &src[shebang_len..];
56        start_pos = start_pos + BytePos::from_usize(shebang_len);
57    }
58
59    let cursor = Cursor::new(src, FrontmatterAllowed::Yes);
60    let mut lexer = Lexer {
61        psess,
62        start_pos,
63        pos: start_pos,
64        src,
65        cursor,
66        override_span,
67        nbsp_is_whitespace: false,
68        last_lifetime: None,
69        token: Token::dummy(),
70        diag_info: TokenTreeDiagInfo::default(),
71    };
72    let res = lexer.lex_token_trees(/* is_delimited */ false);
73
74    let mut unmatched_delims: Vec<_> = lexer
75        .diag_info
76        .unmatched_delims
77        .into_iter()
78        .filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
79        .collect();
80
81    match res {
82        Ok((_open_spacing, stream)) => {
83            if unmatched_delims.is_empty() {
84                Ok(stream)
85            } else {
86                // Return error if there are unmatched delimiters or unclosed delimiters.
87                Err(unmatched_delims)
88            }
89        }
90        Err(errs) => {
91            // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
92            // because the delimiter mismatch is more likely to be the root cause of error
93            unmatched_delims.extend(errs);
94            Err(unmatched_delims)
95        }
96    }
97}
98
/// The lexer state: wraps a low-level `rustc_lexer::Cursor` whose raw tokens
/// are "cooked" into rich `rustc_ast` tokens, emitting diagnostics on the way.
struct Lexer<'psess, 'src> {
    psess: &'psess ParseSess,
    /// Initial position, read-only.
    start_pos: BytePos,
    /// The absolute offset within the source_map of the current character.
    pos: BytePos,
    /// Source text to tokenize.
    src: &'src str,
    /// Cursor for getting lexer tokens.
    cursor: Cursor<'src>,
    /// If set, `mk_sp` returns this span instead of computing one.
    override_span: Option<Span>,
    /// When an "unknown start of token: \u{a0}" has already been emitted earlier
    /// in this file, it's safe to treat further occurrences of the non-breaking
    /// space character as whitespace.
    nbsp_is_whitespace: bool,

    /// Track the `Span` for the leading `'` of the last lifetime. Used for
    /// diagnostics to detect possible typo where `"` was meant.
    last_lifetime: Option<Span>,

    /// The current token.
    token: Token,

    /// Bookkeeping used by delimiter-matching diagnostics.
    diag_info: TokenTreeDiagInfo,
}
124
125impl<'psess, 'src> Lexer<'psess, 'src> {
    /// Convenience accessor for the diagnostic context.
    fn dcx(&self) -> DiagCtxtHandle<'psess> {
        self.psess.dcx()
    }
129
130    fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
131        self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
132    }
133
    /// Returns the next token, paired with a bool indicating if the token was
    /// preceded by whitespace.
    ///
    /// Trivia (whitespace and non-doc comments) is consumed inside the loop;
    /// it only flips the whitespace flag reported with the following token.
    fn next_token_from_cursor(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        // Count of further identical invalid tokens to consume silently,
        // set below when a run of repeated unknown characters is diagnosed.
        let mut swallow_next_invalid = 0;
        // Skip trivial (whitespace & comments) tokens
        loop {
            // Remember the remaining source before advancing: some arms below
            // rewind the cursor to re-lex part of the token just consumed.
            let str_before = self.cursor.as_str();
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

            if let rustc_lexer::TokenKind::Semi
            | rustc_lexer::TokenKind::LineComment { .. }
            | rustc_lexer::TokenKind::BlockComment { .. }
            | rustc_lexer::TokenKind::CloseParen
            | rustc_lexer::TokenKind::CloseBrace
            | rustc_lexer::TokenKind::CloseBracket = token.kind
            {
                // Heuristic: we assume that it is unlikely we're dealing with an unterminated
                // string surrounded by single quotes.
                self.last_lifetime = None;
            }

            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
            // additional validation.
            let kind = match token.kind {
                rustc_lexer::TokenKind::LineComment { doc_style } => {
                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 is not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content = self.str_from(content_start);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
                }
                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                    if !terminated {
                        self.report_unterminated_block_comment(start, doc_style);
                    }

                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 and closing delimiter of the length 2
                    // are not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
                    let content = self.str_from_to(content_start, content_end);
                    self.lint_doc_comment_unicode_text_flow(start, content);
                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
                }
                rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
                    // Frontmatter is validated and discarded; it never becomes a token.
                    self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Ident => self.ident(start),
                rustc_lexer::TokenKind::RawIdent => {
                    // Skip the `r#` prefix when interning the identifier text.
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
                    let span = self.mk_sp(start, self.pos);
                    self.psess.symbol_gallery.insert(sym, span);
                    if !sym.can_be_raw() {
                        self.dcx().emit_err(errors::CannotBeRawIdent { span, ident: sym });
                    }
                    self.psess.raw_identifier_spans.push(span);
                    token::Ident(sym, IdentIsRaw::Yes)
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    self.report_unknown_prefix(start);
                    self.ident(start)
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
                    self.report_unknown_prefix(start);
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::InvalidIdent
                    // Do not recover an identifier with emoji if the codepoint is a confusable
                    // with a recoverable substitution token, like `➖`.
                    if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
                        let sym = self.str_from(start);
                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
                    }) =>
                {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.psess
                        .bad_unicode_identifiers
                        .borrow_mut()
                        .entry(sym)
                        .or_default()
                        .push(span);
                    token::Ident(sym, IdentIsRaw::No)
                }
                // split up (raw) c string literals to an ident and a string literal when edition <
                // 2021.
                rustc_lexer::TokenKind::Literal {
                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                    suffix_start: _,
                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
                    let prefix_len = match kind {
                        LiteralKind::CStr { .. } => 1,
                        LiteralKind::RawCStr { .. } => 2,
                        _ => unreachable!(),
                    };

                    // reset the state so that only the prefix ("c" or "cr")
                    // was consumed.
                    let lit_start = start + BytePos(prefix_len);
                    self.pos = lit_start;
                    self.cursor = Cursor::new(&str_before[prefix_len as usize..], FrontmatterAllowed::No);
                    self.report_unknown_prefix(start);
                    let prefix_span = self.mk_sp(start, lit_start);
                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    self.maybe_report_guarded_str(start, str_before)
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
                    // Anything between the cooked literal and the token end is a suffix.
                    let suffix = if suffix_start < self.pos {
                        let string = self.str_from(suffix_start);
                        if string == "_" {
                            self.dcx().emit_err(errors::UnderscoreLiteralSuffix {
                                span: self.mk_sp(suffix_start, self.pos),
                            });
                            None
                        } else {
                            Some(Symbol::intern(string))
                        }
                    } else {
                        None
                    };
                    self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
                    token::Literal(token::Lit { kind, symbol, suffix })
                }
                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
                    if starts_with_number {
                        let span = self.mk_sp(start, self.pos);
                        // Stashed rather than emitted: a later parse stage may
                        // recover this as a char literal instead.
                        self.dcx()
                            .struct_err("lifetimes cannot start with a number")
                            .with_span(span)
                            .stash(span, StashKey::LifetimeIsChar);
                    }
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident, IdentIsRaw::No)
                }
                rustc_lexer::TokenKind::RawLifetime => {
                    self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));

                    let ident_start = start + BytePos(3);
                    let prefix_span = self.mk_sp(start, ident_start);

                    if prefix_span.at_least_rust_2021() {
                        // If the raw lifetime is followed by \' then treat it a normal
                        // lifetime followed by a \', which is to interpret it as a character
                        // literal. In this case, it's always an invalid character literal
                        // since the literal must necessarily have >3 characters (r#...) inside
                        // of it, which is invalid.
                        if self.cursor.as_str().starts_with('\'') {
                            let lit_span = self.mk_sp(start, self.pos + BytePos(1));
                            let contents = self.str_from_to(start + BytePos(1), self.pos);
                            emit_unescape_error(
                                self.dcx(),
                                contents,
                                lit_span,
                                lit_span,
                                Mode::Char,
                                0..contents.len(),
                                EscapeError::MoreThanOneChar,
                            )
                            .expect("expected error");
                        }

                        let span = self.mk_sp(start, self.pos);

                        let lifetime_name_without_tick =
                            Symbol::intern(&self.str_from(ident_start));
                        if !lifetime_name_without_tick.can_be_raw() {
                            self.dcx().emit_err(
                                errors::CannotBeRawLifetime {
                                    span,
                                    ident: lifetime_name_without_tick
                                }
                            );
                        }

                        // Put the `'` back onto the lifetime name.
                        let mut lifetime_name =
                            String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
                        lifetime_name.push('\'');
                        lifetime_name += lifetime_name_without_tick.as_str();
                        let sym = Symbol::intern(&lifetime_name);

                        // Make sure we mark this as a raw identifier.
                        self.psess.raw_identifier_spans.push(span);

                        token::Lifetime(sym, IdentIsRaw::Yes)
                    } else {
                        // Otherwise, this should be parsed like `'r`. Warn about it though.
                        self.psess.buffer_lint(
                            RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                            prefix_span,
                            ast::CRATE_NODE_ID,
                            BuiltinLintDiag::RawPrefix(prefix_span),
                        );

                        // Reset the state so we just lex the `'r`.
                        let lt_start = start + BytePos(2);
                        self.pos = lt_start;
                        self.cursor = Cursor::new(&str_before[2 as usize..], FrontmatterAllowed::No);

                        let lifetime_name = self.str_from(start);
                        let ident = Symbol::intern(lifetime_name);
                        token::Lifetime(ident, IdentIsRaw::No)
                    }
                }
                rustc_lexer::TokenKind::Semi => token::Semi,
                rustc_lexer::TokenKind::Comma => token::Comma,
                rustc_lexer::TokenKind::Dot => token::Dot,
                rustc_lexer::TokenKind::OpenParen => token::OpenParen,
                rustc_lexer::TokenKind::CloseParen => token::CloseParen,
                rustc_lexer::TokenKind::OpenBrace => token::OpenBrace,
                rustc_lexer::TokenKind::CloseBrace => token::CloseBrace,
                rustc_lexer::TokenKind::OpenBracket => token::OpenBracket,
                rustc_lexer::TokenKind::CloseBracket => token::CloseBracket,
                rustc_lexer::TokenKind::At => token::At,
                rustc_lexer::TokenKind::Pound => token::Pound,
                rustc_lexer::TokenKind::Tilde => token::Tilde,
                rustc_lexer::TokenKind::Question => token::Question,
                rustc_lexer::TokenKind::Colon => token::Colon,
                rustc_lexer::TokenKind::Dollar => token::Dollar,
                rustc_lexer::TokenKind::Eq => token::Eq,
                rustc_lexer::TokenKind::Bang => token::Bang,
                rustc_lexer::TokenKind::Lt => token::Lt,
                rustc_lexer::TokenKind::Gt => token::Gt,
                rustc_lexer::TokenKind::Minus => token::Minus,
                rustc_lexer::TokenKind::And => token::And,
                rustc_lexer::TokenKind::Or => token::Or,
                rustc_lexer::TokenKind::Plus => token::Plus,
                rustc_lexer::TokenKind::Star => token::Star,
                rustc_lexer::TokenKind::Slash => token::Slash,
                rustc_lexer::TokenKind::Caret => token::Caret,
                rustc_lexer::TokenKind::Percent => token::Percent,

                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                    // Don't emit diagnostics for sequences of the same invalid token
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // If an error has already been reported on non-breaking
                        // space characters earlier in the file, treat all
                        // subsequent occurrences as whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    // FIXME: the lexer could be used to turn the ASCII version of unicode
                    // homoglyphs, instead of keeping a table in `check_for_substitution`into the
                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
                    // fancier error recovery to it, as there will be less overall work to do this
                    // way.
                    let (token, sugg) =
                        unicode_chars::check_for_substitution(self, start, c, repeats + 1);
                    self.dcx().emit_err(errors::UnknownTokenStart {
                        span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
                        escaped: escaped_char(c),
                        sugg,
                        null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
                        repeat: if repeats > 0 {
                            swallow_next_invalid = repeats;
                            Some(errors::UnknownTokenRepeat { repeats })
                        } else {
                            None
                        },
                    });

                    if let Some(token) = token {
                        token
                    } else {
                        preceded_by_whitespace = true;
                        continue;
                    }
                }
                rustc_lexer::TokenKind::Eof => token::Eof,
            };
            let span = self.mk_sp(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }
460
461    fn ident(&self, start: BytePos) -> TokenKind {
462        let sym = nfc_normalize(self.str_from(start));
463        let span = self.mk_sp(start, self.pos);
464        self.psess.symbol_gallery.insert(sym, span);
465        token::Ident(sym, IdentIsRaw::No)
466    }
467
468    /// Detect usages of Unicode codepoints changing the direction of the text on screen and loudly
469    /// complain about it.
470    fn lint_unicode_text_flow(&self, start: BytePos) {
471        // Opening delimiter of the length 2 is not included into the comment text.
472        let content_start = start + BytePos(2);
473        let content = self.str_from(content_start);
474        if contains_text_flow_control_chars(content) {
475            let span = self.mk_sp(start, self.pos);
476            self.psess.buffer_lint(
477                TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
478                span,
479                ast::CRATE_NODE_ID,
480                BuiltinLintDiag::UnicodeTextFlow(span, content.to_string()),
481            );
482        }
483    }
484
485    fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
486        if contains_text_flow_control_chars(content) {
487            self.report_text_direction_codepoint(
488                content,
489                self.mk_sp(start, self.pos),
490                0,
491                false,
492                "doc comment",
493            );
494        }
495    }
496
497    fn lint_literal_unicode_text_flow(
498        &mut self,
499        text: Symbol,
500        lit_kind: token::LitKind,
501        span: Span,
502        label: &'static str,
503    ) {
504        if !contains_text_flow_control_chars(text.as_str()) {
505            return;
506        }
507        let (padding, point_at_inner_spans) = match lit_kind {
508            // account for `"` or `'`
509            token::LitKind::Str | token::LitKind::Char => (1, true),
510            // account for `c"`
511            token::LitKind::CStr => (2, true),
512            // account for `r###"`
513            token::LitKind::StrRaw(n) => (n as u32 + 2, true),
514            // account for `cr###"`
515            token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
516            // suppress bad literals.
517            token::LitKind::Err(_) => return,
518            // Be conservative just in case new literals do support these.
519            _ => (0, false),
520        };
521        self.report_text_direction_codepoint(
522            text.as_str(),
523            span,
524            padding,
525            point_at_inner_spans,
526            label,
527        );
528    }
529
530    fn report_text_direction_codepoint(
531        &self,
532        text: &str,
533        span: Span,
534        padding: u32,
535        point_at_inner_spans: bool,
536        label: &str,
537    ) {
538        // Obtain the `Span`s for each of the forbidden chars.
539        let spans: Vec<_> = text
540            .char_indices()
541            .filter_map(|(i, c)| {
542                TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
543                    let lo = span.lo() + BytePos(i as u32 + padding);
544                    (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
545                })
546            })
547            .collect();
548
549        let count = spans.len();
550        let labels = point_at_inner_spans.then_some(spans.clone());
551
552        self.psess.buffer_lint(
553            TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
554            span,
555            ast::CRATE_NODE_ID,
556            BuiltinLintDiag::HiddenUnicodeCodepoints {
557                label: label.to_string(),
558                count,
559                span_label: span,
560                labels,
561                escape: point_at_inner_spans && !spans.is_empty(),
562                spans,
563            },
564        );
565    }
566
    /// Validates a `---`-fenced frontmatter block already consumed by the
    /// low-level lexer: reports malformed opening/closing fences and gates
    /// the whole span behind the `frontmatter` feature.
    fn validate_frontmatter(
        &self,
        start: BytePos,
        has_invalid_preceding_whitespace: bool,
        invalid_infostring: bool,
    ) {
        let s = self.str_from(start);
        // The token may start with whitespace before the actual `---` fence.
        let real_start = s.find("---").unwrap();
        let frontmatter_opening_pos = BytePos(real_start as u32) + start;
        let s_new = &s[real_start..];
        let within = s_new.trim_start_matches('-');
        // Number of `-` characters in the opening fence.
        let len_opening = s_new.len() - within.len();

        let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
        if has_invalid_preceding_whitespace {
            // Point from the start of the offending line up to the fence.
            let line_start =
                BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
            let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
            let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
            self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        if invalid_infostring {
            // The infostring runs from the end of the opening fence to the
            // end of its line.
            let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
            let span = self.mk_sp(
                frontmatter_opening_end_pos,
                frontmatter_opening_pos + BytePos(line_end as u32),
            );
            self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
        }

        let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
        let last_line = &within[last_line_start..];
        let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
        let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);

        let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
        self.psess.gated_spans.gate(sym::frontmatter, frontmatter_span);

        if !last_line_trimmed.starts_with("---") {
            // No closing fence at all: report and bail out, since the checks
            // below all assume a closing fence exists.
            let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            self.dcx().emit_err(errors::FrontmatterUnclosed {
                span: frontmatter_span,
                note_span: label_span,
            });
            return;
        }

        if last_line_trimmed.len() != last_line.len() {
            // The closing fence is preceded by whitespace on its line.
            let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
            let span = self.mk_sp(last_line_start_pos, line_end);
            let whitespace_end =
                last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
            let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
            self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace {
                span,
                note_span: label_span,
            });
        }

        let rest = last_line_trimmed.trim_start_matches('-');
        let len_close = last_line_trimmed.len() - rest.len();
        if len_close != len_opening {
            // Opening and closing fences must use the same number of dashes.
            let span = self.mk_sp(frontmatter_opening_pos, self.pos);
            let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
            let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
            let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
            self.dcx().emit_err(errors::FrontmatterLengthMismatch {
                span,
                opening,
                close,
                len_opening,
                len_close,
            });
        }

        if !rest.trim_matches(is_whitespace).is_empty() {
            // Anything other than whitespace after the closing fence is an error.
            let span = self.mk_sp(last_line_start_pos, self.pos);
            self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
        }
    }
651
652    fn cook_doc_comment(
653        &self,
654        content_start: BytePos,
655        content: &str,
656        comment_kind: CommentKind,
657        doc_style: DocStyle,
658    ) -> TokenKind {
659        if content.contains('\r') {
660            for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
661                let span = self.mk_sp(
662                    content_start + BytePos(idx as u32),
663                    content_start + BytePos(idx as u32 + 1),
664                );
665                let block = matches!(comment_kind, CommentKind::Block);
666                self.dcx().emit_err(errors::CrDocComment { span, block });
667            }
668        }
669
670        let attr_style = match doc_style {
671            DocStyle::Outer => AttrStyle::Outer,
672            DocStyle::Inner => AttrStyle::Inner,
673        };
674
675        token::DocComment(comment_kind, attr_style, Symbol::intern(content))
676    }
677
    /// Turns a raw `rustc_lexer` literal spanning `start..end` into a parser
    /// `token::LitKind` plus its interned text, emitting diagnostics for
    /// unterminated literals, invalid raw-string delimiters, and malformed
    /// numeric literals along the way. Unterminated quoted literals are fatal.
    fn cook_lexer_literal(
        &self,
        start: BytePos,
        end: BytePos,
        kind: rustc_lexer::LiteralKind,
    ) -> (token::LitKind, Symbol) {
        match kind {
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    let mut err = self
                        .dcx()
                        .struct_span_fatal(self.mk_sp(start, end), "unterminated character literal")
                        .with_code(E0762);
                    // If the previous token was a lifetime, the user likely
                    // meant a string literal: suggest double quotes at both
                    // the lifetime's position and this opening quote.
                    if let Some(lt_sp) = self.last_lifetime {
                        err.multipart_suggestion(
                            "if you meant to write a string literal, use double quotes",
                            vec![
                                (lt_sp, "\"".to_string()),
                                (self.mk_sp(start, start + BytePos(1)), "\"".to_string()),
                            ],
                            Applicability::MaybeIncorrect,
                        );
                    }
                    err.emit()
                }
                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            // Skip the `b` prefix when pointing at the literal.
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated byte constant",
                        )
                        .with_code(E0763)
                        .emit()
                }
                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            self.mk_sp(start, end),
                            "unterminated double quote string",
                        )
                        .with_code(E0765)
                        .emit()
                }
                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            // Skip the `b` prefix when pointing at the literal.
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated double quote byte string",
                        )
                        .with_code(E0766)
                        .emit()
                }
                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
                // b" "
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    self.dcx()
                        .struct_span_fatal(
                            // Skip the `c` prefix when pointing at the literal.
                            self.mk_sp(start + BytePos(1), end),
                            "unterminated C string",
                        )
                        .with_code(E0767)
                        .emit()
                }
                self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
            }
            // For the raw variants, `n_hashes == None` means the delimiters
            // were invalid; diagnose and abort via `report_raw_str_error`.
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::StrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
                // r##" "##
                } else {
                    self.report_raw_str_error(start, 1);
                }
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::ByteStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
                // br##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::CStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
                // cr##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::Int { base, empty_int } => {
                let mut kind = token::Integer;
                if empty_int {
                    // e.g. `0x` with no digits after the base prefix.
                    let span = self.mk_sp(start, end);
                    let guar = self.dcx().emit_err(errors::NoDigitsLiteral { span });
                    kind = token::Err(guar);
                } else if matches!(base, Base::Binary | Base::Octal) {
                    // Binary/octal digits are a strict subset of what the
                    // lexer accepted, so re-validate each digit here.
                    let base = base as u32;
                    let s = self.str_from_to(start + BytePos(2), end);
                    for (idx, c) in s.char_indices() {
                        let span = self.mk_sp(
                            // `+ 2` skips the `0b`/`0o` prefix.
                            start + BytePos::from_usize(2 + idx),
                            start + BytePos::from_usize(2 + idx + c.len_utf8()),
                        );
                        if c != '_' && c.to_digit(base).is_none() {
                            let guar =
                                self.dcx().emit_err(errors::InvalidDigitLiteral { span, base });
                            kind = token::Err(guar);
                        }
                    }
                }
                (kind, self.symbol_from_to(start, end))
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                let mut kind = token::Float;
                if empty_exponent {
                    // e.g. `1e` with nothing after the exponent marker.
                    let span = self.mk_sp(start, self.pos);
                    let guar = self.dcx().emit_err(errors::EmptyExponentFloat { span });
                    kind = token::Err(guar);
                }
                // Floats only support decimal; name the offending base.
                let base = match base {
                    Base::Hexadecimal => Some("hexadecimal"),
                    Base::Octal => Some("octal"),
                    Base::Binary => Some("binary"),
                    _ => None,
                };
                if let Some(base) = base {
                    let span = self.mk_sp(start, end);
                    let guar =
                        self.dcx().emit_err(errors::FloatLiteralUnsupportedBase { span, base });
                    kind = token::Err(guar)
                }
                (kind, self.symbol_from_to(start, end))
            }
        }
    }
830
    /// Byte offset of `pos` relative to the start of `self.src`.
    #[inline]
    fn src_index(&self, pos: BytePos) -> usize {
        (pos - self.start_pos).to_usize()
    }
835
836    /// Slice of the source text from `start` up to but excluding `self.pos`,
837    /// meaning the slice does not include the character `self.ch`.
838    fn str_from(&self, start: BytePos) -> &'src str {
839        self.str_from_to(start, self.pos)
840    }
841
    /// Interns the source text spanning from `start` up to but excluding `end`.
    fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
        debug!("taking an ident from {:?} to {:?}", start, end);
        Symbol::intern(self.str_from_to(start, end))
    }
847
848    /// Slice of the source text spanning from `start` up to but excluding `end`.
849    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
850        &self.src[self.src_index(start)..self.src_index(end)]
851    }
852
853    /// Slice of the source text spanning from `start` until the end
854    fn str_from_to_end(&self, start: BytePos) -> &'src str {
855        &self.src[self.src_index(start)..]
856    }
857
858    fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
859        match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
860            Err(RawStrError::InvalidStarter { bad_char }) => {
861                self.report_non_started_raw_string(start, bad_char)
862            }
863            Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
864                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
865            Err(RawStrError::TooManyDelimiters { found }) => {
866                self.report_too_many_hashes(start, found)
867            }
868            Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
869        }
870    }
871
872    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
873        self.dcx()
874            .struct_span_fatal(
875                self.mk_sp(start, self.pos),
876                format!(
877                    "found invalid character; only `#` is allowed in raw string delimitation: {}",
878                    escaped_char(bad_char)
879                ),
880            )
881            .emit()
882    }
883
884    fn report_unterminated_raw_string(
885        &self,
886        start: BytePos,
887        n_hashes: u32,
888        possible_offset: Option<u32>,
889        found_terminators: u32,
890    ) -> ! {
891        let mut err =
892            self.dcx().struct_span_fatal(self.mk_sp(start, start), "unterminated raw string");
893        err.code(E0748);
894        err.span_label(self.mk_sp(start, start), "unterminated raw string");
895
896        if n_hashes > 0 {
897            err.note(format!(
898                "this raw string should be terminated with `\"{}`",
899                "#".repeat(n_hashes as usize)
900            ));
901        }
902
903        if let Some(possible_offset) = possible_offset {
904            let lo = start + BytePos(possible_offset);
905            let hi = lo + BytePos(found_terminators);
906            let span = self.mk_sp(lo, hi);
907            err.span_suggestion(
908                span,
909                "consider terminating the string here",
910                "#".repeat(n_hashes as usize),
911                Applicability::MaybeIncorrect,
912            );
913        }
914
915        err.emit()
916    }
917
    /// Reports a fatal "unterminated block comment" error. If the comment body
    /// contains nested `/* ... */` pairs, also points at the last complete
    /// nested comment, since the user probably intended to close the outer one.
    fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
        let msg = match doc_style {
            Some(_) => "unterminated block doc-comment",
            None => "unterminated block comment",
        };
        let last_bpos = self.pos;
        let mut err = self.dcx().struct_span_fatal(self.mk_sp(start, last_bpos), msg);
        err.code(E0758);
        // Stack of byte offsets of `/*` openers seen so far, and the
        // (open, close) offsets of the most recently completed nested pair.
        let mut nested_block_comment_open_idxs = vec![];
        let mut last_nested_block_comment_idxs = None;
        let mut content_chars = self.str_from(start).char_indices().peekable();

        // Scan for `/*` and `*/` by looking at each character together with
        // the one following it; `idx` is the offset of the first character.
        while let Some((idx, current_char)) = content_chars.next() {
            match content_chars.peek() {
                Some((_, '*')) if current_char == '/' => {
                    nested_block_comment_open_idxs.push(idx);
                }
                Some((_, '/')) if current_char == '*' => {
                    last_nested_block_comment_idxs =
                        nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
                }
                _ => {}
            };
        }

        if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
            // Label the outer opener plus the nested pair's two-byte
            // `/*` and `*/` delimiters.
            err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_open_idx as u32),
                        start + BytePos(nested_open_idx as u32 + 2),
                    ),
                    "...as last nested comment starts here, maybe you want to close this instead?",
                )
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_close_idx as u32),
                        start + BytePos(nested_close_idx as u32 + 2),
                    ),
                    "...and last nested comment terminates here.",
                );
        }

        err.emit();
    }
963
    // RFC 3101 introduced the idea of (reserved) prefixes. As of Rust 2021,
    // using a (unknown) prefix is an error. In earlier editions, however, they
    // only result in a (allowed by default) lint, and are treated as regular
    // identifier tokens.
    //
    // The prefix being reported is the text from `start` up to `self.pos`.
    fn report_unknown_prefix(&self, start: BytePos) {
        let prefix_span = self.mk_sp(start, self.pos);
        let prefix = self.str_from_to(start, self.pos);
        let expn_data = prefix_span.ctxt().outer_expn_data();

        if expn_data.edition.at_least_rust_2021() {
            // In Rust 2021, this is a hard error.
            let sugg = if prefix == "rb" {
                // Likely a typo for a raw byte string (`br"..."`).
                Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
            } else if prefix == "rc" {
                // Likely a typo for a raw C string (`cr"..."`).
                Some(errors::UnknownPrefixSugg::UseCr(prefix_span))
            } else if expn_data.is_root() {
                // A single-quoted, single-line sequence following a lifetime
                // suggests the user meant a string literal with the wrong
                // quote characters.
                if self.cursor.first() == '\''
                    && let Some(start) = self.last_lifetime
                    && self.cursor.third() != '\''
                    && let end = self.mk_sp(self.pos, self.pos + BytePos(1))
                    && !self.psess.source_map().is_multiline(start.until(end))
                {
                    // FIXME: An "unclosed `char`" error will be emitted already in some cases,
                    // but it's hard to silence this error while not also silencing important cases
                    // too. We should use the error stashing machinery instead.
                    Some(errors::UnknownPrefixSugg::MeantStr { start, end })
                } else {
                    Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
                }
            } else {
                None
            };
            self.dcx().emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
        } else {
            // Before Rust 2021, only emit a lint for migration.
            self.psess.buffer_lint(
                RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                prefix_span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedPrefix(prefix_span, prefix.to_string()),
            );
        }
    }
1007
    /// Detect guarded string literal syntax
    ///
    /// RFC 3593 reserved this syntax for future use. As of Rust 2024,
    /// using this syntax produces an error. In earlier editions, however, it
    /// only results in an (allowed by default) lint, and is treated as
    /// separate tokens.
    ///
    /// `start` is the position of the first `#`; `str_before` is the source
    /// text from that position onward.
    fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
        let span = self.mk_sp(start, self.pos);
        let edition2024 = span.edition().at_least_rust_2024();

        // A zero-width span after the leading `#`, used by suggestions that
        // insert whitespace between the `#` and the string.
        let space_pos = start + BytePos(1);
        let space_span = self.mk_sp(space_pos, space_pos);

        // Re-lex from `start` with a fresh cursor to see whether a guarded
        // string (`#"..."#`-style) actually follows.
        let mut cursor = Cursor::new(str_before, FrontmatterAllowed::No);

        let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
            Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
                let end = start + BytePos(token_len);
                let span = self.mk_sp(start, end);
                let str_start = start + BytePos(n_hashes);

                // Only in Edition 2024 do we consume the guarded string as a
                // single token; earlier editions roll back below.
                if edition2024 {
                    self.cursor = cursor;
                    self.pos = end;
                }

                let unterminated = if terminated { None } else { Some(str_start) };

                (true, span, unterminated)
            }
            None => {
                // We should only get here in the `##+` case.
                debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

                (false, span, None)
            }
        };
        if edition2024 {
            if let Some(str_start) = unterminated {
                // Only a fatal error if string is unterminated.
                self.dcx()
                    .struct_span_fatal(
                        self.mk_sp(str_start, self.pos),
                        "unterminated double quote string",
                    )
                    .with_code(E0765)
                    .emit()
            }

            // No suggestion inside macro-expanded code.
            let sugg = if span.from_expansion() {
                None
            } else {
                Some(errors::GuardedStringSugg(space_span))
            };

            // In Edition 2024 and later, emit a hard error.
            let err = if is_string {
                self.dcx().emit_err(errors::ReservedString { span, sugg })
            } else {
                self.dcx().emit_err(errors::ReservedMultihash { span, sugg })
            };

            token::Literal(token::Lit {
                kind: token::Err(err),
                symbol: self.symbol_from_to(start, self.pos),
                suffix: None,
            })
        } else {
            // Before Rust 2024, only emit a lint for migration.
            self.psess.buffer_lint(
                RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
                span,
                ast::CRATE_NODE_ID,
                BuiltinLintDiag::ReservedString { is_string, suggestion: space_span },
            );

            // For backwards compatibility, roll back to after just the first `#`
            // and return the `Pound` token.
            self.pos = start + BytePos(1);
            self.cursor = Cursor::new(&str_before[1..], FrontmatterAllowed::No);
            token::Pound
        }
    }
1091
    /// Fatal error for a raw string whose delimiter uses more `#`s than the
    /// language allows. Never returns.
    fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
        self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
    }
1095
    /// Cooks a quoted literal spanning `start..end`: strips `prefix_len`
    /// bytes of opening delimiter (e.g. `b"` or `r##"`) and `postfix_len`
    /// bytes of closing delimiter, checks the content for escape errors, and
    /// interns the result. Any fatal escape error downgrades `kind` to
    /// `token::Err`.
    fn cook_quoted(
        &self,
        mut kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
    ) -> (token::LitKind, Symbol) {
        let content_start = start + BytePos(prefix_len);
        let content_end = end - BytePos(postfix_len);
        let lit_content = self.str_from_to(content_start, content_end);
        check_for_errors(lit_content, mode, |range, err| {
            let span_with_quotes = self.mk_sp(start, end);
            // `range` is relative to the literal content; shift it to
            // absolute positions for the error span.
            let (start, end) = (range.start as u32, range.end as u32);
            let lo = content_start + BytePos(start);
            let hi = lo + BytePos(end - start);
            let span = self.mk_sp(lo, hi);
            let is_fatal = err.is_fatal();
            if let Some(guar) = emit_unescape_error(
                self.dcx(),
                lit_content,
                span_with_quotes,
                span,
                mode,
                range,
                err,
            ) {
                assert!(is_fatal);
                kind = token::Err(guar);
            }
        });

        // We normally exclude the quotes for the symbol, but for errors we
        // include it because it results in clearer error messages.
        let sym = if !matches!(kind, token::Err(_)) {
            Symbol::intern(lit_content)
        } else {
            self.symbol_from_to(start, end)
        };
        (kind, sym)
    }
1138}
1139
1140pub fn nfc_normalize(string: &str) -> Symbol {
1141    use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
1142    match is_nfc_quick(string.chars()) {
1143        IsNormalized::Yes => Symbol::intern(string),
1144        _ => {
1145            let normalized_str: String = string.chars().nfc().collect();
1146            Symbol::intern(&normalized_str)
1147        }
1148    }
1149}