1use diagnostics::make_errors_for_mismatched_closing_delims;
2use rustc_ast::ast::{self, AttrStyle};
3use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
4use rustc_ast::tokenstream::TokenStream;
5use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
6use rustc_errors::codes::*;
7use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
8use rustc_lexer::{
9 Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
10};
11use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
12use rustc_session::lint::BuiltinLintDiag;
13use rustc_session::lint::builtin::{
14 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
15 TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
16};
17use rustc_session::parse::ParseSess;
18use rustc_span::{BytePos, Pos, Span, Symbol, sym};
19use tracing::debug;
20
21use crate::errors;
22use crate::lexer::diagnostics::TokenTreeDiagInfo;
23use crate::lexer::unicode_chars::UNICODE_ARRAY;
24
25mod diagnostics;
26mod tokentrees;
27mod unescape_error_reporting;
28mod unicode_chars;
29
30use unescape_error_reporting::{emit_unescape_error, escaped_char};
31
32#[cfg(target_pointer_width = "64")]
37rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
38
39#[derive(Clone, Debug)]
40pub(crate) struct UnmatchedDelim {
41 pub found_delim: Option<Delimiter>,
42 pub found_span: Span,
43 pub unclosed_span: Option<Span>,
44 pub candidate_span: Option<Span>,
45}
46
47pub(crate) fn lex_token_trees<'psess, 'src>(
48 psess: &'psess ParseSess,
49 mut src: &'src str,
50 mut start_pos: BytePos,
51 override_span: Option<Span>,
52) -> Result<TokenStream, Vec<Diag<'psess>>> {
53 if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
55 src = &src[shebang_len..];
56 start_pos = start_pos + BytePos::from_usize(shebang_len);
57 }
58
59 let cursor = Cursor::new(src, FrontmatterAllowed::Yes);
60 let mut lexer = Lexer {
61 psess,
62 start_pos,
63 pos: start_pos,
64 src,
65 cursor,
66 override_span,
67 nbsp_is_whitespace: false,
68 last_lifetime: None,
69 token: Token::dummy(),
70 diag_info: TokenTreeDiagInfo::default(),
71 };
72 let res = lexer.lex_token_trees(false);
73
74 let mut unmatched_closing_delims: Vec<_> =
75 make_errors_for_mismatched_closing_delims(&lexer.diag_info.unmatched_delims, psess);
76
77 match res {
78 Ok((_open_spacing, stream)) => {
79 if unmatched_closing_delims.is_empty() {
80 Ok(stream)
81 } else {
82 Err(unmatched_closing_delims)
84 }
85 }
86 Err(errs) => {
87 unmatched_closing_delims.extend(errs);
90 Err(unmatched_closing_delims)
91 }
92 }
93}
94
95struct Lexer<'psess, 'src> {
96 psess: &'psess ParseSess,
97 start_pos: BytePos,
99 pos: BytePos,
101 src: &'src str,
103 cursor: Cursor<'src>,
105 override_span: Option<Span>,
106 nbsp_is_whitespace: bool,
110
111 last_lifetime: Option<Span>,
114
115 token: Token,
117
118 diag_info: TokenTreeDiagInfo,
119}
120
121impl<'psess, 'src> Lexer<'psess, 'src> {
122 fn dcx(&self) -> DiagCtxtHandle<'psess> {
123 self.psess.dcx()
124 }
125
126 fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
127 self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
128 }
129
130 fn next_token_from_cursor(&mut self) -> (Token, bool) {
133 let mut preceded_by_whitespace = false;
134 let mut swallow_next_invalid = 0;
135 loop {
137 let str_before = self.cursor.as_str();
138 let token = self.cursor.advance_token();
139 let start = self.pos;
140 self.pos = self.pos + BytePos(token.len);
141
142 debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
143
144 if let rustc_lexer::TokenKind::Semi
145 | rustc_lexer::TokenKind::LineComment { .. }
146 | rustc_lexer::TokenKind::BlockComment { .. }
147 | rustc_lexer::TokenKind::CloseParen
148 | rustc_lexer::TokenKind::CloseBrace
149 | rustc_lexer::TokenKind::CloseBracket = token.kind
150 {
151 self.last_lifetime = None;
154 }
155
156 let kind = match token.kind {
160 rustc_lexer::TokenKind::LineComment { doc_style } => {
161 let Some(doc_style) = doc_style else {
163 self.lint_unicode_text_flow(start);
164 preceded_by_whitespace = true;
165 continue;
166 };
167
168 let content_start = start + BytePos(3);
170 let content = self.str_from(content_start);
171 self.lint_doc_comment_unicode_text_flow(start, content);
172 self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
173 }
174 rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
175 if !terminated {
176 self.report_unterminated_block_comment(start, doc_style);
177 }
178
179 let Some(doc_style) = doc_style else {
181 self.lint_unicode_text_flow(start);
182 preceded_by_whitespace = true;
183 continue;
184 };
185
186 let content_start = start + BytePos(3);
189 let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
190 let content = self.str_from_to(content_start, content_end);
191 self.lint_doc_comment_unicode_text_flow(start, content);
192 self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
193 }
194 rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
195 self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
196 preceded_by_whitespace = true;
197 continue;
198 }
199 rustc_lexer::TokenKind::Whitespace => {
200 preceded_by_whitespace = true;
201 continue;
202 }
203 rustc_lexer::TokenKind::Ident => self.ident(start),
204 rustc_lexer::TokenKind::RawIdent => {
205 let sym = nfc_normalize(self.str_from(start + BytePos(2)));
206 let span = self.mk_sp(start, self.pos);
207 self.psess.symbol_gallery.insert(sym, span);
208 if !sym.can_be_raw() {
209 self.dcx().emit_err(errors::CannotBeRawIdent { span, ident: sym });
210 }
211 self.psess.raw_identifier_spans.push(span);
212 token::Ident(sym, IdentIsRaw::Yes)
213 }
214 rustc_lexer::TokenKind::UnknownPrefix => {
215 self.report_unknown_prefix(start);
216 self.ident(start)
217 }
218 rustc_lexer::TokenKind::UnknownPrefixLifetime => {
219 self.report_unknown_prefix(start);
220 let lifetime_name = self.str_from(start);
224 self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
225 let ident = Symbol::intern(lifetime_name);
226 token::Lifetime(ident, IdentIsRaw::No)
227 }
228 rustc_lexer::TokenKind::InvalidIdent
229 if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
232 let sym = self.str_from(start);
233 sym.chars().count() == 1 && c == sym.chars().next().unwrap()
234 }) =>
235 {
236 let sym = nfc_normalize(self.str_from(start));
237 let span = self.mk_sp(start, self.pos);
238 self.psess
239 .bad_unicode_identifiers
240 .borrow_mut()
241 .entry(sym)
242 .or_default()
243 .push(span);
244 token::Ident(sym, IdentIsRaw::No)
245 }
246 rustc_lexer::TokenKind::Literal {
249 kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
250 suffix_start: _,
251 } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
252 let prefix_len = match kind {
253 LiteralKind::CStr { .. } => 1,
254 LiteralKind::RawCStr { .. } => 2,
255 _ => unreachable!(),
256 };
257
258 let lit_start = start + BytePos(prefix_len);
261 self.pos = lit_start;
262 self.cursor = Cursor::new(&str_before[prefix_len as usize..], FrontmatterAllowed::No);
263 self.report_unknown_prefix(start);
264 let prefix_span = self.mk_sp(start, lit_start);
265 return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
266 }
267 rustc_lexer::TokenKind::GuardedStrPrefix => {
268 self.maybe_report_guarded_str(start, str_before)
269 }
270 rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
271 let suffix_start = start + BytePos(suffix_start);
272 let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
273 let suffix = if suffix_start < self.pos {
274 let string = self.str_from(suffix_start);
275 if string == "_" {
276 self.dcx().emit_err(errors::UnderscoreLiteralSuffix {
277 span: self.mk_sp(suffix_start, self.pos),
278 });
279 None
280 } else {
281 Some(Symbol::intern(string))
282 }
283 } else {
284 None
285 };
286 self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
287 token::Literal(token::Lit { kind, symbol, suffix })
288 }
289 rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
290 let lifetime_name = self.str_from(start);
294 self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
295 if starts_with_number {
296 let span = self.mk_sp(start, self.pos);
297 self.dcx()
298 .struct_err("lifetimes cannot start with a number")
299 .with_span(span)
300 .stash(span, StashKey::LifetimeIsChar);
301 }
302 let ident = Symbol::intern(lifetime_name);
303 token::Lifetime(ident, IdentIsRaw::No)
304 }
305 rustc_lexer::TokenKind::RawLifetime => {
306 self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
307
308 let ident_start = start + BytePos(3);
309 let prefix_span = self.mk_sp(start, ident_start);
310
311 if prefix_span.at_least_rust_2021() {
312 if self.cursor.as_str().starts_with('\'') {
318 let lit_span = self.mk_sp(start, self.pos + BytePos(1));
319 let contents = self.str_from_to(start + BytePos(1), self.pos);
320 emit_unescape_error(
321 self.dcx(),
322 contents,
323 lit_span,
324 lit_span,
325 Mode::Char,
326 0..contents.len(),
327 EscapeError::MoreThanOneChar,
328 )
329 .expect("expected error");
330 }
331
332 let span = self.mk_sp(start, self.pos);
333
334 let lifetime_name_without_tick =
335 Symbol::intern(&self.str_from(ident_start));
336 if !lifetime_name_without_tick.can_be_raw() {
337 self.dcx().emit_err(
338 errors::CannotBeRawLifetime {
339 span,
340 ident: lifetime_name_without_tick
341 }
342 );
343 }
344
345 let mut lifetime_name =
347 String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
348 lifetime_name.push('\'');
349 lifetime_name += lifetime_name_without_tick.as_str();
350 let sym = Symbol::intern(&lifetime_name);
351
352 self.psess.raw_identifier_spans.push(span);
354
355 token::Lifetime(sym, IdentIsRaw::Yes)
356 } else {
357 self.psess.buffer_lint(
359 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
360 prefix_span,
361 ast::CRATE_NODE_ID,
362 BuiltinLintDiag::RawPrefix(prefix_span),
363 );
364
365 let lt_start = start + BytePos(2);
367 self.pos = lt_start;
368 self.cursor = Cursor::new(&str_before[2 as usize..], FrontmatterAllowed::No);
369
370 let lifetime_name = self.str_from(start);
371 let ident = Symbol::intern(lifetime_name);
372 token::Lifetime(ident, IdentIsRaw::No)
373 }
374 }
375 rustc_lexer::TokenKind::Semi => token::Semi,
376 rustc_lexer::TokenKind::Comma => token::Comma,
377 rustc_lexer::TokenKind::Dot => token::Dot,
378 rustc_lexer::TokenKind::OpenParen => token::OpenParen,
379 rustc_lexer::TokenKind::CloseParen => token::CloseParen,
380 rustc_lexer::TokenKind::OpenBrace => token::OpenBrace,
381 rustc_lexer::TokenKind::CloseBrace => token::CloseBrace,
382 rustc_lexer::TokenKind::OpenBracket => token::OpenBracket,
383 rustc_lexer::TokenKind::CloseBracket => token::CloseBracket,
384 rustc_lexer::TokenKind::At => token::At,
385 rustc_lexer::TokenKind::Pound => token::Pound,
386 rustc_lexer::TokenKind::Tilde => token::Tilde,
387 rustc_lexer::TokenKind::Question => token::Question,
388 rustc_lexer::TokenKind::Colon => token::Colon,
389 rustc_lexer::TokenKind::Dollar => token::Dollar,
390 rustc_lexer::TokenKind::Eq => token::Eq,
391 rustc_lexer::TokenKind::Bang => token::Bang,
392 rustc_lexer::TokenKind::Lt => token::Lt,
393 rustc_lexer::TokenKind::Gt => token::Gt,
394 rustc_lexer::TokenKind::Minus => token::Minus,
395 rustc_lexer::TokenKind::And => token::And,
396 rustc_lexer::TokenKind::Or => token::Or,
397 rustc_lexer::TokenKind::Plus => token::Plus,
398 rustc_lexer::TokenKind::Star => token::Star,
399 rustc_lexer::TokenKind::Slash => token::Slash,
400 rustc_lexer::TokenKind::Caret => token::Caret,
401 rustc_lexer::TokenKind::Percent => token::Percent,
402
403 rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
404 if swallow_next_invalid > 0 {
406 swallow_next_invalid -= 1;
407 continue;
408 }
409 let mut it = self.str_from_to_end(start).chars();
410 let c = it.next().unwrap();
411 if c == '\u{00a0}' {
412 if self.nbsp_is_whitespace {
416 preceded_by_whitespace = true;
417 continue;
418 }
419 self.nbsp_is_whitespace = true;
420 }
421 let repeats = it.take_while(|c1| *c1 == c).count();
422 let (token, sugg) =
429 unicode_chars::check_for_substitution(self, start, c, repeats + 1);
430 self.dcx().emit_err(errors::UnknownTokenStart {
431 span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
432 escaped: escaped_char(c),
433 sugg,
434 null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
435 repeat: if repeats > 0 {
436 swallow_next_invalid = repeats;
437 Some(errors::UnknownTokenRepeat { repeats })
438 } else {
439 None
440 },
441 });
442
443 if let Some(token) = token {
444 token
445 } else {
446 preceded_by_whitespace = true;
447 continue;
448 }
449 }
450 rustc_lexer::TokenKind::Eof => token::Eof,
451 };
452 let span = self.mk_sp(start, self.pos);
453 return (Token::new(kind, span), preceded_by_whitespace);
454 }
455 }
456
457 fn ident(&self, start: BytePos) -> TokenKind {
458 let sym = nfc_normalize(self.str_from(start));
459 let span = self.mk_sp(start, self.pos);
460 self.psess.symbol_gallery.insert(sym, span);
461 token::Ident(sym, IdentIsRaw::No)
462 }
463
464 fn lint_unicode_text_flow(&self, start: BytePos) {
467 let content_start = start + BytePos(2);
469 let content = self.str_from(content_start);
470 if contains_text_flow_control_chars(content) {
471 let span = self.mk_sp(start, self.pos);
472 self.psess.buffer_lint(
473 TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
474 span,
475 ast::CRATE_NODE_ID,
476 BuiltinLintDiag::UnicodeTextFlow(span, content.to_string()),
477 );
478 }
479 }
480
481 fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
482 if contains_text_flow_control_chars(content) {
483 self.report_text_direction_codepoint(
484 content,
485 self.mk_sp(start, self.pos),
486 0,
487 false,
488 "doc comment",
489 );
490 }
491 }
492
493 fn lint_literal_unicode_text_flow(
494 &mut self,
495 text: Symbol,
496 lit_kind: token::LitKind,
497 span: Span,
498 label: &'static str,
499 ) {
500 if !contains_text_flow_control_chars(text.as_str()) {
501 return;
502 }
503 let (padding, point_at_inner_spans) = match lit_kind {
504 token::LitKind::Str | token::LitKind::Char => (1, true),
506 token::LitKind::CStr => (2, true),
508 token::LitKind::StrRaw(n) => (n as u32 + 2, true),
510 token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
512 token::LitKind::Err(_) => return,
514 _ => (0, false),
516 };
517 self.report_text_direction_codepoint(
518 text.as_str(),
519 span,
520 padding,
521 point_at_inner_spans,
522 label,
523 );
524 }
525
526 fn report_text_direction_codepoint(
527 &self,
528 text: &str,
529 span: Span,
530 padding: u32,
531 point_at_inner_spans: bool,
532 label: &str,
533 ) {
534 let spans: Vec<_> = text
536 .char_indices()
537 .filter_map(|(i, c)| {
538 TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
539 let lo = span.lo() + BytePos(i as u32 + padding);
540 (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
541 })
542 })
543 .collect();
544
545 let count = spans.len();
546 let labels = point_at_inner_spans.then_some(spans.clone());
547
548 self.psess.buffer_lint(
549 TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
550 span,
551 ast::CRATE_NODE_ID,
552 BuiltinLintDiag::HiddenUnicodeCodepoints {
553 label: label.to_string(),
554 count,
555 span_label: span,
556 labels,
557 escape: point_at_inner_spans && !spans.is_empty(),
558 spans,
559 },
560 );
561 }
562
563 fn validate_frontmatter(
564 &self,
565 start: BytePos,
566 has_invalid_preceding_whitespace: bool,
567 invalid_infostring: bool,
568 ) {
569 let s = self.str_from(start);
570 let real_start = s.find("---").unwrap();
571 let frontmatter_opening_pos = BytePos(real_start as u32) + start;
572 let s_new = &s[real_start..];
573 let within = s_new.trim_start_matches('-');
574 let len_opening = s_new.len() - within.len();
575
576 let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
577 if has_invalid_preceding_whitespace {
578 let line_start =
579 BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
580 let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
581 let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
582 self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace {
583 span,
584 note_span: label_span,
585 });
586 }
587
588 if invalid_infostring {
589 let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
590 let span = self.mk_sp(
591 frontmatter_opening_end_pos,
592 frontmatter_opening_pos + BytePos(line_end as u32),
593 );
594 self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
595 }
596
597 let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
598 let last_line = &within[last_line_start..];
599 let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
600 let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);
601
602 let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
603 self.psess.gated_spans.gate(sym::frontmatter, frontmatter_span);
604
605 if !last_line_trimmed.starts_with("---") {
606 let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
607 self.dcx().emit_err(errors::FrontmatterUnclosed {
608 span: frontmatter_span,
609 note_span: label_span,
610 });
611 return;
612 }
613
614 if last_line_trimmed.len() != last_line.len() {
615 let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
616 let span = self.mk_sp(last_line_start_pos, line_end);
617 let whitespace_end =
618 last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
619 let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
620 self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace {
621 span,
622 note_span: label_span,
623 });
624 }
625
626 let rest = last_line_trimmed.trim_start_matches('-');
627 let len_close = last_line_trimmed.len() - rest.len();
628 if len_close != len_opening {
629 let span = self.mk_sp(frontmatter_opening_pos, self.pos);
630 let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
631 let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
632 let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
633 self.dcx().emit_err(errors::FrontmatterLengthMismatch {
634 span,
635 opening,
636 close,
637 len_opening,
638 len_close,
639 });
640 }
641
642 if !rest.trim_matches(is_whitespace).is_empty() {
643 let span = self.mk_sp(last_line_start_pos, self.pos);
644 self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
645 }
646 }
647
648 fn cook_doc_comment(
649 &self,
650 content_start: BytePos,
651 content: &str,
652 comment_kind: CommentKind,
653 doc_style: DocStyle,
654 ) -> TokenKind {
655 if content.contains('\r') {
656 for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
657 let span = self.mk_sp(
658 content_start + BytePos(idx as u32),
659 content_start + BytePos(idx as u32 + 1),
660 );
661 let block = matches!(comment_kind, CommentKind::Block);
662 self.dcx().emit_err(errors::CrDocComment { span, block });
663 }
664 }
665
666 let attr_style = match doc_style {
667 DocStyle::Outer => AttrStyle::Outer,
668 DocStyle::Inner => AttrStyle::Inner,
669 };
670
671 token::DocComment(comment_kind, attr_style, Symbol::intern(content))
672 }
673
674 fn cook_lexer_literal(
675 &self,
676 start: BytePos,
677 end: BytePos,
678 kind: rustc_lexer::LiteralKind,
679 ) -> (token::LitKind, Symbol) {
680 match kind {
681 rustc_lexer::LiteralKind::Char { terminated } => {
682 if !terminated {
683 let mut err = self
684 .dcx()
685 .struct_span_fatal(self.mk_sp(start, end), "unterminated character literal")
686 .with_code(E0762);
687 if let Some(lt_sp) = self.last_lifetime {
688 err.multipart_suggestion(
689 "if you meant to write a string literal, use double quotes",
690 vec![
691 (lt_sp, "\"".to_string()),
692 (self.mk_sp(start, start + BytePos(1)), "\"".to_string()),
693 ],
694 Applicability::MaybeIncorrect,
695 );
696 }
697 err.emit()
698 }
699 self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) }
701 rustc_lexer::LiteralKind::Byte { terminated } => {
702 if !terminated {
703 self.dcx()
704 .struct_span_fatal(
705 self.mk_sp(start + BytePos(1), end),
706 "unterminated byte constant",
707 )
708 .with_code(E0763)
709 .emit()
710 }
711 self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) }
713 rustc_lexer::LiteralKind::Str { terminated } => {
714 if !terminated {
715 self.dcx()
716 .struct_span_fatal(
717 self.mk_sp(start, end),
718 "unterminated double quote string",
719 )
720 .with_code(E0765)
721 .emit()
722 }
723 self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) }
725 rustc_lexer::LiteralKind::ByteStr { terminated } => {
726 if !terminated {
727 self.dcx()
728 .struct_span_fatal(
729 self.mk_sp(start + BytePos(1), end),
730 "unterminated double quote byte string",
731 )
732 .with_code(E0766)
733 .emit()
734 }
735 self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
736 }
738 rustc_lexer::LiteralKind::CStr { terminated } => {
739 if !terminated {
740 self.dcx()
741 .struct_span_fatal(
742 self.mk_sp(start + BytePos(1), end),
743 "unterminated C string",
744 )
745 .with_code(E0767)
746 .emit()
747 }
748 self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) }
750 rustc_lexer::LiteralKind::RawStr { n_hashes } => {
751 if let Some(n_hashes) = n_hashes {
752 let n = u32::from(n_hashes);
753 let kind = token::StrRaw(n_hashes);
754 self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
755 } else {
757 self.report_raw_str_error(start, 1);
758 }
759 }
760 rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
761 if let Some(n_hashes) = n_hashes {
762 let n = u32::from(n_hashes);
763 let kind = token::ByteStrRaw(n_hashes);
764 self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
765 } else {
767 self.report_raw_str_error(start, 2);
768 }
769 }
770 rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
771 if let Some(n_hashes) = n_hashes {
772 let n = u32::from(n_hashes);
773 let kind = token::CStrRaw(n_hashes);
774 self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
775 } else {
777 self.report_raw_str_error(start, 2);
778 }
779 }
780 rustc_lexer::LiteralKind::Int { base, empty_int } => {
781 let mut kind = token::Integer;
782 if empty_int {
783 let span = self.mk_sp(start, end);
784 let guar = self.dcx().emit_err(errors::NoDigitsLiteral { span });
785 kind = token::Err(guar);
786 } else if matches!(base, Base::Binary | Base::Octal) {
787 let base = base as u32;
788 let s = self.str_from_to(start + BytePos(2), end);
789 for (idx, c) in s.char_indices() {
790 let span = self.mk_sp(
791 start + BytePos::from_usize(2 + idx),
792 start + BytePos::from_usize(2 + idx + c.len_utf8()),
793 );
794 if c != '_' && c.to_digit(base).is_none() {
795 let guar =
796 self.dcx().emit_err(errors::InvalidDigitLiteral { span, base });
797 kind = token::Err(guar);
798 }
799 }
800 }
801 (kind, self.symbol_from_to(start, end))
802 }
803 rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
804 let mut kind = token::Float;
805 if empty_exponent {
806 let span = self.mk_sp(start, self.pos);
807 let guar = self.dcx().emit_err(errors::EmptyExponentFloat { span });
808 kind = token::Err(guar);
809 }
810 let base = match base {
811 Base::Hexadecimal => Some("hexadecimal"),
812 Base::Octal => Some("octal"),
813 Base::Binary => Some("binary"),
814 _ => None,
815 };
816 if let Some(base) = base {
817 let span = self.mk_sp(start, end);
818 let guar =
819 self.dcx().emit_err(errors::FloatLiteralUnsupportedBase { span, base });
820 kind = token::Err(guar)
821 }
822 (kind, self.symbol_from_to(start, end))
823 }
824 }
825 }
826
827 #[inline]
828 fn src_index(&self, pos: BytePos) -> usize {
829 (pos - self.start_pos).to_usize()
830 }
831
832 fn str_from(&self, start: BytePos) -> &'src str {
835 self.str_from_to(start, self.pos)
836 }
837
838 fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
840 debug!("taking an ident from {:?} to {:?}", start, end);
841 Symbol::intern(self.str_from_to(start, end))
842 }
843
844 fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
846 &self.src[self.src_index(start)..self.src_index(end)]
847 }
848
849 fn str_from_to_end(&self, start: BytePos) -> &'src str {
851 &self.src[self.src_index(start)..]
852 }
853
854 fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
855 match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
856 Err(RawStrError::InvalidStarter { bad_char }) => {
857 self.report_non_started_raw_string(start, bad_char)
858 }
859 Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
860 .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
861 Err(RawStrError::TooManyDelimiters { found }) => {
862 self.report_too_many_hashes(start, found)
863 }
864 Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
865 }
866 }
867
868 fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
869 self.dcx()
870 .struct_span_fatal(
871 self.mk_sp(start, self.pos),
872 format!(
873 "found invalid character; only `#` is allowed in raw string delimitation: {}",
874 escaped_char(bad_char)
875 ),
876 )
877 .emit()
878 }
879
880 fn report_unterminated_raw_string(
881 &self,
882 start: BytePos,
883 n_hashes: u32,
884 possible_offset: Option<u32>,
885 found_terminators: u32,
886 ) -> ! {
887 let mut err =
888 self.dcx().struct_span_fatal(self.mk_sp(start, start), "unterminated raw string");
889 err.code(E0748);
890 err.span_label(self.mk_sp(start, start), "unterminated raw string");
891
892 if n_hashes > 0 {
893 err.note(format!(
894 "this raw string should be terminated with `\"{}`",
895 "#".repeat(n_hashes as usize)
896 ));
897 }
898
899 if let Some(possible_offset) = possible_offset {
900 let lo = start + BytePos(possible_offset);
901 let hi = lo + BytePos(found_terminators);
902 let span = self.mk_sp(lo, hi);
903 err.span_suggestion(
904 span,
905 "consider terminating the string here",
906 "#".repeat(n_hashes as usize),
907 Applicability::MaybeIncorrect,
908 );
909 }
910
911 err.emit()
912 }
913
914 fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
915 let msg = match doc_style {
916 Some(_) => "unterminated block doc-comment",
917 None => "unterminated block comment",
918 };
919 let last_bpos = self.pos;
920 let mut err = self.dcx().struct_span_fatal(self.mk_sp(start, last_bpos), msg);
921 err.code(E0758);
922 let mut nested_block_comment_open_idxs = vec![];
923 let mut last_nested_block_comment_idxs = None;
924 let mut content_chars = self.str_from(start).char_indices().peekable();
925
926 while let Some((idx, current_char)) = content_chars.next() {
927 match content_chars.peek() {
928 Some((_, '*')) if current_char == '/' => {
929 nested_block_comment_open_idxs.push(idx);
930 }
931 Some((_, '/')) if current_char == '*' => {
932 last_nested_block_comment_idxs =
933 nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
934 }
935 _ => {}
936 };
937 }
938
939 if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
940 err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
941 .span_label(
942 self.mk_sp(
943 start + BytePos(nested_open_idx as u32),
944 start + BytePos(nested_open_idx as u32 + 2),
945 ),
946 "...as last nested comment starts here, maybe you want to close this instead?",
947 )
948 .span_label(
949 self.mk_sp(
950 start + BytePos(nested_close_idx as u32),
951 start + BytePos(nested_close_idx as u32 + 2),
952 ),
953 "...and last nested comment terminates here.",
954 );
955 }
956
957 err.emit();
958 }
959
960 fn report_unknown_prefix(&self, start: BytePos) {
965 let prefix_span = self.mk_sp(start, self.pos);
966 let prefix = self.str_from_to(start, self.pos);
967 let expn_data = prefix_span.ctxt().outer_expn_data();
968
969 if expn_data.edition.at_least_rust_2021() {
970 let sugg = if prefix == "rb" {
972 Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
973 } else if prefix == "rc" {
974 Some(errors::UnknownPrefixSugg::UseCr(prefix_span))
975 } else if expn_data.is_root() {
976 if self.cursor.first() == '\''
977 && let Some(start) = self.last_lifetime
978 && self.cursor.third() != '\''
979 && let end = self.mk_sp(self.pos, self.pos + BytePos(1))
980 && !self.psess.source_map().is_multiline(start.until(end))
981 {
982 Some(errors::UnknownPrefixSugg::MeantStr { start, end })
986 } else {
987 Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
988 }
989 } else {
990 None
991 };
992 self.dcx().emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
993 } else {
994 self.psess.buffer_lint(
996 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
997 prefix_span,
998 ast::CRATE_NODE_ID,
999 BuiltinLintDiag::ReservedPrefix(prefix_span, prefix.to_string()),
1000 );
1001 }
1002 }
1003
1004 fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
1011 let span = self.mk_sp(start, self.pos);
1012 let edition2024 = span.edition().at_least_rust_2024();
1013
1014 let space_pos = start + BytePos(1);
1015 let space_span = self.mk_sp(space_pos, space_pos);
1016
1017 let mut cursor = Cursor::new(str_before, FrontmatterAllowed::No);
1018
1019 let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
1020 Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
1021 let end = start + BytePos(token_len);
1022 let span = self.mk_sp(start, end);
1023 let str_start = start + BytePos(n_hashes);
1024
1025 if edition2024 {
1026 self.cursor = cursor;
1027 self.pos = end;
1028 }
1029
1030 let unterminated = if terminated { None } else { Some(str_start) };
1031
1032 (true, span, unterminated)
1033 }
1034 None => {
1035 debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");
1037
1038 (false, span, None)
1039 }
1040 };
1041 if edition2024 {
1042 if let Some(str_start) = unterminated {
1043 self.dcx()
1045 .struct_span_fatal(
1046 self.mk_sp(str_start, self.pos),
1047 "unterminated double quote string",
1048 )
1049 .with_code(E0765)
1050 .emit()
1051 }
1052
1053 let sugg = if span.from_expansion() {
1054 None
1055 } else {
1056 Some(errors::GuardedStringSugg(space_span))
1057 };
1058
1059 let err = if is_string {
1061 self.dcx().emit_err(errors::ReservedString { span, sugg })
1062 } else {
1063 self.dcx().emit_err(errors::ReservedMultihash { span, sugg })
1064 };
1065
1066 token::Literal(token::Lit {
1067 kind: token::Err(err),
1068 symbol: self.symbol_from_to(start, self.pos),
1069 suffix: None,
1070 })
1071 } else {
1072 self.psess.buffer_lint(
1074 RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
1075 span,
1076 ast::CRATE_NODE_ID,
1077 BuiltinLintDiag::ReservedString { is_string, suggestion: space_span },
1078 );
1079
1080 self.pos = start + BytePos(1);
1083 self.cursor = Cursor::new(&str_before[1..], FrontmatterAllowed::No);
1084 token::Pound
1085 }
1086 }
1087
1088 fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
1089 self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
1090 }
1091
1092 fn cook_quoted(
1093 &self,
1094 mut kind: token::LitKind,
1095 mode: Mode,
1096 start: BytePos,
1097 end: BytePos,
1098 prefix_len: u32,
1099 postfix_len: u32,
1100 ) -> (token::LitKind, Symbol) {
1101 let content_start = start + BytePos(prefix_len);
1102 let content_end = end - BytePos(postfix_len);
1103 let lit_content = self.str_from_to(content_start, content_end);
1104 check_for_errors(lit_content, mode, |range, err| {
1105 let span_with_quotes = self.mk_sp(start, end);
1106 let (start, end) = (range.start as u32, range.end as u32);
1107 let lo = content_start + BytePos(start);
1108 let hi = lo + BytePos(end - start);
1109 let span = self.mk_sp(lo, hi);
1110 let is_fatal = err.is_fatal();
1111 if let Some(guar) = emit_unescape_error(
1112 self.dcx(),
1113 lit_content,
1114 span_with_quotes,
1115 span,
1116 mode,
1117 range,
1118 err,
1119 ) {
1120 assert!(is_fatal);
1121 kind = token::Err(guar);
1122 }
1123 });
1124
1125 let sym = if !matches!(kind, token::Err(_)) {
1128 Symbol::intern(lit_content)
1129 } else {
1130 self.symbol_from_to(start, end)
1131 };
1132 (kind, sym)
1133 }
1134}
1135
1136pub fn nfc_normalize(string: &str) -> Symbol {
1137 use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
1138 match is_nfc_quick(string.chars()) {
1139 IsNormalized::Yes => Symbol::intern(string),
1140 _ => {
1141 let normalized_str: String = string.chars().nfc().collect();
1142 Symbol::intern(&normalized_str)
1143 }
1144 }
1145}