Ensure parser error snippets are valid UTF-8

byroot · byroot · commit e144793b7226 · 2025-02-26T12:40:49.000+01:00
Fix: #755 Error messages now include a snippet of the document that doesn't parse, to help locate the issue. However, the way it was done wasn't UTF-8 aware, and it could result in exception messages with truncated characters. It would be nice to go a bit further and actually support codepoints, but that is a lot of complexity to implement in C; perhaps it becomes feasible if we move that logic to Ruby, given it's not a performance-sensitive codepath.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,7 @@
 # Changes
 
+* Ensure document snippets that are included in parser errors don't include truncated multibyte characters.
+
 ### 2025-02-10 (2.10.1)
 
 * Fix a compatibility issue with `MultiJson.dump(obj, pretty: true)`: `no implicit conversion of false into Proc (TypeError)`.
diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c
@@ -454,15 +454,24 @@ RBIMPL_ATTR_NORETURN()
 #endif
 static void raise_parse_error(const char *format, const char *start)
 {
-    char buffer[PARSE_ERROR_FRAGMENT_LEN + 1];
+    unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1];
 
     size_t len = start ? strnlen(start, PARSE_ERROR_FRAGMENT_LEN) : 0;
     const char *ptr = start;
 
     if (len == PARSE_ERROR_FRAGMENT_LEN) {
         MEMCPY(buffer, start, char, PARSE_ERROR_FRAGMENT_LEN);
-        buffer[PARSE_ERROR_FRAGMENT_LEN] = '\0';
-        ptr = buffer;
+
+        while (buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte
+            len--;
+        }
+
+        if (buffer[len - 1] >= 0xC0) { // multibyte character start
+            len--;
+        }
+
+        buffer[len] = '\0';
+        ptr = (const char *)buffer;
     }
 
     rb_enc_raise(enc_utf8, rb_path2class("JSON::ParserError"), format, ptr);
diff --git a/test/json/json_parser_test.rb b/test/json/json_parser_test.rb
@@ -645,6 +645,22 @@ def test_parse_error_incomplete_hash
     end
   end
 
+  def test_parse_error_snippet # the document snippet quoted in a ParserError message must contain only whole characters
+    omit "C ext only test" unless RUBY_ENGINE == "ruby"
+    # Case 1: unparseable input made only of multibyte characters (3 bytes each in UTF-8); 10 whole characters are quoted.
+    error = assert_raise(JSON::ParserError) { JSON.parse("あああああああああああああああああああああああ") }
+    assert_equal "unexpected character: 'ああああああああああ'", error.message
+    # Case 2: one leading ASCII byte shifts every multibyte character by one byte; 10 of them are still quoted.
+    error = assert_raise(JSON::ParserError) { JSON.parse("aあああああああああああああああああああああああ") }
+    assert_equal "unexpected character: 'aああああああああああ'", error.message
+    # Case 3: two leading ASCII bytes; only 9 multibyte characters are expected in the snippet.
+    error = assert_raise(JSON::ParserError) { JSON.parse("abあああああああああああああああああああああああ") }
+    assert_equal "unexpected character: 'abあああああああああ'", error.message
+    # Case 4: three leading ASCII bytes; again 9 multibyte characters are expected.
+    error = assert_raise(JSON::ParserError) { JSON.parse("abcあああああああああああああああああああああああ") }
+    assert_equal "unexpected character: 'abcあああああああああ'", error.message
+  end
+
   def test_parse_leading_slash
     # ref: https://github.com/ruby/ruby/pull/12598
     assert_raise(JSON::ParserError) do