[mypyc] Fix C string encoding (python#7978)

saleemrashid · msullivan · commit 05b92c0fa4ef · 2019-11-19T16:33:23.000-08:00
Using repr() to encode C strings relied on implementation-defined
behaviour and had a number of bugs, for example:

* The C string could inadvertently contain trigraphs.

* Valid hexadecimal characters following a hexadecimal escape sequence
  would be parsed as part of the escape sequence.

Octal escape sequences are used for unprintable characters because they
do not have the same issue as hexadecimal escape sequences.

It would be possible to mitigate the issue with hexadecimal escape
sequences by emitting multiple string literals. However, the complexity
is not worth it, especially as the output is not designed to be
human-readable.
diff --git a/mypyc/build.py b/mypyc/build.py
@@ -507,7 +507,7 @@ def mypycify(
     if compiler.compiler_type == 'unix':
         cflags += [
             '-O{}'.format(opt_level), '-Werror', '-Wno-unused-function', '-Wno-unused-label',
-            '-Wno-unreachable-code', '-Wno-unused-variable', '-Wno-trigraphs',
+            '-Wno-unreachable-code', '-Wno-unused-variable',
             '-Wno-unused-command-line-argument', '-Wno-unknown-warning-option',
         ]
         if 'gcc' in compiler.compiler[0]:
diff --git a/mypyc/cstring.py b/mypyc/cstring.py
@@ -0,0 +1,49 @@
+"""Encode valid C string literals from Python strings.
+
+If a character is not allowed in C string literals, it is either emitted
+as a simple escape sequence (e.g. '\\n'), or an octal escape sequence
+with exactly three digits ('\\XXX'). Question marks are escaped to
+prevent trigraphs in the string literal from being interpreted. Note
+that '\\?' is an invalid escape sequence in Python.
+
+Consider the string literal "AB\\xCDEF". As one would expect, Python
+parses it as ['A', 'B', 0xCD, 'E', 'F']. However, the C standard
+specifies that all hexadecimal digits immediately following '\\x' will
+be interpreted as part of the escape sequence. Therefore, it is
+unexpectedly parsed as ['A', 'B', 0xCDEF].
+
+Emitting ("AB\\xCD" "EF") would avoid this behaviour. However, we opt
+for simplicity and use octal escape sequences instead. They do not
+suffer from the same issue as they are defined to parse at most three
+octal digits.
+"""
+
+import string
+from typing import Tuple
+
+CHAR_MAP = ['\\{:03o}'.format(i) for i in range(256)]
+
+# It is safe to use string.printable as it always uses the C locale.
+for c in string.printable:
+    CHAR_MAP[ord(c)] = c
+
+# These assignments must come last because we prioritize simple escape
+# sequences over any other representation.
+for c in ('\'', '"', '\\', 'a', 'b', 'f', 'n', 'r', 't', 'v'):
+    escaped = '\\{}'.format(c)
+    decoded = escaped.encode('ascii').decode('unicode_escape')
+    CHAR_MAP[ord(decoded)] = escaped
+
+# This escape sequence is invalid in Python.
+CHAR_MAP[ord('?')] = r'\?'
+
+
+def encode_as_c_string(s: str) -> Tuple[str, int]:
+    """Produce a quoted C string literal and its size, for a UTF-8 string."""
+    return encode_bytes_as_c_string(s.encode('utf-8'))
+
+
+def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]:
+    """Produce a quoted C string literal and its size, for a byte string."""
+    escaped = ''.join([CHAR_MAP[i] for i in b])
+    return '"{}"'.format(escaped), len(b)
diff --git a/mypyc/emitmodule.py b/mypyc/emitmodule.py
@@ -23,6 +23,7 @@
 from mypyc.common import (
     PREFIX, TOP_LEVEL_NAME, INT_PREFIX, MODULE_PREFIX, shared_lib_name,
 )
+from mypyc.cstring import encode_as_c_string, encode_bytes_as_c_string
 from mypyc.emit import EmitterContext, Emitter, HeaderDeclaration
 from mypyc.emitfunc import generate_native_function, native_function_header
 from mypyc.emitclass import generate_class_type_decl, generate_class
@@ -414,18 +415,6 @@ def generate_function_declaration(fn: FuncIR, emitter: Emitter) -> None:
             '{};'.format(wrapper_function_header(fn, emitter.names)))
 
 
-def encode_as_c_string(s: str) -> Tuple[str, int]:
-    """Produce a utf-8 encoded, escaped, quoted C string and its size from a string"""
-    return encode_bytes_as_c_string(s.encode('utf-8'))
-
-
-def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]:
-    """Produce a single-escaped, quoted C string and its size from a bytes"""
-    # This is a kind of abusive way to do this...
-    escaped = repr(b)[2:-1].replace('"', '\\"')
-    return '"{}"'.format(escaped), len(b)
-
-
 def pointerize(decl: str, name: str) -> str:
     """Given a C decl and its name, modify it to be a declaration to a pointer."""
     # This doesn't work in general but does work for all our types...

Original file line number	Diff line number	Diff line change
`@@ -507,7 +507,7 @@ def mypycify(`
`507`	`507`	`if compiler.compiler_type == 'unix':`
`508`	`508`	`cflags += [`
`509`	`509`	`'-O{}'.format(opt_level), '-Werror', '-Wno-unused-function', '-Wno-unused-label',`
`510`		`- '-Wno-unreachable-code', '-Wno-unused-variable', '-Wno-trigraphs',`
	`510`	`+ '-Wno-unreachable-code', '-Wno-unused-variable',`
`511`	`511`	`'-Wno-unused-command-line-argument', '-Wno-unknown-warning-option',`
`512`	`512`	`]`
`513`	`513`	`if 'gcc' in compiler.compiler[0]:`