mypyc: Fix C string encoding

saleemrashid · saleemrashid · commit bad98fa5040e · 2019-11-19T18:21:49.000Z
Using repr() to encode C strings relied on implementation-defined
behaviour and had a number of bugs, for example:

* The C string could inadvertently contain trigraphs.

* Valid hexadecimal characters following a hexadecimal escape sequence
  would be parsed as part of the escape sequence.

Octal escape sequences are used for unprintable characters because they
do not have the same issue as hexadecimal escape sequences.

It would be possible to mitigate the issue with hexadecimal escape
sequences by emitting multiple string literals. However, the complexity
is not worth it, especially as the output is not designed to be
human-readable.
diff --git a/mypyc/emitmodule.py b/mypyc/emitmodule.py
@@ -6,6 +6,7 @@
 import os
 import hashlib
 import json
+import string
 from collections import OrderedDict
 from typing import List, Tuple, Dict, Iterable, Set, TypeVar, Optional
 
@@ -67,6 +68,33 @@
 # A list of (file name, file contents) pairs.
 FileContents = List[Tuple[str, str]]
 
+# The C standard specifies that an unlimited number of valid hexadecimal
+# characters are parsed as part of the hexadecimal escape sequence. For
+# example, "\x12345" would be unexpectedly parsed as {0x12345}, instead of
+# {0x12, '3', '4', '5'}.  Therefore, we use octal escape sequences which are
+# specified to contain at most three octal digits.
+C_CHAR_MAP = ['\\{:03o}'.format(x) for x in range(256)]
+# Most printable characters do not need to be escaped in string literals. We
+# can safely use string.printable here because it always uses the C locale.
+for x in string.printable:
+    C_CHAR_MAP[ord(x)] = x
+# These assignments must be done after string.printable because they are
+# overrides for the printable characters that need to be escaped in string
+# literals.
+C_CHAR_MAP[ord('\'')] = r'\''
+C_CHAR_MAP[ord('\"')] = r'\"'
+C_CHAR_MAP[ord('\\')] = r'\\'
+C_CHAR_MAP[ord('\a')] = r'\a'
+C_CHAR_MAP[ord('\b')] = r'\b'
+C_CHAR_MAP[ord('\f')] = r'\f'
+C_CHAR_MAP[ord('\n')] = r'\n'
+C_CHAR_MAP[ord('\r')] = r'\r'
+C_CHAR_MAP[ord('\t')] = r'\t'
+C_CHAR_MAP[ord('\v')] = r'\v'
+# The question mark is escaped to prevent trigraphs from being interpreted
+# inside string literals. This escape sequence is invalid in Python.
+C_CHAR_MAP[ord('?')] = r'\?'
+
 
 class MarkedDeclaration:
     """Add a mark, useful for topological sort."""
@@ -421,8 +449,7 @@ def encode_as_c_string(s: str) -> Tuple[str, int]:
 
 def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]:
     """Produce a single-escaped, quoted C string and its size from a bytes"""
-    # This is a kind of abusive way to do this...
-    escaped = repr(b)[2:-1].replace('"', '\\"')
+    escaped = ''.join(map(C_CHAR_MAP.__getitem__, b))
     return '"{}"'.format(escaped), len(b)