Skip to content

Commit 05b92c0

Browse files
saleemrashid and msullivan
authored and committed
[mypyc] Fix C string encoding (python#7978)
Using repr() to encode C strings relied on implementation-defined behaviour and had a number of bugs, for example: * The C string could inadvertently contain trigraphs. * Valid hexadecimal characters following a hexadecimal escape sequence would be parsed as part of the escape sequence. Octal escape sequences are used for unprintable characters because they do not have the same issue as hexadecimal escape sequences. It would be possible to mitigate the issue with hexadecimal escape sequences by emitting multiple string literals. However, the complexity is not worth it, especially as the output is not designed to be human-readable.
1 parent e99a2b5 commit 05b92c0

File tree

3 files changed

+51
-13
lines changed

3 files changed

+51
-13
lines changed

mypyc/build.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ def mypycify(
507507
if compiler.compiler_type == 'unix':
508508
cflags += [
509509
'-O{}'.format(opt_level), '-Werror', '-Wno-unused-function', '-Wno-unused-label',
510-
'-Wno-unreachable-code', '-Wno-unused-variable', '-Wno-trigraphs',
510+
'-Wno-unreachable-code', '-Wno-unused-variable',
511511
'-Wno-unused-command-line-argument', '-Wno-unknown-warning-option',
512512
]
513513
if 'gcc' in compiler.compiler[0]:

mypyc/cstring.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""Encode valid C string literals from Python strings.
2+
3+
If a character is not allowed in C string literals, it is either emitted
4+
as a simple escape sequence (e.g. '\\n'), or an octal escape sequence
5+
with exactly three digits ('\\XXX'). Question marks are escaped to
6+
prevent trigraphs in the string literal from being interpreted. Note
7+
that '\\?' is an invalid escape sequence in Python.
8+
9+
Consider the string literal "AB\\xCDEF". As one would expect, Python
10+
parses it as ['A', 'B', 0xCD, 'E', 'F']. However, the C standard
11+
specifies that all hexadecimal digits immediately following '\\x' will
12+
be interpreted as part of the escape sequence. Therefore, it is
13+
unexpectedly parsed as ['A', 'B', 0xCDEF].
14+
15+
Emitting ("AB\\xCD" "EF") would avoid this behaviour. However, we opt
16+
for simplicity and use octal escape sequences instead. They do not
17+
suffer from the same issue as they are defined to parse at most three
18+
octal digits.
19+
"""
20+
21+
import string
22+
from typing import Tuple
23+
24+
# Lookup table indexed by byte value (0-255). Each entry is the text that
# represents that byte inside a C string literal. Every byte starts out as
# a three-digit octal escape; the loops below override the entries that
# have a shorter or more readable spelling.
CHAR_MAP = ['\\{:03o}'.format(i) for i in range(256)]

# It is safe to use string.printable as it always uses the C locale.
for c in string.printable:
    CHAR_MAP[ord(c)] = c

# These assignments must come last because we prioritize simple escape
# sequences over any other representation.
for c in ('\'', '"', '\\', 'a', 'b', 'f', 'n', 'r', 't', 'v'):
    escaped = '\\{}'.format(c)
    # Let Python's own escape decoder tell us which character the escape
    # sequence stands for, so the table index cannot drift from the text.
    decoded = escaped.encode('ascii').decode('unicode_escape')
    CHAR_MAP[ord(decoded)] = escaped

# This escape sequence is invalid in Python, hence the raw string. Escaping
# every question mark keeps trigraphs from forming in the C literal.
CHAR_MAP[ord('?')] = r'\?'
39+
40+
41+
def encode_as_c_string(s: str) -> Tuple[str, int]:
    """Produce a quoted C string literal and its size, for a UTF-8 string.

    The string is encoded as UTF-8 and passed to encode_bytes_as_c_string.
    The returned size is therefore the number of bytes in the UTF-8
    encoding, not the number of characters in ``s``.
    """
    return encode_bytes_as_c_string(s.encode('utf-8'))
44+
45+
46+
def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]:
    """Produce a quoted C string literal and its size, for a byte string.

    Each byte is replaced by its CHAR_MAP entry and the result is wrapped
    in double quotes. The returned size is the number of bytes in ``b``,
    not the length of the escaped literal text.
    """
    # Iterating over bytes yields ints, which index CHAR_MAP directly.
    escaped = ''.join([CHAR_MAP[i] for i in b])
    return '"{}"'.format(escaped), len(b)

mypyc/emitmodule.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from mypyc.common import (
2424
PREFIX, TOP_LEVEL_NAME, INT_PREFIX, MODULE_PREFIX, shared_lib_name,
2525
)
26+
from mypyc.cstring import encode_as_c_string, encode_bytes_as_c_string
2627
from mypyc.emit import EmitterContext, Emitter, HeaderDeclaration
2728
from mypyc.emitfunc import generate_native_function, native_function_header
2829
from mypyc.emitclass import generate_class_type_decl, generate_class
@@ -414,18 +415,6 @@ def generate_function_declaration(fn: FuncIR, emitter: Emitter) -> None:
414415
'{};'.format(wrapper_function_header(fn, emitter.names)))
415416

416417

417-
def encode_as_c_string(s: str) -> Tuple[str, int]:
418-
"""Produce a utf-8 encoded, escaped, quoted C string and its size from a string"""
419-
return encode_bytes_as_c_string(s.encode('utf-8'))
420-
421-
422-
def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]:
423-
"""Produce a single-escaped, quoted C string and its size from a bytes"""
424-
# This is a kind of abusive way to do this...
425-
escaped = repr(b)[2:-1].replace('"', '\\"')
426-
return '"{}"'.format(escaped), len(b)
427-
428-
429418
def pointerize(decl: str, name: str) -> str:
430419
"""Given a C decl and its name, modify it to be a declaration to a pointer."""
431420
# This doesn't work in general but does work for all our types...

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy