From c62202a7d7f62427117244b7376d2e3b6ace80f6 Mon Sep 17 00:00:00 2001 From: Saleem Rashid Date: Tue, 19 Nov 2019 18:21:49 +0000 Subject: [PATCH 1/2] mypyc: Fix C string encoding Using repr() to encode C strings relied on implementation-defined behaviour and had a number of bugs, for example: * The C string could inadvertently contain trigraphs. * Valid hexadecimal characters following a hexadecimal escape sequence would be parsed as part of the escape sequence. Octal escape sequences are used for unprintable characters because they do not have the same issue as hexadecimal escape sequences. It would be possible to mitigate the issue with hexadecimal escape sequences by emitting multiple string literals. However, the complexity is not worth it, especially as the output is not designed to be human-readable. --- mypyc/cstring.py | 49 +++++++++++++++++++++++++++++++++++++++++++++ mypyc/emitmodule.py | 13 +----------- 2 files changed, 50 insertions(+), 12 deletions(-) create mode 100644 mypyc/cstring.py diff --git a/mypyc/cstring.py b/mypyc/cstring.py new file mode 100644 index 000000000000..4fdb279258bd --- /dev/null +++ b/mypyc/cstring.py @@ -0,0 +1,49 @@ +"""Encode valid C string literals from Python strings. + +If a character is not allowed in C string literals, it is either emitted +as a simple escape sequence (e.g. '\\n'), or an octal escape sequence +with exactly three digits ('\\XXX'). Question marks are escaped to +prevent trigraphs in the string literal from being interpreted. Note +that '\\?' is an invalid escape sequence in Python. + +Consider the string literal "AB\\xCDEF". As one would expect, Python +parses it as ['A', 'B', 0xCD, 'E', 'F']. However, the C standard +specifies that all hexadecimal digits immediately following '\\x' will +be interpreted as part of the escape sequence. Therefore, it is +unexpectedly parsed as ['A', 'B', 0xCDEF]. + +Emitting ("AB\\xCD" "EF") would avoid this behaviour. 
However, we opt +for simplicity and use octal escape sequences instead. They do not +suffer from the same issue as they are defined to parse at most three +octal digits. +""" + +import string +from typing import Tuple + +CHAR_MAP = ['\\{:03o}'.format(i) for i in range(256)] + +# It is safe to use string.printable as it always uses the C locale. +for c in string.printable: + CHAR_MAP[ord(c)] = c + +# These assignments must come last because we prioritize simple escape +# sequences over any other representation. +for c in ('\'', '"', '\\', 'a', 'b', 'f', 'n', 'r', 't', 'v'): + escaped = '\\{}'.format(c) + decoded = escaped.encode('ascii').decode('unicode_escape') + CHAR_MAP[ord(decoded)] = escaped + +# This escape sequence is invalid in Python. +CHAR_MAP[ord('?')] = r'\?' + + +def encode_as_c_string(s: str) -> Tuple[str, int]: + """Produce a quoted C string literal and its size, for a UTF-8 string.""" + return encode_bytes_as_c_string(s.encode('utf-8')) + + +def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]: + """Produce a quoted C string literal and its size, for a byte string.""" + escaped = ''.join([CHAR_MAP[i] for i in b]) + return '"{}"'.format(escaped), len(b) diff --git a/mypyc/emitmodule.py b/mypyc/emitmodule.py index f1fdf0b5f833..efb7bffa262e 100644 --- a/mypyc/emitmodule.py +++ b/mypyc/emitmodule.py @@ -23,6 +23,7 @@ from mypyc.common import ( PREFIX, TOP_LEVEL_NAME, INT_PREFIX, MODULE_PREFIX, shared_lib_name, ) +from mypyc.cstring import encode_as_c_string, encode_bytes_as_c_string from mypyc.emit import EmitterContext, Emitter, HeaderDeclaration from mypyc.emitfunc import generate_native_function, native_function_header from mypyc.emitclass import generate_class_type_decl, generate_class @@ -414,18 +415,6 @@ def generate_function_declaration(fn: FuncIR, emitter: Emitter) -> None: '{};'.format(wrapper_function_header(fn, emitter.names))) -def encode_as_c_string(s: str) -> Tuple[str, int]: - """Produce a utf-8 encoded, escaped, quoted C string and 
its size from a string""" - return encode_bytes_as_c_string(s.encode('utf-8')) - - -def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]: - """Produce a single-escaped, quoted C string and its size from a bytes""" - # This is a kind of abusive way to do this... - escaped = repr(b)[2:-1].replace('"', '\\"') - return '"{}"'.format(escaped), len(b) - - def pointerize(decl: str, name: str) -> str: """Given a C decl and its name, modify it to be a declaration to a pointer.""" # This doesn't work in general but does work for all our types... From d4f3656d87d5ae41d27b28867e3d73e80ff135ce Mon Sep 17 00:00:00 2001 From: Saleem Rashid Date: Tue, 19 Nov 2019 23:44:05 +0000 Subject: [PATCH 2/2] mypyc: Remove -Wno-trigraphs from CFLAGS The compiler will not complain about trigraphs anymore because we now escape all question marks in C string literals. --- mypyc/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypyc/build.py b/mypyc/build.py index f3ef88302239..20efdce2d37b 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -507,7 +507,7 @@ def mypycify( if compiler.compiler_type == 'unix': cflags += [ '-O{}'.format(opt_level), '-Werror', '-Wno-unused-function', '-Wno-unused-label', - '-Wno-unreachable-code', '-Wno-unused-variable', '-Wno-trigraphs', + '-Wno-unreachable-code', '-Wno-unused-variable', '-Wno-unused-command-line-argument', '-Wno-unknown-warning-option', ] if 'gcc' in compiler.compiler[0]: pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy