From 4c46c6d4bdf0175f55f498c1ca2d6ebb7a0fc8d5 Mon Sep 17 00:00:00 2001 From: Benedikt Reinartz Date: Sat, 4 May 2024 16:53:14 +0200 Subject: [PATCH 1/2] Use non-BOM encodings --- src/embed_tests/TestPyType.cs | 2 +- src/runtime/Loader.cs | 6 ++-- src/runtime/Native/CustomMarshaler.cs | 2 +- src/runtime/Native/NativeTypeSpec.cs | 2 +- src/runtime/PythonTypes/PyType.cs | 2 +- src/runtime/Runtime.cs | 42 +++++++++++++-------------- src/runtime/Util/Encodings.cs | 10 +++++++ tests/test_conversion.py | 3 ++ 8 files changed, 41 insertions(+), 28 deletions(-) create mode 100644 src/runtime/Util/Encodings.cs diff --git a/src/embed_tests/TestPyType.cs b/src/embed_tests/TestPyType.cs index 34645747d..0470070c3 100644 --- a/src/embed_tests/TestPyType.cs +++ b/src/embed_tests/TestPyType.cs @@ -28,7 +28,7 @@ public void CanCreateHeapType() const string name = "nÁmæ"; const string docStr = "dÁcæ"; - using var doc = new StrPtr(docStr, Encoding.UTF8); + using var doc = new StrPtr(docStr, Encodings.UTF8); var spec = new TypeSpec( name: name, basicSize: Util.ReadInt32(Runtime.Runtime.PyBaseObjectType, TypeOffset.tp_basicsize), diff --git a/src/runtime/Loader.cs b/src/runtime/Loader.cs index 516b9ab9c..c0e964abc 100644 --- a/src/runtime/Loader.cs +++ b/src/runtime/Loader.cs @@ -12,7 +12,7 @@ public unsafe static int Initialize(IntPtr data, int size) { try { - var dllPath = Encoding.UTF8.GetString((byte*)data.ToPointer(), size); + var dllPath = Encodings.UTF8.GetString((byte*)data.ToPointer(), size); if (!string.IsNullOrEmpty(dllPath)) { @@ -33,7 +33,7 @@ public unsafe static int Initialize(IntPtr data, int size) ); return 1; } - + return 0; } @@ -41,7 +41,7 @@ public unsafe static int Shutdown(IntPtr data, int size) { try { - var command = Encoding.UTF8.GetString((byte*)data.ToPointer(), size); + var command = Encodings.UTF8.GetString((byte*)data.ToPointer(), size); if (command == "full_shutdown") { diff --git a/src/runtime/Native/CustomMarshaler.cs b/src/runtime/Native/CustomMarshaler.cs index 62c027150..299af3a33 100644 --- a/src/runtime/Native/CustomMarshaler.cs +++ b/src/runtime/Native/CustomMarshaler.cs @@ -42,7 +42,7 @@ public int GetNativeDataSize() internal class UcsMarshaler : MarshalerBase { internal static readonly int _UCS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 2 : 4; - internal static readonly Encoding PyEncoding = _UCS == 2 ? Encoding.Unicode : Encoding.UTF32; + internal static readonly Encoding PyEncoding = _UCS == 2 ? Encodings.UTF16 : Encodings.UTF32; private static readonly MarshalerBase Instance = new UcsMarshaler(); public override IntPtr MarshalManagedToNative(object managedObj) diff --git a/src/runtime/Native/NativeTypeSpec.cs b/src/runtime/Native/NativeTypeSpec.cs index 8b84df536..50019a148 100644 --- a/src/runtime/Native/NativeTypeSpec.cs +++ b/src/runtime/Native/NativeTypeSpec.cs @@ -17,7 +17,7 @@ public NativeTypeSpec(TypeSpec spec) { if (spec is null) throw new ArgumentNullException(nameof(spec)); - this.Name = new StrPtr(spec.Name, Encoding.UTF8); + this.Name = new StrPtr(spec.Name, Encodings.UTF8); this.BasicSize = spec.BasicSize; this.ItemSize = spec.ItemSize; this.Flags = (int)spec.Flags; diff --git a/src/runtime/PythonTypes/PyType.cs b/src/runtime/PythonTypes/PyType.cs index af796a5c5..28bda5d3e 100644 --- a/src/runtime/PythonTypes/PyType.cs +++ b/src/runtime/PythonTypes/PyType.cs @@ -53,7 +53,7 @@ public string Name { RawPointer = Util.ReadIntPtr(this, TypeOffset.tp_name), }; - return namePtr.ToString(System.Text.Encoding.UTF8)!; + return namePtr.ToString(Encodings.UTF8)!; } } diff --git a/src/runtime/Runtime.cs b/src/runtime/Runtime.cs index 4e1c6156a..3d51b8bc9 100644 --- a/src/runtime/Runtime.cs +++ b/src/runtime/Runtime.cs @@ -795,13 +795,13 @@ public static int Py_Main(int argc, string[] argv) internal static int PyRun_SimpleString(string code) { - using var codePtr = new StrPtr(code, Encoding.UTF8); + using var codePtr = new StrPtr(code, Encodings.UTF8); return Delegates.PyRun_SimpleStringFlags(codePtr, Utf8String); } internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedReference globals, BorrowedReference locals) { - using var codePtr = new StrPtr(code, Encoding.UTF8); + using var codePtr = new StrPtr(code, Encodings.UTF8); return Delegates.PyRun_StringFlags(codePtr, st, globals, locals, Utf8String); } @@ -813,14 +813,14 @@ internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedR /// internal static NewReference Py_CompileString(string str, string file, int start) { - using var strPtr = new StrPtr(str, Encoding.UTF8); + using var strPtr = new StrPtr(str, Encodings.UTF8); using var fileObj = new PyString(file); return Delegates.Py_CompileStringObject(strPtr, fileObj, start, Utf8String, -1); } internal static NewReference PyImport_ExecCodeModule(string name, BorrowedReference code) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyImport_ExecCodeModule(namePtr, code); } @@ -867,13 +867,13 @@ internal static bool PyObject_IsIterable(BorrowedReference ob) internal static int PyObject_HasAttrString(BorrowedReference pointer, string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_HasAttrString(pointer, namePtr); } internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_GetAttrString(pointer, namePtr); } @@ -884,12 +884,12 @@ internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, S internal static int PyObject_DelAttr(BorrowedReference @object, BorrowedReference name) => Delegates.PyObject_SetAttr(@object, name, null); internal static int PyObject_DelAttrString(BorrowedReference @object, string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_SetAttrString(@object, namePtr, null); } internal static int PyObject_SetAttrString(BorrowedReference @object, string name, BorrowedReference value) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyObject_SetAttrString(@object, namePtr, value); } @@ -1071,7 +1071,7 @@ internal static bool PyBool_CheckExact(BorrowedReference ob) internal static NewReference PyLong_FromString(string value, int radix) { - using var valPtr = new StrPtr(value, Encoding.UTF8); + using var valPtr = new StrPtr(value, Encodings.UTF8); return Delegates.PyLong_FromString(valPtr, IntPtr.Zero, radix); } @@ -1272,7 +1272,7 @@ internal static NewReference EmptyPyBytes() internal static NewReference PyByteArray_FromStringAndSize(IntPtr strPtr, nint len) => Delegates.PyByteArray_FromStringAndSize(strPtr, len); internal static NewReference PyByteArray_FromStringAndSize(string s) { - using var ptr = new StrPtr(s, Encoding.UTF8); + using var ptr = new StrPtr(s, Encodings.UTF8); return PyByteArray_FromStringAndSize(ptr.RawPointer, checked((nint)ptr.ByteCount)); } @@ -1300,7 +1300,7 @@ internal static IntPtr PyBytes_AsString(BorrowedReference ob) internal static NewReference PyUnicode_InternFromString(string s) { - using var ptr = new StrPtr(s, Encoding.UTF8); + using var ptr = new StrPtr(s, Encodings.UTF8); return Delegates.PyUnicode_InternFromString(ptr); } @@ -1375,7 +1375,7 @@ internal static bool PyDict_Check(BorrowedReference ob) internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer, string key) { - using var keyStr = new StrPtr(key, Encoding.UTF8); + using var keyStr = new StrPtr(key, Encodings.UTF8); return Delegates.PyDict_GetItemString(pointer, keyStr); } @@ -1391,7 +1391,7 @@ internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer /// internal static int PyDict_SetItemString(BorrowedReference dict, string key, BorrowedReference value) { - using var keyPtr = new StrPtr(key, Encoding.UTF8); + using var keyPtr = new StrPtr(key, Encodings.UTF8); return Delegates.PyDict_SetItemString(dict, keyPtr, value); } @@ -1400,7 +1400,7 @@ internal static int PyDict_SetItemString(BorrowedReference dict, string key, Bor internal static int PyDict_DelItemString(BorrowedReference pointer, string key) { - using var keyPtr = new StrPtr(key, Encoding.UTF8); + using var keyPtr = new StrPtr(key, Encodings.UTF8); return Delegates.PyDict_DelItemString(pointer, keyPtr); } @@ -1515,7 +1515,7 @@ internal static bool PyIter_Check(BorrowedReference ob) internal static NewReference PyModule_New(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyModule_New(namePtr); } @@ -1529,7 +1529,7 @@ internal static NewReference PyModule_New(string name) /// Return -1 on error, 0 on success. internal static int PyModule_AddObject(BorrowedReference module, string name, StolenReference value) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); IntPtr valueAddr = value.DangerousGetAddressOrNull(); int res = Delegates.PyModule_AddObject(module, namePtr, valueAddr); // We can't just exit here because the reference is stolen only on success. @@ -1547,7 +1547,7 @@ internal static int PyModule_AddObject(BorrowedReference module, string name, St internal static NewReference PyImport_ImportModule(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyImport_ImportModule(namePtr); } @@ -1556,7 +1556,7 @@ internal static NewReference PyImport_ImportModule(string name) internal static BorrowedReference PyImport_AddModule(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PyImport_AddModule(namePtr); } @@ -1584,13 +1584,13 @@ internal static void PySys_SetArgvEx(int argc, string[] argv, int updatepath) internal static BorrowedReference PySys_GetObject(string name) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PySys_GetObject(namePtr); } internal static int PySys_SetObject(string name, BorrowedReference ob) { - using var namePtr = new StrPtr(name, Encoding.UTF8); + using var namePtr = new StrPtr(name, Encodings.UTF8); return Delegates.PySys_SetObject(namePtr, ob); } @@ -1689,7 +1689,7 @@ internal static IntPtr PyMem_Malloc(long size) internal static void PyErr_SetString(BorrowedReference ob, string message) { - using var msgPtr = new StrPtr(message, Encoding.UTF8); + using var msgPtr = new StrPtr(message, Encodings.UTF8); Delegates.PyErr_SetString(ob, msgPtr); } diff --git a/src/runtime/Util/Encodings.cs b/src/runtime/Util/Encodings.cs new file mode 100644 index 000000000..d5a0c6ff8 --- /dev/null +++ b/src/runtime/Util/Encodings.cs @@ -0,0 +1,10 @@ +using System; +using System.Text; + +namespace Python.Runtime; + +static class Encodings { + public static System.Text.Encoding UTF8 = new UTF8Encoding(false, true); + public static System.Text.Encoding UTF16 = new UnicodeEncoding(!BitConverter.IsLittleEndian, false, true); + public static System.Text.Encoding UTF32 = new UTF32Encoding(!BitConverter.IsLittleEndian, false, true); +} diff --git a/tests/test_conversion.py b/tests/test_conversion.py index bb686dd52..dd70f900a 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -510,6 +510,9 @@ def test_string_conversion(): ob.StringField = System.String(u'\uffff\uffff') assert ob.StringField == u'\uffff\uffff' + ob.StringField = System.String("\ufeffbom") + assert ob.StringField == "\ufeffbom" + ob.StringField = None assert ob.StringField is None From dc6f5efb7905ff695d244a65e0afc161be6ef1af Mon Sep 17 00:00:00 2001 From: Benedikt Reinartz Date: Sun, 5 May 2024 20:38:24 +0200 Subject: [PATCH 2/2] Copy potential BOM to the output of PyString_FromString The documentation of the used `PyUnicode_DecodeUTF16` states that not passing `*byteorder` or passing a 0 results in the first two bytes, if they are the BOM (U+FEFF, zero-width no-break space), to be interpreted and skipped, which is incorrect when we convert a known "non BOM" string, which all strings from C# are. --- src/runtime/Runtime.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/Runtime.cs b/src/runtime/Runtime.cs index 3d51b8bc9..2f9e18f65 100644 --- a/src/runtime/Runtime.cs +++ b/src/runtime/Runtime.cs @@ -1252,12 +1252,14 @@ internal static bool PyString_CheckExact(BorrowedReference ob) internal static NewReference PyString_FromString(string value) { + int byteorder = BitConverter.IsLittleEndian ? -1 : 1; + int* byteorderPtr = &byteorder; fixed(char* ptr = value) return Delegates.PyUnicode_DecodeUTF16( (IntPtr)ptr, value.Length * sizeof(Char), IntPtr.Zero, - IntPtr.Zero + (IntPtr)byteorderPtr ); } pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy