Skip to content

Commit 195cde6

Browse files
authored
Use non-BOM encodings (#2370)
* Use non-BOM encodings. The documentation of `PyUnicode_DecodeUTF16`, which is used here, states that when `byteorder` is not passed (NULL) or `*byteorder` is 0, the first two bytes of the input, if they are a BOM (U+FEFF, zero-width no-break space), are interpreted as a byte order mark and skipped. That is incorrect when converting a string that is known to carry no BOM, which is the case for all strings coming from C#.
1 parent 6a8a97d commit 195cde6

File tree

8 files changed

+44
-29
lines changed

8 files changed

+44
-29
lines changed

src/embed_tests/TestPyType.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public void CanCreateHeapType()
2828
const string name = "nÁmæ";
2929
const string docStr = "dÁcæ";
3030

31-
using var doc = new StrPtr(docStr, Encoding.UTF8);
31+
using var doc = new StrPtr(docStr, Encodings.UTF8);
3232
var spec = new TypeSpec(
3333
name: name,
3434
basicSize: Util.ReadInt32(Runtime.Runtime.PyBaseObjectType, TypeOffset.tp_basicsize),

src/runtime/Loader.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public unsafe static int Initialize(IntPtr data, int size)
1212
{
1313
try
1414
{
15-
var dllPath = Encoding.UTF8.GetString((byte*)data.ToPointer(), size);
15+
var dllPath = Encodings.UTF8.GetString((byte*)data.ToPointer(), size);
1616

1717
if (!string.IsNullOrEmpty(dllPath))
1818
{
@@ -33,15 +33,15 @@ public unsafe static int Initialize(IntPtr data, int size)
3333
);
3434
return 1;
3535
}
36-
36+
3737
return 0;
3838
}
3939

4040
public unsafe static int Shutdown(IntPtr data, int size)
4141
{
4242
try
4343
{
44-
var command = Encoding.UTF8.GetString((byte*)data.ToPointer(), size);
44+
var command = Encodings.UTF8.GetString((byte*)data.ToPointer(), size);
4545

4646
if (command == "full_shutdown")
4747
{

src/runtime/Native/CustomMarshaler.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public int GetNativeDataSize()
4242
internal class UcsMarshaler : MarshalerBase
4343
{
4444
internal static readonly int _UCS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 2 : 4;
45-
internal static readonly Encoding PyEncoding = _UCS == 2 ? Encoding.Unicode : Encoding.UTF32;
45+
internal static readonly Encoding PyEncoding = _UCS == 2 ? Encodings.UTF16 : Encodings.UTF32;
4646
private static readonly MarshalerBase Instance = new UcsMarshaler();
4747

4848
public override IntPtr MarshalManagedToNative(object managedObj)

src/runtime/Native/NativeTypeSpec.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public NativeTypeSpec(TypeSpec spec)
1717
{
1818
if (spec is null) throw new ArgumentNullException(nameof(spec));
1919

20-
this.Name = new StrPtr(spec.Name, Encoding.UTF8);
20+
this.Name = new StrPtr(spec.Name, Encodings.UTF8);
2121
this.BasicSize = spec.BasicSize;
2222
this.ItemSize = spec.ItemSize;
2323
this.Flags = (int)spec.Flags;

src/runtime/PythonTypes/PyType.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public string Name
5353
{
5454
RawPointer = Util.ReadIntPtr(this, TypeOffset.tp_name),
5555
};
56-
return namePtr.ToString(System.Text.Encoding.UTF8)!;
56+
return namePtr.ToString(Encodings.UTF8)!;
5757
}
5858
}
5959

src/runtime/Runtime.cs

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -795,13 +795,13 @@ public static int Py_Main(int argc, string[] argv)
795795

796796
internal static int PyRun_SimpleString(string code)
797797
{
798-
using var codePtr = new StrPtr(code, Encoding.UTF8);
798+
using var codePtr = new StrPtr(code, Encodings.UTF8);
799799
return Delegates.PyRun_SimpleStringFlags(codePtr, Utf8String);
800800
}
801801

802802
internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedReference globals, BorrowedReference locals)
803803
{
804-
using var codePtr = new StrPtr(code, Encoding.UTF8);
804+
using var codePtr = new StrPtr(code, Encodings.UTF8);
805805
return Delegates.PyRun_StringFlags(codePtr, st, globals, locals, Utf8String);
806806
}
807807

@@ -813,14 +813,14 @@ internal static NewReference PyRun_String(string code, RunFlagType st, BorrowedR
813813
/// </summary>
814814
internal static NewReference Py_CompileString(string str, string file, int start)
815815
{
816-
using var strPtr = new StrPtr(str, Encoding.UTF8);
816+
using var strPtr = new StrPtr(str, Encodings.UTF8);
817817
using var fileObj = new PyString(file);
818818
return Delegates.Py_CompileStringObject(strPtr, fileObj, start, Utf8String, -1);
819819
}
820820

821821
internal static NewReference PyImport_ExecCodeModule(string name, BorrowedReference code)
822822
{
823-
using var namePtr = new StrPtr(name, Encoding.UTF8);
823+
using var namePtr = new StrPtr(name, Encodings.UTF8);
824824
return Delegates.PyImport_ExecCodeModule(namePtr, code);
825825
}
826826

@@ -867,13 +867,13 @@ internal static bool PyObject_IsIterable(BorrowedReference ob)
867867

868868
internal static int PyObject_HasAttrString(BorrowedReference pointer, string name)
869869
{
870-
using var namePtr = new StrPtr(name, Encoding.UTF8);
870+
using var namePtr = new StrPtr(name, Encodings.UTF8);
871871
return Delegates.PyObject_HasAttrString(pointer, namePtr);
872872
}
873873

874874
internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, string name)
875875
{
876-
using var namePtr = new StrPtr(name, Encoding.UTF8);
876+
using var namePtr = new StrPtr(name, Encodings.UTF8);
877877
return Delegates.PyObject_GetAttrString(pointer, namePtr);
878878
}
879879

@@ -884,12 +884,12 @@ internal static NewReference PyObject_GetAttrString(BorrowedReference pointer, S
884884
internal static int PyObject_DelAttr(BorrowedReference @object, BorrowedReference name) => Delegates.PyObject_SetAttr(@object, name, null);
885885
internal static int PyObject_DelAttrString(BorrowedReference @object, string name)
886886
{
887-
using var namePtr = new StrPtr(name, Encoding.UTF8);
887+
using var namePtr = new StrPtr(name, Encodings.UTF8);
888888
return Delegates.PyObject_SetAttrString(@object, namePtr, null);
889889
}
890890
internal static int PyObject_SetAttrString(BorrowedReference @object, string name, BorrowedReference value)
891891
{
892-
using var namePtr = new StrPtr(name, Encoding.UTF8);
892+
using var namePtr = new StrPtr(name, Encodings.UTF8);
893893
return Delegates.PyObject_SetAttrString(@object, namePtr, value);
894894
}
895895

@@ -1071,7 +1071,7 @@ internal static bool PyBool_CheckExact(BorrowedReference ob)
10711071

10721072
internal static NewReference PyLong_FromString(string value, int radix)
10731073
{
1074-
using var valPtr = new StrPtr(value, Encoding.UTF8);
1074+
using var valPtr = new StrPtr(value, Encodings.UTF8);
10751075
return Delegates.PyLong_FromString(valPtr, IntPtr.Zero, radix);
10761076
}
10771077

@@ -1252,12 +1252,14 @@ internal static bool PyString_CheckExact(BorrowedReference ob)
12521252

12531253
internal static NewReference PyString_FromString(string value)
12541254
{
1255+
int byteorder = BitConverter.IsLittleEndian ? -1 : 1;
1256+
int* byteorderPtr = &byteorder;
12551257
fixed(char* ptr = value)
12561258
return Delegates.PyUnicode_DecodeUTF16(
12571259
(IntPtr)ptr,
12581260
value.Length * sizeof(Char),
12591261
IntPtr.Zero,
1260-
IntPtr.Zero
1262+
(IntPtr)byteorderPtr
12611263
);
12621264
}
12631265

@@ -1272,7 +1274,7 @@ internal static NewReference EmptyPyBytes()
12721274
internal static NewReference PyByteArray_FromStringAndSize(IntPtr strPtr, nint len) => Delegates.PyByteArray_FromStringAndSize(strPtr, len);
12731275
internal static NewReference PyByteArray_FromStringAndSize(string s)
12741276
{
1275-
using var ptr = new StrPtr(s, Encoding.UTF8);
1277+
using var ptr = new StrPtr(s, Encodings.UTF8);
12761278
return PyByteArray_FromStringAndSize(ptr.RawPointer, checked((nint)ptr.ByteCount));
12771279
}
12781280

@@ -1300,7 +1302,7 @@ internal static IntPtr PyBytes_AsString(BorrowedReference ob)
13001302

13011303
internal static NewReference PyUnicode_InternFromString(string s)
13021304
{
1303-
using var ptr = new StrPtr(s, Encoding.UTF8);
1305+
using var ptr = new StrPtr(s, Encodings.UTF8);
13041306
return Delegates.PyUnicode_InternFromString(ptr);
13051307
}
13061308

@@ -1375,7 +1377,7 @@ internal static bool PyDict_Check(BorrowedReference ob)
13751377

13761378
internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer, string key)
13771379
{
1378-
using var keyStr = new StrPtr(key, Encoding.UTF8);
1380+
using var keyStr = new StrPtr(key, Encodings.UTF8);
13791381
return Delegates.PyDict_GetItemString(pointer, keyStr);
13801382
}
13811383

@@ -1391,7 +1393,7 @@ internal static BorrowedReference PyDict_GetItemString(BorrowedReference pointer
13911393
/// </summary>
13921394
internal static int PyDict_SetItemString(BorrowedReference dict, string key, BorrowedReference value)
13931395
{
1394-
using var keyPtr = new StrPtr(key, Encoding.UTF8);
1396+
using var keyPtr = new StrPtr(key, Encodings.UTF8);
13951397
return Delegates.PyDict_SetItemString(dict, keyPtr, value);
13961398
}
13971399

@@ -1400,7 +1402,7 @@ internal static int PyDict_SetItemString(BorrowedReference dict, string key, Bor
14001402

14011403
internal static int PyDict_DelItemString(BorrowedReference pointer, string key)
14021404
{
1403-
using var keyPtr = new StrPtr(key, Encoding.UTF8);
1405+
using var keyPtr = new StrPtr(key, Encodings.UTF8);
14041406
return Delegates.PyDict_DelItemString(pointer, keyPtr);
14051407
}
14061408

@@ -1515,7 +1517,7 @@ internal static bool PyIter_Check(BorrowedReference ob)
15151517

15161518
internal static NewReference PyModule_New(string name)
15171519
{
1518-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1520+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15191521
return Delegates.PyModule_New(namePtr);
15201522
}
15211523

@@ -1529,7 +1531,7 @@ internal static NewReference PyModule_New(string name)
15291531
/// <returns>Return -1 on error, 0 on success.</returns>
15301532
internal static int PyModule_AddObject(BorrowedReference module, string name, StolenReference value)
15311533
{
1532-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1534+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15331535
IntPtr valueAddr = value.DangerousGetAddressOrNull();
15341536
int res = Delegates.PyModule_AddObject(module, namePtr, valueAddr);
15351537
// We can't just exit here because the reference is stolen only on success.
@@ -1547,7 +1549,7 @@ internal static int PyModule_AddObject(BorrowedReference module, string name, St
15471549

15481550
internal static NewReference PyImport_ImportModule(string name)
15491551
{
1550-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1552+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15511553
return Delegates.PyImport_ImportModule(namePtr);
15521554
}
15531555

@@ -1556,7 +1558,7 @@ internal static NewReference PyImport_ImportModule(string name)
15561558

15571559
internal static BorrowedReference PyImport_AddModule(string name)
15581560
{
1559-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1561+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15601562
return Delegates.PyImport_AddModule(namePtr);
15611563
}
15621564

@@ -1584,13 +1586,13 @@ internal static void PySys_SetArgvEx(int argc, string[] argv, int updatepath)
15841586

15851587
internal static BorrowedReference PySys_GetObject(string name)
15861588
{
1587-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1589+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15881590
return Delegates.PySys_GetObject(namePtr);
15891591
}
15901592

15911593
internal static int PySys_SetObject(string name, BorrowedReference ob)
15921594
{
1593-
using var namePtr = new StrPtr(name, Encoding.UTF8);
1595+
using var namePtr = new StrPtr(name, Encodings.UTF8);
15941596
return Delegates.PySys_SetObject(namePtr, ob);
15951597
}
15961598

@@ -1689,7 +1691,7 @@ internal static IntPtr PyMem_Malloc(long size)
16891691

16901692
internal static void PyErr_SetString(BorrowedReference ob, string message)
16911693
{
1692-
using var msgPtr = new StrPtr(message, Encoding.UTF8);
1694+
using var msgPtr = new StrPtr(message, Encodings.UTF8);
16931695
Delegates.PyErr_SetString(ob, msgPtr);
16941696
}
16951697

src/runtime/Util/Encodings.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
using System;
2+
using System.Text;
3+
4+
namespace Python.Runtime;
5+
6+
static class Encodings {
7+
public static System.Text.Encoding UTF8 = new UTF8Encoding(false, true);
8+
public static System.Text.Encoding UTF16 = new UnicodeEncoding(!BitConverter.IsLittleEndian, false, true);
9+
public static System.Text.Encoding UTF32 = new UTF32Encoding(!BitConverter.IsLittleEndian, false, true);
10+
}

tests/test_conversion.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,9 @@ def test_string_conversion():
510510
ob.StringField = System.String(u'\uffff\uffff')
511511
assert ob.StringField == u'\uffff\uffff'
512512

513+
ob.StringField = System.String("\ufeffbom")
514+
assert ob.StringField == "\ufeffbom"
515+
513516
ob.StringField = None
514517
assert ob.StringField is None
515518

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy