From 52bb3da654eaac496a3f7cf0e5fe96b37ab4af20 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 21 Feb 2024 23:15:23 -0800 Subject: [PATCH 01/13] Implement the small code model for x86_64 and aarch64 macOS and Linux --- Python/jit.c | 143 +++++++++++++++++++++++++++++++++++++----- Tools/jit/_schema.py | 10 +++ Tools/jit/_targets.py | 68 ++++++++++++++++---- 3 files changed, 191 insertions(+), 30 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 839414bd810677..8a1196139ac379 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -47,18 +47,18 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } -static char * +static unsigned char * jit_alloc(size_t size) { assert(size); assert(size % get_page_size() == 0); #ifdef MS_WINDOWS int flags = MEM_COMMIT | MEM_RESERVE; - char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); + unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); int failed = memory == NULL; #else int flags = MAP_ANONYMOUS | MAP_PRIVATE; - char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); int failed = memory == MAP_FAILED; #endif if (failed) { @@ -69,7 +69,7 @@ jit_alloc(size_t size) } static int -jit_free(char *memory, size_t size) +jit_free(unsigned char *memory, size_t size) { assert(size); assert(size % get_page_size() == 0); @@ -86,7 +86,7 @@ jit_free(char *memory, size_t size) } static int -mark_executable(char *memory, size_t size) +mark_executable(unsigned char *memory, size_t size) { if (size == 0) { return 0; @@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size) } static int -mark_readable(char *memory, size_t size) +mark_readable(unsigned char *memory, size_t size) { if (size == 0) { return 0; @@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, // Fill all of stencil's holes in the memory pointed to 
by base, using the // values in patches. static void -patch(char *base, const Stencil *stencil, uint64_t *patches) +patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) { for (uint64_t i = 0; i < stencil->holes_size; i++) { const Hole *hole = &stencil->holes[i]; - void *location = base + hole->offset; + unsigned char *location = base + hole->offset; uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend; + uint8_t *loc8 = (uint8_t *)location; uint32_t *loc32 = (uint32_t *)location; uint64_t *loc64 = (uint64_t *)location; // LLD is a great reference for performing relocations... just keep in // mind that Tools/jit/build.py does filtering and preprocessing for us! // Here's a good place to start for each platform: // - aarch64-apple-darwin: + // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h // - aarch64-unknown-linux-gnu: @@ -208,6 +210,60 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) // 64-bit absolute address. *loc64 = value; continue; + case HoleKind_R_X86_64_GOTPCRELX: + case HoleKind_R_X86_64_REX_GOTPCRELX: + case HoleKind_X86_64_RELOC_GOT: + case HoleKind_X86_64_RELOC_GOT_LOAD: { + // 32-bit relative address. 
+ // Try to relax the GOT load into an immediate value: + uint64_t relaxed = *(uint64_t *)(value + 4) - 4; + if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && + (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) + { + if (loc8[-2] == 0x8B) { + // Before: mov eax, dword ptr [rip + AAA] + // After: lea eax, [rip + XXX] + assert(hole->kind == HoleKind_IMAGE_REL_AMD64_REL32 || + hole->kind == HoleKind_R_X86_64_GOTPCRELX || + hole->kind == HoleKind_R_X86_64_REX_GOTPCRELX || + hole->kind == HoleKind_X86_64_RELOC_GOT_LOAD); + loc8[-2] = 0x8D; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { + // Before: call qword ptr [rip + AAA] + // After: nop + // call XXX + assert(hole->kind == HoleKind_R_X86_64_GOTPCRELX || + hole->kind == HoleKind_X86_64_RELOC_GOT); + loc8[-2] = 0x90; + loc8[-1] = 0xE8; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { + // Before: jmp qword ptr [rip + AAA] + // After: nop + // jmp XXX + assert(hole->kind == HoleKind_IMAGE_REL_AMD64_REL32 || + hole->kind == HoleKind_R_X86_64_GOTPCRELX); + loc8[-2] = 0x90; + loc8[-1] = 0xE9; + value = relaxed; + } + } + } + // Fall through... + case HoleKind_R_X86_64_GOTPCREL: + case HoleKind_R_X86_64_PC32: + case HoleKind_X86_64_RELOC_SIGNED: + case HoleKind_X86_64_RELOC_BRANCH: + // 32-bit relative address. + value -= (uint64_t)location; + // Check that we're not out of range of 32 signed bits: + assert((int64_t)value >= -(1LL << 31)); + assert((int64_t)value < (1LL << 31)); + loc32[0] = (uint32_t)value; + continue; case HoleKind_R_AARCH64_CALL26: case HoleKind_R_AARCH64_JUMP26: // 28-bit relative branch. @@ -249,10 +305,61 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) set_bits(loc32, 5, value, 48, 16); continue; case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21: + case HoleKind_R_AARCH64_ADR_GOT_PAGE: { // 21-bit count of pages between this page and an absolute address's // page... I know, I know, it's weird. 
Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below). - assert(IS_AARCH64_ADRP(*loc32)); + const Hole *next_hole = &stencil->holes[i + 1]; + if (i + 1 < stencil->holes_size && + (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || + next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) && + next_hole->offset == hole->offset + 4 && + next_hole->symbol == hole->symbol && + next_hole->addend == hole->addend && + next_hole->value == hole->value) + { + assert(IS_AARCH64_ADRP(*loc32)); + unsigned char rd = get_bits(loc32[0], 0, 5); + assert(IS_AARCH64_LDR_OR_STR(loc32[1])); + unsigned char rt = get_bits(loc32[1], 0, 5); + unsigned char rn = get_bits(loc32[1], 5, 5); + assert(rd == rn && rn == rt); + uint64_t relaxed = *(uint64_t *)value; + if (relaxed < (1UL << 16)) { + // Before: adrp x0, AAA + // ldr x0, [x0 + BBB] + // After: movz x0, XXX + // nop + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; + loc32[1] = 0xD503201F; + i++; + continue; + } + if (relaxed < (1ULL << 32)) { + // Before: adrp x0, AAA + // ldr x0, [x0 + BBB] + // After: movz x0, XXX + // movk x0, YYY + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; + loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd; + i++; + continue; + } + relaxed = (uint64_t)value - (uint64_t)location; + if ((relaxed & 0x3) == 0 && + (int64_t)relaxed >= -(1L << 19) && + (int64_t)relaxed < (1L << 19)) + { + // Before: adrp x0, AAA + // ldr x0, [x0 + BBB] + // After: ldr x0, XXX + // nop + loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd; + loc32[1] = 0xD503201F; + i++; + continue; + } + } // Number of pages between this page and the value's page: value = (value >> 12) - ((uint64_t)location >> 12); // Check that we're not out of range of 21 signed bits: @@ -263,7 +370,9 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) // value[2:21] goes in loc[5:26]: set_bits(loc32, 5, value, 2, 19); continue; + } case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12: + 
case HoleKind_R_AARCH64_LD64_GOT_LO12_NC: // 12-bit low part of an absolute address. Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGE21 (above). assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); @@ -285,7 +394,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) } static void -copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches) +copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) { memcpy(base, stencil->body, stencil->body_size); patch(base, stencil, patches); @@ -294,8 +403,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches) static void emit(const StencilGroup *group, uint64_t patches[]) { - copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches); - copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches); + copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches); + copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches); } // Compiles executor in-place. Don't forget to call _PyJIT_Free later! 
@@ -316,14 +425,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size assert((page_size & (page_size - 1)) == 0); code_size += page_size - (code_size & (page_size - 1)); data_size += page_size - (data_size & (page_size - 1)); - char *memory = jit_alloc(code_size + data_size); + unsigned char *memory = jit_alloc(code_size + data_size); if (memory == NULL) { return -1; } // Loop again to emit the code: - char *code = memory; - char *data = memory + code_size; - char *top = code; + unsigned char *code = memory; + unsigned char *data = memory + code_size; + unsigned char *top = code; if (trace[0].opcode == _START_EXECUTOR) { // Don't want to execute this more than once: top += stencil_groups[_START_EXECUTOR].code.body_size; @@ -360,7 +469,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size void _PyJIT_Free(_PyExecutorObject *executor) { - char *memory = (char *)executor->jit_code; + unsigned char *memory = (unsigned char *)executor->jit_code; size_t size = executor->jit_size; if (memory) { executor->jit_code = NULL; diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py index 8eeb78e6cd69ee..975ca650a13c1a 100644 --- a/Tools/jit/_schema.py +++ b/Tools/jit/_schema.py @@ -8,13 +8,23 @@ "IMAGE_REL_AMD64_ADDR64", "IMAGE_REL_I386_DIR32", "R_AARCH64_ABS64", + "R_AARCH64_ADR_GOT_PAGE", "R_AARCH64_CALL26", "R_AARCH64_JUMP26", + "R_AARCH64_LD64_GOT_LO12_NC", "R_AARCH64_MOVW_UABS_G0_NC", "R_AARCH64_MOVW_UABS_G1_NC", "R_AARCH64_MOVW_UABS_G2_NC", "R_AARCH64_MOVW_UABS_G3", "R_X86_64_64", + "R_X86_64_GOTPCREL", + "R_X86_64_GOTPCRELX", + "R_X86_64_PC32", + "R_X86_64_REX_GOTPCRELX", + "X86_64_RELOC_BRANCH", + "X86_64_RELOC_GOT", + "X86_64_RELOC_GOT_LOAD", + "X86_64_RELOC_SIGNED", "X86_64_RELOC_UNSIGNED", ] diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 51b091eb246413..50f98df8a553b7 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -37,6 +37,7 @@ class _Target(typing.Generic[_S, 
_R]): triple: str _: dataclasses.KW_ONLY alignment: int = 1 + args: typing.Sequence[str] = () prefix: str = "" debug: bool = False force: bool = False @@ -121,21 +122,14 @@ async def _compile( "-fno-asynchronous-unwind-tables", # SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds: "-fno-jump-tables", - # Position-independent code adds indirection to every load and jump: - "-fno-pic", + "-fno-plt", # Don't make calls to weird stack-smashing canaries: "-fno-stack-protector", - # We have three options for code model: - # - "small": the default, assumes that code and data reside in the - # lowest 2GB of memory (128MB on aarch64) - # - "medium": assumes that code resides in the lowest 2GB of memory, - # and makes no assumptions about data (not available on aarch64) - # - "large": makes no assumptions about either code or data - "-mcmodel=large", "-o", f"{o}", "-std=c11", f"{c}", + *self.args, ] await _llvm.run("clang", args, echo=self.verbose) return await self._parse(o) @@ -284,7 +278,23 @@ def _handle_section( def _handle_relocation( self, base: int, relocation: _schema.ELFRelocation, raw: bytes ) -> _stencils.Hole: + symbol: str | None match relocation: + case { + "Addend": addend, + "Offset": offset, + "Symbol": {"Value": s}, + "Type": { + "Value": "R_AARCH64_ADR_GOT_PAGE" + | "R_AARCH64_LD64_GOT_LO12_NC" + | "R_X86_64_GOTPCREL" + | "R_X86_64_GOTPCRELX" + | "R_X86_64_REX_GOTPCRELX" as kind + }, + }: + offset += base + s = s.removeprefix(self.prefix) + value, symbol = _stencils.HoleValue.GOT, s case { "Addend": addend, "Offset": offset, @@ -356,6 +366,34 @@ def _handle_relocation( s = s.removeprefix(self.prefix) value, symbol = _stencils.HoleValue.GOT, s addend = 0 + case { + "Offset": offset, + "Symbol": {"Value": s}, + "Type": { + "Value": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind + }, + }: + offset += base + s = s.removeprefix(self.prefix) + value, symbol = _stencils.HoleValue.GOT, s + addend = int.from_bytes(raw[offset : offset + 4], "little", 
signed=True) - 4 + case { + "Offset": offset, + "Section": {"Value": s}, + "Type": { + "Value": "X86_64_RELOC_SIGNED" as kind + }, + } | { + "Offset": offset, + "Symbol": {"Value": s}, + "Type": { + "Value": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind + }, + }: + offset += base + s = s.removeprefix(self.prefix) + value, symbol = _stencils.symbol_to_value(s) + addend = int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 case { "Offset": offset, "Section": {"Value": s}, @@ -380,15 +418,19 @@ def _handle_relocation( def get_target(host: str) -> _COFF | _ELF | _MachO: """Build a _Target for the given host "triple" and options.""" if re.fullmatch(r"aarch64-apple-darwin.*", host): - return _MachO(host, alignment=8, prefix="_") + args = ["-mcmodel=large"] + return _MachO(host, alignment=8, args=args, prefix="_") if re.fullmatch(r"aarch64-.*-linux-gnu", host): - return _ELF(host, alignment=8) + args = ["-mcmodel=large"] + return _ELF(host, alignment=8, args=args) if re.fullmatch(r"i686-pc-windows-msvc", host): - return _COFF(host, prefix="_") + args = ["-mcmodel=large"] + return _COFF(host, args=args, prefix="_") if re.fullmatch(r"x86_64-apple-darwin.*", host): return _MachO(host, prefix="_") if re.fullmatch(r"x86_64-pc-windows-msvc", host): - return _COFF(host) + args = ["-mcmodel=large"] + return _COFF(host, args=args) if re.fullmatch(r"x86_64-.*-linux-gnu", host): return _ELF(host) raise ValueError(host) From 81fe5edbd01eba7af51b7db456e84a020d6aa292 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 21 Feb 2024 23:16:32 -0800 Subject: [PATCH 02/13] blacken --- Tools/jit/_targets.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 50f98df8a553b7..6f7b58a9b1dc2a 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -369,20 +369,18 @@ def _handle_relocation( case { "Offset": offset, "Symbol": {"Value": s}, - "Type": { - "Value": 
"X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind - }, + "Type": {"Value": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind}, }: offset += base s = s.removeprefix(self.prefix) value, symbol = _stencils.HoleValue.GOT, s - addend = int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 + addend = ( + int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 + ) case { "Offset": offset, "Section": {"Value": s}, - "Type": { - "Value": "X86_64_RELOC_SIGNED" as kind - }, + "Type": {"Value": "X86_64_RELOC_SIGNED" as kind}, } | { "Offset": offset, "Symbol": {"Value": s}, @@ -393,7 +391,9 @@ def _handle_relocation( offset += base s = s.removeprefix(self.prefix) value, symbol = _stencils.symbol_to_value(s) - addend = int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 + addend = ( + int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 + ) case { "Offset": offset, "Section": {"Value": s}, From 36de1cd4a879d9d6401008c5e52b7d75ac8c30f3 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 22 Feb 2024 00:11:32 -0800 Subject: [PATCH 03/13] Remove references to IMAGE_REL_AMD64_REL32 and clean up comments --- Python/jit.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 8a1196139ac379..07e5877cb1ce36 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -221,31 +221,18 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) { if (loc8[-2] == 0x8B) { - // Before: mov eax, dword ptr [rip + AAA] - // After: lea eax, [rip + XXX] - assert(hole->kind == HoleKind_IMAGE_REL_AMD64_REL32 || - hole->kind == HoleKind_R_X86_64_GOTPCRELX || - hole->kind == HoleKind_R_X86_64_REX_GOTPCRELX || - hole->kind == HoleKind_X86_64_RELOC_GOT_LOAD); + // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] loc8[-2] = 0x8D; value = relaxed; } else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { - 
// Before: call qword ptr [rip + AAA] - // After: nop - // call XXX - assert(hole->kind == HoleKind_R_X86_64_GOTPCRELX || - hole->kind == HoleKind_X86_64_RELOC_GOT); + // call qword ptr [rip + AAA] -> nop; call XXX loc8[-2] = 0x90; loc8[-1] = 0xE8; value = relaxed; } else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { - // Before: jmp qword ptr [rip + AAA] - // After: nop - // jmp XXX - assert(hole->kind == HoleKind_IMAGE_REL_AMD64_REL32 || - hole->kind == HoleKind_R_X86_64_GOTPCRELX); + // jmp qword ptr [rip + AAA] -> nop; jmp XXX loc8[-2] = 0x90; loc8[-1] = 0xE9; value = relaxed; @@ -326,20 +313,14 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) assert(rd == rn && rn == rt); uint64_t relaxed = *(uint64_t *)value; if (relaxed < (1UL << 16)) { - // Before: adrp x0, AAA - // ldr x0, [x0 + BBB] - // After: movz x0, XXX - // nop + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; loc32[1] = 0xD503201F; i++; continue; } if (relaxed < (1ULL << 32)) { - // Before: adrp x0, AAA - // ldr x0, [x0 + BBB] - // After: movz x0, XXX - // movk x0, YYY + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd; i++; continue; @@ -350,10 +331,7 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) (int64_t)relaxed >= -(1L << 19) && (int64_t)relaxed < (1L << 19)) { - // Before: adrp x0, AAA - // ldr x0, [x0 + BBB] - // After: ldr x0, XXX - // nop + // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd; loc32[1] = 0xD503201F; i++; continue; From 74860c12134a587d6b481e4112261fa4ba837685 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 22 Feb 2024 00:13:53 -0800 Subject: [PATCH 04/13] Add comment --- Python/jit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/jit.c
b/Python/jit.c index 07e5877cb1ce36..8cc9bae6c79e4a 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -296,6 +296,7 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) // 21-bit count of pages between this page and an absolute address's // page... I know, I know, it's weird. Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below). + // Try to relax the pair of GOT loads into an immediate value: const Hole *next_hole = &stencil->holes[i + 1]; if (i + 1 < stencil->holes_size && (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || From aa53fab0ccc6ac98a704a3cecdfa0bbd6b4efdf9 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 22 Feb 2024 00:15:21 -0800 Subject: [PATCH 05/13] Move assert back --- Python/jit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 8cc9bae6c79e4a..552fb874690e24 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -292,11 +292,12 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) set_bits(loc32, 5, value, 48, 16); continue; case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21: - case HoleKind_R_AARCH64_ADR_GOT_PAGE: { + case HoleKind_R_AARCH64_ADR_GOT_PAGE: // 21-bit count of pages between this page and an absolute address's // page... I know, I know, it's weird. Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below). 
// Try to relax the pair of GOT loads into an immediate value: + assert(IS_AARCH64_ADRP(*loc32)); const Hole *next_hole = &stencil->holes[i + 1]; if (i + 1 < stencil->holes_size && (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || @@ -306,7 +307,6 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) next_hole->addend == hole->addend && next_hole->value == hole->value) { - assert(IS_AARCH64_ADRP(*loc32)); unsigned char rd = get_bits(loc32[0], 0, 5); assert(IS_AARCH64_LDR_OR_STR(loc32[1])); unsigned char rt = get_bits(loc32[1], 0, 5); @@ -349,7 +349,6 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) // value[2:21] goes in loc[5:26]: set_bits(loc32, 5, value, 2, 19); continue; - } case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12: case HoleKind_R_AARCH64_LD64_GOT_LO12_NC: // 12-bit low part of an absolute address. Pairs nicely with From 462095c45048125e75953be2e2fbb2fa2db9fc9a Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 22 Feb 2024 00:16:05 -0800 Subject: [PATCH 06/13] fixup --- Python/jit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/jit.c b/Python/jit.c index 552fb874690e24..ac2c60ed925a26 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -296,8 +296,8 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) // 21-bit count of pages between this page and an absolute address's // page... I know, I know, it's weird. Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below). 
- // Try to relax the pair of GOT loads into an immediate value: assert(IS_AARCH64_ADRP(*loc32)); + // Try to relax the pair of GOT loads into an immediate value: const Hole *next_hole = &stencil->holes[i + 1]; if (i + 1 < stencil->holes_size && (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || From dfa925c405b50aa06afc2b6c2297b87949f052ed Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 22 Feb 2024 14:28:47 -0800 Subject: [PATCH 07/13] Update JIT to use small code model on Windows --- Python/jit.c | 5 +++-- Tools/jit/_schema.py | 3 ++- Tools/jit/_targets.py | 32 +++++++++++++++++++++----------- Tools/jit/template.c | 4 ++-- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index ac2c60ed925a26..54a300eec83cde 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -203,13 +203,14 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) *loc32 = (uint32_t)value; continue; case HoleKind_ARM64_RELOC_UNSIGNED: - case HoleKind_IMAGE_REL_AMD64_ADDR64: case HoleKind_R_AARCH64_ABS64: case HoleKind_X86_64_RELOC_UNSIGNED: case HoleKind_R_X86_64_64: // 64-bit absolute address. 
*loc64 = value; continue; + case HoleKind_IMAGE_REL_AMD64_REL32: + case HoleKind_IMAGE_REL_I386_REL32: case HoleKind_R_X86_64_GOTPCRELX: case HoleKind_R_X86_64_REX_GOTPCRELX: case HoleKind_X86_64_RELOC_GOT: @@ -249,7 +250,7 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) // Check that we're not out of range of 32 signed bits: assert((int64_t)value >= -(1LL << 31)); assert((int64_t)value < (1LL << 31)); - loc32[0] = (uint32_t)value; + *loc32 = (uint32_t)value; continue; case HoleKind_R_AARCH64_CALL26: case HoleKind_R_AARCH64_JUMP26: diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py index 975ca650a13c1a..a76a24207bcbbc 100644 --- a/Tools/jit/_schema.py +++ b/Tools/jit/_schema.py @@ -5,8 +5,9 @@ "ARM64_RELOC_GOT_LOAD_PAGE21", "ARM64_RELOC_GOT_LOAD_PAGEOFF12", "ARM64_RELOC_UNSIGNED", - "IMAGE_REL_AMD64_ADDR64", + "IMAGE_REL_AMD64_REL32", "IMAGE_REL_I386_DIR32", + "IMAGE_REL_I386_REL32", "R_AARCH64_ABS64", "R_AARCH64_ADR_GOT_PAGE", "R_AARCH64_CALL26", diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 06dc4e7acc6c91..b3e63bb32bc879 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -106,7 +106,7 @@ async def _compile( o = tempdir / f"{opname}.o" args = [ f"--target={self.triple}", - "-DPy_BUILD_CORE", + "-DPy_BUILD_CORE_MODULE", "-D_DEBUG" if self.debug else "-DNDEBUG", f"-D_JIT_OPCODE={opname}", "-D_PyJIT_ACTIVE", @@ -200,6 +200,14 @@ def _handle_section( hole = self._handle_relocation(base, relocation, stencil.body) stencil.holes.append(hole) + def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str | None]: + if name.startswith("__imp_"): + name = name.removeprefix("__imp_") + name = name.removeprefix(self.prefix) + return _stencils.HoleValue.GOT, name + name = name.removeprefix(self.prefix) + return _stencils.symbol_to_value(name) + def _handle_relocation( self, base: int, relocation: _schema.COFFRelocation, raw: bytes ) -> _stencils.Hole: @@ -207,21 +215,23 @@ def _handle_relocation( case 
{ "Offset": offset, "Symbol": s, - "Type": {"Value": "IMAGE_REL_AMD64_ADDR64" as kind}, + "Type": {"Value": "IMAGE_REL_I386_DIR32" as kind}, }: offset += base - s = s.removeprefix(self.prefix) - value, symbol = _stencils.symbol_to_value(s) - addend = int.from_bytes(raw[offset : offset + 8], "little") + value, symbol = self._unwrap_dllimport(s) + addend = int.from_bytes(raw[offset : offset + 4], "little") case { "Offset": offset, "Symbol": s, - "Type": {"Value": "IMAGE_REL_I386_DIR32" as kind}, + "Type": { + "Value": "IMAGE_REL_AMD64_REL32" | "IMAGE_REL_I386_REL32" as kind + }, }: offset += base - s = s.removeprefix(self.prefix) - value, symbol = _stencils.symbol_to_value(s) - addend = int.from_bytes(raw[offset : offset + 4], "little") + value, symbol = self._unwrap_dllimport(s) + addend = ( + int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 + ) case _: raise NotImplementedError(relocation) return _stencils.Hole(offset, kind, value, symbol, addend) @@ -423,12 +433,12 @@ def get_target(host: str) -> _COFF | _ELF | _MachO: args = ["-mcmodel=large"] return _ELF(host, alignment=8, args=args) if re.fullmatch(r"i686-pc-windows-msvc", host): - args = ["-mcmodel=large"] + args = ["-DPy_NO_ENABLE_SHARED"] return _COFF(host, args=args, prefix="_") if re.fullmatch(r"x86_64-apple-darwin.*", host): return _MachO(host, prefix="_") if re.fullmatch(r"x86_64-pc-windows-msvc", host): - args = ["-mcmodel=large"] + args = ["-fms-runtime-lib=dll"] return _COFF(host, args=args) if re.fullmatch(r"x86_64-.*-linux-gnu", host): return _ELF(host) diff --git a/Tools/jit/template.c b/Tools/jit/template.c index d79c6efb8f6de4..3cb9460291ed29 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -58,11 +58,11 @@ do { \ } while (0) #define PATCH_VALUE(TYPE, NAME, ALIAS) \ - extern void ALIAS; \ + PyAPI_DATA(void) ALIAS; \ TYPE NAME = (TYPE)(uint64_t)&ALIAS; #define PATCH_JUMP(ALIAS) \ - extern void ALIAS; \ + PyAPI_DATA(void) ALIAS; \ __attribute__((musttail)) \ return 
((jit_func)&ALIAS)(frame, stack_pointer, tstate); From f7cc2fd6b31066d6fc1a654ba9729e900befac3a Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 22 Feb 2024 14:29:24 -0800 Subject: [PATCH 08/13] Export a bunch of needed symbols --- Include/cpython/optimizer.h | 2 +- Include/internal/pycore_ceval.h | 30 ++++++++++++++----------- Include/internal/pycore_dict.h | 10 ++++----- Include/internal/pycore_floatobject.h | 2 +- Include/internal/pycore_function.h | 2 +- Include/internal/pycore_genobject.h | 4 ++-- Include/internal/pycore_intrinsics.h | 4 ++-- Include/internal/pycore_list.h | 6 ++--- Include/internal/pycore_long.h | 6 ++--- Include/internal/pycore_object.h | 4 ++-- Include/internal/pycore_pyerrors.h | 8 +++---- Include/internal/pycore_sliceobject.h | 2 +- Include/internal/pycore_tuple.h | 2 +- Include/internal/pycore_typeobject.h | 2 +- Include/internal/pycore_unicodeobject.h | 4 ++-- Python/bytecodes.c | 8 +++---- Python/ceval.c | 6 +++++ Python/ceval_macros.h | 7 ------ Python/executor_cases.c.h | 8 +++---- Python/generated_cases.c.h | 8 +++---- Tools/jit/_stencils.py | 9 +++++++- 21 files changed, 72 insertions(+), 62 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index fe54d1ddfe6129..213968a7984299 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -92,7 +92,7 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void); PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset); -int +PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer, _PyExecutorObject **exec_ptr); void _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *); diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index bf77526cf75cc1..6eab2ba1daedf8 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -181,22 +181,26 @@ extern PyObject* 
_Py_MakeCoro(PyFunctionObject *func); /* Handle signals, pending calls, GIL drop request and asynchronous exception */ -extern int _Py_HandlePending(PyThreadState *tstate); +PyAPI_FUNC(int) _Py_HandlePending(PyThreadState *tstate); extern PyObject * _PyEval_GetFrameLocals(void); -extern const binaryfunc _PyEval_BinaryOps[]; -int _PyEval_CheckExceptStarTypeValid(PyThreadState *tstate, PyObject* right); -int _PyEval_CheckExceptTypeValid(PyThreadState *tstate, PyObject* right); -int _PyEval_ExceptionGroupMatch(PyObject* exc_value, PyObject *match_type, PyObject **match, PyObject **rest); -void _PyEval_FormatAwaitableError(PyThreadState *tstate, PyTypeObject *type, int oparg); -void _PyEval_FormatExcCheckArg(PyThreadState *tstate, PyObject *exc, const char *format_str, PyObject *obj); -void _PyEval_FormatExcUnbound(PyThreadState *tstate, PyCodeObject *co, int oparg); -void _PyEval_FormatKwargsError(PyThreadState *tstate, PyObject *func, PyObject *kwargs); -PyObject *_PyEval_MatchClass(PyThreadState *tstate, PyObject *subject, PyObject *type, Py_ssize_t nargs, PyObject *kwargs); -PyObject *_PyEval_MatchKeys(PyThreadState *tstate, PyObject *map, PyObject *keys); -int _PyEval_UnpackIterable(PyThreadState *tstate, PyObject *v, int argcnt, int argcntafter, PyObject **sp); -void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); +typedef PyObject *(*conversion_func)(PyObject *); + +PyAPI_DATA(const binaryfunc) _PyEval_BinaryOps[]; +PyAPI_DATA(const conversion_func) _PyEval_ConversionFuncs[]; + +PyAPI_FUNC(int) _PyEval_CheckExceptStarTypeValid(PyThreadState *tstate, PyObject* right); +PyAPI_FUNC(int) _PyEval_CheckExceptTypeValid(PyThreadState *tstate, PyObject* right); +PyAPI_FUNC(int) _PyEval_ExceptionGroupMatch(PyObject* exc_value, PyObject *match_type, PyObject **match, PyObject **rest); +PyAPI_FUNC(void) _PyEval_FormatAwaitableError(PyThreadState *tstate, PyTypeObject *type, int oparg); +PyAPI_FUNC(void) _PyEval_FormatExcCheckArg(PyThreadState 
*tstate, PyObject *exc, const char *format_str, PyObject *obj); +PyAPI_FUNC(void) _PyEval_FormatExcUnbound(PyThreadState *tstate, PyCodeObject *co, int oparg); +PyAPI_FUNC(void) _PyEval_FormatKwargsError(PyThreadState *tstate, PyObject *func, PyObject *kwargs); +PyAPI_FUNC(PyObject *)_PyEval_MatchClass(PyThreadState *tstate, PyObject *subject, PyObject *type, Py_ssize_t nargs, PyObject *kwargs); +PyAPI_FUNC(PyObject *)_PyEval_MatchKeys(PyThreadState *tstate, PyObject *map, PyObject *keys); +PyAPI_FUNC(int) _PyEval_UnpackIterable(PyThreadState *tstate, PyObject *v, int argcnt, int argcntafter, PyObject **sp); +PyAPI_FUNC(void) _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); /* Bits that can be set in PyThreadState.eval_breaker */ diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index 5a496d59ab7af7..954d6b46c55fe5 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -52,7 +52,7 @@ PyAPI_FUNC(Py_ssize_t) _PyDict_SizeOf(PyDictObject *); of a key wins, if override is 2, a KeyError with conflicting key as argument is raised. 
*/ -extern int _PyDict_MergeEx(PyObject *mp, PyObject *other, int override); +PyAPI_FUNC(int) _PyDict_MergeEx(PyObject *mp, PyObject *other, int override); extern void _PyDict_DebugMallocStats(FILE *out); @@ -100,10 +100,10 @@ extern Py_ssize_t _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t has extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *); extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key); -extern PyObject *_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *); +PyAPI_FUNC(PyObject *)_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *); /* Consumes references to key and value */ -extern int _PyDict_SetItem_Take2(PyDictObject *op, PyObject *key, PyObject *value); +PyAPI_FUNC(int) _PyDict_SetItem_Take2(PyDictObject *op, PyObject *key, PyObject *value); extern int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr, PyObject *name, PyObject *value); extern int _PyDict_Pop_KnownHash( @@ -246,8 +246,8 @@ _PyDict_NotifyEvent(PyInterpreterState *interp, } extern PyObject *_PyObject_MakeDictFromInstanceAttributes(PyObject *obj, PyDictValues *values); -extern bool _PyObject_MakeInstanceAttributesFromDict(PyObject *obj, PyDictOrValues *dorv); -extern PyObject *_PyDict_FromItems( +PyAPI_FUNC(bool) _PyObject_MakeInstanceAttributesFromDict(PyObject *obj, PyDictOrValues *dorv); +PyAPI_FUNC(PyObject *)_PyDict_FromItems( PyObject *const *keys, Py_ssize_t keys_offset, PyObject *const *values, Py_ssize_t values_offset, Py_ssize_t length); diff --git a/Include/internal/pycore_floatobject.h b/Include/internal/pycore_floatobject.h index 3767df5506d43f..f984df695696c3 100644 --- a/Include/internal/pycore_floatobject.h +++ b/Include/internal/pycore_floatobject.h @@ -34,7 +34,7 @@ struct _Py_float_runtime_state { -void _PyFloat_ExactDealloc(PyObject *op); +PyAPI_FUNC(void) _PyFloat_ExactDealloc(PyObject *op); extern void _PyFloat_DebugMallocStats(FILE* out); diff --git 
a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index 3f3da8a44b77e4..dad6a89af77dec 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -29,7 +29,7 @@ struct _py_func_state { extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr); extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); -extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); +PyAPI_FUNC(void) _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); extern PyObject *_Py_set_function_type_params( diff --git a/Include/internal/pycore_genobject.h b/Include/internal/pycore_genobject.h index b2aa017598409f..9463c822ad8669 100644 --- a/Include/internal/pycore_genobject.h +++ b/Include/internal/pycore_genobject.h @@ -10,7 +10,7 @@ extern "C" { #include "pycore_freelist.h" -extern PyObject *_PyGen_yf(PyGenObject *); +PyAPI_FUNC(PyObject *)_PyGen_yf(PyGenObject *); extern void _PyGen_Finalize(PyObject *self); // Export for '_asyncio' shared extension @@ -19,7 +19,7 @@ PyAPI_FUNC(int) _PyGen_SetStopIterationValue(PyObject *); // Export for '_asyncio' shared extension PyAPI_FUNC(int) _PyGen_FetchStopIterationValue(PyObject **); -extern PyObject *_PyCoro_GetAwaitableIter(PyObject *o); +PyAPI_FUNC(PyObject *)_PyCoro_GetAwaitableIter(PyObject *o); extern PyObject *_PyAsyncGenValueWrapperNew(PyThreadState *state, PyObject *); extern PyTypeObject _PyCoroWrapper_Type; diff --git a/Include/internal/pycore_intrinsics.h b/Include/internal/pycore_intrinsics.h index 3a8dd95cff8e5d..8fa88ea3f74caa 100644 --- a/Include/internal/pycore_intrinsics.h +++ b/Include/internal/pycore_intrinsics.h @@ -44,7 +44,7 @@ typedef struct { const char *name; } intrinsic_func2_info; -extern const intrinsic_func1_info _PyIntrinsics_UnaryFunctions[]; -extern const intrinsic_func2_info _PyIntrinsics_BinaryFunctions[]; +PyAPI_DATA(const 
intrinsic_func1_info) _PyIntrinsics_UnaryFunctions[]; +PyAPI_DATA(const intrinsic_func2_info) _PyIntrinsics_BinaryFunctions[]; #endif // !Py_INTERNAL_INTRINSIC_H diff --git a/Include/internal/pycore_list.h b/Include/internal/pycore_list.h index 50dc13c4da4487..2a82912e41d557 100644 --- a/Include/internal/pycore_list.h +++ b/Include/internal/pycore_list.h @@ -10,12 +10,12 @@ extern "C" { #include "pycore_freelist.h" // _PyFreeListState -extern PyObject* _PyList_Extend(PyListObject *, PyObject *); +PyAPI_FUNC(PyObject*) _PyList_Extend(PyListObject *, PyObject *); extern void _PyList_DebugMallocStats(FILE *out); #define _PyList_ITEMS(op) _Py_RVALUE(_PyList_CAST(op)->ob_item) -extern int +PyAPI_FUNC(int) _PyList_AppendTakeRefListResize(PyListObject *self, PyObject *newitem); // In free-threaded build: self should be locked by the caller, if it should be thread-safe. @@ -54,7 +54,7 @@ typedef struct { PyListObject *it_seq; /* Set to NULL when iterator is exhausted */ } _PyListIterObject; -extern PyObject *_PyList_FromArraySteal(PyObject *const *src, Py_ssize_t n); +PyAPI_FUNC(PyObject *)_PyList_FromArraySteal(PyObject *const *src, Py_ssize_t n); #ifdef __cplusplus } diff --git a/Include/internal/pycore_long.h b/Include/internal/pycore_long.h index ec27df9e416c58..f04f66d053bab9 100644 --- a/Include/internal/pycore_long.h +++ b/Include/internal/pycore_long.h @@ -121,9 +121,9 @@ PyAPI_DATA(PyObject*) _PyLong_Rshift(PyObject *, size_t); // Export for 'math' shared extension PyAPI_DATA(PyObject*) _PyLong_Lshift(PyObject *, size_t); -extern PyObject* _PyLong_Add(PyLongObject *left, PyLongObject *right); -extern PyObject* _PyLong_Multiply(PyLongObject *left, PyLongObject *right); -extern PyObject* _PyLong_Subtract(PyLongObject *left, PyLongObject *right); +PyAPI_FUNC(PyObject*) _PyLong_Add(PyLongObject *left, PyLongObject *right); +PyAPI_FUNC(PyObject*) _PyLong_Multiply(PyLongObject *left, PyLongObject *right); +PyAPI_FUNC(PyObject*) _PyLong_Subtract(PyLongObject *left, 
PyLongObject *right); // Export for 'binascii' shared extension. PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; diff --git a/Include/internal/pycore_object.h b/Include/internal/pycore_object.h index 34a83ea228e8b1..9809f5f2e0271a 100644 --- a/Include/internal/pycore_object.h +++ b/Include/internal/pycore_object.h @@ -73,7 +73,7 @@ PyAPI_FUNC(int) _PyObject_IsFreed(PyObject *); .ob_size = size \ } -extern void _Py_NO_RETURN _Py_FatalRefcountErrorFunc( +PyAPI_FUNC(void) _Py_NO_RETURN _Py_FatalRefcountErrorFunc( const char *func, const char *message); @@ -684,7 +684,7 @@ PyAPI_FUNC(PyObject*) _PyObject_LookupSpecial(PyObject *, PyObject *); extern int _PyObject_IsAbstract(PyObject *); -extern int _PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method); +PyAPI_FUNC(int) _PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method); extern PyObject* _PyObject_NextNotImplemented(PyObject *); // Pickle support. diff --git a/Include/internal/pycore_pyerrors.h b/Include/internal/pycore_pyerrors.h index 0f16fb894d17e1..910335fd2cf33b 100644 --- a/Include/internal/pycore_pyerrors.h +++ b/Include/internal/pycore_pyerrors.h @@ -95,7 +95,7 @@ extern void _PyErr_Fetch( extern PyObject* _PyErr_GetRaisedException(PyThreadState *tstate); -extern int _PyErr_ExceptionMatches( +PyAPI_FUNC(int) _PyErr_ExceptionMatches( PyThreadState *tstate, PyObject *exc); @@ -114,18 +114,18 @@ extern void _PyErr_SetObject( extern void _PyErr_ChainStackItem(void); -extern void _PyErr_Clear(PyThreadState *tstate); +PyAPI_FUNC(void) _PyErr_Clear(PyThreadState *tstate); extern void _PyErr_SetNone(PyThreadState *tstate, PyObject *exception); extern PyObject* _PyErr_NoMemory(PyThreadState *tstate); -extern void _PyErr_SetString( +PyAPI_FUNC(void) _PyErr_SetString( PyThreadState *tstate, PyObject *exception, const char *string); -extern PyObject* _PyErr_Format( +PyAPI_FUNC(PyObject*) _PyErr_Format( PyThreadState *tstate, PyObject *exception, const char *format, diff --git 
a/Include/internal/pycore_sliceobject.h b/Include/internal/pycore_sliceobject.h index 89086f67683a2f..ba8b1f1cb27dee 100644 --- a/Include/internal/pycore_sliceobject.h +++ b/Include/internal/pycore_sliceobject.h @@ -11,7 +11,7 @@ extern "C" { /* runtime lifecycle */ -extern PyObject * +PyAPI_FUNC(PyObject *) _PyBuildSlice_ConsumeRefs(PyObject *start, PyObject *stop); #ifdef __cplusplus diff --git a/Include/internal/pycore_tuple.h b/Include/internal/pycore_tuple.h index 4605f355ccbc38..14a9e42c3a324c 100644 --- a/Include/internal/pycore_tuple.h +++ b/Include/internal/pycore_tuple.h @@ -21,7 +21,7 @@ extern PyStatus _PyTuple_InitGlobalObjects(PyInterpreterState *); #define _PyTuple_ITEMS(op) _Py_RVALUE(_PyTuple_CAST(op)->ob_item) extern PyObject *_PyTuple_FromArray(PyObject *const *, Py_ssize_t); -extern PyObject *_PyTuple_FromArraySteal(PyObject *const *, Py_ssize_t); +PyAPI_FUNC(PyObject *)_PyTuple_FromArraySteal(PyObject *const *, Py_ssize_t); typedef struct { PyObject_HEAD diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h index 9134ab45cd0039..c214111fed6f97 100644 --- a/Include/internal/pycore_typeobject.h +++ b/Include/internal/pycore_typeobject.h @@ -147,7 +147,7 @@ extern PyObject* _Py_slot_tp_getattr_hook(PyObject *self, PyObject *name); extern PyTypeObject _PyBufferWrapper_Type; -extern PyObject* _PySuper_Lookup(PyTypeObject *su_type, PyObject *su_obj, +PyAPI_FUNC(PyObject*) _PySuper_Lookup(PyTypeObject *su_type, PyObject *su_obj, PyObject *name, int *meth_found); diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 7ee540154b23d8..fea5ceea0954f4 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -31,7 +31,7 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency( PyObject *op, int check_content); -extern void _PyUnicode_ExactDealloc(PyObject *op); +PyAPI_FUNC(void) _PyUnicode_ExactDealloc(PyObject *op); extern Py_ssize_t 
_PyUnicode_InternedSize(void); // Get a copy of a Unicode string. @@ -202,7 +202,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( /* --- Methods & Slots ---------------------------------------------------- */ -extern PyObject* _PyUnicode_JoinArray( +PyAPI_FUNC(PyObject*) _PyUnicode_JoinArray( PyObject *separator, PyObject *const *items, Py_ssize_t seqlen diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 7e2c9c4d6a6db4..62dd447e13cdfe 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2782,7 +2782,7 @@ dummy_func( GOTO_ERROR(error); } DECREF_INPUTS(); - res = _PyObject_CallNoArgsTstate(tstate, enter); + res = PyObject_CallNoArgs(enter); Py_DECREF(enter); if (res == NULL) { Py_DECREF(exit); @@ -2817,7 +2817,7 @@ dummy_func( GOTO_ERROR(error); } DECREF_INPUTS(); - res = _PyObject_CallNoArgsTstate(tstate, enter); + res = PyObject_CallNoArgs(enter); Py_DECREF(enter); if (res == NULL) { Py_DECREF(exit); @@ -3852,9 +3852,9 @@ dummy_func( } inst(CONVERT_VALUE, (value -- result)) { - convertion_func_ptr conv_fn; + conversion_func conv_fn; assert(oparg >= FVC_STR && oparg <= FVC_ASCII); - conv_fn = CONVERSION_FUNCTIONS[oparg]; + conv_fn = _PyEval_ConversionFuncs[oparg]; result = conv_fn(value); Py_DECREF(value); ERROR_IF(result == NULL, error); diff --git a/Python/ceval.c b/Python/ceval.c index 06c136aeb252c9..f817f288903694 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -337,6 +337,12 @@ const binaryfunc _PyEval_BinaryOps[] = { [NB_INPLACE_XOR] = PyNumber_InPlaceXor, }; +const conversion_func _PyEval_ConversionFuncs[4] = { + [FVC_STR] = PyObject_Str, + [FVC_REPR] = PyObject_Repr, + [FVC_ASCII] = PyObject_ASCII +}; + // PEP 634: Structural Pattern Matching diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 01a9b32229d8a5..b39984b298e6fb 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -348,13 +348,6 @@ do { \ } \ } while (0); -typedef PyObject *(*convertion_func_ptr)(PyObject *); - -static const 
convertion_func_ptr CONVERSION_FUNCTIONS[4] = { - [FVC_STR] = PyObject_Str, - [FVC_REPR] = PyObject_Repr, - [FVC_ASCII] = PyObject_ASCII -}; // GH-89279: Force inlining by using a macro. #if defined(_MSC_VER) && SIZEOF_INT == 4 diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 3054058cf44d31..228454b77a1c8b 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2548,7 +2548,7 @@ GOTO_ERROR(error); } Py_DECREF(mgr); - res = _PyObject_CallNoArgsTstate(tstate, enter); + res = PyObject_CallNoArgs(enter); Py_DECREF(enter); if (res == NULL) { Py_DECREF(exit); @@ -2591,7 +2591,7 @@ GOTO_ERROR(error); } Py_DECREF(mgr); - res = _PyObject_CallNoArgsTstate(tstate, enter); + res = PyObject_CallNoArgs(enter); Py_DECREF(enter); if (res == NULL) { Py_DECREF(exit); @@ -3570,9 +3570,9 @@ PyObject *result; oparg = CURRENT_OPARG(); value = stack_pointer[-1]; - convertion_func_ptr conv_fn; + conversion_func conv_fn; assert(oparg >= FVC_STR && oparg <= FVC_ASCII); - conv_fn = CONVERSION_FUNCTIONS[oparg]; + conv_fn = _PyEval_ConversionFuncs[oparg]; result = conv_fn(value); Py_DECREF(value); if (result == NULL) goto pop_1_error_tier_two; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 87579134146c85..e1ce6d064cea78 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -40,7 +40,7 @@ GOTO_ERROR(error); } Py_DECREF(mgr); - res = _PyObject_CallNoArgsTstate(tstate, enter); + res = PyObject_CallNoArgs(enter); Py_DECREF(enter); if (res == NULL) { Py_DECREF(exit); @@ -86,7 +86,7 @@ GOTO_ERROR(error); } Py_DECREF(mgr); - res = _PyObject_CallNoArgsTstate(tstate, enter); + res = PyObject_CallNoArgs(enter); Py_DECREF(enter); if (res == NULL) { Py_DECREF(exit); @@ -2148,9 +2148,9 @@ PyObject *value; PyObject *result; value = stack_pointer[-1]; - convertion_func_ptr conv_fn; + conversion_func conv_fn; assert(oparg >= FVC_STR && oparg <= FVC_ASCII); - conv_fn = CONVERSION_FUNCTIONS[oparg]; + conv_fn = 
_PyEval_ConversionFuncs[oparg]; result = conv_fn(value); Py_DECREF(value); if (result == NULL) goto pop_1_error; diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 71c678e04fbfd5..eddec731984c82 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -96,7 +96,7 @@ def emit_aarch64_trampoline(self, hole: Hole) -> None: instruction |= ((base - hole.offset) >> 2) & 0x03FFFFFF self.body[where] = instruction.to_bytes(4, sys.byteorder) self.disassembly += [ - f"{base + 4 * 0: x}: d2800008 mov x8, #0x0", + f"{base + 4 * 0:x}: d2800008 mov x8, #0x0", f"{base + 4 * 0:016x}: R_AARCH64_MOVW_UABS_G0_NC {hole.symbol}", f"{base + 4 * 1:x}: f2a00008 movk x8, #0x0, lsl #16", f"{base + 4 * 1:016x}: R_AARCH64_MOVW_UABS_G1_NC {hole.symbol}", @@ -162,6 +162,13 @@ def process_relocations(self, *, alignment: int = 1) -> None: ): self.code.emit_aarch64_trampoline(hole) continue + elif ( + hole.kind in {"IMAGE_REL_AMD64_REL32"} + and hole.value is HoleValue.ZERO + ): + raise ValueError( + f"Add PyAPI_FUNC(...) or PyAPI_DATA(...) to declaration of {hole.symbol}!" 
+ ) holes.append(hole) stencil.holes[:] = holes self.code.pad(alignment) From 6b2428dca84885d7e654b9a5d9dd459d3f56e5f3 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 26 Feb 2024 10:57:36 -0800 Subject: [PATCH 09/13] Feedback from (the last) PR review --- Python/jit.c | 12 ++++++------ Tools/jit/_targets.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 54a300eec83cde..edf5294adc2a2f 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -310,19 +310,19 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) { unsigned char rd = get_bits(loc32[0], 0, 5); assert(IS_AARCH64_LDR_OR_STR(loc32[1])); - unsigned char rt = get_bits(loc32[1], 0, 5); - unsigned char rn = get_bits(loc32[1], 5, 5); - assert(rd == rn && rn == rt); + // There should be only one register involved: + assert(rd == get_bits(loc32[1], 0, 5)); // rt + assert(rd == get_bits(loc32[1], 5, 5)); // rn uint64_t relaxed = *(uint64_t *)value; if (relaxed < (1UL << 16)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop + // adrp rd, AAA; ldr rd, [rd + BBB] -> movz rd, XXX; nop loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; loc32[1] = 0xD503201F; i++; continue; } if (relaxed < (1ULL << 32)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY + // adrp rd, AAA; ldr rd, [rd + BBB] -> movz rd, XXX; movk rd, YYY loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd; i++; @@ -333,7 +333,7 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) (int64_t)relaxed >= -(1L << 19) && (int64_t)relaxed < (1L << 19)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr x0, XXX; nop + // adrp rd, AAA; ldr rd, [rd + BBB] -> ldr rd, XXX; nop loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd; loc32[1] = 0xD503201F; i++; diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 
b3e63bb32bc879..51e2117e771479 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -118,12 +118,20 @@ async def _compile( f"-I{CPYTHON / 'Python'}", "-O3", "-c", + # This debug info isn't necessary, and bloats out the JIT'ed code. + # We *may* be able to re-enable this, process it, and JIT it for a + # nicer debugging experience... but that needs a lot more research: "-fno-asynchronous-unwind-tables", + # Don't call built-in functions that we can't find or patch: "-fno-builtin", - # SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds: + # This breaks SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds. + # It's not clear why: "-fno-jump-tables", + # Emit relaxable 64-bit calls/jumps, so we don't have to worry about + # about emitting in-range trampolines for out-of-range targets. + # We can probably remove this and emit trampolines in the future: "-fno-plt", - # Don't make calls to weird stack-smashing canaries: + # Don't call stack-smashing canaries that we can't find or patch: "-fno-stack-protector", "-o", f"{o}", From 98dea1bade9bcc5c59a980d6ce281a41106057f4 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 28 Feb 2024 03:49:43 -0800 Subject: [PATCH 10/13] Make _PyOptimizer_Optimize internal --- Include/cpython/optimizer.h | 3 --- Include/internal/pycore_optimizer.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index b0acc67ece9a57..6d7b8bc3c1433a 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -92,9 +92,6 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void); PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset); -PyAPI_FUNC(int) -_PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer, _PyExecutorObject **exec_ptr); - void _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *); void _Py_ExecutorClear(_PyExecutorObject *); void 
_Py_BloomFilter_Init(_PyBloomFilter *); diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index eee71c700d4904..2649232af26bfd 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -25,6 +25,8 @@ extern PyTypeObject _PyDefaultOptimizer_Type; extern PyTypeObject _PyUOpExecutor_Type; extern PyTypeObject _PyUOpOptimizer_Type; +PyAPI_FUNC(int) _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer, _PyExecutorObject **exec_ptr); + #ifdef __cplusplus } #endif From 774b5bb05f28f12b36168f33951349e3e92dd778 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 28 Feb 2024 03:50:05 -0800 Subject: [PATCH 11/13] Make references to registers less cryptic --- Python/jit.c | 20 ++++++++++---------- Tools/jit/template.c | 1 + 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index edf5294adc2a2f..6587793ef42fb7 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -308,23 +308,23 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) next_hole->addend == hole->addend && next_hole->value == hole->value) { - unsigned char rd = get_bits(loc32[0], 0, 5); + unsigned char reg = get_bits(loc32[0], 0, 5); assert(IS_AARCH64_LDR_OR_STR(loc32[1])); // There should be only one register involved: - assert(rd == get_bits(loc32[1], 0, 5)); // rt - assert(rd == get_bits(loc32[1], 5, 5)); // rn + assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. + assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. 
uint64_t relaxed = *(uint64_t *)value; if (relaxed < (1UL << 16)) { - // adrp rd, AAA; ldr rd, [rd + BBB] -> movz rd, XXX; nop - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; loc32[1] = 0xD503201F; i++; continue; } if (relaxed < (1ULL << 32)) { - // adrp rd, AAA; ldr rd, [rd + BBB] -> movz rd, XXX; movk rd, YYY - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; - loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd; + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; + loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; i++; continue; } @@ -333,8 +333,8 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) (int64_t)relaxed >= -(1L << 19) && (int64_t)relaxed < (1L << 19)) { - // adrp rd, AAA; ldr rd, [rd + BBB] -> ldr rd, XXX; nop - loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd; + // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop + loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; loc32[1] = 0xD503201F; i++; continue; diff --git a/Tools/jit/template.c b/Tools/jit/template.c index 3cb9460291ed29..8aaf4581de362d 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -9,6 +9,7 @@ #include "pycore_long.h" #include "pycore_opcode_metadata.h" #include "pycore_opcode_utils.h" +#include "pycore_optimizer.h" #include "pycore_range.h" #include "pycore_setobject.h" #include "pycore_sliceobject.h" From ab5a9a086ef7c24616b58391c041ff20cda6b22c Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 28 Feb 2024 22:08:17 -0800 Subject: [PATCH 12/13] Fix handling of duplicate COFF symbols --- Tools/jit/_targets.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 
51e2117e771479..07959b15b6c4b9 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -124,9 +124,6 @@ async def _compile( "-fno-asynchronous-unwind-tables", # Don't call built-in functions that we can't find or patch: "-fno-builtin", - # This breaks SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds. - # It's not clear why: - "-fno-jump-tables", # Emit relaxable 64-bit calls/jumps, so we don't have to worry about # about emitting in-range trampolines for out-of-range targets. # We can probably remove this and emit trampolines in the future: @@ -202,7 +199,8 @@ def _handle_section( offset = base + symbol["Value"] name = symbol["Name"] name = name.removeprefix(self.prefix) - group.symbols[name] = value, offset + if name not in group.symbols: + group.symbols[name] = value, offset for wrapped_relocation in section["Relocations"]: relocation = wrapped_relocation["Relocation"] hole = self._handle_relocation(base, relocation, stencil.body) From 6c0db0c95fa8bb1fd4a278aeb3f056357e906797 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 28 Feb 2024 22:45:05 -0800 Subject: [PATCH 13/13] Add missing relocations (now that jump tables are enabled) --- Python/jit.c | 3 +++ Tools/jit/_schema.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Python/jit.c b/Python/jit.c index 6587793ef42fb7..9f9e123ab91fef 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -340,6 +340,8 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) continue; } } + // Fall through... 
+ case HoleKind_ARM64_RELOC_PAGE21: // Number of pages between this page and the value's page: value = (value >> 12) - ((uint64_t)location >> 12); // Check that we're not out of range of 21 signed bits: @@ -351,6 +353,7 @@ patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) set_bits(loc32, 5, value, 2, 19); continue; case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12: + case HoleKind_ARM64_RELOC_PAGEOFF12: case HoleKind_R_AARCH64_LD64_GOT_LO12_NC: // 12-bit low part of an absolute address. Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGE21 (above). diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py index a76a24207bcbbc..14e5fc2aae80ef 100644 --- a/Tools/jit/_schema.py +++ b/Tools/jit/_schema.py @@ -4,6 +4,8 @@ HoleKind: typing.TypeAlias = typing.Literal[ "ARM64_RELOC_GOT_LOAD_PAGE21", "ARM64_RELOC_GOT_LOAD_PAGEOFF12", + "ARM64_RELOC_PAGE21", + "ARM64_RELOC_PAGEOFF12", "ARM64_RELOC_UNSIGNED", "IMAGE_REL_AMD64_REL32", "IMAGE_REL_I386_DIR32", pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy