diff --git a/Python/jit.c b/Python/jit.c
index 839414bd810677..ac2c60ed925a26 100644
--- a/Python/jit.c
+++ b/Python/jit.c
@@ -47,18 +47,18 @@ jit_error(const char *message)
     PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint);
 }
 
-static char *
+static unsigned char *
 jit_alloc(size_t size)
 {
     assert(size);
     assert(size % get_page_size() == 0);
 #ifdef MS_WINDOWS
     int flags = MEM_COMMIT | MEM_RESERVE;
-    char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
+    unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
     int failed = memory == NULL;
 #else
     int flags = MAP_ANONYMOUS | MAP_PRIVATE;
-    char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+    unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
     int failed = memory == MAP_FAILED;
 #endif
     if (failed) {
@@ -69,7 +69,7 @@ jit_alloc(size_t size)
 }
 
 static int
-jit_free(char *memory, size_t size)
+jit_free(unsigned char *memory, size_t size)
 {
     assert(size);
     assert(size % get_page_size() == 0);
@@ -86,7 +86,7 @@ jit_free(char *memory, size_t size)
 }
 
 static int
-mark_executable(char *memory, size_t size)
+mark_executable(unsigned char *memory, size_t size)
 {
     if (size == 0) {
         return 0;
@@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size)
 }
 
 static int
-mark_readable(char *memory, size_t size)
+mark_readable(unsigned char *memory, size_t size)
 {
     if (size == 0) {
         return 0;
@@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
 // Fill all of stencil's holes in the memory pointed to by base, using the
 // values in patches.
 static void
-patch(char *base, const Stencil *stencil, uint64_t *patches)
+patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
 {
     for (uint64_t i = 0; i < stencil->holes_size; i++) {
         const Hole *hole = &stencil->holes[i];
-        void *location = base + hole->offset;
+        unsigned char *location = base + hole->offset;
         uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend;
+        uint8_t *loc8 = (uint8_t *)location;
         uint32_t *loc32 = (uint32_t *)location;
         uint64_t *loc64 = (uint64_t *)location;
         // LLD is a great reference for performing relocations... just keep in
         // mind that Tools/jit/build.py does filtering and preprocessing for us!
         // Here's a good place to start for each platform:
         // - aarch64-apple-darwin:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
         //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
         //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
         // - aarch64-unknown-linux-gnu:
@@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 // 64-bit absolute address.
                 *loc64 = value;
                 continue;
+            case HoleKind_R_X86_64_GOTPCRELX:
+            case HoleKind_R_X86_64_REX_GOTPCRELX:
+            case HoleKind_X86_64_RELOC_GOT:
+            case HoleKind_X86_64_RELOC_GOT_LOAD: {
+                // 32-bit relative address.
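+                // The addend carries the usual -4 bias (the displacement is
+                // relative to the *end* of the instruction), so value + 4 is
+                // the runtime address of this symbol's GOT entry. The data
+                // stencil, which holds the GOT, is patched before the code
+                // stencil (see emit below), so that entry already contains
+                // the symbol's absolute address: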
+                // Try to relax the GOT load into an immediate value:
+                uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
+                if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
+                    (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
+                {
+                    if (loc8[-2] == 0x8B) {
+                        // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
+                        loc8[-2] = 0x8D;
+                        value = relaxed;
+                    }
+                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) {
+                        // call qword ptr [rip + AAA] -> nop; call XXX
+                        loc8[-2] = 0x90;
+                        loc8[-1] = 0xE8;
+                        value = relaxed;
+                    }
+                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) {
+                        // jmp qword ptr [rip + AAA] -> nop; jmp XXX
+                        loc8[-2] = 0x90;
+                        loc8[-1] = 0xE9;
+                        value = relaxed;
+                    }
+                }
+            }
+            // Fall through...
+            case HoleKind_R_X86_64_GOTPCREL:
+            case HoleKind_R_X86_64_PC32:
+            case HoleKind_X86_64_RELOC_SIGNED:
+            case HoleKind_X86_64_RELOC_BRANCH:
+                // 32-bit relative address.
+                value -= (uint64_t)location;
+                // Check that we're not out of range of 32 signed bits:
+                assert((int64_t)value >= -(1LL << 31));
+                assert((int64_t)value < (1LL << 31));
+                loc32[0] = (uint32_t)value;
+                continue;
             case HoleKind_R_AARCH64_CALL26:
             case HoleKind_R_AARCH64_JUMP26:
                 // 28-bit relative branch.
@@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 set_bits(loc32, 5, value, 48, 16);
                 continue;
             case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21:
+            case HoleKind_R_AARCH64_ADR_GOT_PAGE:
                 // 21-bit count of pages between this page and an absolute address's
                 // page... I know, I know, it's weird. Pairs nicely with
                 // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
                 assert(IS_AARCH64_ADRP(*loc32));
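+                // An adrp + ldr pair like this computes the page of the GOT
+                // entry, then loads the 64-bit symbol address out of it. When
+                // the next hole is the matching low-part load, the entry
+                // (already patched, since the data stencil is emitted first)
+                // can often be folded into the instructions themselves: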
+                // Try to relax the pair of GOT loads into an immediate value:
+                const Hole *next_hole = &stencil->holes[i + 1];
+                if (i + 1 < stencil->holes_size &&
+                    (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
+                     next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) &&
+                    next_hole->offset == hole->offset + 4 &&
+                    next_hole->symbol == hole->symbol &&
+                    next_hole->addend == hole->addend &&
+                    next_hole->value == hole->value)
+                {
+                    unsigned char rd = get_bits(loc32[0], 0, 5);
+                    assert(IS_AARCH64_LDR_OR_STR(loc32[1]));
+                    unsigned char rt = get_bits(loc32[1], 0, 5);
+                    unsigned char rn = get_bits(loc32[1], 5, 5);
+                    assert(rd == rn && rn == rt);
+                    uint64_t relaxed = *(uint64_t *)value;
+                    if (relaxed < (1UL << 16)) {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
+                        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
+                        loc32[1] = 0xD503201F;
+                        i++;
+                        continue;
+                    }
+                    if (relaxed < (1ULL << 32)) {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
+                        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
+                        loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd;
+                        i++;
+                        continue;
+                    }
+                    relaxed = (uint64_t)value - (uint64_t)location;
+                    if ((relaxed & 0x3) == 0 &&
+                        (int64_t)relaxed >= -(1L << 19) &&
+                        (int64_t)relaxed < (1L << 19))
+                    {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop
+                        loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd;
+                        loc32[1] = 0xD503201F;
+                        i++;
+                        continue;
+                    }
+                }
                 // Number of pages between this page and the value's page:
                 value = (value >> 12) - ((uint64_t)location >> 12);
                 // Check that we're not out of range of 21 signed bits:
@@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 set_bits(loc32, 5, value, 2, 19);
                 continue;
             case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+            case HoleKind_R_AARCH64_LD64_GOT_LO12_NC:
                 // 12-bit low part of an absolute address. Pairs nicely with
                 // ARM64_RELOC_GOT_LOAD_PAGE21 (above).
                 assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
@@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
 }
 
 static void
-copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
+copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
 {
     memcpy(base, stencil->body, stencil->body_size);
     patch(base, stencil, patches);
@@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
 static void
 emit(const StencilGroup *group, uint64_t patches[])
 {
-    copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches);
-    copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches);
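+    // Note the order: the data stencil (which holds the GOT) must be patched
+    // before the code stencil, so the relaxations in patch() can read the
+    // final GOT entries: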
+    copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches);
+    copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches);
 }
 
 // Compiles executor in-place. Don't forget to call _PyJIT_Free later!
@@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
     assert((page_size & (page_size - 1)) == 0);
     code_size += page_size - (code_size & (page_size - 1));
     data_size += page_size - (data_size & (page_size - 1));
-    char *memory = jit_alloc(code_size + data_size);
+    unsigned char *memory = jit_alloc(code_size + data_size);
     if (memory == NULL) {
         return -1;
     }
     // Loop again to emit the code:
-    char *code = memory;
-    char *data = memory + code_size;
-    char *top = code;
+    unsigned char *code = memory;
+    unsigned char *data = memory + code_size;
+    unsigned char *top = code;
     if (trace[0].opcode == _START_EXECUTOR) {
         // Don't want to execute this more than once:
         top += stencil_groups[_START_EXECUTOR].code.body_size;
@@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
 void
 _PyJIT_Free(_PyExecutorObject *executor)
 {
-    char *memory = (char *)executor->jit_code;
+    unsigned char *memory = (unsigned char *)executor->jit_code;
     size_t size = executor->jit_size;
     if (memory) {
         executor->jit_code = NULL;
diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py
index 8eeb78e6cd69ee..975ca650a13c1a 100644
--- a/Tools/jit/_schema.py
+++ b/Tools/jit/_schema.py
@@ -8,13 +8,23 @@
     "IMAGE_REL_AMD64_ADDR64",
     "IMAGE_REL_I386_DIR32",
     "R_AARCH64_ABS64",
+    "R_AARCH64_ADR_GOT_PAGE",
     "R_AARCH64_CALL26",
     "R_AARCH64_JUMP26",
+    "R_AARCH64_LD64_GOT_LO12_NC",
     "R_AARCH64_MOVW_UABS_G0_NC",
     "R_AARCH64_MOVW_UABS_G1_NC",
     "R_AARCH64_MOVW_UABS_G2_NC",
     "R_AARCH64_MOVW_UABS_G3",
     "R_X86_64_64",
+    "R_X86_64_GOTPCREL",
+    "R_X86_64_GOTPCRELX",
+    "R_X86_64_PC32",
+    "R_X86_64_REX_GOTPCRELX",
+    "X86_64_RELOC_BRANCH",
+    "X86_64_RELOC_GOT",
+    "X86_64_RELOC_GOT_LOAD",
+    "X86_64_RELOC_SIGNED",
     "X86_64_RELOC_UNSIGNED",
 ]
diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py
index 6c1d440324c505..06dc4e7acc6c91 100644
--- a/Tools/jit/_targets.py
+++ b/Tools/jit/_targets.py
@@ -37,6 +37,7 @@ class _Target(typing.Generic[_S, _R]):
     triple: str
     _: dataclasses.KW_ONLY
     alignment: int = 1
+    args: typing.Sequence[str] = ()
     prefix: str = ""
     debug: bool = False
     force: bool = False
@@ -121,21 +122,14 @@ async def _compile(
             "-fno-builtin",
             # SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds:
             "-fno-jump-tables",
-            # Position-independent code adds indirection to every load and jump:
-            "-fno-pic",
+            "-fno-plt",
             # Don't make calls to weird stack-smashing canaries:
             "-fno-stack-protector",
-            # We have three options for code model:
-            # - "small": the default, assumes that code and data reside in the
-            #   lowest 2GB of memory (128MB on aarch64)
-            # - "medium": assumes that code resides in the lowest 2GB of memory,
-            #   and makes no assumptions about data (not available on aarch64)
-            # - "large": makes no assumptions about either code or data
-            "-mcmodel=large",
             "-o",
             f"{o}",
             "-std=c11",
             f"{c}",
+            *self.args,
         ]
         await _llvm.run("clang", args, echo=self.verbose)
         return await self._parse(o)
@@ -284,7 +278,23 @@ def _handle_section(
     def _handle_relocation(
         self, base: int, relocation: _schema.ELFRelocation, raw: bytes
     ) -> _stencils.Hole:
+        symbol: str | None
         match relocation:
+            case {
+                "Addend": addend,
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {
+                    "Value": "R_AARCH64_ADR_GOT_PAGE"
+                    | "R_AARCH64_LD64_GOT_LO12_NC"
+                    | "R_X86_64_GOTPCREL"
+                    | "R_X86_64_GOTPCRELX"
+                    | "R_X86_64_REX_GOTPCRELX" as kind
+                },
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.HoleValue.GOT, s
             case {
                 "Addend": addend,
                 "Offset": offset,
@@ -358,6 +368,34 @@ def _handle_relocation(
                 s = s.removeprefix(self.prefix)
                 value, symbol = _stencils.HoleValue.GOT, s
                 addend = 0
+            case {
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {"Value": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.HoleValue.GOT, s
+                addend = (
+                    int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
+                )
+            case {
+                "Offset": offset,
+                "Section": {"Value": s},
+                "Type": {"Value": "X86_64_RELOC_SIGNED" as kind},
+            } | {
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {
+                    "Value": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind
+                },
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.symbol_to_value(s)
+                addend = (
+                    int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
+                )
             case {
                 "Offset": offset,
                 "Section": {"Value": s},
@@ -379,15 +417,19 @@ def _handle_relocation(
 def get_target(host: str) -> _COFF | _ELF | _MachO:
     """Build a _Target for the given host "triple" and options."""
     if re.fullmatch(r"aarch64-apple-darwin.*", host):
-        return _MachO(host, alignment=8, prefix="_")
+        args = ["-mcmodel=large"]
+        return _MachO(host, alignment=8, args=args, prefix="_")
     if re.fullmatch(r"aarch64-.*-linux-gnu", host):
-        return _ELF(host, alignment=8)
+        args = ["-mcmodel=large"]
+        return _ELF(host, alignment=8, args=args)
     if re.fullmatch(r"i686-pc-windows-msvc", host):
-        return _COFF(host, prefix="_")
+        args = ["-mcmodel=large"]
+        return _COFF(host, args=args, prefix="_")
    if re.fullmatch(r"x86_64-apple-darwin.*", host):
         return _MachO(host, prefix="_")
     if re.fullmatch(r"x86_64-pc-windows-msvc", host):
-        return _COFF(host)
+        args = ["-mcmodel=large"]
+        return _COFF(host, args=args)
     if re.fullmatch(r"x86_64-.*-linux-gnu", host):
         return _ELF(host)
     raise ValueError(host)
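Aside: the core of the x86-64 relaxation above is just a two-byte opcode rewrite plus displacement arithmetic. Below is a minimal standalone sketch of the mov-to-lea case, assuming a little-endian host and toy statics that happen to sit within 2GB of each other; the relax_got_load helper, the hand-assembled buffer, and the fake one-entry GOT are invented for illustration and are not part of this change.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// disp points at the disp32 field of "mov reg, qword ptr [rip + disp32]"
// (REX.W, opcode 0x8B, ModRM), whose displacement targets a GOT entry
// holding the symbol's absolute address. If that address is within 32
// signed bits of the end of the instruction, rewrite mov (0x8B) into lea
// (0x8D) so the register receives the address directly, with no load.
static void
relax_got_load(uint8_t *disp)
{
    int32_t old;
    memcpy(&old, disp, sizeof(old));  // assumes little-endian (true on x86-64)
    // The GOT entry sits at (end of instruction) + disp32:
    uint64_t entry = (uint64_t)(uintptr_t)(disp + 4) + (int64_t)old;
    uint64_t symbol;
    memcpy(&symbol, (void *)(uintptr_t)entry, sizeof(symbol));
    int64_t rel = (int64_t)symbol - (int64_t)(uintptr_t)(disp + 4);
    if (disp[-2] == 0x8B && INT32_MIN <= rel && rel <= INT32_MAX) {
        disp[-2] = 0x8D;  // mov -> lea, like loc8[-2] = 0x8D in patch()
        int32_t fixed = (int32_t)rel;
        memcpy(disp, &fixed, sizeof(fixed));
    }
}

int main(void)
{
    static int the_symbol;   // what the GOT entry points to
    static uint64_t got[1];  // a fake one-entry GOT
    // Hand-assemble "mov rax, qword ptr [rip + disp32]":
    static uint8_t code[7] = {0x48, 0x8B, 0x05, 0, 0, 0, 0};
    got[0] = (uint64_t)(uintptr_t)&the_symbol;
    int32_t disp = (int32_t)((intptr_t)(void *)got - (intptr_t)(void *)(code + 7));
    memcpy(&code[3], &disp, sizeof(disp));
    relax_got_load(&code[3]);
    assert(code[1] == 0x8D);  // the load became "lea rax, [rip + disp32]"
    printf("relaxed opcode: 0x%02X\n", (unsigned)code[1]);
    return 0;
}

The call and jump forms in the patch follow the same pattern: FF 15 becomes 90 E8 (nop; call rel32) and FF 25 becomes 90 E9 (nop; jmp rel32), spending the byte freed by dropping the ModRM on a nop.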