Skip to content

Commit 7259480

Browse files
authored
GH-115802: JIT "small" code for macOS and Linux (GH-115826)
1 parent 5a83292 commit 7259480

File tree

3 files changed

+168
-29
lines changed

3 files changed

+168
-29
lines changed

Python/jit.c

Lines changed: 103 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -47,18 +47,18 @@ jit_error(const char *message)
4747
PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint);
4848
}
4949

50-
static char *
50+
static unsigned char *
5151
jit_alloc(size_t size)
5252
{
5353
assert(size);
5454
assert(size % get_page_size() == 0);
5555
#ifdef MS_WINDOWS
5656
int flags = MEM_COMMIT | MEM_RESERVE;
57-
char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
57+
unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
5858
int failed = memory == NULL;
5959
#else
6060
int flags = MAP_ANONYMOUS | MAP_PRIVATE;
61-
char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
61+
unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
6262
int failed = memory == MAP_FAILED;
6363
#endif
6464
if (failed) {
@@ -69,7 +69,7 @@ jit_alloc(size_t size)
6969
}
7070

7171
static int
72-
jit_free(char *memory, size_t size)
72+
jit_free(unsigned char *memory, size_t size)
7373
{
7474
assert(size);
7575
assert(size % get_page_size() == 0);
@@ -86,7 +86,7 @@ jit_free(char *memory, size_t size)
8686
}
8787

8888
static int
89-
mark_executable(char *memory, size_t size)
89+
mark_executable(unsigned char *memory, size_t size)
9090
{
9191
if (size == 0) {
9292
return 0;
@@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size)
113113
}
114114

115115
static int
116-
mark_readable(char *memory, size_t size)
116+
mark_readable(unsigned char *memory, size_t size)
117117
{
118118
if (size == 0) {
119119
return 0;
@@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
169169
// Fill all of stencil's holes in the memory pointed to by base, using the
170170
// values in patches.
171171
static void
172-
patch(char *base, const Stencil *stencil, uint64_t *patches)
172+
patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
173173
{
174174
for (uint64_t i = 0; i < stencil->holes_size; i++) {
175175
const Hole *hole = &stencil->holes[i];
176-
void *location = base + hole->offset;
176+
unsigned char *location = base + hole->offset;
177177
uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend;
178+
uint8_t *loc8 = (uint8_t *)location;
178179
uint32_t *loc32 = (uint32_t *)location;
179180
uint64_t *loc64 = (uint64_t *)location;
180181
// LLD is a great reference for performing relocations... just keep in
181182
// mind that Tools/jit/build.py does filtering and preprocessing for us!
182183
// Here's a good place to start for each platform:
183184
// - aarch64-apple-darwin:
185+
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
184186
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
185187
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
186188
// - aarch64-unknown-linux-gnu:
@@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
208210
// 64-bit absolute address.
209211
*loc64 = value;
210212
continue;
213+
case HoleKind_R_X86_64_GOTPCRELX:
214+
case HoleKind_R_X86_64_REX_GOTPCRELX:
215+
case HoleKind_X86_64_RELOC_GOT:
216+
case HoleKind_X86_64_RELOC_GOT_LOAD: {
217+
// 32-bit relative address.
218+
// Try to relax the GOT load into an immediate value:
219+
uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
220+
if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
221+
(int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
222+
{
223+
if (loc8[-2] == 0x8B) {
224+
// mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
225+
loc8[-2] = 0x8D;
226+
value = relaxed;
227+
}
228+
else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) {
229+
// call qword ptr [rip + AAA] -> nop; call XXX
230+
loc8[-2] = 0x90;
231+
loc8[-1] = 0xE8;
232+
value = relaxed;
233+
}
234+
else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) {
235+
// jmp qword ptr [rip + AAA] -> nop; jmp XXX
236+
loc8[-2] = 0x90;
237+
loc8[-1] = 0xE9;
238+
value = relaxed;
239+
}
240+
}
241+
}
242+
// Fall through...
243+
case HoleKind_R_X86_64_GOTPCREL:
244+
case HoleKind_R_X86_64_PC32:
245+
case HoleKind_X86_64_RELOC_SIGNED:
246+
case HoleKind_X86_64_RELOC_BRANCH:
247+
// 32-bit relative address.
248+
value -= (uint64_t)location;
249+
// Check that we're not out of range of 32 signed bits:
250+
assert((int64_t)value >= -(1LL << 31));
251+
assert((int64_t)value < (1LL << 31));
252+
loc32[0] = (uint32_t)value;
253+
continue;
211254
case HoleKind_R_AARCH64_CALL26:
212255
case HoleKind_R_AARCH64_JUMP26:
213256
// 28-bit relative branch.
@@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
249292
set_bits(loc32, 5, value, 48, 16);
250293
continue;
251294
case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21:
295+
case HoleKind_R_AARCH64_ADR_GOT_PAGE:
252296
// 21-bit count of pages between this page and an absolute address's
253297
// page... I know, I know, it's weird. Pairs nicely with
254298
// ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
255299
assert(IS_AARCH64_ADRP(*loc32));
300+
// Try to relax the pair of GOT loads into an immediate value:
301+
const Hole *next_hole = &stencil->holes[i + 1];
302+
if (i + 1 < stencil->holes_size &&
303+
(next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
304+
next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) &&
305+
next_hole->offset == hole->offset + 4 &&
306+
next_hole->symbol == hole->symbol &&
307+
next_hole->addend == hole->addend &&
308+
next_hole->value == hole->value)
309+
{
310+
unsigned char rd = get_bits(loc32[0], 0, 5);
311+
assert(IS_AARCH64_LDR_OR_STR(loc32[1]));
312+
unsigned char rt = get_bits(loc32[1], 0, 5);
313+
unsigned char rn = get_bits(loc32[1], 5, 5);
314+
assert(rd == rn && rn == rt);
315+
uint64_t relaxed = *(uint64_t *)value;
316+
if (relaxed < (1UL << 16)) {
317+
// adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
318+
loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
319+
loc32[1] = 0xD503201F;
320+
i++;
321+
continue;
322+
}
323+
if (relaxed < (1ULL << 32)) {
324+
// adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
325+
loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
326+
loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd;
327+
i++;
328+
continue;
329+
}
330+
relaxed = (uint64_t)value - (uint64_t)location;
331+
if ((relaxed & 0x3) == 0 &&
332+
(int64_t)relaxed >= -(1L << 19) &&
333+
(int64_t)relaxed < (1L << 19))
334+
{
335+
// adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop
336+
loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd;
337+
loc32[1] = 0xD503201F;
338+
i++;
339+
continue;
340+
}
341+
}
256342
// Number of pages between this page and the value's page:
257343
value = (value >> 12) - ((uint64_t)location >> 12);
258344
// Check that we're not out of range of 21 signed bits:
@@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
264350
set_bits(loc32, 5, value, 2, 19);
265351
continue;
266352
case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12:
353+
case HoleKind_R_AARCH64_LD64_GOT_LO12_NC:
267354
// 12-bit low part of an absolute address. Pairs nicely with
268355
// ARM64_RELOC_GOT_LOAD_PAGE21 (above).
269356
assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
@@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
285372
}
286373

287374
static void
288-
copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
375+
copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
289376
{
290377
memcpy(base, stencil->body, stencil->body_size);
291378
patch(base, stencil, patches);
@@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
294381
static void
295382
emit(const StencilGroup *group, uint64_t patches[])
296383
{
297-
copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches);
298-
copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches);
384+
copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches);
385+
copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches);
299386
}
300387

301388
// Compiles executor in-place. Don't forget to call _PyJIT_Free later!
@@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
316403
assert((page_size & (page_size - 1)) == 0);
317404
code_size += page_size - (code_size & (page_size - 1));
318405
data_size += page_size - (data_size & (page_size - 1));
319-
char *memory = jit_alloc(code_size + data_size);
406+
unsigned char *memory = jit_alloc(code_size + data_size);
320407
if (memory == NULL) {
321408
return -1;
322409
}
323410
// Loop again to emit the code:
324-
char *code = memory;
325-
char *data = memory + code_size;
326-
char *top = code;
411+
unsigned char *code = memory;
412+
unsigned char *data = memory + code_size;
413+
unsigned char *top = code;
327414
if (trace[0].opcode == _START_EXECUTOR) {
328415
// Don't want to execute this more than once:
329416
top += stencil_groups[_START_EXECUTOR].code.body_size;
@@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
360447
void
361448
_PyJIT_Free(_PyExecutorObject *executor)
362449
{
363-
char *memory = (char *)executor->jit_code;
450+
unsigned char *memory = (unsigned char *)executor->jit_code;
364451
size_t size = executor->jit_size;
365452
if (memory) {
366453
executor->jit_code = NULL;

Tools/jit/_schema.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,23 @@
88
"IMAGE_REL_AMD64_ADDR64",
99
"IMAGE_REL_I386_DIR32",
1010
"R_AARCH64_ABS64",
11+
"R_AARCH64_ADR_GOT_PAGE",
1112
"R_AARCH64_CALL26",
1213
"R_AARCH64_JUMP26",
14+
"R_AARCH64_LD64_GOT_LO12_NC",
1315
"R_AARCH64_MOVW_UABS_G0_NC",
1416
"R_AARCH64_MOVW_UABS_G1_NC",
1517
"R_AARCH64_MOVW_UABS_G2_NC",
1618
"R_AARCH64_MOVW_UABS_G3",
1719
"R_X86_64_64",
20+
"R_X86_64_GOTPCREL",
21+
"R_X86_64_GOTPCRELX",
22+
"R_X86_64_PC32",
23+
"R_X86_64_REX_GOTPCRELX",
24+
"X86_64_RELOC_BRANCH",
25+
"X86_64_RELOC_GOT",
26+
"X86_64_RELOC_GOT_LOAD",
27+
"X86_64_RELOC_SIGNED",
1828
"X86_64_RELOC_UNSIGNED",
1929
]
2030

Tools/jit/_targets.py

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class _Target(typing.Generic[_S, _R]):
3737
triple: str
3838
_: dataclasses.KW_ONLY
3939
alignment: int = 1
40+
args: typing.Sequence[str] = ()
4041
prefix: str = ""
4142
debug: bool = False
4243
force: bool = False
@@ -121,21 +122,14 @@ async def _compile(
121122
"-fno-builtin",
122123
# SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds:
123124
"-fno-jump-tables",
124-
# Position-independent code adds indirection to every load and jump:
125-
"-fno-pic",
125+
"-fno-plt",
126126
# Don't make calls to weird stack-smashing canaries:
127127
"-fno-stack-protector",
128-
# We have three options for code model:
129-
# - "small": the default, assumes that code and data reside in the
130-
# lowest 2GB of memory (128MB on aarch64)
131-
# - "medium": assumes that code resides in the lowest 2GB of memory,
132-
# and makes no assumptions about data (not available on aarch64)
133-
# - "large": makes no assumptions about either code or data
134-
"-mcmodel=large",
135128
"-o",
136129
f"{o}",
137130
"-std=c11",
138131
f"{c}",
132+
*self.args,
139133
]
140134
await _llvm.run("clang", args, echo=self.verbose)
141135
return await self._parse(o)
@@ -284,7 +278,23 @@ def _handle_section(
284278
def _handle_relocation(
285279
self, base: int, relocation: _schema.ELFRelocation, raw: bytes
286280
) -> _stencils.Hole:
281+
symbol: str | None
287282
match relocation:
283+
case {
284+
"Addend": addend,
285+
"Offset": offset,
286+
"Symbol": {"Value": s},
287+
"Type": {
288+
"Value": "R_AARCH64_ADR_GOT_PAGE"
289+
| "R_AARCH64_LD64_GOT_LO12_NC"
290+
| "R_X86_64_GOTPCREL"
291+
| "R_X86_64_GOTPCRELX"
292+
| "R_X86_64_REX_GOTPCRELX" as kind
293+
},
294+
}:
295+
offset += base
296+
s = s.removeprefix(self.prefix)
297+
value, symbol = _stencils.HoleValue.GOT, s
288298
case {
289299
"Addend": addend,
290300
"Offset": offset,
@@ -358,6 +368,34 @@ def _handle_relocation(
358368
s = s.removeprefix(self.prefix)
359369
value, symbol = _stencils.HoleValue.GOT, s
360370
addend = 0
371+
case {
372+
"Offset": offset,
373+
"Symbol": {"Value": s},
374+
"Type": {"Value": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
375+
}:
376+
offset += base
377+
s = s.removeprefix(self.prefix)
378+
value, symbol = _stencils.HoleValue.GOT, s
379+
addend = (
380+
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
381+
)
382+
case {
383+
"Offset": offset,
384+
"Section": {"Value": s},
385+
"Type": {"Value": "X86_64_RELOC_SIGNED" as kind},
386+
} | {
387+
"Offset": offset,
388+
"Symbol": {"Value": s},
389+
"Type": {
390+
"Value": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind
391+
},
392+
}:
393+
offset += base
394+
s = s.removeprefix(self.prefix)
395+
value, symbol = _stencils.symbol_to_value(s)
396+
addend = (
397+
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
398+
)
361399
case {
362400
"Offset": offset,
363401
"Section": {"Value": s},
@@ -379,15 +417,19 @@ def _handle_relocation(
379417
def get_target(host: str) -> _COFF | _ELF | _MachO:
380418
"""Build a _Target for the given host "triple" and options."""
381419
if re.fullmatch(r"aarch64-apple-darwin.*", host):
382-
return _MachO(host, alignment=8, prefix="_")
420+
args = ["-mcmodel=large"]
421+
return _MachO(host, alignment=8, args=args, prefix="_")
383422
if re.fullmatch(r"aarch64-.*-linux-gnu", host):
384-
return _ELF(host, alignment=8)
423+
args = ["-mcmodel=large"]
424+
return _ELF(host, alignment=8, args=args)
385425
if re.fullmatch(r"i686-pc-windows-msvc", host):
386-
return _COFF(host, prefix="_")
426+
args = ["-mcmodel=large"]
427+
return _COFF(host, args=args, prefix="_")
387428
if re.fullmatch(r"x86_64-apple-darwin.*", host):
388429
return _MachO(host, prefix="_")
389430
if re.fullmatch(r"x86_64-pc-windows-msvc", host):
390-
return _COFF(host)
431+
args = ["-mcmodel=large"]
432+
return _COFF(host, args=args)
391433
if re.fullmatch(r"x86_64-.*-linux-gnu", host):
392434
return _ELF(host)
393435
raise ValueError(host)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy