Skip to content

Commit 3d8c38f

Browse files
authored
pythonGH-135904: Improve the JIT's performance on macOS (pythonGH-136528)
1 parent a68ddea commit 3d8c38f

File tree

6 files changed

+73
-61
lines changed

6 files changed

+73
-61
lines changed

Python/jit.c

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -431,8 +431,10 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *s
431431

432432
#if defined(__aarch64__) || defined(_M_ARM64)
433433
#define TRAMPOLINE_SIZE 16
434+
#define DATA_ALIGN 8
434435
#else
435436
#define TRAMPOLINE_SIZE 0
437+
#define DATA_ALIGN 1
436438
#endif
437439

438440
// Generate and patch AArch64 trampolines. The symbols to jump to are stored
@@ -522,8 +524,9 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
522524
// Round up to the nearest page:
523525
size_t page_size = get_page_size();
524526
assert((page_size & (page_size - 1)) == 0);
525-
size_t padding = page_size - ((code_size + state.trampolines.size + data_size) & (page_size - 1));
526-
size_t total_size = code_size + state.trampolines.size + data_size + padding;
527+
size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1));
528+
size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size) & (page_size - 1));
529+
size_t total_size = code_size + state.trampolines.size + code_padding + data_size + padding;
527530
unsigned char *memory = jit_alloc(total_size);
528531
if (memory == NULL) {
529532
return -1;
@@ -545,7 +548,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
545548
// Loop again to emit the code:
546549
unsigned char *code = memory;
547550
state.trampolines.mem = memory + code_size;
548-
unsigned char *data = memory + code_size + state.trampolines.size;
551+
unsigned char *data = memory + code_size + state.trampolines.size + code_padding;
549552
// Compile the shim, which handles converting between the native
550553
// calling convention and the calling convention used by jitted code
551554
// (which may be different for efficiency reasons).
@@ -567,7 +570,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
567570
code += group->code_size;
568571
data += group->data_size;
569572
assert(code == memory + code_size);
570-
assert(data == memory + code_size + state.trampolines.size + data_size);
573+
assert(data == memory + code_size + state.trampolines.size + code_padding + data_size);
571574
#ifdef MAP_JIT
572575
pthread_jit_write_protect_np(1);
573576
#endif

Tools/jit/_optimizers.py

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -70,21 +70,21 @@ class Optimizer:
7070

7171
path: pathlib.Path
7272
_: dataclasses.KW_ONLY
73-
# prefix used to mangle symbols on some platforms:
74-
prefix: str = ""
73+
# Prefixes used to mangle local labels and symbols:
74+
label_prefix: str
75+
symbol_prefix: str
7576
# The first block in the linked list:
7677
_root: _Block = dataclasses.field(init=False, default_factory=_Block)
7778
_labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict)
7879
# No groups:
7980
_re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile(
80-
r"\s*(?:\.|#|//|$)"
81+
r"\s*(?:\.|#|//|;|$)"
8182
)
8283
# One group (label):
8384
_re_label: typing.ClassVar[re.Pattern[str]] = re.compile(
8485
r'\s*(?P<label>[\w."$?@]+):'
8586
)
8687
# Override everything that follows in subclasses:
87-
_alignment: typing.ClassVar[int] = 1
8888
_branches: typing.ClassVar[dict[str, str | None]] = {}
8989
# Two groups (instruction and target):
9090
_re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH
@@ -131,8 +131,12 @@ def __post_init__(self) -> None:
131131
block.fallthrough = False
132132

133133
def _preprocess(self, text: str) -> str:
134-
# Override this method to do preprocessing of the textual assembly:
135-
return text
134+
# Override this method to do preprocessing of the textual assembly.
135+
# In all cases, replace references to the _JIT_CONTINUE symbol with
136+
# references to a local _JIT_CONTINUE label (which we will add later):
137+
continue_symbol = rf"\b{re.escape(self.symbol_prefix)}_JIT_CONTINUE\b"
138+
continue_label = f"{self.label_prefix}_JIT_CONTINUE"
139+
return re.sub(continue_symbol, continue_label, text)
136140

137141
@classmethod
138142
def _invert_branch(cls, line: str, target: str) -> str | None:
@@ -197,15 +201,12 @@ def _insert_continue_label(self) -> None:
197201
# jmp FOO
198202
# After:
199203
# jmp FOO
200-
# .balign 8
201204
# _JIT_CONTINUE:
202205
# This lets the assembler encode _JIT_CONTINUE jumps at build time!
203-
align = _Block()
204-
align.noninstructions.append(f"\t.balign\t{self._alignment}")
205-
continuation = self._lookup_label(f"{self.prefix}_JIT_CONTINUE")
206+
continuation = self._lookup_label(f"{self.label_prefix}_JIT_CONTINUE")
206207
assert continuation.label
207208
continuation.noninstructions.append(f"{continuation.label}:")
208-
end.link, align.link, continuation.link = align, continuation, end.link
209+
end.link, continuation.link = continuation, end.link
209210

210211
def _mark_hot_blocks(self) -> None:
211212
# Start with the last block, and perform a DFS to find all blocks that
@@ -285,8 +286,6 @@ def run(self) -> None:
285286
class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods
286287
"""aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu"""
287288

288-
# TODO: @diegorusso
289-
_alignment = 8
290289
# https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch-
291290
_re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)")
292291

@@ -302,18 +301,3 @@ class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods
302301
_re_jump = re.compile(r"\s*jmp\s+(?P<target>[\w.]+)")
303302
# https://www.felixcloutier.com/x86/ret
304303
_re_return = re.compile(r"\s*ret\b")
305-
306-
307-
class OptimizerX8664Windows(OptimizerX86): # pylint: disable = too-few-public-methods
308-
"""x86_64-pc-windows-msvc"""
309-
310-
def _preprocess(self, text: str) -> str:
311-
text = super()._preprocess(text)
312-
# Before:
313-
# rex64 jmpq *__imp__JIT_CONTINUE(%rip)
314-
# After:
315-
# jmp _JIT_CONTINUE
316-
far_indirect_jump = (
317-
rf"rex64\s+jmpq\s+\*__imp_(?P<target>{self.prefix}_JIT_\w+)\(%rip\)"
318-
)
319-
return re.sub(far_indirect_jump, r"jmp\t\g<target>", text)

Tools/jit/_targets.py

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ class _Target(typing.Generic[_S, _R]):
4444
_: dataclasses.KW_ONLY
4545
args: typing.Sequence[str] = ()
4646
optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer
47-
prefix: str = ""
47+
label_prefix: typing.ClassVar[str]
48+
symbol_prefix: typing.ClassVar[str]
4849
stable: bool = False
4950
debug: bool = False
5051
verbose: bool = False
@@ -172,7 +173,9 @@ async def _compile(
172173
*shlex.split(self.cflags),
173174
]
174175
await _llvm.run("clang", args_s, echo=self.verbose)
175-
self.optimizer(s, prefix=self.prefix).run()
176+
self.optimizer(
177+
s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix
178+
).run()
176179
args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"]
177180
await _llvm.run("clang", args_o, echo=self.verbose)
178181
return await self._parse(o)
@@ -274,7 +277,7 @@ def _handle_section(
274277
symbol = wrapped_symbol["Symbol"]
275278
offset = base + symbol["Value"]
276279
name = symbol["Name"]
277-
name = name.removeprefix(self.prefix)
280+
name = name.removeprefix(self.symbol_prefix)
278281
if name not in group.symbols:
279282
group.symbols[name] = value, offset
280283
for wrapped_relocation in section["Relocations"]:
@@ -285,9 +288,9 @@ def _handle_section(
285288
def _unwrap_dllimport(self, name: str) -> tuple[_stencils.HoleValue, str | None]:
286289
if name.startswith("__imp_"):
287290
name = name.removeprefix("__imp_")
288-
name = name.removeprefix(self.prefix)
291+
name = name.removeprefix(self.symbol_prefix)
289292
return _stencils.HoleValue.GOT, name
290-
name = name.removeprefix(self.prefix)
293+
name = name.removeprefix(self.symbol_prefix)
291294
return _stencils.symbol_to_value(name)
292295

293296
def _handle_relocation(
@@ -335,9 +338,24 @@ def _handle_relocation(
335338
return _stencils.Hole(offset, kind, value, symbol, addend)
336339

337340

341+
class _COFF32(_COFF):
342+
# These mangle like Mach-O and other "older" formats:
343+
label_prefix = "L"
344+
symbol_prefix = "_"
345+
346+
347+
class _COFF64(_COFF):
348+
# These mangle like ELF and other "newer" formats:
349+
label_prefix = ".L"
350+
symbol_prefix = ""
351+
352+
338353
class _ELF(
339354
_Target[_schema.ELFSection, _schema.ELFRelocation]
340355
): # pylint: disable = too-few-public-methods
356+
label_prefix = ".L"
357+
symbol_prefix = ""
358+
341359
def _handle_section(
342360
self, section: _schema.ELFSection, group: _stencils.StencilGroup
343361
) -> None:
@@ -374,7 +392,7 @@ def _handle_section(
374392
symbol = wrapped_symbol["Symbol"]
375393
offset = len(stencil.body) + symbol["Value"]
376394
name = symbol["Name"]["Name"]
377-
name = name.removeprefix(self.prefix)
395+
name = name.removeprefix(self.symbol_prefix)
378396
group.symbols[name] = value, offset
379397
stencil.body.extend(section["SectionData"]["Bytes"])
380398
assert not section["Relocations"]
@@ -409,7 +427,7 @@ def _handle_relocation(
409427
},
410428
}:
411429
offset += base
412-
s = s.removeprefix(self.prefix)
430+
s = s.removeprefix(self.symbol_prefix)
413431
value, symbol = _stencils.HoleValue.GOT, s
414432
case {
415433
"Addend": addend,
@@ -418,7 +436,7 @@ def _handle_relocation(
418436
"Type": {"Name": kind},
419437
}:
420438
offset += base
421-
s = s.removeprefix(self.prefix)
439+
s = s.removeprefix(self.symbol_prefix)
422440
value, symbol = _stencils.symbol_to_value(s)
423441
case _:
424442
raise NotImplementedError(relocation)
@@ -428,17 +446,20 @@ def _handle_relocation(
428446
class _MachO(
429447
_Target[_schema.MachOSection, _schema.MachORelocation]
430448
): # pylint: disable = too-few-public-methods
449+
label_prefix = "L"
450+
symbol_prefix = "_"
451+
431452
def _handle_section(
432453
self, section: _schema.MachOSection, group: _stencils.StencilGroup
433454
) -> None:
434455
assert section["Address"] >= len(group.code.body)
435456
assert "SectionData" in section
436457
flags = {flag["Name"] for flag in section["Attributes"]["Flags"]}
437458
name = section["Name"]["Value"]
438-
name = name.removeprefix(self.prefix)
459+
name = name.removeprefix(self.symbol_prefix)
439460
if "Debug" in flags:
440461
return
441-
if "SomeInstructions" in flags:
462+
if "PureInstructions" in flags:
442463
value = _stencils.HoleValue.CODE
443464
stencil = group.code
444465
start_address = 0
@@ -459,7 +480,7 @@ def _handle_section(
459480
symbol = wrapped_symbol["Symbol"]
460481
offset = symbol["Value"] - start_address
461482
name = symbol["Name"]["Name"]
462-
name = name.removeprefix(self.prefix)
483+
name = name.removeprefix(self.symbol_prefix)
463484
group.symbols[name] = value, offset
464485
assert "Relocations" in section
465486
for wrapped_relocation in section["Relocations"]:
@@ -484,7 +505,7 @@ def _handle_relocation(
484505
},
485506
}:
486507
offset += base
487-
s = s.removeprefix(self.prefix)
508+
s = s.removeprefix(self.symbol_prefix)
488509
value, symbol = _stencils.HoleValue.GOT, s
489510
addend = 0
490511
case {
@@ -493,7 +514,7 @@ def _handle_relocation(
493514
"Type": {"Name": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
494515
}:
495516
offset += base
496-
s = s.removeprefix(self.prefix)
517+
s = s.removeprefix(self.symbol_prefix)
497518
value, symbol = _stencils.HoleValue.GOT, s
498519
addend = (
499520
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
@@ -508,7 +529,7 @@ def _handle_relocation(
508529
"Type": {"Name": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind},
509530
}:
510531
offset += base
511-
s = s.removeprefix(self.prefix)
532+
s = s.removeprefix(self.symbol_prefix)
512533
value, symbol = _stencils.symbol_to_value(s)
513534
addend = (
514535
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
@@ -523,27 +544,27 @@ def _handle_relocation(
523544
"Type": {"Name": kind},
524545
}:
525546
offset += base
526-
s = s.removeprefix(self.prefix)
547+
s = s.removeprefix(self.symbol_prefix)
527548
value, symbol = _stencils.symbol_to_value(s)
528549
addend = 0
529550
case _:
530551
raise NotImplementedError(relocation)
531552
return _stencils.Hole(offset, kind, value, symbol, addend)
532553

533554

534-
def get_target(host: str) -> _COFF | _ELF | _MachO:
555+
def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO:
535556
"""Build a _Target for the given host "triple" and options."""
536557
optimizer: type[_optimizers.Optimizer]
537-
target: _COFF | _ELF | _MachO
558+
target: _COFF32 | _COFF64 | _ELF | _MachO
538559
if re.fullmatch(r"aarch64-apple-darwin.*", host):
539560
condition = "defined(__aarch64__) && defined(__APPLE__)"
540561
optimizer = _optimizers.OptimizerAArch64
541-
target = _MachO(host, condition, optimizer=optimizer, prefix="_")
562+
target = _MachO(host, condition, optimizer=optimizer)
542563
elif re.fullmatch(r"aarch64-pc-windows-msvc", host):
543564
args = ["-fms-runtime-lib=dll", "-fplt"]
544565
condition = "defined(_M_ARM64)"
545566
optimizer = _optimizers.OptimizerAArch64
546-
target = _COFF(host, condition, args=args, optimizer=optimizer)
567+
target = _COFF64(host, condition, args=args, optimizer=optimizer)
547568
elif re.fullmatch(r"aarch64-.*-linux-gnu", host):
548569
# -mno-outline-atomics: Keep intrinsics from being emitted.
549570
args = ["-fpic", "-mno-outline-atomics"]
@@ -555,16 +576,16 @@ def get_target(host: str) -> _COFF | _ELF | _MachO:
555576
args = ["-DPy_NO_ENABLE_SHARED", "-Wno-ignored-attributes"]
556577
optimizer = _optimizers.OptimizerX86
557578
condition = "defined(_M_IX86)"
558-
target = _COFF(host, condition, args=args, optimizer=optimizer, prefix="_")
579+
target = _COFF32(host, condition, args=args, optimizer=optimizer)
559580
elif re.fullmatch(r"x86_64-apple-darwin.*", host):
560581
condition = "defined(__x86_64__) && defined(__APPLE__)"
561582
optimizer = _optimizers.OptimizerX86
562-
target = _MachO(host, condition, optimizer=optimizer, prefix="_")
583+
target = _MachO(host, condition, optimizer=optimizer)
563584
elif re.fullmatch(r"x86_64-pc-windows-msvc", host):
564585
args = ["-fms-runtime-lib=dll"]
565586
condition = "defined(_M_X64)"
566-
optimizer = _optimizers.OptimizerX8664Windows
567-
target = _COFF(host, condition, args=args, optimizer=optimizer)
587+
optimizer = _optimizers.OptimizerX86
588+
target = _COFF64(host, condition, args=args, optimizer=optimizer)
568589
elif re.fullmatch(r"x86_64-.*-linux-gnu", host):
569590
args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"]
570591
condition = "defined(__x86_64__) && defined(__linux__)"

Tools/jit/jit.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ typedef jit_func __attribute__((preserve_none)) jit_func_preserve_none;
66
#define PATCH_VALUE(TYPE, NAME, ALIAS) \
77
PyAPI_DATA(void) ALIAS; \
88
TYPE NAME = (TYPE)(uintptr_t)&ALIAS;
9+
10+
#define DECLARE_TARGET(NAME) \
11+
_Py_CODEUNIT *__attribute__((preserve_none, visibility("hidden"))) \
12+
NAME(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate);

Tools/jit/shim.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,6 @@ _Py_CODEUNIT *
1010
_JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate)
1111
{
1212
// Note that this is *not* a tail call:
13-
PATCH_VALUE(jit_func_preserve_none, call, _JIT_CONTINUE);
14-
return call(frame, stack_pointer, tstate);
13+
DECLARE_TARGET(_JIT_CONTINUE);
14+
return _JIT_CONTINUE(frame, stack_pointer, tstate);
1515
}

Tools/jit/template.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,10 @@ do { \
7474
do { \
7575
} while (0)
7676

77-
#define PATCH_JUMP(ALIAS) \
78-
do { \
79-
PATCH_VALUE(jit_func_preserve_none, jump, ALIAS); \
80-
__attribute__((musttail)) return jump(frame, stack_pointer, tstate); \
77+
#define PATCH_JUMP(ALIAS) \
78+
do { \
79+
DECLARE_TARGET(ALIAS); \
80+
__attribute__((musttail)) return ALIAS(frame, stack_pointer, tstate); \
8181
} while (0)
8282

8383
#undef JUMP_TO_JUMP_TARGET

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy