From 56133bb0184e5d2b8bf5809b648aa512693a0a02 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 5 Aug 2023 12:13:46 -0700 Subject: [PATCH 01/26] Split `CALL_PY_EXACT_ARGS` into uops This is only the first step for doing `CALL` in Tier 2. The next step involves tracing into the called code object. After that we'll have to do the remaining `CALL` specialization. Finally we'll have to tweak various things like `KW_NAMES`, and possibly move the `NULL` (for method calls) *above* the callable. But those are things for future PRs. Note: this moves setting `frame->return_offset` directly in front of `DISPATCH_INLINED()`, to make it easier to move it into `_PUSH_FRAME`. --- Include/internal/pycore_opcode_metadata.h | 15 +++- Python/bytecodes.c | 35 +++++++--- Python/executor.c | 13 ++++ Python/executor_cases.c.h | 61 ++++++++++++++++ Python/generated_cases.c.h | 85 ++++++++++++++--------- Python/optimizer.c | 9 +++ Tools/cases_generator/generate_cases.py | 16 ++++- Tools/cases_generator/instructions.py | 20 +++--- Tools/cases_generator/stacking.py | 26 +++++-- 9 files changed, 220 insertions(+), 60 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 9f4437c09e92cb..3957c645c45336 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -52,9 +52,12 @@ #define _ITER_CHECK_RANGE 328 #define _IS_ITER_EXHAUSTED_RANGE 329 #define _ITER_NEXT_RANGE 330 -#define _POP_JUMP_IF_FALSE 331 -#define _POP_JUMP_IF_TRUE 332 -#define JUMP_TO_TOP 333 +#define _CHECK_CALL_PY_EXACT_ARGS 331 +#define _INIT_CALL_PY_EXACT_ARGS 332 +#define _PUSH_FRAME 333 +#define _POP_JUMP_IF_FALSE 334 +#define _POP_JUMP_IF_TRUE 335 +#define JUMP_TO_TOP 336 #ifndef NEED_OPCODE_METADATA extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); @@ -951,6 +954,7 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { enum InstructionFormat { INSTR_FMT_IB, INSTR_FMT_IBC, + INSTR_FMT_IBC0, INSTR_FMT_IBC00, INSTR_FMT_IBC000, INSTR_FMT_IBC00000000, @@ -995,6 +999,7 @@ struct opcode_macro_expansion { #define OPARG_CACHE_4 4 #define OPARG_TOP 5 #define OPARG_BOTTOM 6 +#define OPARG_SAVE_IP 7 #define OPCODE_METADATA_FMT(OP) (_PyOpcode_opcode_metadata[(OP)].instr_format) #define SAME_OPCODE_METADATA(OP1, OP2) \ @@ -1336,6 +1341,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } }, [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } }, + [CALL_PY_EXACT_ARGS] = { .nuops = 4, .uops = { { _CHECK_CALL_PY_EXACT_ARGS, 2, 1 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, [CALL_NO_KW_TYPE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TYPE_1, 0, 0 } } }, [CALL_NO_KW_STR_1] = { .nuops = 1, .uops = { { CALL_NO_KW_STR_1, 0, 0 } } }, [CALL_NO_KW_TUPLE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TUPLE_1, 0, 0 } } }, @@ -1389,6 +1395,9 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE", [_IS_ITER_EXHAUSTED_RANGE] = "_IS_ITER_EXHAUSTED_RANGE", [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE", + [_CHECK_CALL_PY_EXACT_ARGS] = "_CHECK_CALL_PY_EXACT_ARGS", + [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", + [_PUSH_FRAME] = "_PUSH_FRAME", [_POP_JUMP_IF_FALSE] = "_POP_JUMP_IF_FALSE", [_POP_JUMP_IF_TRUE] = "_POP_JUMP_IF_TRUE", [JUMP_TO_TOP] = "JUMP_TO_TOP", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index b2281abc6663da..7e6d1167b76c08 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -957,13 +957,13 @@ dummy_func( { PyGenObject *gen = (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -996,13 +996,13 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2586,7 +2586,6 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -2594,6 +2593,7 @@ dummy_func( SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2944,7 +2944,7 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - inst(CALL_PY_EXACT_ARGS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { + op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- method, callable, unused[oparg])) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); int argcount = oparg; @@ -2958,19 +2958,36 @@ dummy_func( PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != argcount, CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + } + + op(_INIT_CALL_PY_EXACT_ARGS, (method, callable, args[oparg] -- new_frame: _PyInterpreterFrame*)) { + int is_meth = method != NULL; + int argcount = oparg; + if (is_meth) { + callable = method; + args--; + argcount++; + } STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); for (int i = 0; i < argcount; i++) { new_frame->localsplus[i] = args[i]; } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). - STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); + } + + op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { frame->return_offset = 0; DISPATCH_INLINED(new_frame); } - inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { + macro(CALL_PY_EXACT_ARGS) = + unused/1 + // Skip over the counter + _CHECK_CALL_PY_EXACT_ARGS + + _INIT_CALL_PY_EXACT_ARGS + + _PUSH_FRAME; + + inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, method, callable, args[oparg] -- unused)) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); int argcount = oparg; diff --git a/Python/executor.c b/Python/executor.c index 4a18618c0c6c0c..cd673a7beeef88 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -30,6 +30,19 @@ #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 +#undef DISPATCH_INLINED +#define DISPATCH_INLINED(NEW_FRAME) \ + do { \ + assert(tstate->interp->eval_frame == NULL); \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ + frame->prev_instr -= 1; \ + (NEW_FRAME)->previous = frame; \ + frame = tstate->cframe->current_frame = (NEW_FRAME); \ + CALL_STAT_INC(inlined_py_calls); \ + stack_pointer = _PyFrame_GetStackPointer(frame); \ + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; \ + } while (0) + _PyInterpreterFrame * _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject **stack_pointer) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index d6d541a3b61ab4..c7fe281895d945 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2153,6 +2153,67 @@ break; } + case _CHECK_CALL_PY_EXACT_ARGS: { + PyObject *self_or_null; + PyObject *callable; + PyObject *method; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + uint32_t func_version = (uint32_t)operand; + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(tstate->interp->eval_frame, CALL); + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + stack_pointer[-2 - oparg] = method; + stack_pointer[-1 - oparg] = callable; + break; + } + + case _INIT_CALL_PY_EXACT_ARGS: { + PyObject **args; + PyObject *callable; + PyObject *method; + _PyInterpreterFrame *new_frame; + args = stack_pointer - oparg; + callable = stack_pointer[-1 - oparg]; + method = stack_pointer[-2 - oparg]; + int is_meth = method != NULL; + int argcount = oparg; + if (is_meth) { + callable = method; + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } + STACK_SHRINK(oparg); + STACK_SHRINK(1); + stack_pointer[-1] = (PyObject *)new_frame; + break; + } + + case _PUSH_FRAME: { + _PyInterpreterFrame *new_frame; + new_frame = (_PyInterpreterFrame *)stack_pointer[-1]; + STACK_SHRINK(1); + frame->return_offset = 0; + DISPATCH_INLINED(new_frame); + break; + } + case CALL_NO_KW_TYPE_1: { PyObject **args; PyObject *null; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index cf20b869b8182f..2137efeb507e82 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -1191,13 +1191,13 @@ { PyGenObject *gen = (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -1237,13 +1237,13 @@ DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -3341,7 +3341,6 @@ DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -3349,6 +3348,7 @@ SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); STACK_GROW(1); } @@ -3758,47 +3758,68 @@ TARGET(CALL_PY_EXACT_ARGS) { PREDICTED(CALL_PY_EXACT_ARGS); - PyObject **args; PyObject *self_or_null; PyObject *callable; - args = stack_pointer - oparg; + PyObject *method; + PyObject **args; + _PyInterpreterFrame *new_frame; + // _CHECK_CALL_PY_EXACT_ARGS self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; - uint32_t func_version = read_u32(&next_instr[1].cache); - ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } - DEOPT_IF(!PyFunction_Check(callable), CALL); - PyFunctionObject *func = (PyFunctionObject *)callable; - DEOPT_IF(func->func_version != func_version, CALL); - PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); - DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); - STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); - for (int i = 0; i < argcount; i++) { - new_frame->localsplus[i] = args[i]; + { + uint32_t func_version = read_u32(&next_instr[1].cache); + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(tstate->interp->eval_frame, CALL); + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + } + stack_pointer[-2 - oparg] = method; + stack_pointer[-1 - oparg] = callable; + // _INIT_CALL_PY_EXACT_ARGS + args = stack_pointer - oparg; + callable = stack_pointer[-1 - oparg]; + method = stack_pointer[-2 - oparg]; + { + int is_meth = method != NULL; + int argcount = oparg; + if (is_meth) { + callable = method; + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). - STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); - frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + // _PUSH_FRAME STACK_SHRINK(oparg); - STACK_SHRINK(1); + STACK_SHRINK(2); + next_instr += 3; + { + frame->return_offset = 0; + DISPATCH_INLINED(new_frame); + } } TARGET(CALL_PY_WITH_DEFAULTS) { PyObject **args; - PyObject *self_or_null; PyObject *callable; + PyObject *method; args = stack_pointer - oparg; - self_or_null = stack_pointer[-1 - oparg]; - callable = stack_pointer[-2 - oparg]; + callable = stack_pointer[-1 - oparg]; + method = stack_pointer[-2 - oparg]; uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); diff --git a/Python/optimizer.c b/Python/optimizer.c index 6c730aa14b9a47..dbd5142b9101e9 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -602,6 +602,10 @@ translate_bytecode_to_trace( case OPARG_BOTTOM: // Second half of super-instr oparg = orig_oparg & 0xF; break; + case OPARG_SAVE_IP: // op==SAVE_IP; oparg=next instr + oparg = INSTR_IP(instr + offset, code); + break; + default: fprintf(stderr, "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n", @@ -611,6 +615,11 @@ translate_bytecode_to_trace( Py_FatalError("garbled expansion"); } ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); + if (expansion->uops[i].uop == _PUSH_FRAME) { + assert(i + 1 == nuops); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } } break; } diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index d35a16a80e8d00..f183845a667003 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -53,6 +53,7 @@ "OPARG_CACHE_4": 4, "OPARG_TOP": 5, "OPARG_BOTTOM": 6, + "OPARG_SAVE_IP": 7, } INSTR_FMT_PREFIX = "INSTR_FMT_" @@ -344,7 +345,7 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No if instr.kind == "inst" and instr.is_viable_uop(): # Construct a dummy Component -- input/output mappings are not used part = Component(instr, instr.active_caches) - self.write_macro_expansions(instr.name, [part]) + self.write_macro_expansions(instr.name, [part], instr.cache_offset) elif instr.kind == "inst" and variable_used( instr.inst, "oparg1" ): @@ -354,7 +355,7 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No self.write_super_expansions(instr.name) case parsing.Macro(): mac = self.macro_instrs[thing.name] - self.write_macro_expansions(mac.name, mac.parts) + self.write_macro_expansions(mac.name, mac.parts, mac.cache_offset) case parsing.Pseudo(): pass case _: @@ -428,13 +429,22 @@ def add(name: str) -> None: if instr.kind == "op" and instr.is_viable_uop(): add(instr.name) - def write_macro_expansions(self, name: str, parts: MacroParts) -> None: + def write_macro_expansions( + self, name: str, parts: MacroParts, cache_offset: int + ) -> None: """Write the macro expansions for a macro-instruction.""" # TODO: Refactor to share code with write_cody(), is_viaible_uop(), etc. offset = 0 # Cache effect offset expansions: list[tuple[str, int, int]] = [] # [(name, size, offset), ...] for part in parts: if isinstance(part, Component): + # _PUSH_FRAME is super special; it expands to SAVE_IP(next_instr) + _PUSH_FRAME + if part.instr.name == "_PUSH_FRAME": + expansions.append( + ("SAVE_IP", OPARG_SIZES["OPARG_SAVE_IP"], cache_offset) + ) + expansions.append(("_PUSH_FRAME", OPARG_SIZES["OPARG_FULL"], 0)) + continue # All component instructions must be viable uops if not part.instr.is_viable_uop(): # This note just reminds us about macros that cannot diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index aa94dbb07ea1c0..4219fa9c654798 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -59,7 +59,7 @@ class Instruction: block_line: int # First line of block in original code # Computed by constructor - always_exits: bool + always_exits: str # If the block always exits, its last line; else "" has_deopt: bool cache_offset: int cache_effects: list[parsing.CacheEffect] @@ -120,11 +120,13 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - # if self.name.startswith("CALL"): - # dprint = print + if "PY_EXACT" in self.name: + dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay + if self.name == "_PUSH_FRAME": + return True # Has DISPATCH_INLINED but it's okay if self.always_exits: dprint(f"Skipping {self.name} because it always exits") return False @@ -322,16 +324,16 @@ def extract_block_text(block: parsing.Block) -> tuple[list[str], bool, int]: return blocklines, check_eval_breaker, block_line -def always_exits(lines: list[str]) -> bool: +def always_exits(lines: list[str]) -> str: """Determine whether a block always ends in a return/goto/etc.""" if not lines: - return False + return "" line = lines[-1].rstrip() # Indent must match exactly (TODO: Do something better) if line[:12] != " " * 12: - return False + return "" line = line[12:] - return line.startswith( + if line.startswith( ( "goto ", "return ", @@ -340,4 +342,6 @@ def always_exits(lines: list[str]) -> bool: "Py_UNREACHABLE()", "ERROR_IF(true, ", ) - ) + ): + return line + return "" diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index d457ce01a8f438..aea6d70e1fcba9 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -295,6 +295,7 @@ def write_single_instr( [Component(instr, instr.active_caches)], out, tier, + 0, ) except AssertionError as err: raise AssertionError(f"Error writing instruction {instr.name}") from err @@ -306,12 +307,14 @@ def write_macro_instr( parts = [part for part in mac.parts if isinstance(part, Component)] cache_adjust = 0 + always_exits = False for part in mac.parts: match part: case CacheEffect(size=size): cache_adjust += size case Component(instr=instr): cache_adjust += instr.cache_offset + always_exits = instr.always_exits case _: typing.assert_never(part) @@ -321,18 +324,20 @@ def write_macro_instr( out.emit(f"PREDICTED({mac.name});") out.static_assert_family_size(mac.name, family, cache_adjust) try: - write_components(parts, out, TIER_ONE) + write_components(parts, out, TIER_ONE, cache_adjust) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") - out.emit("DISPATCH();") + if not always_exits: + if cache_adjust: + out.emit(f"next_instr += {cache_adjust};") + out.emit("DISPATCH();") def write_components( parts: list[Component], out: Formatter, tier: Tiers, + cache_adjust: int, ) -> None: managers = get_managers(parts) @@ -374,13 +379,24 @@ def write_components( poke.as_stack_effect(lax=True), ) + dispatch_inlined_special_case = False + if mgr is managers[-1] and mgr.instr.always_exits.startswith("DISPATCH_INLINED") and mgr.instr.name == "_PUSH_FRAME": + dispatch_inlined_special_case = True + temp = mgr.final_offset.clone() + temp.deeper(StackEffect(UNUSED)) # Hack + out.stack_adjust(temp.deep, temp.high) + # Use clone() since adjust_inverse() mutates final_offset + mgr.adjust_inverse(mgr.final_offset.clone()) + if cache_adjust: + out.emit(f"next_instr += {cache_adjust};") + if len(parts) == 1: mgr.instr.write_body(out, 0, mgr.active_caches, tier) else: with out.block(""): mgr.instr.write_body(out, -4, mgr.active_caches, tier) - if mgr is managers[-1]: + if mgr is managers[-1] and not dispatch_inlined_special_case: out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) From 907ff95317c8d7b9221ce95c8823ed8d2cf4861c Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 12:00:18 -0700 Subject: [PATCH 02/26] Fix merge so it works again (I think) --- Python/bytecodes.c | 17 +++++------------ Python/executor_cases.c.h | 20 +++++--------------- Python/generated_cases.c.h | 24 +++++++----------------- 3 files changed, 17 insertions(+), 44 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 7e6d1167b76c08..accd4f798d1b26 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2944,27 +2944,20 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- method, callable, unused[oparg])) { + op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } - op(_INIT_CALL_PY_EXACT_ARGS, (method, callable, args[oparg] -- new_frame: _PyInterpreterFrame*)) { - int is_meth = method != NULL; + op(_INIT_CALL_PY_EXACT_ARGS, (callable, self_or_null, args[oparg] -- new_frame: _PyInterpreterFrame*)) { int argcount = oparg; - if (is_meth) { - callable = method; + if (self_or_null != NULL) { args--; argcount++; } @@ -2987,7 +2980,7 @@ dummy_func( _INIT_CALL_PY_EXACT_ARGS + _PUSH_FRAME; - inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, method, callable, args[oparg] -- unused)) { + inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); int argcount = oparg; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index c7fe281895d945..b4d25a4522ada2 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2156,40 +2156,30 @@ case _CHECK_CALL_PY_EXACT_ARGS: { PyObject *self_or_null; PyObject *callable; - PyObject *method; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; uint32_t func_version = (uint32_t)operand; ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); - stack_pointer[-2 - oparg] = method; - stack_pointer[-1 - oparg] = callable; break; } case _INIT_CALL_PY_EXACT_ARGS: { PyObject **args; + PyObject *self_or_null; PyObject *callable; - PyObject *method; _PyInterpreterFrame *new_frame; args = stack_pointer - oparg; - callable = stack_pointer[-1 - oparg]; - method = stack_pointer[-2 - oparg]; - int is_meth = method != NULL; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; int argcount = oparg; - if (is_meth) { - callable = method; + if (self_or_null != NULL) { args--; argcount++; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 2137efeb507e82..21e00e629d22e0 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3760,7 +3760,6 @@ PREDICTED(CALL_PY_EXACT_ARGS); PyObject *self_or_null; PyObject *callable; - PyObject *method; PyObject **args; _PyInterpreterFrame *new_frame; // _CHECK_CALL_PY_EXACT_ARGS @@ -3770,29 +3769,20 @@ uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } - stack_pointer[-2 - oparg] = method; - stack_pointer[-1 - oparg] = callable; // _INIT_CALL_PY_EXACT_ARGS args = stack_pointer - oparg; - callable = stack_pointer[-1 - oparg]; - method = stack_pointer[-2 - oparg]; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; { - int is_meth = method != NULL; int argcount = oparg; - if (is_meth) { - callable = method; + if (self_or_null != NULL) { args--; argcount++; } @@ -3815,11 +3805,11 @@ TARGET(CALL_PY_WITH_DEFAULTS) { PyObject **args; + PyObject *self_or_null; PyObject *callable; - PyObject *method; args = stack_pointer - oparg; - callable = stack_pointer[-1 - oparg]; - method = stack_pointer[-2 - oparg]; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); From 2c6be6d65290d85a1c43f4491202dbbbb1e38778 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 12:15:46 -0700 Subject: [PATCH 03/26] Split into finer-grained uops --- Include/internal/pycore_opcode_metadata.h | 20 ++++++++++++-------- Python/bytecodes.c | 16 +++++++++++++--- Python/executor_cases.c.h | 16 ++++++++++++++-- Python/generated_cases.c.h | 13 +++++++++++-- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 3957c645c45336..3ebbf3fcc8c8a2 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -52,12 +52,14 @@ #define _ITER_CHECK_RANGE 328 #define _IS_ITER_EXHAUSTED_RANGE 329 #define _ITER_NEXT_RANGE 330 -#define _CHECK_CALL_PY_EXACT_ARGS 331 -#define _INIT_CALL_PY_EXACT_ARGS 332 -#define _PUSH_FRAME 333 -#define _POP_JUMP_IF_FALSE 334 -#define _POP_JUMP_IF_TRUE 335 -#define JUMP_TO_TOP 336 +#define _CHECK_PEP_523 331 +#define _CHECK_FUNCTION_EXACT_ARGS 332 +#define _CHECK_STACK_SPACE 333 +#define _INIT_CALL_PY_EXACT_ARGS 334 +#define _PUSH_FRAME 335 +#define _POP_JUMP_IF_FALSE 336 +#define _POP_JUMP_IF_TRUE 337 +#define JUMP_TO_TOP 338 #ifndef NEED_OPCODE_METADATA extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); @@ -1341,7 +1343,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } }, [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } }, - [CALL_PY_EXACT_ARGS] = { .nuops = 4, .uops = { { _CHECK_CALL_PY_EXACT_ARGS, 2, 1 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, + [CALL_PY_EXACT_ARGS] = { .nuops = 6, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_FUNCTION_EXACT_ARGS, 2, 1 }, { _CHECK_STACK_SPACE, 0, 0 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, [CALL_NO_KW_TYPE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TYPE_1, 0, 0 } } }, [CALL_NO_KW_STR_1] = { .nuops = 1, .uops = { { CALL_NO_KW_STR_1, 0, 0 } } }, [CALL_NO_KW_TUPLE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TUPLE_1, 0, 0 } } }, @@ -1395,7 +1397,9 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE", [_IS_ITER_EXHAUSTED_RANGE] = "_IS_ITER_EXHAUSTED_RANGE", [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE", - [_CHECK_CALL_PY_EXACT_ARGS] = "_CHECK_CALL_PY_EXACT_ARGS", + [_CHECK_PEP_523] = "_CHECK_PEP_523", + [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS", + [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", [_PUSH_FRAME] = "_PUSH_FRAME", [_POP_JUMP_IF_FALSE] = "_POP_JUMP_IF_FALSE", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index accd4f798d1b26..48e523264d8e62 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2944,14 +2944,22 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { - ASSERT_KWNAMES_IS_NULL(); + op(_CHECK_PEP_523, (--)) { DEOPT_IF(tstate->interp->eval_frame, CALL); + } + + op(_CHECK_FUNCTION_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { + ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + + op(_CHECK_STACK_SPACE, (callable, unused, unused[oparg] -- callable, unused, unused[oparg])) { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } @@ -2976,7 +2984,9 @@ dummy_func( macro(CALL_PY_EXACT_ARGS) = unused/1 + // Skip over the counter - _CHECK_CALL_PY_EXACT_ARGS + + _CHECK_PEP_523 + + _CHECK_FUNCTION_EXACT_ARGS + + _CHECK_STACK_SPACE + _INIT_CALL_PY_EXACT_ARGS + _PUSH_FRAME; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index b4d25a4522ada2..d76c66ee304df1 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2153,19 +2153,31 @@ break; } - case _CHECK_CALL_PY_EXACT_ARGS: { + case _CHECK_PEP_523: { + DEOPT_IF(tstate->interp->eval_frame, CALL); + break; + } + + case _CHECK_FUNCTION_EXACT_ARGS: { PyObject *self_or_null; PyObject *callable; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; uint32_t func_version = (uint32_t)operand; ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + break; + } + + case _CHECK_STACK_SPACE: { + PyObject *callable; + callable = stack_pointer[-2 - oparg]; + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 21e00e629d22e0..f13b9066f937b2 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3762,18 +3762,27 @@ PyObject *callable; PyObject **args; _PyInterpreterFrame *new_frame; - // _CHECK_CALL_PY_EXACT_ARGS + // _CHECK_PEP_523 + { + DEOPT_IF(tstate->interp->eval_frame, CALL); + } + // _CHECK_FUNCTION_EXACT_ARGS self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; { uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + // _CHECK_STACK_SPACE + callable = stack_pointer[-2 - oparg]; + { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } // _INIT_CALL_PY_EXACT_ARGS From 6d78ff2926d558de0508be62c8f23c99962202b5 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 17:24:45 -0700 Subject: [PATCH 04/26] Fix type error in stacking.py --- Tools/cases_generator/stacking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index aea6d70e1fcba9..86b9ee92c80be5 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -307,7 +307,7 @@ def write_macro_instr( parts = [part for part in mac.parts if isinstance(part, Component)] cache_adjust = 0 - always_exits = False + always_exits = "" for part in mac.parts: match part: case CacheEffect(size=size): From 0d8e66c91da65571bd990a3cf9e092fa68d264fb Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 17:44:59 -0700 Subject: [PATCH 05/26] Add test --- Lib/test/test_capi/test_misc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 001d37de8e0eb3..36b8e1fa993b98 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2618,6 +2618,23 @@ def testfunc(it): with self.assertRaises(StopIteration): next(it) + def test_call_py_exact_args(self): + def testfunc(n): + def dummy(x): + return x+1 + for i in range(n): + dummy(i) + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(10) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_PUSH_FRAME", uops) + + if __name__ == "__main__": unittest.main() From b75f30eb37c8ac68a014a7a3318e549e9e4d4ea3 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 18:21:20 -0700 Subject: [PATCH 06/26] Add comment explaining _PUSH_FRAME's unused output effect --- Python/bytecodes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 48e523264d8e62..e749348bdae488 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2977,6 +2977,9 @@ dummy_func( } } + // The 'unused' output effect represents the return value + // (which will be pushed when the frame returns). + // It is needed so CALL_PY_EXACT_ARGS matches its family. op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { frame->return_offset = 0; DISPATCH_INLINED(new_frame); From 61c2822cde6b709855c1052df38ed05cb059b7af Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 07:53:35 -0700 Subject: [PATCH 07/26] Make PUSH_FRAME special case a little less myterious --- Tools/cases_generator/stacking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index 86b9ee92c80be5..d9eaacfcbeb974 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -382,9 +382,8 @@ def write_components( dispatch_inlined_special_case = False if mgr is managers[-1] and mgr.instr.always_exits.startswith("DISPATCH_INLINED") and mgr.instr.name == "_PUSH_FRAME": dispatch_inlined_special_case = True - temp = mgr.final_offset.clone() - temp.deeper(StackEffect(UNUSED)) # Hack - out.stack_adjust(temp.deep, temp.high) + # Adjust stack to min_offset (input effects materialized) + out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) if cache_adjust: @@ -397,6 +396,7 @@ def write_components( mgr.instr.write_body(out, -4, mgr.active_caches, tier) if mgr is managers[-1] and not dispatch_inlined_special_case: + # TODO: Explain why this adjustment is needed. out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) From f73ea90f72f9eeef25f6eb9ada1b83a6cbca54e1 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 15:46:09 -0700 Subject: [PATCH 08/26] Rename Instruction.write to write_case_body --- Tools/cases_generator/generate_cases.py | 5 +++-- Tools/cases_generator/instructions.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index f183845a667003..3563f16a5978a6 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -22,6 +22,7 @@ PseudoInstruction, StackEffect, OverriddenInstructionPlaceHolder, + TIER_ONE, TIER_TWO, ) import parsing @@ -597,7 +598,7 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - instr.write(self.out, tier=TIER_TWO) + instr.write_case_body(self.out, tier=TIER_TWO) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -630,7 +631,7 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") - instr.write(self.out) + instr.write_case_body(self.out, tier=TIER_ONE) if not instr.always_exits: if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 4219fa9c654798..c52f0dcb0ced62 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -142,7 +142,7 @@ def is_viable_uop(self) -> bool: res = False return res - def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None: + def write_case_body(self, out: Formatter, tier: Tiers) -> None: """Write one instruction, sans prologue and epilogue.""" # Write a static assertion that a family's cache size is correct From 12910fcd18be115e99a7575d38c5e7967a97fe45 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:01:28 -0700 Subject: [PATCH 09/26] Move next_instr update to a more logical place --- Tools/cases_generator/generate_cases.py | 2 ++ Tools/cases_generator/instructions.py | 8 -------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 3563f16a5978a6..d84ee47bc8a5bf 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -633,6 +633,8 @@ def write_instr(self, instr: Instruction) -> None: self.out.emit(f"PREDICTED({name});") instr.write_case_body(self.out, tier=TIER_ONE) if not instr.always_exits: + if instr.cache_offset: + self.out.emit(f"next_instr += {instr.cache_offset};") if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit(f"DISPATCH();") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index c52f0dcb0ced62..7094320a4a3862 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -151,14 +151,6 @@ def write_case_body(self, out: Formatter, tier: Tiers) -> None: # Write input stack effect variable declarations and initializations stacking.write_single_instr(self, out, tier) - # Skip the rest if the block always exits - if self.always_exits: - return - - # Write cache effect - if tier == TIER_ONE and self.cache_offset: - out.emit(f"next_instr += {self.cache_offset};") - def write_body( self, out: Formatter, From 2fafa2c16116af63834792c06ba88d8f2c22043f Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:19:06 -0700 Subject: [PATCH 10/26] Don't recompute macro cache offset --- Tools/cases_generator/stacking.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index d9eaacfcbeb974..ad63bbbdb0505c 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -305,31 +305,18 @@ def write_macro_instr( mac: MacroInstruction, out: Formatter, family: Family | None ) -> None: parts = [part for part in mac.parts if isinstance(part, Component)] - - cache_adjust = 0 - always_exits = "" - for part in mac.parts: - match part: - case CacheEffect(size=size): - cache_adjust += size - case Component(instr=instr): - cache_adjust += instr.cache_offset - always_exits = instr.always_exits - case _: - typing.assert_never(part) - out.emit("") with out.block(f"TARGET({mac.name})"): if mac.predicted: out.emit(f"PREDICTED({mac.name});") - out.static_assert_family_size(mac.name, family, cache_adjust) + out.static_assert_family_size(mac.name, family, mac.cache_offset) try: - write_components(parts, out, TIER_ONE, cache_adjust) + write_components(parts, out, TIER_ONE, mac.cache_offset) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if not always_exits: - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") + if not parts[-1].instr.always_exits: + if mac.cache_offset: + out.emit(f"next_instr += {mac.cache_offset};") out.emit("DISPATCH();") @@ -337,7 +324,7 @@ def write_components( parts: list[Component], out: Formatter, tier: Tiers, - cache_adjust: int, + cache_offset: int, ) -> None: managers = get_managers(parts) @@ -386,8 +373,8 @@ def write_components( out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") + if cache_offset: + out.emit(f"next_instr += {cache_offset};") if len(parts) == 1: mgr.instr.write_body(out, 0, mgr.active_caches, tier) From 2717b0738b3c25b996288f19579d125c9ffda830 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:22:58 -0700 Subject: [PATCH 11/26] Fold and refactor long line in stacking.py --- Tools/cases_generator/stacking.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index ad63bbbdb0505c..c4f5ee29ef41af 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -366,9 +366,12 @@ def write_components( poke.as_stack_effect(lax=True), ) - dispatch_inlined_special_case = False - if mgr is managers[-1] and mgr.instr.always_exits.startswith("DISPATCH_INLINED") and mgr.instr.name == "_PUSH_FRAME": - dispatch_inlined_special_case = True + dispatch_inlined_special_case = ( + mgr is managers[-1] + and mgr.instr.always_exits.startswith("DISPATCH_INLINED") + and mgr.instr.name == "_PUSH_FRAME" + ) + if dispatch_inlined_special_case: # Adjust stack to min_offset (input effects materialized) out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset From e48790885a448f1065411ad0eda0e69ca4896808 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:24:33 -0700 Subject: [PATCH 12/26] Fold long lines in generate_cases.py --- Tools/cases_generator/generate_cases.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index d84ee47bc8a5bf..c0a11bcd23dd7f 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -346,7 +346,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No if instr.kind == "inst" and instr.is_viable_uop(): # Construct a dummy Component -- input/output mappings are not used part = Component(instr, instr.active_caches) - self.write_macro_expansions(instr.name, [part], instr.cache_offset) + self.write_macro_expansions( + instr.name, [part], instr.cache_offset + ) elif instr.kind == "inst" and variable_used( instr.inst, "oparg1" ): @@ -356,7 +358,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No self.write_super_expansions(instr.name) case parsing.Macro(): mac = self.macro_instrs[thing.name] - self.write_macro_expansions(mac.name, mac.parts, mac.cache_offset) + self.write_macro_expansions( + mac.name, mac.parts, mac.cache_offset + ) case parsing.Pseudo(): pass case _: @@ -562,7 +566,9 @@ def write_instructions( case parsing.Macro(): n_macros += 1 mac = self.macro_instrs[thing.name] - stacking.write_macro_instr(mac, self.out, self.families.get(mac.name)) + stacking.write_macro_instr( + mac, self.out, self.families.get(mac.name) + ) # self.write_macro(self.macro_instrs[thing.name]) case parsing.Pseudo(): pass From 1d549af5f4b0d28f22da4561ba0d1ba16e226823 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:52:09 -0700 Subject: [PATCH 13/26] Don't emit static assert to executor cases --- Python/executor_cases.c.h | 9 --------- Tools/cases_generator/generate_cases.py | 1 + Tools/cases_generator/instructions.py | 5 ----- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index d76c66ee304df1..f63c5c60fdc085 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -103,7 +103,6 @@ } case TO_BOOL: { - static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; value = stack_pointer[-1]; @@ -363,7 +362,6 @@ } case BINARY_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *res; @@ -557,7 +555,6 @@ } case STORE_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *v; @@ -862,7 +859,6 @@ } case UNPACK_SEQUENCE: { - static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size"); PyObject *seq; seq = stack_pointer[-1]; #if ENABLE_SPECIALIZATION @@ -950,7 +946,6 @@ } case STORE_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_ATTR == 4, "incorrect cache size"); PyObject *owner; PyObject *v; owner = stack_pointer[-1]; @@ -1061,7 +1056,6 @@ } case LOAD_GLOBAL: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_GLOBAL == 4, "incorrect cache size"); PyObject *res; PyObject *null = NULL; #if ENABLE_SPECIALIZATION @@ -1554,7 +1548,6 @@ } case LOAD_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_ATTR == 9, "incorrect cache size"); PyObject *owner; PyObject *attr; PyObject *self_or_null = NULL; @@ -1648,7 +1641,6 @@ } case COMPARE_OP: { - static_assert(INLINE_CACHE_ENTRIES_COMPARE_OP == 1, "incorrect cache size"); PyObject *right; PyObject *left; PyObject *res; @@ -2717,7 +2709,6 @@ } case BINARY_OP: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_OP == 1, "incorrect cache size"); PyObject *rhs; PyObject *lhs; PyObject *res; diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index c0a11bcd23dd7f..6e80a06da386d1 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -637,6 +637,7 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") + self.out.static_assert_family_size(instr.name, instr.family, instr.cache_offset) instr.write_case_body(self.out, tier=TIER_ONE) if not instr.always_exits: if instr.cache_offset: diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 7094320a4a3862..27cfb2c20ea9e0 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -144,11 +144,6 @@ def is_viable_uop(self) -> bool: def write_case_body(self, out: Formatter, tier: Tiers) -> None: """Write one instruction, sans prologue and epilogue.""" - - # Write a static assertion that a family's cache size is correct - out.static_assert_family_size(self.name, self.family, self.cache_offset) - - # Write input stack effect variable declarations and initializations stacking.write_single_instr(self, out, tier) def write_body( From f40fb1f82267bcc39b0ded8b5e1f2f0b731a24f9 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:55:09 -0700 Subject: [PATCH 14/26] Factor away write_case_body (formerly Instruction.write) --- Tools/cases_generator/generate_cases.py | 4 ++-- Tools/cases_generator/instructions.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 6e80a06da386d1..32344ffa8c011e 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -604,7 +604,7 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - instr.write_case_body(self.out, tier=TIER_TWO) + stacking.write_single_instr(instr, self.out, tier=TIER_TWO) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -638,7 +638,7 @@ def write_instr(self, instr: Instruction) -> None: if instr.predicted: self.out.emit(f"PREDICTED({name});") self.out.static_assert_family_size(instr.name, instr.family, instr.cache_offset) - instr.write_case_body(self.out, tier=TIER_ONE) + stacking.write_single_instr(instr, self.out, tier=TIER_ONE) if not instr.always_exits: if instr.cache_offset: self.out.emit(f"next_instr += {instr.cache_offset};") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 27cfb2c20ea9e0..1fb291eeca1d0c 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -142,10 +142,6 @@ def is_viable_uop(self) -> bool: res = False return res - def write_case_body(self, out: Formatter, tier: Tiers) -> None: - """Write one instruction, sans prologue and epilogue.""" - stacking.write_single_instr(self, out, tier) - def write_body( self, out: Formatter, From 4f6f8f8f44761ae3945cc873497bbfe2ec07102b Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 14:58:02 -0700 Subject: [PATCH 15/26] Fold long lines --- Tools/cases_generator/generate_cases.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 32344ffa8c011e..2a695c45684cc1 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -604,7 +604,9 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - stacking.write_single_instr(instr, self.out, tier=TIER_TWO) + stacking.write_single_instr( + instr, self.out, tier=TIER_TWO + ) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -637,7 +639,9 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") - self.out.static_assert_family_size(instr.name, instr.family, instr.cache_offset) + self.out.static_assert_family_size( + instr.name, instr.family, instr.cache_offset + ) stacking.write_single_instr(instr, self.out, tier=TIER_ONE) if not instr.always_exits: if instr.cache_offset: From 6facc8dd838998999c117ddf8c225ad09bebfe47 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 14:19:21 -0700 Subject: [PATCH 16/26] Make less of a special case of _PUSH_FRAME Instead, the special case is an opcode using SAVE_FRAME_STATE(). Introducing #if TIER_ONE and #if TIER_TWO so we can implement _PUSH_FRAME differently for both tiers. --- Python/bytecodes.c | 16 +++++++++++++++- Python/ceval.c | 1 + Python/ceval_macros.h | 9 +++++++-- Python/executor.c | 18 +++++++----------- Python/executor_cases.c.h | 16 +++++++++++++++- Python/generated_cases.c.h | 16 +++++++++++++++- Tools/cases_generator/instructions.py | 8 ++++---- Tools/cases_generator/stacking.py | 12 ++++-------- 8 files changed, 68 insertions(+), 28 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index e749348bdae488..9919c4141c9099 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2981,8 +2981,22 @@ dummy_func( // (which will be pushed when the frame returns). // It is needed so CALL_PY_EXACT_ARGS matches its family. op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } macro(CALL_PY_EXACT_ARGS) = diff --git a/Python/ceval.c b/Python/ceval.c index b966399a342d08..2370636d765d9c 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -770,6 +770,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #endif { +#define TIER_ONE 1 #include "generated_cases.c.h" /* INSTRUMENTED_LINE has to be here, rather than in bytecodes.c, diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 8dc8b754485856..77f760f0bb5995 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -103,11 +103,16 @@ DISPATCH_GOTO(); \ } +#define SAVE_FRAME_STATE() \ + do { \ + frame->prev_instr = next_instr - 1; \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ + } while (0) + #define DISPATCH_INLINED(NEW_FRAME) \ do { \ assert(tstate->interp->eval_frame == NULL); \ - _PyFrame_SetStackPointer(frame, stack_pointer); \ - frame->prev_instr = next_instr - 1; \ + SAVE_FRAME_STATE(); \ (NEW_FRAME)->previous = frame; \ frame = cframe.current_frame = (NEW_FRAME); \ CALL_STAT_INC(inlined_py_calls); \ diff --git a/Python/executor.c b/Python/executor.c index cd673a7beeef88..c18ba6d4ade3e5 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -30,17 +30,12 @@ #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 -#undef DISPATCH_INLINED -#define DISPATCH_INLINED(NEW_FRAME) \ - do { \ - assert(tstate->interp->eval_frame == NULL); \ - _PyFrame_SetStackPointer(frame, stack_pointer); \ - frame->prev_instr -= 1; \ - (NEW_FRAME)->previous = frame; \ - frame = tstate->cframe->current_frame = (NEW_FRAME); \ - CALL_STAT_INC(inlined_py_calls); \ - stack_pointer = _PyFrame_GetStackPointer(frame); \ - ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; \ +#undef SAVE_FRAME_STATE +#define SAVE_FRAME_STATE() \ + do { \ + /* Assume preceding SAVE_IP has set frame->prev_instr */ \ + frame->prev_instr--; \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ } while (0) @@ -94,6 +89,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject OBJECT_STAT_INC(optimization_uops_executed); switch (opcode) { +#define TIER_TWO 2 #include "executor_cases.c.h" default: diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index f63c5c60fdc085..4ace06857eb424 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2203,8 +2203,22 @@ _PyInterpreterFrame *new_frame; new_frame = (_PyInterpreterFrame *)stack_pointer[-1]; STACK_SHRINK(1); + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index f13b9066f937b2..a42457d05faf03 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3807,8 +3807,22 @@ STACK_SHRINK(2); next_instr += 3; { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } } diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 1fb291eeca1d0c..6c52cf3e8a3d09 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -60,6 +60,7 @@ class Instruction: # Computed by constructor always_exits: str # If the block always exits, its last line; else "" + save_frame_state: bool # Whether the instruction uses SAVE_FRAME_STATE() has_deopt: bool cache_offset: int cache_effects: list[parsing.CacheEffect] @@ -83,6 +84,7 @@ def __init__(self, inst: parsing.InstDef): self.block ) self.always_exits = always_exits(self.block_text) + self.save_frame_state = variable_used(self.inst, "SAVE_FRAME_STATE") self.has_deopt = variable_used(self.inst, "DEOPT_IF") self.cache_effects = [ effect for effect in inst.inputs if isinstance(effect, parsing.CacheEffect) @@ -120,15 +122,13 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - if "PY_EXACT" in self.name: + if "PUSH_FRAME" in self.name: dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay - if self.name == "_PUSH_FRAME": - return True # Has DISPATCH_INLINED but it's okay if self.always_exits: - dprint(f"Skipping {self.name} because it always exits") + dprint(f"Skipping {self.name} because it always exits: {self.always_exits}") return False if len(self.active_caches) > 1: # print(f"Skipping {self.name} because it has >1 cache entries") diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index c4f5ee29ef41af..a721cba0247a8a 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -1,6 +1,7 @@ import dataclasses import typing +from flags import variable_used_unspecialized from formatting import ( Formatter, UNUSED, @@ -314,7 +315,7 @@ def write_macro_instr( write_components(parts, out, TIER_ONE, mac.cache_offset) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if not parts[-1].instr.always_exits: + if not parts[-1].instr.always_exits and not parts[-1].instr.save_frame_state: if mac.cache_offset: out.emit(f"next_instr += {mac.cache_offset};") out.emit("DISPATCH();") @@ -366,12 +367,7 @@ def write_components( poke.as_stack_effect(lax=True), ) - dispatch_inlined_special_case = ( - mgr is managers[-1] - and mgr.instr.always_exits.startswith("DISPATCH_INLINED") - and mgr.instr.name == "_PUSH_FRAME" - ) - if dispatch_inlined_special_case: + if mgr.instr.save_frame_state: # Adjust stack to min_offset (input effects materialized) out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset @@ -385,7 +381,7 @@ def write_components( with out.block(""): mgr.instr.write_body(out, -4, mgr.active_caches, tier) - if mgr is managers[-1] and not dispatch_inlined_special_case: + if mgr is managers[-1] and not mgr.instr.save_frame_state: # TODO: Explain why this adjustment is needed. out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset From 94630d49b3283657953d44b7fbe2a8a36d624721 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 14:45:31 -0700 Subject: [PATCH 17/26] Stop special-casing _PUSH_FRAME altogether Instead, we special-case SAVE_IP: - Its Tier 2 expansion sets oparg to the instruction offset - In Tier 1 it is a no-op (and skipped if present in a macro) --- Python/bytecodes.c | 1 + Tools/cases_generator/generate_cases.py | 12 ++++-------- Tools/cases_generator/stacking.py | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 9919c4141c9099..cbbd6d48daae75 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3005,6 +3005,7 @@ dummy_func( _CHECK_FUNCTION_EXACT_ARGS + _CHECK_STACK_SPACE + _INIT_CALL_PY_EXACT_ARGS + + SAVE_IP + // Tier 2 only; special-cased oparg _PUSH_FRAME; inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 2a695c45684cc1..6050fcf84479e9 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -443,13 +443,6 @@ def write_macro_expansions( expansions: list[tuple[str, int, int]] = [] # [(name, size, offset), ...] for part in parts: if isinstance(part, Component): - # _PUSH_FRAME is super special; it expands to SAVE_IP(next_instr) + _PUSH_FRAME - if part.instr.name == "_PUSH_FRAME": - expansions.append( - ("SAVE_IP", OPARG_SIZES["OPARG_SAVE_IP"], cache_offset) - ) - expansions.append(("_PUSH_FRAME", OPARG_SIZES["OPARG_FULL"], 0)) - continue # All component instructions must be viable uops if not part.instr.is_viable_uop(): # This note just reminds us about macros that cannot @@ -463,7 +456,10 @@ def write_macro_expansions( ) return if not part.active_caches: - size, offset = OPARG_SIZES["OPARG_FULL"], 0 + if part.instr.name == "SAVE_IP": + size, offset = OPARG_SIZES["OPARG_SAVE_IP"], cache_offset + else: + size, offset = OPARG_SIZES["OPARG_FULL"], 0 else: # If this assert triggers, is_viable_uops() lied assert len(part.active_caches) == 1, (name, part.instr.name) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index a721cba0247a8a..7154d8fbab454f 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -147,6 +147,8 @@ class EffectManager: # Track offsets from stack pointer min_offset: StackOffset final_offset: StackOffset + # Link to previous manager + pred: "EffectManager | None" = None def __init__( self, @@ -168,7 +170,8 @@ def __init__( self.pokes.append(StackItem(offset=self.final_offset.clone(), effect=eff)) self.final_offset.higher(eff) - if pred: + self.pred = pred + while pred: # Replace push(x) + pop(y) with copy(x, y). # Check that the sources and destinations are disjoint. sources: set[str] = set() @@ -193,6 +196,11 @@ def __init__( sources, destinations, ) + # See if we can get more copies of a earlier predecessor. + if self.peeks and not pred.pokes and not pred.peeks: + pred = pred.pred + else: + pred = None # Break def adjust_deeper(self, eff: StackEffect) -> None: for peek in self.peeks: @@ -305,7 +313,11 @@ def write_single_instr( def write_macro_instr( mac: MacroInstruction, out: Formatter, family: Family | None ) -> None: - parts = [part for part in mac.parts if isinstance(part, Component)] + parts = [ + part + for part in mac.parts + if isinstance(part, Component) and part.instr.name != "SAVE_IP" + ] out.emit("") with out.block(f"TARGET({mac.name})"): if mac.predicted: From 675f3179fc44e07507dc89a985c07df901431c54 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 8 Aug 2023 12:09:32 -0700 Subject: [PATCH 18/26] Add function-by-version cache --- Include/internal/pycore_function.h | 9 +++++++ Objects/funcobject.c | 41 ++++++++++++++++++++++++++++-- Python/bytecodes.c | 3 ++- Python/executor_cases.c.h | 3 ++- Python/generated_cases.c.h | 3 ++- Python/sysmodule.c | 2 ++ 6 files changed, 56 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index e844d323ec7927..d059c3731b435f 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -16,13 +16,22 @@ extern PyObject* _PyFunction_Vectorcall( #define FUNC_MAX_WATCHERS 8 +#define FUNC_VERSION_CACHE_SIZE (1<<12) /* Must be a power of 2 */ struct _py_func_state { uint32_t next_version; + // Function objects whose func_version % FUNC_VERSION_CACHE_SIZE + // once equaled the index in the table. The references are owned: + // Call _PyFunction_ClearByVersionCache() to clear. + PyFunctionObject *func_version_cache[FUNC_VERSION_CACHE_SIZE]; }; extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr); extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); +extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); +PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); +void _PyFunction_ClearByVersionCache(void); + extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 8c0bface3ac710..10655fd3def4f3 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -223,7 +223,44 @@ PyFunction_NewWithQualName(PyObject *code, PyObject *globals, PyObject *qualname return NULL; } -uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) +void +_PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) +{ + func->func_version = version; + if (version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (version % FUNC_VERSION_CACHE_SIZE); + Py_XSETREF(*slot, (PyFunctionObject *)Py_NewRef(func)); + } +} + +PyFunctionObject * +_PyFunction_LookupByVersion(uint32_t version) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (version % FUNC_VERSION_CACHE_SIZE); + if (*slot != NULL && (*slot)->func_version == version) { + return (PyFunctionObject *)Py_NewRef(*slot); + } + return NULL; +} + +void +_PyFunction_ClearByVersionCache(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + for (int i = 0; i < FUNC_VERSION_CACHE_SIZE; i++) { + PyFunctionObject **slot = interp->func_state.func_version_cache + i; + Py_CLEAR(*slot); + } +} + +uint32_t +_PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { if (func->func_version != 0) { return func->func_version; @@ -236,7 +273,7 @@ uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) return 0; } uint32_t v = interp->func_state.next_version++; - func->func_version = v; + _PyFunction_SetVersion(func, v); return v; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index cbbd6d48daae75..f1826e64313501 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3542,7 +3542,8 @@ dummy_func( goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 4ace06857eb424..727b4008f997d2 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2605,7 +2605,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; break; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index a42457d05faf03..d856884d2fcf22 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4561,7 +4561,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; DISPATCH(); diff --git a/Python/sysmodule.c b/Python/sysmodule.c index be026d95ba7e77..f6511fd7b0c9dc 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2079,6 +2079,8 @@ sys__clear_type_cache_impl(PyObject *module) /*[clinic end generated code: output=20e48ca54a6f6971 input=127f3e04a8d9b555]*/ { PyType_ClearCache(); + // Also clear the function-by-version cache + _PyFunction_ClearByVersionCache(); Py_RETURN_NONE; } From bb514e6b913eee836d66a97056909e79bfe88c7b Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 8 Aug 2023 14:51:40 -0700 Subject: [PATCH 19/26] Double max trace length to 64 --- Include/internal/pycore_uops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Include/internal/pycore_uops.h b/Include/internal/pycore_uops.h index 57a5970353b360..254eeca2361bea 100644 --- a/Include/internal/pycore_uops.h +++ b/Include/internal/pycore_uops.h @@ -8,7 +8,7 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif -#define _Py_UOP_MAX_TRACE_LENGTH 32 +#define _Py_UOP_MAX_TRACE_LENGTH 64 typedef struct { uint32_t opcode; From 61958fc51a5f8836ba806861dceadce6cbd1f8a6 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 8 Aug 2023 14:54:46 -0700 Subject: [PATCH 20/26] Trace into function calls - This uses the function-by-version cache I just added - There's not yet a way to trace back via `RETURN_VALUE` --- Python/optimizer.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Python/optimizer.c b/Python/optimizer.c index dbd5142b9101e9..25e375de84e2cf 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -444,6 +444,7 @@ translate_bytecode_to_trace( code->co_firstlineno, 2 * INSTR_IP(initial_instr, code)); +top: // Jump here after _PUSH_FRAME for (;;) { RESERVE_RAW(2, "epilogue"); // Always need space for SAVE_IP and EXIT_TRACE ADD_TO_TRACE(SAVE_IP, INSTR_IP(instr, code), 0); @@ -617,6 +618,34 @@ translate_bytecode_to_trace( ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); if (expansion->uops[i].uop == _PUSH_FRAME) { assert(i + 1 == nuops); + int func_version_offset = + offsetof(_PyCallCache, func_version)/sizeof(_Py_CODEUNIT) + // Add one to account for the actual opcode/oparg pair: + + 1; + uint32_t func_version = read_u32(&instr[func_version_offset].cache); + PyFunctionObject *func = _PyFunction_LookupByVersion(func_version); + DPRINTF(3, "Function object: %p\n", func); + if (func != NULL) { + PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(func); + if (new_code == code) { + // Recursive call, bail (we could be here forever). + DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n", + PyUnicode_AsUTF8(new_code->co_qualname), + PyUnicode_AsUTF8(new_code->co_filename), + new_code->co_firstlineno); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } + code = new_code; + initial_instr = instr = _PyCode_CODE(code); + DPRINTF(2, + "Continuing in %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * INSTR_IP(initial_instr, code)); + goto top; + } ADD_TO_TRACE(SAVE_IP, 0, 0); goto done; } From 61e636b661c03c4bee5e892a7bcfdbdb93027aba Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 18:40:41 -0700 Subject: [PATCH 21/26] Make RESUME a viable uop --- Include/internal/pycore_opcode_metadata.h | 1 + Lib/test/test_capi/test_misc.py | 1 + Python/bytecodes.c | 5 ++++- Python/executor_cases.c.h | 18 ++++++++++++++++++ Python/generated_cases.c.h | 5 ++++- Tools/cases_generator/flags.py | 2 +- Tools/cases_generator/instructions.py | 4 ++-- 7 files changed, 31 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 3ebbf3fcc8c8a2..7944a67e8d5467 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1237,6 +1237,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { }; const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPANSION_SIZE] = { [NOP] = { .nuops = 1, .uops = { { NOP, 0, 0 } } }, + [RESUME] = { .nuops = 1, .uops = { { RESUME, 0, 0 } } }, [LOAD_FAST_CHECK] = { .nuops = 1, .uops = { { LOAD_FAST_CHECK, 0, 0 } } }, [LOAD_FAST] = { .nuops = 1, .uops = { { LOAD_FAST, 0, 0 } } }, [LOAD_FAST_AND_CLEAR] = { .nuops = 1, .uops = { { LOAD_FAST_AND_CLEAR, 0, 0 } } }, diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 36b8e1fa993b98..ea4c4ffc3a8f11 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2633,6 +2633,7 @@ def dummy(x): self.assertIsNotNone(ex) uops = {opname for opname, _, _ in ex} self.assertIn("_PUSH_FRAME", uops) + self.assertIn("_BINARY_OP_ADD_INT", uops) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index f1826e64313501..536ee09af9798a 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -135,6 +135,7 @@ dummy_func( } inst(RESUME, (--)) { + #if TIER_ONE assert(tstate->cframe == &cframe); assert(frame == cframe.current_frame); /* Possibly combine this with eval breaker */ @@ -143,7 +144,9 @@ dummy_func( ERROR_IF(err, error); next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 727b4008f997d2..ba93ef0fac3753 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -7,6 +7,24 @@ break; } + case RESUME: { + #if TIER_ONE + assert(tstate->cframe == &cframe); + assert(frame == cframe.current_frame); + /* Possibly combine this with eval breaker */ + if (_PyFrame_GetCode(frame)->_co_instrumentation_version != tstate->interp->monitoring_version) { + int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + if (err) goto error; + next_instr--; + } + else + #endif + if (oparg < 2) { + CHECK_EVAL_BREAKER(); + } + break; + } + case LOAD_FAST_CHECK: { PyObject *value; value = GETLOCAL(oparg); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index d856884d2fcf22..77ea51388617e8 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -8,6 +8,7 @@ } TARGET(RESUME) { + #if TIER_ONE assert(tstate->cframe == &cframe); assert(frame == cframe.current_frame); /* Possibly combine this with eval breaker */ @@ -16,7 +17,9 @@ if (err) goto error; next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } DISPATCH(); diff --git a/Tools/cases_generator/flags.py b/Tools/cases_generator/flags.py index f7ebdeb0d65677..962f003b194dbd 100644 --- a/Tools/cases_generator/flags.py +++ b/Tools/cases_generator/flags.py @@ -92,7 +92,7 @@ def variable_used_unspecialized(node: parsing.Node, name: str) -> bool: if text == "#if": if ( i + 1 < len(node.tokens) - and node.tokens[i + 1].text == "ENABLE_SPECIALIZATION" + and node.tokens[i + 1].text in ("ENABLE_SPECIALIZATION", "TIER_ONE") ): skipping = True elif text in ("#else", "#endif"): diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 6c52cf3e8a3d09..260494ec612f8f 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -122,8 +122,8 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - if "PUSH_FRAME" in self.name: - dprint = print + # if "RESUME" in self.name: + # dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay From d0c6f865fc5173cae68619c44d4c47497c203307 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 12:57:13 -0700 Subject: [PATCH 22/26] Clear func_version_cache in interpreter_clear() This should fix leaks and hopefully most failing tests. --- Include/internal/pycore_function.h | 2 +- Objects/funcobject.c | 3 +-- Python/pystate.c | 3 +++ Python/sysmodule.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index d059c3731b435f..cb3f856410d0d1 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -30,7 +30,7 @@ extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr) extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); -void _PyFunction_ClearByVersionCache(void); +void _PyFunction_ClearByVersionCache(PyInterpreterState *interp); extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 10655fd3def4f3..82c358301aa8bf 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -250,9 +250,8 @@ _PyFunction_LookupByVersion(uint32_t version) } void -_PyFunction_ClearByVersionCache(void) +_PyFunction_ClearByVersionCache(PyInterpreterState *interp) { - PyInterpreterState *interp = _PyInterpreterState_GET(); for (int i = 0; i < FUNC_VERSION_CACHE_SIZE; i++) { PyFunctionObject **slot = interp->func_state.func_version_cache + i; Py_CLEAR(*slot); diff --git a/Python/pystate.c b/Python/pystate.c index 3a05cb0fa7988d..321abe7a6c633b 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -840,6 +840,9 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) */ // XXX Make sure we properly deal with problematic finalizers. + interp->func_state.next_version = 0; // No more new versions + _PyFunction_ClearByVersionCache(interp); + Py_CLEAR(interp->audit_hooks); for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { diff --git a/Python/sysmodule.c b/Python/sysmodule.c index f6511fd7b0c9dc..f8cb8e6d61a048 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2080,7 +2080,8 @@ sys__clear_type_cache_impl(PyObject *module) { PyType_ClearCache(); // Also clear the function-by-version cache - _PyFunction_ClearByVersionCache(); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyFunction_ClearByVersionCache(interp); Py_RETURN_NONE; } From bfa556662cdd0e2f943ae312ae665ea1c6f8c1db Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 14:12:04 -0700 Subject: [PATCH 23/26] Add a small essay on function and code versions --- Objects/funcobject.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 82c358301aa8bf..6a1d38da169163 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -223,6 +223,48 @@ PyFunction_NewWithQualName(PyObject *code, PyObject *globals, PyObject *qualname return NULL; } +/* +Function versions +----------------- + +Function versions are used to detect when a function object has been +updated, invalidating inline cache data used by the `CALL` bytecode +(notably `CALL_PY_EXACT_ARGS` and a few other `CALL` specializations). + +They are also used by the Tier 2 superblock creation code to find +the function being called (and from there the code object). + +How does a function's `func_version` field get initialized? + +- `PyFunction_New` and friends initialize it to 0. +- The `MAKE_FUNCTION` instruction sets it from the code's `co_version`. +- It is reset to 0 when various attributes like `__code__` are set. +- A new version is allocated by `_PyFunction_GetVersionForCurrentState` + when the specializer needs a version and the version is 0. + +The latter allocates versions using a counter in the interpreter state; +when the counter wraps around to 0, no more versions are allocated. +There is one other special case: functions with a non-standard +`vectorcall` field are not given a version. + +When the function version is 0, the `CALL` bytecode is not specialized. + +Code object versions +-------------------- + +So where to code objects get their `co_version`? There is a single +static global counter, `_Py_next_func_version`. This is initialized in +the generated (!) file `Python/deepfreeze/deepfreeze.c`, to 1 plus the +number of deep-frozen function objects in that file. +(In `_bootstrap_python.c` and `freeze_module.c` it is initialized to 1.) + +Code objects get a new `co_version` allocated from this counter upon +creation. Since code objects are nominally immutable, `co_version` can +not be invalidated. The only way it can be 0 is when 2**32 or more +code objects have been created during the process's lifetime. +(The counter isn't reset by `fork()`, extending the lifetime.) +*/ + void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) { From 597b12eb0bddae8eb06b564a9efd0d898658f6e6 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 14:49:36 -0700 Subject: [PATCH 24/26] Move function cache clearing earlier in finalization --- Python/pylifecycle.c | 5 +++++ Python/pystate.c | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0de3abf9407899..b3319e81c89e96 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1767,6 +1767,11 @@ Py_FinalizeEx(void) // XXX assert(_Py_IsMainInterpreter(tstate->interp)); // XXX assert(_Py_IsMainThread()); + // The function version cache keeps functions alive, so clear it. + // It's used for optimizations, which we don't need in this stage. + tstate->interp->func_state.next_version = 0; // No more new versions + _PyFunction_ClearByVersionCache(tstate->interp); + // Block some operations. tstate->interp->finalizing = 1; diff --git a/Python/pystate.c b/Python/pystate.c index 321abe7a6c633b..3a05cb0fa7988d 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -840,9 +840,6 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) */ // XXX Make sure we properly deal with problematic finalizers. - interp->func_state.next_version = 0; // No more new versions - _PyFunction_ClearByVersionCache(interp); - Py_CLEAR(interp->audit_hooks); for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { From d7b7e01d07f69b6deb0e7e32852409a18ae9bf00 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 18:56:14 -0700 Subject: [PATCH 25/26] Cache borrowed references, cleared in func_dealloc --- Include/internal/pycore_function.h | 8 ++++---- Objects/funcobject.c | 33 ++++++++++++++---------------- Python/pylifecycle.c | 5 ----- Python/sysmodule.c | 3 --- 4 files changed, 19 insertions(+), 30 deletions(-) diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index cb3f856410d0d1..3f3da8a44b77e4 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -19,9 +19,10 @@ extern PyObject* _PyFunction_Vectorcall( #define FUNC_VERSION_CACHE_SIZE (1<<12) /* Must be a power of 2 */ struct _py_func_state { uint32_t next_version; - // Function objects whose func_version % FUNC_VERSION_CACHE_SIZE - // once equaled the index in the table. The references are owned: - // Call _PyFunction_ClearByVersionCache() to clear. + // Borrowed references to function objects whose + // func_version % FUNC_VERSION_CACHE_SIZE + // once was equal to the index in the table. + // They are cleared when the function is deallocated. PyFunctionObject *func_version_cache[FUNC_VERSION_CACHE_SIZE]; }; @@ -30,7 +31,6 @@ extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr) extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); -void _PyFunction_ClearByVersionCache(PyInterpreterState *interp); extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 6a1d38da169163..33191d23f18230 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -271,10 +271,8 @@ _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) func->func_version = version; if (version != 0) { PyInterpreterState *interp = _PyInterpreterState_GET(); - PyFunctionObject **slot = - interp->func_state.func_version_cache - + (version % FUNC_VERSION_CACHE_SIZE); - Py_XSETREF(*slot, (PyFunctionObject *)Py_NewRef(func)); + interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE] = func; } } @@ -282,24 +280,14 @@ PyFunctionObject * _PyFunction_LookupByVersion(uint32_t version) { PyInterpreterState *interp = _PyInterpreterState_GET(); - PyFunctionObject **slot = - interp->func_state.func_version_cache - + (version % FUNC_VERSION_CACHE_SIZE); - if (*slot != NULL && (*slot)->func_version == version) { - return (PyFunctionObject *)Py_NewRef(*slot); + PyFunctionObject *func = interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE]; + if (func != NULL && func->func_version == version) { + return (PyFunctionObject *)Py_NewRef(func); } return NULL; } -void -_PyFunction_ClearByVersionCache(PyInterpreterState *interp) -{ - for (int i = 0; i < FUNC_VERSION_CACHE_SIZE; i++) { - PyFunctionObject **slot = interp->func_state.func_version_cache + i; - Py_CLEAR(*slot); - } -} - uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { @@ -929,6 +917,15 @@ func_dealloc(PyFunctionObject *op) if (op->func_weakreflist != NULL) { PyObject_ClearWeakRefs((PyObject *) op); } + if (op->func_version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (op->func_version % FUNC_VERSION_CACHE_SIZE); + if (*slot == op) { + *slot = NULL; + } + } (void)func_clear(op); // These aren't cleared by func_clear(). Py_DECREF(op->func_code); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index b3319e81c89e96..0de3abf9407899 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1767,11 +1767,6 @@ Py_FinalizeEx(void) // XXX assert(_Py_IsMainInterpreter(tstate->interp)); // XXX assert(_Py_IsMainThread()); - // The function version cache keeps functions alive, so clear it. - // It's used for optimizations, which we don't need in this stage. - tstate->interp->func_state.next_version = 0; // No more new versions - _PyFunction_ClearByVersionCache(tstate->interp); - // Block some operations. tstate->interp->finalizing = 1; diff --git a/Python/sysmodule.c b/Python/sysmodule.c index f8cb8e6d61a048..be026d95ba7e77 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2079,9 +2079,6 @@ sys__clear_type_cache_impl(PyObject *module) /*[clinic end generated code: output=20e48ca54a6f6971 input=127f3e04a8d9b555]*/ { PyType_ClearCache(); - // Also clear the function-by-version cache - PyInterpreterState *interp = _PyInterpreterState_GET(); - _PyFunction_ClearByVersionCache(interp); Py_RETURN_NONE; } From da4c7ad2815d4e14b4a08bf0a526c3fc13e59def Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 12:53:14 -0700 Subject: [PATCH 26/26] DO NOT MERGE: Always use -Xuops --- Python/pylifecycle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0de3abf9407899..54889ea87fec91 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1191,6 +1191,7 @@ init_interp_main(PyThreadState *tstate) if (_Py_get_xoption(&config->xoptions, L"uops") != NULL) { enabled = 1; } + enabled = 1; // TEMPORARY: always enable if (enabled) { PyObject *opt = PyUnstable_Optimizer_NewUOpOptimizer(); if (opt == NULL) { pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy