diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h
index df9b4184e2a50e..9fcf48fa71f88f 100644
--- a/Include/internal/pycore_opcode_metadata.h
+++ b/Include/internal/pycore_opcode_metadata.h
@@ -52,10 +52,16 @@
 #define _ITER_CHECK_RANGE 328
 #define _IS_ITER_EXHAUSTED_RANGE 329
 #define _ITER_NEXT_RANGE 330
-#define _POP_JUMP_IF_FALSE 331
-#define _POP_JUMP_IF_TRUE 332
-#define JUMP_TO_TOP 333
-#define INSERT 334
+#define _CHECK_PEP_523 331
+#define _CHECK_FUNCTION_EXACT_ARGS 332
+#define _CHECK_STACK_SPACE 333
+#define _INIT_CALL_PY_EXACT_ARGS 334
+#define _PUSH_FRAME 335
+#define _POP_JUMP_IF_FALSE 336
+#define _POP_JUMP_IF_TRUE 337
+#define JUMP_TO_TOP 338
+#define SAVE_CURRENT_IP 339
+#define INSERT 340
 
 extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump);
 #ifdef NEED_OPCODE_METADATA
@@ -473,6 +479,16 @@ int _PyOpcode_num_popped(int opcode, int oparg, bool jump) {
             return oparg + 2;
         case CALL_BOUND_METHOD_EXACT_ARGS:
             return oparg + 2;
+        case _CHECK_PEP_523:
+            return 0;
+        case _CHECK_FUNCTION_EXACT_ARGS:
+            return oparg + 2;
+        case _CHECK_STACK_SPACE:
+            return oparg + 2;
+        case _INIT_CALL_PY_EXACT_ARGS:
+            return oparg + 2;
+        case _PUSH_FRAME:
+            return 1;
         case CALL_PY_EXACT_ARGS:
             return oparg + 2;
         case CALL_PY_WITH_DEFAULTS:
@@ -561,6 +577,8 @@ int _PyOpcode_num_popped(int opcode, int oparg, bool jump) {
             return 0;
         case SAVE_IP:
             return 0;
+        case SAVE_CURRENT_IP:
+            return 0;
         case EXIT_TRACE:
             return 0;
         case INSERT:
@@ -987,6 +1005,16 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
             return 1;
         case CALL_BOUND_METHOD_EXACT_ARGS:
             return 1;
+        case _CHECK_PEP_523:
+            return 0;
+        case _CHECK_FUNCTION_EXACT_ARGS:
+            return oparg + 2;
+        case _CHECK_STACK_SPACE:
+            return oparg + 2;
+        case _INIT_CALL_PY_EXACT_ARGS:
+            return 1;
+        case _PUSH_FRAME:
+            return 1;
         case CALL_PY_EXACT_ARGS:
             return 1;
         case CALL_PY_WITH_DEFAULTS:
@@ -1075,6 +1103,8 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
             return 0;
         case SAVE_IP:
             return 0;
+        case SAVE_CURRENT_IP:
+            return 0;
         case EXIT_TRACE:
             return 0;
         case INSERT:
@@ -1088,6 +1118,7 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
 enum InstructionFormat {
     INSTR_FMT_IB,
     INSTR_FMT_IBC,
+    INSTR_FMT_IBC0,
     INSTR_FMT_IBC00,
     INSTR_FMT_IBC000,
     INSTR_FMT_IBC00000000,
@@ -1132,6 +1163,7 @@ struct opcode_macro_expansion {
 #define OPARG_CACHE_4 4
 #define OPARG_TOP 5
 #define OPARG_BOTTOM 6
+#define OPARG_SAVE_IP 7
 
 #define OPCODE_METADATA_FMT(OP) (_PyOpcode_opcode_metadata[(OP)].instr_format)
 #define SAME_OPCODE_METADATA(OP1, OP2) \
@@ -1474,6 +1506,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN
     [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } },
     [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } },
     [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } },
+    [CALL_PY_EXACT_ARGS] = { .nuops = 7, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_FUNCTION_EXACT_ARGS, 2, 1 }, { _CHECK_STACK_SPACE, 0, 0 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { SAVE_CURRENT_IP, 0, 0 }, { _PUSH_FRAME, 0, 0 } } },
     [CALL_NO_KW_TYPE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TYPE_1, 0, 0 } } },
     [CALL_NO_KW_STR_1] = { .nuops = 1, .uops = { { CALL_NO_KW_STR_1, 0, 0 } } },
     [CALL_NO_KW_TUPLE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TUPLE_1, 0, 0 } } },
@@ -1531,9 +1564,15 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = {
     [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE",
     [_IS_ITER_EXHAUSTED_RANGE] = "_IS_ITER_EXHAUSTED_RANGE",
     [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE",
+    [_CHECK_PEP_523] = "_CHECK_PEP_523",
+    [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS",
+    [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE",
+    [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS",
+    [_PUSH_FRAME] = "_PUSH_FRAME",
     [_POP_JUMP_IF_FALSE] = "_POP_JUMP_IF_FALSE",
     [_POP_JUMP_IF_TRUE] = "_POP_JUMP_IF_TRUE",
     [JUMP_TO_TOP] = "JUMP_TO_TOP",
+    [SAVE_CURRENT_IP] = "SAVE_CURRENT_IP",
     [INSERT] = "INSERT",
 };
 #endif // NEED_OPCODE_METADATA
diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py
index c81212202d9ef2..3dfbfdc26e7416 100644
--- a/Lib/test/test_capi/test_misc.py
+++ b/Lib/test/test_capi/test_misc.py
@@ -2618,6 +2618,23 @@ def testfunc(it):
         with self.assertRaises(StopIteration):
             next(it)
 
+    def test_call_py_exact_args(self):
+        def testfunc(n):
+            def dummy(x):
+                return x+1
+            for i in range(n):
+                dummy(i)
+
+        opt = _testinternalcapi.get_uop_optimizer()
+        with temporary_optimizer(opt):
+            testfunc(10)
+
+        ex = get_first_executor(testfunc)
+        self.assertIsNotNone(ex)
+        uops = {opname for opname, _, _ in ex}
+        self.assertIn("_PUSH_FRAME", uops)
+
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Python/abstract_interp_cases.c.h b/Python/abstract_interp_cases.c.h
index 6bfcf534646b1e..eef071119bcd84 100644
--- a/Python/abstract_interp_cases.c.h
+++ b/Python/abstract_interp_cases.c.h
@@ -612,6 +612,30 @@
             break;
         }
 
+        case _CHECK_PEP_523: {
+            break;
+        }
+
+        case _CHECK_FUNCTION_EXACT_ARGS: {
+            break;
+        }
+
+        case _CHECK_STACK_SPACE: {
+            break;
+        }
+
+        case _INIT_CALL_PY_EXACT_ARGS: {
+            STACK_SHRINK(oparg);
+            STACK_SHRINK(1);
+            PARTITIONNODE_OVERWRITE((_Py_PARTITIONNODE_t *)PARTITIONNODE_NULLROOT, PEEK(-(-1)), true);
+            break;
+        }
+
+        case _PUSH_FRAME: {
+            PARTITIONNODE_OVERWRITE((_Py_PARTITIONNODE_t *)PARTITIONNODE_NULLROOT, PEEK(-(-1)), true);
+            break;
+        }
+
         case CALL_NO_KW_TYPE_1: {
             STACK_SHRINK(oparg);
             STACK_SHRINK(1);
@@ -751,6 +775,10 @@
             break;
         }
 
+        case SAVE_CURRENT_IP: {
+            break;
+        }
+
         case EXIT_TRACE: {
             break;
         }
diff --git a/Python/bytecodes.c b/Python/bytecodes.c
index e9a5cf59e7d689..9d28c4ce5f2837 100644
--- a/Python/bytecodes.c
+++ b/Python/bytecodes.c
@@ -957,13 +957,13 @@ dummy_func(
             {
                 PyGenObject *gen = (PyGenObject *)receiver;
                 _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
-                frame->return_offset = oparg;
                 STACK_SHRINK(1);
                 _PyFrame_StackPush(gen_frame, v);
                 gen->gi_frame_state = FRAME_EXECUTING;
                 gen->gi_exc_state.previous_item = tstate->exc_info;
                 tstate->exc_info = &gen->gi_exc_state;
                 SKIP_OVER(INLINE_CACHE_ENTRIES_SEND);
+                frame->return_offset = oparg;
                 DISPATCH_INLINED(gen_frame);
             }
             if (Py_IsNone(v) && PyIter_Check(receiver)) {
@@ -996,13 +996,13 @@ dummy_func(
             DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND);
             STAT_INC(SEND, hit);
             _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
-            frame->return_offset = oparg;
             STACK_SHRINK(1);
             _PyFrame_StackPush(gen_frame, v);
             gen->gi_frame_state = FRAME_EXECUTING;
             gen->gi_exc_state.previous_item = tstate->exc_info;
             tstate->exc_info = &gen->gi_exc_state;
             SKIP_OVER(INLINE_CACHE_ENTRIES_SEND);
+            frame->return_offset = oparg;
             DISPATCH_INLINED(gen_frame);
         }
 
@@ -2588,7 +2588,6 @@ dummy_func(
             DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER);
             STAT_INC(FOR_ITER, hit);
             _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
-            frame->return_offset = oparg;
             _PyFrame_StackPush(gen_frame, Py_None);
             gen->gi_frame_state = FRAME_EXECUTING;
             gen->gi_exc_state.previous_item = tstate->exc_info;
@@ -2596,6 +2595,7 @@ dummy_func(
             SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER);
             assert(next_instr[oparg].op.code == END_FOR ||
                    next_instr[oparg].op.code == INSTRUMENTED_END_FOR);
+            frame->return_offset = oparg;
             DISPATCH_INLINED(gen_frame);
         }
 
@@ -2950,32 +2950,72 @@ dummy_func(
             GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS);
         }
 
-        inst(CALL_PY_EXACT_ARGS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) {
-            ASSERT_KWNAMES_IS_NULL();
+        op(_CHECK_PEP_523, (--)) {
             DEOPT_IF(tstate->interp->eval_frame, CALL);
-            int argcount = oparg;
-            if (self_or_null != NULL) {
-                args--;
-                argcount++;
-            }
+        }
+
+        op(_CHECK_FUNCTION_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) {
+            ASSERT_KWNAMES_IS_NULL();
             DEOPT_IF(!PyFunction_Check(callable), CALL);
             PyFunctionObject *func = (PyFunctionObject *)callable;
             DEOPT_IF(func->func_version != func_version, CALL);
             PyCodeObject *code = (PyCodeObject *)func->func_code;
-            DEOPT_IF(code->co_argcount != argcount, CALL);
+            DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL);
+        }
+
+        op(_CHECK_STACK_SPACE, (callable, unused, unused[oparg] -- callable, unused, unused[oparg])) {
+            PyFunctionObject *func = (PyFunctionObject *)callable;
+            PyCodeObject *code = (PyCodeObject *)func->func_code;
             DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL);
+        }
+
+        op(_INIT_CALL_PY_EXACT_ARGS, (callable, self_or_null, args[oparg] -- new_frame: _PyInterpreterFrame*)) {
+            int argcount = oparg;
+            if (self_or_null != NULL) {
+                args--;
+                argcount++;
+            }
             STAT_INC(CALL, hit);
-            _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount);
+            PyFunctionObject *func = (PyFunctionObject *)callable;
+            new_frame = _PyFrame_PushUnchecked(tstate, func, argcount);
             for (int i = 0; i < argcount; i++) {
                 new_frame->localsplus[i] = args[i];
             }
-            // Manipulate stack directly since we leave using DISPATCH_INLINED().
-            STACK_SHRINK(oparg + 2);
-            SKIP_OVER(INLINE_CACHE_ENTRIES_CALL);
+        }
+
+        // The 'unused' output effect represents the return value
+        // (which will be pushed when the frame returns).
+        // It is needed so CALL_PY_EXACT_ARGS matches its family.
+        op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) {
+            // Write it out explicitly because it's subtly different.
+            // Eventually this should be the only occurrence of this code.
             frame->return_offset = 0;
-            DISPATCH_INLINED(new_frame);
+            assert(tstate->interp->eval_frame == NULL);
+            _PyFrame_SetStackPointer(frame, stack_pointer);
+            new_frame->previous = frame;
+            CALL_STAT_INC(inlined_py_calls);
+            #if TIER_ONE
+            frame = cframe.current_frame = new_frame;
+            goto start_frame;
+            #endif
+            #if TIER_TWO
+            frame = tstate->cframe->current_frame = new_frame;
+            ERROR_IF(_Py_EnterRecursivePy(tstate), exit_unwind);
+            stack_pointer = _PyFrame_GetStackPointer(frame);
+            ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;
+            #endif
         }
 
+        macro(CALL_PY_EXACT_ARGS) =
+            unused/1 + // Skip over the counter
+            _CHECK_PEP_523 +
+            _CHECK_FUNCTION_EXACT_ARGS +
+            _CHECK_STACK_SPACE +
+            _INIT_CALL_PY_EXACT_ARGS +
+            SAVE_IP +  // Tier 2 only; special-cased oparg
+            SAVE_CURRENT_IP +  // Sets frame->prev_instr
+            _PUSH_FRAME;
+
         inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) {
             ASSERT_KWNAMES_IS_NULL();
             DEOPT_IF(tstate->interp->eval_frame, CALL);
@@ -3736,6 +3776,16 @@ dummy_func(
             frame->prev_instr = ip_offset + oparg;
         }
 
+        op(SAVE_CURRENT_IP, (--)) {
+            #if TIER_ONE
+            frame->prev_instr = next_instr - 1;
+            #endif
+            #if TIER_TWO
+            // Relies on a preceding SAVE_IP
+            frame->prev_instr--;
+            #endif
+        }
+
         op(EXIT_TRACE, (--)) {
             frame->prev_instr--;  // Back up to just before destination
             _PyFrame_SetStackPointer(frame, stack_pointer);
diff --git a/Python/ceval.c b/Python/ceval.c
index b966399a342d08..26e741ed7c7547 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -602,11 +602,6 @@ int _Py_CheckRecursiveCallPy(
     return 0;
 }
 
-static inline int _Py_EnterRecursivePy(PyThreadState *tstate) {
-    return (tstate->py_recursion_remaining-- <= 0) &&
-        _Py_CheckRecursiveCallPy(tstate);
-}
-
 static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) {
     tstate->py_recursion_remaining++;
 }
@@ -770,6 +765,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 #endif
 
 {
+#define TIER_ONE 1
 #include "generated_cases.c.h"
 
     /* INSTRUMENTED_LINE has to be here, rather than in bytecodes.c,
diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h
index 8dc8b754485856..5e2db1e0b394e6 100644
--- a/Python/ceval_macros.h
+++ b/Python/ceval_macros.h
@@ -364,3 +364,8 @@ static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = {
 #else
 #define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL)
 #endif
+
+static inline int _Py_EnterRecursivePy(PyThreadState *tstate) {
+    return (tstate->py_recursion_remaining-- <= 0) &&
+        _Py_CheckRecursiveCallPy(tstate);
+}
diff --git a/Python/executor.c b/Python/executor.c
index 4a18618c0c6c0c..5a571e6da4673f 100644
--- a/Python/executor.c
+++ b/Python/executor.c
@@ -81,6 +81,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
     OBJECT_STAT_INC(optimization_uops_executed);
 
     switch (opcode) {
+#define TIER_TWO 2
 #include "executor_cases.c.h"
 
         default:
@@ -106,6 +107,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
 pop_2_error:
     STACK_SHRINK(1);
 pop_1_error:
+pop_1_exit_unwind:
     STACK_SHRINK(1);
 error:
     // On ERROR_IF we return NULL as the frame.
diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
index 85d27777423abd..b3dd3133530562 100644
--- a/Python/executor_cases.c.h
+++ b/Python/executor_cases.c.h
@@ -103,7 +103,6 @@
         }
 
         case TO_BOOL: {
-            static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size");
             PyObject *value;
             PyObject *res;
             value = stack_pointer[-1];
@@ -363,7 +362,6 @@
         }
 
         case BINARY_SUBSCR: {
-            static_assert(INLINE_CACHE_ENTRIES_BINARY_SUBSCR == 1, "incorrect cache size");
             PyObject *sub;
             PyObject *container;
             PyObject *res;
@@ -557,7 +555,6 @@
         }
 
         case STORE_SUBSCR: {
-            static_assert(INLINE_CACHE_ENTRIES_STORE_SUBSCR == 1, "incorrect cache size");
             PyObject *sub;
             PyObject *container;
             PyObject *v;
@@ -862,7 +859,6 @@
         }
 
        case UNPACK_SEQUENCE: {
-            static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size");
             PyObject *seq;
             seq = stack_pointer[-1];
             #if ENABLE_SPECIALIZATION
@@ -950,7 +946,6 @@
         }
 
         case STORE_ATTR: {
-            static_assert(INLINE_CACHE_ENTRIES_STORE_ATTR == 4, "incorrect cache size");
             PyObject *owner;
             PyObject *v;
             owner = stack_pointer[-1];
@@ -1061,7 +1056,6 @@
         }
 
         case LOAD_GLOBAL: {
-            static_assert(INLINE_CACHE_ENTRIES_LOAD_GLOBAL == 4, "incorrect cache size");
             PyObject *res;
             PyObject *null = NULL;
             #if ENABLE_SPECIALIZATION
@@ -1554,7 +1548,6 @@
         }
 
         case LOAD_ATTR: {
-            static_assert(INLINE_CACHE_ENTRIES_LOAD_ATTR == 9, "incorrect cache size");
             PyObject *owner;
             PyObject *attr;
             PyObject *self_or_null = NULL;
@@ -1650,7 +1643,6 @@
         }
 
         case COMPARE_OP: {
-            static_assert(INLINE_CACHE_ENTRIES_COMPARE_OP == 1, "incorrect cache size");
             PyObject *right;
             PyObject *left;
             PyObject *res;
@@ -2155,6 +2147,84 @@
             break;
         }
 
+        case _CHECK_PEP_523: {
+            DEOPT_IF(tstate->interp->eval_frame, CALL);
+            break;
+        }
+
+        case _CHECK_FUNCTION_EXACT_ARGS: {
+            PyObject *self_or_null;
+            PyObject *callable;
+            self_or_null = stack_pointer[-1 - oparg];
+            callable = stack_pointer[-2 - oparg];
+            uint32_t func_version = (uint32_t)operand;
+            ASSERT_KWNAMES_IS_NULL();
+            DEOPT_IF(!PyFunction_Check(callable), CALL);
+            PyFunctionObject *func = (PyFunctionObject *)callable;
+            DEOPT_IF(func->func_version != func_version, CALL);
+            PyCodeObject *code = (PyCodeObject *)func->func_code;
+            DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL);
+            break;
+        }
+
+        case _CHECK_STACK_SPACE: {
+            PyObject *callable;
+            callable = stack_pointer[-2 - oparg];
+            PyFunctionObject *func = (PyFunctionObject *)callable;
+            PyCodeObject *code = (PyCodeObject *)func->func_code;
+            DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL);
+            break;
+        }
+
+        case _INIT_CALL_PY_EXACT_ARGS: {
+            PyObject **args;
+            PyObject *self_or_null;
+            PyObject *callable;
+            _PyInterpreterFrame *new_frame;
+            args = stack_pointer - oparg;
+            self_or_null = stack_pointer[-1 - oparg];
+            callable = stack_pointer[-2 - oparg];
+            int argcount = oparg;
+            if (self_or_null != NULL) {
+                args--;
+                argcount++;
+            }
+            STAT_INC(CALL, hit);
+            PyFunctionObject *func = (PyFunctionObject *)callable;
+            new_frame = _PyFrame_PushUnchecked(tstate, func, argcount);
+            for (int i = 0; i < argcount; i++) {
+                new_frame->localsplus[i] = args[i];
+            }
+            STACK_SHRINK(oparg);
+            STACK_SHRINK(1);
+            stack_pointer[-1] = (PyObject *)new_frame;
+            break;
+        }
+
+        case _PUSH_FRAME: {
+            _PyInterpreterFrame *new_frame;
+            new_frame = (_PyInterpreterFrame *)stack_pointer[-1];
+            STACK_SHRINK(1);
+            // Write it out explicitly because it's subtly different.
+            // Eventually this should be the only occurrence of this code.
+            frame->return_offset = 0;
+            assert(tstate->interp->eval_frame == NULL);
+            _PyFrame_SetStackPointer(frame, stack_pointer);
+            new_frame->previous = frame;
+            CALL_STAT_INC(inlined_py_calls);
+            #if TIER_ONE
+            frame = cframe.current_frame = new_frame;
+            goto start_frame;
+            #endif
+            #if TIER_TWO
+            frame = tstate->cframe->current_frame = new_frame;
+            if (_Py_EnterRecursivePy(tstate)) goto pop_1_exit_unwind;
+            stack_pointer = _PyFrame_GetStackPointer(frame);
+            ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;
+            #endif
+            break;
+        }
+
         case CALL_NO_KW_TYPE_1: {
             PyObject **args;
             PyObject *null;
@@ -2656,7 +2726,6 @@
         }
 
         case BINARY_OP: {
-            static_assert(INLINE_CACHE_ENTRIES_BINARY_OP == 1, "incorrect cache size");
             PyObject *rhs;
             PyObject *lhs;
             PyObject *res;
@@ -2726,6 +2795,17 @@
             break;
         }
 
+        case SAVE_CURRENT_IP: {
+            #if TIER_ONE
+            frame->prev_instr = next_instr - 1;
+            #endif
+            #if TIER_TWO
+            // Relies on a preceding SAVE_IP
+            frame->prev_instr--;
+            #endif
+            break;
+        }
+
         case EXIT_TRACE: {
             frame->prev_instr--;  // Back up to just before destination
             _PyFrame_SetStackPointer(frame, stack_pointer);
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h
index 2661a39e047c4d..11d560a6e77adf 100644
--- a/Python/generated_cases.c.h
+++ b/Python/generated_cases.c.h
@@ -1191,13 +1191,13 @@
             {
                 PyGenObject *gen = (PyGenObject *)receiver;
                 _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
-                frame->return_offset = oparg;
                 STACK_SHRINK(1);
                 _PyFrame_StackPush(gen_frame, v);
                 gen->gi_frame_state = FRAME_EXECUTING;
                 gen->gi_exc_state.previous_item = tstate->exc_info;
                 tstate->exc_info = &gen->gi_exc_state;
                 SKIP_OVER(INLINE_CACHE_ENTRIES_SEND);
+                frame->return_offset = oparg;
                 DISPATCH_INLINED(gen_frame);
             }
             if (Py_IsNone(v) && PyIter_Check(receiver)) {
@@ -1237,13 +1237,13 @@
             DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND);
             STAT_INC(SEND, hit);
             _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
-            frame->return_offset = oparg;
             STACK_SHRINK(1);
             _PyFrame_StackPush(gen_frame, v);
             gen->gi_frame_state = FRAME_EXECUTING;
             gen->gi_exc_state.previous_item = tstate->exc_info;
             tstate->exc_info = &gen->gi_exc_state;
             SKIP_OVER(INLINE_CACHE_ENTRIES_SEND);
+            frame->return_offset = oparg;
             DISPATCH_INLINED(gen_frame);
         }
 
@@ -3343,7 +3343,6 @@
             DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER);
             STAT_INC(FOR_ITER, hit);
             _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
-            frame->return_offset = oparg;
             _PyFrame_StackPush(gen_frame, Py_None);
             gen->gi_frame_state = FRAME_EXECUTING;
             gen->gi_exc_state.previous_item = tstate->exc_info;
@@ -3351,6 +3350,7 @@
             SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER);
             assert(next_instr[oparg].op.code == END_FOR ||
                    next_instr[oparg].op.code == INSTRUMENTED_END_FOR);
+            frame->return_offset = oparg;
             DISPATCH_INLINED(gen_frame);
             STACK_GROW(1);
         }
@@ -3764,38 +3764,83 @@
 
         TARGET(CALL_PY_EXACT_ARGS) {
             PREDICTED(CALL_PY_EXACT_ARGS);
-            PyObject **args;
             PyObject *self_or_null;
             PyObject *callable;
-            args = stack_pointer - oparg;
+            PyObject **args;
+            _PyInterpreterFrame *new_frame;
+            // _CHECK_PEP_523
+            {
+                DEOPT_IF(tstate->interp->eval_frame, CALL);
+            }
+            // _CHECK_FUNCTION_EXACT_ARGS
             self_or_null = stack_pointer[-1 - oparg];
             callable = stack_pointer[-2 - oparg];
-            uint32_t func_version = read_u32(&next_instr[1].cache);
-            ASSERT_KWNAMES_IS_NULL();
-            DEOPT_IF(tstate->interp->eval_frame, CALL);
-            int argcount = oparg;
-            if (self_or_null != NULL) {
-                args--;
-                argcount++;
+            {
+                uint32_t func_version = read_u32(&next_instr[1].cache);
+                ASSERT_KWNAMES_IS_NULL();
+                DEOPT_IF(!PyFunction_Check(callable), CALL);
+                PyFunctionObject *func = (PyFunctionObject *)callable;
+                DEOPT_IF(func->func_version != func_version, CALL);
+                PyCodeObject *code = (PyCodeObject *)func->func_code;
+                DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL);
+            }
+            // _CHECK_STACK_SPACE
+            callable = stack_pointer[-2 - oparg];
+            {
+                PyFunctionObject *func = (PyFunctionObject *)callable;
+                PyCodeObject *code = (PyCodeObject *)func->func_code;
+                DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL);
             }
-            DEOPT_IF(!PyFunction_Check(callable), CALL);
-            PyFunctionObject *func = (PyFunctionObject *)callable;
-            DEOPT_IF(func->func_version != func_version, CALL);
-            PyCodeObject *code = (PyCodeObject *)func->func_code;
-            DEOPT_IF(code->co_argcount != argcount, CALL);
-            DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL);
-            STAT_INC(CALL, hit);
-            _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount);
-            for (int i = 0; i < argcount; i++) {
-                new_frame->localsplus[i] = args[i];
+            // _INIT_CALL_PY_EXACT_ARGS
+            args = stack_pointer - oparg;
+            self_or_null = stack_pointer[-1 - oparg];
+            callable = stack_pointer[-2 - oparg];
+            {
+                int argcount = oparg;
+                if (self_or_null != NULL) {
+                    args--;
+                    argcount++;
+                }
+                STAT_INC(CALL, hit);
+                PyFunctionObject *func = (PyFunctionObject *)callable;
+                new_frame = _PyFrame_PushUnchecked(tstate, func, argcount);
+                for (int i = 0; i < argcount; i++) {
+                    new_frame->localsplus[i] = args[i];
+                }
             }
-            // Manipulate stack directly since we leave using DISPATCH_INLINED().
-            STACK_SHRINK(oparg + 2);
-            SKIP_OVER(INLINE_CACHE_ENTRIES_CALL);
-            frame->return_offset = 0;
-            DISPATCH_INLINED(new_frame);
+            // SAVE_CURRENT_IP
+            next_instr += 3;
+            {
+                #if TIER_ONE
+                frame->prev_instr = next_instr - 1;
+                #endif
+                #if TIER_TWO
+                // Relies on a preceding SAVE_IP
+                frame->prev_instr--;
+                #endif
+            }
+            // _PUSH_FRAME
             STACK_SHRINK(oparg);
-            STACK_SHRINK(1);
+            STACK_SHRINK(2);
+            {
+                // Write it out explicitly because it's subtly different.
+                // Eventually this should be the only occurrence of this code.
+                frame->return_offset = 0;
+                assert(tstate->interp->eval_frame == NULL);
+                _PyFrame_SetStackPointer(frame, stack_pointer);
+                new_frame->previous = frame;
+                CALL_STAT_INC(inlined_py_calls);
+                #if TIER_ONE
+                frame = cframe.current_frame = new_frame;
+                goto start_frame;
+                #endif
+                #if TIER_TWO
+                frame = tstate->cframe->current_frame = new_frame;
+                if (_Py_EnterRecursivePy(tstate)) goto pop_1_exit_unwind;
+                stack_pointer = _PyFrame_GetStackPointer(frame);
+                ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;
+                #endif
+            }
         }
 
         TARGET(CALL_PY_WITH_DEFAULTS) {
diff --git a/Python/optimizer.c b/Python/optimizer.c
index d3ac2424038ef9..559c4ae987263e 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -606,6 +606,10 @@ translate_bytecode_to_trace(
                     case OPARG_BOTTOM:  // Second half of super-instr
                         oparg = orig_oparg & 0xF;
                         break;
+                    case OPARG_SAVE_IP:  // op==SAVE_IP; oparg=next instr
+                        oparg = INSTR_IP(instr + offset, code);
+                        break;
+
                     default:
                         fprintf(stderr,
                                 "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n",
@@ -615,6 +619,11 @@ translate_bytecode_to_trace(
                         Py_FatalError("garbled expansion");
                 }
                 ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand);
+                if (expansion->uops[i].uop == _PUSH_FRAME) {
+                    assert(i + 1 == nuops);
+                    ADD_TO_TRACE(SAVE_IP, 0, 0);
+                    goto done;
+                }
             }
             break;
         }
diff --git a/Tools/cases_generator/flags.py b/Tools/cases_generator/flags.py
index f7ebdeb0d65677..962f003b194dbd 100644
--- a/Tools/cases_generator/flags.py
+++ b/Tools/cases_generator/flags.py
@@ -92,7 +92,7 @@ def variable_used_unspecialized(node: parsing.Node, name: str) -> bool:
             if text == "#if":
                 if (
                     i + 1 < len(node.tokens)
-                    and node.tokens[i + 1].text == "ENABLE_SPECIALIZATION"
+                    and node.tokens[i + 1].text in ("ENABLE_SPECIALIZATION", "TIER_ONE")
                 ):
                     skipping = True
             elif text in ("#else", "#endif"):
diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py
index e170e110f80cfb..8c29f29ae92cf6 100644
--- a/Tools/cases_generator/generate_cases.py
+++ b/Tools/cases_generator/generate_cases.py
@@ -24,6 +24,7 @@
     PseudoInstruction,
     StackEffect,
     OverriddenInstructionPlaceHolder,
+    TIER_ONE,
     TIER_TWO,
 )
 import parsing
@@ -58,6 +59,7 @@
     "OPARG_CACHE_4": 4,
     "OPARG_TOP": 5,
     "OPARG_BOTTOM": 6,
+    "OPARG_SAVE_IP": 7,
 }
 
 INSTR_FMT_PREFIX = "INSTR_FMT_"
@@ -357,7 +359,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No
                     if instr.kind == "inst" and instr.is_viable_uop():
                         # Construct a dummy Component -- input/output mappings are not used
                         part = Component(instr, instr.active_caches)
-                        self.write_macro_expansions(instr.name, [part])
+                        self.write_macro_expansions(
+                            instr.name, [part], instr.cache_offset
+                        )
                     elif instr.kind == "inst" and variable_used(
                         instr.inst, "oparg1"
                     ):
@@ -367,7 +371,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No
                         self.write_super_expansions(instr.name)
                 case parsing.Macro():
                     mac = self.macro_instrs[thing.name]
-                    self.write_macro_expansions(mac.name, mac.parts)
+                    self.write_macro_expansions(
+                        mac.name, mac.parts, mac.cache_offset
+                    )
                 case parsing.Pseudo():
                     pass
                 case _:
@@ -439,7 +445,9 @@ def add(name: str) -> None:
             if instr.kind == "op" and instr.is_viable_uop():
                 add(instr.name)
 
-    def write_macro_expansions(self, name: str, parts: MacroParts) -> None:
+    def write_macro_expansions(
+        self, name: str, parts: MacroParts, cache_offset: int
+    ) -> None:
         """Write the macro expansions for a macro-instruction."""
         # TODO: Refactor to share code with write_cody(), is_viaible_uop(), etc.
         offset = 0  # Cache effect offset
@@ -459,7 +467,10 @@ def write_macro_expansions(self, name: str, parts: MacroParts) -> None:
                 )
                 return
             if not part.active_caches:
-                size, offset = OPARG_SIZES["OPARG_FULL"], 0
+                if part.instr.name == "SAVE_IP":
+                    size, offset = OPARG_SIZES["OPARG_SAVE_IP"], cache_offset
+                else:
+                    size, offset = OPARG_SIZES["OPARG_FULL"], 0
             else:
                 # If this assert triggers, is_viable_uops() lied
                 assert len(part.active_caches) == 1, (name, part.instr.name)
@@ -562,7 +573,9 @@ def write_instructions(
             case parsing.Macro():
                 n_macros += 1
                 mac = self.macro_instrs[thing.name]
-                stacking.write_macro_instr(mac, self.out, self.families.get(mac.name))
+                stacking.write_macro_instr(
+                    mac, self.out, self.families.get(mac.name)
+                )
                 # self.write_macro(self.macro_instrs[thing.name])
             case parsing.Pseudo():
                 pass
@@ -598,7 +611,9 @@ def write_executor_instructions(
                     n_instrs += 1
                     self.out.emit("")
                     with self.out.block(f"case {thing.name}:"):
-                        instr.write(self.out, tier=TIER_TWO)
+                        stacking.write_single_instr(
+                            instr, self.out, tier=TIER_TWO
+                        )
                         if instr.check_eval_breaker:
                             self.out.emit("CHECK_EVAL_BREAKER();")
                         self.out.emit("break;")
@@ -660,8 +675,13 @@ def write_instr(self, instr: Instruction) -> None:
         with self.out.block(f"TARGET({name})"):
             if instr.predicted:
                 self.out.emit(f"PREDICTED({name});")
-            instr.write(self.out)
+            self.out.static_assert_family_size(
+                instr.name, instr.family, instr.cache_offset
+            )
+            stacking.write_single_instr(instr, self.out, tier=TIER_ONE)
             if not instr.always_exits:
+                if instr.cache_offset:
+                    self.out.emit(f"next_instr += {instr.cache_offset};")
                 if instr.check_eval_breaker:
                     self.out.emit("CHECK_EVAL_BREAKER();")
                 self.out.emit(f"DISPATCH();")
diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py
index a505df08fa265b..9143ae0db7be81 100644
--- a/Tools/cases_generator/instructions.py
+++ b/Tools/cases_generator/instructions.py
@@ -59,7 +59,7 @@ class Instruction:
     block_line: int  # First line of block in original code
 
     # Computed by constructor
-    always_exits: bool
+    always_exits: str  # If the block always exits, its last line; else ""
    has_deopt: bool
     cache_offset: int
     cache_effects: list[parsing.CacheEffect]
@@ -120,13 +120,13 @@ def __init__(self, inst: parsing.InstDef):
     def is_viable_uop(self) -> bool:
         """Whether this instruction is viable as a uop."""
         dprint: typing.Callable[..., None] = lambda *args, **kwargs: None
-        # if self.name.startswith("CALL"):
-        #     dprint = print
+        if "FRAME" in self.name:
+            dprint = print
 
         if self.name == "EXIT_TRACE":
             return True  # This has 'return frame' but it's okay
         if self.always_exits:
-            dprint(f"Skipping {self.name} because it always exits")
+            dprint(f"Skipping {self.name} because it always exits: {self.always_exits}")
             return False
         if len(self.active_caches) > 1:
             # print(f"Skipping {self.name} because it has >1 cache entries")
@@ -140,23 +140,6 @@ def is_viable_uop(self) -> bool:
             res = False
         return res
 
-    def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None:
-        """Write one instruction, sans prologue and epilogue."""
-
-        # Write a static assertion that a family's cache size is correct
-        out.static_assert_family_size(self.name, self.family, self.cache_offset)
-
-        # Write input stack effect variable declarations and initializations
-        stacking.write_single_instr(self, out, tier)
-
-        # Skip the rest if the block always exits
-        if self.always_exits:
-            return
-
-        # Write cache effect
-        if tier == TIER_ONE and self.cache_offset:
-            out.emit(f"next_instr += {self.cache_offset};")
-
     def write_body(
         self,
         out: Formatter,
@@ -341,16 +324,16 @@ def extract_block_text(block: parsing.Block) -> tuple[list[str], bool, int]:
     return blocklines, check_eval_breaker, block_line
 
 
-def always_exits(lines: list[str]) -> bool:
+def always_exits(lines: list[str]) -> str:
     """Determine whether a block always ends in a return/goto/etc."""
     if not lines:
-        return False
+        return ""
     line = lines[-1].rstrip()
     # Indent must match exactly (TODO: Do something better)
     if line[:12] != " " * 12:
-        return False
+        return ""
     line = line[12:]
-    return line.startswith(
+    if line.startswith(
         (
             "goto ",
             "return ",
             "DISPATCH",
             "GO_TO_",
             "Py_UNREACHABLE()",
             "ERROR_IF(true, ",
         )
-    )
+    ):
+        return line
+    return ""
diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py
index 31a21e026cb49c..8361eb99f88a7c 100644
--- a/Tools/cases_generator/stacking.py
+++ b/Tools/cases_generator/stacking.py
@@ -1,6 +1,7 @@
 import dataclasses
 import typing
 
+from flags import variable_used_unspecialized
 from formatting import (
     Formatter,
     UNUSED,
@@ -146,6 +147,8 @@ class EffectManager:
     # Track offsets from stack pointer
     min_offset: StackOffset
     final_offset: StackOffset
+    # Link to previous manager
+    pred: "EffectManager | None" = None
 
     def __init__(
         self,
@@ -167,7 +170,8 @@ def __init__(
                 self.pokes.append(StackItem(offset=self.final_offset.clone(), effect=eff))
                 self.final_offset.higher(eff)
 
-        if pred:
+        self.pred = pred
+        while pred:
             # Replace push(x) + pop(y) with copy(x, y).
             # Check that the sources and destinations are disjoint.
             sources: set[str] = set()
@@ -192,6 +196,11 @@ def __init__(
                 sources,
                 destinations,
             )
+            # See if we can get more copies of a earlier predecessor.
+            if self.peeks and not pred.pokes and not pred.peeks:
+                pred = pred.pred
+            else:
+                pred = None  # Break
 
     def adjust_deeper(self, eff: StackEffect) -> None:
         for peek in self.peeks:
@@ -295,6 +304,7 @@ def write_single_instr(
             [Component(instr, instr.active_caches)],
             out,
             tier,
+            0,
         )
     except AssertionError as err:
         raise AssertionError(f"Error writing instruction {instr.name}") from err
@@ -303,37 +313,32 @@ def write_single_instr(
 def write_macro_instr(
     mac: MacroInstruction, out: Formatter, family: Family | None
 ) -> None:
-    parts = [part for part in mac.parts if isinstance(part, Component)]
-
-    cache_adjust = 0
-    for part in mac.parts:
-        match part:
-            case CacheEffect(size=size):
-                cache_adjust += size
-            case Component(instr=instr):
-                cache_adjust += instr.cache_offset
-            case _:
-                typing.assert_never(part)
-
+    parts = [
+        part
+        for part in mac.parts
+        if isinstance(part, Component) and part.instr.name != "SAVE_IP"
+    ]
     out.emit("")
     with out.block(f"TARGET({mac.name})"):
         if mac.predicted:
             out.emit(f"PREDICTED({mac.name});")
-        out.static_assert_family_size(mac.name, family, cache_adjust)
+        out.static_assert_family_size(mac.name, family, mac.cache_offset)
         try:
-            write_components(parts, out, TIER_ONE)
+            next_instr_is_set = write_components(parts, out, TIER_ONE, mac.cache_offset)
        except AssertionError as err:
            raise AssertionError(f"Error writing macro {mac.name}") from err
-        if cache_adjust:
-            out.emit(f"next_instr += {cache_adjust};")
-        out.emit("DISPATCH();")
+        if not parts[-1].instr.always_exits and not next_instr_is_set:
+            if mac.cache_offset:
+                out.emit(f"next_instr += {mac.cache_offset};")
+            out.emit("DISPATCH();")
 
 
 def write_components(
     parts: list[Component],
     out: Formatter,
     tier: Tiers,
-) -> None:
+    cache_offset: int,
+) -> bool:
     managers = get_managers(parts)
 
     all_vars: dict[str, StackEffect] = {}
@@ -354,6 +359,7 @@ def write_components(
     for name, eff in all_vars.items():
         out.declare(eff, None)
 
+    next_instr_is_set = False
     for mgr in managers:
         if len(parts) > 1:
             out.emit(f"// {mgr.instr.name}")
@@ -374,13 +380,25 @@ def write_components(
                     poke.as_stack_effect(lax=True),
                 )
 
+        if mgr.instr.name == "_PUSH_FRAME":
+            # Adjust stack to min_offset (input effects materialized)
+            out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high)
+            # Use clone() since adjust_inverse() mutates final_offset
+            mgr.adjust_inverse(mgr.final_offset.clone())
+
+        if mgr.instr.name == "SAVE_CURRENT_IP":
+            next_instr_is_set = True
+            if cache_offset:
+                out.emit(f"next_instr += {cache_offset};")
+
         if len(parts) == 1:
             mgr.instr.write_body(out, 0, mgr.active_caches, tier)
         else:
             with out.block(""):
                 mgr.instr.write_body(out, -4, mgr.active_caches, tier)
 
-        if mgr is managers[-1]:
+        if mgr is managers[-1] and not next_instr_is_set:
+            # TODO: Explain why this adjustment is needed.
             out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high)
             # Use clone() since adjust_inverse() mutates final_offset
             mgr.adjust_inverse(mgr.final_offset.clone())
@@ -392,6 +410,8 @@ def write_components(
                     poke.effect,
                 )
 
+    return next_instr_is_set
+
 
 def write_single_instr_for_abstract_interp(
     instr: Instruction, out: Formatter
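
Usage sketch (not part of the patch): the new test in Lib/test/test_capi/test_misc.py above is the intended way to observe the split call in a tier-2 trace. The snippet below restates it as a standalone script. It is a minimal sketch, assuming a CPython build that contains this patch; importing the module-level helpers temporary_optimizer and get_first_executor from the test module is an assumption made only for illustration.

    # Sketch only: run under a CPython build that includes this change.
    import _testinternalcapi
    # Assumed importable: helpers defined at module level in Lib/test/test_capi/test_misc.py.
    from test.test_capi.test_misc import temporary_optimizer, get_first_executor

    def testfunc(n):
        def dummy(x):
            return x + 1
        for i in range(n):
            dummy(i)  # exact-args Python-to-Python call, eligible for the uop expansion

    opt = _testinternalcapi.get_uop_optimizer()
    with temporary_optimizer(opt):
        testfunc(10)  # the hot loop gets a tier-2 executor attached

    ex = get_first_executor(testfunc)
    assert ex is not None
    uops = {opname for opname, _, _ in ex}
    # With CALL_PY_EXACT_ARGS now a macro of uops, _PUSH_FRAME shows up in the trace.
    assert "_PUSH_FRAME" in uops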