diff options
author | Mark Shannon <mark@hotpy.org> | 2023-10-23 14:49:09 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-23 14:49:09 +0100 |
commit | 52e902ccf0178d7a3f26de4bba7922b01c0d4d3c (patch) | |
tree | 1f5924ef032ba13f1115c4c88541d293f6331875 /Python | |
parent | 32c37fe1ba708b8290cfa971e130bcfc29f1ead3 (diff) | |
download | cpython-52e902ccf0178d7a3f26de4bba7922b01c0d4d3c.tar.gz cpython-52e902ccf0178d7a3f26de4bba7922b01c0d4d3c.zip |
GH-109369: Add machinery for deoptimizing tier2 executors, both individually and globally. (GH-110384)
Diffstat (limited to 'Python')
-rw-r--r-- | Python/instrumentation.c | 3 | ||||
-rw-r--r-- | Python/optimizer.c | 235 | ||||
-rw-r--r-- | Python/pystate.c | 1 |
3 files changed, 237 insertions, 2 deletions
diff --git a/Python/instrumentation.c b/Python/instrumentation.c index eee1908e503..4bb57a621e3 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1582,6 +1582,7 @@ _Py_Instrument(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } + _Py_Executors_InvalidateDependency(interp, code); int code_len = (int)Py_SIZE(code); /* code->_co_firsttraceable >= code_len indicates * that no instrumentation can be inserted. @@ -1803,6 +1804,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events) return -1; } set_global_version(interp, new_version); + _Py_Executors_InvalidateAll(interp); return instrument_all_executing_code_objects(interp); } @@ -1832,6 +1834,7 @@ _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEvent /* Force instrumentation update */ code->_co_instrumentation_version -= MONITORING_VERSION_INCREMENT; } + _Py_Executors_InvalidateDependency(interp, code); if (_Py_Instrument(code, interp)) { return -1; } diff --git a/Python/optimizer.c b/Python/optimizer.c index 955ac812177..8d19de220d3 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -220,10 +220,16 @@ typedef struct { static void counter_dealloc(_PyCounterExecutorObject *self) { + _Py_ExecutorClear((_PyExecutorObject *)self); Py_DECREF(self->optimizer); PyObject_Free(self); } +static PyMemberDef counter_members[] = { + { "valid", Py_T_UBYTE, offsetof(_PyCounterExecutorObject, executor.vm_data.valid), Py_READONLY, "is valid?" }, + { NULL } +}; + static PyTypeObject CounterExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "counting_executor", @@ -231,6 +237,7 @@ static PyTypeObject CounterExecutor_Type = { .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)counter_dealloc, + .tp_members = counter_members, }; static _PyInterpreterFrame * @@ -261,6 +268,9 @@ counter_optimize( executor->optimizer = (_PyCounterOptimizerObject *)self; executor->next_instr = instr; *exec_ptr = (_PyExecutorObject *)executor; + _PyBloomFilter empty; + _Py_BloomFilter_Init(&empty); + _Py_ExecutorInit((_PyExecutorObject *)executor, &empty); return 1; } @@ -288,6 +298,8 @@ static PyTypeObject CounterOptimizer_Type = { PyObject * PyUnstable_Optimizer_NewCounter(void) { + PyType_Ready(&CounterExecutor_Type); + PyType_Ready(&CounterOptimizer_Type); _PyCounterOptimizerObject *opt = (_PyCounterOptimizerObject *)_PyObject_New(&CounterOptimizer_Type); if (opt == NULL) { return NULL; @@ -303,6 +315,7 @@ PyUnstable_Optimizer_NewCounter(void) static void uop_dealloc(_PyUOpExecutorObject *self) { + _Py_ExecutorClear((_PyExecutorObject *)self); PyObject_Free(self); } @@ -356,6 +369,12 @@ PySequenceMethods uop_as_sequence = { .sq_item = (ssizeargfunc)uop_item, }; + +static PyMemberDef uop_members[] = { + { "valid", Py_T_UBYTE, offsetof(_PyUOpExecutorObject, base.vm_data.valid), Py_READONLY, "is valid?" }, + { NULL } +}; + static PyTypeObject UOpExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "uop_executor", @@ -364,6 +383,7 @@ static PyTypeObject UOpExecutor_Type = { .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)uop_dealloc, .tp_as_sequence = &uop_as_sequence, + .tp_members = uop_members, }; static int @@ -399,9 +419,11 @@ translate_bytecode_to_trace( PyCodeObject *code, _Py_CODEUNIT *instr, _PyUOpInstruction *trace, - int buffer_size) + int buffer_size, + _PyBloomFilter *dependencies) { PyCodeObject *initial_code = code; + _Py_BloomFilter_Add(dependencies, initial_code); _Py_CODEUNIT *initial_instr = instr; int trace_length = 0; int max_length = buffer_size; @@ -735,6 +757,7 @@ pop_jump_if_bool: // Increment IP to the return address instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1; TRACE_STACK_PUSH(); + _Py_BloomFilter_Add(dependencies, new_code); code = new_code; instr = _PyCode_CODE(code); DPRINTF(2, @@ -895,8 +918,10 @@ uop_optimize( _PyExecutorObject **exec_ptr, int curr_stackentries) { + _PyBloomFilter dependencies; + _Py_BloomFilter_Init(&dependencies); _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH]; - int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH); + int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies); if (trace_length <= 0) { // Error or nothing translated return trace_length; @@ -915,6 +940,7 @@ uop_optimize( OPT_HIST(trace_length, optimized_trace_length_hist); executor->base.execute = _PyUopExecute; memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction)); + _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies); *exec_ptr = (_PyExecutorObject *)executor; return 1; } @@ -936,6 +962,8 @@ static PyTypeObject UOpOptimizer_Type = { PyObject * PyUnstable_Optimizer_NewUOpOptimizer(void) { + PyType_Ready(&UOpExecutor_Type); + PyType_Ready(&UOpOptimizer_Type); _PyOptimizerObject *opt = PyObject_New(_PyOptimizerObject, &UOpOptimizer_Type); if (opt == NULL) { return NULL; @@ -947,3 +975,206 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER; return (PyObject *)opt; } + + +/***************************************** + * Executor management + ****************************************/ + +/* We use a bloomfilter with k = 6, m = 256 + * The choice of k and the following constants + * could do with a more rigourous analysis, + * but here is a simple analysis: + * + * We want to keep the false positive rate low. + * For n = 5 (a trace depends on 5 objects), + * we expect 30 bits set, giving a false positive + * rate of (30/256)**6 == 2.5e-6 which is plenty + * good enough. + * + * However with n = 10 we expect 60 bits set (worst case), + * giving a false positive of (60/256)**6 == 0.0001 + * + * We choose k = 6, rather than a higher number as + * it means the false positive rate grows slower for high n. + * + * n = 5, k = 6 => fp = 2.6e-6 + * n = 5, k = 8 => fp = 3.5e-7 + * n = 10, k = 6 => fp = 1.6e-4 + * n = 10, k = 8 => fp = 0.9e-4 + * n = 15, k = 6 => fp = 0.18% + * n = 15, k = 8 => fp = 0.23% + * n = 20, k = 6 => fp = 1.1% + * n = 20, k = 8 => fp = 2.3% + * + * The above analysis assumes perfect hash functions, + * but those don't exist, so the real false positive + * rates may be worse. + */ + +#define K 6 + +#define SEED 20221211 + +/* TO DO -- Use more modern hash functions with better distribution of bits */ +static uint64_t +address_to_hash(void *ptr) { + assert(ptr != NULL); + uint64_t uhash = SEED; + uintptr_t addr = (uintptr_t)ptr; + for (int i = 0; i < SIZEOF_VOID_P; i++) { + uhash ^= addr & 255; + uhash *= (uint64_t)_PyHASH_MULTIPLIER; + addr >>= 8; + } + return uhash; +} + +void +_Py_BloomFilter_Init(_PyBloomFilter *bloom) +{ + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + bloom->bits[i] = 0; + } +} + +/* We want K hash functions that each set 1 bit. + * A hash function that sets 1 bit in M bits can be trivially + * derived from a log2(M) bit hash function. + * So we extract 8 (log2(256)) bits at a time from + * the 64bit hash. */ +void +_Py_BloomFilter_Add(_PyBloomFilter *bloom, void *ptr) +{ + uint64_t hash = address_to_hash(ptr); + assert(K <= 8); + for (int i = 0; i < K; i++) { + uint8_t bits = hash & 255; + bloom->bits[bits >> 5] |= (1 << (bits&31)); + hash >>= 8; + } +} + +static bool +bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes) +{ + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + if ((bloom->bits[i] & hashes->bits[i]) != hashes->bits[i]) { + return false; + } + } + return true; +} + +static void +link_executor(_PyExecutorObject *executor) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyExecutorLinkListNode *links = &executor->vm_data.links; + _PyExecutorObject *head = interp->executor_list_head; + if (head == NULL) { + interp->executor_list_head = executor; + links->previous = NULL; + links->next = NULL; + } + else { + _PyExecutorObject *next = head->vm_data.links.next; + links->previous = head; + links->next = next; + if (next != NULL) { + next->vm_data.links.previous = executor; + } + head->vm_data.links.next = executor; + } + executor->vm_data.linked = true; + /* executor_list_head must be first in list */ + assert(interp->executor_list_head->vm_data.links.previous == NULL); +} + +static void +unlink_executor(_PyExecutorObject *executor) +{ + if (!executor->vm_data.linked) { + return; + } + _PyExecutorLinkListNode *links = &executor->vm_data.links; + _PyExecutorObject *next = links->next; + _PyExecutorObject *prev = links->previous; + if (next != NULL) { + next->vm_data.links.previous = prev; + } + if (prev != NULL) { + prev->vm_data.links.next = next; + } + else { + // prev == NULL implies that executor is the list head + PyInterpreterState *interp = PyInterpreterState_Get(); + assert(interp->executor_list_head == executor); + interp->executor_list_head = next; + } + executor->vm_data.linked = false; +} + +/* This must be called by optimizers before using the executor */ +void +_Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set) +{ + executor->vm_data.valid = true; + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + executor->vm_data.bloom.bits[i] = dependency_set->bits[i]; + } + link_executor(executor); +} + +/* This must be called by executors during dealloc */ +void +_Py_ExecutorClear(_PyExecutorObject *executor) +{ + unlink_executor(executor); +} + +void +_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) +{ + assert(executor->vm_data.valid = true); + _Py_BloomFilter_Add(&executor->vm_data.bloom, obj); +} + +/* Invalidate all executors that depend on `obj` + * May cause other executors to be invalidated as well + */ +void +_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj) +{ + _PyBloomFilter obj_filter; + _Py_BloomFilter_Init(&obj_filter); + _Py_BloomFilter_Add(&obj_filter, obj); + /* Walk the list of executors */ + /* TO DO -- Use a tree to avoid traversing as many objects */ + for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); + _PyExecutorObject *next = exec->vm_data.links.next; + if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) { + exec->vm_data.valid = false; + unlink_executor(exec); + } + exec = next; + } +} + +/* Invalidate all executors */ +void +_Py_Executors_InvalidateAll(PyInterpreterState *interp) +{ + /* Walk the list of executors */ + for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); + _PyExecutorObject *next = exec->vm_data.links.next; + exec->vm_data.links.next = NULL; + exec->vm_data.links.previous = NULL; + exec->vm_data.valid = false; + exec->vm_data.linked = false; + exec = next; + } + interp->executor_list_head = NULL; +} diff --git a/Python/pystate.c b/Python/pystate.c index 2e6f07e6003..c44a28ca6d3 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -713,6 +713,7 @@ init_interpreter(PyInterpreterState *interp, interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold; interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold; interp->next_func_version = 1; + interp->executor_list_head = NULL; if (interp != &runtime->_main_interpreter) { /* Fix the self-referential, statically initialized fields. */ interp->dtoa = (struct _dtoa_state)_dtoa_state_INIT(interp); |