author     Neil Schemenauer <nas-github@arctrix.com>  2025-02-05 11:38:30 -0800
committer  GitHub <noreply@github.com>  2025-02-05 11:38:30 -0800
commit     cdcacec79f7a216c3c988baa4dc31ce4e76c97ac (patch)
tree       43aeae78e47e7fd96d657c58e45088e2a3975f43 /Python
parent     5fb019fc29a90e722aff20a9522bf588351358cd (diff)
gh-129201: Use prefetch in GC mark alive phase. (gh-129203)
For the free-threaded version of the cyclic GC, restructure the "mark alive" phase to use software prefetch instructions. This gives a speedup in most cases when the number of objects is large enough. The prefetching is enabled conditionally based on the number of long-lived objects the GC finds.
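The core of the change is a small circular FIFO of object pointers: a prefetch instruction is issued when a pointer is enqueued, and by the time the pointer is dequeued and dereferenced the needed cache lines should already be resident. The sketch below shows that pattern in isolation. It is an illustrative standalone example, not code from this commit; the names (ring_t, ring_push, ring_pop, process_object) are made up, and the ring holds 8 entries rather than the 256-entry buffer with high/low water marks used in the diff.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative ring size; the commit uses BUFFER_SIZE 256 with high/low water marks. */
#define RING_SIZE 8
#define RING_MASK (RING_SIZE - 1)

/* Portable prefetch hint: a no-op on compilers without __builtin_prefetch. */
#if defined(__GNUC__) || defined(__clang__)
#  define PREFETCH(p) __builtin_prefetch((p), 0, 2)
#else
#  define PREFETCH(p) ((void)(p))
#endif

typedef struct {
    unsigned int in;   /* total pushes; unsigned wraparound is harmless */
    unsigned int out;  /* total pops */
    void *slot[RING_SIZE];
} ring_t;

/* Number of queued entries; valid even after the counters wrap. */
static unsigned int ring_len(const ring_t *r) { return r->in - r->out; }

static void ring_push(ring_t *r, void *p)
{
    assert(ring_len(r) < RING_SIZE);
    PREFETCH(p);                      /* start pulling the object's memory into cache */
    r->slot[r->in & RING_MASK] = p;   /* power-of-two mask maps the counter to a slot */
    r->in++;
}

static void *ring_pop(ring_t *r)
{
    assert(ring_len(r) > 0);
    void *p = r->slot[r->out & RING_MASK];
    r->out++;
    return p;                         /* by now the prefetch has (hopefully) completed */
}

/* Stand-in for the real work (the GC reads ob_gc_bits and ob_type here). */
static void process_object(long *p) { *p += 1; }

int main(void)
{
    long data[32] = {0};
    ring_t ring = {0};
    size_t next = 0;

    /* Keep the ring partly full so there is some distance between prefetch and use. */
    while (next < 32 || ring_len(&ring) > 0) {
        while (next < 32 && ring_len(&ring) < RING_SIZE) {
            ring_push(&ring, &data[next++]);
        }
        process_object(ring_pop(&ring));
    }
    printf("processed %d objects\n", 32);
    return 0;
}

The gap between push and pop is what gives the hardware time to complete the prefetch; the real code keeps the buffer between BUFFER_LO and BUFFER_HI entries for the same reason.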
Diffstat (limited to 'Python')
-rw-r--r--  Python/gc_free_threading.c  472
1 file changed, 430 insertions(+), 42 deletions(-)
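The commit enables the prefetch path only when the GC's count of long-lived objects is large, on the reasoning (spelled out in a comment in the diff below) that prefetching pays off only once the working set no longer fits in the CPU caches. The tiny program below just reproduces that back-of-the-envelope arithmetic; the 4 MB L2 size and 32-byte minimum object size are the assumptions stated in the diff's comment, and 200000 is the threshold the diff uses.

#include <stdio.h>

int main(void)
{
    /* Assumed figures from the diff's comment: ~4 MB L2 cache,
       32-byte minimum object size on 64-bit platforms. */
    const long cache_bytes = 4L * 1024 * 1024;
    const long min_obj_bytes = 32;
    long objects_in_cache = cache_bytes / min_obj_bytes;   /* = 131072, roughly 130k */

    /* The commit only turns prefetching on well above that point. */
    long threshold = 200000;
    printf("~%ldk objects fit in L2; prefetch enabled above %ldk long-lived objects\n",
           objects_in_cache / 1000, threshold / 1000);
    return 0;
}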
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 5d264e407e1..10c76a67979 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -21,6 +21,9 @@
// enable the "mark alive" pass of GC
#define GC_ENABLE_MARK_ALIVE 1
+// if true, enable the use of "prefetch" CPU instructions
+#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1
+
// include additional roots in "mark alive" pass
#define GC_MARK_ALIVE_EXTRA_ROOTS 1
@@ -472,13 +475,193 @@ gc_maybe_untrack(PyObject *op)
}
#ifdef GC_ENABLE_MARK_ALIVE
+
+// prefetch buffer and stack //////////////////////////////////
+
+// The buffer is a circular FIFO queue of PyObject pointers. We take
+// care to not dereference these pointers until they are taken out of
+// the buffer. A prefetch CPU instruction is issued when a pointer is
+// put into the buffer. If all is working as expected, there will be
+// enough time between the enqueue and dequeue so that the needed memory
+// for the object, most importantly ob_gc_bits and ob_type words, will
+// already be in the CPU cache.
+#define BUFFER_SIZE 256
+#define BUFFER_HI 16
+#define BUFFER_LO 8
+#define BUFFER_MASK (BUFFER_SIZE - 1)
+
+// the buffer size must be an exact power of two
+static_assert(BUFFER_SIZE > 0 && !(BUFFER_SIZE & BUFFER_MASK),
+ "Invalid BUFFER_SIZE, must be power of 2");
+// the code below assumes these relationships are true
+static_assert(BUFFER_HI < BUFFER_SIZE &&
+ BUFFER_LO < BUFFER_HI &&
+ BUFFER_LO > 0,
+ "Invalid prefetch buffer level settings.");
+
+// Prefetch instructions will fetch the line of data from memory that
+// contains the byte specified with the source operand to a location in
+// the cache hierarchy specified by a locality hint. The instruction
+// is only a hint and the CPU is free to ignore it. Instructions and
+// behaviour are CPU specific but the definitions of locality hints
+// below are mostly consistent.
+//
+// * T0 (temporal data) prefetch data into all levels of the cache hierarchy.
+//
+// * T1 (temporal data with respect to first level cache) prefetch data into
+// level 2 cache and higher.
+//
+// * T2 (temporal data with respect to second level cache) prefetch data into
+// level 3 cache and higher, or an implementation-specific choice.
+//
+// * NTA (non-temporal data with respect to all cache levels) prefetch data into
+// non-temporal cache structure and into a location close to the processor,
+// minimizing cache pollution.
+
+#if defined(__GNUC__) || defined(__clang__)
+ #define PREFETCH_T0(ptr) __builtin_prefetch(ptr, 0, 3)
+ #define PREFETCH_T1(ptr) __builtin_prefetch(ptr, 0, 2)
+ #define PREFETCH_T2(ptr) __builtin_prefetch(ptr, 0, 1)
+ #define PREFETCH_NTA(ptr) __builtin_prefetch(ptr, 0, 0)
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) && !defined(_M_ARM64EC)
+ #include <mmintrin.h>
+ #define PREFETCH_T0(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+ #define PREFETCH_T1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+ #define PREFETCH_T2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T2)
+ #define PREFETCH_NTA(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_NTA)
+#elif defined (__aarch64__)
+ #define PREFETCH_T0(ptr) \
+ do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
+ #define PREFETCH_T1(ptr) \
+ do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
+ #define PREFETCH_T2(ptr) \
+ do { __asm__ __volatile__("prfm pldl3keep, %0" ::"Q"(*(ptr))); } while (0)
+ #define PREFETCH_NTA(ptr) \
+ do { __asm__ __volatile__("prfm pldl1strm, %0" ::"Q"(*(ptr))); } while (0)
+#else
+ #define PREFETCH_T0(ptr) do { (void)(ptr); } while (0) /* disabled */
+ #define PREFETCH_T1(ptr) do { (void)(ptr); } while (0) /* disabled */
+ #define PREFETCH_T2(ptr) do { (void)(ptr); } while (0) /* disabled */
+ #define PREFETCH_NTA(ptr) do { (void)(ptr); } while (0) /* disabled */
+#endif
+
+#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
+ #define prefetch(ptr) PREFETCH_T1(ptr)
+#else
+ #define prefetch(ptr)
+#endif
+
+// a contiguous sequence of PyObject pointers; may contain NULLs
+typedef struct {
+ PyObject **start;
+ PyObject **end;
+} gc_span_t;
+
+typedef struct {
+ Py_ssize_t size;
+ Py_ssize_t capacity;
+ gc_span_t *stack;
+} gc_span_stack_t;
+
+typedef struct {
+ unsigned int in;
+ unsigned int out;
+ _PyObjectStack stack;
+ gc_span_stack_t spans;
+ PyObject *buffer[BUFFER_SIZE];
+ bool use_prefetch;
+} gc_mark_args_t;
+
+
+// Returns number of entries in buffer
+static inline unsigned int
+gc_mark_buffer_len(gc_mark_args_t *args)
+{
+ return args->in - args->out;
+}
+
+// Returns number of free entry slots in buffer
+static inline unsigned int
+gc_mark_buffer_avail(gc_mark_args_t *args)
+{
+ return BUFFER_SIZE - gc_mark_buffer_len(args);
+}
+
+static inline bool
+gc_mark_buffer_is_empty(gc_mark_args_t *args)
+{
+ return args->in == args->out;
+}
+
+static inline bool
+gc_mark_buffer_is_full(gc_mark_args_t *args)
+{
+ return gc_mark_buffer_len(args) == BUFFER_SIZE;
+}
+
+static inline PyObject *
+gc_mark_buffer_pop(gc_mark_args_t *args)
+{
+ assert(!gc_mark_buffer_is_empty(args));
+ PyObject *op = args->buffer[args->out & BUFFER_MASK];
+ args->out++;
+ return op;
+}
+
+// Called when there is space in the buffer for the object. Issue the
+// prefetch instruction and add it to the end of the buffer.
+static inline void
+gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+{
+ assert(!gc_mark_buffer_is_full(args));
+ prefetch(op);
+ args->buffer[args->in & BUFFER_MASK] = op;
+ args->in++;
+}
+
+// Called when we run out of space in the buffer or if the prefetching
+// is disabled. The object will be pushed on the gc_mark_args.stack.
+static int
+gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
+{
+ if (_PyObjectStack_Push(ms, op) < 0) {
+ return -1;
+ }
+ return 0;
+}
+
static int
-mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
+{
+ if (start == end) {
+ return 0;
+ }
+ if (ss->size >= ss->capacity) {
+ if (ss->capacity == 0) {
+ ss->capacity = 256;
+ }
+ else {
+ ss->capacity *= 2;
+ }
+ ss->stack = (gc_span_t *)PyMem_Realloc(ss->stack, ss->capacity * sizeof(gc_span_t));
+ if (ss->stack == NULL) {
+ return -1;
+ }
+ }
+ assert(end > start);
+ ss->stack[ss->size].start = start;
+ ss->stack[ss->size].end = end;
+ ss->size++;
+ return 0;
+}
+
+static int
+gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args)
{
if (op == NULL) {
return 0;
}
- if (!_PyObject_GC_IS_TRACKED(op)) {
+ if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
return 0;
}
if (gc_is_alive(op)) {
@@ -491,12 +674,68 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
// Need to call tp_traverse on this object. Add to stack and mark it
// alive so we don't traverse it a second time.
gc_set_alive(op);
- if (_PyObjectStack_Push(stack, op) < 0) {
+ if (_PyObjectStack_Push(&args->stack, op) < 0) {
return -1;
}
return 0;
}
+static int
+gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args)
+{
+ assert(op != NULL);
+ if (!gc_mark_buffer_is_full(args)) {
+ gc_mark_buffer_push(op, args);
+ return 0;
+ }
+ else {
+ return gc_mark_stack_push(&args->stack, op);
+ }
+}
+
+// Called when we find an object that needs to be marked alive (either from a
+// root or from calling tp_traverse).
+static int
+gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+{
+ if (args->use_prefetch) {
+ return gc_mark_enqueue_buffer(op, args);
+ }
+ else {
+ return gc_mark_enqueue_no_buffer(op, args);
+ }
+}
+
+// Called when we have a contiguous sequence of PyObject pointers, from either
+// a tuple or a list object. This will add the items to the buffer if there
+// is space for them all; otherwise it pushes a new "span" on the span stack. Using
+// spans has the advantage of not creating a deep _PyObjectStack stack when
+// dealing with long sequences. Those sequences will be processed in smaller
+// chunks by the gc_prime_from_spans() function.
+static int
+gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
+{
+ Py_ssize_t used = gc_mark_buffer_len(args);
+ Py_ssize_t free = BUFFER_SIZE - used;
+ if (free >= size) {
+ for (Py_ssize_t i = 0; i < size; i++) {
+ PyObject *op = item[i];
+ if (op == NULL) {
+ continue;
+ }
+ gc_mark_buffer_push(op, args);
+ }
+ }
+ else {
+ assert(size > 0);
+ PyObject **end = &item[size];
+ if (gc_mark_span_push(&args->spans, item, end) < 0) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
static bool
gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
void *block, size_t block_size, void *args)
@@ -511,25 +750,56 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
return true;
}
+static int
+gc_mark_traverse_list(PyObject *self, void *args)
+{
+ PyListObject *list = (PyListObject *)self;
+ if (list->ob_item == NULL) {
+ return 0;
+ }
+ if (gc_mark_enqueue_span(list->ob_item, PyList_GET_SIZE(list), args) < 0) {
+ return -1;
+ }
+ return 0;
+}
+
+static int
+gc_mark_traverse_tuple(PyObject *self, void *args)
+{
+ _PyTuple_MaybeUntrack(self);
+ if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) {
+ gc_clear_alive(self);
+ return 0;
+ }
+ PyTupleObject *tuple = _PyTuple_CAST(self);
+ if (gc_mark_enqueue_span(tuple->ob_item, Py_SIZE(tuple), args) < 0) {
+ return -1;
+ }
+ return 0;
+}
+
static void
gc_abort_mark_alive(PyInterpreterState *interp,
struct collection_state *state,
- _PyObjectStack *stack)
+ gc_mark_args_t *args)
{
- // We failed to allocate memory for "stack" while doing the "mark
- // alive" phase. In that case, free the object stack and make sure
- // that no objects have the alive bit set.
- _PyObjectStack_Clear(stack);
+ // We failed to allocate memory while doing the "mark alive" phase.
+ // In that case, free the memory used for marking state and make
+ // sure that no objects have the alive bit set.
+ _PyObjectStack_Clear(&args->stack);
+ if (args->spans.stack != NULL) {
+ PyMem_Free(args->spans.stack);
+ }
gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
}
#ifdef GC_MARK_ALIVE_STACKS
static int
-gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(gc_mark_args_t *args, _PyStackRef stackref)
{
if (!PyStackRef_IsNull(stackref)) {
PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
- if (mark_alive_stack_push(op, stack) < 0) {
+ if (gc_mark_enqueue(op, args) < 0) {
return -1;
}
}
@@ -537,7 +807,7 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
}
static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, gc_mark_args_t *args)
{
int err = 0;
_Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
@@ -554,13 +824,13 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st
}
_PyStackRef *top = f->stackpointer;
- if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+ if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) {
err = -1;
goto exit;
}
while (top != f->localsplus) {
--top;
- if (gc_visit_stackref_mark_alive(stack, *top) < 0) {
+ if (gc_visit_stackref_mark_alive(args, *top) < 0) {
err = -1;
goto exit;
}
@@ -904,22 +1174,124 @@ static int
move_legacy_finalizer_reachable(struct collection_state *state);
#ifdef GC_ENABLE_MARK_ALIVE
+
+static void
+gc_prime_from_spans(gc_mark_args_t *args)
+{
+ Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+ // there should always be at least this amount of space
+ assert(space <= gc_mark_buffer_avail(args));
+ assert(space > 0);
+ gc_span_t entry = args->spans.stack[--args->spans.size];
+ // spans on the stack should always have one or more elements
+ assert(entry.start < entry.end);
+ do {
+ PyObject *op = *entry.start;
+ entry.start++;
+ if (op != NULL) {
+ gc_mark_buffer_push(op, args);
+ space--;
+ if (space == 0) {
+ // buffer is as full as we want and not done with span
+ gc_mark_span_push(&args->spans, entry.start, entry.end);
+ return;
+ }
+ }
+ } while (entry.start < entry.end);
+}
+
+static void
+gc_prime_buffer(gc_mark_args_t *args)
+{
+ if (args->spans.size > 0) {
+ gc_prime_from_spans(args);
+ }
+ else {
+ // When priming, don't fill the buffer completely, since that would
+ // likely cause objects to spill onto the stack shortly afterwards when
+ // the buffer fills. We want to use the buffer as much as possible, so
+ // we only fill to BUFFER_HI, not BUFFER_SIZE.
+ Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+ assert(space > 0);
+ do {
+ PyObject *op = _PyObjectStack_Pop(&args->stack);
+ if (op == NULL) {
+ return;
+ }
+ gc_mark_buffer_push(op, args);
+ space--;
+ } while (space > 0);
+ }
+}
+
static int
-propagate_alive_bits(_PyObjectStack *stack)
+gc_propagate_alive_prefetch(gc_mark_args_t *args)
{
for (;;) {
- PyObject *op = _PyObjectStack_Pop(stack);
- if (op == NULL) {
- break;
+ Py_ssize_t buf_used = gc_mark_buffer_len(args);
+ if (buf_used <= BUFFER_LO) {
+ // The mark buffer is getting empty. If it's too empty
+ // then there will not be enough delay between issuing
+ // the prefetch and when the object is actually accessed.
+ // Prime the buffer with object pointers from the stack or
+ // from the spans, if there are any available.
+ gc_prime_buffer(args);
+ if (gc_mark_buffer_is_empty(args)) {
+ return 0;
+ }
+ }
+ PyObject *op = gc_mark_buffer_pop(args);
+
+ if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+ continue;
}
- assert(_PyObject_GC_IS_TRACKED(op));
- assert(gc_is_alive(op));
+
+ if (gc_is_alive(op)) {
+ continue; // already visited this object
+ }
+
+ // Need to call tp_traverse on this object. Mark it alive so we
+ // don't traverse it a second time.
+ gc_set_alive(op);
+
traverseproc traverse = Py_TYPE(op)->tp_traverse;
- if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+ if (traverse == PyList_Type.tp_traverse) {
+ if (gc_mark_traverse_list(op, args) < 0) {
+ return -1;
+ }
+ }
+ else if (traverse == PyTuple_Type.tp_traverse) {
+ if (gc_mark_traverse_tuple(op, args) < 0) {
+ return -1;
+ }
+ }
+ else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) {
return -1;
}
}
- return 0;
+}
+
+static int
+gc_propagate_alive(gc_mark_args_t *args)
+{
+ if (args->use_prefetch) {
+ return gc_propagate_alive_prefetch(args);
+ }
+ else {
+ for (;;) {
+ PyObject *op = _PyObjectStack_Pop(&args->stack);
+ if (op == NULL) {
+ break;
+ }
+ assert(_PyObject_GC_IS_TRACKED(op));
+ assert(gc_is_alive(op));
+ traverseproc traverse = Py_TYPE(op)->tp_traverse;
+ if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) {
+ return -1;
+ }
+ }
+ return 0;
+ }
}
// Using tp_traverse, mark everything reachable from known root objects
@@ -939,48 +1311,64 @@ propagate_alive_bits(_PyObjectStack *stack)
//
// Returns -1 on failure (out of memory).
static int
-mark_alive_from_roots(PyInterpreterState *interp,
- struct collection_state *state)
+gc_mark_alive_from_roots(PyInterpreterState *interp,
+ struct collection_state *state)
{
#ifdef GC_DEBUG
// Check that all objects don't have alive bit set
gc_visit_heaps(interp, &validate_alive_bits, &state->base);
#endif
- _PyObjectStack stack = { NULL };
-
- #define STACK_PUSH(op) \
- if (mark_alive_stack_push(op, &stack) < 0) { \
- gc_abort_mark_alive(interp, state, &stack); \
- return -1; \
+ gc_mark_args_t mark_args = { 0 };
+
+ // Using prefetch instructions is only a win if the set of objects being
+ // examined by the GC does not fit into CPU caches. Otherwise, using the
+ // buffer and prefetch instructions is just overhead. The long-lived
+ // object count seems a good proxy for whether things will fit in the cache.
+ // On 64-bit platforms, the minimum object size is 32 bytes. A 4MB L2 cache
+ // would hold about 130k objects.
+ mark_args.use_prefetch = interp->gc.long_lived_total > 200000;
+
+ #define MARK_ENQUEUE(op) \
+ if (op != NULL) { \
+ if (gc_mark_enqueue(op, &mark_args) < 0) { \
+ gc_abort_mark_alive(interp, state, &mark_args); \
+ return -1; \
+ } \
}
- STACK_PUSH(interp->sysdict);
+ MARK_ENQUEUE(interp->sysdict);
#ifdef GC_MARK_ALIVE_EXTRA_ROOTS
- STACK_PUSH(interp->builtins);
- STACK_PUSH(interp->dict);
+ MARK_ENQUEUE(interp->builtins);
+ MARK_ENQUEUE(interp->dict);
struct types_state *types = &interp->types;
for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
- STACK_PUSH(types->builtins.initialized[i].tp_dict);
- STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+ MARK_ENQUEUE(types->builtins.initialized[i].tp_dict);
+ MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses);
}
for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
- STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
- STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+ MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict);
+ MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses);
}
#endif
#ifdef GC_MARK_ALIVE_STACKS
- if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
- gc_abort_mark_alive(interp, state, &stack);
+ if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) {
+ gc_abort_mark_alive(interp, state, &mark_args);
return -1;
}
#endif
- #undef STACK_PUSH
+ #undef MARK_ENQUEUE
// Use tp_traverse to find everything reachable from roots.
- if (propagate_alive_bits(&stack) < 0) {
- gc_abort_mark_alive(interp, state, &stack);
+ if (gc_propagate_alive(&mark_args) < 0) {
+ gc_abort_mark_alive(interp, state, &mark_args);
return -1;
}
+ assert(mark_args.spans.size == 0);
+ if (mark_args.spans.stack != NULL) {
+ PyMem_Free(mark_args.spans.stack);
+ }
+ assert(mark_args.stack.head == NULL);
+
return 0;
}
#endif // GC_ENABLE_MARK_ALIVE
@@ -1559,7 +1947,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
if (!state->gcstate->freeze_active) {
// Mark objects reachable from known roots as "alive". These will
// be ignored for rest of the GC pass.
- int err = mark_alive_from_roots(interp, state);
+ int err = gc_mark_alive_from_roots(interp, state);
if (err < 0) {
_PyEval_StartTheWorld(interp);
PyErr_NoMemory();