diff options
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/bytesobject.c | 41 | ||||
-rw-r--r-- | Objects/call.c | 35 | ||||
-rw-r--r-- | Objects/classobject.c | 4 | ||||
-rw-r--r-- | Objects/codeobject.c | 40 | ||||
-rw-r--r-- | Objects/complexobject.c | 28 | ||||
-rw-r--r-- | Objects/descrobject.c | 5 | ||||
-rw-r--r-- | Objects/dictobject.c | 23 | ||||
-rw-r--r-- | Objects/frameobject.c | 4 | ||||
-rw-r--r-- | Objects/funcobject.c | 41 | ||||
-rw-r--r-- | Objects/genericaliasobject.c | 11 | ||||
-rw-r--r-- | Objects/genobject.c | 71 | ||||
-rw-r--r-- | Objects/interpolationobject.c | 2 | ||||
-rw-r--r-- | Objects/listobject.c | 51 | ||||
-rw-r--r-- | Objects/listsort.txt | 175 | ||||
-rw-r--r-- | Objects/longobject.c | 67 | ||||
-rw-r--r-- | Objects/methodobject.c | 5 | ||||
-rw-r--r-- | Objects/moduleobject.c | 7 | ||||
-rw-r--r-- | Objects/namespaceobject.c | 15 | ||||
-rw-r--r-- | Objects/object.c | 279 | ||||
-rw-r--r-- | Objects/obmalloc.c | 133 | ||||
-rw-r--r-- | Objects/odictobject.c | 4 | ||||
-rw-r--r-- | Objects/picklebufobject.c | 4 | ||||
-rw-r--r-- | Objects/setobject.c | 4 | ||||
-rw-r--r-- | Objects/templateobject.c | 5 | ||||
-rw-r--r-- | Objects/typeobject.c | 730 | ||||
-rw-r--r-- | Objects/typevarobject.c | 4 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 73 | ||||
-rw-r--r-- | Objects/unionobject.c | 13 |
28 files changed, 1398 insertions, 476 deletions
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index fc407ec6bf9..87ea1162e03 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1075,10 +1075,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len, } /* Unescape a backslash-escaped string. */ -PyObject *_PyBytes_DecodeEscape(const char *s, +PyObject *_PyBytes_DecodeEscape2(const char *s, Py_ssize_t len, const char *errors, - const char **first_invalid_escape) + int *first_invalid_escape_char, + const char **first_invalid_escape_ptr) { int c; char *p; @@ -1092,7 +1093,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s, return NULL; writer.overallocate = 1; - *first_invalid_escape = NULL; + *first_invalid_escape_char = -1; + *first_invalid_escape_ptr = NULL; end = s + len; while (s < end) { @@ -1130,9 +1132,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s, c = (c<<3) + *s++ - '0'; } if (c > 0377) { - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-3; /* Back up 3 chars, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = c; + /* Back up 3 chars, since we've already incremented s. */ + *first_invalid_escape_ptr = s - 3; } } *p++ = c; @@ -1173,9 +1176,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s, break; default: - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-1; /* Back up one char, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = (unsigned char)s[-1]; + /* Back up one char, since we've already incremented s. 
*/ + *first_invalid_escape_ptr = s - 1; } *p++ = '\\'; s--; @@ -1195,18 +1199,19 @@ PyObject *PyBytes_DecodeEscape(const char *s, Py_ssize_t Py_UNUSED(unicode), const char *Py_UNUSED(recode_encoding)) { - const char* first_invalid_escape; - PyObject *result = _PyBytes_DecodeEscape(s, len, errors, - &first_invalid_escape); + int first_invalid_escape_char; + const char *first_invalid_escape_ptr; + PyObject *result = _PyBytes_DecodeEscape2(s, len, errors, + &first_invalid_escape_char, + &first_invalid_escape_ptr); if (result == NULL) return NULL; - if (first_invalid_escape != NULL) { - unsigned char c = *first_invalid_escape; - if ('4' <= c && c <= '7') { + if (first_invalid_escape_char != -1) { + if (first_invalid_escape_char > 0xff) { if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "b\"\\%.3s\" is an invalid octal escape sequence. " + "b\"\\%o\" is an invalid octal escape sequence. " "Such sequences will not work in the future. ", - first_invalid_escape) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; @@ -1216,7 +1221,7 @@ PyObject *PyBytes_DecodeEscape(const char *s, if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "b\"\\%c\" is an invalid escape sequence. " "Such sequences will not work in the future. 
", - c) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; diff --git a/Objects/call.c b/Objects/call.c index b1610dababd..c9a18bcc3da 100644 --- a/Objects/call.c +++ b/Objects/call.c @@ -834,12 +834,15 @@ PyObject_VectorcallMethod(PyObject *name, PyObject *const *args, assert(PyVectorcall_NARGS(nargsf) >= 1); PyThreadState *tstate = _PyThreadState_GET(); - PyObject *callable = NULL; + _PyCStackRef method; + _PyThreadState_PushCStackRef(tstate, &method); /* Use args[0] as "self" argument */ - int unbound = _PyObject_GetMethod(args[0], name, &callable); - if (callable == NULL) { + int unbound = _PyObject_GetMethodStackRef(tstate, args[0], name, &method.ref); + if (PyStackRef_IsNull(method.ref)) { + _PyThreadState_PopCStackRef(tstate, &method); return NULL; } + PyObject *callable = PyStackRef_AsPyObjectBorrow(method.ref); if (unbound) { /* We must remove PY_VECTORCALL_ARGUMENTS_OFFSET since @@ -855,7 +858,7 @@ PyObject_VectorcallMethod(PyObject *name, PyObject *const *args, EVAL_CALL_STAT_INC_IF_FUNCTION(EVAL_CALL_METHOD, callable); PyObject *result = _PyObject_VectorcallTstate(tstate, callable, args, nargsf, kwnames); - Py_DECREF(callable); + _PyThreadState_PopCStackRef(tstate, &method); return result; } @@ -868,11 +871,14 @@ PyObject_CallMethodObjArgs(PyObject *obj, PyObject *name, ...) return null_error(tstate); } - PyObject *callable = NULL; - int is_method = _PyObject_GetMethod(obj, name, &callable); - if (callable == NULL) { + _PyCStackRef method; + _PyThreadState_PushCStackRef(tstate, &method); + int is_method = _PyObject_GetMethodStackRef(tstate, obj, name, &method.ref); + if (PyStackRef_IsNull(method.ref)) { + _PyThreadState_PopCStackRef(tstate, &method); return NULL; } + PyObject *callable = PyStackRef_AsPyObjectBorrow(method.ref); obj = is_method ? obj : NULL; va_list vargs; @@ -880,7 +886,7 @@ PyObject_CallMethodObjArgs(PyObject *obj, PyObject *name, ...) 
PyObject *result = object_vacall(tstate, obj, callable, vargs); va_end(vargs); - Py_DECREF(callable); + _PyThreadState_PopCStackRef(tstate, &method); return result; } @@ -897,12 +903,15 @@ _PyObject_CallMethodIdObjArgs(PyObject *obj, _Py_Identifier *name, ...) if (!oname) { return NULL; } - - PyObject *callable = NULL; - int is_method = _PyObject_GetMethod(obj, oname, &callable); - if (callable == NULL) { + _PyCStackRef method; + _PyThreadState_PushCStackRef(tstate, &method); + int is_method = _PyObject_GetMethodStackRef(tstate, obj, oname, &method.ref); + if (PyStackRef_IsNull(method.ref)) { + _PyThreadState_PopCStackRef(tstate, &method); return NULL; } + PyObject *callable = PyStackRef_AsPyObjectBorrow(method.ref); + obj = is_method ? obj : NULL; va_list vargs; @@ -910,7 +919,7 @@ _PyObject_CallMethodIdObjArgs(PyObject *obj, _Py_Identifier *name, ...) PyObject *result = object_vacall(tstate, obj, callable, vargs); va_end(vargs); - Py_DECREF(callable); + _PyThreadState_PopCStackRef(tstate, &method); return result; } diff --git a/Objects/classobject.c b/Objects/classobject.c index 58e1d179773..e71f301f2ef 100644 --- a/Objects/classobject.c +++ b/Objects/classobject.c @@ -7,6 +7,7 @@ #include "pycore_object.h" #include "pycore_pyerrors.h" #include "pycore_pystate.h" // _PyThreadState_GET() +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "clinic/classobject.c.h" @@ -245,8 +246,7 @@ method_dealloc(PyObject *self) { PyMethodObject *im = _PyMethodObject_CAST(self); _PyObject_GC_UNTRACK(im); - if (im->im_weakreflist != NULL) - PyObject_ClearWeakRefs((PyObject *)im); + FT_CLEAR_WEAKREFS(self, im->im_weakreflist); Py_DECREF(im->im_func); Py_XDECREF(im->im_self); assert(Py_IS_TYPE(self, &PyMethod_Type)); diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 4f06a36a130..ba178abc0c0 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -17,6 +17,7 @@ #include "pycore_tuple.h" // _PyTuple_ITEMS() #include "pycore_unicodeobject.h" // 
_PyUnicode_InternImmortal() #include "pycore_uniqueid.h" // _PyObject_AssignUniqueId() +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "clinic/codeobject.c.h" #include <stdbool.h> @@ -1714,7 +1715,7 @@ static int identify_unbound_names(PyThreadState *tstate, PyCodeObject *co, PyObject *globalnames, PyObject *attrnames, PyObject *globalsns, PyObject *builtinsns, - struct co_unbound_counts *counts) + struct co_unbound_counts *counts, int *p_numdupes) { // This function is inspired by inspect.getclosurevars(). // It would be nicer if we had something similar to co_localspluskinds, @@ -1729,6 +1730,7 @@ identify_unbound_names(PyThreadState *tstate, PyCodeObject *co, assert(builtinsns == NULL || PyDict_Check(builtinsns)); assert(counts == NULL || counts->total == 0); struct co_unbound_counts unbound = {0}; + int numdupes = 0; Py_ssize_t len = Py_SIZE(co); for (int i = 0; i < len; i += _PyInstruction_GetLength(co, i)) { _Py_CODEUNIT inst = _Py_GetBaseCodeUnit(co, i); @@ -1747,6 +1749,12 @@ identify_unbound_names(PyThreadState *tstate, PyCodeObject *co, if (PySet_Add(attrnames, name) < 0) { return -1; } + if (PySet_Contains(globalnames, name)) { + if (_PyErr_Occurred(tstate)) { + return -1; + } + numdupes += 1; + } } else if (inst.op.code == LOAD_GLOBAL) { int oparg = GET_OPARG(co, i, inst.op.arg); @@ -1778,11 +1786,20 @@ identify_unbound_names(PyThreadState *tstate, PyCodeObject *co, if (PySet_Add(globalnames, name) < 0) { return -1; } + if (PySet_Contains(attrnames, name)) { + if (_PyErr_Occurred(tstate)) { + return -1; + } + numdupes += 1; + } } } if (counts != NULL) { *counts = unbound; } + if (p_numdupes != NULL) { + *p_numdupes = numdupes; + } return 0; } @@ -1932,20 +1949,24 @@ _PyCode_SetUnboundVarCounts(PyThreadState *tstate, // Fill in unbound.globals and unbound.numattrs. 
struct co_unbound_counts unbound = {0}; + int numdupes = 0; Py_BEGIN_CRITICAL_SECTION(co); res = identify_unbound_names( tstate, co, globalnames, attrnames, globalsns, builtinsns, - &unbound); + &unbound, &numdupes); Py_END_CRITICAL_SECTION(); if (res < 0) { goto finally; } assert(unbound.numunknown == 0); - assert(unbound.total <= counts->unbound.total); + assert(unbound.total - numdupes <= counts->unbound.total); assert(counts->unbound.numunknown == counts->unbound.total); - unbound.numunknown = counts->unbound.total - unbound.total; - unbound.total = counts->unbound.total; + // There may be a name that is both a global and an attr. + int totalunbound = counts->unbound.total + numdupes; + unbound.numunknown = totalunbound - unbound.total; + unbound.total = totalunbound; counts->unbound = unbound; + counts->total += numdupes; res = 0; finally: @@ -1979,7 +2000,6 @@ _PyCode_CheckNoExternalState(PyCodeObject *co, _PyCode_var_counts_t *counts, const char **p_errmsg) { const char *errmsg = NULL; - assert(counts->locals.hidden.total == 0); if (counts->numfree > 0) { // It's a closure. 
errmsg = "closures not supported"; } @@ -2364,6 +2384,8 @@ free_monitoring_data(_PyCoMonitoringData *data) static void code_dealloc(PyObject *self) { + PyThreadState *tstate = PyThreadState_GET(); + _Py_atomic_add_uint64(&tstate->interp->_code_object_generation, 1); PyCodeObject *co = _PyCodeObject_CAST(self); _PyObject_ResurrectStart(self); notify_code_watchers(PY_CODE_EVENT_DESTROY, co); @@ -2415,9 +2437,7 @@ code_dealloc(PyObject *self) Py_XDECREF(co->_co_cached->_co_varnames); PyMem_Free(co->_co_cached); } - if (co->co_weakreflist != NULL) { - PyObject_ClearWeakRefs(self); - } + FT_CLEAR_WEAKREFS(self, co->co_weakreflist); free_monitoring_data(co->_co_monitoring); #ifdef Py_GIL_DISABLED // The first element always points to the mutable bytecode at the end of @@ -3348,7 +3368,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx) } memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *)); _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc); - _PyMem_FreeDelayed(tlbc); + _PyMem_FreeDelayed(tlbc, tlbc->size * sizeof(void *)); tlbc = new_tlbc; } char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co)); diff --git a/Objects/complexobject.c b/Objects/complexobject.c index c2dd320ae73..b66ebe131ae 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -1,4 +1,3 @@ - /* Complex object implementation */ /* Borrows heavily from floatobject.c */ @@ -9,6 +8,7 @@ #include "pycore_call.h" // _PyObject_CallNoArgs() #include "pycore_complexobject.h" // _PyComplex_FormatAdvancedWriter() #include "pycore_floatobject.h" // _Py_convert_int_to_double() +#include "pycore_freelist.h" // _Py_FREELIST_FREE(), _Py_FREELIST_POP() #include "pycore_long.h" // _PyLong_GetZero() #include "pycore_object.h" // _PyObject_Init() #include "pycore_pymath.h" // _Py_ADJUST_ERANGE2() @@ -410,16 +410,32 @@ complex_subtype_from_c_complex(PyTypeObject *type, Py_complex cval) PyObject * PyComplex_FromCComplex(Py_complex cval) { - /* Inline PyObject_New */ - PyComplexObject *op = 
PyObject_Malloc(sizeof(PyComplexObject)); + PyComplexObject *op = _Py_FREELIST_POP(PyComplexObject, complexes); + if (op == NULL) { - return PyErr_NoMemory(); + /* Inline PyObject_New */ + op = PyObject_Malloc(sizeof(PyComplexObject)); + if (op == NULL) { + return PyErr_NoMemory(); + } + _PyObject_Init((PyObject*)op, &PyComplex_Type); } - _PyObject_Init((PyObject*)op, &PyComplex_Type); op->cval = cval; return (PyObject *) op; } +static void +complex_dealloc(PyObject *op) +{ + assert(PyComplex_Check(op)); + if (PyComplex_CheckExact(op)) { + _Py_FREELIST_FREE(complexes, op, PyObject_Free); + } + else { + Py_TYPE(op)->tp_free(op); + } +} + static PyObject * complex_subtype_from_doubles(PyTypeObject *type, double real, double imag) { @@ -1383,7 +1399,7 @@ PyTypeObject PyComplex_Type = { "complex", sizeof(PyComplexObject), 0, - 0, /* tp_dealloc */ + complex_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ 0, /* tp_getattr */ 0, /* tp_setattr */ diff --git a/Objects/descrobject.c b/Objects/descrobject.c index 10c465b95ac..d3d17e92b6d 100644 --- a/Objects/descrobject.c +++ b/Objects/descrobject.c @@ -1233,7 +1233,10 @@ static PyObject * mappingproxy_richcompare(PyObject *self, PyObject *w, int op) { mappingproxyobject *v = (mappingproxyobject *)self; - return PyObject_RichCompare(v->mapping, w, op); + if (op == Py_EQ || op == Py_NE) { + return PyObject_RichCompare(v->mapping, w, op); + } + Py_RETURN_NOTIMPLEMENTED; } static int diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 32356f0634d..6b7b150f0e2 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -547,13 +547,13 @@ static inline uint8_t calculate_log2_keysize(Py_ssize_t minsize) { #if SIZEOF_LONG == SIZEOF_SIZE_T - minsize = (minsize | PyDict_MINSIZE) - 1; - return _Py_bit_length(minsize | (PyDict_MINSIZE-1)); + minsize = Py_MAX(minsize, PyDict_MINSIZE); + return _Py_bit_length(minsize - 1); #elif defined(_MSC_VER) - // On 64bit Windows, sizeof(long) == 4. 
- minsize = (minsize | PyDict_MINSIZE) - 1; + // On 64bit Windows, sizeof(long) == 4. We cannot use _Py_bit_length. + minsize = Py_MAX(minsize, PyDict_MINSIZE); unsigned long msb; - _BitScanReverse64(&msb, (uint64_t)minsize); + _BitScanReverse64(&msb, (uint64_t)minsize - 1); return (uint8_t)(msb + 1); #else uint8_t log2_size; @@ -813,7 +813,7 @@ free_keys_object(PyDictKeysObject *keys, bool use_qsbr) { #ifdef Py_GIL_DISABLED if (use_qsbr) { - _PyMem_FreeDelayed(keys); + _PyMem_FreeDelayed(keys, _PyDict_KeysSize(keys)); return; } #endif @@ -858,7 +858,7 @@ free_values(PyDictValues *values, bool use_qsbr) assert(values->embedded == 0); #ifdef Py_GIL_DISABLED if (use_qsbr) { - _PyMem_FreeDelayed(values); + _PyMem_FreeDelayed(values, values_size_from_count(values->capacity)); return; } #endif @@ -2916,6 +2916,11 @@ clear_lock_held(PyObject *op) } void +_PyDict_Clear_LockHeld(PyObject *op) { + clear_lock_held(op); +} + +void PyDict_Clear(PyObject *op) { Py_BEGIN_CRITICAL_SECTION(op); @@ -3853,7 +3858,7 @@ dict_dict_merge(PyInterpreterState *interp, PyDictObject *mp, PyDictObject *othe } } - Py_ssize_t orig_size = other->ma_keys->dk_nentries; + Py_ssize_t orig_size = other->ma_used; Py_ssize_t pos = 0; Py_hash_t hash; PyObject *key, *value; @@ -3887,7 +3892,7 @@ dict_dict_merge(PyInterpreterState *interp, PyDictObject *mp, PyDictObject *othe if (err != 0) return -1; - if (orig_size != other->ma_keys->dk_nentries) { + if (orig_size != other->ma_used) { PyErr_SetString(PyExc_RuntimeError, "dict mutated during update"); return -1; diff --git a/Objects/frameobject.c b/Objects/frameobject.c index 76b52efccf8..601fc69c4b1 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -1386,6 +1386,10 @@ mark_stacks(PyCodeObject *code_obj, int len) stacks[j] = next_stack; break; case GET_ITER: + next_stack = push_value(pop_value(next_stack), Iterator); + next_stack = push_value(next_stack, Iterator); + stacks[next_i] = next_stack; + break; case GET_AITER: next_stack = 
push_value(pop_value(next_stack), Iterator); stacks[next_i] = next_stack; diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 27214a129c2..9532c21fc70 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -10,6 +10,7 @@ #include "pycore_pyerrors.h" // _PyErr_Occurred() #include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_stats.h" +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() static const char * @@ -1148,9 +1149,7 @@ func_dealloc(PyObject *self) return; } _PyObject_GC_UNTRACK(op); - if (op->func_weakreflist != NULL) { - PyObject_ClearWeakRefs((PyObject *) op); - } + FT_CLEAR_WEAKREFS(self, op->func_weakreflist); (void)func_clear((PyObject*)op); // These aren't cleared by func_clear(). _Py_DECREF_CODE((PyCodeObject *)op->func_code); @@ -1256,7 +1255,7 @@ _PyFunction_VerifyStateless(PyThreadState *tstate, PyObject *func) return -1; } // Check the builtins. - PyObject *builtinsns = PyFunction_GET_BUILTINS(func); + PyObject *builtinsns = _PyFunction_GET_BUILTINS(func); if (builtinsns != NULL && !PyDict_Check(builtinsns)) { _PyErr_Format(tstate, PyExc_TypeError, "unsupported builtins %R", builtinsns); @@ -1264,26 +1263,32 @@ _PyFunction_VerifyStateless(PyThreadState *tstate, PyObject *func) } // Disallow __defaults__. PyObject *defaults = PyFunction_GET_DEFAULTS(func); - if (defaults != NULL && defaults != Py_None && PyDict_Size(defaults) > 0) - { - _PyErr_SetString(tstate, PyExc_ValueError, "defaults not supported"); - return -1; + if (defaults != NULL) { + assert(PyTuple_Check(defaults)); // per PyFunction_New() + if (PyTuple_GET_SIZE(defaults) > 0) { + _PyErr_SetString(tstate, PyExc_ValueError, + "defaults not supported"); + return -1; + } } // Disallow __kwdefaults__. 
PyObject *kwdefaults = PyFunction_GET_KW_DEFAULTS(func); - if (kwdefaults != NULL && kwdefaults != Py_None - && PyDict_Size(kwdefaults) > 0) - { - _PyErr_SetString(tstate, PyExc_ValueError, - "keyword defaults not supported"); - return -1; + if (kwdefaults != NULL) { + assert(PyDict_Check(kwdefaults)); // per PyFunction_New() + if (PyDict_Size(kwdefaults) > 0) { + _PyErr_SetString(tstate, PyExc_ValueError, + "keyword defaults not supported"); + return -1; + } } // Disallow __closure__. PyObject *closure = PyFunction_GET_CLOSURE(func); - if (closure != NULL && closure != Py_None && PyTuple_GET_SIZE(closure) > 0) - { - _PyErr_SetString(tstate, PyExc_ValueError, "closures not supported"); - return -1; + if (closure != NULL) { + assert(PyTuple_Check(closure)); // per PyFunction_New() + if (PyTuple_GET_SIZE(closure) > 0) { + _PyErr_SetString(tstate, PyExc_ValueError, "closures not supported"); + return -1; + } } // Check the code. PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func); diff --git a/Objects/genericaliasobject.c b/Objects/genericaliasobject.c index ec3d01f00a3..3bb961aa2b6 100644 --- a/Objects/genericaliasobject.c +++ b/Objects/genericaliasobject.c @@ -7,6 +7,7 @@ #include "pycore_typevarobject.h" // _Py_typing_type_repr #include "pycore_unicodeobject.h" // _PyUnicode_EqualToASCIIString() #include "pycore_unionobject.h" // _Py_union_type_or, _PyGenericAlias_Check +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include <stdbool.h> @@ -33,9 +34,7 @@ ga_dealloc(PyObject *self) gaobject *alias = (gaobject *)self; _PyObject_GC_UNTRACK(self); - if (alias->weakreflist != NULL) { - PyObject_ClearWeakRefs((PyObject *)alias); - } + FT_CLEAR_WEAKREFS(self, alias->weakreflist); Py_XDECREF(alias->origin); Py_XDECREF(alias->args); Py_XDECREF(alias->parameters); @@ -65,7 +64,7 @@ ga_repr_items_list(PyUnicodeWriter *writer, PyObject *p) for (Py_ssize_t i = 0; i < len; i++) { if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if 
(PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { return -1; } } @@ -109,7 +108,7 @@ ga_repr(PyObject *self) } for (Py_ssize_t i = 0; i < len; i++) { if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { goto error; } } @@ -126,7 +125,7 @@ ga_repr(PyObject *self) } if (len == 0) { // for something like tuple[()] we should print a "()" - if (PyUnicodeWriter_WriteUTF8(writer, "()", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "()", 2) < 0) { goto error; } } diff --git a/Objects/genobject.c b/Objects/genobject.c index 98b2c5004df..3e7d6257006 100644 --- a/Objects/genobject.c +++ b/Objects/genobject.c @@ -17,6 +17,7 @@ #include "pycore_pyerrors.h" // _PyErr_ClearExcState() #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_warnings.h" // _PyErr_WarnUnawaitedCoroutine() +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "opcode_ids.h" // RESUME, etc @@ -161,8 +162,7 @@ gen_dealloc(PyObject *self) _PyObject_GC_UNTRACK(gen); - if (gen->gi_weakreflist != NULL) - PyObject_ClearWeakRefs(self); + FT_CLEAR_WEAKREFS(self, gen->gi_weakreflist); _PyObject_GC_TRACK(self); @@ -704,7 +704,8 @@ static PyObject * gen_get_name(PyObject *self, void *Py_UNUSED(ignored)) { PyGenObject *op = _PyGen_CAST(self); - return Py_NewRef(op->gi_name); + PyObject *name = FT_ATOMIC_LOAD_PTR_ACQUIRE(op->gi_name); + return Py_NewRef(name); } static int @@ -718,7 +719,11 @@ gen_set_name(PyObject *self, PyObject *value, void *Py_UNUSED(ignored)) "__name__ must be set to a string object"); return -1; } - Py_XSETREF(op->gi_name, Py_NewRef(value)); + Py_BEGIN_CRITICAL_SECTION(self); + // gh-133931: To prevent use-after-free from other threads that reference + // the gi_name. 
+ _PyObject_XSetRefDelayed(&op->gi_name, Py_NewRef(value)); + Py_END_CRITICAL_SECTION(); return 0; } @@ -726,7 +731,8 @@ static PyObject * gen_get_qualname(PyObject *self, void *Py_UNUSED(ignored)) { PyGenObject *op = _PyGen_CAST(self); - return Py_NewRef(op->gi_qualname); + PyObject *qualname = FT_ATOMIC_LOAD_PTR_ACQUIRE(op->gi_qualname); + return Py_NewRef(qualname); } static int @@ -740,7 +746,11 @@ gen_set_qualname(PyObject *self, PyObject *value, void *Py_UNUSED(ignored)) "__qualname__ must be set to a string object"); return -1; } - Py_XSETREF(op->gi_qualname, Py_NewRef(value)); + Py_BEGIN_CRITICAL_SECTION(self); + // gh-133931: To prevent use-after-free from other threads that reference + // the gi_qualname. + _PyObject_XSetRefDelayed(&op->gi_qualname, Py_NewRef(value)); + Py_END_CRITICAL_SECTION(); return 0; } @@ -1451,7 +1461,9 @@ typedef struct PyAsyncGenAThrow { /* Can be NULL, when in the "aclose()" mode (equivalent of "athrow(GeneratorExit)") */ - PyObject *agt_args; + PyObject *agt_typ; + PyObject *agt_tb; + PyObject *agt_val; AwaitableState agt_state; } PyAsyncGenAThrow; @@ -2078,7 +2090,9 @@ async_gen_athrow_dealloc(PyObject *self) _PyObject_GC_UNTRACK(self); Py_CLEAR(agt->agt_gen); - Py_CLEAR(agt->agt_args); + Py_XDECREF(agt->agt_typ); + Py_XDECREF(agt->agt_tb); + Py_XDECREF(agt->agt_val); PyObject_GC_Del(self); } @@ -2088,7 +2102,9 @@ async_gen_athrow_traverse(PyObject *self, visitproc visit, void *arg) { PyAsyncGenAThrow *agt = _PyAsyncGenAThrow_CAST(self); Py_VISIT(agt->agt_gen); - Py_VISIT(agt->agt_args); + Py_VISIT(agt->agt_typ); + Py_VISIT(agt->agt_tb); + Py_VISIT(agt->agt_val); return 0; } @@ -2116,7 +2132,7 @@ async_gen_athrow_send(PyObject *self, PyObject *arg) if (o->agt_state == AWAITABLE_STATE_INIT) { if (o->agt_gen->ag_running_async) { o->agt_state = AWAITABLE_STATE_CLOSED; - if (o->agt_args == NULL) { + if (o->agt_typ == NULL) { PyErr_SetString( PyExc_RuntimeError, "aclose(): asynchronous generator is already running"); @@ -2143,7 
+2159,7 @@ async_gen_athrow_send(PyObject *self, PyObject *arg) o->agt_state = AWAITABLE_STATE_ITER; o->agt_gen->ag_running_async = 1; - if (o->agt_args == NULL) { + if (o->agt_typ == NULL) { /* aclose() mode */ o->agt_gen->ag_closed = 1; @@ -2157,19 +2173,10 @@ async_gen_athrow_send(PyObject *self, PyObject *arg) goto yield_close; } } else { - PyObject *typ; - PyObject *tb = NULL; - PyObject *val = NULL; - - if (!PyArg_UnpackTuple(o->agt_args, "athrow", 1, 3, - &typ, &val, &tb)) { - return NULL; - } - retval = _gen_throw((PyGenObject *)gen, 0, /* Do not close generator when PyExc_GeneratorExit is passed */ - typ, val, tb); + o->agt_typ, o->agt_val, o->agt_tb); retval = async_gen_unwrap_value(o->agt_gen, retval); } if (retval == NULL) { @@ -2181,7 +2188,7 @@ async_gen_athrow_send(PyObject *self, PyObject *arg) assert(o->agt_state == AWAITABLE_STATE_ITER); retval = gen_send((PyObject *)gen, arg); - if (o->agt_args) { + if (o->agt_typ) { return async_gen_unwrap_value(o->agt_gen, retval); } else { /* aclose() mode */ @@ -2212,7 +2219,7 @@ check_error: if (PyErr_ExceptionMatches(PyExc_StopAsyncIteration) || PyErr_ExceptionMatches(PyExc_GeneratorExit)) { - if (o->agt_args == NULL) { + if (o->agt_typ == NULL) { /* when aclose() is called we don't want to propagate StopAsyncIteration or GeneratorExit; just raise StopIteration, signalling that this 'aclose()' await @@ -2241,7 +2248,7 @@ async_gen_athrow_throw(PyObject *self, PyObject *const *args, Py_ssize_t nargs) if (o->agt_state == AWAITABLE_STATE_INIT) { if (o->agt_gen->ag_running_async) { o->agt_state = AWAITABLE_STATE_CLOSED; - if (o->agt_args == NULL) { + if (o->agt_typ == NULL) { PyErr_SetString( PyExc_RuntimeError, "aclose(): asynchronous generator is already running"); @@ -2259,7 +2266,7 @@ async_gen_athrow_throw(PyObject *self, PyObject *const *args, Py_ssize_t nargs) } PyObject *retval = gen_throw((PyObject*)o->agt_gen, args, nargs); - if (o->agt_args) { + if (o->agt_typ) { retval = 
async_gen_unwrap_value(o->agt_gen, retval); if (retval == NULL) { o->agt_gen->ag_running_async = 0; @@ -2334,7 +2341,7 @@ async_gen_athrow_finalize(PyObject *op) { PyAsyncGenAThrow *o = (PyAsyncGenAThrow*)op; if (o->agt_state == AWAITABLE_STATE_INIT) { - PyObject *method = o->agt_args ? &_Py_ID(athrow) : &_Py_ID(aclose); + PyObject *method = o->agt_typ ? &_Py_ID(athrow) : &_Py_ID(aclose); _PyErr_WarnUnawaitedAgenMethod(o->agt_gen, method); } } @@ -2403,13 +2410,23 @@ PyTypeObject _PyAsyncGenAThrow_Type = { static PyObject * async_gen_athrow_new(PyAsyncGenObject *gen, PyObject *args) { + PyObject *typ = NULL; + PyObject *tb = NULL; + PyObject *val = NULL; + if (args && !PyArg_UnpackTuple(args, "athrow", 1, 3, &typ, &val, &tb)) { + return NULL; + } + PyAsyncGenAThrow *o; o = PyObject_GC_New(PyAsyncGenAThrow, &_PyAsyncGenAThrow_Type); if (o == NULL) { return NULL; } o->agt_gen = (PyAsyncGenObject*)Py_NewRef(gen); - o->agt_args = Py_XNewRef(args); + o->agt_typ = Py_XNewRef(typ); + o->agt_tb = Py_XNewRef(tb); + o->agt_val = Py_XNewRef(val); + o->agt_state = AWAITABLE_STATE_INIT; _PyObject_GC_TRACK((PyObject*)o); return (PyObject*)o; diff --git a/Objects/interpolationobject.c b/Objects/interpolationobject.c index aaea3b8c067..a5d407a7b0e 100644 --- a/Objects/interpolationobject.c +++ b/Objects/interpolationobject.c @@ -137,6 +137,8 @@ interpolation_reduce(PyObject *op, PyObject *Py_UNUSED(dummy)) static PyMethodDef interpolation_methods[] = { {"__reduce__", interpolation_reduce, METH_NOARGS, PyDoc_STR("__reduce__() -> (cls, state)")}, + {"__class_getitem__", Py_GenericAlias, + METH_O|METH_CLASS, PyDoc_STR("See PEP 585")}, {NULL, NULL}, }; diff --git a/Objects/listobject.c b/Objects/listobject.c index c5895645a2d..1b36f4c25ab 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -61,7 +61,8 @@ free_list_items(PyObject** items, bool use_qsbr) #ifdef Py_GIL_DISABLED _PyListArray *array = _Py_CONTAINER_OF(items, _PyListArray, ob_item); if (use_qsbr) { - 
_PyMem_FreeDelayed(array); + size_t size = sizeof(_PyListArray) + array->allocated * sizeof(PyObject *); + _PyMem_FreeDelayed(array, size); } else { PyMem_Free(array); @@ -1684,10 +1685,7 @@ sortslice_advance(sortslice *slice, Py_ssize_t n) /* Avoid malloc for small temp arrays. */ #define MERGESTATE_TEMP_SIZE 256 -/* The largest value of minrun. This must be a power of 2, and >= 1, so that - * the compute_minrun() algorithm guarantees to return a result no larger than - * this, - */ +/* The largest value of minrun. This must be a power of 2, and >= 1 */ #define MAX_MINRUN 64 #if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1)) #error "MAX_MINRUN must be a power of 2, and >= 1" @@ -1748,6 +1746,11 @@ struct s_MergeState { * of tuples. It may be set to safe_object_compare, but the idea is that hopefully * we can assume more, and use one of the special-case compares. */ int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *); + + /* Varisbles used for minrun computation. The "ideal" minrun length is + * the infinite precision listlen / 2**e. See listsort.txt. + */ + Py_ssize_t mr_current, mr_e, mr_mask; }; /* binarysort is the best method for sorting small arrays: it does few @@ -2209,6 +2212,14 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc, ms->min_gallop = MIN_GALLOP; ms->listlen = list_size; ms->basekeys = lo->keys; + + /* State for generating minrun values. See listsort.txt. */ + ms->mr_e = 0; + while (list_size >> ms->mr_e >= MAX_MINRUN) { + ++ms->mr_e; + } + ms->mr_mask = (1 << ms->mr_e) - 1; + ms->mr_current = 0; } /* Free all the temp memory owned by the MergeState. This must be called @@ -2686,27 +2697,15 @@ merge_force_collapse(MergeState *ms) return 0; } -/* Compute a good value for the minimum run length; natural runs shorter - * than this are boosted artificially via binary insertion. - * - * If n < MAX_MINRUN return n (it's too small to bother with fancy stuff). 
- * Else if n is an exact power of 2, return MAX_MINRUN / 2. - * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is - * close to, but strictly less than, an exact power of 2. - * - * See listsort.txt for more info. - */ -static Py_ssize_t -merge_compute_minrun(Py_ssize_t n) +/* Return the next minrun value to use. See listsort.txt. */ +Py_LOCAL_INLINE(Py_ssize_t) +minrun_next(MergeState *ms) { - Py_ssize_t r = 0; /* becomes 1 if any 1 bits are shifted off */ - - assert(n >= 0); - while (n >= MAX_MINRUN) { - r |= n & 1; - n >>= 1; - } - return n + r; + ms->mr_current += ms->listlen; + assert(ms->mr_current >= 0); /* no overflow */ + Py_ssize_t result = ms->mr_current >> ms->mr_e; + ms->mr_current &= ms->mr_mask; + return result; } /* Here we define custom comparison functions to optimize for the cases one commonly @@ -3074,7 +3073,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* March over the array once, left to right, finding natural runs, * and extending short natural runs to minrun elements. */ - minrun = merge_compute_minrun(nremaining); do { Py_ssize_t n; @@ -3083,6 +3081,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) if (n < 0) goto fail; /* If short, extend to min(minrun, nremaining). */ + minrun = minrun_next(&ms); if (n < minrun) { const Py_ssize_t force = nremaining <= minrun ? nremaining : minrun; diff --git a/Objects/listsort.txt b/Objects/listsort.txt index f387d9c116e..5b2fc7d50a2 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -270,8 +270,8 @@ result. This has two primary good effects: Computing minrun ---------------- -If N < MAX_MINRUN, minrun is N. IOW, binary insertion sort is used for the -whole array then; it's hard to beat that given the overheads of trying +If N < MAX_MINRUN, minrun is N. IOW, binary insertion sort is used for the +whole array then; it's hard to beat that given the overheads of trying something fancier (see note BINSORT). 
When N is a power of 2, testing on random data showed that minrun values of @@ -288,7 +288,6 @@ that 32 isn't a good choice for the general case! Consider N=2112: >>> divmod(2112, 32) (66, 0) ->>> If the data is randomly ordered, we're very likely to end up with 66 runs each of length 32. The first 64 of these trigger a sequence of perfectly @@ -301,22 +300,94 @@ to get 64 elements into place). If we take minrun=33 in this case, then we're very likely to end up with 64 runs each of length 33, and then all merges are perfectly balanced. Better! -What we want to avoid is picking minrun such that in +The original code used a cheap heuristic to pick a minrun that avoided the +very worst cases of imbalance for the final merge, but "pretty bad" cases +still existed. - q, r = divmod(N, minrun) +In 2025, Stefan Pochmann found a much better approach, based on letting minrun +vary a bit from one run to the next. Under his scheme, at _all_ levels of the +merge tree: -q is a power of 2 and r>0 (then the last merge only gets r elements into -place, and r < minrun is small compared to N), or q a little larger than a -power of 2 regardless of r (then we've got a case similar to "2112", again -leaving too little work for the last merge to do). +- The number of runs is a power of 2. +- At most two different run lengths appear. +- When two do appear, the smaller is one less than the larger. +- The lengths of run pairs merged never differ by more than one. -Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that -N/minrun is exactly a power of 2, or if that isn't possible, is close to, but -strictly less than, a power of 2. This is easier to do than it may sound: -take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining -bits are set. In fact, that rule covers every case in this section, including -small N and exact powers of 2; merge_compute_minrun() is a deceptively simple -function. +So, in all respects, as perfectly balanced as possible. 
+ +For the 2112 case, that also keeps minrun at 33, but we were lucky there +that 2112 is 33 times a power of 2. The new approach doesn't rely on luck. + +For example, with 315 random elements, the old scheme uses fixed minrun=40 and +produces runs of length 40, except for the last. The new scheme produces a +mix of lengths 39 and 40: + +old: 40 40 40 40 40 40 40 35 +new: 39 39 40 39 39 40 39 40 + +Both schemes produce eight runs, a power of 2. That's good for a balanced +merge tree. But the new scheme allows merges where left and right length +never differ by more than 1: + +39 39 40 39 39 40 39 40 + 78 79 79 79 + 157 158 + 315 + +(This shows merges downward, e.g., two runs of length 39 are merged and +become a run of length 78.) + +With larger lists, the old scheme can get even more unbalanced. For example, +with 32769 elements (that's 2**15 + 1), it uses minrun=33 and produces 993 +runs (of length 33). That's not even a power of 2. The new scheme instead +produces 1024 runs, all with length 32 except for the last one with length 33. + +How does it work? Ideally, all runs would be exactly equally long. For the +above example, each run would have 315/8 = 39.375 elements. Which of course +doesn't work. But we can get close: + +For the first run, we'd like 39.375 elements. Since that's impossible, we +instead use 39 (the floor) and remember the current leftover fraction 0.375. +For the second run, we add 0.375 + 39.375 = 39.75. Again impossible, so we +instead use 39 and remember 0.75. For the third run, we add 0.75 + 39.375 = +40.125. This time we get 40 and remember 0.125. And so on. 
Here's a Python +generator doing that: + +def gen_minruns_with_floats(n): + mr = n + while mr >= MAX_MINRUN: + mr /= 2 + + mr_current = 0 + while True: + mr_current += mr + yield int(mr_current) + mr_current %= 1 + +But while all arithmetic here can be done exactly using binary floating point, +floats have less precision than a Py_ssize_t, and mixing floats with ints is +needlessly expensive anyway. + +So here's an integer version, where the internal numbers are scaled up by +2**e, or rather not divided by 2**e. Instead, only each yielded minrun gets +divided (by right-shifting). For example instead of adding 39.375 and +reducing modulo 1, it just adds 315 and reduces modulo 8. And always divides +by 8 to get each actual minrun value: + +def gen_minruns_simpler(n): + e = 0 + while (n >> e) >= MAX_MINRUN: + e += 1 + mask = (1 << e) - 1 + + mr_current = 0 + while True: + mr_current += n + yield mr_current >> e + mr_current &= mask + +See note MINRUN CODE for a full implementation and a driver that exhaustively +verifies the claims above for all list lengths through 2 million. The Merge Pattern @@ -820,3 +891,75 @@ partially mitigated by pre-scanning the data to determine whether the data is homogeneous with respect to type. If so, it is sometimes possible to substitute faster type-specific comparisons for the slower, generic PyObject_RichCompareBool. + +MINRUN CODE +from itertools import accumulate +try: + from itertools import batched +except ImportError: + from itertools import islice + def batched(xs, k): + it = iter(xs) + while chunk := tuple(islice(it, k)): + yield chunk + +MAX_MINRUN = 64 + +def gen_minruns(n): + # In listobject.c, initialization is done in merge_init(), and + # the body of the loop in minrun_next(). 
+ mr_e = 0 + while (n >> mr_e) >= MAX_MINRUN: + mr_e += 1 + mr_mask = (1 << mr_e) - 1 + + mr_current = 0 + while True: + mr_current += n + yield mr_current >> mr_e + mr_current &= mr_mask + +def chew(n, show=False): + if n < 1: + return + + sizes = [] + tot = 0 + for size in gen_minruns(n): + sizes.append(size) + tot += size + if tot >= n: + break + assert tot == n + print(n, len(sizes)) + + small, large = MAX_MINRUN // 2, MAX_MINRUN + while len(sizes) > 1: + assert not len(sizes) & 1 + assert len(sizes).bit_count() == 1 # i.e., power of 2 + assert sum(sizes) == n + assert min(sizes) >= min(n, small) + assert max(sizes) <= large + + d = set(sizes) + assert len(d) <= 2 + if len(d) == 2: + lo, hi = sorted(d) + assert lo + 1 == hi + + mr = n / len(sizes) + for i, s in enumerate(accumulate(sizes, initial=0)): + assert int(mr * i) == s + + newsizes = [] + for a, b in batched(sizes, 2): + assert abs(a - b) <= 1 + newsizes.append(a + b) + sizes = newsizes + small = large + large *= 2 + + assert sizes[0] == n + +for n in range(2_000_001): + chew(n)
\ No newline at end of file diff --git a/Objects/longobject.c b/Objects/longobject.c index 0b2dfa003fa..557bb6e1dd9 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -10,6 +10,7 @@ #include "pycore_long.h" // _Py_SmallInts #include "pycore_object.h" // _PyObject_Init() #include "pycore_runtime.h" // _PY_NSMALLPOSINTS +#include "pycore_stackref.h" #include "pycore_structseq.h" // _PyStructSequence_FiniBuiltin() #include "pycore_unicodeobject.h" // _PyUnicode_Equal() @@ -316,6 +317,33 @@ _PyLong_FromSTwoDigits(stwodigits x) return (PyLongObject*)_PyLong_FromLarge(x); } +/* Create a new medium int object from a medium int. + * Do not raise. Return NULL if not medium or can't allocate. */ +static inline _PyStackRef +medium_from_stwodigits(stwodigits x) +{ + if (IS_SMALL_INT(x)) { + return PyStackRef_FromPyObjectBorrow(get_small_int((sdigit)x)); + } + assert(x != 0); + if(!is_medium_int(x)) { + return PyStackRef_NULL; + } + PyLongObject *v = (PyLongObject *)_Py_FREELIST_POP(PyLongObject, ints); + if (v == NULL) { + v = PyObject_Malloc(sizeof(PyLongObject)); + if (v == NULL) { + return PyStackRef_NULL; + } + _PyObject_Init((PyObject*)v, &PyLong_Type); + } + digit abs_x = x < 0 ? (digit)(-x) : (digit)x; + _PyLong_SetSignAndDigitCount(v, x<0?-1:1, 1); + v->long_value.ob_digit[0] = abs_x; + return PyStackRef_FromPyObjectStealMortal((PyObject *)v); +} + + /* If a freshly-allocated int is already shared, it must be a small integer, so negating it must go to PyLong_FromLong */ Py_LOCAL_INLINE(void) @@ -971,16 +999,9 @@ _PyLong_FromByteArray(const unsigned char* bytes, size_t n, ++numsignificantbytes; } - /* How many Python int digits do we need? We have - 8*numsignificantbytes bits, and each Python int digit has - PyLong_SHIFT bits, so it's the ceiling of the quotient. 
*/ - /* catch overflow before it happens */ - if (numsignificantbytes > (PY_SSIZE_T_MAX - PyLong_SHIFT) / 8) { - PyErr_SetString(PyExc_OverflowError, - "byte array too long to convert to int"); - return NULL; - } - ndigits = (numsignificantbytes * 8 + PyLong_SHIFT - 1) / PyLong_SHIFT; + /* avoid integer overflow */ + ndigits = numsignificantbytes / PyLong_SHIFT * 8 + + (numsignificantbytes % PyLong_SHIFT * 8 + PyLong_SHIFT - 1) / PyLong_SHIFT; v = long_alloc(ndigits); if (v == NULL) return NULL; @@ -3778,10 +3799,12 @@ long_add(PyLongObject *a, PyLongObject *b) return z; } -PyObject * -_PyLong_Add(PyLongObject *a, PyLongObject *b) +_PyStackRef +_PyCompactLong_Add(PyLongObject *a, PyLongObject *b) { - return (PyObject*)long_add(a, b); + assert(_PyLong_BothAreCompact(a, b)); + stwodigits v = medium_value(a) + medium_value(b); + return medium_from_stwodigits(v); } static PyObject * @@ -3821,10 +3844,12 @@ long_sub(PyLongObject *a, PyLongObject *b) return z; } -PyObject * -_PyLong_Subtract(PyLongObject *a, PyLongObject *b) +_PyStackRef +_PyCompactLong_Subtract(PyLongObject *a, PyLongObject *b) { - return (PyObject*)long_sub(a, b); + assert(_PyLong_BothAreCompact(a, b)); + stwodigits v = medium_value(a) - medium_value(b); + return medium_from_stwodigits(v); } static PyObject * @@ -4268,10 +4293,14 @@ long_mul(PyLongObject *a, PyLongObject *b) return z; } -PyObject * -_PyLong_Multiply(PyLongObject *a, PyLongObject *b) +/* This function returns NULL if the result is not compact, + * or if it fails to allocate, but never raises */ +_PyStackRef +_PyCompactLong_Multiply(PyLongObject *a, PyLongObject *b) { - return (PyObject*)long_mul(a, b); + assert(_PyLong_BothAreCompact(a, b)); + stwodigits v = medium_value(a) * medium_value(b); + return medium_from_stwodigits(v); } static PyObject * diff --git a/Objects/methodobject.c b/Objects/methodobject.c index c3dcd09ad1c..e6e469ca270 100644 --- a/Objects/methodobject.c +++ b/Objects/methodobject.c @@ -8,6 +8,7 @@ #include 
"pycore_object.h" #include "pycore_pyerrors.h" #include "pycore_pystate.h" // _PyThreadState_GET() +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() /* undefine macro trampoline to PyCFunction_NewEx */ @@ -167,9 +168,7 @@ meth_dealloc(PyObject *self) { PyCFunctionObject *m = _PyCFunctionObject_CAST(self); PyObject_GC_UnTrack(m); - if (m->m_weakreflist != NULL) { - PyObject_ClearWeakRefs((PyObject*) m); - } + FT_CLEAR_WEAKREFS(self, m->m_weakreflist); // We need to access ml_flags here rather than later. // `m->m_ml` might have the same lifetime // as `m_self` when it's dynamically allocated. diff --git a/Objects/moduleobject.c b/Objects/moduleobject.c index f363ef173cb..862395e7881 100644 --- a/Objects/moduleobject.c +++ b/Objects/moduleobject.c @@ -12,8 +12,8 @@ #include "pycore_object.h" // _PyType_AllocNoTrack #include "pycore_pyerrors.h" // _PyErr_FormatFromCause() #include "pycore_pystate.h" // _PyInterpreterState_GET() -#include "pycore_sysmodule.h" // _PySys_GetOptionalAttrString() #include "pycore_unicodeobject.h" // _PyUnicode_EqualToASCIIString() +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "osdefs.h" // MAXPATHLEN @@ -827,8 +827,7 @@ module_dealloc(PyObject *self) if (verbose && m->md_name) { PySys_FormatStderr("# destroy %U\n", m->md_name); } - if (m->md_weaklist != NULL) - PyObject_ClearWeakRefs((PyObject *) m); + FT_CLEAR_WEAKREFS(self, m->md_weaklist); /* bpo-39824: Don't call m_free() if m_size > 0 and md_state=NULL */ if (m->md_def && m->md_def->m_free @@ -1058,7 +1057,7 @@ _Py_module_getattro_impl(PyModuleObject *m, PyObject *name, int suppress) int is_possibly_shadowing_stdlib = 0; if (is_possibly_shadowing) { PyObject *stdlib_modules; - if (_PySys_GetOptionalAttrString("stdlib_module_names", &stdlib_modules) < 0) { + if (PySys_GetOptionalAttrString("stdlib_module_names", &stdlib_modules) < 0) { goto done; } if (stdlib_modules && PyAnySet_Check(stdlib_modules)) { diff --git a/Objects/namespaceobject.c b/Objects/namespaceobject.c 
index caebe6bf543..201cb8a7df8 100644 --- a/Objects/namespaceobject.c +++ b/Objects/namespaceobject.c @@ -124,9 +124,10 @@ namespace_repr(PyObject *ns) if (PyUnicode_Check(key) && PyUnicode_GET_LENGTH(key) > 0) { PyObject *value, *item; - value = PyDict_GetItemWithError(d, key); - if (value != NULL) { + int has_key = PyDict_GetItemRef(d, key, &value); + if (has_key == 1) { item = PyUnicode_FromFormat("%U=%R", key, value); + Py_DECREF(value); if (item == NULL) { loop_error = 1; } @@ -135,7 +136,7 @@ namespace_repr(PyObject *ns) Py_DECREF(item); } } - else if (PyErr_Occurred()) { + else if (has_key < 0) { loop_error = 1; } } @@ -193,10 +194,14 @@ namespace_clear(PyObject *op) static PyObject * namespace_richcompare(PyObject *self, PyObject *other, int op) { - if (PyObject_TypeCheck(self, &_PyNamespace_Type) && - PyObject_TypeCheck(other, &_PyNamespace_Type)) + if ( + (op == Py_EQ || op == Py_NE) && + PyObject_TypeCheck(self, &_PyNamespace_Type) && + PyObject_TypeCheck(other, &_PyNamespace_Type) + ) { return PyObject_RichCompare(((_PyNamespaceObject *)self)->ns_dict, ((_PyNamespaceObject *)other)->ns_dict, op); + } Py_RETURN_NOTIMPLEMENTED; } diff --git a/Objects/object.c b/Objects/object.c index 723b0427e69..3ed7d55593d 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -925,6 +925,7 @@ _PyObject_ClearFreeLists(struct _Py_freelists *freelists, int is_finalization) // In the free-threaded build, freelists are per-PyThreadState and cleared in PyThreadState_Clear() // In the default build, freelists are per-interpreter and cleared in finalize_interp_types() clear_freelist(&freelists->floats, is_finalization, free_object); + clear_freelist(&freelists->complexes, is_finalization, free_object); for (Py_ssize_t i = 0; i < PyTuple_MAXSAVESIZE; i++) { clear_freelist(&freelists->tuples[i], is_finalization, free_object); } @@ -1130,11 +1131,14 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op) res = PyObject_RichCompare(v, w, op); if (res == NULL) return -1; - 
if (PyBool_Check(res)) + if (PyBool_Check(res)) { ok = (res == Py_True); - else + assert(_Py_IsImmortal(res)); + } + else { ok = PyObject_IsTrue(res); - Py_DECREF(res); + Py_DECREF(res); + } return ok; } @@ -1209,16 +1213,27 @@ PyObject_HasAttrString(PyObject *obj, const char *name) int PyObject_SetAttrString(PyObject *v, const char *name, PyObject *w) { - PyObject *s; - int res; + PyThreadState *tstate = _PyThreadState_GET(); + if (w == NULL && _PyErr_Occurred(tstate)) { + PyObject *exc = _PyErr_GetRaisedException(tstate); + _PyErr_SetString(tstate, PyExc_SystemError, + "PyObject_SetAttrString() must not be called with NULL value " + "and an exception set"); + _PyErr_ChainExceptions1Tstate(tstate, exc); + return -1; + } - if (Py_TYPE(v)->tp_setattr != NULL) + if (Py_TYPE(v)->tp_setattr != NULL) { return (*Py_TYPE(v)->tp_setattr)(v, (char*)name, w); - s = PyUnicode_InternFromString(name); - if (s == NULL) + } + + PyObject *s = PyUnicode_InternFromString(name); + if (s == NULL) { return -1; - res = PyObject_SetAttr(v, s, w); - Py_XDECREF(s); + } + + int res = PyObject_SetAttr(v, s, w); + Py_DECREF(s); return res; } @@ -1436,6 +1451,16 @@ PyObject_HasAttr(PyObject *obj, PyObject *name) int PyObject_SetAttr(PyObject *v, PyObject *name, PyObject *value) { + PyThreadState *tstate = _PyThreadState_GET(); + if (value == NULL && _PyErr_Occurred(tstate)) { + PyObject *exc = _PyErr_GetRaisedException(tstate); + _PyErr_SetString(tstate, PyExc_SystemError, + "PyObject_SetAttr() must not be called with NULL value " + "and an exception set"); + _PyErr_ChainExceptions1Tstate(tstate, exc); + return -1; + } + PyTypeObject *tp = Py_TYPE(v); int err; @@ -1447,8 +1472,7 @@ PyObject_SetAttr(PyObject *v, PyObject *name, PyObject *value) } Py_INCREF(name); - PyInterpreterState *interp = _PyInterpreterState_GET(); - _PyUnicode_InternMortal(interp, &name); + _PyUnicode_InternMortal(tstate->interp, &name); if (tp->tp_setattro != NULL) { err = (*tp->tp_setattro)(v, name, value); 
Py_DECREF(name); @@ -1664,6 +1688,116 @@ _PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method) return 0; } +int +_PyObject_GetMethodStackRef(PyThreadState *ts, PyObject *obj, + PyObject *name, _PyStackRef *method) +{ + int meth_found = 0; + + assert(PyStackRef_IsNull(*method)); + + PyTypeObject *tp = Py_TYPE(obj); + if (!_PyType_IsReady(tp)) { + if (PyType_Ready(tp) < 0) { + return 0; + } + } + + if (tp->tp_getattro != PyObject_GenericGetAttr || !PyUnicode_CheckExact(name)) { + PyObject *res = PyObject_GetAttr(obj, name); + if (res != NULL) { + *method = PyStackRef_FromPyObjectSteal(res); + } + return 0; + } + + _PyType_LookupStackRefAndVersion(tp, name, method); + PyObject *descr = PyStackRef_AsPyObjectBorrow(*method); + descrgetfunc f = NULL; + if (descr != NULL) { + if (_PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_METHOD_DESCRIPTOR)) { + meth_found = 1; + } + else { + f = Py_TYPE(descr)->tp_descr_get; + if (f != NULL && PyDescr_IsData(descr)) { + PyObject *value = f(descr, obj, (PyObject *)Py_TYPE(obj)); + PyStackRef_CLEAR(*method); + if (value != NULL) { + *method = PyStackRef_FromPyObjectSteal(value); + } + return 0; + } + } + } + PyObject *dict, *attr; + if ((tp->tp_flags & Py_TPFLAGS_INLINE_VALUES) && + _PyObject_TryGetInstanceAttribute(obj, name, &attr)) { + if (attr != NULL) { + PyStackRef_CLEAR(*method); + *method = PyStackRef_FromPyObjectSteal(attr); + return 0; + } + dict = NULL; + } + else if ((tp->tp_flags & Py_TPFLAGS_MANAGED_DICT)) { + dict = (PyObject *)_PyObject_GetManagedDict(obj); + } + else { + PyObject **dictptr = _PyObject_ComputedDictPointer(obj); + if (dictptr != NULL) { + dict = FT_ATOMIC_LOAD_PTR_ACQUIRE(*dictptr); + } + else { + dict = NULL; + } + } + if (dict != NULL) { + // TODO: use _Py_dict_lookup_threadsafe_stackref + Py_INCREF(dict); + PyObject *value; + if (PyDict_GetItemRef(dict, name, &value) != 0) { + // found or error + Py_DECREF(dict); + PyStackRef_CLEAR(*method); + if (value != NULL) { + *method = 
PyStackRef_FromPyObjectSteal(value); + } + return 0; + } + // not found + Py_DECREF(dict); + } + + if (meth_found) { + assert(!PyStackRef_IsNull(*method)); + return 1; + } + + if (f != NULL) { + PyObject *value = f(descr, obj, (PyObject *)Py_TYPE(obj)); + PyStackRef_CLEAR(*method); + if (value) { + *method = PyStackRef_FromPyObjectSteal(value); + } + return 0; + } + + if (descr != NULL) { + assert(!PyStackRef_IsNull(*method)); + return 0; + } + + PyErr_Format(PyExc_AttributeError, + "'%.100s' object has no attribute '%U'", + tp->tp_name, name); + + _PyObject_SetAttributeErrorContext(obj, name); + assert(PyStackRef_IsNull(*method)); + return 0; +} + + /* Generic GetAttr functions - put these in your tp_[gs]etattro slot. */ PyObject * @@ -1906,34 +2040,11 @@ PyObject_GenericSetAttr(PyObject *obj, PyObject *name, PyObject *value) int PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context) { - PyObject **dictptr = _PyObject_GetDictPtr(obj); - if (dictptr == NULL) { - if (_PyType_HasFeature(Py_TYPE(obj), Py_TPFLAGS_INLINE_VALUES) && - _PyObject_GetManagedDict(obj) == NULL - ) { - /* Was unable to convert to dict */ - PyErr_NoMemory(); - } - else { - PyErr_SetString(PyExc_AttributeError, - "This object has no __dict__"); - } - return -1; - } if (value == NULL) { PyErr_SetString(PyExc_TypeError, "cannot delete __dict__"); return -1; } - if (!PyDict_Check(value)) { - PyErr_Format(PyExc_TypeError, - "__dict__ must be set to a dictionary, " - "not a '%.200s'", Py_TYPE(value)->tp_name); - return -1; - } - Py_BEGIN_CRITICAL_SECTION(obj); - Py_XSETREF(*dictptr, Py_NewRef(value)); - Py_END_CRITICAL_SECTION(); - return 0; + return _PyObject_SetDict(obj, value); } @@ -1996,9 +2107,25 @@ _dir_locals(void) PyObject *names; PyObject *locals; - locals = _PyEval_GetFrameLocals(); - if (locals == NULL) + if (_PyEval_GetFrame() != NULL) { + locals = _PyEval_GetFrameLocals(); + } + else { + PyThreadState *tstate = _PyThreadState_GET(); + locals = 
_PyEval_GetGlobalsFromRunningMain(tstate); + if (locals == NULL) { + if (!_PyErr_Occurred(tstate)) { + locals = _PyEval_GetFrameLocals(); + assert(_PyErr_Occurred(tstate)); + } + } + else { + Py_INCREF(locals); + } + } + if (locals == NULL) { return NULL; + } names = PyMapping_Keys(locals); Py_DECREF(locals); @@ -2930,57 +3057,28 @@ finally: /* Trashcan support. */ -#ifndef Py_GIL_DISABLED -/* We need to store a pointer in the refcount field of - * an object. It is important that we never store 0 (NULL). -* It is also important to not make the object appear immortal, -* or it might be untracked by the cycle GC. */ -static uintptr_t -pointer_to_safe_refcount(void *ptr) -{ - uintptr_t full = (uintptr_t)ptr; - assert((full & 3) == 0); -#if SIZEOF_VOID_P > 4 - uint32_t refcnt = (uint32_t)full; - if (refcnt >= (uint32_t)_Py_IMMORTAL_MINIMUM_REFCNT) { - full = full - ((uintptr_t)_Py_IMMORTAL_MINIMUM_REFCNT) + 1; - } - return full + 2; -#else - // Make the top two bits 0, so it appears mortal. - return (full >> 2) + 1; -#endif -} - -static void * -safe_refcount_to_pointer(uintptr_t refcnt) -{ -#if SIZEOF_VOID_P > 4 - if (refcnt & 1) { - refcnt += _Py_IMMORTAL_MINIMUM_REFCNT - 1; - } - return (void *)(refcnt - 2); -#else - return (void *)((refcnt -1) << 2); -#endif -} -#endif - /* Add op to the gcstate->trash_delete_later list. Called when the current - * call-stack depth gets large. op must be a currently untracked gc'ed - * object, with refcount 0. Py_DECREF must already have been called on it. + * call-stack depth gets large. op must be a gc'ed object, with refcount 0. + * Py_DECREF must already have been called on it. 
*/ void _PyTrash_thread_deposit_object(PyThreadState *tstate, PyObject *op) { _PyObject_ASSERT(op, Py_REFCNT(op) == 0); + PyTypeObject *tp = Py_TYPE(op); + assert(tp->tp_flags & Py_TPFLAGS_HAVE_GC); + int tracked = 0; + if (tp->tp_is_gc == NULL || tp->tp_is_gc(op)) { + tracked = _PyObject_GC_IS_TRACKED(op); + if (tracked) { + _PyObject_GC_UNTRACK(op); + } + } + uintptr_t tagged_ptr = ((uintptr_t)tstate->delete_later) | tracked; #ifdef Py_GIL_DISABLED - op->ob_tid = (uintptr_t)tstate->delete_later; + op->ob_tid = tagged_ptr; #else - /* Store the delete_later pointer in the refcnt field. */ - uintptr_t refcnt = pointer_to_safe_refcount(tstate->delete_later); - *((uintptr_t*)op) = refcnt; - assert(!_Py_IsImmortal(op)); + _Py_AS_GC(op)->_gc_next = tagged_ptr; #endif tstate->delete_later = op; } @@ -2995,17 +3093,17 @@ _PyTrash_thread_destroy_chain(PyThreadState *tstate) destructor dealloc = Py_TYPE(op)->tp_dealloc; #ifdef Py_GIL_DISABLED - tstate->delete_later = (PyObject*) op->ob_tid; + uintptr_t tagged_ptr = op->ob_tid; op->ob_tid = 0; _Py_atomic_store_ssize_relaxed(&op->ob_ref_shared, _Py_REF_MERGED); #else - /* Get the delete_later pointer from the refcnt field. - * See _PyTrash_thread_deposit_object(). */ - uintptr_t refcnt = *((uintptr_t*)op); - tstate->delete_later = safe_refcount_to_pointer(refcnt); - op->ob_refcnt = 0; + uintptr_t tagged_ptr = _Py_AS_GC(op)->_gc_next; + _Py_AS_GC(op)->_gc_next = 0; #endif - + tstate->delete_later = (PyObject *)(tagged_ptr & ~1); + if (tagged_ptr & 1) { + _PyObject_GC_TRACK(op); + } /* Call the deallocator directly. 
This used to try to * fool Py_DECREF into calling it indirectly, but * Py_DECREF was already called on this object, and in @@ -3079,10 +3177,11 @@ void _Py_Dealloc(PyObject *op) { PyTypeObject *type = Py_TYPE(op); + unsigned long gc_flag = type->tp_flags & Py_TPFLAGS_HAVE_GC; destructor dealloc = type->tp_dealloc; PyThreadState *tstate = _PyThreadState_GET(); intptr_t margin = _Py_RecursionLimit_GetMargin(tstate); - if (margin < 2) { + if (margin < 2 && gc_flag) { _PyTrash_thread_deposit_object(tstate, (PyObject *)op); return; } @@ -3128,7 +3227,7 @@ _Py_Dealloc(PyObject *op) Py_XDECREF(old_exc); Py_DECREF(type); #endif - if (tstate->delete_later && margin >= 4) { + if (tstate->delete_later && margin >= 4 && gc_flag) { _PyTrash_thread_destroy_chain(tstate); } } diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index b209808da90..deb7fd957e5 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -124,6 +124,33 @@ _PyMem_mi_page_is_safe_to_free(mi_page_t *page) } +#ifdef Py_GIL_DISABLED + +// If we are deferring collection of more than this amount of memory for +// mimalloc pages, advance the write sequence. Advancing allows these +// pages to be re-used in a different thread or for a different size class. +#define QSBR_PAGE_MEM_LIMIT 4096*20 + +// Return true if the global write sequence should be advanced for a mimalloc +// page that is deferred from collection. 
+static bool +should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page) +{ + size_t bsize = mi_page_block_size(page); + size_t page_size = page->capacity*bsize; + if (page_size > QSBR_PAGE_MEM_LIMIT) { + qsbr->deferred_page_memory = 0; + return true; + } + qsbr->deferred_page_memory += page_size; + if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) { + qsbr->deferred_page_memory = 0; + return true; + } + return false; +} +#endif + static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force) { @@ -139,7 +166,14 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force) _PyMem_mi_page_clear_qsbr(page); page->retire_expire = 0; - page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr); + + if (should_advance_qsbr_for_page(tstate->qsbr, page)) { + page->qsbr_goal = _Py_qsbr_advance(tstate->qsbr->shared); + } + else { + page->qsbr_goal = _Py_qsbr_shared_next(tstate->qsbr->shared); + } + llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node); return false; } @@ -1141,8 +1175,44 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state) } } + +#ifdef Py_GIL_DISABLED + +// For deferred advance on free: the number of deferred items before advancing +// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally +// want to process a chunk before it overflows. +#define QSBR_DEFERRED_LIMIT 127 + +// If the deferred memory exceeds 1 MiB, advance the write sequence. This +// helps limit memory usage due to QSBR delaying frees too long. +#define QSBR_FREE_MEM_LIMIT 1024*1024 + +// Return true if the global write sequence should be advanced for a deferred +// memory free. 
+static bool +should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size) +{ + if (size > QSBR_FREE_MEM_LIMIT) { + qsbr->deferred_count = 0; + qsbr->deferred_memory = 0; + qsbr->should_process = true; + return true; + } + qsbr->deferred_count++; + qsbr->deferred_memory += size; + if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT || + qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) { + qsbr->deferred_count = 0; + qsbr->deferred_memory = 0; + qsbr->should_process = true; + return true; + } + return false; +} +#endif + static void -free_delayed(uintptr_t ptr) +free_delayed(uintptr_t ptr, size_t size) { #ifndef Py_GIL_DISABLED free_work_item(ptr, NULL, NULL); @@ -1200,23 +1270,32 @@ free_delayed(uintptr_t ptr) } assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK); - uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr); + uint64_t seq; + if (should_advance_qsbr_for_free(tstate->qsbr, size)) { + seq = _Py_qsbr_advance(tstate->qsbr->shared); + } + else { + seq = _Py_qsbr_shared_next(tstate->qsbr->shared); + } buf->array[buf->wr_idx].ptr = ptr; buf->array[buf->wr_idx].qsbr_goal = seq; buf->wr_idx++; if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) { + // Normally the processing of delayed items is done from the eval + // breaker. Processing here is a safety measure to ensure too much + // work does not accumulate. _PyMem_ProcessDelayed((PyThreadState *)tstate); } #endif } void -_PyMem_FreeDelayed(void *ptr) +_PyMem_FreeDelayed(void *ptr, size_t size) { assert(!((uintptr_t)ptr & 0x01)); if (ptr != NULL) { - free_delayed((uintptr_t)ptr); + free_delayed((uintptr_t)ptr, size); } } @@ -1226,7 +1305,25 @@ _PyObject_XDecRefDelayed(PyObject *ptr) { assert(!((uintptr_t)ptr & 0x01)); if (ptr != NULL) { - free_delayed(((uintptr_t)ptr)|0x01); + // We use 0 as the size since we don't have an easy way to know the + // actual size. If we are freeing many objects, the write sequence + // will be advanced due to QSBR_DEFERRED_LIMIT. 
+ free_delayed(((uintptr_t)ptr)|0x01, 0); + } +} +#endif + +#ifdef Py_GIL_DISABLED +void +_PyObject_XSetRefDelayed(PyObject **ptr, PyObject *value) +{ + PyObject *old = *ptr; + FT_ATOMIC_STORE_PTR_RELEASE(*ptr, value); + if (old == NULL) { + return; + } + if (!_Py_IsImmortal(old)) { + _PyObject_XDecRefDelayed(old); } } #endif @@ -1238,7 +1335,7 @@ work_queue_first(struct llist_node *head) } static void -process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr, +process_queue(struct llist_node *head, _PyThreadStateImpl *tstate, bool keep_empty, delayed_dealloc_cb cb, void *state) { while (!llist_empty(head)) { @@ -1246,7 +1343,7 @@ process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr, if (buf->rd_idx < buf->wr_idx) { struct _mem_work_item *item = &buf->array[buf->rd_idx]; - if (!_Py_qsbr_poll(qsbr, item->qsbr_goal)) { + if (!_Py_qsbr_poll(tstate->qsbr, item->qsbr_goal)) { return; } @@ -1270,11 +1367,11 @@ process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr, static void process_interp_queue(struct _Py_mem_interp_free_queue *queue, - struct _qsbr_thread_state *qsbr, delayed_dealloc_cb cb, + _PyThreadStateImpl *tstate, delayed_dealloc_cb cb, void *state) { assert(PyMutex_IsLocked(&queue->mutex)); - process_queue(&queue->head, qsbr, false, cb, state); + process_queue(&queue->head, tstate, false, cb, state); int more_work = !llist_empty(&queue->head); _Py_atomic_store_int_relaxed(&queue->has_work, more_work); @@ -1282,7 +1379,7 @@ process_interp_queue(struct _Py_mem_interp_free_queue *queue, static void maybe_process_interp_queue(struct _Py_mem_interp_free_queue *queue, - struct _qsbr_thread_state *qsbr, delayed_dealloc_cb cb, + _PyThreadStateImpl *tstate, delayed_dealloc_cb cb, void *state) { if (!_Py_atomic_load_int_relaxed(&queue->has_work)) { @@ -1291,7 +1388,7 @@ maybe_process_interp_queue(struct _Py_mem_interp_free_queue *queue, // Try to acquire the lock, but don't block if it's already held. 
if (_PyMutex_LockTimed(&queue->mutex, 0, 0) == PY_LOCK_ACQUIRED) { - process_interp_queue(queue, qsbr, cb, state); + process_interp_queue(queue, tstate, cb, state); PyMutex_Unlock(&queue->mutex); } } @@ -1302,11 +1399,13 @@ _PyMem_ProcessDelayed(PyThreadState *tstate) PyInterpreterState *interp = tstate->interp; _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; + tstate_impl->qsbr->should_process = false; + // Process thread-local work - process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true, NULL, NULL); + process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL); // Process shared interpreter work - maybe_process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr, NULL, NULL); + maybe_process_interp_queue(&interp->mem_free_queue, tstate_impl, NULL, NULL); } void @@ -1316,10 +1415,10 @@ _PyMem_ProcessDelayedNoDealloc(PyThreadState *tstate, delayed_dealloc_cb cb, voi _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; // Process thread-local work - process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true, cb, state); + process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, cb, state); // Process shared interpreter work - maybe_process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr, cb, state); + maybe_process_interp_queue(&interp->mem_free_queue, tstate_impl, cb, state); } void @@ -1348,7 +1447,7 @@ _PyMem_AbandonDelayed(PyThreadState *tstate) // Process the merged queue now (see gh-130794). 
_PyThreadStateImpl *this_tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); - process_interp_queue(&interp->mem_free_queue, this_tstate->qsbr, NULL, NULL); + process_interp_queue(&interp->mem_free_queue, this_tstate, NULL, NULL); PyMutex_Unlock(&interp->mem_free_queue.mutex); diff --git a/Objects/odictobject.c b/Objects/odictobject.c index 891f6197401..02fcbbaa0d4 100644 --- a/Objects/odictobject.c +++ b/Objects/odictobject.c @@ -473,6 +473,7 @@ later: #include "pycore_pyerrors.h" // _PyErr_ChainExceptions1() #include "pycore_tuple.h" // _PyTuple_Recycle() #include <stddef.h> // offsetof() +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "clinic/odictobject.c.h" @@ -1391,8 +1392,7 @@ odict_dealloc(PyObject *op) PyObject_GC_UnTrack(self); Py_XDECREF(self->od_inst_dict); - if (self->od_weakreflist != NULL) - PyObject_ClearWeakRefs((PyObject *)self); + FT_CLEAR_WEAKREFS(op, self->od_weakreflist); _odict_clear_nodes(self); PyDict_Type.tp_dealloc((PyObject *)self); diff --git a/Objects/picklebufobject.c b/Objects/picklebufobject.c index 3ce800de04c..50f17687bc4 100644 --- a/Objects/picklebufobject.c +++ b/Objects/picklebufobject.c @@ -1,6 +1,7 @@ /* PickleBuffer object implementation */ #include "Python.h" +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include <stddef.h> typedef struct { @@ -111,8 +112,7 @@ picklebuf_dealloc(PyObject *op) { PyPickleBufferObject *self = (PyPickleBufferObject*)op; PyObject_GC_UnTrack(self); - if (self->weakreflist != NULL) - PyObject_ClearWeakRefs((PyObject *) self); + FT_CLEAR_WEAKREFS(op, self->weakreflist); PyBuffer_Release(&self->view); Py_TYPE(self)->tp_free((PyObject *) self); } diff --git a/Objects/setobject.c b/Objects/setobject.c index 8aa6b0d1809..6e4fc5957ca 100644 --- a/Objects/setobject.c +++ b/Objects/setobject.c @@ -40,6 +40,7 @@ #include "pycore_pyatomic_ft_wrappers.h" // FT_ATOMIC_LOAD_SSIZE_RELAXED() #include "pycore_pyerrors.h" // _PyErr_SetKeyError() #include "pycore_setobject.h" // 
_PySet_NextEntry() definition +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "stringlib/eq.h" // unicode_eq() #include <stddef.h> // offsetof() @@ -536,8 +537,7 @@ set_dealloc(PyObject *self) /* bpo-31095: UnTrack is needed before calling any callbacks */ PyObject_GC_UnTrack(so); - if (so->weakreflist != NULL) - PyObject_ClearWeakRefs((PyObject *) so); + FT_CLEAR_WEAKREFS(self, so->weakreflist); for (entry = so->table; used > 0; entry++) { if (entry->key && entry->key != dummy) { diff --git a/Objects/templateobject.c b/Objects/templateobject.c index 7d356980b56..4293a311c44 100644 --- a/Objects/templateobject.c +++ b/Objects/templateobject.c @@ -23,6 +23,9 @@ templateiter_next(PyObject *op) if (self->from_strings) { item = PyIter_Next(self->stringsiter); self->from_strings = 0; + if (item == NULL) { + return NULL; + } if (PyUnicode_GET_LENGTH(item) == 0) { Py_SETREF(item, PyIter_Next(self->interpolationsiter)); self->from_strings = 1; @@ -444,6 +447,8 @@ template_reduce(PyObject *op, PyObject *Py_UNUSED(dummy)) static PyMethodDef template_methods[] = { {"__reduce__", template_reduce, METH_NOARGS, NULL}, + {"__class_getitem__", Py_GenericAlias, + METH_O|METH_CLASS, PyDoc_STR("See PEP 585")}, {NULL, NULL}, }; diff --git a/Objects/typeobject.c b/Objects/typeobject.c index a7ab69fef4c..6e7471cb594 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -48,7 +48,7 @@ class object "PyObject *" "&PyBaseObject_Type" & ((1 << MCACHE_SIZE_EXP) - 1)) #define MCACHE_HASH_METHOD(type, name) \ - MCACHE_HASH(FT_ATOMIC_LOAD_UINT32_RELAXED((type)->tp_version_tag), \ + MCACHE_HASH(FT_ATOMIC_LOAD_UINT_RELAXED((type)->tp_version_tag), \ ((Py_ssize_t)(name)) >> 3) #define MCACHE_CACHEABLE_NAME(name) \ PyUnicode_CheckExact(name) && \ @@ -60,11 +60,19 @@ class object "PyObject *" "&PyBaseObject_Type" #ifdef Py_GIL_DISABLED -// There's a global lock for mutation of types. 
This avoids having to take -// additional locks while doing various subclass processing which may result -// in odd behaviors w.r.t. running with the GIL as the outer type lock could -// be released and reacquired during a subclass update if there's contention -// on the subclass lock. +// There's a global lock for types that ensures that tp_version_tag and +// _spec_cache are correctly updated if the type is modified. It also protects +// tp_mro, tp_bases, and tp_base. This avoids having to take additional locks +// while doing various subclass processing which may result in odd behaviors +// w.r.t. running with the GIL as the outer type lock could be released and +// reacquired during a subclass update if there's contention on the subclass +// lock. +// +// Note that this lock does not protect updates of other type slots or the +// tp_flags member. Instead, we either ensure those updates are done before +// the type has been revealed to other threads or we only do those updates +// while the stop-the-world mechanism is active. The slots and flags are read +// in many places without holding a lock and without atomics. #define TYPE_LOCK &PyInterpreterState_Get()->types.mutex #define BEGIN_TYPE_LOCK() Py_BEGIN_CRITICAL_SECTION_MUT(TYPE_LOCK) #define END_TYPE_LOCK() Py_END_CRITICAL_SECTION() @@ -74,8 +82,100 @@ class object "PyObject *" "&PyBaseObject_Type" #define END_TYPE_DICT_LOCK() Py_END_CRITICAL_SECTION2() +#ifdef Py_DEBUG +// Return true if the world is currently stopped. +static bool +types_world_is_stopped(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + return interp->stoptheworld.world_stopped; +} +#endif + +// Checks that the type has not yet been revealed (exposed) to other +// threads. The _Py_TYPE_REVEALED_FLAG flag is set by type_new() and +// PyType_FromMetaclass() to indicate that a newly initialized type might be +// revealed. We only have ob_flags on 64-bit platforms. 
+#if SIZEOF_VOID_P > 4 +#define TYPE_IS_REVEALED(tp) ((((PyObject *)(tp))->ob_flags & _Py_TYPE_REVEALED_FLAG) != 0) +#else +#define TYPE_IS_REVEALED(tp) 0 +#endif + +#ifdef Py_DEBUG #define ASSERT_TYPE_LOCK_HELD() \ - _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(TYPE_LOCK) + if (!types_world_is_stopped()) { _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(TYPE_LOCK); } + +// Checks if we can safely update type slots or tp_flags. +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) \ + assert(!TYPE_IS_REVEALED(tp) || types_world_is_stopped()) + +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) \ + if (TYPE_IS_REVEALED(tp)) { ASSERT_TYPE_LOCK_HELD(); } +#else +#define ASSERT_TYPE_LOCK_HELD() +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) +#endif + +static void +types_stop_world(void) +{ + assert(!types_world_is_stopped()); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyEval_StopTheWorld(interp); + assert(types_world_is_stopped()); +} + +static void +types_start_world(void) +{ + assert(types_world_is_stopped()); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyEval_StartTheWorld(interp); + assert(!types_world_is_stopped()); +} + +// This is used to temporarily prevent the TYPE_LOCK from being suspended +// when held by the topmost critical section. 
+static void +type_lock_prevent_release(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t *tagptr = &tstate->critical_section; + PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); + if (!(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES)) { + assert(c->_cs_mutex == TYPE_LOCK); + c->_cs_mutex = NULL; + } + else { + PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; + if (c->_cs_mutex == TYPE_LOCK) { + c->_cs_mutex = c2->_cs_mutex2; + c2->_cs_mutex2 = NULL; + } else { + assert(c2->_cs_mutex2 == TYPE_LOCK); + c2->_cs_mutex2 = NULL; + } + } +} + +static void +type_lock_allow_release(void) +{ + PyThreadState *tstate = _PyThreadState_GET(); + uintptr_t *tagptr = &tstate->critical_section; + PyCriticalSection *c = (PyCriticalSection *)(*tagptr & ~_Py_CRITICAL_SECTION_MASK); + if (!(*tagptr & _Py_CRITICAL_SECTION_TWO_MUTEXES)) { + assert(c->_cs_mutex == NULL); + c->_cs_mutex = TYPE_LOCK; + } + else { + PyCriticalSection2 *c2 = (PyCriticalSection2 *)c; + assert(c2->_cs_mutex2 == NULL); + c2->_cs_mutex2 = TYPE_LOCK; + } +} #else @@ -84,6 +184,12 @@ class object "PyObject *" "&PyBaseObject_Type" #define BEGIN_TYPE_DICT_LOCK(d) #define END_TYPE_DICT_LOCK() #define ASSERT_TYPE_LOCK_HELD() +#define TYPE_IS_REVEALED(tp) 0 +#define ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp) +#define ASSERT_NEW_TYPE_OR_LOCKED(tp) +#define types_world_is_stopped() 1 +#define types_stop_world() +#define types_start_world() #endif @@ -106,6 +212,9 @@ slot_tp_new(PyTypeObject *type, PyObject *args, PyObject *kwds); static int slot_tp_setattro(PyObject *self, PyObject *name, PyObject *value); +static PyObject * +slot_tp_call(PyObject *self, PyObject *args, PyObject *kwds); + static inline PyTypeObject * type_from_ref(PyObject *ref) { @@ -346,21 +455,14 @@ _PyStaticType_GetBuiltins(void) static void type_set_flags(PyTypeObject *tp, unsigned long flags) { - if (tp->tp_flags & Py_TPFLAGS_READY) { - // It's possible the type object has been exposed to other threads - 
// if it's been marked ready. In that case, the type lock should be - // held when flags are modified. - ASSERT_TYPE_LOCK_HELD(); - } - // Since PyType_HasFeature() reads the flags without holding the type - // lock, we need an atomic store here. - FT_ATOMIC_STORE_ULONG_RELAXED(tp->tp_flags, flags); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp); + tp->tp_flags = flags; } static void type_set_flags_with_mask(PyTypeObject *tp, unsigned long mask, unsigned long flags) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_WORLD_STOPPED_OR_NEW_TYPE(tp); unsigned long new_flags = (tp->tp_flags & ~mask) | flags; type_set_flags(tp, new_flags); } @@ -498,6 +600,7 @@ static inline void set_tp_bases(PyTypeObject *self, PyObject *bases, int initial) { assert(PyTuple_Check(bases)); + ASSERT_NEW_TYPE_OR_LOCKED(self); if (self->tp_flags & _Py_TPFLAGS_STATIC_BUILTIN) { // XXX tp_bases can probably be statically allocated for each // static builtin type. @@ -542,7 +645,7 @@ clear_tp_bases(PyTypeObject *self, int final) static inline PyObject * lookup_tp_mro(PyTypeObject *self) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(self); return self->tp_mro; } @@ -1027,7 +1130,6 @@ PyType_Unwatch(int watcher_id, PyObject* obj) static void set_version_unlocked(PyTypeObject *tp, unsigned int version) { - ASSERT_TYPE_LOCK_HELD(); assert(version == 0 || (tp->tp_versions_used != _Py_ATTR_CACHE_UNUSED)); #ifndef Py_GIL_DISABLED PyInterpreterState *interp = _PyInterpreterState_GET(); @@ -1075,7 +1177,12 @@ type_modified_unlocked(PyTypeObject *type) We don't assign new version tags eagerly, but only as needed. */ - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(type); +#ifdef Py_GIL_DISABLED + // This function is re-entrant and it's not safe to call it + // with the world stopped. 
+ assert(!types_world_is_stopped()); +#endif if (type->tp_version_tag == 0) { return; } @@ -1106,6 +1213,8 @@ type_modified_unlocked(PyTypeObject *type) while (bits) { assert(i < TYPE_MAX_WATCHERS); if (bits & 1) { + // Note that PyErr_FormatUnraisable is potentially re-entrant + // and the watcher callback might be too. PyType_WatchCallback cb = interp->type_watchers[i]; if (cb && (cb(type) < 0)) { PyErr_FormatUnraisable( @@ -1245,14 +1354,6 @@ _PyType_LookupByVersion(unsigned int version) #endif } -unsigned int -_PyType_GetVersionForCurrentState(PyTypeObject *tp) -{ - return tp->tp_version_tag; -} - - - #define MAX_VERSIONS_PER_CLASS 1000 #if _Py_ATTR_CACHE_UNUSED < MAX_VERSIONS_PER_CLASS #error "_Py_ATTR_CACHE_UNUSED must be bigger than max" @@ -1586,10 +1687,13 @@ type_set_abstractmethods(PyObject *tp, PyObject *value, void *Py_UNUSED(closure) BEGIN_TYPE_LOCK(); type_modified_unlocked(type); + types_stop_world(); if (abstract) type_add_flags(type, Py_TPFLAGS_IS_ABSTRACT); else type_clear_flags(type, Py_TPFLAGS_IS_ABSTRACT); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); END_TYPE_LOCK(); return 0; @@ -1624,15 +1728,15 @@ type_get_mro(PyObject *tp, void *Py_UNUSED(closure)) return mro; } -static PyTypeObject *best_base(PyObject *); -static int mro_internal(PyTypeObject *, PyObject **); +static PyTypeObject *find_best_base(PyObject *); +static int mro_internal(PyTypeObject *, int, PyObject **); static int type_is_subtype_base_chain(PyTypeObject *, PyTypeObject *); static int compatible_for_assignment(PyTypeObject *, PyTypeObject *, const char *); static int add_subclass(PyTypeObject*, PyTypeObject*); static int add_all_subclasses(PyTypeObject *type, PyObject *bases); static void remove_subclass(PyTypeObject *, PyTypeObject *); static void remove_all_subclasses(PyTypeObject *type, PyObject *bases); -static void update_all_slots(PyTypeObject *); +static int update_all_slots(PyTypeObject *); typedef int (*update_callback)(PyTypeObject *, void *); static int 
update_subclasses(PyTypeObject *type, PyObject *attr_name, @@ -1640,13 +1744,15 @@ static int update_subclasses(PyTypeObject *type, PyObject *attr_name, static int recurse_down_subclasses(PyTypeObject *type, PyObject *name, update_callback callback, void *data); +// Compute tp_mro for this type and all of its subclasses. This +// is called after __bases__ is assigned to an existing type. static int mro_hierarchy(PyTypeObject *type, PyObject *temp) { ASSERT_TYPE_LOCK_HELD(); PyObject *old_mro; - int res = mro_internal(type, &old_mro); + int res = mro_internal(type, 0, &old_mro); if (res <= 0) { /* error / reentrance */ return res; @@ -1708,9 +1814,9 @@ mro_hierarchy(PyTypeObject *type, PyObject *temp) } static int -type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) +type_check_new_bases(PyTypeObject *type, PyObject *new_bases, PyTypeObject **best_base) { - // Check arguments + // Check arguments, this is re-entrant due to the PySys_Audit() call if (!check_set_special_type_attr(type, new_bases, "__bases__")) { return -1; } @@ -1759,20 +1865,29 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) } // Compute the new MRO and the new base class - PyTypeObject *new_base = best_base(new_bases); - if (new_base == NULL) + *best_base = find_best_base(new_bases); + if (*best_base == NULL) return -1; - if (!compatible_for_assignment(type->tp_base, new_base, "__bases__")) { + if (!compatible_for_assignment(type->tp_base, *best_base, "__bases__")) { return -1; } + return 0; +} + +static int +type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases, PyTypeObject *best_base) +{ + ASSERT_TYPE_LOCK_HELD(); + + Py_ssize_t n; PyObject *old_bases = lookup_tp_bases(type); assert(old_bases != NULL); PyTypeObject *old_base = type->tp_base; set_tp_bases(type, Py_NewRef(new_bases), 0); - type->tp_base = (PyTypeObject *)Py_NewRef(new_base); + type->tp_base = (PyTypeObject *)Py_NewRef(best_base); PyObject *temp = PyList_New(0); if (temp == NULL) { @@ 
-1796,7 +1911,11 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) add to all new_bases */ remove_all_subclasses(type, old_bases); res = add_all_subclasses(type, new_bases); - update_all_slots(type); + if (update_all_slots(type) < 0) { + goto bail; + } + /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ + type_modified_unlocked(type); } else { res = 0; @@ -1827,13 +1946,13 @@ type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases) bail: if (lookup_tp_bases(type) == new_bases) { - assert(type->tp_base == new_base); + assert(type->tp_base == best_base); set_tp_bases(type, old_bases, 0); type->tp_base = old_base; Py_DECREF(new_bases); - Py_DECREF(new_base); + Py_DECREF(best_base); } else { Py_DECREF(old_bases); @@ -1848,9 +1967,13 @@ static int type_set_bases(PyObject *tp, PyObject *new_bases, void *Py_UNUSED(closure)) { PyTypeObject *type = PyTypeObject_CAST(tp); + PyTypeObject *best_base; int res; BEGIN_TYPE_LOCK(); - res = type_set_bases_unlocked(type, new_bases); + res = type_check_new_bases(type, new_bases, &best_base); + if (res == 0) { + res = type_set_bases_unlocked(type, new_bases, best_base); + } END_TYPE_LOCK(); return res; } @@ -2065,19 +2188,46 @@ type_set_annotations(PyObject *tp, PyObject *value, void *Py_UNUSED(closure)) return -1; } - int result; PyObject *dict = PyType_GetDict(type); - if (value != NULL) { - /* set */ - result = PyDict_SetItem(dict, &_Py_ID(__annotations_cache__), value); - } else { - /* delete */ - result = PyDict_Pop(dict, &_Py_ID(__annotations_cache__), NULL); - if (result == 0) { - PyErr_SetString(PyExc_AttributeError, "__annotations__"); + int result = PyDict_ContainsString(dict, "__annotations__"); + if (result < 0) { + Py_DECREF(dict); + return -1; + } + if (result) { + // If __annotations__ is currently in the dict, we update it, + if (value != NULL) { + result = PyDict_SetItem(dict, &_Py_ID(__annotations__), value); + } else { + result = PyDict_Pop(dict, &_Py_ID(__annotations__), 
NULL); + if (result == 0) { + // Somebody else just deleted it? + PyErr_SetString(PyExc_AttributeError, "__annotations__"); + Py_DECREF(dict); + return -1; + } + } + if (result < 0) { Py_DECREF(dict); return -1; } + // Also clear __annotations_cache__ just in case. + result = PyDict_Pop(dict, &_Py_ID(__annotations_cache__), NULL); + } + else { + // Else we update only __annotations_cache__. + if (value != NULL) { + /* set */ + result = PyDict_SetItem(dict, &_Py_ID(__annotations_cache__), value); + } else { + /* delete */ + result = PyDict_Pop(dict, &_Py_ID(__annotations_cache__), NULL); + if (result == 0) { + PyErr_SetString(PyExc_AttributeError, "__annotations__"); + Py_DECREF(dict); + return -1; + } + } } if (result < 0) { Py_DECREF(dict); @@ -3051,6 +3201,7 @@ static PyObject * class_name(PyObject *cls) { PyObject *name; + // Note that this is potentially re-entrant. if (PyObject_GetOptionalAttr(cls, &_Py_ID(__name__), &name) == 0) { name = PyObject_Repr(cls); } @@ -3387,9 +3538,13 @@ mro_invoke(PyTypeObject *type) const int custom = !Py_IS_TYPE(type, &PyType_Type); if (custom) { + // Custom mro() method on metaclass. This is potentially re-entrant. + // We are called either from type_ready() or from type_set_bases(). mro_result = call_method_noarg((PyObject *)type, &_Py_ID(mro)); } else { + // In this case, the mro() method on the type object is being used and + // we know that these calls are not re-entrant. mro_result = mro_implementation_unlocked(type); } if (mro_result == NULL) @@ -3437,7 +3592,7 @@ mro_invoke(PyTypeObject *type) - Returns -1 in case of an error. 
*/ static int -mro_internal_unlocked(PyTypeObject *type, int initial, PyObject **p_old_mro) +mro_internal(PyTypeObject *type, int initial, PyObject **p_old_mro) { ASSERT_TYPE_LOCK_HELD(); @@ -3485,21 +3640,11 @@ mro_internal_unlocked(PyTypeObject *type, int initial, PyObject **p_old_mro) return 1; } -static int -mro_internal(PyTypeObject *type, PyObject **p_old_mro) -{ - int res; - BEGIN_TYPE_LOCK(); - res = mro_internal_unlocked(type, 0, p_old_mro); - END_TYPE_LOCK(); - return res; -} - /* Calculate the best base amongst multiple base classes. This is the first one that's on the path to the "solid base". */ static PyTypeObject * -best_base(PyObject *bases) +find_best_base(PyObject *bases) { Py_ssize_t i, n; PyTypeObject *base, *winner, *candidate; @@ -3581,13 +3726,167 @@ solid_base(PyTypeObject *type) } } +#ifdef Py_GIL_DISABLED + +// The structures and functions below are used in the free-threaded build +// to safely make updates to type slots, on type_setattro() for a slot +// or when __bases__ is re-assigned. Since the slots are read without atomic +// operations and without locking, we can only safely update them while the +// world is stopped. However, with the world stopped, we are very limited on +// which APIs can be safely used. For example, calling _PyObject_HashFast() +// or _PyDict_GetItemRef_KnownHash() are not safe and can potentially cause +// deadlocks. Hashing can be re-entrant and _PyDict_GetItemRef_KnownHash can +// acquire a lock if the dictionary is not owned by the current thread, to +// mark it shared on reading. +// +// We do the slot updates in two steps. First, with TYPE_LOCK held, we lookup +// the descriptor for each slot, for each subclass. We build a queue of +// updates to perform but don't actually update the type structures. After we +// are finished the lookups, we stop-the-world and apply all of the updates. +// The apply_slot_updates() code is simple and easy to confirm that it is +// safe. 
+ +typedef struct { + PyTypeObject *type; + void **slot_ptr; + void *slot_value; +} slot_update_item_t; + +// The number of slot updates performed is based on the number of changed +// slots and the number of subclasses. It's possible there are many updates +// required if there are many subclasses (potentially an unbounded amount). +// Usually the number of slot updates is small, most often zero or one. When +// running the unit tests, we don't exceed 20. The chunk size is set to +// handle the common case with a single chunk and to not require too many +// chunk allocations if there are many subclasses. +#define SLOT_UPDATE_CHUNK_SIZE 30 + +typedef struct _slot_update { + struct _slot_update *prev; + Py_ssize_t n; + slot_update_item_t updates[SLOT_UPDATE_CHUNK_SIZE]; +} slot_update_chunk_t; + +// a queue of updates to be performed +typedef struct { + slot_update_chunk_t *head; +} slot_update_t; + +static slot_update_chunk_t * +slot_update_new_chunk(void) +{ + slot_update_chunk_t *chunk = PyMem_Malloc(sizeof(slot_update_chunk_t)); + if (chunk == NULL) { + PyErr_NoMemory(); + return NULL; + } + chunk->prev = NULL; + chunk->n = 0; + return chunk; +} + +static void +slot_update_free_chunks(slot_update_t *updates) +{ + slot_update_chunk_t *chunk = updates->head; + while (chunk != NULL) { + slot_update_chunk_t *prev = chunk->prev; + PyMem_Free(chunk); + chunk = prev; + } +} + +static int +queue_slot_update(slot_update_t *updates, PyTypeObject *type, + void **slot_ptr, void *slot_value) +{ + if (*slot_ptr == slot_value) { + return 0; // slot pointer not actually changed, don't queue update + } + if (updates->head == NULL || updates->head->n == SLOT_UPDATE_CHUNK_SIZE) { + slot_update_chunk_t *chunk = slot_update_new_chunk(); + if (chunk == NULL) { + return -1; // out-of-memory + } + chunk->prev = updates->head; + updates->head = chunk; + } + slot_update_item_t *item = &updates->head->updates[updates->head->n]; + item->type = type; + item->slot_ptr = slot_ptr; + 
item->slot_value = slot_value; + updates->head->n++; + assert(updates->head->n <= SLOT_UPDATE_CHUNK_SIZE); + return 0; +} + +static void +apply_slot_updates(slot_update_t *updates) +{ + assert(types_world_is_stopped()); + slot_update_chunk_t *chunk = updates->head; + while (chunk != NULL) { + for (Py_ssize_t i = 0; i < chunk->n; i++) { + slot_update_item_t *item = &chunk->updates[i]; + *(item->slot_ptr) = item->slot_value; + if (item->slot_value == slot_tp_call) { + /* A generic __call__ is incompatible with vectorcall */ + type_clear_flags(item->type, Py_TPFLAGS_HAVE_VECTORCALL); + } + } + chunk = chunk->prev; + } +} + +static void +apply_type_slot_updates(slot_update_t *updates) +{ + // This must be done carefully to avoid data races and deadlocks. We + // have just updated the type __dict__, while holding TYPE_LOCK. We have + // collected all of the required type slot updates into the 'updates' + // queue. Note that those updates can apply to multiple types since + // subclasses might also be affected by the dict change. + // + // We need to prevent other threads from writing to the dict before we can + // finish updating the slots. The actual stores to the slots are done + // with the world stopped. If we block on the stop-the-world mutex then + // we could release TYPE_LOCK mutex and potentially allow other threads + // to update the dict. That's because TYPE_LOCK was acquired using a + // critical section. + // + // The type_lock_prevent_release() call prevents the TYPE_LOCK mutex from + // being released even if we block on the STM mutex. We need to take care + // that we do not deadlock because of that. It is safe because we always + // acquire locks in the same order: first the TYPE_LOCK mutex and then the + // STM mutex. 
+ type_lock_prevent_release(); + types_stop_world(); + apply_slot_updates(updates); + types_start_world(); + type_lock_allow_release(); +} + +#else + +// dummy definition, this parameter is only NULL in the default build +typedef void slot_update_t; + +#endif + +/// data passed to update_slots_callback() +typedef struct { + slot_update_t *queued_updates; + pytype_slotdef **defs; +} update_callback_data_t; + static void object_dealloc(PyObject *); static PyObject *object_new(PyTypeObject *, PyObject *, PyObject *); static int object_init(PyObject *, PyObject *, PyObject *); -static int update_slot(PyTypeObject *, PyObject *); +static int update_slot(PyTypeObject *, PyObject *, slot_update_t *update); static void fixup_slot_dispatchers(PyTypeObject *); static int type_new_set_names(PyTypeObject *); static int type_new_init_subclass(PyTypeObject *, PyObject *); +static bool has_slotdef(PyObject *); /* * Helpers for __dict__ descriptor. We don't want to expose the dicts @@ -3649,10 +3948,35 @@ subtype_dict(PyObject *obj, void *context) return PyObject_GenericGetDict(obj, context); } +int +_PyObject_SetDict(PyObject *obj, PyObject *value) +{ + if (value != NULL && !PyDict_Check(value)) { + PyErr_Format(PyExc_TypeError, + "__dict__ must be set to a dictionary, " + "not a '%.200s'", Py_TYPE(value)->tp_name); + return -1; + } + if (Py_TYPE(obj)->tp_flags & Py_TPFLAGS_MANAGED_DICT) { + return _PyObject_SetManagedDict(obj, value); + } + PyObject **dictptr = _PyObject_ComputedDictPointer(obj); + if (dictptr == NULL) { + PyErr_SetString(PyExc_AttributeError, + "This object has no __dict__"); + return -1; + } + Py_BEGIN_CRITICAL_SECTION(obj); + // gh-133980: To prevent use-after-free from other threads that reference + // the __dict__ + _PyObject_XSetRefDelayed(dictptr, Py_NewRef(value)); + Py_END_CRITICAL_SECTION(); + return 0; +} + static int subtype_setdict(PyObject *obj, PyObject *value, void *context) { - PyObject **dictptr; PyTypeObject *base; base = 
get_builtin_base_with_dict(Py_TYPE(obj)); @@ -3670,28 +3994,7 @@ subtype_setdict(PyObject *obj, PyObject *value, void *context) } return func(descr, obj, value); } - /* Almost like PyObject_GenericSetDict, but allow __dict__ to be deleted. */ - if (value != NULL && !PyDict_Check(value)) { - PyErr_Format(PyExc_TypeError, - "__dict__ must be set to a dictionary, " - "not a '%.200s'", Py_TYPE(value)->tp_name); - return -1; - } - - if (Py_TYPE(obj)->tp_flags & Py_TPFLAGS_MANAGED_DICT) { - return _PyObject_SetManagedDict(obj, value); - } - else { - dictptr = _PyObject_ComputedDictPointer(obj); - if (dictptr == NULL) { - PyErr_SetString(PyExc_AttributeError, - "This object has no __dict__"); - return -1; - } - Py_CLEAR(*dictptr); - *dictptr = Py_XNewRef(value); - } - return 0; + return _PyObject_SetDict(obj, value); } static PyObject * @@ -3785,7 +4088,7 @@ type_init(PyObject *cls, PyObject *args, PyObject *kwds) unsigned long PyType_GetFlags(PyTypeObject *type) { - return FT_ATOMIC_LOAD_ULONG_RELAXED(type->tp_flags); + return type->tp_flags; } @@ -4563,6 +4866,10 @@ type_new_impl(type_new_ctx *ctx) } assert(_PyType_CheckConsistency(type)); +#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG) && SIZEOF_VOID_P > 4 + // After this point, other threads can potentally use this type. 
+ ((PyObject*)type)->ob_flags |= _Py_TYPE_REVEALED_FLAG; +#endif return (PyObject *)type; @@ -4625,7 +4932,7 @@ type_new_get_bases(type_new_ctx *ctx, PyObject **type) } /* Calculate best base, and check that all bases are type objects */ - PyTypeObject *base = best_base(ctx->bases); + PyTypeObject *base = find_best_base(ctx->bases); if (base == NULL) { return -1; } @@ -5040,12 +5347,12 @@ PyType_FromMetaclass( } /* Calculate best base, and check that all bases are type objects */ - PyTypeObject *base = best_base(bases); // borrowed ref + PyTypeObject *base = find_best_base(bases); // borrowed ref if (base == NULL) { goto finally; } - // best_base should check Py_TPFLAGS_BASETYPE & raise a proper exception, - // here we just check its work + // find_best_base() should check Py_TPFLAGS_BASETYPE & raise a proper + // exception, here we just check its work assert(_PyType_HasFeature(base, Py_TPFLAGS_BASETYPE)); /* Calculate sizes */ @@ -5276,6 +5583,10 @@ PyType_FromMetaclass( } assert(_PyType_CheckConsistency(type)); +#if defined(Py_GIL_DISABLED) && defined(Py_DEBUG) && SIZEOF_VOID_P > 4 + // After this point, other threads can potentally use this type. 
+ ((PyObject*)type)->ob_flags |= _Py_TYPE_REVEALED_FLAG; +#endif finally: if (PyErr_Occurred()) { @@ -5571,8 +5882,6 @@ PyObject_GetItemData(PyObject *obj) static PyObject * find_name_in_mro(PyTypeObject *type, PyObject *name, int *error) { - ASSERT_TYPE_LOCK_HELD(); - Py_hash_t hash = _PyObject_HashFast(name); if (hash == -1) { *error = -1; @@ -5881,9 +6190,13 @@ _PyType_CacheGetItemForSpecialization(PyHeapTypeObject *ht, PyObject *descriptor void _PyType_SetFlags(PyTypeObject *self, unsigned long mask, unsigned long flags) { - BEGIN_TYPE_LOCK(); - type_set_flags_with_mask(self, mask, flags); - END_TYPE_LOCK(); + unsigned long new_flags = (self->tp_flags & ~mask) | flags; + if (new_flags != self->tp_flags) { + types_stop_world(); + // can't use new_flags here since they could be out-of-date + self->tp_flags = (self->tp_flags & ~mask) | flags; + types_start_world(); + } } int @@ -5930,9 +6243,9 @@ set_flags_recursive(PyTypeObject *self, unsigned long mask, unsigned long flags) void _PyType_SetFlagsRecursive(PyTypeObject *self, unsigned long mask, unsigned long flags) { - BEGIN_TYPE_LOCK(); + types_stop_world(); set_flags_recursive(self, mask, flags); - END_TYPE_LOCK(); + types_start_world(); } /* This is similar to PyObject_GenericGetAttr(), @@ -6046,6 +6359,8 @@ _Py_type_getattro(PyObject *tp, PyObject *name) return _Py_type_getattro_impl(type, name, NULL); } +// Called by type_setattro(). Updates both the type dict and +// the type versions. 
static int type_update_dict(PyTypeObject *type, PyDictObject *dict, PyObject *name, PyObject *value, PyObject **old_value) @@ -6075,10 +6390,30 @@ type_update_dict(PyTypeObject *type, PyDictObject *dict, PyObject *name, return -1; } - if (is_dunder_name(name)) { - return update_slot(type, name); - } + return 0; +} +static int +update_slot_after_setattr(PyTypeObject *type, PyObject *name) +{ +#ifdef Py_GIL_DISABLED + // stack allocate one chunk since that's all we need + assert(SLOT_UPDATE_CHUNK_SIZE >= MAX_EQUIV); + slot_update_chunk_t chunk = {0}; + slot_update_t queued_updates = {&chunk}; + + if (update_slot(type, name, &queued_updates) < 0) { + return -1; + } + if (queued_updates.head->n > 0) { + apply_type_slot_updates(&queued_updates); + ASSERT_TYPE_LOCK_HELD(); + // should never allocate another chunk + assert(chunk.prev == NULL); + } +#else + update_slot(type, name, NULL); +#endif return 0; } @@ -6136,7 +6471,9 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) PyObject *dict = type->tp_dict; if (dict == NULL) { - // We don't just do PyType_Ready because we could already be readying + // This is an unlikely case. PyType_Ready has not yet been done and + // we need to initialize tp_dict. We don't just do PyType_Ready + // because we could already be readying. BEGIN_TYPE_LOCK(); dict = type->tp_dict; if (dict == NULL) { @@ -6152,6 +6489,12 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) BEGIN_TYPE_DICT_LOCK(dict); res = type_update_dict(type, (PyDictObject *)dict, name, value, &old_value); assert(_PyType_CheckConsistency(type)); + if (res == 0) { + if (is_dunder_name(name) && has_slotdef(name)) { + // The name corresponds to a type slot. 
+ res = update_slot_after_setattr(type, name); + } + } END_TYPE_DICT_LOCK(); done: @@ -7081,15 +7424,10 @@ object_set_class(PyObject *self, PyObject *value, void *closure) return -1; } -#ifdef Py_GIL_DISABLED - PyInterpreterState *interp = _PyInterpreterState_GET(); - _PyEval_StopTheWorld(interp); -#endif + types_stop_world(); PyTypeObject *oldto = Py_TYPE(self); int res = object_set_class_world_stopped(self, newto); -#ifdef Py_GIL_DISABLED - _PyEval_StartTheWorld(interp); -#endif + types_start_world(); if (res == 0) { if (oldto->tp_flags & Py_TPFLAGS_HEAPTYPE) { Py_DECREF(oldto); @@ -8497,7 +8835,7 @@ type_ready_mro(PyTypeObject *type, int initial) } /* Calculate method resolution order */ - if (mro_internal_unlocked(type, initial, NULL) < 0) { + if (mro_internal(type, initial, NULL) < 0) { return -1; } PyObject *mro = lookup_tp_mro(type); @@ -9682,6 +10020,11 @@ tp_new_wrapper(PyObject *self, PyObject *args, PyObject *kwds) /* If staticbase is NULL now, it is a really weird type. In the spirit of backwards compatibility (?), just shut up. */ if (staticbase && staticbase->tp_new != type->tp_new) { + if (staticbase->tp_new == NULL) { + PyErr_Format(PyExc_TypeError, + "cannot create '%s' instances", subtype->tp_name); + return NULL; + } PyErr_Format(PyExc_TypeError, "%s.__new__(%s) is not safe, use %s.__new__()", type->tp_name, @@ -11020,12 +11363,21 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) { /* XXX Maybe this could be optimized more -- but is it worth it? */ +#ifdef Py_GIL_DISABLED + pytype_slotdef *ptrs[MAX_EQUIV]; + pytype_slotdef **pp = ptrs; + /* Collect all slotdefs that match name into ptrs. 
*/ + for (pytype_slotdef *p = slotdefs; p->name_strobj; p++) { + if (p->name_strobj == name) + *pp++ = p; + } + *pp = NULL; +#else /* pname and ptrs act as a little cache */ PyInterpreterState *interp = _PyInterpreterState_GET(); #define pname _Py_INTERP_CACHED_OBJECT(interp, type_slots_pname) #define ptrs _Py_INTERP_CACHED_OBJECT(interp, type_slots_ptrs) pytype_slotdef *p, **pp; - void **res, **ptr; if (pname != name) { /* Collect all slotdefs that match name into ptrs. */ @@ -11037,10 +11389,12 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) } *pp = NULL; } +#endif /* Look in all slots of the type matching the name. If exactly one of these has a filled-in slot, return a pointer to that slot. Otherwise, return NULL. */ + void **res, **ptr; res = NULL; for (pp = ptrs; *pp; pp++) { ptr = slotptr(type, (*pp)->offset); @@ -11050,11 +11404,25 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) return NULL; res = ptr; } - return res; +#ifndef Py_GIL_DISABLED #undef pname #undef ptrs +#endif + return res; } +// Return true if "name" corresponds to at least one slot definition. This is +// a more accurate but more expensive test compared to is_dunder_name(). +static bool +has_slotdef(PyObject *name) +{ + for (pytype_slotdef *p = slotdefs; p->name_strobj; p++) { + if (p->name_strobj == name) { + return true; + } + } + return false; +} /* Common code for update_slots_callback() and fixup_slot_dispatchers(). * @@ -11107,13 +11475,22 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) * There are some further special cases for specific slots, like supporting * __hash__ = None for tp_hash and special code for tp_new. * - * When done, return a pointer to the next slotdef with a different offset, - * because that's convenient for fixup_slot_dispatchers(). This function never - * sets an exception: if an internal error happens (unlikely), it's ignored. 
*/ -static pytype_slotdef * -update_one_slot(PyTypeObject *type, pytype_slotdef *p) + * When done, next_p is set to the next slotdef with a different offset, + * because that's convenient for fixup_slot_dispatchers(). + * + * If the queued_updates pointer is provided, the actual updates to the slot + * pointers are queued, rather than being immediately performed. That argument + * is only used for the free-threaded build since those updates need to be + * done while the world is stopped. + * + * This function will only return an error if the queued_updates argument is + * provided and allocating memory for the queue fails. Other exceptions that + * occur internally are ignored, such as when looking up descriptors. */ +static int +update_one_slot(PyTypeObject *type, pytype_slotdef *p, pytype_slotdef **next_p, + slot_update_t *queued_updates) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(type); PyObject *descr; PyWrapperDescrObject *d; @@ -11136,7 +11513,10 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) do { ++p; } while (p->offset == offset); - return p; + if (next_p != NULL) { + *next_p = p; + } + return 0; } /* We may end up clearing live exceptions below, so make sure it's ours. 
*/ assert(!PyErr_Occurred()); @@ -11219,16 +11599,41 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) } if (p->function == slot_tp_call) { /* A generic __call__ is incompatible with vectorcall */ - type_clear_flags(type, Py_TPFLAGS_HAVE_VECTORCALL); + if (queued_updates == NULL) { + type_clear_flags(type, Py_TPFLAGS_HAVE_VECTORCALL); + } } } Py_DECREF(descr); } while ((++p)->offset == offset); - if (specific && !use_generic) - *ptr = specific; - else - *ptr = generic; - return p; + + void *slot_value; + if (specific && !use_generic) { + slot_value = specific; + } else { + slot_value = generic; + } + +#ifdef Py_GIL_DISABLED + if (queued_updates != NULL) { + // queue the update to perform later, while world is stopped + if (queue_slot_update(queued_updates, type, ptr, slot_value) < 0) { + return -1; + } + } else { + // do the update to the type structure now + *ptr = slot_value; + } +#else + // always do the update immediately + assert(queued_updates == NULL); + *ptr = slot_value; +#endif + + if (next_p != NULL) { + *next_p = p; + } + return 0; } /* In the type, update the slots whose slotdefs are gathered in the pp array. @@ -11236,18 +11641,21 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) static int update_slots_callback(PyTypeObject *type, void *data) { - ASSERT_TYPE_LOCK_HELD(); + ASSERT_NEW_TYPE_OR_LOCKED(type); - pytype_slotdef **pp = (pytype_slotdef **)data; + update_callback_data_t *update_data = (update_callback_data_t *)data; + pytype_slotdef **pp = update_data->defs; for (; *pp; pp++) { - update_one_slot(type, *pp); + if (update_one_slot(type, *pp, NULL, update_data->queued_updates) < 0) { + return -1; + } } return 0; } /* Update the slots after assignment to a class (type) attribute. 
*/ static int -update_slot(PyTypeObject *type, PyObject *name) +update_slot(PyTypeObject *type, PyObject *name, slot_update_t *queued_updates) { pytype_slotdef *ptrs[MAX_EQUIV]; pytype_slotdef *p; @@ -11278,8 +11686,12 @@ update_slot(PyTypeObject *type, PyObject *name) } if (ptrs[0] == NULL) return 0; /* Not an attribute that affects any slots */ + + update_callback_data_t callback_data; + callback_data.defs = ptrs; + callback_data.queued_updates = queued_updates; return update_subclasses(type, name, - update_slots_callback, (void *)ptrs); + update_slots_callback, (void *)&callback_data); } /* Store the proper functions in the slot dispatches at class (type) @@ -11288,35 +11700,56 @@ update_slot(PyTypeObject *type, PyObject *name) static void fixup_slot_dispatchers(PyTypeObject *type) { - // This lock isn't strictly necessary because the type has not been - // exposed to anyone else yet, but update_ont_slot calls find_name_in_mro - // where we'd like to assert that the type is locked. - BEGIN_TYPE_LOCK(); - assert(!PyErr_Occurred()); for (pytype_slotdef *p = slotdefs; p->name; ) { - p = update_one_slot(type, p); + update_one_slot(type, p, &p, NULL); } - - END_TYPE_LOCK(); } -static void +#ifdef Py_GIL_DISABLED + +// Called when __bases__ is re-assigned. +static int update_all_slots(PyTypeObject* type) { - pytype_slotdef *p; - - ASSERT_TYPE_LOCK_HELD(); + // Note that update_slot() can fail due to out-of-memory when allocating + // the queue chunks to hold the updates. That's unlikely since the number + // of updates is normally small but we handle that case. update_slot() + // can fail internally for other reasons (a lookup fails) but those + // errors are suppressed. 
+ slot_update_t queued_updates = {0}; + for (pytype_slotdef *p = slotdefs; p->name; p++) { + if (update_slot(type, p->name_strobj, &queued_updates) < 0) { + if (queued_updates.head) { + slot_update_free_chunks(&queued_updates); + } + return -1; + } + } + if (queued_updates.head != NULL) { + apply_type_slot_updates(&queued_updates); + ASSERT_TYPE_LOCK_HELD(); + slot_update_free_chunks(&queued_updates); + } + return 0; +} - /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ - type_modified_unlocked(type); +#else +// Called when __bases__ is re-assigned. +static int +update_all_slots(PyTypeObject* type) +{ + pytype_slotdef *p; for (p = slotdefs; p->name; p++) { - /* update_slot returns int but can't actually fail */ - update_slot(type, p->name_strobj); + /* update_slot returns int but can't actually fail in this case*/ + update_slot(type, p->name_strobj, NULL); } + return 0; } +#endif + PyObject * _PyType_GetSlotWrapperNames(void) @@ -11586,7 +12019,10 @@ PyType_Freeze(PyTypeObject *type) } BEGIN_TYPE_LOCK(); + types_stop_world(); type_add_flags(type, Py_TPFLAGS_IMMUTABLETYPE); + types_start_world(); + ASSERT_TYPE_LOCK_HELD(); type_modified_unlocked(type); END_TYPE_LOCK(); diff --git a/Objects/typevarobject.c b/Objects/typevarobject.c index 6c199a52aa0..cead6e69af5 100644 --- a/Objects/typevarobject.c +++ b/Objects/typevarobject.c @@ -192,7 +192,7 @@ constevaluator_call(PyObject *self, PyObject *args, PyObject *kwargs) for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(value); i++) { PyObject *item = PyTuple_GET_ITEM(value, i); if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { PyUnicodeWriter_Discard(writer); return NULL; } @@ -273,7 +273,7 @@ _Py_typing_type_repr(PyUnicodeWriter *writer, PyObject *p) } if (p == (PyObject *)&_PyNone_Type) { - return PyUnicodeWriter_WriteUTF8(writer, "None", 4); + return PyUnicodeWriter_WriteASCII(writer, "None", 4); } if ((rc = 
PyObject_HasAttrWithError(p, &_Py_ID(__origin__))) > 0 && diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f3f0c9646a6..5c2308a0121 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -167,11 +167,7 @@ static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length) #define _PyUnicode_HASH(op) \ (_PyASCIIObject_CAST(op)->hash) -static inline Py_hash_t PyUnicode_HASH(PyObject *op) -{ - assert(_PyUnicode_CHECK(op)); - return FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash); -} +#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash) { @@ -6596,13 +6592,15 @@ _PyUnicode_GetNameCAPI(void) /* --- Unicode Escape Codec ----------------------------------------------- */ PyObject * -_PyUnicode_DecodeUnicodeEscapeInternal(const char *s, +_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s, Py_ssize_t size, const char *errors, Py_ssize_t *consumed, - const char **first_invalid_escape) + int *first_invalid_escape_char, + const char **first_invalid_escape_ptr) { const char *starts = s; + const char *initial_starts = starts; _PyUnicodeWriter writer; const char *end; PyObject *errorHandler = NULL; @@ -6610,7 +6608,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, _PyUnicode_Name_CAPI *ucnhash_capi; // so we can remember if we've seen an invalid escape char or not - *first_invalid_escape = NULL; + *first_invalid_escape_char = -1; + *first_invalid_escape_ptr = NULL; if (size == 0) { if (consumed) { @@ -6698,9 +6697,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, } } if (ch > 0377) { - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-3; /* Back up 3 chars, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = ch; + if (starts == initial_starts) { + /* Back up 3 chars, since we've already incremented s. 
*/ + *first_invalid_escape_ptr = s - 3; + } } } WRITE_CHAR(ch); @@ -6795,9 +6797,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, goto error; default: - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-1; /* Back up one char, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = c; + if (starts == initial_starts) { + /* Back up one char, since we've already incremented s. */ + *first_invalid_escape_ptr = s - 1; + } } WRITE_ASCII_CHAR('\\'); WRITE_CHAR(c); @@ -6842,19 +6847,20 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, const char *errors, Py_ssize_t *consumed) { - const char *first_invalid_escape; - PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, + int first_invalid_escape_char; + const char *first_invalid_escape_ptr; + PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors, consumed, - &first_invalid_escape); + &first_invalid_escape_char, + &first_invalid_escape_ptr); if (result == NULL) return NULL; - if (first_invalid_escape != NULL) { - unsigned char c = *first_invalid_escape; - if ('4' <= c && c <= '7') { + if (first_invalid_escape_char != -1) { + if (first_invalid_escape_char > 0xff) { if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "\"\\%.3s\" is an invalid octal escape sequence. " + "\"\\%o\" is an invalid octal escape sequence. " "Such sequences will not work in the future. ", - first_invalid_escape) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; @@ -6864,7 +6870,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "\"\\%c\" is an invalid escape sequence. " "Such sequences will not work in the future. 
", - c) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; @@ -13919,7 +13925,12 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) { - if (Py_TYPE(obj) == &PyLong_Type) { + PyTypeObject *type = Py_TYPE(obj); + if (type == &PyUnicode_Type) { + return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj); + } + + if (type == &PyLong_Type) { return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0); } @@ -14068,6 +14079,20 @@ _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, return 0; } + +int +PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer, + const char *str, + Py_ssize_t size) +{ + assert(writer != NULL); + _Py_AssertHoldsTstate(); + + _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer; + return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size); +} + + int PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, const char *str, diff --git a/Objects/unionobject.c b/Objects/unionobject.c index 66435924b6c..2206ed80ef0 100644 --- a/Objects/unionobject.c +++ b/Objects/unionobject.c @@ -4,6 +4,7 @@ #include "pycore_typevarobject.h" // _PyTypeAlias_Type, _Py_typing_type_repr #include "pycore_unicodeobject.h" // _PyUnicode_EqualToASCIIString #include "pycore_unionobject.h" +#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() typedef struct { @@ -21,9 +22,7 @@ unionobject_dealloc(PyObject *self) unionobject *alias = (unionobject *)self; _PyObject_GC_UNTRACK(self); - if (alias->weakreflist != NULL) { - PyObject_ClearWeakRefs((PyObject *)alias); - } + FT_CLEAR_WEAKREFS(self, alias->weakreflist); Py_XDECREF(alias->args); Py_XDECREF(alias->hashable_args); @@ -290,7 +289,7 @@ union_repr(PyObject *self) } for (Py_ssize_t i = 0; i < len; i++) { - if (i > 0 && PyUnicodeWriter_WriteUTF8(writer, " | ", 3) < 0) { + if (i > 0 && PyUnicodeWriter_WriteASCII(writer, " | ", 3) < 0) { goto error; } PyObject *p = PyTuple_GET_ITEM(alias->args, i); 
@@ -300,12 +299,12 @@ union_repr(PyObject *self) } #if 0 - PyUnicodeWriter_WriteUTF8(writer, "|args=", 6); + PyUnicodeWriter_WriteASCII(writer, "|args=", 6); PyUnicodeWriter_WriteRepr(writer, alias->args); - PyUnicodeWriter_WriteUTF8(writer, "|h=", 3); + PyUnicodeWriter_WriteASCII(writer, "|h=", 3); PyUnicodeWriter_WriteRepr(writer, alias->hashable_args); if (alias->unhashable_args) { - PyUnicodeWriter_WriteUTF8(writer, "|u=", 3); + PyUnicodeWriter_WriteASCII(writer, "|u=", 3); PyUnicodeWriter_WriteRepr(writer, alias->unhashable_args); } #endif |