Diffstat (limited to 'Python')
43 files changed, 4032 insertions, 2526 deletions
diff --git a/Python/Python-ast.c b/Python/Python-ast.c index f7625ab1205..660bc598a48 100644 --- a/Python/Python-ast.c +++ b/Python/Python-ast.c @@ -5796,7 +5796,7 @@ ast_repr_list(PyObject *list, int depth) for (Py_ssize_t i = 0; i < Py_MIN(length, 2); i++) { if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { goto error; } } @@ -5820,7 +5820,7 @@ ast_repr_list(PyObject *list, int depth) } if (i == 0 && length > 2) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ...", 5) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ...", 5) < 0) { goto error; } } @@ -5924,7 +5924,7 @@ ast_repr_max_depth(AST_object *self, int depth) } if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { Py_DECREF(name); Py_DECREF(value_repr); goto error; diff --git a/Python/_warnings.c b/Python/_warnings.c index 39bf1b225cc..12e6172b0cf 100644 --- a/Python/_warnings.c +++ b/Python/_warnings.c @@ -6,7 +6,6 @@ #include "pycore_long.h" // _PyLong_GetZero() #include "pycore_pylifecycle.h" // _Py_IsInterpreterFinalizing() #include "pycore_pystate.h" // _PyThreadState_GET() -#include "pycore_sysmodule.h" // _PySys_GetOptionalAttr() #include "pycore_traceback.h" // _Py_DisplaySourceLine() #include "pycore_unicodeobject.h" // _PyUnicode_EqualToASCIIString() @@ -678,7 +677,7 @@ show_warning(PyThreadState *tstate, PyObject *filename, int lineno, goto error; } - if (_PySys_GetOptionalAttr(&_Py_ID(stderr), &f_stderr) <= 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stderr), &f_stderr) <= 0) { fprintf(stderr, "lost sys.stderr\n"); goto error; } diff --git a/Python/asm_trampoline.S b/Python/asm_trampoline.S index 0a3265dfeee..616752459ba 100644 --- a/Python/asm_trampoline.S +++ b/Python/asm_trampoline.S @@ -9,6 +9,9 @@ # } _Py_trampoline_func_start: #ifdef __x86_64__ +#if defined(__CET__) && (__CET__ & 1) + endbr64 +#endif sub $8, %rsp call *%rcx add $8, %rsp @@ -34,3 +37,22 @@ _Py_trampoline_func_start: .globl _Py_trampoline_func_end _Py_trampoline_func_end: .section .note.GNU-stack,"",@progbits +# Note for indicating the assembly code supports CET +#if defined(__x86_64__) && defined(__CET__) && (__CET__ & 1) + .section .note.gnu.property,"a" + .align 8 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .string "GNU" +1: + .align 8 + .long 0xc0000002 + .long 3f - 2f +2: + .long 0x3 +3: + .align 8 +4: +#endif // __x86_64__ diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c index 3d0295ee388..e08c63924ca 100644 --- a/Python/bltinmodule.c +++ b/Python/bltinmodule.c @@ -14,7 +14,6 @@ #include "pycore_pyerrors.h" // _PyErr_NoMemory() #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_pythonrun.h" // _Py_SourceAsString() -#include "pycore_sysmodule.h" // _PySys_GetRequiredAttr() #include "pycore_tuple.h" // _PyTuple_FromArray() #include "pycore_cell.h" // PyCell_GetRef() @@ -465,7 +464,7 @@ builtin_callable(PyObject *module, PyObject *obj) static PyObject * builtin_breakpoint(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *keywords) { - PyObject *hook = _PySys_GetRequiredAttrString("breakpointhook"); + PyObject *hook = PySys_GetAttrString("breakpointhook"); if (hook == NULL) { return NULL; } @@ -2164,7 +2163,7 @@ builtin_print_impl(PyObject *module, PyObject * const *args, int i, err; if (file == Py_None) { - file = _PySys_GetRequiredAttr(&_Py_ID(stdout)); + file = PySys_GetAttr(&_Py_ID(stdout)); if (file == NULL) { return NULL; } @@ -2270,7 +2269,7 @@ 
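[Note on the Python-ast.c hunks above] The repr helpers swap PyUnicodeWriter_WriteUTF8() for PyUnicodeWriter_WriteASCII() wherever the literal is pure ASCII, letting the writer skip UTF-8 decoding. A minimal sketch of the calling pattern, assuming a CPython with the public PyUnicodeWriter API and PyUnicodeWriter_WriteASCII; demo_repr is a hypothetical example, not code from this commit:

#include <Python.h>

/* Build "<demo repr(obj)>" with the writer API; the literals are pure
 * ASCII, so WriteASCII can skip the decoding WriteUTF8 performs. */
static PyObject *
demo_repr(PyObject *obj)
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    if (PyUnicodeWriter_WriteASCII(writer, "<demo ", 6) < 0) {
        goto error;
    }
    if (PyUnicodeWriter_WriteRepr(writer, obj) < 0) {
        goto error;
    }
    if (PyUnicodeWriter_WriteASCII(writer, ">", 1) < 0) {
        goto error;
    }
    return PyUnicodeWriter_Finish(writer);
error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}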
builtin_input_impl(PyObject *module, PyObject *prompt) int tty; /* Check that stdin/out/err are intact */ - fin = _PySys_GetRequiredAttr(&_Py_ID(stdin)); + fin = PySys_GetAttr(&_Py_ID(stdin)); if (fin == NULL) { goto error; } @@ -2278,7 +2277,7 @@ builtin_input_impl(PyObject *module, PyObject *prompt) PyErr_SetString(PyExc_RuntimeError, "lost sys.stdin"); goto error; } - fout = _PySys_GetRequiredAttr(&_Py_ID(stdout)); + fout = PySys_GetAttr(&_Py_ID(stdout)); if (fout == NULL) { goto error; } @@ -2286,7 +2285,7 @@ builtin_input_impl(PyObject *module, PyObject *prompt) PyErr_SetString(PyExc_RuntimeError, "lost sys.stdout"); goto error; } - ferr = _PySys_GetRequiredAttr(&_Py_ID(stderr)); + ferr = PySys_GetAttr(&_Py_ID(stderr)); if (ferr == NULL) { goto error; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 42e4f581894..c4b13da5db4 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -295,55 +295,18 @@ dummy_func( value2 = PyStackRef_Borrow(GETLOCAL(oparg2)); } - family(LOAD_CONST, 0) = { - LOAD_CONST_MORTAL, - LOAD_CONST_IMMORTAL, - }; - inst(LOAD_CONST, (-- value)) { - /* We can't do this in the bytecode compiler as - * marshalling can intern strings and make them immortal. */ PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - value = PyStackRef_FromPyObjectNew(obj); -#if ENABLE_SPECIALIZATION_FT -#ifdef Py_GIL_DISABLED - uint8_t expected = LOAD_CONST; - if (!_Py_atomic_compare_exchange_uint8( - &this_instr->op.code, &expected, - _Py_IsImmortal(obj) ? LOAD_CONST_IMMORTAL : LOAD_CONST_MORTAL)) { - // We might lose a race with instrumentation, which we don't care about. - assert(expected >= MIN_INSTRUMENTED_OPCODE); - } -#else - if (this_instr->op.code == LOAD_CONST) { - this_instr->op.code = _Py_IsImmortal(obj) ? LOAD_CONST_IMMORTAL : LOAD_CONST_MORTAL; - } -#endif -#endif - } - - inst(LOAD_CONST_MORTAL, (-- value)) { - PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - value = PyStackRef_FromPyObjectNewMortal(obj); - } - - inst(LOAD_CONST_IMMORTAL, (-- value)) { - PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - assert(_Py_IsImmortal(obj)); - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); } replicate(4) inst(LOAD_SMALL_INT, (-- value)) { assert(oparg < _PY_NSMALLPOSINTS); PyObject *obj = (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + oparg]; - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); } replicate(8) inst(STORE_FAST, (value --)) { - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; DEAD(value); @@ -355,10 +318,6 @@ dummy_func( }; inst(STORE_FAST_LOAD_FAST, (value1 -- value2)) { - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value1) - ); uint32_t oparg1 = oparg >> 4; uint32_t oparg2 = oparg & 15; _PyStackRef tmp = GETLOCAL(oparg1); @@ -369,14 +328,6 @@ dummy_func( } inst(STORE_FAST_STORE_FAST, (value2, value1 --)) { - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value1) - ); - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value2) - ); uint32_t oparg1 = oparg >> 4; uint32_t oparg2 = oparg & 15; _PyStackRef tmp = GETLOCAL(oparg1); @@ -390,7 +341,12 @@ dummy_func( } pure inst(POP_TOP, (value --)) { - PyStackRef_CLOSE(value); + 
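[Note on the _warnings.c and bltinmodule.c hunks] The private _PySys_GetRequiredAttr()/_PySys_GetOptionalAttr() helpers are replaced by the public PySys_GetAttr()/PySys_GetOptionalAttr() family, which is why the pycore_sysmodule.h includes are dropped. A minimal sketch of the two lookup styles, assuming the public functions; both hand back strong references, and the demo_* functions are hypothetical:

#include <Python.h>

static int
demo_write_stdout(const char *text)
{
    /* Required lookup: raises RuntimeError if sys.stdout is missing. */
    PyObject *out = PySys_GetAttrString("stdout");
    if (out == NULL) {
        return -1;
    }
    int rc = PyFile_WriteString(text, out);
    Py_DECREF(out);
    return rc;
}

static int
demo_have_stderr(void)
{
    /* Optional lookup: returns 1 = found, 0 = missing, -1 = error set. */
    PyObject *err_stream;
    int rc = PySys_GetOptionalAttrString("stderr", &err_stream);
    Py_XDECREF(err_stream);
    return rc;
}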
PyStackRef_XCLOSE(value); + } + + tier2 op(_POP_TWO, (nos, tos --)) { + PyStackRef_CLOSE(tos); + PyStackRef_CLOSE(nos); } pure inst(PUSH_NULL, (-- res)) { @@ -406,9 +362,14 @@ dummy_func( PyStackRef_CLOSE(value); } - macro(POP_ITER) = POP_TOP; - no_save_ip tier1 inst(INSTRUMENTED_END_FOR, (receiver, value -- receiver)) { + inst(POP_ITER, (iter, index_or_null -- )) { + (void)index_or_null; + DEAD(index_or_null); + PyStackRef_CLOSE(iter); + } + + no_save_ip tier1 inst(INSTRUMENTED_END_FOR, (receiver, index_or_null, value -- receiver, index_or_null)) { /* Need to create a fake StopIteration error here, * to conform to PEP 380 */ if (PyStackRef_GenCheck(receiver)) { @@ -420,7 +381,9 @@ dummy_func( PyStackRef_CLOSE(value); } - tier1 inst(INSTRUMENTED_POP_ITER, (iter -- )) { + tier1 inst(INSTRUMENTED_POP_ITER, (iter, index_or_null -- )) { + (void)index_or_null; + DEAD(index_or_null); INSTRUMENTED_JUMP(prev_instr, this_instr+1, PY_MONITORING_EVENT_BRANCH_RIGHT); PyStackRef_CLOSE(iter); } @@ -942,7 +905,7 @@ dummy_func( PyStackRef_CLOSE_SPECIALIZED(sub_st, _PyLong_ExactDealloc); DEAD(sub_st); PyStackRef_CLOSE(str_st); - res = PyStackRef_FromPyObjectImmortal(res_o); + res = PyStackRef_FromPyObjectBorrow(res_o); } op(_GUARD_NOS_TUPLE, (nos, unused -- nos, unused)) { @@ -3085,15 +3048,24 @@ dummy_func( values_or_none = PyStackRef_FromPyObjectSteal(values_or_none_o); } - inst(GET_ITER, (iterable -- iter)) { + inst(GET_ITER, (iterable -- iter, index_or_null)) { #ifdef Py_STATS _Py_GatherStats_GetIter(iterable); #endif /* before: [obj]; after [getiter(obj)] */ - PyObject *iter_o = PyObject_GetIter(PyStackRef_AsPyObjectBorrow(iterable)); - PyStackRef_CLOSE(iterable); - ERROR_IF(iter_o == NULL); - iter = PyStackRef_FromPyObjectSteal(iter_o); + PyTypeObject *tp = PyStackRef_TYPE(iterable); + if (tp == &PyTuple_Type || tp == &PyList_Type) { + iter = iterable; + DEAD(iterable); + index_or_null = PyStackRef_TagInt(0); + } + else { + PyObject *iter_o = PyObject_GetIter(PyStackRef_AsPyObjectBorrow(iterable)); + PyStackRef_CLOSE(iterable); + ERROR_IF(iter_o == NULL); + iter = PyStackRef_FromPyObjectSteal(iter_o); + index_or_null = PyStackRef_NULL; + } } inst(GET_YIELD_FROM_ITER, (iterable -- iter)) { @@ -3140,11 +3112,11 @@ dummy_func( FOR_ITER_GEN, }; - specializing op(_SPECIALIZE_FOR_ITER, (counter/1, iter -- iter)) { + specializing op(_SPECIALIZE_FOR_ITER, (counter/1, iter, null_or_index -- iter, null_or_index)) { #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; - _Py_Specialize_ForIter(iter, next_instr, oparg); + _Py_Specialize_ForIter(iter, null_or_index, next_instr, oparg); DISPATCH_SAME_OPARG(); } OPCODE_DEFERRED_INC(FOR_ITER); @@ -3152,111 +3124,71 @@ dummy_func( #endif /* ENABLE_SPECIALIZATION_FT */ } - replaced op(_FOR_ITER, (iter -- iter, next)) { - /* before: [iter]; after: [iter, iter()] *or* [] (and jump over END_FOR.) 
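[Note on the GET_ITER hunk above] Loop setup changes shape: for exact lists and tuples the sequence itself stays on the stack next to a tagged integer index (PyStackRef_TagInt(0)), and only other iterables get a real iterator object with PyStackRef_NULL in the index slot. An illustrative stand-alone model of low-bit integer tagging; the encoding here is an assumption for the sketch only, the real _PyStackRef layout is internal and may differ:

#include <stdint.h>

typedef struct { uintptr_t bits; } demo_ref;   /* stand-in for _PyStackRef */

/* Tag a small index in the low bit so it can share a stack slot with
 * object pointers, which are aligned and therefore have bit 0 clear. */
static inline demo_ref demo_tag_int(intptr_t i) {
    return (demo_ref){ ((uintptr_t)i << 1) | 1u };
}
static inline int demo_is_tagged_int(demo_ref r) {
    return (r.bits & 1u) != 0;
}
static inline intptr_t demo_untag_int(demo_ref r) {
    return (intptr_t)(r.bits >> 1);
}
static inline demo_ref demo_increment(demo_ref r) {
    return (demo_ref){ r.bits + 2u };   /* +1 in the shifted representation */
}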
*/ - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - PyObject *next_o = (*Py_TYPE(iter_o)->tp_iternext)(iter_o); - if (next_o == NULL) { - if (_PyErr_Occurred(tstate)) { - int matches = _PyErr_ExceptionMatches(tstate, PyExc_StopIteration); - if (!matches) { - ERROR_NO_POP(); - } - _PyEval_MonitorRaise(tstate, frame, this_instr); - _PyErr_Clear(tstate); + replaced op(_FOR_ITER, (iter, null_or_index -- iter, null_or_index, next)) { + _PyStackRef item = _PyForIter_VirtualIteratorNext(tstate, frame, iter, &null_or_index); + if (!PyStackRef_IsValid(item)) { + if (PyStackRef_IsError(item)) { + ERROR_NO_POP(); } - /* iterator ended normally */ - assert(next_instr[oparg].op.code == END_FOR || - next_instr[oparg].op.code == INSTRUMENTED_END_FOR); - /* Jump forward oparg, then skip following END_FOR */ + // Jump forward by oparg and skip the following END_FOR JUMPBY(oparg + 1); DISPATCH(); } - next = PyStackRef_FromPyObjectSteal(next_o); - // Common case: no jump, leave it to the code generator + next = item; } - op(_FOR_ITER_TIER_TWO, (iter -- iter, next)) { - /* before: [iter]; after: [iter, iter()] *or* [] (and jump over END_FOR.) */ - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - PyObject *next_o = (*Py_TYPE(iter_o)->tp_iternext)(iter_o); - if (next_o == NULL) { - if (_PyErr_Occurred(tstate)) { - int matches = _PyErr_ExceptionMatches(tstate, PyExc_StopIteration); - if (!matches) { - ERROR_NO_POP(); - } - _PyEval_MonitorRaise(tstate, frame, frame->instr_ptr); - _PyErr_Clear(tstate); + op(_FOR_ITER_TIER_TWO, (iter, null_or_index -- iter, null_or_index, next)) { + _PyStackRef item = _PyForIter_VirtualIteratorNext(tstate, frame, iter, &null_or_index); + if (!PyStackRef_IsValid(item)) { + if (PyStackRef_IsError(item)) { + ERROR_NO_POP(); } /* iterator ended normally */ /* The translator sets the deopt target just past the matching END_FOR */ EXIT_IF(true); } - next = PyStackRef_FromPyObjectSteal(next_o); - // Common case: no jump, leave it to the code generator + next = item; } + macro(FOR_ITER) = _SPECIALIZE_FOR_ITER + _FOR_ITER; - inst(INSTRUMENTED_FOR_ITER, (unused/1, iter -- iter, next)) { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - PyObject *next_o = (*Py_TYPE(iter_o)->tp_iternext)(iter_o); - if (next_o != NULL) { - next = PyStackRef_FromPyObjectSteal(next_o); - INSTRUMENTED_JUMP(this_instr, next_instr, PY_MONITORING_EVENT_BRANCH_LEFT); - } - else { - if (_PyErr_Occurred(tstate)) { - int matches = _PyErr_ExceptionMatches(tstate, PyExc_StopIteration); - if (!matches) { - ERROR_NO_POP(); - } - _PyEval_MonitorRaise(tstate, frame, this_instr); - _PyErr_Clear(tstate); + inst(INSTRUMENTED_FOR_ITER, (unused/1, iter, null_or_index -- iter, null_or_index, next)) { + _PyStackRef item = _PyForIter_VirtualIteratorNext(tstate, frame, iter, &null_or_index); + if (!PyStackRef_IsValid(item)) { + if (PyStackRef_IsError(item)) { + ERROR_NO_POP(); } - /* iterator ended normally */ - assert(next_instr[oparg].op.code == END_FOR || - next_instr[oparg].op.code == INSTRUMENTED_END_FOR); - /* Skip END_FOR */ + // Jump forward by oparg and skip the following END_FOR JUMPBY(oparg + 1); DISPATCH(); } + next = item; + INSTRUMENTED_JUMP(this_instr, next_instr, PY_MONITORING_EVENT_BRANCH_LEFT); } - - op(_ITER_CHECK_LIST, (iter -- iter)) { + op(_ITER_CHECK_LIST, (iter, null_or_index -- iter, null_or_index)) { PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - EXIT_IF(Py_TYPE(iter_o) != &PyListIter_Type); + EXIT_IF(Py_TYPE(iter_o) != &PyList_Type); + 
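[Note on the FOR_ITER rewrites above] The three FOR_ITER variants now funnel through one helper, _PyForIter_VirtualIteratorNext() (defined in the ceval.c hunk below), which reports exactly three outcomes: a valid item, exhaustion, or an error. A hedged model of that convention using plain C API calls, with an explicit status enum standing in for the PyStackRef_NULL/PyStackRef_ERROR sentinels; virtual_next_demo is hypothetical:

#include <Python.h>

typedef enum { ITER_ITEM, ITER_EXHAUSTED, ITER_ERROR } iter_status;

/* 'iter' must already be an iterator object (tp_iternext non-NULL). */
static iter_status
virtual_next_demo(PyObject *iter, PyObject **out)
{
    PyObject *next = (*Py_TYPE(iter)->tp_iternext)(iter);
    if (next != NULL) {
        *out = next;                        /* strong reference */
        return ITER_ITEM;
    }
    if (PyErr_Occurred()) {
        if (!PyErr_ExceptionMatches(PyExc_StopIteration)) {
            return ITER_ERROR;              /* real error: propagate */
        }
        PyErr_Clear();                      /* StopIteration == exhaustion */
    }
    return ITER_EXHAUSTED;                  /* caller jumps past END_FOR */
}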
assert(PyStackRef_IsTaggedInt(null_or_index)); #ifdef Py_GIL_DISABLED - EXIT_IF(!_PyObject_IsUniquelyReferenced(iter_o)); - _PyListIterObject *it = (_PyListIterObject *)iter_o; - EXIT_IF(!_Py_IsOwnedByCurrentThread((PyObject *)it->it_seq) || - !_PyObject_GC_IS_SHARED(it->it_seq)); + EXIT_IF(!_Py_IsOwnedByCurrentThread(iter_o) && !_PyObject_GC_IS_SHARED(iter_o)); #endif } - replaced op(_ITER_JUMP_LIST, (iter -- iter)) { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - assert(Py_TYPE(iter_o) == &PyListIter_Type); -// For free-threaded Python, the loop exit can happen at any point during -// item retrieval, so it doesn't make much sense to check and jump -// separately before item retrieval. Any length check we do here can be -// invalid by the time we actually try to fetch the item. + replaced op(_ITER_JUMP_LIST, (iter, null_or_index -- iter, null_or_index)) { #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - (void)iter_o; + // For free-threaded Python, the loop exit can happen at any point during + // item retrieval, so it doesn't make much sense to check and jump + // separately before item retrieval. Any length check we do here can be + // invalid by the time we actually try to fetch the item. #else - _PyListIterObject *it = (_PyListIterObject *)iter_o; + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(list_o) == &PyList_Type); STAT_INC(FOR_ITER, hit); - PyListObject *seq = it->it_seq; - if (seq == NULL || (size_t)it->it_index >= (size_t)PyList_GET_SIZE(seq)) { - it->it_index = -1; - if (seq != NULL) { - it->it_seq = NULL; - Py_DECREF(seq); - } + if ((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyList_GET_SIZE(list_o)) { + null_or_index = PyStackRef_TagInt(-1); /* Jump forward oparg, then skip following END_FOR instruction */ JUMPBY(oparg + 1); DISPATCH(); @@ -3265,73 +3197,54 @@ dummy_func( } // Only used by Tier 2 - op(_GUARD_NOT_EXHAUSTED_LIST, (iter -- iter)) { + op(_GUARD_NOT_EXHAUSTED_LIST, (iter, null_or_index -- iter, null_or_index)) { #ifndef Py_GIL_DISABLED - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyListIterObject *it = (_PyListIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyListIter_Type); - PyListObject *seq = it->it_seq; - EXIT_IF(seq == NULL); - if ((size_t)it->it_index >= (size_t)PyList_GET_SIZE(seq)) { - it->it_index = -1; - EXIT_IF(1); - } + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(list_o) == &PyList_Type); + EXIT_IF((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyList_GET_SIZE(list_o)); #endif } - replaced op(_ITER_NEXT_LIST, (iter -- iter, next)) { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyListIterObject *it = (_PyListIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyListIter_Type); - PyListObject *seq = it->it_seq; - assert(seq); + replaced op(_ITER_NEXT_LIST, (iter, null_or_index -- iter, null_or_index, next)) { + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(PyList_CheckExact(list_o)); #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - assert(_Py_IsOwnedByCurrentThread((PyObject *)seq) || - _PyObject_GC_IS_SHARED(seq)); + assert(_Py_IsOwnedByCurrentThread(list_o) || + _PyObject_GC_IS_SHARED(list_o)); STAT_INC(FOR_ITER, hit); - int result = _PyList_GetItemRefNoLock(seq, it->it_index, &next); + int result = _PyList_GetItemRefNoLock((PyListObject *)list_o, PyStackRef_UntagInt(null_or_index), &next); // A negative result means we lost a race with another thread // and we need to take 
the slow path. DEOPT_IF(result < 0); if (result == 0) { - it->it_index = -1; + null_or_index = PyStackRef_TagInt(-1); /* Jump forward oparg, then skip following END_FOR instruction */ JUMPBY(oparg + 1); DISPATCH(); } - it->it_index++; #else - assert(it->it_index < PyList_GET_SIZE(seq)); - next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(seq, it->it_index++)); + next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(list_o, PyStackRef_UntagInt(null_or_index))); #endif + null_or_index = PyStackRef_IncrementTaggedIntNoOverflow(null_or_index); } // Only used by Tier 2 - op(_ITER_NEXT_LIST_TIER_TWO, (iter -- iter, next)) { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyListIterObject *it = (_PyListIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyListIter_Type); - PyListObject *seq = it->it_seq; - assert(seq); + op(_ITER_NEXT_LIST_TIER_TWO, (iter, null_or_index -- iter, null_or_index, next)) { + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(PyList_CheckExact(list_o)); #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - assert(_Py_IsOwnedByCurrentThread((PyObject *)seq) || - _PyObject_GC_IS_SHARED(seq)); + assert(_Py_IsOwnedByCurrentThread((PyObject *)list_o) || + _PyObject_GC_IS_SHARED(list_o)); STAT_INC(FOR_ITER, hit); - int result = _PyList_GetItemRefNoLock(seq, it->it_index, &next); + int result = _PyList_GetItemRefNoLock((PyListObject *)list_o, PyStackRef_UntagInt(null_or_index), &next); // A negative result means we lost a race with another thread // and we need to take the slow path. - EXIT_IF(result < 0); - if (result == 0) { - it->it_index = -1; - EXIT_IF(1); - } - it->it_index++; + DEOPT_IF(result <= 0); #else - assert(it->it_index < PyList_GET_SIZE(seq)); - next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(seq, it->it_index++)); + assert(PyStackRef_UntagInt(null_or_index) < PyList_GET_SIZE(list_o)); + next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(list_o, PyStackRef_UntagInt(null_or_index))); #endif + null_or_index = PyStackRef_IncrementTaggedIntNoOverflow(null_or_index); } macro(FOR_ITER_LIST) = @@ -3340,31 +3253,19 @@ dummy_func( _ITER_JUMP_LIST + _ITER_NEXT_LIST; - op(_ITER_CHECK_TUPLE, (iter -- iter)) { + op(_ITER_CHECK_TUPLE, (iter, null_or_index -- iter, null_or_index)) { PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - EXIT_IF(Py_TYPE(iter_o) != &PyTupleIter_Type); -#ifdef Py_GIL_DISABLED - EXIT_IF(!_PyObject_IsUniquelyReferenced(iter_o)); -#endif + EXIT_IF(Py_TYPE(iter_o) != &PyTuple_Type); + assert(PyStackRef_IsTaggedInt(null_or_index)); } - replaced op(_ITER_JUMP_TUPLE, (iter -- iter)) { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - (void)iter_o; - assert(Py_TYPE(iter_o) == &PyTupleIter_Type); -#ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); -#endif - _PyTupleIterObject *it = (_PyTupleIterObject *)iter_o; + replaced op(_ITER_JUMP_TUPLE, (iter, null_or_index -- iter, null_or_index)) { + PyObject *tuple_o = PyStackRef_AsPyObjectBorrow(iter); + (void)tuple_o; + assert(Py_TYPE(tuple_o) == &PyTuple_Type); STAT_INC(FOR_ITER, hit); - PyTupleObject *seq = it->it_seq; - if (seq == NULL || (size_t)it->it_index >= (size_t)PyTuple_GET_SIZE(seq)) { -#ifndef Py_GIL_DISABLED - if (seq != NULL) { - it->it_seq = NULL; - Py_DECREF(seq); - } -#endif + if ((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyTuple_GET_SIZE(tuple_o)) { + null_or_index = PyStackRef_TagInt(-1); /* Jump forward oparg, then skip following END_FOR instruction */ JUMPBY(oparg + 1); DISPATCH(); @@ -3372,29 +3273,19 @@ 
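[Note on the list specialization above] With the listiterator object gone, _ITER_JUMP_LIST and _ITER_NEXT_LIST index the list directly and bump the tagged index each step, comparing it against the current size on every pass because the loop body may mutate the list. A public-API analogue of that per-item discipline, assuming an exact list as the guards establish; demo_for_each_list is hypothetical:

#include <Python.h>

static int
demo_for_each_list(PyObject *list, int (*visit)(PyObject *))
{
    /* Recheck the size every iteration rather than caching it: the
     * visit callback may shrink the list, just as a loop body can. */
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(list); i++) {
        if (visit(PyList_GET_ITEM(list, i)) < 0) {  /* borrowed reference */
            return -1;
        }
    }
    return 0;
}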
dummy_func( } // Only used by Tier 2 - op(_GUARD_NOT_EXHAUSTED_TUPLE, (iter -- iter)) { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyTupleIterObject *it = (_PyTupleIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyTupleIter_Type); -#ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); -#endif - PyTupleObject *seq = it->it_seq; - EXIT_IF(seq == NULL); - EXIT_IF(it->it_index >= PyTuple_GET_SIZE(seq)); + op(_GUARD_NOT_EXHAUSTED_TUPLE, (iter, null_or_index -- iter, null_or_index)) { + PyObject *tuple_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(tuple_o) == &PyTuple_Type); + EXIT_IF((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyTuple_GET_SIZE(tuple_o)); } - op(_ITER_NEXT_TUPLE, (iter -- iter, next)) { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyTupleIterObject *it = (_PyTupleIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyTupleIter_Type); - PyTupleObject *seq = it->it_seq; -#ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); -#endif - assert(seq); - assert(it->it_index < PyTuple_GET_SIZE(seq)); - next = PyStackRef_FromPyObjectNew(PyTuple_GET_ITEM(seq, it->it_index++)); + op(_ITER_NEXT_TUPLE, (iter, null_or_index -- iter, null_or_index, next)) { + PyObject *tuple_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(tuple_o) == &PyTuple_Type); + uintptr_t i = PyStackRef_UntagInt(null_or_index); + assert((size_t)i < (size_t)PyTuple_GET_SIZE(tuple_o)); + next = PyStackRef_FromPyObjectNew(PyTuple_GET_ITEM(tuple_o, i)); + null_or_index = PyStackRef_IncrementTaggedIntNoOverflow(null_or_index); } macro(FOR_ITER_TUPLE) = @@ -3403,7 +3294,7 @@ dummy_func( _ITER_JUMP_TUPLE + _ITER_NEXT_TUPLE; - op(_ITER_CHECK_RANGE, (iter -- iter)) { + op(_ITER_CHECK_RANGE, (iter, null_or_index -- iter, null_or_index)) { _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); EXIT_IF(Py_TYPE(r) != &PyRangeIter_Type); #ifdef Py_GIL_DISABLED @@ -3411,7 +3302,7 @@ dummy_func( #endif } - replaced op(_ITER_JUMP_RANGE, (iter -- iter)) { + replaced op(_ITER_JUMP_RANGE, (iter, null_or_index -- iter, null_or_index)) { _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); assert(Py_TYPE(r) == &PyRangeIter_Type); #ifdef Py_GIL_DISABLED @@ -3426,13 +3317,13 @@ dummy_func( } // Only used by Tier 2 - op(_GUARD_NOT_EXHAUSTED_RANGE, (iter -- iter)) { + op(_GUARD_NOT_EXHAUSTED_RANGE, (iter, null_or_index -- iter, null_or_index)) { _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); assert(Py_TYPE(r) == &PyRangeIter_Type); EXIT_IF(r->len <= 0); } - op(_ITER_NEXT_RANGE, (iter -- iter, next)) { + op(_ITER_NEXT_RANGE, (iter, null_or_index -- iter, null_or_index, next)) { _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); assert(Py_TYPE(r) == &PyRangeIter_Type); #ifdef Py_GIL_DISABLED @@ -3453,7 +3344,7 @@ dummy_func( _ITER_JUMP_RANGE + _ITER_NEXT_RANGE; - op(_FOR_ITER_GEN_FRAME, (iter -- iter, gen_frame: _PyInterpreterFrame*)) { + op(_FOR_ITER_GEN_FRAME, (iter, null -- iter, null, gen_frame: _PyInterpreterFrame*)) { PyGenObject *gen = (PyGenObject *)PyStackRef_AsPyObjectBorrow(iter); DEOPT_IF(Py_TYPE(gen) != &PyGen_Type); #ifdef Py_GIL_DISABLED @@ -4041,6 +3932,11 @@ dummy_func( DEOPT_IF(!PyStackRef_IsNull(null)); } + op(_GUARD_NOS_NOT_NULL, (nos, unused -- nos, unused)) { + PyObject *o = PyStackRef_AsPyObjectBorrow(nos); + EXIT_IF(o == NULL); + } + op(_GUARD_THIRD_NULL, (null, unused, unused -- null, unused, unused)) { 
DEOPT_IF(!PyStackRef_IsNull(null)); } @@ -4394,16 +4290,26 @@ dummy_func( _GUARD_CALLABLE_ISINSTANCE + _CALL_ISINSTANCE; + macro(CALL_LIST_APPEND) = + unused/1 + + unused/2 + + _GUARD_CALLABLE_LIST_APPEND + + _GUARD_NOS_NOT_NULL + + _GUARD_NOS_LIST + + _CALL_LIST_APPEND; + + op(_GUARD_CALLABLE_LIST_APPEND, (callable, unused, unused -- callable, unused, unused)){ + PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); + PyInterpreterState *interp = tstate->interp; + DEOPT_IF(callable_o != interp->callable_cache.list_append); + } + // This is secretly a super-instruction - inst(CALL_LIST_APPEND, (unused/1, unused/2, callable, self, arg -- )) { + op(_CALL_LIST_APPEND, (callable, self, arg -- )) { assert(oparg == 1); - PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); PyObject *self_o = PyStackRef_AsPyObjectBorrow(self); - PyInterpreterState *interp = tstate->interp; - DEOPT_IF(callable_o != interp->callable_cache.list_append); - DEOPT_IF(self_o == NULL); - DEOPT_IF(!PyList_Check(self_o)); + DEOPT_IF(!PyList_CheckExact(self_o)); DEOPT_IF(!LOCK_OBJECT(self_o)); STAT_INC(CALL, hit); int err = _PyList_AppendTakeRef((PyListObject *)self_o, PyStackRef_AsPyObjectSteal(arg)); @@ -5313,18 +5219,75 @@ dummy_func( } tier2 pure op(_LOAD_CONST_INLINE_BORROW, (ptr/4 -- value)) { - value = PyStackRef_FromPyObjectImmortal(ptr); + value = PyStackRef_FromPyObjectBorrow(ptr); } - tier2 pure op (_POP_TOP_LOAD_CONST_INLINE_BORROW, (ptr/4, pop -- value)) { + tier2 op(_POP_CALL, (callable, null --)) { + (void)null; // Silence compiler warnings about unused variables + DEAD(null); + PyStackRef_CLOSE(callable); + } + + tier2 op(_POP_CALL_ONE, (callable, null, pop --)) { PyStackRef_CLOSE(pop); - value = PyStackRef_FromPyObjectImmortal(ptr); + (void)null; // Silence compiler warnings about unused variables + DEAD(null); + PyStackRef_CLOSE(callable); } - tier2 pure op(_POP_TWO_LOAD_CONST_INLINE_BORROW, (ptr/4, pop1, pop2 -- value)) { + tier2 op(_POP_CALL_TWO, (callable, null, pop1, pop2 --)) { PyStackRef_CLOSE(pop2); PyStackRef_CLOSE(pop1); - value = PyStackRef_FromPyObjectImmortal(ptr); + (void)null; // Silence compiler warnings about unused variables + DEAD(null); + PyStackRef_CLOSE(callable); + } + + tier2 op(_POP_TOP_LOAD_CONST_INLINE_BORROW, (ptr/4, pop -- value)) { + PyStackRef_CLOSE(pop); + value = PyStackRef_FromPyObjectBorrow(ptr); + } + + tier2 op(_POP_TWO_LOAD_CONST_INLINE_BORROW, (ptr/4, pop1, pop2 -- value)) { + PyStackRef_CLOSE(pop2); + PyStackRef_CLOSE(pop1); + value = PyStackRef_FromPyObjectBorrow(ptr); + } + + tier2 op(_POP_CALL_LOAD_CONST_INLINE_BORROW, (ptr/4, callable, null -- value)) { + (void)null; // Silence compiler warnings about unused variables + DEAD(null); + PyStackRef_CLOSE(callable); + value = PyStackRef_FromPyObjectBorrow(ptr); + } + + tier2 op(_POP_CALL_ONE_LOAD_CONST_INLINE_BORROW, (ptr/4, callable, null, pop -- value)) { + PyStackRef_CLOSE(pop); + (void)null; // Silence compiler warnings about unused variables + DEAD(null); + PyStackRef_CLOSE(callable); + value = PyStackRef_FromPyObjectBorrow(ptr); + } + + tier2 op(_POP_CALL_TWO_LOAD_CONST_INLINE_BORROW, (ptr/4, callable, null, pop1, pop2 -- value)) { + PyStackRef_CLOSE(pop2); + PyStackRef_CLOSE(pop1); + (void)null; // Silence compiler warnings about unused variables + DEAD(null); + PyStackRef_CLOSE(callable); + value = PyStackRef_FromPyObjectBorrow(ptr); + } + + tier2 op(_LOAD_CONST_UNDER_INLINE, (ptr/4, old -- value, new)) { + new = old; + DEAD(old); + value = PyStackRef_FromPyObjectNew(ptr); + } + + tier2 
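[Note on the CALL_LIST_APPEND hunk above] The instruction is decomposed into reusable guard micro-ops (_GUARD_CALLABLE_LIST_APPEND, _GUARD_NOS_NOT_NULL, _GUARD_NOS_LIST) followed by the append itself, and the receiver check tightens from PyList_Check() to PyList_CheckExact(). A hedged public-API analogue of the guarded fast path; the -2 return is a stand-in for deoptimizing to a generic call, and demo_list_append_fastpath is hypothetical:

#include <Python.h>

static int
demo_list_append_fastpath(PyObject *self, PyObject *arg)
{
    if (self == NULL || !PyList_CheckExact(self)) {
        return -2;   /* "deopt": not an exact list, take the generic path */
    }
    return PyList_Append(self, arg);   /* 0 on success, -1 with error set */
}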
op(_LOAD_CONST_UNDER_INLINE_BORROW, (ptr/4, old -- value, new)) { + new = old; + DEAD(old); + value = PyStackRef_FromPyObjectBorrow(ptr); } tier2 op(_CHECK_FUNCTION, (func_version/2 -- )) { diff --git a/Python/ceval.c b/Python/ceval.c index 490b653f132..4cfe4bb88f4 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -139,6 +139,19 @@ #endif +static void +check_invalid_reentrancy(void) +{ +#if defined(Py_DEBUG) && defined(Py_GIL_DISABLED) + // In the free-threaded build, the interpreter must not be re-entered if + // the world-is-stopped. If so, that's a bug somewhere (quite likely in + // the painfully complex typeobject code). + PyInterpreterState *interp = _PyInterpreterState_GET(); + assert(!interp->stoptheworld.world_stopped); +#endif +} + + #ifdef Py_DEBUG static void dump_item(_PyStackRef item) @@ -360,9 +373,6 @@ _Py_EnterRecursiveCallUnchecked(PyThreadState *tstate) # define Py_C_STACK_SIZE 1200000 #elif defined(__sparc__) # define Py_C_STACK_SIZE 1600000 -#elif defined(__wasi__) - /* Web assembly has two stacks, so this isn't really the stack depth */ -# define Py_C_STACK_SIZE 131072 // wasi-libc DEFAULT_STACK_SIZE #elif defined(__hppa__) || defined(__powerpc64__) # define Py_C_STACK_SIZE 2000000 #else @@ -999,6 +1009,7 @@ PyObject* _Py_HOT_FUNCTION DONT_SLP_VECTORIZE _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) { _Py_EnsureTstateNotNULL(tstate); + check_invalid_reentrancy(); CALL_STAT_INC(pyeval_calls); #if USE_COMPUTED_GOTOS && !Py_TAIL_CALL_INTERP @@ -2968,7 +2979,7 @@ _PyEval_ImportFrom(PyThreadState *tstate, PyObject *v, PyObject *name) int is_possibly_shadowing_stdlib = 0; if (is_possibly_shadowing) { PyObject *stdlib_modules; - if (_PySys_GetOptionalAttrString("stdlib_module_names", &stdlib_modules) < 0) { + if (PySys_GetOptionalAttrString("stdlib_module_names", &stdlib_modules) < 0) { goto done; } if (stdlib_modules && PyAnySet_Check(stdlib_modules)) { @@ -3179,7 +3190,7 @@ _PyEval_FormatKwargsError(PyThreadState *tstate, PyObject *func, PyObject *kwarg else if (_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) { PyObject *exc = _PyErr_GetRaisedException(tstate); PyObject *args = PyException_GetArgs(exc); - if (exc && PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1) { + if (PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1) { _PyErr_Clear(tstate); PyObject *funcstr = _PyObject_FunctionStr(func); if (funcstr != NULL) { @@ -3428,6 +3439,50 @@ _PyEval_LoadName(PyThreadState *tstate, _PyInterpreterFrame *frame, PyObject *na return value; } +static _PyStackRef +foriter_next(PyObject *seq, _PyStackRef index) +{ + assert(PyStackRef_IsTaggedInt(index)); + assert(PyTuple_CheckExact(seq) || PyList_CheckExact(seq)); + intptr_t i = PyStackRef_UntagInt(index); + if (PyTuple_CheckExact(seq)) { + size_t size = PyTuple_GET_SIZE(seq); + if ((size_t)i >= size) { + return PyStackRef_NULL; + } + return PyStackRef_FromPyObjectNew(PyTuple_GET_ITEM(seq, i)); + } + PyObject *item = _PyList_GetItemRef((PyListObject *)seq, i); + if (item == NULL) { + return PyStackRef_NULL; + } + return PyStackRef_FromPyObjectSteal(item); +} + +_PyStackRef _PyForIter_VirtualIteratorNext(PyThreadState* tstate, _PyInterpreterFrame* frame, _PyStackRef iter, _PyStackRef* index_ptr) +{ + PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); + _PyStackRef index = *index_ptr; + if (PyStackRef_IsTaggedInt(index)) { + *index_ptr = PyStackRef_IncrementTaggedIntNoOverflow(index); + return foriter_next(iter_o, index); + } + PyObject *next_o = 
(*Py_TYPE(iter_o)->tp_iternext)(iter_o); + if (next_o == NULL) { + if (_PyErr_Occurred(tstate)) { + if (_PyErr_ExceptionMatches(tstate, PyExc_StopIteration)) { + _PyEval_MonitorRaise(tstate, frame, frame->instr_ptr); + _PyErr_Clear(tstate); + } + else { + return PyStackRef_ERROR; + } + } + return PyStackRef_NULL; + } + return PyStackRef_FromPyObjectSteal(next_o); +} + /* Check if a 'cls' provides the given special method. */ static inline int type_has_special_method(PyTypeObject *cls, PyObject *name) diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c index 5b5018a6373..6d2383ac7c1 100644 --- a/Python/ceval_gil.c +++ b/Python/ceval_gil.c @@ -1218,30 +1218,30 @@ static inline int run_remote_debugger_source(PyObject *source) // Note that this function is inline to avoid creating a PLT entry // that would be an easy target for a ROP gadget. -static inline void run_remote_debugger_script(const char *path) +static inline void run_remote_debugger_script(PyObject *path) { - if (0 != PySys_Audit("remote_debugger_script", "s", path)) { + if (0 != PySys_Audit("remote_debugger_script", "O", path)) { PyErr_FormatUnraisable( - "Audit hook failed for remote debugger script %s", path); + "Audit hook failed for remote debugger script %U", path); return; } // Open the debugger script with the open code hook, and reopen the // resulting file object to get a C FILE* object. - PyObject* fileobj = PyFile_OpenCode(path); + PyObject* fileobj = PyFile_OpenCodeObject(path); if (!fileobj) { - PyErr_FormatUnraisable("Can't open debugger script %s", path); + PyErr_FormatUnraisable("Can't open debugger script %U", path); return; } PyObject* source = PyObject_CallMethodNoArgs(fileobj, &_Py_ID(read)); if (!source) { - PyErr_FormatUnraisable("Error reading debugger script %s", path); + PyErr_FormatUnraisable("Error reading debugger script %U", path); } PyObject* res = PyObject_CallMethodNoArgs(fileobj, &_Py_ID(close)); if (!res) { - PyErr_FormatUnraisable("Error closing debugger script %s", path); + PyErr_FormatUnraisable("Error closing debugger script %U", path); } else { Py_DECREF(res); } @@ -1249,7 +1249,7 @@ static inline void run_remote_debugger_script(const char *path) if (source) { if (0 != run_remote_debugger_source(source)) { - PyErr_FormatUnraisable("Error executing debugger script %s", path); + PyErr_FormatUnraisable("Error executing debugger script %U", path); } Py_DECREF(source); } @@ -1278,7 +1278,14 @@ int _PyRunRemoteDebugger(PyThreadState *tstate) pathsz); path[pathsz - 1] = '\0'; if (*path) { - run_remote_debugger_script(path); + PyObject *path_obj = PyUnicode_DecodeFSDefault(path); + if (path_obj == NULL) { + PyErr_FormatUnraisable("Can't decode debugger script"); + } + else { + run_remote_debugger_script(path_obj); + Py_DECREF(path_obj); + } } PyMem_Free(path); } diff --git a/Python/codegen.c b/Python/codegen.c index 683601103ec..0023d72cd5e 100644 --- a/Python/codegen.c +++ b/Python/codegen.c @@ -527,6 +527,15 @@ codegen_unwind_fblock(compiler *c, location *ploc, case COMPILE_FBLOCK_FOR_LOOP: /* Pop the iterator */ if (preserve_tos) { + ADDOP_I(c, *ploc, SWAP, 3); + } + ADDOP(c, *ploc, POP_TOP); + ADDOP(c, *ploc, POP_TOP); + return SUCCESS; + + case COMPILE_FBLOCK_ASYNC_FOR_LOOP: + /* Pop the iterator */ + if (preserve_tos) { ADDOP_I(c, *ploc, SWAP, 2); } ADDOP(c, *ploc, POP_TOP); @@ -629,7 +638,8 @@ codegen_unwind_fblock_stack(compiler *c, location *ploc, c, *ploc, "'break', 'continue' and 'return' cannot appear in an except* block"); } if (loop != NULL && (top->fb_type == COMPILE_FBLOCK_WHILE_LOOP 
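[Note on the ceval_gil.c hunks above] run_remote_debugger_script() now receives the script path as a PyObject* decoded with the filesystem encoding, so the audit hook and all error messages use %U instead of round-tripping a raw char*. A minimal sketch of the decode-then-open pattern using only public API; open_script_demo is hypothetical:

#include <Python.h>

static void
open_script_demo(const char *raw_path)
{
    PyObject *path = PyUnicode_DecodeFSDefault(raw_path);
    if (path == NULL) {
        PyErr_FormatUnraisable("Can't decode debugger script");
        return;
    }
    /* PyFile_OpenCodeObject() honors the open-code audit hook. */
    PyObject *fileobj = PyFile_OpenCodeObject(path);
    if (fileobj == NULL) {
        PyErr_FormatUnraisable("Can't open debugger script %U", path);
    }
    else {
        Py_DECREF(fileobj);
    }
    Py_DECREF(path);
}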
|| - top->fb_type == COMPILE_FBLOCK_FOR_LOOP)) { + top->fb_type == COMPILE_FBLOCK_FOR_LOOP || + top->fb_type == COMPILE_FBLOCK_ASYNC_FOR_LOOP)) { *loop = top; return SUCCESS; } @@ -2125,7 +2135,7 @@ codegen_async_for(compiler *c, stmt_ty s) ADDOP(c, LOC(s->v.AsyncFor.iter), GET_AITER); USE_LABEL(c, start); - RETURN_IF_ERROR(_PyCompile_PushFBlock(c, loc, COMPILE_FBLOCK_FOR_LOOP, start, end, NULL)); + RETURN_IF_ERROR(_PyCompile_PushFBlock(c, loc, COMPILE_FBLOCK_ASYNC_FOR_LOOP, start, end, NULL)); /* SETUP_FINALLY to guard the __anext__ call */ ADDOP_JUMP(c, loc, SETUP_FINALLY, except); @@ -2142,7 +2152,7 @@ codegen_async_for(compiler *c, stmt_ty s) /* Mark jump as artificial */ ADDOP_JUMP(c, NO_LOCATION, JUMP, start); - _PyCompile_PopFBlock(c, COMPILE_FBLOCK_FOR_LOOP, start); + _PyCompile_PopFBlock(c, COMPILE_FBLOCK_ASYNC_FOR_LOOP, start); /* Except block for __anext__ */ USE_LABEL(c, except); @@ -3895,10 +3905,11 @@ maybe_optimize_function_call(compiler *c, expr_ty e, jump_target_label end) NEW_JUMP_TARGET_LABEL(c, loop); NEW_JUMP_TARGET_LABEL(c, cleanup); + ADDOP(c, loc, PUSH_NULL); // Push NULL index for loop USE_LABEL(c, loop); ADDOP_JUMP(c, loc, FOR_ITER, cleanup); if (const_oparg == CONSTANT_BUILTIN_TUPLE) { - ADDOP_I(c, loc, LIST_APPEND, 2); + ADDOP_I(c, loc, LIST_APPEND, 3); ADDOP_JUMP(c, loc, JUMP, loop); } else { @@ -4442,13 +4453,12 @@ codegen_sync_comprehension_generator(compiler *c, location loc, } if (IS_JUMP_TARGET_LABEL(start)) { VISIT(c, expr, gen->iter); - ADDOP(c, LOC(gen->iter), GET_ITER); } } } if (IS_JUMP_TARGET_LABEL(start)) { - depth++; + depth += 2; ADDOP(c, LOC(gen->iter), GET_ITER); USE_LABEL(c, start); ADDOP_JUMP(c, LOC(gen->iter), FOR_ITER, anchor); @@ -4543,9 +4553,9 @@ codegen_async_comprehension_generator(compiler *c, location loc, else { /* Sub-iter - calculate on the fly */ VISIT(c, expr, gen->iter); - ADDOP(c, LOC(gen->iter), GET_AITER); } } + ADDOP(c, LOC(gen->iter), GET_AITER); USE_LABEL(c, start); /* Runtime will push a block here, so we need to account for that */ @@ -4757,19 +4767,6 @@ pop_inlined_comprehension_state(compiler *c, location loc, return SUCCESS; } -static inline int -codegen_comprehension_iter(compiler *c, comprehension_ty comp) -{ - VISIT(c, expr, comp->iter); - if (comp->is_async) { - ADDOP(c, LOC(comp->iter), GET_AITER); - } - else { - ADDOP(c, LOC(comp->iter), GET_ITER); - } - return SUCCESS; -} - static int codegen_comprehension(compiler *c, expr_ty e, int type, identifier name, asdl_comprehension_seq *generators, expr_ty elt, @@ -4789,9 +4786,7 @@ codegen_comprehension(compiler *c, expr_ty e, int type, outermost = (comprehension_ty) asdl_seq_GET(generators, 0); if (is_inlined) { - if (codegen_comprehension_iter(c, outermost)) { - goto error; - } + VISIT(c, expr, outermost->iter); if (push_inlined_comprehension_state(c, loc, entry, &inline_state)) { goto error; } diff --git a/Python/context.c b/Python/context.c index dceaae9b429..9927cab915c 100644 --- a/Python/context.c +++ b/Python/context.c @@ -979,7 +979,7 @@ contextvar_tp_repr(PyObject *op) return NULL; } - if (PyUnicodeWriter_WriteUTF8(writer, "<ContextVar name=", 17) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "<ContextVar name=", 17) < 0) { goto error; } if (PyUnicodeWriter_WriteRepr(writer, self->var_name) < 0) { @@ -987,7 +987,7 @@ contextvar_tp_repr(PyObject *op) } if (self->var_default != NULL) { - if (PyUnicodeWriter_WriteUTF8(writer, " default=", 9) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, " default=", 9) < 0) { goto error; } if 
(PyUnicodeWriter_WriteRepr(writer, self->var_default) < 0) { @@ -1182,15 +1182,15 @@ token_tp_repr(PyObject *op) if (writer == NULL) { return NULL; } - if (PyUnicodeWriter_WriteUTF8(writer, "<Token", 6) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "<Token", 6) < 0) { goto error; } if (self->tok_used) { - if (PyUnicodeWriter_WriteUTF8(writer, " used", 5) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, " used", 5) < 0) { goto error; } } - if (PyUnicodeWriter_WriteUTF8(writer, " var=", 5) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, " var=", 5) < 0) { goto error; } if (PyUnicodeWriter_WriteRepr(writer, (PyObject *)self->tok_var) < 0) { diff --git a/Python/crossinterp.c b/Python/crossinterp.c index 7d7e6551c3f..5e73ab28f2b 100644 --- a/Python/crossinterp.c +++ b/Python/crossinterp.c @@ -10,6 +10,7 @@ #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_namespace.h" // _PyNamespace_New() #include "pycore_pythonrun.h" // _Py_SourceAsString() +#include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_typeobject.h" // _PyStaticType_InitBuiltin() @@ -69,6 +70,17 @@ runpy_run_path(const char *filename, const char *modname) } +static void +set_exc_with_cause(PyObject *exctype, const char *msg) +{ + PyObject *cause = PyErr_GetRaisedException(); + PyErr_SetString(exctype, msg); + PyObject *exc = PyErr_GetRaisedException(); + PyException_SetCause(exc, cause); + PyErr_SetRaisedException(exc); +} + + static PyObject * pyerr_get_message(PyObject *exc) { @@ -209,16 +221,16 @@ _Py_CallInInterpreterAndRawFree(PyInterpreterState *interp, /* cross-interpreter data */ /**************************/ -/* registry of {type -> xidatafunc} */ +/* registry of {type -> _PyXIData_getdata_t} */ -/* For now we use a global registry of shareable classes. An - alternative would be to add a tp_* slot for a class's - xidatafunc. It would be simpler and more efficient. */ +/* For now we use a global registry of shareable classes. + An alternative would be to add a tp_* slot for a class's + _PyXIData_getdata_t. It would be simpler and more efficient. 
*/ static void xid_lookup_init(_PyXIData_lookup_t *); static void xid_lookup_fini(_PyXIData_lookup_t *); struct _dlcontext; -static xidatafunc lookup_getdata(struct _dlcontext *, PyObject *); +static _PyXIData_getdata_t lookup_getdata(struct _dlcontext *, PyObject *); #include "crossinterp_data_lookup.h" @@ -342,7 +354,7 @@ _set_xid_lookup_failure(PyThreadState *tstate, PyObject *obj, const char *msg, set_notshareableerror(tstate, cause, 0, msg); } else { - msg = "%S does not support cross-interpreter data"; + msg = "%R does not support cross-interpreter data"; format_notshareableerror(tstate, cause, 0, msg, obj); } } @@ -355,8 +367,8 @@ _PyObject_CheckXIData(PyThreadState *tstate, PyObject *obj) if (get_lookup_context(tstate, &ctx) < 0) { return -1; } - xidatafunc getdata = lookup_getdata(&ctx, obj); - if (getdata == NULL) { + _PyXIData_getdata_t getdata = lookup_getdata(&ctx, obj); + if (getdata.basic == NULL && getdata.fallback == NULL) { if (!_PyErr_Occurred(tstate)) { _set_xid_lookup_failure(tstate, obj, NULL, NULL); } @@ -387,9 +399,9 @@ _check_xidata(PyThreadState *tstate, _PyXIData_t *xidata) return 0; } -int -_PyObject_GetXIData(PyThreadState *tstate, - PyObject *obj, _PyXIData_t *xidata) +static int +_get_xidata(PyThreadState *tstate, + PyObject *obj, xidata_fallback_t fallback, _PyXIData_t *xidata) { PyInterpreterState *interp = tstate->interp; @@ -397,6 +409,7 @@ _PyObject_GetXIData(PyThreadState *tstate, assert(xidata->obj == NULL); if (xidata->data != NULL || xidata->obj != NULL) { _PyErr_SetString(tstate, PyExc_ValueError, "xidata not cleared"); + return -1; } // Call the "getdata" func for the object. @@ -405,8 +418,8 @@ _PyObject_GetXIData(PyThreadState *tstate, return -1; } Py_INCREF(obj); - xidatafunc getdata = lookup_getdata(&ctx, obj); - if (getdata == NULL) { + _PyXIData_getdata_t getdata = lookup_getdata(&ctx, obj); + if (getdata.basic == NULL && getdata.fallback == NULL) { if (PyErr_Occurred()) { Py_DECREF(obj); return -1; @@ -418,7 +431,9 @@ _PyObject_GetXIData(PyThreadState *tstate, } return -1; } - int res = getdata(tstate, obj, xidata); + int res = getdata.basic != NULL + ? getdata.basic(tstate, obj, xidata) + : getdata.fallback(tstate, obj, fallback, xidata); Py_DECREF(obj); if (res != 0) { PyObject *cause = _PyErr_GetRaisedException(tstate); @@ -438,6 +453,51 @@ _PyObject_GetXIData(PyThreadState *tstate, return 0; } +int +_PyObject_GetXIDataNoFallback(PyThreadState *tstate, + PyObject *obj, _PyXIData_t *xidata) +{ + return _get_xidata(tstate, obj, _PyXIDATA_XIDATA_ONLY, xidata); +} + +int +_PyObject_GetXIData(PyThreadState *tstate, + PyObject *obj, xidata_fallback_t fallback, + _PyXIData_t *xidata) +{ + switch (fallback) { + case _PyXIDATA_XIDATA_ONLY: + return _get_xidata(tstate, obj, fallback, xidata); + case _PyXIDATA_FULL_FALLBACK: + if (_get_xidata(tstate, obj, fallback, xidata) == 0) { + return 0; + } + PyObject *exc = _PyErr_GetRaisedException(tstate); + if (PyFunction_Check(obj)) { + if (_PyFunction_GetXIData(tstate, obj, xidata) == 0) { + Py_DECREF(exc); + return 0; + } + _PyErr_Clear(tstate); + } + // We could try _PyMarshal_GetXIData() but we won't for now. + if (_PyPickle_GetXIData(tstate, obj, xidata) == 0) { + Py_DECREF(exc); + return 0; + } + // Raise the original exception. 
+ _PyErr_SetRaisedException(tstate, exc); + return -1; + default: +#ifdef Py_DEBUG + Py_FatalError("unsupported xidata fallback option"); +#endif + _PyErr_SetString(tstate, PyExc_SystemError, + "unsupported xidata fallback option"); + return -1; + } +} + /* pickle C-API */ @@ -859,8 +919,15 @@ get_script_xidata(PyThreadState *tstate, PyObject *obj, int pure, } goto error; } +#ifdef Py_GIL_DISABLED + // Don't immortalize code constants to avoid memory leaks. + ((_PyThreadStateImpl *)tstate)->suppress_co_const_immortalization++; +#endif code = Py_CompileStringExFlags( script, filename, Py_file_input, &cf, optimize); +#ifdef Py_GIL_DISABLED + ((_PyThreadStateImpl *)tstate)->suppress_co_const_immortalization--; +#endif Py_XDECREF(ref); if (code == NULL) { goto error; @@ -1258,7 +1325,7 @@ _excinfo_normalize_type(struct _excinfo_type *info, } static void -_PyXI_excinfo_Clear(_PyXI_excinfo *info) +_PyXI_excinfo_clear(_PyXI_excinfo *info) { _excinfo_clear_type(&info->type); if (info->msg != NULL) { @@ -1308,7 +1375,7 @@ _PyXI_excinfo_InitFromException(_PyXI_excinfo *info, PyObject *exc) assert(exc != NULL); if (PyErr_GivenExceptionMatches(exc, PyExc_MemoryError)) { - _PyXI_excinfo_Clear(info); + _PyXI_excinfo_clear(info); return NULL; } const char *failure = NULL; @@ -1354,7 +1421,7 @@ _PyXI_excinfo_InitFromException(_PyXI_excinfo *info, PyObject *exc) error: assert(failure != NULL); - _PyXI_excinfo_Clear(info); + _PyXI_excinfo_clear(info); return failure; } @@ -1405,7 +1472,7 @@ _PyXI_excinfo_InitFromObject(_PyXI_excinfo *info, PyObject *obj) error: assert(failure != NULL); - _PyXI_excinfo_Clear(info); + _PyXI_excinfo_clear(info); return failure; } @@ -1600,7 +1667,7 @@ _PyXI_ExcInfoAsObject(_PyXI_excinfo *info) void _PyXI_ClearExcInfo(_PyXI_excinfo *info) { - _PyXI_excinfo_Clear(info); + _PyXI_excinfo_clear(info); } @@ -1616,14 +1683,9 @@ _PyXI_ApplyErrorCode(_PyXI_errcode code, PyInterpreterState *interp) PyThreadState *tstate = _PyThreadState_GET(); assert(!PyErr_Occurred()); + assert(code != _PyXI_ERR_NO_ERROR); + assert(code != _PyXI_ERR_UNCAUGHT_EXCEPTION); switch (code) { - case _PyXI_ERR_NO_ERROR: _Py_FALLTHROUGH; - case _PyXI_ERR_UNCAUGHT_EXCEPTION: - // There is nothing to apply. -#ifdef Py_DEBUG - Py_UNREACHABLE(); -#endif - return 0; case _PyXI_ERR_OTHER: // XXX msg? 
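[Note on the xidata fallback above] _PyObject_GetXIData() gains a cascade: try the registered getdata func, and on failure stash the exception, attempt the function- and pickle-based fallbacks, then re-raise the original error only if every fallback also failed. A hedged sketch of that stash-and-restore pattern with the public exception API; convert_with_fallback_demo and its callback parameters are hypothetical:

#include <Python.h>

static int
convert_with_fallback_demo(PyObject *obj,
                           int (*primary)(PyObject *),
                           int (*fallback)(PyObject *))
{
    if (primary(obj) == 0) {
        return 0;
    }
    /* Stash the first failure; this also clears the error indicator,
     * so the fallback runs with a clean state. */
    PyObject *exc = PyErr_GetRaisedException();
    if (fallback(obj) == 0) {
        Py_DECREF(exc);
        return 0;
    }
    /* Everything failed: restore the original, more informative error. */
    PyErr_SetRaisedException(exc);
    return -1;
}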
PyErr_SetNone(PyExc_InterpreterError); @@ -1643,12 +1705,20 @@ _PyXI_ApplyErrorCode(_PyXI_errcode code, PyInterpreterState *interp) PyErr_SetString(PyExc_InterpreterError, "failed to apply namespace to __main__"); break; + case _PyXI_ERR_PRESERVE_FAILURE: + PyErr_SetString(PyExc_InterpreterError, + "failed to preserve objects across session"); + break; + case _PyXI_ERR_EXC_PROPAGATION_FAILURE: + PyErr_SetString(PyExc_InterpreterError, + "failed to transfer exception between interpreters"); + break; case _PyXI_ERR_NOT_SHAREABLE: _set_xid_lookup_failure(tstate, NULL, NULL, NULL); break; default: #ifdef Py_DEBUG - Py_UNREACHABLE(); + Py_FatalError("unsupported error code"); #else PyErr_Format(PyExc_RuntimeError, "unsupported error code %d", code); #endif @@ -1692,7 +1762,7 @@ _PyXI_InitError(_PyXI_error *error, PyObject *excobj, _PyXI_errcode code) assert(excobj == NULL); assert(code != _PyXI_ERR_NO_ERROR); error->code = code; - _PyXI_excinfo_Clear(&error->uncaught); + _PyXI_excinfo_clear(&error->uncaught); } return failure; } @@ -1702,7 +1772,7 @@ _PyXI_ApplyError(_PyXI_error *error) { PyThreadState *tstate = PyThreadState_Get(); if (error->code == _PyXI_ERR_UNCAUGHT_EXCEPTION) { - // Raise an exception that proxies the propagated exception. + // We will raise an exception that proxies the propagated exception. return _PyXI_excinfo_AsObject(&error->uncaught); } else if (error->code == _PyXI_ERR_NOT_SHAREABLE) { @@ -1751,6 +1821,7 @@ typedef struct _sharednsitem { // in a different interpreter to release the XI data. } _PyXI_namespace_item; +#ifndef NDEBUG static int _sharednsitem_is_initialized(_PyXI_namespace_item *item) { @@ -1759,6 +1830,7 @@ _sharednsitem_is_initialized(_PyXI_namespace_item *item) } return 0; } +#endif static int _sharednsitem_init(_PyXI_namespace_item *item, PyObject *key) @@ -1786,7 +1858,8 @@ _sharednsitem_has_value(_PyXI_namespace_item *item, int64_t *p_interpid) } static int -_sharednsitem_set_value(_PyXI_namespace_item *item, PyObject *value) +_sharednsitem_set_value(_PyXI_namespace_item *item, PyObject *value, + xidata_fallback_t fallback) { assert(_sharednsitem_is_initialized(item)); assert(item->xidata == NULL); @@ -1795,7 +1868,7 @@ _sharednsitem_set_value(_PyXI_namespace_item *item, PyObject *value) return -1; } PyThreadState *tstate = PyThreadState_Get(); - if (_PyObject_GetXIData(tstate, value, item->xidata) != 0) { + if (_PyObject_GetXIData(tstate, value, fallback, item->xidata) < 0) { PyMem_RawFree(item->xidata); item->xidata = NULL; // The caller may want to propagate PyExc_NotShareableError @@ -1827,7 +1900,8 @@ _sharednsitem_clear(_PyXI_namespace_item *item) } static int -_sharednsitem_copy_from_ns(struct _sharednsitem *item, PyObject *ns) +_sharednsitem_copy_from_ns(struct _sharednsitem *item, PyObject *ns, + xidata_fallback_t fallback) { assert(item->name != NULL); assert(item->xidata == NULL); @@ -1839,7 +1913,7 @@ _sharednsitem_copy_from_ns(struct _sharednsitem *item, PyObject *ns) // When applied, this item will be set to the default (or fail). 
return 0; } - if (_sharednsitem_set_value(item, value) < 0) { + if (_sharednsitem_set_value(item, value, fallback) < 0) { return -1; } return 0; @@ -1869,156 +1943,212 @@ _sharednsitem_apply(_PyXI_namespace_item *item, PyObject *ns, PyObject *dflt) return res; } -struct _sharedns { - Py_ssize_t len; - _PyXI_namespace_item *items; -}; -static _PyXI_namespace * -_sharedns_new(void) -{ - _PyXI_namespace *ns = PyMem_RawCalloc(sizeof(_PyXI_namespace), 1); - if (ns == NULL) { - PyErr_NoMemory(); - return NULL; - } - *ns = (_PyXI_namespace){ 0 }; - return ns; -} +typedef struct { + Py_ssize_t maxitems; + Py_ssize_t numnames; + Py_ssize_t numvalues; + _PyXI_namespace_item items[1]; +} _PyXI_namespace; +#ifndef NDEBUG static int -_sharedns_is_initialized(_PyXI_namespace *ns) +_sharedns_check_counts(_PyXI_namespace *ns) { - if (ns->len == 0) { - assert(ns->items == NULL); + if (ns->maxitems <= 0) { + return 0; + } + if (ns->numnames < 0) { + return 0; + } + if (ns->numnames > ns->maxitems) { + return 0; + } + if (ns->numvalues < 0) { + return 0; + } + if (ns->numvalues > ns->numnames) { return 0; } - - assert(ns->len > 0); - assert(ns->items != NULL); - assert(_sharednsitem_is_initialized(&ns->items[0])); - assert(ns->len == 1 - || _sharednsitem_is_initialized(&ns->items[ns->len - 1])); return 1; } -#define HAS_COMPLETE_DATA 1 -#define HAS_PARTIAL_DATA 2 - static int -_sharedns_has_xidata(_PyXI_namespace *ns, int64_t *p_interpid) +_sharedns_check_consistency(_PyXI_namespace *ns) { - // We expect _PyXI_namespace to always be initialized. - assert(_sharedns_is_initialized(ns)); - int res = 0; - _PyXI_namespace_item *item0 = &ns->items[0]; - if (!_sharednsitem_is_initialized(item0)) { + if (!_sharedns_check_counts(ns)) { return 0; } - int64_t interpid0 = -1; - if (!_sharednsitem_has_value(item0, &interpid0)) { - return 0; + + Py_ssize_t i = 0; + _PyXI_namespace_item *item; + if (ns->numvalues > 0) { + item = &ns->items[0]; + if (!_sharednsitem_is_initialized(item)) { + return 0; + } + int64_t interpid0 = -1; + if (!_sharednsitem_has_value(item, &interpid0)) { + return 0; + } + i += 1; + for (; i < ns->numvalues; i++) { + item = &ns->items[i]; + if (!_sharednsitem_is_initialized(item)) { + return 0; + } + int64_t interpid = -1; + if (!_sharednsitem_has_value(item, &interpid)) { + return 0; + } + if (interpid != interpid0) { + return 0; + } + } } - if (ns->len > 1) { - // At this point we know it is has at least partial data. 
- _PyXI_namespace_item *itemN = &ns->items[ns->len-1]; - if (!_sharednsitem_is_initialized(itemN)) { - res = HAS_PARTIAL_DATA; - goto finally; + for (; i < ns->numnames; i++) { + item = &ns->items[i]; + if (!_sharednsitem_is_initialized(item)) { + return 0; } - int64_t interpidN = -1; - if (!_sharednsitem_has_value(itemN, &interpidN)) { - res = HAS_PARTIAL_DATA; - goto finally; + if (_sharednsitem_has_value(item, NULL)) { + return 0; } - assert(interpidN == interpid0); } - res = HAS_COMPLETE_DATA; - *p_interpid = interpid0; - -finally: - return res; + for (; i < ns->maxitems; i++) { + item = &ns->items[i]; + if (_sharednsitem_is_initialized(item)) { + return 0; + } + if (_sharednsitem_has_value(item, NULL)) { + return 0; + } + } + return 1; } +#endif -static void -_sharedns_clear(_PyXI_namespace *ns) +static _PyXI_namespace * +_sharedns_alloc(Py_ssize_t maxitems) { - if (!_sharedns_is_initialized(ns)) { - return; + if (maxitems < 0) { + if (!PyErr_Occurred()) { + PyErr_BadInternalCall(); + } + return NULL; + } + else if (maxitems == 0) { + PyErr_SetString(PyExc_ValueError, "empty namespaces not allowed"); + return NULL; } - // If the cross-interpreter data were allocated as part of - // _PyXI_namespace_item (instead of dynamically), this is where - // we would need verify that we are clearing the items in the - // correct interpreter, to avoid a race with releasing the XI data - // via a pending call. See _sharedns_has_xidata(). - for (Py_ssize_t i=0; i < ns->len; i++) { - _sharednsitem_clear(&ns->items[i]); + // Check for overflow. + size_t fixedsize = sizeof(_PyXI_namespace) - sizeof(_PyXI_namespace_item); + if ((size_t)maxitems > + ((size_t)PY_SSIZE_T_MAX - fixedsize) / sizeof(_PyXI_namespace_item)) + { + PyErr_NoMemory(); + return NULL; + } + + // Allocate the value, including items. + size_t size = fixedsize + sizeof(_PyXI_namespace_item) * maxitems; + + _PyXI_namespace *ns = PyMem_RawCalloc(size, 1); + if (ns == NULL) { + PyErr_NoMemory(); + return NULL; } - PyMem_RawFree(ns->items); - ns->items = NULL; - ns->len = 0; + ns->maxitems = maxitems; + assert(_sharedns_check_consistency(ns)); + return ns; } static void _sharedns_free(_PyXI_namespace *ns) { - _sharedns_clear(ns); + // If we weren't always dynamically allocating the cross-interpreter + // data in each item then we would need to use a pending call + // to call _sharedns_free(), to avoid the race between freeing + // the shared namespace and releasing the XI data. + assert(_sharedns_check_counts(ns)); + Py_ssize_t i = 0; + _PyXI_namespace_item *item; + if (ns->numvalues > 0) { + // One or more items may have interpreter-specific data. +#ifndef NDEBUG + int64_t interpid = PyInterpreterState_GetID(PyInterpreterState_Get()); + int64_t interpid_i; +#endif + for (; i < ns->numvalues; i++) { + item = &ns->items[i]; + assert(_sharednsitem_is_initialized(item)); + // While we do want to ensure consistency across items, + // technically they don't need to match the current + // interpreter. However, we keep the constraint for + // simplicity, by giving _PyXI_FreeNamespace() the exclusive + // responsibility of dealing with the owning interpreter. 
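[Note on _sharedns_alloc() above] The namespace is now a single allocation: a fixed header plus a trailing item array, sized with an explicit overflow check before the multiply. A hedged sketch of that allocation pattern with simplified types; demo_item, demo_ns, and demo_ns_alloc are hypothetical stand-ins for the _PyXI_namespace structures:

#include <Python.h>

typedef struct { PyObject *name; void *data; } demo_item;
typedef struct { Py_ssize_t maxitems; demo_item items[1]; } demo_ns;

static demo_ns *
demo_ns_alloc(Py_ssize_t maxitems)
{
    size_t fixed = sizeof(demo_ns) - sizeof(demo_item);
    if (maxitems <= 0) {
        PyErr_SetString(PyExc_ValueError, "need at least one item");
        return NULL;
    }
    /* Refuse sizes whose item array would overflow Py_ssize_t. */
    if ((size_t)maxitems
        > ((size_t)PY_SSIZE_T_MAX - fixed) / sizeof(demo_item)) {
        PyErr_NoMemory();
        return NULL;
    }
    demo_ns *ns = PyMem_RawCalloc(fixed + sizeof(demo_item) * (size_t)maxitems, 1);
    if (ns == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
    ns->maxitems = maxitems;
    return ns;
}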
+ assert(_sharednsitem_has_value(item, &interpid_i)); + assert(interpid_i == interpid); + _sharednsitem_clear(item); + } + } + for (; i < ns->numnames; i++) { + item = &ns->items[i]; + assert(_sharednsitem_is_initialized(item)); + assert(!_sharednsitem_has_value(item, NULL)); + _sharednsitem_clear(item); + } +#ifndef NDEBUG + for (; i < ns->maxitems; i++) { + item = &ns->items[i]; + assert(!_sharednsitem_is_initialized(item)); + assert(!_sharednsitem_has_value(item, NULL)); + } +#endif + PyMem_RawFree(ns); } -static int -_sharedns_init(_PyXI_namespace *ns, PyObject *names) +static _PyXI_namespace * +_create_sharedns(PyObject *names) { - assert(!_sharedns_is_initialized(ns)); assert(names != NULL); - Py_ssize_t len = PyDict_CheckExact(names) + Py_ssize_t numnames = PyDict_CheckExact(names) ? PyDict_Size(names) : PySequence_Size(names); - if (len < 0) { - return -1; - } - if (len == 0) { - PyErr_SetString(PyExc_ValueError, "empty namespaces not allowed"); - return -1; - } - assert(len > 0); - // Allocate the items. - _PyXI_namespace_item *items = - PyMem_RawCalloc(sizeof(struct _sharednsitem), len); - if (items == NULL) { - PyErr_NoMemory(); - return -1; + _PyXI_namespace *ns = _sharedns_alloc(numnames); + if (ns == NULL) { + return NULL; } + _PyXI_namespace_item *items = ns->items; // Fill in the names. - Py_ssize_t i = -1; if (PyDict_CheckExact(names)) { + Py_ssize_t i = 0; Py_ssize_t pos = 0; - for (i=0; i < len; i++) { - PyObject *key; - if (!PyDict_Next(names, &pos, &key, NULL)) { - // This should not be possible. - assert(0); - goto error; - } - if (_sharednsitem_init(&items[i], key) < 0) { + PyObject *name; + while(PyDict_Next(names, &pos, &name, NULL)) { + if (_sharednsitem_init(&items[i], name) < 0) { goto error; } + ns->numnames += 1; + i += 1; } } else if (PySequence_Check(names)) { - for (i=0; i < len; i++) { - PyObject *key = PySequence_GetItem(names, i); - if (key == NULL) { + for (Py_ssize_t i = 0; i < numnames; i++) { + PyObject *name = PySequence_GetItem(names, i); + if (name == NULL) { goto error; } - int res = _sharednsitem_init(&items[i], key); - Py_DECREF(key); + int res = _sharednsitem_init(&items[i], name); + Py_DECREF(name); if (res < 0) { goto error; } + ns->numnames += 1; } } else { @@ -2026,151 +2156,187 @@ _sharedns_init(_PyXI_namespace *ns, PyObject *names) "non-sequence namespace not supported"); goto error; } - - ns->items = items; - ns->len = len; - assert(_sharedns_is_initialized(ns)); - return 0; + assert(ns->numnames == ns->maxitems); + return ns; error: - for (Py_ssize_t j=0; j < i; j++) { - _sharednsitem_clear(&items[j]); + _sharedns_free(ns); + return NULL; +} + +static void _propagate_not_shareable_error(_PyXI_errcode *); + +static int +_fill_sharedns(_PyXI_namespace *ns, PyObject *nsobj, + xidata_fallback_t fallback, _PyXI_errcode *p_errcode) +{ + // All items are expected to be shareable. + assert(_sharedns_check_counts(ns)); + assert(ns->numnames == ns->maxitems); + assert(ns->numvalues == 0); + for (Py_ssize_t i=0; i < ns->maxitems; i++) { + if (_sharednsitem_copy_from_ns(&ns->items[i], nsobj, fallback) < 0) { + if (p_errcode != NULL) { + _propagate_not_shareable_error(p_errcode); + } + // Clear out the ones we set so far. 
+ for (Py_ssize_t j=0; j < i; j++) { + _sharednsitem_clear_value(&ns->items[j]); + ns->numvalues -= 1; + } + return -1; + } + ns->numvalues += 1; } - PyMem_RawFree(items); - assert(!_sharedns_is_initialized(ns)); - return -1; + return 0; } -void -_PyXI_FreeNamespace(_PyXI_namespace *ns) +static int +_sharedns_free_pending(void *data) { - if (!_sharedns_is_initialized(ns)) { - return; - } + _sharedns_free((_PyXI_namespace *)data); + return 0; +} - int64_t interpid = -1; - if (!_sharedns_has_xidata(ns, &interpid)) { +static void +_destroy_sharedns(_PyXI_namespace *ns) +{ + assert(_sharedns_check_counts(ns)); + assert(ns->numnames == ns->maxitems); + if (ns->numvalues == 0) { _sharedns_free(ns); return; } - if (interpid == PyInterpreterState_GetID(PyInterpreterState_Get())) { + int64_t interpid0; + if (!_sharednsitem_has_value(&ns->items[0], &interpid0)) { + // This shouldn't have been possible. + // We can deal with it in _sharedns_free(). _sharedns_free(ns); + return; } - else { - // If we weren't always dynamically allocating the cross-interpreter - // data in each item then we would need to using a pending call - // to call _sharedns_free(), to avoid the race between freeing - // the shared namespace and releasing the XI data. + PyInterpreterState *interp = _PyInterpreterState_LookUpID(interpid0); + if (interp == PyInterpreterState_Get()) { _sharedns_free(ns); + return; } + + // One or more items may have interpreter-specific data. + // Currently the xidata for each value is dynamically allocated, + // so technically we don't need to worry about that. + // However, explicitly adding a pending call here is simpler. + (void)_Py_CallInInterpreter(interp, _sharedns_free_pending, ns); } -_PyXI_namespace * -_PyXI_NamespaceFromNames(PyObject *names) +static int +_apply_sharedns(_PyXI_namespace *ns, PyObject *nsobj, PyObject *dflt) { - if (names == NULL || names == Py_None) { - return NULL; + for (Py_ssize_t i=0; i < ns->maxitems; i++) { + if (_sharednsitem_apply(&ns->items[i], nsobj, dflt) != 0) { + return -1; + } } + return 0; +} - _PyXI_namespace *ns = _sharedns_new(); - if (ns == NULL) { - return NULL; - } - if (_sharedns_init(ns, names) < 0) { - PyMem_RawFree(ns); - if (PySequence_Size(names) == 0) { - PyErr_Clear(); - } - return NULL; - } +/*********************************/ +/* switched-interpreter sessions */ +/*********************************/ - return ns; -} +struct xi_session_error { + // This is set if the interpreter is entered and raised an exception + // that needs to be handled in some special way during exit. + _PyXI_errcode *override; + // This is set if exit captured an exception to propagate. + _PyXI_error *info; -#ifndef NDEBUG -static int _session_is_active(_PyXI_session *); -#endif -static void _propagate_not_shareable_error(_PyXI_session *); + // -- pre-allocated memory -- + _PyXI_error _info; + _PyXI_errcode _override; +}; -int -_PyXI_FillNamespaceFromDict(_PyXI_namespace *ns, PyObject *nsobj, - _PyXI_session *session) -{ - // session must be entered already, if provided. - assert(session == NULL || _session_is_active(session)); - assert(_sharedns_is_initialized(ns)); - for (Py_ssize_t i=0; i < ns->len; i++) { - _PyXI_namespace_item *item = &ns->items[i]; - if (_sharednsitem_copy_from_ns(item, nsobj) < 0) { - _propagate_not_shareable_error(session); - // Clear out the ones we set so far. 
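
_destroy_sharedns() above encodes the ownership rule for cross-interpreter values: they must be released in the interpreter that produced them. When the current interpreter is not the owner, the free is deferred through a pending call. A rough sketch of that dispatch, assuming a hypothetical payload_t with a payload_free() cleanup helper (the callback signature, int (*)(void *), is what _Py_CallInInterpreter() expects):

    typedef struct payload payload_t;   /* hypothetical owner-bound data */
    void payload_free(payload_t *);     /* hypothetical cleanup helper */

    /* Runs later, at a safe point inside the owning interpreter. */
    static int
    free_in_owner(void *data)
    {
        payload_free((payload_t *)data);
        return 0;
    }

    static void
    payload_destroy(payload_t *data, PyInterpreterState *owner)
    {
        if (owner == PyInterpreterState_Get()) {
            payload_free(data);         /* already in the owner: free now */
            return;
        }
        /* Defer to the owning interpreter so we never race it while
           touching interpreter-bound state. */
        (void)_Py_CallInInterpreter(owner, free_in_owner, data);
    }

As the comment in the hunk notes, the xidata here happens to be dynamically allocated, so the pending call is chosen for simplicity rather than strict necessity.
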
- for (Py_ssize_t j=0; j < i; j++) { - _sharednsitem_clear_value(&ns->items[j]); - } - return -1; - } - } - return 0; -} +struct xi_session { +#define SESSION_UNUSED 0 +#define SESSION_ACTIVE 1 + int status; + int switched; + + // Once a session has been entered, this is the tstate that was + // current before the session. If it is different from cur_tstate + // then we must have switched interpreters. Either way, this will + // be the current tstate once we exit the session. + PyThreadState *prev_tstate; + // Once a session has been entered, this is the current tstate. + // It must be current when the session exits. + PyThreadState *init_tstate; + // This is true if init_tstate needs cleanup during exit. + int own_init_tstate; + + // This is true if, while entering the session, init_thread took + // "ownership" of the interpreter's __main__ module. This means + // it is the only thread that is allowed to run code there. + // (Caveat: for now, users may still run exec() against the + // __main__ module's dict, though that isn't advisable.) + int running; + // This is a cached reference to the __dict__ of the entered + // interpreter's __main__ module. It is looked up when at the + // beginning of the session as a convenience. + PyObject *main_ns; + + // This is a dict of objects that will be available (via sharing) + // once the session exits. Do not access this directly; use + // _PyXI_Preserve() and _PyXI_GetPreserved() instead; + PyObject *_preserved; + + struct xi_session_error error; +}; -// All items are expected to be shareable. -static _PyXI_namespace * -_PyXI_NamespaceFromDict(PyObject *nsobj, _PyXI_session *session) +_PyXI_session * +_PyXI_NewSession(void) { - // session must be entered already, if provided. - assert(session == NULL || _session_is_active(session)); - if (nsobj == NULL || nsobj == Py_None) { - return NULL; - } - if (!PyDict_CheckExact(nsobj)) { - PyErr_SetString(PyExc_TypeError, "expected a dict"); - return NULL; - } - - _PyXI_namespace *ns = _sharedns_new(); - if (ns == NULL) { + _PyXI_session *session = PyMem_RawCalloc(1, sizeof(_PyXI_session)); + if (session == NULL) { + PyErr_NoMemory(); return NULL; } + return session; +} - if (_sharedns_init(ns, nsobj) < 0) { - if (PyDict_Size(nsobj) == 0) { - PyMem_RawFree(ns); - PyErr_Clear(); - return NULL; - } - goto error; - } - - if (_PyXI_FillNamespaceFromDict(ns, nsobj, session) < 0) { - goto error; - } +void +_PyXI_FreeSession(_PyXI_session *session) +{ + assert(session->status == SESSION_UNUSED); + PyMem_RawFree(session); +} - return ns; -error: - assert(PyErr_Occurred() - || (session != NULL && session->error_override != NULL)); - _sharedns_free(ns); - return NULL; +static inline int +_session_is_active(_PyXI_session *session) +{ + return session->status == SESSION_ACTIVE; } -int -_PyXI_ApplyNamespace(_PyXI_namespace *ns, PyObject *nsobj, PyObject *dflt) +static int +_session_pop_error(_PyXI_session *session, struct xi_session_error *err) { - for (Py_ssize_t i=0; i < ns->len; i++) { - if (_sharednsitem_apply(&ns->items[i], nsobj, dflt) != 0) { - return -1; - } + if (session->error.info == NULL) { + assert(session->error.override == NULL); + *err = (struct xi_session_error){0}; + return 0; } - return 0; + *err = session->error; + err->info = &err->_info; + if (err->override != NULL) { + err->override = &err->_override; + } + session->error = (struct xi_session_error){0}; + return 1; } +static int _ensure_main_ns(_PyXI_session *, _PyXI_errcode *); +static inline void _session_set_error(_PyXI_session *, 
_PyXI_errcode); -/**********************/ -/* high-level helpers */ -/**********************/ /* enter/exit a cross-interpreter session */ @@ -2178,6 +2344,7 @@ static void _enter_session(_PyXI_session *session, PyInterpreterState *interp) { // Set here and cleared in _exit_session(). + assert(session->status == SESSION_UNUSED); assert(!session->own_init_tstate); assert(session->init_tstate == NULL); assert(session->prev_tstate == NULL); @@ -2185,22 +2352,29 @@ _enter_session(_PyXI_session *session, PyInterpreterState *interp) assert(!session->running); assert(session->main_ns == NULL); // Set elsewhere and cleared in _capture_current_exception(). - assert(session->error_override == NULL); - // Set elsewhere and cleared in _PyXI_ApplyCapturedException(). - assert(session->error == NULL); + assert(session->error.override == NULL); + // Set elsewhere and cleared in _PyXI_Exit(). + assert(session->error.info == NULL); // Switch to interpreter. PyThreadState *tstate = PyThreadState_Get(); PyThreadState *prev = tstate; - if (interp != tstate->interp) { + int same_interp = (interp == tstate->interp); + if (!same_interp) { tstate = _PyThreadState_NewBound(interp, _PyThreadState_WHENCE_EXEC); // XXX Possible GILState issues? - session->prev_tstate = PyThreadState_Swap(tstate); - assert(session->prev_tstate == prev); - session->own_init_tstate = 1; + PyThreadState *swapped = PyThreadState_Swap(tstate); + assert(swapped == prev); + (void)swapped; } - session->init_tstate = tstate; - session->prev_tstate = prev; + + *session = (_PyXI_session){ + .status = SESSION_ACTIVE, + .switched = !same_interp, + .init_tstate = tstate, + .prev_tstate = prev, + .own_init_tstate = !same_interp, + }; } static void @@ -2209,16 +2383,16 @@ _exit_session(_PyXI_session *session) PyThreadState *tstate = session->init_tstate; assert(tstate != NULL); assert(PyThreadState_Get() == tstate); + assert(!_PyErr_Occurred(tstate)); // Release any of the entered interpreters resources. - if (session->main_ns != NULL) { - Py_CLEAR(session->main_ns); - } + Py_CLEAR(session->main_ns); + Py_CLEAR(session->_preserved); // Ensure this thread no longer owns __main__. if (session->running) { _PyInterpreterState_SetNotRunningMain(tstate->interp); - assert(!PyErr_Occurred()); + assert(!_PyErr_Occurred(tstate)); session->running = 0; } @@ -2234,24 +2408,17 @@ _exit_session(_PyXI_session *session) else { assert(!session->own_init_tstate); } - session->prev_tstate = NULL; - session->init_tstate = NULL; -} -#ifndef NDEBUG -static int -_session_is_active(_PyXI_session *session) -{ - return (session->init_tstate != NULL); + assert(session->error.info == NULL); + assert(session->error.override == _PyXI_ERR_NO_ERROR); + + *session = (_PyXI_session){0}; } -#endif static void -_propagate_not_shareable_error(_PyXI_session *session) +_propagate_not_shareable_error(_PyXI_errcode *p_errcode) { - if (session == NULL) { - return; - } + assert(p_errcode != NULL); PyThreadState *tstate = PyThreadState_Get(); PyObject *exctype = get_notshareableerror_type(tstate); if (exctype == NULL) { @@ -2261,23 +2428,218 @@ _propagate_not_shareable_error(_PyXI_session *session) } if (PyErr_ExceptionMatches(exctype)) { // We want to propagate the exception directly. 
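
A small but deliberate change in the _enter_session()/_exit_session() hunks above: the session is now initialized and torn down by whole-struct compound-literal assignment, so every field not named is zeroed and nothing stale can survive a session. The idiom in miniature, with a hypothetical session_t:

    typedef struct {
        int status;                    /* SESSION_UNUSED == 0 */
        int switched;
        PyThreadState *init_tstate;
        PyThreadState *prev_tstate;
    } session_t;

    static void
    session_enter(session_t *s, PyThreadState *ts, PyThreadState *prev)
    {
        /* Fields not named here are implicitly zero-initialized. */
        *s = (session_t){
            .status = 1,               /* SESSION_ACTIVE */
            .switched = (ts != prev),
            .init_tstate = ts,
            .prev_tstate = prev,
        };
    }

    static void
    session_exit(session_t *s)
    {
        *s = (session_t){0};           /* back to all-zero SESSION_UNUSED */
    }

This is also what lets _PyXI_FreeSession() get away with a single status == SESSION_UNUSED assertion.
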
- session->_error_override = _PyXI_ERR_NOT_SHAREABLE; - session->error_override = &session->_error_override; + *p_errcode = _PyXI_ERR_NOT_SHAREABLE; + } +} + +int +_PyXI_Enter(_PyXI_session *session, + PyInterpreterState *interp, PyObject *nsupdates, + _PyXI_session_result *result) +{ + // Convert the attrs for cross-interpreter use. + _PyXI_namespace *sharedns = NULL; + if (nsupdates != NULL) { + Py_ssize_t len = PyDict_Size(nsupdates); + if (len < 0) { + if (result != NULL) { + result->errcode = _PyXI_ERR_APPLY_NS_FAILURE; + } + return -1; + } + if (len > 0) { + sharedns = _create_sharedns(nsupdates); + if (sharedns == NULL) { + if (result != NULL) { + result->errcode = _PyXI_ERR_APPLY_NS_FAILURE; + } + return -1; + } + // For now we limit it to shareable objects. + xidata_fallback_t fallback = _PyXIDATA_XIDATA_ONLY; + _PyXI_errcode errcode = _PyXI_ERR_NO_ERROR; + if (_fill_sharedns(sharedns, nsupdates, fallback, &errcode) < 0) { + assert(PyErr_Occurred()); + assert(session->error.info == NULL); + if (errcode == _PyXI_ERR_NO_ERROR) { + errcode = _PyXI_ERR_UNCAUGHT_EXCEPTION; + } + _destroy_sharedns(sharedns); + if (result != NULL) { + result->errcode = errcode; + } + return -1; + } + } + } + + // Switch to the requested interpreter (if necessary). + _enter_session(session, interp); + _PyXI_errcode errcode = _PyXI_ERR_UNCAUGHT_EXCEPTION; + + // Ensure this thread owns __main__. + if (_PyInterpreterState_SetRunningMain(interp) < 0) { + // In the case where we didn't switch interpreters, it would + // be more efficient to leave the exception in place and return + // immediately. However, life is simpler if we don't. + errcode = _PyXI_ERR_ALREADY_RUNNING; + goto error; + } + session->running = 1; + + // Apply the cross-interpreter data. + if (sharedns != NULL) { + if (_ensure_main_ns(session, &errcode) < 0) { + goto error; + } + if (_apply_sharedns(sharedns, session->main_ns, NULL) < 0) { + errcode = _PyXI_ERR_APPLY_NS_FAILURE; + goto error; + } + _destroy_sharedns(sharedns); + } + + errcode = _PyXI_ERR_NO_ERROR; + assert(!PyErr_Occurred()); + return 0; + +error: + // We want to propagate all exceptions here directly (best effort). + assert(errcode != _PyXI_ERR_NO_ERROR); + _session_set_error(session, errcode); + assert(!PyErr_Occurred()); + + // Exit the session. + struct xi_session_error err; + (void)_session_pop_error(session, &err); + _exit_session(session); + + if (sharedns != NULL) { + _destroy_sharedns(sharedns); + } + + // Apply the error from the other interpreter. + PyObject *excinfo = _PyXI_ApplyError(err.info); + _PyXI_excinfo_clear(&err.info->uncaught); + if (excinfo != NULL) { + if (result != NULL) { + result->excinfo = excinfo; + } + else { +#ifdef Py_DEBUG + fprintf(stderr, "_PyXI_Enter(): uncaught exception discarded"); +#endif + } + } + assert(PyErr_Occurred()); + + return -1; +} + +static int _pop_preserved(_PyXI_session *, _PyXI_namespace **, PyObject **, + _PyXI_errcode *); +static int _finish_preserved(_PyXI_namespace *, PyObject **); + +int +_PyXI_Exit(_PyXI_session *session, _PyXI_errcode errcode, + _PyXI_session_result *result) +{ + int res = 0; + + // Capture the raised exception, if any. + assert(session->error.info == NULL); + if (PyErr_Occurred()) { + _session_set_error(session, errcode); + assert(!PyErr_Occurred()); + } + else { + assert(errcode == _PyXI_ERR_NO_ERROR); + assert(session->error.override == NULL); + } + + // Capture the preserved namespace. 
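
With both _PyXI_Enter() and _PyXI_Exit() signatures now on the page, the intended call sequence is roughly the following. This is a hedged sketch, not code from the patch: run_code() is a hypothetical stand-in for whatever the caller executes inside the target interpreter, and error handling is compressed.

    _PyXI_session *session = _PyXI_NewSession();
    if (session == NULL) {
        return -1;
    }
    _PyXI_session_result result = {0};
    if (_PyXI_Enter(session, interp, nsupdates, &result) < 0) {
        /* result.errcode (and possibly result.excinfo) say what failed. */
        _PyXI_FreeSession(session);
        _PyXI_ClearResult(&result);
        return -1;
    }
    _PyXI_errcode errcode = _PyXI_ERR_NO_ERROR;
    if (run_code(session) < 0) {            /* hypothetical work step */
        errcode = _PyXI_ERR_UNCAUGHT_EXCEPTION;
    }
    int res = _PyXI_Exit(session, errcode, &result);
    _PyXI_FreeSession(session);
    /* ... consume result.preserved / result.excinfo as needed ... */
    _PyXI_ClearResult(&result);
    return res;
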
+ _PyXI_namespace *preserved = NULL; + PyObject *preservedobj = NULL; + if (result != NULL) { + errcode = _PyXI_ERR_NO_ERROR; + if (_pop_preserved(session, &preserved, &preservedobj, &errcode) < 0) { + if (session->error.info != NULL) { + // XXX Chain the exception (i.e. set __context__)? + PyErr_FormatUnraisable( + "Exception ignored while capturing preserved objects"); + } + else { + _session_set_error(session, errcode); + } + } + } + + // Exit the session. + struct xi_session_error err; + (void)_session_pop_error(session, &err); + _exit_session(session); + + // Restore the preserved namespace. + assert(preserved == NULL || preservedobj == NULL); + if (_finish_preserved(preserved, &preservedobj) < 0) { + assert(preservedobj == NULL); + if (err.info != NULL) { + // XXX Chain the exception (i.e. set __context__)? + PyErr_FormatUnraisable( + "Exception ignored while capturing preserved objects"); + } + else { + errcode = _PyXI_ERR_PRESERVE_FAILURE; + _propagate_not_shareable_error(&errcode); + } + } + if (result != NULL) { + result->preserved = preservedobj; + result->errcode = errcode; + } + + // Apply the error from the other interpreter, if any. + if (err.info != NULL) { + res = -1; + assert(!PyErr_Occurred()); + PyObject *excinfo = _PyXI_ApplyError(err.info); + _PyXI_excinfo_clear(&err.info->uncaught); + if (excinfo == NULL) { + assert(PyErr_Occurred()); + if (result != NULL) { + _PyXI_ClearResult(result); + *result = (_PyXI_session_result){ + .errcode = _PyXI_ERR_EXC_PROPAGATION_FAILURE, + }; + } + } + else if (result != NULL) { + result->excinfo = excinfo; + } + else { +#ifdef Py_DEBUG + fprintf(stderr, "_PyXI_Exit(): uncaught exception discarded"); +#endif + } } + return res; } + +/* in an active cross-interpreter session */ + static void _capture_current_exception(_PyXI_session *session) { - assert(session->error == NULL); + assert(session->error.info == NULL); if (!PyErr_Occurred()) { - assert(session->error_override == NULL); + assert(session->error.override == NULL); return; } // Handle the exception override. - _PyXI_errcode *override = session->error_override; - session->error_override = NULL; + _PyXI_errcode *override = session->error.override; + session->error.override = NULL; _PyXI_errcode errcode = override != NULL ? *override : _PyXI_ERR_UNCAUGHT_EXCEPTION; @@ -2300,7 +2662,7 @@ _capture_current_exception(_PyXI_session *session) } // Capture the exception. - _PyXI_error *err = &session->_error; + _PyXI_error *err = &session->error._info; *err = (_PyXI_error){ .interp = session->init_tstate->interp, }; @@ -2327,100 +2689,194 @@ _capture_current_exception(_PyXI_session *session) // Finished! assert(!PyErr_Occurred()); - session->error = err; + session->error.info = err; } -PyObject * -_PyXI_ApplyCapturedException(_PyXI_session *session) +static inline void +_session_set_error(_PyXI_session *session, _PyXI_errcode errcode) { - assert(!PyErr_Occurred()); - assert(session->error != NULL); - PyObject *res = _PyXI_ApplyError(session->error); - assert((res == NULL) != (PyErr_Occurred() == NULL)); - session->error = NULL; - return res; + assert(_session_is_active(session)); + assert(PyErr_Occurred()); + if (errcode == _PyXI_ERR_NO_ERROR) { + // We're a bit forgiving here. 
+ errcode = _PyXI_ERR_UNCAUGHT_EXCEPTION; + } + if (errcode != _PyXI_ERR_UNCAUGHT_EXCEPTION) { + session->error._override = errcode; + session->error.override = &session->error._override; + } + _capture_current_exception(session); } -int -_PyXI_HasCapturedException(_PyXI_session *session) +static int +_ensure_main_ns(_PyXI_session *session, _PyXI_errcode *p_errcode) { - return session->error != NULL; + assert(_session_is_active(session)); + if (session->main_ns != NULL) { + return 0; + } + // Cache __main__.__dict__. + PyObject *main_mod = _Py_GetMainModule(session->init_tstate); + if (_Py_CheckMainModule(main_mod) < 0) { + if (p_errcode != NULL) { + *p_errcode = _PyXI_ERR_MAIN_NS_FAILURE; + } + return -1; + } + PyObject *ns = PyModule_GetDict(main_mod); // borrowed + Py_DECREF(main_mod); + if (ns == NULL) { + if (p_errcode != NULL) { + *p_errcode = _PyXI_ERR_MAIN_NS_FAILURE; + } + return -1; + } + session->main_ns = Py_NewRef(ns); + return 0; } -int -_PyXI_Enter(_PyXI_session *session, - PyInterpreterState *interp, PyObject *nsupdates) +PyObject * +_PyXI_GetMainNamespace(_PyXI_session *session, _PyXI_errcode *p_errcode) { - // Convert the attrs for cross-interpreter use. - _PyXI_namespace *sharedns = NULL; - if (nsupdates != NULL) { - sharedns = _PyXI_NamespaceFromDict(nsupdates, NULL); - if (sharedns == NULL && PyErr_Occurred()) { - assert(session->error == NULL); - return -1; - } + if (!_session_is_active(session)) { + PyErr_SetString(PyExc_RuntimeError, "session not active"); + return NULL; + } + if (_ensure_main_ns(session, p_errcode) < 0) { + return NULL; } + return session->main_ns; +} - // Switch to the requested interpreter (if necessary). - _enter_session(session, interp); - PyThreadState *session_tstate = session->init_tstate; - _PyXI_errcode errcode = _PyXI_ERR_UNCAUGHT_EXCEPTION; - // Ensure this thread owns __main__. - if (_PyInterpreterState_SetRunningMain(interp) < 0) { - // In the case where we didn't switch interpreters, it would - // be more efficient to leave the exception in place and return - // immediately. However, life is simpler if we don't. - errcode = _PyXI_ERR_ALREADY_RUNNING; - goto error; +static int +_pop_preserved(_PyXI_session *session, + _PyXI_namespace **p_xidata, PyObject **p_obj, + _PyXI_errcode *p_errcode) +{ + assert(_PyThreadState_GET() == session->init_tstate); // active session + if (session->_preserved == NULL) { + *p_xidata = NULL; + *p_obj = NULL; + return 0; } - session->running = 1; + if (session->init_tstate == session->prev_tstate) { + // We did not switch interpreters. + *p_xidata = NULL; + *p_obj = session->_preserved; + session->_preserved = NULL; + return 0; + } + *p_obj = NULL; - // Cache __main__.__dict__. - PyObject *main_mod = _Py_GetMainModule(session_tstate); - if (_Py_CheckMainModule(main_mod) < 0) { - errcode = _PyXI_ERR_MAIN_NS_FAILURE; - goto error; + // We did switch interpreters. 
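
The _ensure_main_ns() hunk above caches __main__.__dict__ for the life of the session. Since PyModule_GetDict() only returns a borrowed reference, the cache must take its own strong reference before the module reference is dropped. The same dance, sketched with the public PyImport_AddModuleRef() standing in for the internal _Py_GetMainModule() helper:

    PyObject *main_mod = PyImport_AddModuleRef("__main__");   /* strong */
    if (main_mod == NULL) {
        return -1;
    }
    PyObject *ns = PyModule_GetDict(main_mod);  /* borrowed from main_mod */
    if (ns == NULL) {
        Py_DECREF(main_mod);
        return -1;
    }
    session->main_ns = Py_NewRef(ns);   /* own it beyond main_mod's life */
    Py_DECREF(main_mod);
    return 0;
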
+ Py_ssize_t len = PyDict_Size(session->_preserved); + if (len < 0) { + if (p_errcode != NULL) { + *p_errcode = _PyXI_ERR_PRESERVE_FAILURE; + } + return -1; } - PyObject *ns = PyModule_GetDict(main_mod); // borrowed - Py_DECREF(main_mod); - if (ns == NULL) { - errcode = _PyXI_ERR_MAIN_NS_FAILURE; - goto error; + else if (len == 0) { + *p_xidata = NULL; } - session->main_ns = Py_NewRef(ns); + else { + _PyXI_namespace *xidata = _create_sharedns(session->_preserved); + if (xidata == NULL) { + if (p_errcode != NULL) { + *p_errcode = _PyXI_ERR_PRESERVE_FAILURE; + } + return -1; + } + _PyXI_errcode errcode = _PyXI_ERR_NO_ERROR; + if (_fill_sharedns(xidata, session->_preserved, + _PyXIDATA_FULL_FALLBACK, &errcode) < 0) + { + assert(session->error.info == NULL); + if (errcode != _PyXI_ERR_NOT_SHAREABLE) { + errcode = _PyXI_ERR_PRESERVE_FAILURE; + } + if (p_errcode != NULL) { + *p_errcode = errcode; + } + _destroy_sharedns(xidata); + return -1; + } + *p_xidata = xidata; + } + Py_CLEAR(session->_preserved); + return 0; +} - // Apply the cross-interpreter data. - if (sharedns != NULL) { - if (_PyXI_ApplyNamespace(sharedns, ns, NULL) < 0) { - errcode = _PyXI_ERR_APPLY_NS_FAILURE; - goto error; +static int +_finish_preserved(_PyXI_namespace *xidata, PyObject **p_preserved) +{ + if (xidata == NULL) { + return 0; + } + int res = -1; + if (p_preserved != NULL) { + PyObject *ns = PyDict_New(); + if (ns == NULL) { + goto finally; + } + if (_apply_sharedns(xidata, ns, NULL) < 0) { + Py_CLEAR(ns); + goto finally; } - _PyXI_FreeNamespace(sharedns); + *p_preserved = ns; } + res = 0; - errcode = _PyXI_ERR_NO_ERROR; - assert(!PyErr_Occurred()); +finally: + _destroy_sharedns(xidata); + return res; +} + +int +_PyXI_Preserve(_PyXI_session *session, const char *name, PyObject *value, + _PyXI_errcode *p_errcode) +{ + if (!_session_is_active(session)) { + PyErr_SetString(PyExc_RuntimeError, "session not active"); + return -1; + } + if (session->_preserved == NULL) { + session->_preserved = PyDict_New(); + if (session->_preserved == NULL) { + set_exc_with_cause(PyExc_RuntimeError, + "failed to initialize preserved objects"); + if (p_errcode != NULL) { + *p_errcode = _PyXI_ERR_PRESERVE_FAILURE; + } + return -1; + } + } + if (PyDict_SetItemString(session->_preserved, name, value) < 0) { + set_exc_with_cause(PyExc_RuntimeError, "failed to preserve object"); + if (p_errcode != NULL) { + *p_errcode = _PyXI_ERR_PRESERVE_FAILURE; + } + return -1; + } return 0; +} -error: - assert(PyErr_Occurred()); - // We want to propagate all exceptions here directly (best effort). 
- assert(errcode != _PyXI_ERR_UNCAUGHT_EXCEPTION); - session->error_override = &errcode; - _capture_current_exception(session); - _exit_session(session); - if (sharedns != NULL) { - _PyXI_FreeNamespace(sharedns); +PyObject * +_PyXI_GetPreserved(_PyXI_session_result *result, const char *name) +{ + PyObject *value = NULL; + if (result->preserved != NULL) { + (void)PyDict_GetItemStringRef(result->preserved, name, &value); } - return -1; + return value; } void -_PyXI_Exit(_PyXI_session *session) +_PyXI_ClearResult(_PyXI_session_result *result) { - _capture_current_exception(session); - _exit_session(session); + Py_CLEAR(result->preserved); + Py_CLEAR(result->excinfo); } diff --git a/Python/crossinterp_data_lookup.h b/Python/crossinterp_data_lookup.h index 231537c66d7..b16f38b847f 100644 --- a/Python/crossinterp_data_lookup.h +++ b/Python/crossinterp_data_lookup.h @@ -12,7 +12,8 @@ typedef _PyXIData_regitem_t dlregitem_t; // forward static void _xidregistry_init(dlregistry_t *); static void _xidregistry_fini(dlregistry_t *); -static xidatafunc _lookup_getdata_from_registry(dlcontext_t *, PyObject *); +static _PyXIData_getdata_t _lookup_getdata_from_registry( + dlcontext_t *, PyObject *); /* used in crossinterp.c */ @@ -49,7 +50,7 @@ get_lookup_context(PyThreadState *tstate, dlcontext_t *res) return 0; } -static xidatafunc +static _PyXIData_getdata_t lookup_getdata(dlcontext_t *ctx, PyObject *obj) { /* Cross-interpreter objects are looked up by exact match on the class. @@ -88,24 +89,24 @@ _PyXIData_FormatNotShareableError(PyThreadState *tstate, } -xidatafunc +_PyXIData_getdata_t _PyXIData_Lookup(PyThreadState *tstate, PyObject *obj) { dlcontext_t ctx; if (get_lookup_context(tstate, &ctx) < 0) { - return NULL; + return (_PyXIData_getdata_t){0}; } return lookup_getdata(&ctx, obj); } /***********************************************/ -/* a registry of {type -> xidatafunc} */ +/* a registry of {type -> _PyXIData_getdata_t} */ /***********************************************/ -/* For now we use a global registry of shareable classes. An - alternative would be to add a tp_* slot for a class's - xidatafunc. It would be simpler and more efficient. */ +/* For now we use a global registry of shareable classes. + An alternative would be to add a tp_* slot for a class's + _PyXIData_getdata_t. It would be simpler and more efficient. */ /* registry lifecycle */ @@ -200,7 +201,7 @@ _xidregistry_find_type(dlregistry_t *xidregistry, PyTypeObject *cls) return NULL; } -static xidatafunc +static _PyXIData_getdata_t _lookup_getdata_from_registry(dlcontext_t *ctx, PyObject *obj) { PyTypeObject *cls = Py_TYPE(obj); @@ -209,10 +210,12 @@ _lookup_getdata_from_registry(dlcontext_t *ctx, PyObject *obj) _xidregistry_lock(xidregistry); dlregitem_t *matched = _xidregistry_find_type(xidregistry, cls); - xidatafunc func = matched != NULL ? matched->getdata : NULL; + _PyXIData_getdata_t getdata = matched != NULL + ? matched->getdata + : (_PyXIData_getdata_t){0}; _xidregistry_unlock(xidregistry); - return func; + return getdata; } @@ -220,12 +223,13 @@ _lookup_getdata_from_registry(dlcontext_t *ctx, PyObject *obj) static int _xidregistry_add_type(dlregistry_t *xidregistry, - PyTypeObject *cls, xidatafunc getdata) + PyTypeObject *cls, _PyXIData_getdata_t getdata) { dlregitem_t *newhead = PyMem_RawMalloc(sizeof(dlregitem_t)); if (newhead == NULL) { return -1; } + assert((getdata.basic == NULL) != (getdata.fallback == NULL)); *newhead = (dlregitem_t){ // We do not keep a reference, to avoid keeping the class alive. 
.cls = cls, @@ -283,13 +287,13 @@ _xidregistry_clear(dlregistry_t *xidregistry) int _PyXIData_RegisterClass(PyThreadState *tstate, - PyTypeObject *cls, xidatafunc getdata) + PyTypeObject *cls, _PyXIData_getdata_t getdata) { if (!PyType_Check(cls)) { PyErr_Format(PyExc_ValueError, "only classes may be registered"); return -1; } - if (getdata == NULL) { + if (getdata.basic == NULL && getdata.fallback == NULL) { PyErr_Format(PyExc_ValueError, "missing 'getdata' func"); return -1; } @@ -304,7 +308,8 @@ _PyXIData_RegisterClass(PyThreadState *tstate, dlregitem_t *matched = _xidregistry_find_type(xidregistry, cls); if (matched != NULL) { - assert(matched->getdata == getdata); + assert(matched->getdata.basic == getdata.basic); + assert(matched->getdata.fallback == getdata.fallback); matched->refcount += 1; goto finally; } @@ -608,7 +613,8 @@ _tuple_shared_free(void* data) } static int -_tuple_shared(PyThreadState *tstate, PyObject *obj, _PyXIData_t *xidata) +_tuple_shared(PyThreadState *tstate, PyObject *obj, xidata_fallback_t fallback, + _PyXIData_t *xidata) { Py_ssize_t len = PyTuple_GET_SIZE(obj); if (len < 0) { @@ -636,7 +642,7 @@ _tuple_shared(PyThreadState *tstate, PyObject *obj, _PyXIData_t *xidata) int res = -1; if (!_Py_EnterRecursiveCallTstate(tstate, " while sharing a tuple")) { - res = _PyObject_GetXIData(tstate, item, xidata_i); + res = _PyObject_GetXIData(tstate, item, fallback, xidata_i); _Py_LeaveRecursiveCallTstate(tstate); } if (res < 0) { @@ -677,44 +683,116 @@ _PyCode_GetXIData(PyThreadState *tstate, PyObject *obj, _PyXIData_t *xidata) return 0; } +// function + +PyObject * +_PyFunction_FromXIData(_PyXIData_t *xidata) +{ + // For now "stateless" functions are the only ones we must accommodate. + + PyObject *code = _PyMarshal_ReadObjectFromXIData(xidata); + if (code == NULL) { + return NULL; + } + // Create a new function. + assert(PyCode_Check(code)); + PyObject *globals = PyDict_New(); + if (globals == NULL) { + Py_DECREF(code); + return NULL; + } + PyThreadState *tstate = _PyThreadState_GET(); + if (PyDict_SetItem(globals, &_Py_ID(__builtins__), + tstate->interp->builtins) < 0) + { + Py_DECREF(code); + Py_DECREF(globals); + return NULL; + } + PyObject *func = PyFunction_New(code, globals); + Py_DECREF(code); + Py_DECREF(globals); + return func; +} + +int +_PyFunction_GetXIData(PyThreadState *tstate, PyObject *func, + _PyXIData_t *xidata) +{ + if (!PyFunction_Check(func)) { + const char *msg = "expected a function, got %R"; + format_notshareableerror(tstate, NULL, 0, msg, func); + return -1; + } + if (_PyFunction_VerifyStateless(tstate, func) < 0) { + PyObject *cause = _PyErr_GetRaisedException(tstate); + assert(cause != NULL); + const char *msg = "only stateless functions are shareable"; + set_notshareableerror(tstate, cause, 0, msg); + Py_DECREF(cause); + return -1; + } + PyObject *code = PyFunction_GET_CODE(func); + + // Ideally code objects would be immortal and directly shareable. + // In the meantime, we use marshal. + if (_PyMarshal_GetXIData(tstate, code, xidata) < 0) { + return -1; + } + // Replace _PyMarshal_ReadObjectFromXIData. + // (_PyFunction_FromXIData() will call it.) 
+ _PyXIData_SET_NEW_OBJECT(xidata, _PyFunction_FromXIData); + return 0; +} + // registration static void _register_builtins_for_crossinterpreter_data(dlregistry_t *xidregistry) { +#define REGISTER(TYPE, GETDATA) \ + _xidregistry_add_type(xidregistry, (PyTypeObject *)TYPE, \ + ((_PyXIData_getdata_t){.basic=(GETDATA)})) +#define REGISTER_FALLBACK(TYPE, GETDATA) \ + _xidregistry_add_type(xidregistry, (PyTypeObject *)TYPE, \ + ((_PyXIData_getdata_t){.fallback=(GETDATA)})) // None - if (_xidregistry_add_type(xidregistry, (PyTypeObject *)PyObject_Type(Py_None), _none_shared) != 0) { + if (REGISTER(Py_TYPE(Py_None), _none_shared) != 0) { Py_FatalError("could not register None for cross-interpreter sharing"); } // int - if (_xidregistry_add_type(xidregistry, &PyLong_Type, _long_shared) != 0) { + if (REGISTER(&PyLong_Type, _long_shared) != 0) { Py_FatalError("could not register int for cross-interpreter sharing"); } // bytes - if (_xidregistry_add_type(xidregistry, &PyBytes_Type, _PyBytes_GetXIData) != 0) { + if (REGISTER(&PyBytes_Type, _PyBytes_GetXIData) != 0) { Py_FatalError("could not register bytes for cross-interpreter sharing"); } // str - if (_xidregistry_add_type(xidregistry, &PyUnicode_Type, _str_shared) != 0) { + if (REGISTER(&PyUnicode_Type, _str_shared) != 0) { Py_FatalError("could not register str for cross-interpreter sharing"); } // bool - if (_xidregistry_add_type(xidregistry, &PyBool_Type, _bool_shared) != 0) { + if (REGISTER(&PyBool_Type, _bool_shared) != 0) { Py_FatalError("could not register bool for cross-interpreter sharing"); } // float - if (_xidregistry_add_type(xidregistry, &PyFloat_Type, _float_shared) != 0) { + if (REGISTER(&PyFloat_Type, _float_shared) != 0) { Py_FatalError("could not register float for cross-interpreter sharing"); } // tuple - if (_xidregistry_add_type(xidregistry, &PyTuple_Type, _tuple_shared) != 0) { + if (REGISTER_FALLBACK(&PyTuple_Type, _tuple_shared) != 0) { Py_FatalError("could not register tuple for cross-interpreter sharing"); } + + // For now, we do not register PyCode_Type or PyFunction_Type. 
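
The _PyFunction_GetXIData()/_PyFunction_FromXIData() pair above round-trips a stateless function by marshalling its code object and rebuilding a function around it on the other side. A hedged sketch of both ends; _PyXIData_NewObject() is assumed here as the generic internal entry point that invokes the stored rebuild hook:

    _PyXIData_t xidata = {0};
    PyThreadState *tstate = PyThreadState_Get();
    if (_PyFunction_GetXIData(tstate, func, &xidata) < 0) {
        return NULL;    /* not a function, or not stateless */
    }
    /* ... later, in the destination interpreter ... */
    PyObject *copy = _PyXIData_NewObject(&xidata);

Note that the rebuilt function gets a fresh globals dict containing only __builtins__, which is precisely why _PyFunction_VerifyStateless() gates the operation.
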
+#undef REGISTER +#undef REGISTER_FALLBACK } diff --git a/Python/emscripten_trampoline.c b/Python/emscripten_trampoline.c index a7bb685bf3d..cc5047d6bda 100644 --- a/Python/emscripten_trampoline.c +++ b/Python/emscripten_trampoline.c @@ -35,7 +35,7 @@ EM_JS(CountArgsFunc, _PyEM_GetCountArgsPtr, (), { // (type $type1 (func (param i32) (result i32))) // (type $type2 (func (param i32 i32) (result i32))) // (type $type3 (func (param i32 i32 i32) (result i32))) -// (type $blocktype (func (param i32) (result))) +// (type $blocktype (func (param) (result))) // (table $funcs (import "e" "t") 0 funcref) // (export "f" (func $f)) // (func $f (param $fptr i32) (result i32) @@ -44,36 +44,28 @@ EM_JS(CountArgsFunc, _PyEM_GetCountArgsPtr, (), { // table.get $funcs // local.tee $fref // ref.test $type3 -// (block $b (type $blocktype) -// i32.eqz -// br_if $b +// if $blocktype // i32.const 3 // return -// ) +// end // local.get $fref // ref.test $type2 -// (block $b (type $blocktype) -// i32.eqz -// br_if $b +// if $blocktype // i32.const 2 // return -// ) +// end // local.get $fref // ref.test $type1 -// (block $b (type $blocktype) -// i32.eqz -// br_if $b +// if $blocktype // i32.const 1 // return -// ) +// end // local.get $fref // ref.test $type0 -// (block $b (type $blocktype) -// i32.eqz -// br_if $b +// if $blocktype // i32.const 0 // return -// ) +// end // i32.const -1 // ) // ) @@ -88,13 +80,13 @@ function getPyEMCountArgsPtr() { const code = new Uint8Array([ 0x00, 0x61, 0x73, 0x6d, // \0asm magic number 0x01, 0x00, 0x00, 0x00, // version 1 - 0x01, 0x1b, // Type section, body is 0x1b bytes + 0x01, 0x1a, // Type section, body is 0x1a bytes 0x05, // 6 entries - 0x60, 0x00, 0x01, 0x7f, // (type $type0 (func (param) (result i32))) - 0x60, 0x01, 0x7f, 0x01, 0x7f, // (type $type1 (func (param i32) (result i32))) - 0x60, 0x02, 0x7f, 0x7f, 0x01, 0x7f, // (type $type2 (func (param i32 i32) (result i32))) - 0x60, 0x03, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, // (type $type3 (func (param i32 i32 i32) (result i32))) - 0x60, 0x01, 0x7f, 0x00, // (type $blocktype (func (param i32) (result))) + 0x60, 0x00, 0x01, 0x7f, // (type $type0 (func (param) (result i32))) + 0x60, 0x01, 0x7f, 0x01, 0x7f, // (type $type1 (func (param i32) (result i32))) + 0x60, 0x02, 0x7f, 0x7f, 0x01, 0x7f, // (type $type2 (func (param i32 i32) (result i32))) + 0x60, 0x03, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, // (type $type3 (func (param i32 i32 i32) (result i32))) + 0x60, 0x00, 0x00, // (type $blocktype (func (param) (result))) 0x02, 0x09, // Import section, 0x9 byte body 0x01, // 1 import (table $funcs (import "e" "t") 0 funcref) 0x01, 0x65, // "e" @@ -110,44 +102,36 @@ function getPyEMCountArgsPtr() { 0x00, // a function 0x00, // at index 0 - 0x0a, 0x44, // Code section, - 0x01, 0x42, // one entry of length 50 + 0x0a, 56, // Code section, + 0x01, 54, // one entry of length 54 0x01, 0x01, 0x70, // one local of type funcref // Body of the function 0x20, 0x00, // local.get $fptr 0x25, 0x00, // table.get $funcs 0x22, 0x01, // local.tee $fref 0xfb, 0x14, 0x03, // ref.test $type3 - 0x02, 0x04, // block $b (type $blocktype) - 0x45, // i32.eqz - 0x0d, 0x00, // br_if $b + 0x04, 0x04, // if (type $blocktype) 0x41, 0x03, // i32.const 3 0x0f, // return 0x0b, // end block 0x20, 0x01, // local.get $fref 0xfb, 0x14, 0x02, // ref.test $type2 - 0x02, 0x04, // block $b (type $blocktype) - 0x45, // i32.eqz - 0x0d, 0x00, // br_if $b + 0x04, 0x04, // if (type $blocktype) 0x41, 0x02, // i32.const 2 0x0f, // return 0x0b, // end block 0x20, 0x01, // local.get $fref 0xfb, 
0x14, 0x01, // ref.test $type1 - 0x02, 0x04, // block $b (type $blocktype) - 0x45, // i32.eqz - 0x0d, 0x00, // br_if $b + 0x04, 0x04, // if (type $blocktype) 0x41, 0x01, // i32.const 1 0x0f, // return 0x0b, // end block 0x20, 0x01, // local.get $fref 0xfb, 0x14, 0x00, // ref.test $type0 - 0x02, 0x04, // block $b (type $blocktype) - 0x45, // i32.eqz - 0x0d, 0x00, // br_if $b + 0x04, 0x04, // if (type $blocktype) 0x41, 0x00, // i32.const 0 0x0f, // return 0x0b, // end block diff --git a/Python/errors.c b/Python/errors.c index 81f267b043a..a3122f76bdd 100644 --- a/Python/errors.c +++ b/Python/errors.c @@ -10,7 +10,6 @@ #include "pycore_pystate.h" // _PyThreadState_GET() #include "pycore_runtime.h" // _Py_ID() #include "pycore_structseq.h" // _PyStructSequence_FiniBuiltin() -#include "pycore_sysmodule.h" // _PySys_GetOptionalAttr() #include "pycore_traceback.h" // _PyTraceBack_FromFrame() #include "pycore_unicodeobject.h" // _PyUnicode_Equal() @@ -1570,7 +1569,7 @@ write_unraisable_exc(PyThreadState *tstate, PyObject *exc_type, PyObject *obj) { PyObject *file; - if (_PySys_GetOptionalAttr(&_Py_ID(stderr), &file) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stderr), &file) < 0) { return -1; } if (file == NULL || file == Py_None) { @@ -1677,7 +1676,7 @@ format_unraisable_v(const char *format, va_list va, PyObject *obj) } PyObject *hook; - if (_PySys_GetOptionalAttr(&_Py_ID(unraisablehook), &hook) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(unraisablehook), &hook) < 0) { Py_DECREF(hook_args); err_msg_str = NULL; obj = NULL; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 41c9bd5ba70..d19605169d5 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -319,25 +319,11 @@ break; } - /* _LOAD_CONST is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ - - case _LOAD_CONST_MORTAL: { - _PyStackRef value; - oparg = CURRENT_OPARG(); - PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - value = PyStackRef_FromPyObjectNewMortal(obj); - stack_pointer[0] = value; - stack_pointer += 1; - assert(WITHIN_STACK_BOUNDS()); - break; - } - - case _LOAD_CONST_IMMORTAL: { + case _LOAD_CONST: { _PyStackRef value; oparg = CURRENT_OPARG(); PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - assert(_Py_IsImmortal(obj)); - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -350,7 +336,7 @@ assert(oparg == CURRENT_OPARG()); assert(oparg < _PY_NSMALLPOSINTS); PyObject *obj = (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + oparg]; - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -363,7 +349,7 @@ assert(oparg == CURRENT_OPARG()); assert(oparg < _PY_NSMALLPOSINTS); PyObject *obj = (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + oparg]; - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -376,7 +362,7 @@ assert(oparg == CURRENT_OPARG()); assert(oparg < _PY_NSMALLPOSINTS); PyObject *obj = (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + oparg]; - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -389,7 +375,7 @@ assert(oparg == CURRENT_OPARG()); assert(oparg < 
_PY_NSMALLPOSINTS); PyObject *obj = (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + oparg]; - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -401,7 +387,7 @@ oparg = CURRENT_OPARG(); assert(oparg < _PY_NSMALLPOSINTS); PyObject *obj = (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + oparg]; - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -413,10 +399,6 @@ oparg = 0; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -432,10 +414,6 @@ oparg = 1; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -451,10 +429,6 @@ oparg = 2; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -470,10 +444,6 @@ oparg = 3; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -489,10 +459,6 @@ oparg = 4; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -508,10 +474,6 @@ oparg = 5; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -527,10 +489,6 @@ oparg = 6; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -546,10 +504,6 @@ oparg = 7; assert(oparg == CURRENT_OPARG()); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -564,10 +518,6 @@ _PyStackRef value; oparg = CURRENT_OPARG(); value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -584,7 +534,25 @@ stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); - PyStackRef_CLOSE(value); + PyStackRef_XCLOSE(value); + stack_pointer = _PyFrame_GetStackPointer(frame); + break; + } + + case _POP_TWO: { 
+ _PyStackRef tos; + _PyStackRef nos; + tos = stack_pointer[-1]; + nos = stack_pointer[-2]; + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(tos); + stack_pointer = _PyFrame_GetStackPointer(frame); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(nos); stack_pointer = _PyFrame_GetStackPointer(frame); break; } @@ -609,6 +577,20 @@ break; } + case _POP_ITER: { + _PyStackRef index_or_null; + _PyStackRef iter; + index_or_null = stack_pointer[-1]; + iter = stack_pointer[-2]; + (void)index_or_null; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(iter); + stack_pointer = _PyFrame_GetStackPointer(frame); + break; + } + case _END_SEND: { _PyStackRef value; _PyStackRef receiver; @@ -1403,7 +1385,7 @@ _PyFrame_SetStackPointer(frame, stack_pointer); PyStackRef_CLOSE(str_st); stack_pointer = _PyFrame_GetStackPointer(frame); - res = PyStackRef_FromPyObjectImmortal(res_o); + res = PyStackRef_FromPyObjectBorrow(res_o); stack_pointer[0] = res; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -4204,25 +4186,37 @@ case _GET_ITER: { _PyStackRef iterable; _PyStackRef iter; + _PyStackRef index_or_null; iterable = stack_pointer[-1]; #ifdef Py_STATS _PyFrame_SetStackPointer(frame, stack_pointer); _Py_GatherStats_GetIter(iterable); stack_pointer = _PyFrame_GetStackPointer(frame); #endif - _PyFrame_SetStackPointer(frame, stack_pointer); - PyObject *iter_o = PyObject_GetIter(PyStackRef_AsPyObjectBorrow(iterable)); - stack_pointer = _PyFrame_GetStackPointer(frame); - stack_pointer += -1; - assert(WITHIN_STACK_BOUNDS()); - _PyFrame_SetStackPointer(frame, stack_pointer); - PyStackRef_CLOSE(iterable); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (iter_o == NULL) { - JUMP_TO_ERROR(); + + PyTypeObject *tp = PyStackRef_TYPE(iterable); + if (tp == &PyTuple_Type || tp == &PyList_Type) { + iter = iterable; + index_or_null = PyStackRef_TagInt(0); } - iter = PyStackRef_FromPyObjectSteal(iter_o); - stack_pointer[0] = iter; + else { + _PyFrame_SetStackPointer(frame, stack_pointer); + PyObject *iter_o = PyObject_GetIter(PyStackRef_AsPyObjectBorrow(iterable)); + stack_pointer = _PyFrame_GetStackPointer(frame); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(iterable); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (iter_o == NULL) { + JUMP_TO_ERROR(); + } + iter = PyStackRef_FromPyObjectSteal(iter_o); + index_or_null = PyStackRef_NULL; + stack_pointer += 1; + } + stack_pointer[-1] = iter; + stack_pointer[0] = index_or_null; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); break; @@ -4269,32 +4263,25 @@ /* _FOR_ITER is not a viable micro-op for tier 2 because it is replaced */ case _FOR_ITER_TIER_TWO: { + _PyStackRef null_or_index; _PyStackRef iter; _PyStackRef next; - iter = stack_pointer[-1]; - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; _PyFrame_SetStackPointer(frame, stack_pointer); - PyObject *next_o = (*Py_TYPE(iter_o)->tp_iternext)(iter_o); + _PyStackRef item = _PyForIter_VirtualIteratorNext(tstate, frame, iter, &null_or_index); stack_pointer = _PyFrame_GetStackPointer(frame); - if (next_o == NULL) { - if (_PyErr_Occurred(tstate)) { - _PyFrame_SetStackPointer(frame, stack_pointer); - int matches = 
_PyErr_ExceptionMatches(tstate, PyExc_StopIteration); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (!matches) { - JUMP_TO_ERROR(); - } - _PyFrame_SetStackPointer(frame, stack_pointer); - _PyEval_MonitorRaise(tstate, frame, frame->instr_ptr); - _PyErr_Clear(tstate); - stack_pointer = _PyFrame_GetStackPointer(frame); + if (!PyStackRef_IsValid(item)) { + if (PyStackRef_IsError(item)) { + JUMP_TO_ERROR(); } if (true) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } } - next = PyStackRef_FromPyObjectSteal(next_o); + next = item; + stack_pointer[-1] = null_or_index; stack_pointer[0] = next; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -4304,21 +4291,18 @@ /* _INSTRUMENTED_FOR_ITER is not a viable micro-op for tier 2 because it is instrumented */ case _ITER_CHECK_LIST: { + _PyStackRef null_or_index; _PyStackRef iter; - iter = stack_pointer[-1]; + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - if (Py_TYPE(iter_o) != &PyListIter_Type) { + if (Py_TYPE(iter_o) != &PyList_Type) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } + assert(PyStackRef_IsTaggedInt(null_or_index)); #ifdef Py_GIL_DISABLED - if (!_PyObject_IsUniquelyReferenced(iter_o)) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - _PyListIterObject *it = (_PyListIterObject *)iter_o; - if (!_Py_IsOwnedByCurrentThread((PyObject *)it->it_seq) || - !_PyObject_GC_IS_SHARED(it->it_seq)) { + if (!_Py_IsOwnedByCurrentThread(iter_o) && !_PyObject_GC_IS_SHARED(iter_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } @@ -4329,24 +4313,17 @@ /* _ITER_JUMP_LIST is not a viable micro-op for tier 2 because it is replaced */ case _GUARD_NOT_EXHAUSTED_LIST: { + _PyStackRef null_or_index; _PyStackRef iter; - iter = stack_pointer[-1]; + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; #ifndef Py_GIL_DISABLED - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyListIterObject *it = (_PyListIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyListIter_Type); - PyListObject *seq = it->it_seq; - if (seq == NULL) { + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(list_o) == &PyList_Type); + if ((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyList_GET_SIZE(list_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - if ((size_t)it->it_index >= (size_t)PyList_GET_SIZE(seq)) { - it->it_index = -1; - if (1) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - } #endif break; } @@ -4354,38 +4331,30 @@ /* _ITER_NEXT_LIST is not a viable micro-op for tier 2 because it is replaced */ case _ITER_NEXT_LIST_TIER_TWO: { + _PyStackRef null_or_index; _PyStackRef iter; _PyStackRef next; - iter = stack_pointer[-1]; - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyListIterObject *it = (_PyListIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyListIter_Type); - PyListObject *seq = it->it_seq; - assert(seq); + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(PyList_CheckExact(list_o)); #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - assert(_Py_IsOwnedByCurrentThread((PyObject *)seq) || - _PyObject_GC_IS_SHARED(seq)); + assert(_Py_IsOwnedByCurrentThread((PyObject *)list_o) || + _PyObject_GC_IS_SHARED(list_o)); STAT_INC(FOR_ITER, hit); _PyFrame_SetStackPointer(frame, stack_pointer); - int result = _PyList_GetItemRefNoLock(seq, it->it_index, &next); + int result = 
_PyList_GetItemRefNoLock((PyListObject *)list_o, PyStackRef_UntagInt(null_or_index), &next); stack_pointer = _PyFrame_GetStackPointer(frame); - if (result < 0) { + if (result <= 0) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - if (result == 0) { - it->it_index = -1; - if (1) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - } - it->it_index++; #else - assert(it->it_index < PyList_GET_SIZE(seq)); - next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(seq, it->it_index++)); + assert(PyStackRef_UntagInt(null_or_index) < PyList_GET_SIZE(list_o)); + next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(list_o, PyStackRef_UntagInt(null_or_index))); #endif + null_or_index = PyStackRef_IncrementTaggedIntNoOverflow(null_or_index); + stack_pointer[-1] = null_or_index; stack_pointer[0] = next; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -4393,39 +4362,29 @@ } case _ITER_CHECK_TUPLE: { + _PyStackRef null_or_index; _PyStackRef iter; - iter = stack_pointer[-1]; + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - if (Py_TYPE(iter_o) != &PyTupleIter_Type) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - #ifdef Py_GIL_DISABLED - if (!_PyObject_IsUniquelyReferenced(iter_o)) { + if (Py_TYPE(iter_o) != &PyTuple_Type) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - #endif + assert(PyStackRef_IsTaggedInt(null_or_index)); break; } /* _ITER_JUMP_TUPLE is not a viable micro-op for tier 2 because it is replaced */ case _GUARD_NOT_EXHAUSTED_TUPLE: { + _PyStackRef null_or_index; _PyStackRef iter; - iter = stack_pointer[-1]; - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyTupleIterObject *it = (_PyTupleIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyTupleIter_Type); - #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - #endif - PyTupleObject *seq = it->it_seq; - if (seq == NULL) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - if (it->it_index >= PyTuple_GET_SIZE(seq)) { + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; + PyObject *tuple_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(tuple_o) == &PyTuple_Type); + if ((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyTuple_GET_SIZE(tuple_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } @@ -4433,19 +4392,18 @@ } case _ITER_NEXT_TUPLE: { + _PyStackRef null_or_index; _PyStackRef iter; _PyStackRef next; - iter = stack_pointer[-1]; - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyTupleIterObject *it = (_PyTupleIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyTupleIter_Type); - PyTupleObject *seq = it->it_seq; - #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - #endif - assert(seq); - assert(it->it_index < PyTuple_GET_SIZE(seq)); - next = PyStackRef_FromPyObjectNew(PyTuple_GET_ITEM(seq, it->it_index++)); + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; + PyObject *tuple_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(tuple_o) == &PyTuple_Type); + uintptr_t i = PyStackRef_UntagInt(null_or_index); + assert((size_t)i < (size_t)PyTuple_GET_SIZE(tuple_o)); + next = PyStackRef_FromPyObjectNew(PyTuple_GET_ITEM(tuple_o, i)); + null_or_index = PyStackRef_IncrementTaggedIntNoOverflow(null_or_index); + stack_pointer[-1] = null_or_index; stack_pointer[0] = next; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -4454,7 +4412,7 @@ case _ITER_CHECK_RANGE: { _PyStackRef iter; - iter = stack_pointer[-1]; + 
iter = stack_pointer[-2]; _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); if (Py_TYPE(r) != &PyRangeIter_Type) { UOP_STAT_INC(uopcode, miss); @@ -4473,7 +4431,7 @@ case _GUARD_NOT_EXHAUSTED_RANGE: { _PyStackRef iter; - iter = stack_pointer[-1]; + iter = stack_pointer[-2]; _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); assert(Py_TYPE(r) == &PyRangeIter_Type); if (r->len <= 0) { @@ -4486,7 +4444,7 @@ case _ITER_NEXT_RANGE: { _PyStackRef iter; _PyStackRef next; - iter = stack_pointer[-1]; + iter = stack_pointer[-2]; _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); assert(Py_TYPE(r) == &PyRangeIter_Type); #ifdef Py_GIL_DISABLED @@ -4511,7 +4469,7 @@ _PyStackRef iter; _PyInterpreterFrame *gen_frame; oparg = CURRENT_OPARG(); - iter = stack_pointer[-1]; + iter = stack_pointer[-2]; PyGenObject *gen = (PyGenObject *)PyStackRef_AsPyObjectBorrow(iter); if (Py_TYPE(gen) != &PyGen_Type) { UOP_STAT_INC(uopcode, miss); @@ -5276,6 +5234,17 @@ break; } + case _GUARD_NOS_NOT_NULL: { + _PyStackRef nos; + nos = stack_pointer[-2]; + PyObject *o = PyStackRef_AsPyObjectBorrow(nos); + if (o == NULL) { + UOP_STAT_INC(uopcode, miss); + JUMP_TO_JUMP_TARGET(); + } + break; + } + case _GUARD_THIRD_NULL: { _PyStackRef null; null = stack_pointer[-3]; @@ -5920,6 +5889,18 @@ break; } + case _GUARD_CALLABLE_LIST_APPEND: { + _PyStackRef callable; + callable = stack_pointer[-3]; + PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); + PyInterpreterState *interp = tstate->interp; + if (callable_o != interp->callable_cache.list_append) { + UOP_STAT_INC(uopcode, miss); + JUMP_TO_JUMP_TARGET(); + } + break; + } + case _CALL_LIST_APPEND: { _PyStackRef arg; _PyStackRef self; @@ -5929,18 +5910,8 @@ self = stack_pointer[-2]; callable = stack_pointer[-3]; assert(oparg == 1); - PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); PyObject *self_o = PyStackRef_AsPyObjectBorrow(self); - PyInterpreterState *interp = tstate->interp; - if (callable_o != interp->callable_cache.list_append) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - if (self_o == NULL) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - if (!PyList_Check(self_o)) { + if (!PyList_CheckExact(self_o)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } @@ -7042,13 +7013,76 @@ case _LOAD_CONST_INLINE_BORROW: { _PyStackRef value; PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); - value = PyStackRef_FromPyObjectImmortal(ptr); + value = PyStackRef_FromPyObjectBorrow(ptr); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); break; } + case _POP_CALL: { + _PyStackRef null; + _PyStackRef callable; + null = stack_pointer[-1]; + callable = stack_pointer[-2]; + (void)null; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(callable); + stack_pointer = _PyFrame_GetStackPointer(frame); + break; + } + + case _POP_CALL_ONE: { + _PyStackRef pop; + _PyStackRef null; + _PyStackRef callable; + pop = stack_pointer[-1]; + null = stack_pointer[-2]; + callable = stack_pointer[-3]; + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(pop); + stack_pointer = _PyFrame_GetStackPointer(frame); + (void)null; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(callable); + stack_pointer = 
_PyFrame_GetStackPointer(frame); + break; + } + + case _POP_CALL_TWO: { + _PyStackRef pop2; + _PyStackRef pop1; + _PyStackRef null; + _PyStackRef callable; + pop2 = stack_pointer[-1]; + pop1 = stack_pointer[-2]; + null = stack_pointer[-3]; + callable = stack_pointer[-4]; + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(pop2); + stack_pointer = _PyFrame_GetStackPointer(frame); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(pop1); + stack_pointer = _PyFrame_GetStackPointer(frame); + (void)null; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(callable); + stack_pointer = _PyFrame_GetStackPointer(frame); + break; + } + case _POP_TOP_LOAD_CONST_INLINE_BORROW: { _PyStackRef pop; _PyStackRef value; @@ -7059,7 +7093,7 @@ _PyFrame_SetStackPointer(frame, stack_pointer); PyStackRef_CLOSE(pop); stack_pointer = _PyFrame_GetStackPointer(frame); - value = PyStackRef_FromPyObjectImmortal(ptr); + value = PyStackRef_FromPyObjectBorrow(ptr); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -7083,13 +7117,124 @@ _PyFrame_SetStackPointer(frame, stack_pointer); PyStackRef_CLOSE(pop1); stack_pointer = _PyFrame_GetStackPointer(frame); - value = PyStackRef_FromPyObjectImmortal(ptr); + value = PyStackRef_FromPyObjectBorrow(ptr); + stack_pointer[0] = value; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _POP_CALL_LOAD_CONST_INLINE_BORROW: { + _PyStackRef null; + _PyStackRef callable; + _PyStackRef value; + null = stack_pointer[-1]; + callable = stack_pointer[-2]; + PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + (void)null; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(callable); + stack_pointer = _PyFrame_GetStackPointer(frame); + value = PyStackRef_FromPyObjectBorrow(ptr); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); break; } + case _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW: { + _PyStackRef pop; + _PyStackRef null; + _PyStackRef callable; + _PyStackRef value; + pop = stack_pointer[-1]; + null = stack_pointer[-2]; + callable = stack_pointer[-3]; + PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(pop); + stack_pointer = _PyFrame_GetStackPointer(frame); + (void)null; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(callable); + stack_pointer = _PyFrame_GetStackPointer(frame); + value = PyStackRef_FromPyObjectBorrow(ptr); + stack_pointer[0] = value; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW: { + _PyStackRef pop2; + _PyStackRef pop1; + _PyStackRef null; + _PyStackRef callable; + _PyStackRef value; + pop2 = stack_pointer[-1]; + pop1 = stack_pointer[-2]; + null = stack_pointer[-3]; + callable = stack_pointer[-4]; + PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(pop2); + stack_pointer = _PyFrame_GetStackPointer(frame); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + 
PyStackRef_CLOSE(pop1); + stack_pointer = _PyFrame_GetStackPointer(frame); + (void)null; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(callable); + stack_pointer = _PyFrame_GetStackPointer(frame); + value = PyStackRef_FromPyObjectBorrow(ptr); + stack_pointer[0] = value; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _LOAD_CONST_UNDER_INLINE: { + _PyStackRef old; + _PyStackRef value; + _PyStackRef new; + old = stack_pointer[-1]; + PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + new = old; + value = PyStackRef_FromPyObjectNew(ptr); + stack_pointer[-1] = value; + stack_pointer[0] = new; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _LOAD_CONST_UNDER_INLINE_BORROW: { + _PyStackRef old; + _PyStackRef value; + _PyStackRef new; + old = stack_pointer[-1]; + PyObject *ptr = (PyObject *)CURRENT_OPERAND0(); + new = old; + value = PyStackRef_FromPyObjectBorrow(ptr); + stack_pointer[-1] = value; + stack_pointer[0] = new; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + case _CHECK_FUNCTION: { uint32_t func_version = (uint32_t)CURRENT_OPERAND0(); assert(PyStackRef_FunctionCheck(frame->f_funcobj)); diff --git a/Python/fileutils.c b/Python/fileutils.c index 78603d40704..2a3f12d4e87 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -2784,6 +2784,43 @@ error: return -1; } #else /* MS_WINDOWS */ + +// The Windows Games API family doesn't expose GetNamedPipeHandleStateW, so attempt +// to load it directly from Kernel32.dll +#if !defined(MS_WINDOWS_APP) && !defined(MS_WINDOWS_SYSTEM) +BOOL +GetNamedPipeHandleStateW(HANDLE hNamedPipe, LPDWORD lpState, LPDWORD lpCurInstances, LPDWORD lpMaxCollectionCount, + LPDWORD lpCollectDataTimeout, LPWSTR lpUserName, DWORD nMaxUserNameSize) +{ + static int initialized = 0; + typedef BOOL(__stdcall* PGetNamedPipeHandleStateW) ( + HANDLE hNamedPipe, LPDWORD lpState, LPDWORD lpCurInstances, LPDWORD lpMaxCollectionCount, + LPDWORD lpCollectDataTimeout, LPWSTR lpUserName, DWORD nMaxUserNameSize); + static PGetNamedPipeHandleStateW _GetNamedPipeHandleStateW; + + if (initialized == 0) { + HMODULE api = LoadLibraryExW(L"Kernel32.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32); + if (api) { + _GetNamedPipeHandleStateW = (PGetNamedPipeHandleStateW)GetProcAddress( + api, "GetNamedPipeHandleStateW"); + } + else { + _GetNamedPipeHandleStateW = NULL; + } + initialized = 1; + } + + if (!_GetNamedPipeHandleStateW) { + SetLastError(E_NOINTERFACE); + return FALSE; + } + + return _GetNamedPipeHandleStateW( + hNamedPipe, lpState, lpCurInstances, lpMaxCollectionCount, lpCollectDataTimeout, lpUserName, nMaxUserNameSize + ); +} +#endif /* !MS_WINDOWS_APP && !MS_WINDOWS_SYSTEM */ + int _Py_get_blocking(int fd) { diff --git a/Python/flowgraph.c b/Python/flowgraph.c index 78ef02a911a..2adc8c84d83 100644 --- a/Python/flowgraph.c +++ b/Python/flowgraph.c @@ -299,26 +299,34 @@ basicblock_returns(const basicblock *b) { } static void -dump_basicblock(const basicblock *b) +dump_basicblock(const basicblock *b, bool highlight) { const char *b_return = basicblock_returns(b) ? 
"return " : ""; + if (highlight) { + fprintf(stderr, ">>> "); + } fprintf(stderr, "%d: [EH=%d CLD=%d WRM=%d NO_FT=%d %p] used: %d, depth: %d, preds: %d %s\n", b->b_label.id, b->b_except_handler, b->b_cold, b->b_warm, BB_NO_FALLTHROUGH(b), b, b->b_iused, b->b_startdepth, b->b_predecessors, b_return); + int depth = b->b_startdepth; if (b->b_instr) { int i; for (i = 0; i < b->b_iused; i++) { - fprintf(stderr, " [%02d] ", i); + fprintf(stderr, " [%02d] depth: %d ", i, depth); dump_instr(b->b_instr + i); + + int popped = _PyOpcode_num_popped(b->b_instr[i].i_opcode, b->b_instr[i].i_oparg); + int pushed = _PyOpcode_num_pushed(b->b_instr[i].i_opcode, b->b_instr[i].i_oparg); + depth += (pushed - popped); } } } void -_PyCfgBuilder_DumpGraph(const basicblock *entryblock) +_PyCfgBuilder_DumpGraph(const basicblock *entryblock, const basicblock *mark) { for (const basicblock *b = entryblock; b != NULL; b = b->b_next) { - dump_basicblock(b); + dump_basicblock(b, b == mark); } } @@ -2862,8 +2870,11 @@ optimize_load_fast(cfg_builder *g) // how many inputs should be left on the stack. // Opcodes that consume no inputs + case FORMAT_SIMPLE: case GET_ANEXT: + case GET_ITER: case GET_LEN: + case GET_YIELD_FROM_ITER: case IMPORT_FROM: case MATCH_KEYS: case MATCH_MAPPING: @@ -2898,6 +2909,16 @@ optimize_load_fast(cfg_builder *g) break; } + case END_SEND: + case SET_FUNCTION_ATTRIBUTE: { + assert(_PyOpcode_num_popped(opcode, oparg) == 2); + assert(_PyOpcode_num_pushed(opcode, oparg) == 1); + ref tos = ref_stack_pop(&refs); + ref_stack_pop(&refs); + PUSH_REF(tos.instr, tos.local); + break; + } + // Opcodes that consume some inputs and push new values case CHECK_EXC_MATCH: { ref_stack_pop(&refs); @@ -2927,6 +2948,14 @@ optimize_load_fast(cfg_builder *g) break; } + case LOAD_SPECIAL: + case PUSH_EXC_INFO: { + ref tos = ref_stack_pop(&refs); + PUSH_REF(i, NOT_LOCAL); + PUSH_REF(tos.instr, tos.local); + break; + } + case SEND: { load_fast_push_block(&sp, instr->i_target, refs.size); ref_stack_pop(&refs); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index d2ea5b5e06b..5aaa68c5b51 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -2062,7 +2062,7 @@ gc_should_collect_mem_usage(GCState *gcstate) // 70,000 new container objects. return true; } - Py_ssize_t last_mem = gcstate->last_mem; + Py_ssize_t last_mem = _Py_atomic_load_ssize_relaxed(&gcstate->last_mem); Py_ssize_t mem_threshold = Py_MAX(last_mem / 10, 128); if ((mem - last_mem) > mem_threshold) { // The process memory usage has increased too much, do a collection. @@ -2245,7 +2245,8 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, // Store the current memory usage, can be smaller now if breaking cycles // freed some memory. - state->gcstate->last_mem = get_process_mem_usage(); + Py_ssize_t last_mem = get_process_mem_usage(); + _Py_atomic_store_ssize_relaxed(&state->gcstate->last_mem, last_mem); // Append objects with legacy finalizers to the "gc.garbage" list. 
handle_legacy_finalizers(state); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index b3f2a2067f7..c8825df3ade 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -905,7 +905,7 @@ _PyFrame_SetStackPointer(frame, stack_pointer); PyStackRef_CLOSE(str_st); stack_pointer = _PyFrame_GetStackPointer(frame); - res = PyStackRef_FromPyObjectImmortal(res_o); + res = PyStackRef_FromPyObjectBorrow(res_o); } stack_pointer[0] = res; stack_pointer += 1; @@ -3475,58 +3475,79 @@ INSTRUCTION_STATS(CALL_LIST_APPEND); static_assert(INLINE_CACHE_ENTRIES_CALL == 3, "incorrect cache size"); _PyStackRef callable; + _PyStackRef nos; _PyStackRef self; _PyStackRef arg; /* Skip 1 cache entry */ /* Skip 2 cache entries */ - arg = stack_pointer[-1]; - self = stack_pointer[-2]; - callable = stack_pointer[-3]; - assert(oparg == 1); - PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); - PyObject *self_o = PyStackRef_AsPyObjectBorrow(self); - PyInterpreterState *interp = tstate->interp; - if (callable_o != interp->callable_cache.list_append) { - UPDATE_MISS_STATS(CALL); - assert(_PyOpcode_Deopt[opcode] == (CALL)); - JUMP_TO_PREDICTED(CALL); - } - if (self_o == NULL) { - UPDATE_MISS_STATS(CALL); - assert(_PyOpcode_Deopt[opcode] == (CALL)); - JUMP_TO_PREDICTED(CALL); - } - if (!PyList_Check(self_o)) { - UPDATE_MISS_STATS(CALL); - assert(_PyOpcode_Deopt[opcode] == (CALL)); - JUMP_TO_PREDICTED(CALL); - } - if (!LOCK_OBJECT(self_o)) { - UPDATE_MISS_STATS(CALL); - assert(_PyOpcode_Deopt[opcode] == (CALL)); - JUMP_TO_PREDICTED(CALL); - } - STAT_INC(CALL, hit); - int err = _PyList_AppendTakeRef((PyListObject *)self_o, PyStackRef_AsPyObjectSteal(arg)); - UNLOCK_OBJECT(self_o); - stack_pointer += -2; - assert(WITHIN_STACK_BOUNDS()); - _PyFrame_SetStackPointer(frame, stack_pointer); - PyStackRef_CLOSE(self); - stack_pointer = _PyFrame_GetStackPointer(frame); - stack_pointer += -1; - assert(WITHIN_STACK_BOUNDS()); - _PyFrame_SetStackPointer(frame, stack_pointer); - PyStackRef_CLOSE(callable); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (err) { - JUMP_TO_LABEL(error); + // _GUARD_CALLABLE_LIST_APPEND + { + callable = stack_pointer[-3]; + PyObject *callable_o = PyStackRef_AsPyObjectBorrow(callable); + PyInterpreterState *interp = tstate->interp; + if (callable_o != interp->callable_cache.list_append) { + UPDATE_MISS_STATS(CALL); + assert(_PyOpcode_Deopt[opcode] == (CALL)); + JUMP_TO_PREDICTED(CALL); + } } - #if TIER_ONE + // _GUARD_NOS_NOT_NULL + { + nos = stack_pointer[-2]; + PyObject *o = PyStackRef_AsPyObjectBorrow(nos); + if (o == NULL) { + UPDATE_MISS_STATS(CALL); + assert(_PyOpcode_Deopt[opcode] == (CALL)); + JUMP_TO_PREDICTED(CALL); + } + } + // _GUARD_NOS_LIST + { + PyObject *o = PyStackRef_AsPyObjectBorrow(nos); + if (!PyList_CheckExact(o)) { + UPDATE_MISS_STATS(CALL); + assert(_PyOpcode_Deopt[opcode] == (CALL)); + JUMP_TO_PREDICTED(CALL); + } + } + // _CALL_LIST_APPEND + { + arg = stack_pointer[-1]; + self = nos; + assert(oparg == 1); + PyObject *self_o = PyStackRef_AsPyObjectBorrow(self); + if (!PyList_CheckExact(self_o)) { + UPDATE_MISS_STATS(CALL); + assert(_PyOpcode_Deopt[opcode] == (CALL)); + JUMP_TO_PREDICTED(CALL); + } + if (!LOCK_OBJECT(self_o)) { + UPDATE_MISS_STATS(CALL); + assert(_PyOpcode_Deopt[opcode] == (CALL)); + JUMP_TO_PREDICTED(CALL); + } + STAT_INC(CALL, hit); + int err = _PyList_AppendTakeRef((PyListObject *)self_o, PyStackRef_AsPyObjectSteal(arg)); + UNLOCK_OBJECT(self_o); + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + 
_PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(self); + stack_pointer = _PyFrame_GetStackPointer(frame); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(callable); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (err) { + JUMP_TO_LABEL(error); + } + #if TIER_ONE - assert(next_instr->op.code == POP_TOP); - SKIP_OVER(1); - #endif + assert(next_instr->op.code == POP_TOP); + SKIP_OVER(1); + #endif + } DISPATCH(); } @@ -5710,17 +5731,19 @@ _Py_CODEUNIT* const this_instr = next_instr - 2; (void)this_instr; _PyStackRef iter; + _PyStackRef null_or_index; _PyStackRef next; // _SPECIALIZE_FOR_ITER { - iter = stack_pointer[-1]; + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); - _Py_Specialize_ForIter(iter, next_instr, oparg); + _Py_Specialize_ForIter(iter, null_or_index, next_instr, oparg); stack_pointer = _PyFrame_GetStackPointer(frame); DISPATCH_SAME_OPARG(); } @@ -5730,30 +5753,20 @@ } // _FOR_ITER { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); _PyFrame_SetStackPointer(frame, stack_pointer); - PyObject *next_o = (*Py_TYPE(iter_o)->tp_iternext)(iter_o); + _PyStackRef item = _PyForIter_VirtualIteratorNext(tstate, frame, iter, &null_or_index); stack_pointer = _PyFrame_GetStackPointer(frame); - if (next_o == NULL) { - if (_PyErr_Occurred(tstate)) { - _PyFrame_SetStackPointer(frame, stack_pointer); - int matches = _PyErr_ExceptionMatches(tstate, PyExc_StopIteration); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (!matches) { - JUMP_TO_LABEL(error); - } - _PyFrame_SetStackPointer(frame, stack_pointer); - _PyEval_MonitorRaise(tstate, frame, this_instr); - _PyErr_Clear(tstate); - stack_pointer = _PyFrame_GetStackPointer(frame); + if (!PyStackRef_IsValid(item)) { + if (PyStackRef_IsError(item)) { + JUMP_TO_LABEL(error); } - assert(next_instr[oparg].op.code == END_FOR || - next_instr[oparg].op.code == INSTRUMENTED_END_FOR); JUMPBY(oparg + 1); + stack_pointer[-1] = null_or_index; DISPATCH(); } - next = PyStackRef_FromPyObjectSteal(next_o); + next = item; } + stack_pointer[-1] = null_or_index; stack_pointer[0] = next; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -5785,7 +5798,7 @@ } // _FOR_ITER_GEN_FRAME { - iter = stack_pointer[-1]; + iter = stack_pointer[-2]; PyGenObject *gen = (PyGenObject *)PyStackRef_AsPyObjectBorrow(iter); if (Py_TYPE(gen) != &PyGen_Type) { UPDATE_MISS_STATS(FOR_ITER); @@ -5842,26 +5855,22 @@ INSTRUCTION_STATS(FOR_ITER_LIST); static_assert(INLINE_CACHE_ENTRIES_FOR_ITER == 1, "incorrect cache size"); _PyStackRef iter; + _PyStackRef null_or_index; _PyStackRef next; /* Skip 1 cache entry */ // _ITER_CHECK_LIST { - iter = stack_pointer[-1]; + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - if (Py_TYPE(iter_o) != &PyListIter_Type) { + if (Py_TYPE(iter_o) != &PyList_Type) { UPDATE_MISS_STATS(FOR_ITER); assert(_PyOpcode_Deopt[opcode] == (FOR_ITER)); JUMP_TO_PREDICTED(FOR_ITER); } + assert(PyStackRef_IsTaggedInt(null_or_index)); #ifdef Py_GIL_DISABLED - if (!_PyObject_IsUniquelyReferenced(iter_o)) { - UPDATE_MISS_STATS(FOR_ITER); - assert(_PyOpcode_Deopt[opcode] == (FOR_ITER)); - JUMP_TO_PREDICTED(FOR_ITER); - } - _PyListIterObject *it = (_PyListIterObject *)iter_o; - 
if (!_Py_IsOwnedByCurrentThread((PyObject *)it->it_seq) || - !_PyObject_GC_IS_SHARED(it->it_seq)) { + if (!_Py_IsOwnedByCurrentThread(iter_o) && !_PyObject_GC_IS_SHARED(iter_o)) { UPDATE_MISS_STATS(FOR_ITER); assert(_PyOpcode_Deopt[opcode] == (FOR_ITER)); JUMP_TO_PREDICTED(FOR_ITER); @@ -5870,42 +5879,30 @@ } // _ITER_JUMP_LIST { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - assert(Py_TYPE(iter_o) == &PyListIter_Type); #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - (void)iter_o; + #else - _PyListIterObject *it = (_PyListIterObject *)iter_o; + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(list_o) == &PyList_Type); STAT_INC(FOR_ITER, hit); - PyListObject *seq = it->it_seq; - if (seq == NULL || (size_t)it->it_index >= (size_t)PyList_GET_SIZE(seq)) { - it->it_index = -1; - if (seq != NULL) { - it->it_seq = NULL; - _PyFrame_SetStackPointer(frame, stack_pointer); - Py_DECREF(seq); - stack_pointer = _PyFrame_GetStackPointer(frame); - } + if ((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyList_GET_SIZE(list_o)) { + null_or_index = PyStackRef_TagInt(-1); JUMPBY(oparg + 1); + stack_pointer[-1] = null_or_index; DISPATCH(); } #endif } // _ITER_NEXT_LIST { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyListIterObject *it = (_PyListIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyListIter_Type); - PyListObject *seq = it->it_seq; - assert(seq); + PyObject *list_o = PyStackRef_AsPyObjectBorrow(iter); + assert(PyList_CheckExact(list_o)); #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - assert(_Py_IsOwnedByCurrentThread((PyObject *)seq) || - _PyObject_GC_IS_SHARED(seq)); + assert(_Py_IsOwnedByCurrentThread(list_o) || + _PyObject_GC_IS_SHARED(list_o)); STAT_INC(FOR_ITER, hit); _PyFrame_SetStackPointer(frame, stack_pointer); - int result = _PyList_GetItemRefNoLock(seq, it->it_index, &next); + int result = _PyList_GetItemRefNoLock((PyListObject *)list_o, PyStackRef_UntagInt(null_or_index), &next); stack_pointer = _PyFrame_GetStackPointer(frame); if (result < 0) { UPDATE_MISS_STATS(FOR_ITER); @@ -5913,16 +5910,17 @@ JUMP_TO_PREDICTED(FOR_ITER); } if (result == 0) { - it->it_index = -1; + null_or_index = PyStackRef_TagInt(-1); JUMPBY(oparg + 1); + stack_pointer[-1] = null_or_index; DISPATCH(); } - it->it_index++; #else - assert(it->it_index < PyList_GET_SIZE(seq)); - next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(seq, it->it_index++)); + next = PyStackRef_FromPyObjectNew(PyList_GET_ITEM(list_o, PyStackRef_UntagInt(null_or_index))); #endif + null_or_index = PyStackRef_IncrementTaggedIntNoOverflow(null_or_index); } + stack_pointer[-1] = null_or_index; stack_pointer[0] = next; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -5945,7 +5943,7 @@ /* Skip 1 cache entry */ // _ITER_CHECK_RANGE { - iter = stack_pointer[-1]; + iter = stack_pointer[-2]; _PyRangeIterObject *r = (_PyRangeIterObject *)PyStackRef_AsPyObjectBorrow(iter); if (Py_TYPE(r) != &PyRangeIter_Type) { UPDATE_MISS_STATS(FOR_ITER); @@ -6008,63 +6006,44 @@ INSTRUCTION_STATS(FOR_ITER_TUPLE); static_assert(INLINE_CACHE_ENTRIES_FOR_ITER == 1, "incorrect cache size"); _PyStackRef iter; + _PyStackRef null_or_index; _PyStackRef next; /* Skip 1 cache entry */ // _ITER_CHECK_TUPLE { - iter = stack_pointer[-1]; + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - if (Py_TYPE(iter_o) != &PyTupleIter_Type) { - UPDATE_MISS_STATS(FOR_ITER); - assert(_PyOpcode_Deopt[opcode] == 
(FOR_ITER)); - JUMP_TO_PREDICTED(FOR_ITER); - } - #ifdef Py_GIL_DISABLED - if (!_PyObject_IsUniquelyReferenced(iter_o)) { + if (Py_TYPE(iter_o) != &PyTuple_Type) { UPDATE_MISS_STATS(FOR_ITER); assert(_PyOpcode_Deopt[opcode] == (FOR_ITER)); JUMP_TO_PREDICTED(FOR_ITER); } - #endif + assert(PyStackRef_IsTaggedInt(null_or_index)); } // _ITER_JUMP_TUPLE { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - (void)iter_o; - assert(Py_TYPE(iter_o) == &PyTupleIter_Type); - #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - #endif - _PyTupleIterObject *it = (_PyTupleIterObject *)iter_o; + PyObject *tuple_o = PyStackRef_AsPyObjectBorrow(iter); + (void)tuple_o; + assert(Py_TYPE(tuple_o) == &PyTuple_Type); STAT_INC(FOR_ITER, hit); - PyTupleObject *seq = it->it_seq; - if (seq == NULL || (size_t)it->it_index >= (size_t)PyTuple_GET_SIZE(seq)) { - #ifndef Py_GIL_DISABLED - if (seq != NULL) { - it->it_seq = NULL; - _PyFrame_SetStackPointer(frame, stack_pointer); - Py_DECREF(seq); - stack_pointer = _PyFrame_GetStackPointer(frame); - } - #endif - + if ((size_t)PyStackRef_UntagInt(null_or_index) >= (size_t)PyTuple_GET_SIZE(tuple_o)) { + null_or_index = PyStackRef_TagInt(-1); JUMPBY(oparg + 1); + stack_pointer[-1] = null_or_index; DISPATCH(); } } // _ITER_NEXT_TUPLE { - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); - _PyTupleIterObject *it = (_PyTupleIterObject *)iter_o; - assert(Py_TYPE(iter_o) == &PyTupleIter_Type); - PyTupleObject *seq = it->it_seq; - #ifdef Py_GIL_DISABLED - assert(_PyObject_IsUniquelyReferenced(iter_o)); - #endif - assert(seq); - assert(it->it_index < PyTuple_GET_SIZE(seq)); - next = PyStackRef_FromPyObjectNew(PyTuple_GET_ITEM(seq, it->it_index++)); + PyObject *tuple_o = PyStackRef_AsPyObjectBorrow(iter); + assert(Py_TYPE(tuple_o) == &PyTuple_Type); + uintptr_t i = PyStackRef_UntagInt(null_or_index); + assert((size_t)i < (size_t)PyTuple_GET_SIZE(tuple_o)); + next = PyStackRef_FromPyObjectNew(PyTuple_GET_ITEM(tuple_o, i)); + null_or_index = PyStackRef_IncrementTaggedIntNoOverflow(null_or_index); } + stack_pointer[-1] = null_or_index; stack_pointer[0] = next; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -6195,25 +6174,37 @@ INSTRUCTION_STATS(GET_ITER); _PyStackRef iterable; _PyStackRef iter; + _PyStackRef index_or_null; iterable = stack_pointer[-1]; #ifdef Py_STATS _PyFrame_SetStackPointer(frame, stack_pointer); _Py_GatherStats_GetIter(iterable); stack_pointer = _PyFrame_GetStackPointer(frame); #endif - _PyFrame_SetStackPointer(frame, stack_pointer); - PyObject *iter_o = PyObject_GetIter(PyStackRef_AsPyObjectBorrow(iterable)); - stack_pointer = _PyFrame_GetStackPointer(frame); - stack_pointer += -1; - assert(WITHIN_STACK_BOUNDS()); - _PyFrame_SetStackPointer(frame, stack_pointer); - PyStackRef_CLOSE(iterable); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (iter_o == NULL) { - JUMP_TO_LABEL(error); + + PyTypeObject *tp = PyStackRef_TYPE(iterable); + if (tp == &PyTuple_Type || tp == &PyList_Type) { + iter = iterable; + index_or_null = PyStackRef_TagInt(0); } - iter = PyStackRef_FromPyObjectSteal(iter_o); - stack_pointer[0] = iter; + else { + _PyFrame_SetStackPointer(frame, stack_pointer); + PyObject *iter_o = PyObject_GetIter(PyStackRef_AsPyObjectBorrow(iterable)); + stack_pointer = _PyFrame_GetStackPointer(frame); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + PyStackRef_CLOSE(iterable); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (iter_o == NULL) { + 
JUMP_TO_LABEL(error); + } + iter = PyStackRef_FromPyObjectSteal(iter_o); + index_or_null = PyStackRef_NULL; + stack_pointer += 1; + } + stack_pointer[-1] = iter; + stack_pointer[0] = index_or_null; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); DISPATCH(); @@ -6978,7 +6969,7 @@ _PyStackRef receiver; _PyStackRef value; value = stack_pointer[-1]; - receiver = stack_pointer[-2]; + receiver = stack_pointer[-3]; if (PyStackRef_GenCheck(receiver)) { _PyFrame_SetStackPointer(frame, stack_pointer); int err = monitor_stop_iteration(tstate, frame, this_instr, PyStackRef_AsPyObjectBorrow(value)); @@ -7040,35 +7031,25 @@ next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_FOR_ITER); _PyStackRef iter; + _PyStackRef null_or_index; _PyStackRef next; /* Skip 1 cache entry */ - iter = stack_pointer[-1]; - PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); + null_or_index = stack_pointer[-1]; + iter = stack_pointer[-2]; _PyFrame_SetStackPointer(frame, stack_pointer); - PyObject *next_o = (*Py_TYPE(iter_o)->tp_iternext)(iter_o); + _PyStackRef item = _PyForIter_VirtualIteratorNext(tstate, frame, iter, &null_or_index); stack_pointer = _PyFrame_GetStackPointer(frame); - if (next_o != NULL) { - next = PyStackRef_FromPyObjectSteal(next_o); - INSTRUMENTED_JUMP(this_instr, next_instr, PY_MONITORING_EVENT_BRANCH_LEFT); - } - else { - if (_PyErr_Occurred(tstate)) { - _PyFrame_SetStackPointer(frame, stack_pointer); - int matches = _PyErr_ExceptionMatches(tstate, PyExc_StopIteration); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (!matches) { - JUMP_TO_LABEL(error); - } - _PyFrame_SetStackPointer(frame, stack_pointer); - _PyEval_MonitorRaise(tstate, frame, this_instr); - _PyErr_Clear(tstate); - stack_pointer = _PyFrame_GetStackPointer(frame); + if (!PyStackRef_IsValid(item)) { + if (PyStackRef_IsError(item)) { + JUMP_TO_LABEL(error); } - assert(next_instr[oparg].op.code == END_FOR || - next_instr[oparg].op.code == INSTRUMENTED_END_FOR); JUMPBY(oparg + 1); + stack_pointer[-1] = null_or_index; DISPATCH(); } + next = item; + INSTRUMENTED_JUMP(this_instr, next_instr, PY_MONITORING_EVENT_BRANCH_LEFT); + stack_pointer[-1] = null_or_index; stack_pointer[0] = next; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -7335,9 +7316,12 @@ next_instr += 1; INSTRUCTION_STATS(INSTRUMENTED_POP_ITER); _PyStackRef iter; - iter = stack_pointer[-1]; + _PyStackRef index_or_null; + index_or_null = stack_pointer[-1]; + iter = stack_pointer[-2]; + (void)index_or_null; INSTRUMENTED_JUMP(prev_instr, this_instr+1, PY_MONITORING_EVENT_BRANCH_RIGHT); - stack_pointer += -1; + stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); PyStackRef_CLOSE(iter); @@ -8981,63 +8965,9 @@ frame->instr_ptr = next_instr; next_instr += 1; INSTRUCTION_STATS(LOAD_CONST); - PREDICTED_LOAD_CONST:; - _Py_CODEUNIT* const this_instr = next_instr - 1; - (void)this_instr; - _PyStackRef value; - PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - value = PyStackRef_FromPyObjectNew(obj); - #if ENABLE_SPECIALIZATION_FT - #ifdef Py_GIL_DISABLED - uint8_t expected = LOAD_CONST; - if (!_Py_atomic_compare_exchange_uint8( - &this_instr->op.code, &expected, - _Py_IsImmortal(obj) ? LOAD_CONST_IMMORTAL : LOAD_CONST_MORTAL)) { - assert(expected >= MIN_INSTRUMENTED_OPCODE); - } - #else - if (this_instr->op.code == LOAD_CONST) { - this_instr->op.code = _Py_IsImmortal(obj) ? 
LOAD_CONST_IMMORTAL : LOAD_CONST_MORTAL; - } - #endif - #endif - stack_pointer[0] = value; - stack_pointer += 1; - assert(WITHIN_STACK_BOUNDS()); - DISPATCH(); - } - - TARGET(LOAD_CONST_IMMORTAL) { - #if Py_TAIL_CALL_INTERP - int opcode = LOAD_CONST_IMMORTAL; - (void)(opcode); - #endif - frame->instr_ptr = next_instr; - next_instr += 1; - INSTRUCTION_STATS(LOAD_CONST_IMMORTAL); - static_assert(0 == 0, "incorrect cache size"); - _PyStackRef value; - PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - assert(_Py_IsImmortal(obj)); - value = PyStackRef_FromPyObjectImmortal(obj); - stack_pointer[0] = value; - stack_pointer += 1; - assert(WITHIN_STACK_BOUNDS()); - DISPATCH(); - } - - TARGET(LOAD_CONST_MORTAL) { - #if Py_TAIL_CALL_INTERP - int opcode = LOAD_CONST_MORTAL; - (void)(opcode); - #endif - frame->instr_ptr = next_instr; - next_instr += 1; - INSTRUCTION_STATS(LOAD_CONST_MORTAL); - static_assert(0 == 0, "incorrect cache size"); _PyStackRef value; PyObject *obj = GETITEM(FRAME_CO_CONSTS, oparg); - value = PyStackRef_FromPyObjectNewMortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -9572,7 +9502,7 @@ _PyStackRef value; assert(oparg < _PY_NSMALLPOSINTS); PyObject *obj = (PyObject *)&_PyLong_SMALL_INTS[_PY_NSMALLNEGINTS + oparg]; - value = PyStackRef_FromPyObjectImmortal(obj); + value = PyStackRef_FromPyObjectBorrow(obj); stack_pointer[0] = value; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -10155,12 +10085,15 @@ frame->instr_ptr = next_instr; next_instr += 1; INSTRUCTION_STATS(POP_ITER); - _PyStackRef value; - value = stack_pointer[-1]; - stack_pointer += -1; + _PyStackRef iter; + _PyStackRef index_or_null; + index_or_null = stack_pointer[-1]; + iter = stack_pointer[-2]; + (void)index_or_null; + stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); - PyStackRef_CLOSE(value); + PyStackRef_CLOSE(iter); stack_pointer = _PyFrame_GetStackPointer(frame); DISPATCH(); } @@ -10308,7 +10241,7 @@ stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); _PyFrame_SetStackPointer(frame, stack_pointer); - PyStackRef_CLOSE(value); + PyStackRef_XCLOSE(value); stack_pointer = _PyFrame_GetStackPointer(frame); DISPATCH(); } @@ -11219,10 +11152,6 @@ INSTRUCTION_STATS(STORE_FAST); _PyStackRef value; value = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value) - ); _PyStackRef tmp = GETLOCAL(oparg); GETLOCAL(oparg) = value; stack_pointer += -1; @@ -11244,10 +11173,6 @@ _PyStackRef value1; _PyStackRef value2; value1 = stack_pointer[-1]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value1) - ); uint32_t oparg1 = oparg >> 4; uint32_t oparg2 = oparg & 15; _PyStackRef tmp = GETLOCAL(oparg1); @@ -11272,14 +11197,6 @@ _PyStackRef value1; value1 = stack_pointer[-1]; value2 = stack_pointer[-2]; - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value1) - ); - assert( - ((_PyFrame_GetCode(frame)->co_flags & (CO_COROUTINE | CO_GENERATOR)) == 0) || - PyStackRef_IsHeapSafe(value2) - ); uint32_t oparg1 = oparg >> 4; uint32_t oparg2 = oparg & 15; _PyStackRef tmp = GETLOCAL(oparg1); diff --git a/Python/hamt.c b/Python/hamt.c index f9bbf63961d..906149cc6cd 100644 --- a/Python/hamt.c +++ b/Python/hamt.c @@ -1176,7 +1176,7 @@ hamt_node_bitmap_dump(PyHamtNode_Bitmap *node, } if 
(key_or_null == NULL) { - if (PyUnicodeWriter_WriteUTF8(writer, "NULL:\n", -1) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "NULL:\n", 6) < 0) { goto error; } @@ -1194,7 +1194,7 @@ hamt_node_bitmap_dump(PyHamtNode_Bitmap *node, } } - if (PyUnicodeWriter_WriteUTF8(writer, "\n", 1) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "\n", 1) < 0) { goto error; } } @@ -1915,7 +1915,7 @@ hamt_node_array_dump(PyHamtNode_Array *node, goto error; } - if (PyUnicodeWriter_WriteUTF8(writer, "\n", 1) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "\n", 1) < 0) { goto error; } } diff --git a/Python/import.c b/Python/import.c index 9dec0f488a3..184dede335d 100644 --- a/Python/import.c +++ b/Python/import.c @@ -3369,11 +3369,11 @@ PyObject * PyImport_GetImporter(PyObject *path) { PyThreadState *tstate = _PyThreadState_GET(); - PyObject *path_importer_cache = _PySys_GetRequiredAttrString("path_importer_cache"); + PyObject *path_importer_cache = PySys_GetAttrString("path_importer_cache"); if (path_importer_cache == NULL) { return NULL; } - PyObject *path_hooks = _PySys_GetRequiredAttrString("path_hooks"); + PyObject *path_hooks = PySys_GetAttrString("path_hooks"); if (path_hooks == NULL) { Py_DECREF(path_importer_cache); return NULL; @@ -3682,14 +3682,14 @@ import_find_and_load(PyThreadState *tstate, PyObject *abs_name) PyTime_t t1 = 0, accumulated_copy = accumulated; PyObject *sys_path, *sys_meta_path, *sys_path_hooks; - if (_PySys_GetOptionalAttrString("path", &sys_path) < 0) { + if (PySys_GetOptionalAttrString("path", &sys_path) < 0) { return NULL; } - if (_PySys_GetOptionalAttrString("meta_path", &sys_meta_path) < 0) { + if (PySys_GetOptionalAttrString("meta_path", &sys_meta_path) < 0) { Py_XDECREF(sys_path); return NULL; } - if (_PySys_GetOptionalAttrString("path_hooks", &sys_path_hooks) < 0) { + if (PySys_GetOptionalAttrString("path_hooks", &sys_path_hooks) < 0) { Py_XDECREF(sys_meta_path); Py_XDECREF(sys_path); return NULL; @@ -3854,15 +3854,17 @@ PyImport_ImportModuleLevelObject(PyObject *name, PyObject *globals, } final_mod = import_get_module(tstate, to_return); - Py_DECREF(to_return); if (final_mod == NULL) { if (!_PyErr_Occurred(tstate)) { _PyErr_Format(tstate, PyExc_KeyError, "%R not in sys.modules as expected", to_return); } + Py_DECREF(to_return); goto error; } + + Py_DECREF(to_return); } } else { @@ -3962,8 +3964,10 @@ PyImport_Import(PyObject *module_name) if (globals != NULL) { Py_INCREF(globals); builtins = PyObject_GetItem(globals, &_Py_ID(__builtins__)); - if (builtins == NULL) + if (builtins == NULL) { + // XXX Fall back to interp->builtins or sys.modules['builtins']? 
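Stepping back from this hunk: the _PySys_Get* to PySys_Get* renames running through import.c (and the other files in this diff) switch to the now-public accessors. A small sketch of the two call shapes as the surrounding code uses them; judging by the Py_DECREF/Py_XDECREF calls at these call sites, both hand back a new reference (check_sys_attrs is a hypothetical helper, for illustration only):

    #include <Python.h>

    static int
    check_sys_attrs(void)
    {
        /* Required variant: NULL return means an exception is already set. */
        PyObject *hooks = PySys_GetAttrString("path_hooks");
        if (hooks == NULL) {
            return -1;
        }
        Py_DECREF(hooks);

        /* Optional variant: -1 = error, 0 = missing (no exception), 1 = found. */
        PyObject *path = NULL;
        if (PySys_GetOptionalAttrString("path", &path) < 0) {
            return -1;
        }
        Py_XDECREF(path);  /* may be NULL when the attribute is simply absent */
        return 0;
    }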
goto err; + } } else { /* No globals -- use standard builtins, and fake globals */ @@ -4125,7 +4129,7 @@ _PyImport_FiniCore(PyInterpreterState *interp) static int init_zipimport(PyThreadState *tstate, int verbose) { - PyObject *path_hooks = _PySys_GetRequiredAttrString("path_hooks"); + PyObject *path_hooks = PySys_GetAttrString("path_hooks"); if (path_hooks == NULL) { return -1; } diff --git a/Python/index_pool.c b/Python/index_pool.c index 007c81a0fc1..520a65938ec 100644 --- a/Python/index_pool.c +++ b/Python/index_pool.c @@ -172,6 +172,9 @@ _PyIndexPool_AllocIndex(_PyIndexPool *pool) else { index = heap_pop(free_indices); } + + pool->tlbc_generation++; + UNLOCK_POOL(pool); return index; } @@ -180,6 +183,7 @@ void _PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index) { LOCK_POOL(pool); + pool->tlbc_generation++; heap_add(&pool->free_indices, index); UNLOCK_POOL(pool); } diff --git a/Python/initconfig.c b/Python/initconfig.c index 25e30aa648e..71d7cfed5c4 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -3647,7 +3647,7 @@ _Py_DumpPathConfig(PyThreadState *tstate) #define DUMP_SYS(NAME) \ do { \ PySys_FormatStderr(" sys.%s = ", #NAME); \ - if (_PySys_GetOptionalAttrString(#NAME, &obj) < 0) { \ + if (PySys_GetOptionalAttrString(#NAME, &obj) < 0) { \ PyErr_Clear(); \ } \ if (obj != NULL) { \ @@ -3671,7 +3671,7 @@ _Py_DumpPathConfig(PyThreadState *tstate) #undef DUMP_SYS PyObject *sys_path; - (void) _PySys_GetOptionalAttrString("path", &sys_path); + (void) PySys_GetOptionalAttrString("path", &sys_path); if (sys_path != NULL && PyList_Check(sys_path)) { PySys_WriteStderr(" sys.path = [\n"); Py_ssize_t len = PyList_GET_SIZE(sys_path); @@ -4294,7 +4294,7 @@ _PyConfig_CreateXOptionsDict(const PyConfig *config) static int config_get_sys_write_bytecode(const PyConfig *config, int *value) { - PyObject *attr = _PySys_GetRequiredAttrString("dont_write_bytecode"); + PyObject *attr = PySys_GetAttrString("dont_write_bytecode"); if (attr == NULL) { return -1; } @@ -4315,7 +4315,7 @@ config_get(const PyConfig *config, const PyConfigSpec *spec, { if (use_sys) { if (spec->sys.attr != NULL) { - return _PySys_GetRequiredAttrString(spec->sys.attr); + return PySys_GetAttrString(spec->sys.attr); } if (strcmp(spec->name, "write_bytecode") == 0) { diff --git a/Python/intrinsics.c b/Python/intrinsics.c index ff44ba0ee64..8ea920e690c 100644 --- a/Python/intrinsics.c +++ b/Python/intrinsics.c @@ -9,7 +9,6 @@ #include "pycore_intrinsics.h" // INTRINSIC_PRINT #include "pycore_pyerrors.h" // _PyErr_SetString() #include "pycore_runtime.h" // _Py_ID() -#include "pycore_sysmodule.h" // _PySys_GetRequiredAttr() #include "pycore_tuple.h" // _PyTuple_FromArray() #include "pycore_typevarobject.h" // _Py_make_typevar() #include "pycore_unicodeobject.h" // _PyUnicode_FromASCII() @@ -27,7 +26,7 @@ no_intrinsic1(PyThreadState* tstate, PyObject *unused) static PyObject * print_expr(PyThreadState* Py_UNUSED(ignored), PyObject *value) { - PyObject *hook = _PySys_GetRequiredAttr(&_Py_ID(displayhook)); + PyObject *hook = PySys_GetAttr(&_Py_ID(displayhook)); if (hook == NULL) { return NULL; } diff --git a/Python/lock.c b/Python/lock.c index 28a12ad1835..b125ad0c9e3 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -119,6 +119,9 @@ _PyMutex_LockTimed(PyMutex *m, PyTime_t timeout, _PyLockFlags flags) return PY_LOCK_INTR; } } + else if (ret == Py_PARK_INTR && (flags & _PY_FAIL_IF_INTERRUPTED)) { + return PY_LOCK_INTR; + } else if (ret == Py_PARK_TIMEOUT) { assert(timeout >= 0); return PY_LOCK_FAILURE; diff --git 
a/Python/modsupport.c b/Python/modsupport.c index 2caf595949d..437ad412027 100644 --- a/Python/modsupport.c +++ b/Python/modsupport.c @@ -669,5 +669,5 @@ Py_PACK_FULL_VERSION(int x, int y, int z, int level, int serial) uint32_t Py_PACK_VERSION(int x, int y) { - return Py_PACK_FULL_VERSION(x, y, 0, 0, 0); + return _Py_PACK_VERSION(x, y); } diff --git a/Python/opcode_targets.h b/Python/opcode_targets.h index 8af445d7d6a..1d6dcddab4b 100644 --- a/Python/opcode_targets.h +++ b/Python/opcode_targets.h @@ -190,8 +190,6 @@ static void *opcode_targets[256] = { &&TARGET_LOAD_ATTR_PROPERTY, &&TARGET_LOAD_ATTR_SLOT, &&TARGET_LOAD_ATTR_WITH_HINT, - &&TARGET_LOAD_CONST_IMMORTAL, - &&TARGET_LOAD_CONST_MORTAL, &&TARGET_LOAD_GLOBAL_BUILTIN, &&TARGET_LOAD_GLOBAL_MODULE, &&TARGET_LOAD_SUPER_ATTR_ATTR, @@ -234,6 +232,8 @@ static void *opcode_targets[256] = { &&_unknown_opcode, &&_unknown_opcode, &&_unknown_opcode, + &&_unknown_opcode, + &&_unknown_opcode, &&TARGET_INSTRUMENTED_END_FOR, &&TARGET_INSTRUMENTED_POP_ITER, &&TARGET_INSTRUMENTED_END_SEND, @@ -410,8 +410,6 @@ Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_ATTR_WITH_HINT(TAIL_CALL_PA Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_BUILD_CLASS(TAIL_CALL_PARAMS); Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_COMMON_CONSTANT(TAIL_CALL_PARAMS); Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_CONST(TAIL_CALL_PARAMS); -Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_CONST_IMMORTAL(TAIL_CALL_PARAMS); -Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_CONST_MORTAL(TAIL_CALL_PARAMS); Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_DEREF(TAIL_CALL_PARAMS); Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_FAST(TAIL_CALL_PARAMS); Py_PRESERVE_NONE_CC static PyObject *_TAIL_CALL_LOAD_FAST_AND_CLEAR(TAIL_CALL_PARAMS); @@ -649,8 +647,6 @@ static py_tail_call_funcptr INSTRUCTION_TABLE[256] = { [LOAD_BUILD_CLASS] = _TAIL_CALL_LOAD_BUILD_CLASS, [LOAD_COMMON_CONSTANT] = _TAIL_CALL_LOAD_COMMON_CONSTANT, [LOAD_CONST] = _TAIL_CALL_LOAD_CONST, - [LOAD_CONST_IMMORTAL] = _TAIL_CALL_LOAD_CONST_IMMORTAL, - [LOAD_CONST_MORTAL] = _TAIL_CALL_LOAD_CONST_MORTAL, [LOAD_DEREF] = _TAIL_CALL_LOAD_DEREF, [LOAD_FAST] = _TAIL_CALL_LOAD_FAST, [LOAD_FAST_AND_CLEAR] = _TAIL_CALL_LOAD_FAST_AND_CLEAR, @@ -740,6 +736,8 @@ static py_tail_call_funcptr INSTRUCTION_TABLE[256] = { [125] = _TAIL_CALL_UNKNOWN_OPCODE, [126] = _TAIL_CALL_UNKNOWN_OPCODE, [127] = _TAIL_CALL_UNKNOWN_OPCODE, + [210] = _TAIL_CALL_UNKNOWN_OPCODE, + [211] = _TAIL_CALL_UNKNOWN_OPCODE, [212] = _TAIL_CALL_UNKNOWN_OPCODE, [213] = _TAIL_CALL_UNKNOWN_OPCODE, [214] = _TAIL_CALL_UNKNOWN_OPCODE, diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 8b0bd1e9518..6a7df233819 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -375,6 +375,23 @@ eliminate_pop_guard(_PyUOpInstruction *this_instr, bool exit) } } +static JitOptSymbol * +lookup_attr(JitOptContext *ctx, _PyUOpInstruction *this_instr, + PyTypeObject *type, PyObject *name, uint16_t immortal, + uint16_t mortal) +{ + // The cached value may be dead, so we need to do the lookup again... :( + if (type && PyType_Check(type)) { + PyObject *lookup = _PyType_Lookup(type, name); + if (lookup) { + int opcode = _Py_IsImmortal(lookup) ? immortal : mortal; + REPLACE_OP(this_instr, opcode, 0, (uintptr_t)lookup); + return sym_new_const(ctx, lookup); + } + } + return sym_new_not_null(ctx); +} + /* _PUSH_FRAME/_RETURN_VALUE's operand can be 0, a PyFunctionObject *, or a * PyCodeObject *. 
Retrieve the code object if possible. */ @@ -523,6 +540,45 @@ error: } +const uint16_t op_without_push[MAX_UOP_ID + 1] = { + [_COPY] = _NOP, + [_LOAD_CONST_INLINE] = _NOP, + [_LOAD_CONST_INLINE_BORROW] = _NOP, + [_LOAD_CONST_UNDER_INLINE] = _POP_TOP_LOAD_CONST_INLINE, + [_LOAD_CONST_UNDER_INLINE_BORROW] = _POP_TOP_LOAD_CONST_INLINE_BORROW, + [_LOAD_FAST] = _NOP, + [_LOAD_FAST_BORROW] = _NOP, + [_LOAD_SMALL_INT] = _NOP, + [_POP_TOP_LOAD_CONST_INLINE] = _POP_TOP, + [_POP_TOP_LOAD_CONST_INLINE_BORROW] = _POP_TOP, + [_POP_TWO_LOAD_CONST_INLINE_BORROW] = _POP_TWO, + [_POP_CALL_TWO_LOAD_CONST_INLINE_BORROW] = _POP_CALL_TWO, +}; + +const bool op_skip[MAX_UOP_ID + 1] = { + [_NOP] = true, + [_CHECK_VALIDITY] = true, + [_CHECK_PERIODIC] = true, + [_SET_IP] = true, +}; + +const uint16_t op_without_pop[MAX_UOP_ID + 1] = { + [_POP_TOP] = _NOP, + [_POP_TOP_LOAD_CONST_INLINE] = _LOAD_CONST_INLINE, + [_POP_TOP_LOAD_CONST_INLINE_BORROW] = _LOAD_CONST_INLINE_BORROW, + [_POP_TWO] = _POP_TOP, + [_POP_TWO_LOAD_CONST_INLINE_BORROW] = _POP_TOP_LOAD_CONST_INLINE_BORROW, + [_POP_CALL_TWO_LOAD_CONST_INLINE_BORROW] = _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW, + [_POP_CALL_ONE_LOAD_CONST_INLINE_BORROW] = _POP_CALL_LOAD_CONST_INLINE_BORROW, + [_POP_CALL_TWO] = _POP_CALL_ONE, + [_POP_CALL_ONE] = _POP_CALL, +}; + +const uint16_t op_without_pop_null[MAX_UOP_ID + 1] = { + [_POP_CALL] = _POP_TOP, + [_POP_CALL_LOAD_CONST_INLINE_BORROW] = _POP_TOP_LOAD_CONST_INLINE_BORROW, +}; + static int remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) @@ -551,50 +607,37 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) buffer[pc].opcode = _NOP; } break; - case _POP_TOP: - case _POP_TOP_LOAD_CONST_INLINE: - case _POP_TOP_LOAD_CONST_INLINE_BORROW: - case _POP_TWO_LOAD_CONST_INLINE_BORROW: - optimize_pop_top_again: + default: { - _PyUOpInstruction *last = &buffer[pc-1]; - while (last->opcode == _NOP) { - last--; - } - switch (last->opcode) { - case _POP_TWO_LOAD_CONST_INLINE_BORROW: - last->opcode = _POP_TOP; - break; - case _POP_TOP_LOAD_CONST_INLINE: - case _POP_TOP_LOAD_CONST_INLINE_BORROW: - last->opcode = _NOP; - goto optimize_pop_top_again; - case _COPY: - case _LOAD_CONST_INLINE: - case _LOAD_CONST_INLINE_BORROW: - case _LOAD_FAST: - case _LOAD_FAST_BORROW: - case _LOAD_SMALL_INT: - last->opcode = _NOP; - if (opcode == _POP_TOP) { - opcode = buffer[pc].opcode = _NOP; - } - else if (opcode == _POP_TOP_LOAD_CONST_INLINE) { - opcode = buffer[pc].opcode = _LOAD_CONST_INLINE; - } - else if (opcode == _POP_TOP_LOAD_CONST_INLINE_BORROW) { - opcode = buffer[pc].opcode = _LOAD_CONST_INLINE_BORROW; - } - else { - assert(opcode == _POP_TWO_LOAD_CONST_INLINE_BORROW); - opcode = buffer[pc].opcode = _POP_TOP_LOAD_CONST_INLINE_BORROW; - goto optimize_pop_top_again; + // Cancel out pushes and pops, repeatedly. So: + // _LOAD_FAST + _POP_TWO_LOAD_CONST_INLINE_BORROW + _POP_TOP + // ...becomes: + // _NOP + _POP_TOP + _NOP + while (op_without_pop[opcode] || op_without_pop_null[opcode]) { + _PyUOpInstruction *last = &buffer[pc - 1]; + while (op_skip[last->opcode]) { + last--; + } + if (op_without_push[last->opcode] && op_without_pop[opcode]) { + last->opcode = op_without_push[last->opcode]; + opcode = buffer[pc].opcode = op_without_pop[opcode]; + if (op_without_pop[last->opcode]) { + opcode = last->opcode; + pc = last - buffer; } + } + else if (last->opcode == _PUSH_NULL) { + // Handle _POP_CALL and _POP_CALL_LOAD_CONST_INLINE_BORROW separately. 
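The rewritten loop here is driven entirely by the lookup tables defined just above it (op_without_push, op_skip, op_without_pop, op_without_pop_null). A stripped-down model of the cancellation idea, with placeholder opcodes rather than real uops — the real pass also rewinds pc when the producer itself becomes strippable, which this toy omits:

    /* 0 is the "no rewrite possible" sentinel, as in the real tables. */
    enum { UNMAPPED, NOP, PUSH_X, POP_X, N_OPS };

    static const int without_push[N_OPS] = { [PUSH_X] = NOP };
    static const int without_pop[N_OPS]  = { [POP_X]  = NOP };

    /* Turns {PUSH_X, NOP, POP_X} into {NOP, NOP, NOP}: each pop-like op walks
       back over NOPs to the instruction that produced its operand and both are
       replaced by their cheaper forms. */
    static void
    cancel_push_pop(int *code, int n)
    {
        for (int pc = 1; pc < n; pc++) {
            while (without_pop[code[pc]]) {
                int last = pc - 1;
                while (last > 0 && code[last] == NOP) {
                    last--;                 /* skip no-ops between the pair */
                }
                if (!without_push[code[last]]) {
                    break;                  /* producer must be kept as-is */
                }
                code[last] = without_push[code[last]];  /* drop the push... */
                code[pc] = without_pop[code[pc]];       /* ...and its pop */
            }
        }
    }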
+ // This looks for a preceding _PUSH_NULL instruction and + // simplifies to _POP_TOP(_LOAD_CONST_INLINE_BORROW). + last->opcode = _NOP; + opcode = buffer[pc].opcode = op_without_pop_null[opcode]; + assert(opcode); + } + else { + break; + } } - _Py_FALLTHROUGH; - } - default: - { /* _PUSH_FRAME doesn't escape or error, but it * does need the IP for the return address */ bool needs_ip = opcode == _PUSH_FRAME; diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c index 7c160cdcb0c..b4220e2c627 100644 --- a/Python/optimizer_bytecodes.c +++ b/Python/optimizer_bytecodes.c @@ -118,6 +118,18 @@ dummy_func(void) { sym_set_type(left, &PyLong_Type); } + op(_CHECK_ATTR_CLASS, (type_version/2, owner -- owner)) { + PyObject *type = (PyObject *)_PyType_LookupByVersion(type_version); + if (type) { + if (type == sym_get_const(ctx, owner)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + else { + sym_set_const(owner, type); + } + } + } + op(_GUARD_TYPE_VERSION, (type_version/2, owner -- owner)) { assert(type_version); if (sym_matches_type_version(owner, type_version)) { @@ -510,27 +522,16 @@ dummy_func(void) { } op(_LOAD_CONST, (-- value)) { - PyObject *val = PyTuple_GET_ITEM(co->co_consts, this_instr->oparg); - int opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; - REPLACE_OP(this_instr, opcode, 0, (uintptr_t)val); - value = sym_new_const(ctx, val); - } - - op(_LOAD_CONST_MORTAL, (-- value)) { - PyObject *val = PyTuple_GET_ITEM(co->co_consts, this_instr->oparg); - int opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; - REPLACE_OP(this_instr, opcode, 0, (uintptr_t)val); - value = sym_new_const(ctx, val); - } - - op(_LOAD_CONST_IMMORTAL, (-- value)) { - PyObject *val = PyTuple_GET_ITEM(co->co_consts, this_instr->oparg); + PyObject *val = PyTuple_GET_ITEM(co->co_consts, oparg); REPLACE_OP(this_instr, _LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)val); value = sym_new_const(ctx, val); } op(_LOAD_SMALL_INT, (-- value)) { - PyObject *val = PyLong_FromLong(this_instr->oparg); + PyObject *val = PyLong_FromLong(oparg); + assert(val); + assert(_Py_IsImmortal(val)); + REPLACE_OP(this_instr, _LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)val); value = sym_new_const(ctx, val); } @@ -550,6 +551,18 @@ dummy_func(void) { value = sym_new_const(ctx, ptr); } + op(_POP_CALL_LOAD_CONST_INLINE_BORROW, (ptr/4, unused, unused -- value)) { + value = sym_new_const(ctx, ptr); + } + + op(_POP_CALL_ONE_LOAD_CONST_INLINE_BORROW, (ptr/4, unused, unused, unused -- value)) { + value = sym_new_const(ctx, ptr); + } + + op(_POP_CALL_TWO_LOAD_CONST_INLINE_BORROW, (ptr/4, unused, unused, unused, unused -- value)) { + value = sym_new_const(ctx, ptr); + } + op(_COPY, (bottom, unused[oparg-1] -- bottom, unused[oparg-1], top)) { assert(oparg > 0); top = bottom; @@ -603,7 +616,7 @@ dummy_func(void) { op(_LOAD_ATTR, (owner -- attr, self_or_null[oparg&1])) { (void)owner; attr = sym_new_not_null(ctx); - if (oparg &1) { + if (oparg & 1) { self_or_null[0] = sym_new_unknown(ctx); } } @@ -619,25 +632,59 @@ dummy_func(void) { } op(_LOAD_ATTR_CLASS, (descr/4, owner -- attr)) { - attr = sym_new_not_null(ctx); (void)descr; + PyTypeObject *type = (PyTypeObject *)sym_get_const(ctx, owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _POP_TOP_LOAD_CONST_INLINE_BORROW, + _POP_TOP_LOAD_CONST_INLINE); + } + + op(_LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES, (descr/4, owner -- attr)) { + (void)descr; + PyTypeObject *type = 
sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _POP_TOP_LOAD_CONST_INLINE_BORROW, + _POP_TOP_LOAD_CONST_INLINE); + } + + op(_LOAD_ATTR_NONDESCRIPTOR_NO_DICT, (descr/4, owner -- attr)) { + (void)descr; + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _POP_TOP_LOAD_CONST_INLINE_BORROW, + _POP_TOP_LOAD_CONST_INLINE); } op(_LOAD_ATTR_METHOD_WITH_VALUES, (descr/4, owner -- attr, self)) { (void)descr; - attr = sym_new_not_null(ctx); + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _LOAD_CONST_UNDER_INLINE_BORROW, + _LOAD_CONST_UNDER_INLINE); self = owner; } op(_LOAD_ATTR_METHOD_NO_DICT, (descr/4, owner -- attr, self)) { (void)descr; - attr = sym_new_not_null(ctx); + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _LOAD_CONST_UNDER_INLINE_BORROW, + _LOAD_CONST_UNDER_INLINE); self = owner; } op(_LOAD_ATTR_METHOD_LAZY_DICT, (descr/4, owner -- attr, self)) { (void)descr; - attr = sym_new_not_null(ctx); + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _LOAD_CONST_UNDER_INLINE_BORROW, + _LOAD_CONST_UNDER_INLINE); self = owner; } @@ -793,7 +840,18 @@ dummy_func(void) { value = sym_new_unknown(ctx); } - op(_FOR_ITER_GEN_FRAME, (unused -- unused, gen_frame: _Py_UOpsAbstractFrame*)) { + op(_GET_ITER, (iterable -- iter, index_or_null)) { + if (sym_matches_type(iterable, &PyTuple_Type) || sym_matches_type(iterable, &PyList_Type)) { + iter = iterable; + index_or_null = sym_new_not_null(ctx); + } + else { + iter = sym_new_not_null(ctx); + index_or_null = sym_new_unknown(ctx); + } + } + + op(_FOR_ITER_GEN_FRAME, (unused, unused -- unused, unused, gen_frame: _Py_UOpsAbstractFrame*)) { gen_frame = NULL; /* We are about to hit the end of the trace */ ctx->done = true; @@ -867,7 +925,14 @@ dummy_func(void) { } } - op(_ITER_NEXT_RANGE, (iter -- iter, next)) { + op(_ITER_CHECK_TUPLE, (iter, null_or_index -- iter, null_or_index)) { + if (sym_matches_type(iter, &PyTuple_Type)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + sym_set_type(iter, &PyTuple_Type); + } + + op(_ITER_NEXT_RANGE, (iter, null_or_index -- iter, null_or_index, next)) { next = sym_new_type(ctx, &PyLong_Type); } @@ -890,6 +955,26 @@ dummy_func(void) { } } + op(_CALL_ISINSTANCE, (unused, unused, instance, cls -- res)) { + // the result is always a bool, but sometimes we can + // narrow it down to True or False + res = sym_new_type(ctx, &PyBool_Type); + PyTypeObject *inst_type = sym_get_type(instance); + PyTypeObject *cls_o = (PyTypeObject *)sym_get_const(ctx, cls); + if (inst_type && cls_o && sym_matches_type(cls, &PyType_Type)) { + // isinstance(inst, cls) where both inst and cls have + // known types, meaning we can deduce either True or False + + // The below check is equivalent to PyObject_TypeCheck(inst, cls) + PyObject *out = Py_False; + if (inst_type == cls_o || PyType_IsSubtype(inst_type, cls_o)) { + out = Py_True; + } + sym_set_const(res, out); + REPLACE_OP(this_instr, _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)out); + } + } + op(_GUARD_IS_TRUE_POP, (flag -- )) { if (sym_is_const(ctx, flag)) { PyObject *value = 
sym_get_const(ctx, flag); @@ -1067,6 +1152,13 @@ dummy_func(void) { sym_set_null(null); } + op(_GUARD_NOS_NOT_NULL, (nos, unused -- nos, unused)) { + if (sym_is_not_null(nos)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + sym_set_non_null(nos); + } + op(_GUARD_THIRD_NULL, (null, unused, unused -- null, unused, unused)) { if (sym_is_null(null)) { REPLACE_OP(this_instr, _NOP, 0, 0); @@ -1099,6 +1191,25 @@ dummy_func(void) { res = sym_new_type(ctx, &PyLong_Type); } + op(_GET_LEN, (obj -- obj, len)) { + int tuple_length = sym_tuple_length(obj); + if (tuple_length == -1) { + len = sym_new_type(ctx, &PyLong_Type); + } + else { + assert(tuple_length >= 0); + PyObject *temp = PyLong_FromLong(tuple_length); + if (temp == NULL) { + goto error; + } + if (_Py_IsImmortal(temp)) { + REPLACE_OP(this_instr, _LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)temp); + } + len = sym_new_const(ctx, temp); + Py_DECREF(temp); + } + } + op(_GUARD_CALLABLE_LEN, (callable, unused, unused -- callable, unused, unused)) { PyObject *len = _PyInterpreterState_GET()->callable_cache.len; if (sym_get_const(ctx, callable) == len) { @@ -1115,6 +1226,14 @@ dummy_func(void) { sym_set_const(callable, isinstance); } + op(_GUARD_CALLABLE_LIST_APPEND, (callable, unused, unused -- callable, unused, unused)) { + PyObject *list_append = _PyInterpreterState_GET()->callable_cache.list_append; + if (sym_get_const(ctx, callable) == list_append) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + sym_set_const(callable, list_append); + } + // END BYTECODES // } diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index deb912662e4..960c6838004 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -66,23 +66,9 @@ break; } - /* _LOAD_CONST is not a viable micro-op for tier 2 */ - - case _LOAD_CONST_MORTAL: { - JitOptSymbol *value; - PyObject *val = PyTuple_GET_ITEM(co->co_consts, this_instr->oparg); - int opcode = _Py_IsImmortal(val) ? 
_LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; - REPLACE_OP(this_instr, opcode, 0, (uintptr_t)val); - value = sym_new_const(ctx, val); - stack_pointer[0] = value; - stack_pointer += 1; - assert(WITHIN_STACK_BOUNDS()); - break; - } - - case _LOAD_CONST_IMMORTAL: { + case _LOAD_CONST: { JitOptSymbol *value; - PyObject *val = PyTuple_GET_ITEM(co->co_consts, this_instr->oparg); + PyObject *val = PyTuple_GET_ITEM(co->co_consts, oparg); REPLACE_OP(this_instr, _LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)val); value = sym_new_const(ctx, val); stack_pointer[0] = value; @@ -93,7 +79,10 @@ case _LOAD_SMALL_INT: { JitOptSymbol *value; - PyObject *val = PyLong_FromLong(this_instr->oparg); + PyObject *val = PyLong_FromLong(oparg); + assert(val); + assert(_Py_IsImmortal(val)); + REPLACE_OP(this_instr, _LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)val); value = sym_new_const(ctx, val); stack_pointer[0] = value; stack_pointer += 1; @@ -116,6 +105,12 @@ break; } + case _POP_TWO: { + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + break; + } + case _PUSH_NULL: { JitOptSymbol *res; res = sym_new_null(ctx); @@ -131,6 +126,12 @@ break; } + case _POP_ITER: { + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + break; + } + case _END_SEND: { JitOptSymbol *val; val = sym_new_not_null(ctx); @@ -1179,7 +1180,7 @@ self_or_null = &stack_pointer[0]; (void)owner; attr = sym_new_not_null(ctx); - if (oparg &1) { + if (oparg & 1) { self_or_null[0] = sym_new_unknown(ctx); } stack_pointer[-1] = attr; @@ -1273,14 +1274,32 @@ } case _CHECK_ATTR_CLASS: { + JitOptSymbol *owner; + owner = stack_pointer[-1]; + uint32_t type_version = (uint32_t)this_instr->operand0; + PyObject *type = (PyObject *)_PyType_LookupByVersion(type_version); + if (type) { + if (type == sym_get_const(ctx, owner)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + else { + sym_set_const(owner, type); + } + } break; } case _LOAD_ATTR_CLASS: { + JitOptSymbol *owner; JitOptSymbol *attr; + owner = stack_pointer[-1]; PyObject *descr = (PyObject *)this_instr->operand0; - attr = sym_new_not_null(ctx); (void)descr; + PyTypeObject *type = (PyTypeObject *)sym_get_const(ctx, owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _POP_TOP_LOAD_CONST_INLINE_BORROW, + _POP_TOP_LOAD_CONST_INLINE); stack_pointer[-1] = attr; break; } @@ -1477,8 +1496,29 @@ } case _GET_LEN: { + JitOptSymbol *obj; JitOptSymbol *len; - len = sym_new_not_null(ctx); + obj = stack_pointer[-1]; + int tuple_length = sym_tuple_length(obj); + if (tuple_length == -1) { + len = sym_new_type(ctx, &PyLong_Type); + } + else { + assert(tuple_length >= 0); + PyObject *temp = PyLong_FromLong(tuple_length); + if (temp == NULL) { + goto error; + } + if (_Py_IsImmortal(temp)) { + REPLACE_OP(this_instr, _LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)temp); + } + len = sym_new_const(ctx, temp); + stack_pointer[0] = len; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + Py_DECREF(temp); + stack_pointer += -1; + } stack_pointer[0] = len; stack_pointer += 1; assert(WITHIN_STACK_BOUNDS()); @@ -1522,9 +1562,22 @@ } case _GET_ITER: { + JitOptSymbol *iterable; JitOptSymbol *iter; - iter = sym_new_not_null(ctx); + JitOptSymbol *index_or_null; + iterable = stack_pointer[-1]; + if (sym_matches_type(iterable, &PyTuple_Type) || sym_matches_type(iterable, &PyList_Type)) { + iter = iterable; + index_or_null = sym_new_not_null(ctx); + } + else { + iter = sym_new_not_null(ctx); + index_or_null = sym_new_unknown(ctx); + } stack_pointer[-1] = iter; + 
stack_pointer[0] = index_or_null; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); break; } @@ -1570,6 +1623,12 @@ } case _ITER_CHECK_TUPLE: { + JitOptSymbol *iter; + iter = stack_pointer[-2]; + if (sym_matches_type(iter, &PyTuple_Type)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + sym_set_type(iter, &PyTuple_Type); break; } @@ -1673,7 +1732,11 @@ owner = stack_pointer[-1]; PyObject *descr = (PyObject *)this_instr->operand0; (void)descr; - attr = sym_new_not_null(ctx); + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _LOAD_CONST_UNDER_INLINE_BORROW, + _LOAD_CONST_UNDER_INLINE); self = owner; stack_pointer[-1] = attr; stack_pointer[0] = self; @@ -1689,7 +1752,11 @@ owner = stack_pointer[-1]; PyObject *descr = (PyObject *)this_instr->operand0; (void)descr; - attr = sym_new_not_null(ctx); + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _LOAD_CONST_UNDER_INLINE_BORROW, + _LOAD_CONST_UNDER_INLINE); self = owner; stack_pointer[-1] = attr; stack_pointer[0] = self; @@ -1699,15 +1766,31 @@ } case _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES: { + JitOptSymbol *owner; JitOptSymbol *attr; - attr = sym_new_not_null(ctx); + owner = stack_pointer[-1]; + PyObject *descr = (PyObject *)this_instr->operand0; + (void)descr; + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _POP_TOP_LOAD_CONST_INLINE_BORROW, + _POP_TOP_LOAD_CONST_INLINE); stack_pointer[-1] = attr; break; } case _LOAD_ATTR_NONDESCRIPTOR_NO_DICT: { + JitOptSymbol *owner; JitOptSymbol *attr; - attr = sym_new_not_null(ctx); + owner = stack_pointer[-1]; + PyObject *descr = (PyObject *)this_instr->operand0; + (void)descr; + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _POP_TOP_LOAD_CONST_INLINE_BORROW, + _POP_TOP_LOAD_CONST_INLINE); stack_pointer[-1] = attr; break; } @@ -1723,7 +1806,11 @@ owner = stack_pointer[-1]; PyObject *descr = (PyObject *)this_instr->operand0; (void)descr; - attr = sym_new_not_null(ctx); + PyTypeObject *type = sym_get_type(owner); + PyObject *name = PyTuple_GET_ITEM(co->co_names, oparg >> 1); + attr = lookup_attr(ctx, this_instr, type, name, + _LOAD_CONST_UNDER_INLINE_BORROW, + _LOAD_CONST_UNDER_INLINE); self = owner; stack_pointer[-1] = attr; stack_pointer[0] = self; @@ -1935,6 +2022,16 @@ break; } + case _GUARD_NOS_NOT_NULL: { + JitOptSymbol *nos; + nos = stack_pointer[-2]; + if (sym_is_not_null(nos)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + sym_set_non_null(nos); + break; + } + case _GUARD_THIRD_NULL: { JitOptSymbol *null; null = stack_pointer[-3]; @@ -2124,14 +2221,39 @@ } case _CALL_ISINSTANCE: { + JitOptSymbol *cls; + JitOptSymbol *instance; JitOptSymbol *res; - res = sym_new_not_null(ctx); + cls = stack_pointer[-1]; + instance = stack_pointer[-2]; + res = sym_new_type(ctx, &PyBool_Type); + PyTypeObject *inst_type = sym_get_type(instance); + PyTypeObject *cls_o = (PyTypeObject *)sym_get_const(ctx, cls); + if (inst_type && cls_o && sym_matches_type(cls, &PyType_Type)) { + PyObject *out = Py_False; + if (inst_type == cls_o || PyType_IsSubtype(inst_type, cls_o)) { + out = Py_True; + } + sym_set_const(res, out); + REPLACE_OP(this_instr, _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW, 0, 
(uintptr_t)out); + } stack_pointer[-4] = res; stack_pointer += -3; assert(WITHIN_STACK_BOUNDS()); break; } + case _GUARD_CALLABLE_LIST_APPEND: { + JitOptSymbol *callable; + callable = stack_pointer[-3]; + PyObject *list_append = _PyInterpreterState_GET()->callable_cache.list_append; + if (sym_get_const(ctx, callable) == list_append) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + sym_set_const(callable, list_append); + break; + } + case _CALL_LIST_APPEND: { stack_pointer += -3; assert(WITHIN_STACK_BOUNDS()); @@ -2504,6 +2626,24 @@ break; } + case _POP_CALL: { + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _POP_CALL_ONE: { + stack_pointer += -3; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _POP_CALL_TWO: { + stack_pointer += -4; + assert(WITHIN_STACK_BOUNDS()); + break; + } + case _POP_TOP_LOAD_CONST_INLINE_BORROW: { JitOptSymbol *value; PyObject *ptr = (PyObject *)this_instr->operand0; @@ -2521,6 +2661,60 @@ break; } + case _POP_CALL_LOAD_CONST_INLINE_BORROW: { + JitOptSymbol *value; + PyObject *ptr = (PyObject *)this_instr->operand0; + value = sym_new_const(ctx, ptr); + stack_pointer[-2] = value; + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW: { + JitOptSymbol *value; + PyObject *ptr = (PyObject *)this_instr->operand0; + value = sym_new_const(ctx, ptr); + stack_pointer[-3] = value; + stack_pointer += -2; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW: { + JitOptSymbol *value; + PyObject *ptr = (PyObject *)this_instr->operand0; + value = sym_new_const(ctx, ptr); + stack_pointer[-4] = value; + stack_pointer += -3; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _LOAD_CONST_UNDER_INLINE: { + JitOptSymbol *value; + JitOptSymbol *new; + value = sym_new_not_null(ctx); + new = sym_new_not_null(ctx); + stack_pointer[-1] = value; + stack_pointer[0] = new; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + + case _LOAD_CONST_UNDER_INLINE_BORROW: { + JitOptSymbol *value; + JitOptSymbol *new; + value = sym_new_not_null(ctx); + new = sym_new_not_null(ctx); + stack_pointer[-1] = value; + stack_pointer[0] = new; + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); + break; + } + case _CHECK_FUNCTION: { break; } diff --git a/Python/optimizer_symbols.c b/Python/optimizer_symbols.c index e8a4f87031b..25de5d83166 100644 --- a/Python/optimizer_symbols.c +++ b/Python/optimizer_symbols.c @@ -13,22 +13,46 @@ #include <stdint.h> #include <stddef.h> -/* Symbols - ======= - - See the diagram at - https://github.com/faster-cpython/ideas/blob/main/3.13/redundancy_eliminator.md - - We represent the nodes in the diagram as follows - (the flag bits are only defined in optimizer_symbols.c): - - Top: no flag bits, typ and const_val are NULL. - - NULL: IS_NULL flag set, type and const_val NULL. - - Not NULL: NOT_NULL flag set, type and const_val NULL. - - None/not None: not used. (None could be represented as any other constant.) - - Known type: NOT_NULL flag set and typ set; const_val is NULL. - - Known constant: NOT_NULL flag set, type set, const_val set. - - Bottom: IS_NULL and NOT_NULL flags set, type and const_val NULL. - */ +/* + +Symbols +======= + +https://github.com/faster-cpython/ideas/blob/main/3.13/redundancy_eliminator.md + +Logically, all symbols begin as UNKNOWN, and can transition downwards along the +edges of the lattice, but *never* upwards (see the diagram below). 
The UNKNOWN
+state represents no information, and the BOTTOM state represents contradictory
+information. Though symbols logically progress through all intermediate nodes,
+we often skip in-between states for convenience:
+
+ UNKNOWN
+ | |
+NULL |
+| | <- Anything below this level is an object.
+| NON_NULL
+| | | <- Anything below this level has a known type version.
+| TYPE_VERSION |
+| | | <- Anything below this level has a known type.
+| KNOWN_CLASS |
+| | | | <- Anything below this level has a known truthiness.
+| | | TRUTHINESS
+| | | |
+| TUPLE | |
+| | | | <- Anything below this level is a known constant.
+| KNOWN_VALUE
+| | <- Anything below this level is unreachable.
+BOTTOM
+
+For example, after guarding that the type of an UNKNOWN local is int, we can
+narrow the symbol to KNOWN_CLASS (logically progressing through NON_NULL and
+TYPE_VERSION to get there). Later, we may learn that it is falsy based on the
+result of a truth test, which would allow us to narrow the symbol to KNOWN_VALUE
+(with a value of integer zero). If at any point we encounter a float guard on
+the same symbol, that would be a contradiction, and the symbol would be set to
+BOTTOM (indicating that the code is unreachable).
+
+*/ #ifdef Py_DEBUG static inline int get_lltrace(void) { @@ -200,6 +224,10 @@ _Py_uop_sym_set_type(JitOptContext *ctx, JitOptSymbol *sym, PyTypeObject *typ) bool _Py_uop_sym_set_type_version(JitOptContext *ctx, JitOptSymbol *sym, unsigned int version) { + PyTypeObject *type = _PyType_LookupByVersion(version); + if (type) { + _Py_uop_sym_set_type(ctx, sym, type); + } JitSymType tag = sym->tag; switch(tag) { case JIT_SYM_NULL_TAG: @@ -215,18 +243,24 @@ _Py_uop_sym_set_type_version(JitOptContext *ctx, JitOptSymbol *sym, unsigned int return true; } case JIT_SYM_KNOWN_VALUE_TAG: - Py_CLEAR(sym->value.value); - sym_set_bottom(ctx, sym); - return false; + if (Py_TYPE(sym->value.value)->tp_version_tag != version) { + Py_CLEAR(sym->value.value); + sym_set_bottom(ctx, sym); + return false; + } + return true; case JIT_SYM_TUPLE_TAG: - sym_set_bottom(ctx, sym); - return false; + if (PyTuple_Type.tp_version_tag != version) { + sym_set_bottom(ctx, sym); + return false; + } + return true; case JIT_SYM_TYPE_VERSION_TAG: - if (sym->version.version == version) { - return true; + if (sym->version.version != version) { + sym_set_bottom(ctx, sym); + return false; } - sym_set_bottom(ctx, sym); - return false; + return true; case JIT_SYM_BOTTOM_TAG: return false; case JIT_SYM_NON_NULL_TAG: @@ -266,6 +300,18 @@ _Py_uop_sym_set_const(JitOptContext *ctx, JitOptSymbol *sym, PyObject *const_val } return; case JIT_SYM_TUPLE_TAG: + if (PyTuple_CheckExact(const_val)) { + Py_ssize_t len = _Py_uop_sym_tuple_length(sym); + if (len == PyTuple_GET_SIZE(const_val)) { + for (Py_ssize_t i = 0; i < len; i++) { + JitOptSymbol *sym_item = _Py_uop_sym_tuple_getitem(ctx, sym, i); + PyObject *item = PyTuple_GET_ITEM(const_val, i); + _Py_uop_sym_set_const(ctx, sym_item, item); + } + make_const(sym, const_val); + return; + } + } sym_set_bottom(ctx, sym); return; case JIT_SYM_TYPE_VERSION_TAG: @@ -398,7 +444,6 @@ _Py_uop_sym_get_type(JitOptSymbol *sym) JitSymType tag = sym->tag; switch(tag) { case JIT_SYM_NULL_TAG: - case JIT_SYM_TYPE_VERSION_TAG: case JIT_SYM_BOTTOM_TAG: case JIT_SYM_NON_NULL_TAG: case JIT_SYM_UNKNOWN_TAG: @@ -407,6 +452,8 @@ _Py_uop_sym_get_type(JitOptSymbol *sym) return sym->cls.type; case JIT_SYM_KNOWN_VALUE_TAG: return Py_TYPE(sym->value.value); + case JIT_SYM_TYPE_VERSION_TAG: + return
_PyType_LookupByVersion(sym->version.version); case JIT_SYM_TUPLE_TAG: return &PyTuple_Type; case JIT_SYM_TRUTHINESS_TAG: @@ -442,21 +489,7 @@ _Py_uop_sym_get_type_version(JitOptSymbol *sym) bool _Py_uop_sym_has_type(JitOptSymbol *sym) { - JitSymType tag = sym->tag; - switch(tag) { - case JIT_SYM_NULL_TAG: - case JIT_SYM_TYPE_VERSION_TAG: - case JIT_SYM_BOTTOM_TAG: - case JIT_SYM_NON_NULL_TAG: - case JIT_SYM_UNKNOWN_TAG: - return false; - case JIT_SYM_KNOWN_CLASS_TAG: - case JIT_SYM_KNOWN_VALUE_TAG: - case JIT_SYM_TUPLE_TAG: - case JIT_SYM_TRUTHINESS_TAG: - return true; - } - Py_UNREACHABLE(); + return _Py_uop_sym_get_type(sym) != NULL; } bool @@ -554,7 +587,7 @@ _Py_uop_sym_tuple_getitem(JitOptContext *ctx, JitOptSymbol *sym, int item) else if (sym->tag == JIT_SYM_TUPLE_TAG && item < sym->tuple.length) { return allocation_base(ctx) + sym->tuple.items[item]; } - return _Py_uop_sym_new_unknown(ctx); + return _Py_uop_sym_new_not_null(ctx); } int @@ -841,6 +874,11 @@ _Py_uop_symbols_test(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(ignored)) _Py_uop_sym_get_const(ctx, _Py_uop_sym_tuple_getitem(ctx, sym, 1)) == val_43, "tuple item does not match value used to create tuple" ); + sym = _Py_uop_sym_new_type(ctx, &PyTuple_Type); + TEST_PREDICATE( + _Py_uop_sym_is_not_null(_Py_uop_sym_tuple_getitem(ctx, sym, 42)), + "Unknown tuple item is not narrowed to non-NULL" + ); JitOptSymbol *value = _Py_uop_sym_new_type(ctx, &PyBool_Type); sym = _Py_uop_sym_new_truthiness(ctx, value, false); TEST_PREDICATE(_Py_uop_sym_matches_type(sym, &PyBool_Type), "truthiness is not boolean"); diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 8edf4323594..e896dea0271 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -112,17 +112,27 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, PyTime_t timeout) } } - // NOTE: we wait on the sigint event even in non-main threads to match the - // behavior of the other platforms. Non-main threads will ignore the - // Py_PARK_INTR result. - HANDLE sigint_event = _PyOS_SigintEvent(); - HANDLE handles[2] = { sema->platform_sem, sigint_event }; - DWORD count = sigint_event != NULL ? 2 : 1; + HANDLE handles[2] = { sema->platform_sem, NULL }; + HANDLE sigint_event = NULL; + DWORD count = 1; + if (_Py_IsMainThread()) { + // gh-135099: Wait on the SIGINT event only in the main thread. Other + // threads would ignore the result anyway, and accessing + // `_PyOS_SigintEvent()` from non-main threads may race with + // interpreter shutdown, which closes the event handle. Note that + // non-main interpreters will ignore the result. + sigint_event = _PyOS_SigintEvent(); + if (sigint_event != NULL) { + handles[1] = sigint_event; + count = 2; + } + } wait = WaitForMultipleObjects(count, handles, FALSE, millis); if (wait == WAIT_OBJECT_0) { res = Py_PARK_OK; } else if (wait == WAIT_OBJECT_0 + 1) { + assert(sigint_event != NULL); ResetEvent(sigint_event); res = Py_PARK_INTR; } diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 1211e0e9f11..2ca18c23593 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -1,241 +1,354 @@ +/* + * Python Perf Trampoline Support - JIT Dump Implementation + * + * This file implements the perf jitdump API for Python's performance profiling + * integration. It allows perf (Linux performance analysis tool) to understand + * and profile dynamically generated Python bytecode by creating JIT dump files + * that perf can inject into its analysis.
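+ *
+ * As a rough sketch of the intended workflow (the perf commands are standard
+ * jitdump usage; the exact Python option spelling is an assumption here and
+ * may differ between versions):
+ *
+ *     perf record -k 1 -- python -X perf_jit my_script.py
+ *     perf inject --jit -i perf.data -o perf.data.jitted
+ *     perf report -i perf.data.jitted
+ *
+ * where -k 1 selects CLOCK_MONOTONIC timestamps, matching the timestamps
+ * this file writes into the jitdump events.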
+ *
+ *
+ * IMPORTANT: This file exports specific callback functions that are part of
+ * Python's internal API. Do not modify the function signatures or behavior
+ * of exported functions without coordinating with the Python core team.
+ *
+ * Usually the binary and libraries are mapped in separate regions like below:
+ *
+ * address ->
+ * --+---------------------+--//--+---------------------+--
+ * | .text | .data | ... | | .text | .data | ... |
+ * --+---------------------+--//--+---------------------+--
+ * myprog libc.so
+ *
+ * So it'd be easy and straightforward to find a mapped binary or library from an
+ * address.
+ *
+ * But for JIT code, the code arena only cares about the code section. But the
+ * resulting DSOs (which are generated by perf inject -j) contain ELF headers and
+ * unwind info too. Then it'd generate the following address space with synthesized
+ * MMAP events. Let's say it has a sample between address B and C.
+ *
+ * sample
+ * |
+ * address -> A B v C
+ * ---------------------------------------------------------------------------------------------------
+ * /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
+ * /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
+ * /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
+ * ...
+ * ---------------------------------------------------------------------------------------------------
+ *
+ * If it only maps the .text section, it'd find jitted-PID-1.so but cannot see
+ * the unwind info. If it maps both the .text section and the unwind sections, the sample
+ * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
+ * which one is right. So to make perf happy we have non-overlapping ranges for each
+ * DSO:
+ *
+ * address ->
+ * -------------------------------------------------------------------------------------------------------
+ * /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
+ * /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
+ * /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
+ * ...
+ * -------------------------------------------------------------------------------------------------------
+ *
+ * As the trampolines are constant, we add a constant padding, but in general the padding needs to have the
+ * size of the unwind info rounded to 16 bytes.
In general, for our trampolines this is 0x50 + */ + + + #include "Python.h" #include "pycore_ceval.h" // _PyPerf_Callbacks #include "pycore_frame.h" #include "pycore_interp.h" #include "pycore_runtime.h" // _PyRuntime - #ifdef PY_HAVE_PERF_TRAMPOLINE -#include <fcntl.h> -#include <stdio.h> -#include <stdlib.h> -#include <sys/mman.h> // mmap() -#include <sys/types.h> -#include <unistd.h> // sysconf() -#include <sys/time.h> // gettimeofday() -#include <sys/syscall.h> - -// ---------------------------------- -// Perf jitdump API -// ---------------------------------- - -typedef struct { - FILE* perf_map; - PyThread_type_lock map_lock; - void* mapped_buffer; - size_t mapped_size; - int code_id; -} PerfMapJitState; - -static PerfMapJitState perf_jit_map_state; +/* Standard library includes for perf jitdump implementation */ +#include <elf.h> // ELF architecture constants +#include <fcntl.h> // File control operations +#include <stdio.h> // Standard I/O operations +#include <stdlib.h> // Standard library functions +#include <sys/mman.h> // Memory mapping functions (mmap) +#include <sys/types.h> // System data types +#include <unistd.h> // System calls (sysconf, getpid) +#include <sys/time.h> // Time functions (gettimeofday) +#include <sys/syscall.h> // System call interface + +// ============================================================================= +// CONSTANTS AND CONFIGURATION +// ============================================================================= /* -Usually the binary and libraries are mapped in separate region like below: - - address -> - --+---------------------+--//--+---------------------+-- - | .text | .data | ... | | .text | .data | ... | - --+---------------------+--//--+---------------------+-- - myprog libc.so - -So it'd be easy and straight-forward to find a mapped binary or library from an -address. - -But for JIT code, the code arena only cares about the code section. But the -resulting DSOs (which is generated by perf inject -j) contain ELF headers and -unwind info too. Then it'd generate following address space with synthesized -MMAP events. Let's say it has a sample between address B and C. - - sample - | - address -> A B v C - --------------------------------------------------------------------------------------------------- - /tmp/jitted-PID-0.so | (headers) | .text | unwind info | - /tmp/jitted-PID-1.so | (headers) | .text | unwind info | - /tmp/jitted-PID-2.so | (headers) | .text | unwind info | - ... - --------------------------------------------------------------------------------------------------- - -If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see -the unwind info. If it maps both .text section and unwind sections, the sample -could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing -which one is right. So to make perf happy we have non-overlapping ranges for each -DSO: - - address -> - ------------------------------------------------------------------------------------------------------- - /tmp/jitted-PID-0.so | (headers) | .text | unwind info | - /tmp/jitted-PID-1.so | (headers) | .text | unwind info | - /tmp/jitted-PID-2.so | (headers) | .text | unwind info | - ... - ------------------------------------------------------------------------------------------------------- - -As the trampolines are constant, we add a constant padding but in general the padding needs to have the -size of the unwind info rounded to 16 bytes. 
In general, for our trampolines this is 0x50 + * Memory layout considerations for perf jitdump: + * + * Perf expects non-overlapping memory regions for each JIT-compiled function. + * When perf processes the jitdump file, it creates synthetic DSO (Dynamic + * Shared Object) files that contain: + * - ELF headers + * - .text section (actual machine code) + * - Unwind information (for stack traces) + * + * To ensure proper address space layout, we add padding between code regions. + * This prevents address conflicts when perf maps the synthesized DSOs. + * + * Memory layout example: + * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] + * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] + * + * The padding size (0x100) is chosen to accommodate typical unwind info sizes + * while maintaining 16-byte alignment requirements. */ - #define PERF_JIT_CODE_PADDING 0x100 -#define trampoline_api _PyRuntime.ceval.perf.trampoline_api - -typedef uint64_t uword; -typedef const char* CodeComments; -#define Pd "d" -#define MB (1024 * 1024) - -#define EM_386 3 -#define EM_X86_64 62 -#define EM_ARM 40 -#define EM_AARCH64 183 -#define EM_RISCV 243 +/* Convenient access to the global trampoline API state */ +#define trampoline_api _PyRuntime.ceval.perf.trampoline_api -#define TARGET_ARCH_IA32 0 -#define TARGET_ARCH_X64 0 -#define TARGET_ARCH_ARM 0 -#define TARGET_ARCH_ARM64 0 -#define TARGET_ARCH_RISCV32 0 -#define TARGET_ARCH_RISCV64 0 +/* Type aliases for clarity and portability */ +typedef uint64_t uword; // Word-sized unsigned integer +typedef const char* CodeComments; // Code comment strings -#define FLAG_generate_perf_jitdump 0 -#define FLAG_write_protect_code 0 -#define FLAG_write_protect_vm_isolate 0 -#define FLAG_code_comments 0 +/* Memory size constants */ +#define MB (1024 * 1024) // 1 Megabyte for buffer sizing -#define UNREACHABLE() +// ============================================================================= +// ARCHITECTURE-SPECIFIC DEFINITIONS +// ============================================================================= -static uword GetElfMachineArchitecture(void) { -#if TARGET_ARCH_IA32 - return EM_386; -#elif TARGET_ARCH_X64 +/* + * Returns the ELF machine architecture constant for the current platform. + * This is required for the jitdump header to correctly identify the target + * architecture for perf processing. + * + */ +static uint64_t GetElfMachineArchitecture(void) { +#if defined(__x86_64__) || defined(_M_X64) return EM_X86_64; -#elif TARGET_ARCH_ARM - return EM_ARM; -#elif TARGET_ARCH_ARM64 +#elif defined(__i386__) || defined(_M_IX86) + return EM_386; +#elif defined(__aarch64__) return EM_AARCH64; -#elif TARGET_ARCH_RISCV32 || TARGET_ARCH_RISCV64 +#elif defined(__arm__) || defined(_M_ARM) + return EM_ARM; +#elif defined(__riscv) return EM_RISCV; #else - UNREACHABLE(); + Py_UNREACHABLE(); // Unsupported architecture - should never reach here return 0; #endif } +// ============================================================================= +// PERF JITDUMP DATA STRUCTURES +// ============================================================================= + +/* + * Perf jitdump file format structures + * + * These structures define the binary format that perf expects for JIT dump files. + * The format is documented in the Linux perf tools source code and must match + * exactly for proper perf integration. 
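+ *
+ * As an illustration (not part of this patch), the first 16 bytes of a
+ * header emitted on x86-64 hex-dump as follows, since every field is
+ * written in native little-endian byte order:
+ *
+ *     44 54 69 4a  01 00 00 00  28 00 00 00  3e 00 00 00
+ *     magic "JiTD"  version 1    size 40      EM_X86_64 (62)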
+ */ + +/* + * Jitdump file header - written once at the beginning of each jitdump file + * Contains metadata about the process and jitdump format version + */ typedef struct { - uint32_t magic; - uint32_t version; - uint32_t size; - uint32_t elf_mach_target; - uint32_t reserved; - uint32_t process_id; - uint64_t time_stamp; - uint64_t flags; + uint32_t magic; // Magic number (0x4A695444 = "JiTD") + uint32_t version; // Jitdump format version (currently 1) + uint32_t size; // Size of this header structure + uint32_t elf_mach_target; // Target architecture (from GetElfMachineArchitecture) + uint32_t reserved; // Reserved field (must be 0) + uint32_t process_id; // Process ID of the JIT compiler + uint64_t time_stamp; // Timestamp when jitdump was created + uint64_t flags; // Feature flags (currently unused) } Header; - enum PerfEvent { - PerfLoad = 0, - PerfMove = 1, - PerfDebugInfo = 2, - PerfClose = 3, - PerfUnwindingInfo = 4 +/* + * Perf event types supported by the jitdump format + * Each event type has a corresponding structure format + */ +enum PerfEvent { + PerfLoad = 0, // Code load event (new JIT function) + PerfMove = 1, // Code move event (function relocated) + PerfDebugInfo = 2, // Debug information event + PerfClose = 3, // JIT session close event + PerfUnwindingInfo = 4 // Stack unwinding information event }; +/* + * Base event structure - common header for all perf events + * Every event in the jitdump file starts with this structure + */ struct BaseEvent { - uint32_t event; - uint32_t size; - uint64_t time_stamp; - }; + uint32_t event; // Event type (from PerfEvent enum) + uint32_t size; // Total size of this event including payload + uint64_t time_stamp; // Timestamp when event occurred +}; +/* + * Code load event - indicates a new JIT-compiled function is available + * This is the most important event type for Python profiling + */ typedef struct { - struct BaseEvent base; - uint32_t process_id; - uint32_t thread_id; - uint64_t vma; - uint64_t code_address; - uint64_t code_size; - uint64_t code_id; + struct BaseEvent base; // Common event header + uint32_t process_id; // Process ID where code was generated + uint32_t thread_id; // Thread ID where code was generated + uint64_t vma; // Virtual memory address where code is loaded + uint64_t code_address; // Address of the actual machine code + uint64_t code_size; // Size of the machine code in bytes + uint64_t code_id; // Unique identifier for this code region + /* Followed by: + * - null-terminated function name string + * - raw machine code bytes + */ } CodeLoadEvent; +/* + * Code unwinding information event - provides DWARF data for stack traces + * Essential for proper stack unwinding during profiling + */ typedef struct { - struct BaseEvent base; - uint64_t unwind_data_size; - uint64_t eh_frame_hdr_size; - uint64_t mapped_size; + struct BaseEvent base; // Common event header + uint64_t unwind_data_size; // Size of the unwinding data + uint64_t eh_frame_hdr_size; // Size of the EH frame header + uint64_t mapped_size; // Total mapped size (with padding) + /* Followed by: + * - EH frame header + * - DWARF unwinding information + * - Padding to alignment boundary + */ } CodeUnwindingInfoEvent; -static const intptr_t nanoseconds_per_second = 1000000000; - -// Dwarf encoding constants +// ============================================================================= +// GLOBAL STATE MANAGEMENT +// ============================================================================= -static const uint8_t DwarfUData4 = 0x03; -static 
const uint8_t DwarfSData4 = 0x0b; -static const uint8_t DwarfPcRel = 0x10; -static const uint8_t DwarfDataRel = 0x30; -// static uint8_t DwarfOmit = 0xff; +/* + * Global state for the perf jitdump implementation + * + * This structure maintains all the state needed for generating jitdump files. + * It's designed as a singleton since there's typically only one jitdump file + * per Python process. + */ typedef struct { - unsigned char version; - unsigned char eh_frame_ptr_enc; - unsigned char fde_count_enc; - unsigned char table_enc; - int32_t eh_frame_ptr; - int32_t eh_fde_count; - int32_t from; - int32_t to; -} EhFrameHeader; + FILE* perf_map; // File handle for the jitdump file + PyThread_type_lock map_lock; // Thread synchronization lock + void* mapped_buffer; // Memory-mapped region (signals perf we're active) + size_t mapped_size; // Size of the mapped region + int code_id; // Counter for unique code region identifiers +} PerfMapJitState; + +/* Global singleton instance */ +static PerfMapJitState perf_jit_map_state; + +// ============================================================================= +// TIME UTILITIES +// ============================================================================= +/* Time conversion constant */ +static const intptr_t nanoseconds_per_second = 1000000000; + +/* + * Get current monotonic time in nanoseconds + * + * Monotonic time is preferred for event timestamps because it's not affected + * by system clock adjustments. This ensures consistent timing relationships + * between events even if the system clock is changed. + * + * Returns: Current monotonic time in nanoseconds since an arbitrary epoch + */ static int64_t get_current_monotonic_ticks(void) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { - UNREACHABLE(); + Py_UNREACHABLE(); // Should never fail on supported systems return 0; } - // Convert to nanoseconds. + + /* Convert to nanoseconds for maximum precision */ int64_t result = ts.tv_sec; result *= nanoseconds_per_second; result += ts.tv_nsec; return result; } +/* + * Get current wall clock time in microseconds + * + * Used for the jitdump file header timestamp. Unlike monotonic time, + * this represents actual wall clock time that can be correlated with + * other system events. + * + * Returns: Current time in microseconds since Unix epoch + */ static int64_t get_current_time_microseconds(void) { - // gettimeofday has microsecond resolution. - struct timeval tv; - if (gettimeofday(&tv, NULL) < 0) { - UNREACHABLE(); - return 0; - } - return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; + struct timeval tv; + if (gettimeofday(&tv, NULL) < 0) { + Py_UNREACHABLE(); // Should never fail on supported systems + return 0; + } + return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } +// ============================================================================= +// UTILITY FUNCTIONS +// ============================================================================= +/* + * Round up a value to the next multiple of a given number + * + * This is essential for maintaining proper alignment requirements in the + * jitdump format. Many structures need to be aligned to specific boundaries + * (typically 8 or 16 bytes) for efficient processing by perf. 
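+ *
+ * For example, round_up(41, 16) returns 48, and round_up(48, 16) returns 48
+ * unchanged, since 48 is already a multiple of 16.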
+ * + * Args: + * value: The value to round up + * multiple: The multiple to round up to + * + * Returns: The smallest value >= input that is a multiple of 'multiple' + */ static size_t round_up(int64_t value, int64_t multiple) { if (multiple == 0) { - // Avoid division by zero - return value; + return value; // Avoid division by zero } int64_t remainder = value % multiple; if (remainder == 0) { - // Value is already a multiple of 'multiple' - return value; + return value; // Already aligned } - // Calculate the difference to the next multiple + /* Calculate how much to add to reach the next multiple */ int64_t difference = multiple - remainder; - - // Add the difference to the value int64_t rounded_up_value = value + difference; return rounded_up_value; } +// ============================================================================= +// FILE I/O UTILITIES +// ============================================================================= +/* + * Write data to the jitdump file with error handling + * + * This function ensures that all data is written to the file, handling + * partial writes that can occur with large buffers or when the system + * is under load. + * + * Args: + * buffer: Pointer to data to write + * size: Number of bytes to write + */ static void perf_map_jit_write_fully(const void* buffer, size_t size) { FILE* out_file = perf_jit_map_state.perf_map; const char* ptr = (const char*)(buffer); + while (size > 0) { const size_t written = fwrite(ptr, 1, size, out_file); if (written == 0) { - UNREACHABLE(); + Py_UNREACHABLE(); // Write failure - should be very rare break; } size -= written; @@ -243,284 +356,724 @@ static void perf_map_jit_write_fully(const void* buffer, size_t size) { } } +/* + * Write the jitdump file header + * + * The header must be written exactly once at the beginning of each jitdump + * file. It provides metadata that perf uses to parse the rest of the file. + * + * Args: + * pid: Process ID to include in the header + * out_file: File handle to write to (currently unused, uses global state) + */ static void perf_map_jit_write_header(int pid, FILE* out_file) { Header header; - header.magic = 0x4A695444; - header.version = 1; - header.size = sizeof(Header); - header.elf_mach_target = GetElfMachineArchitecture(); - header.process_id = pid; - header.time_stamp = get_current_time_microseconds(); - header.flags = 0; - perf_map_jit_write_fully(&header, sizeof(header)); -} -static void* perf_map_jit_init(void) { - char filename[100]; - int pid = getpid(); - snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); - const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); - if (fd == -1) { - return NULL; - } + /* Initialize header with required values */ + header.magic = 0x4A695444; // "JiTD" magic number + header.version = 1; // Current jitdump version + header.size = sizeof(Header); // Header size for validation + header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture + header.process_id = pid; // Process identifier + header.time_stamp = get_current_time_microseconds(); // Creation time + header.flags = 0; // No special flags currently used - const long page_size = sysconf(_SC_PAGESIZE); // NOLINT(runtime/int) - if (page_size == -1) { - close(fd); - return NULL; - } - - // The perf jit interface forces us to map the first page of the file - // to signal that we are using the interface. 
- perf_jit_map_state.mapped_buffer = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0); - if (perf_jit_map_state.mapped_buffer == NULL) { - close(fd); - return NULL; - } - perf_jit_map_state.mapped_size = page_size; - perf_jit_map_state.perf_map = fdopen(fd, "w+"); - if (perf_jit_map_state.perf_map == NULL) { - close(fd); - return NULL; - } - setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); - perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); - - perf_jit_map_state.map_lock = PyThread_allocate_lock(); - if (perf_jit_map_state.map_lock == NULL) { - fclose(perf_jit_map_state.perf_map); - return NULL; - } - perf_jit_map_state.code_id = 0; - - trampoline_api.code_padding = PERF_JIT_CODE_PADDING; - return &perf_jit_map_state; + perf_map_jit_write_fully(&header, sizeof(header)); } -/* DWARF definitions. */ +// ============================================================================= +// DWARF CONSTANTS AND UTILITIES +// ============================================================================= + +/* + * DWARF (Debug With Arbitrary Record Formats) constants + * + * DWARF is a debugging data format used to provide stack unwinding information. + * These constants define the various encoding types and opcodes used in + * DWARF Call Frame Information (CFI) records. + */ +/* DWARF Call Frame Information version */ #define DWRF_CIE_VERSION 1 +/* DWARF CFA (Call Frame Address) opcodes */ enum { - DWRF_CFA_nop = 0x0, - DWRF_CFA_offset_extended = 0x5, - DWRF_CFA_def_cfa = 0xc, - DWRF_CFA_def_cfa_offset = 0xe, - DWRF_CFA_offset_extended_sf = 0x11, - DWRF_CFA_advance_loc = 0x40, - DWRF_CFA_offset = 0x80 + DWRF_CFA_nop = 0x0, // No operation + DWRF_CFA_offset_extended = 0x5, // Extended offset instruction + DWRF_CFA_def_cfa = 0xc, // Define CFA rule + DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset + DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset + DWRF_CFA_advance_loc = 0x40, // Advance location counter + DWRF_CFA_offset = 0x80 // Simple offset instruction }; -enum - { - DWRF_EH_PE_absptr = 0x00, - DWRF_EH_PE_omit = 0xff, - - /* FDE data encoding. */ - DWRF_EH_PE_uleb128 = 0x01, - DWRF_EH_PE_udata2 = 0x02, - DWRF_EH_PE_udata4 = 0x03, - DWRF_EH_PE_udata8 = 0x04, - DWRF_EH_PE_sleb128 = 0x09, - DWRF_EH_PE_sdata2 = 0x0a, - DWRF_EH_PE_sdata4 = 0x0b, - DWRF_EH_PE_sdata8 = 0x0c, - DWRF_EH_PE_signed = 0x08, - - /* FDE flags. 
*/ - DWRF_EH_PE_pcrel = 0x10, - DWRF_EH_PE_textrel = 0x20, - DWRF_EH_PE_datarel = 0x30, - DWRF_EH_PE_funcrel = 0x40, - DWRF_EH_PE_aligned = 0x50, - - DWRF_EH_PE_indirect = 0x80 - }; +/* DWARF Exception Handling pointer encodings */ +enum { + DWRF_EH_PE_absptr = 0x00, // Absolute pointer + DWRF_EH_PE_omit = 0xff, // Omitted value + + /* Data type encodings */ + DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 + DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte + DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte + DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte + DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 + DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte + DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte + DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte + DWRF_EH_PE_signed = 0x08, // Signed flag + + /* Reference type encodings */ + DWRF_EH_PE_pcrel = 0x10, // PC-relative + DWRF_EH_PE_textrel = 0x20, // Text-relative + DWRF_EH_PE_datarel = 0x30, // Data-relative + DWRF_EH_PE_funcrel = 0x40, // Function-relative + DWRF_EH_PE_aligned = 0x50, // Aligned + DWRF_EH_PE_indirect = 0x80 // Indirect +}; +/* Additional DWARF constants for debug information */ enum { DWRF_TAG_compile_unit = 0x11 }; - enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; +enum { + DWRF_AT_name = 0x03, // Name attribute + DWRF_AT_stmt_list = 0x10, // Statement list + DWRF_AT_low_pc = 0x11, // Low PC address + DWRF_AT_high_pc = 0x12 // High PC address +}; +enum { + DWRF_FORM_addr = 0x01, // Address form + DWRF_FORM_data4 = 0x06, // 4-byte data + DWRF_FORM_string = 0x08 // String form +}; -enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 }; - -enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 }; - -enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 }; +/* Line number program opcodes */ +enum { + DWRF_LNS_extended_op = 0, // Extended opcode + DWRF_LNS_copy = 1, // Copy operation + DWRF_LNS_advance_pc = 2, // Advance program counter + DWRF_LNS_advance_line = 3 // Advance line number +}; -enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 }; +/* Line number extended opcodes */ +enum { + DWRF_LNE_end_sequence = 1, // End of sequence + DWRF_LNE_set_address = 2 // Set address +}; +/* + * Architecture-specific DWARF register numbers + * + * These constants define the register numbering scheme used by DWARF + * for each supported architecture. The numbers must match the ABI + * specification for proper stack unwinding. + */ enum { #ifdef __x86_64__ - /* Yes, the order is strange, but correct. 
*/ - DWRF_REG_AX, - DWRF_REG_DX, - DWRF_REG_CX, - DWRF_REG_BX, - DWRF_REG_SI, - DWRF_REG_DI, - DWRF_REG_BP, - DWRF_REG_SP, - DWRF_REG_8, - DWRF_REG_9, - DWRF_REG_10, - DWRF_REG_11, - DWRF_REG_12, - DWRF_REG_13, - DWRF_REG_14, - DWRF_REG_15, - DWRF_REG_RA, + /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ + DWRF_REG_AX, // RAX + DWRF_REG_DX, // RDX + DWRF_REG_CX, // RCX + DWRF_REG_BX, // RBX + DWRF_REG_SI, // RSI + DWRF_REG_DI, // RDI + DWRF_REG_BP, // RBP + DWRF_REG_SP, // RSP + DWRF_REG_8, // R8 + DWRF_REG_9, // R9 + DWRF_REG_10, // R10 + DWRF_REG_11, // R11 + DWRF_REG_12, // R12 + DWRF_REG_13, // R13 + DWRF_REG_14, // R14 + DWRF_REG_15, // R15 + DWRF_REG_RA, // Return address (RIP) #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_REG_SP = 31, - DWRF_REG_RA = 30, + /* AArch64 register numbering */ + DWRF_REG_FP = 29, // Frame Pointer + DWRF_REG_RA = 30, // Link register (return address) + DWRF_REG_SP = 31, // Stack pointer #else # error "Unsupported target architecture" #endif }; -typedef struct ELFObjectContext -{ - uint8_t* p; /* Pointer to next address in obj.space. */ - uint8_t* startp; /* Pointer to start address in obj.space. */ - uint8_t* eh_frame_p; /* Pointer to start address in obj.space. */ - uint32_t code_size; /* Size of machine code. */ +/* DWARF encoding constants used in EH frame headers */ +static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data +static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data +static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding +static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding + +// ============================================================================= +// ELF OBJECT CONTEXT +// ============================================================================= + +/* + * Context for building ELF/DWARF structures + * + * This structure maintains state while constructing DWARF unwind information. + * It acts as a simple buffer manager with pointers to track current position + * and important landmarks within the buffer. + */ +typedef struct ELFObjectContext { + uint8_t* p; // Current write position in buffer + uint8_t* startp; // Start of buffer (for offset calculations) + uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) + uint32_t code_size; // Size of the code being described } ELFObjectContext; -/* Append a null-terminated string. */ -static uint32_t -elfctx_append_string(ELFObjectContext* ctx, const char* str) -{ +/* + * EH Frame Header structure for DWARF unwinding + * + * This structure provides metadata about the DWARF unwinding information + * that follows. It's required by the perf jitdump format to enable proper + * stack unwinding during profiling. 
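+ *
+ * The offset fields are negative and relative to the header's own position
+ * in the file: for instance, perf_map_jit_write_entry() below computes
+ * eh_frame_ptr as -(eh_frame_size + 4), pointing back past the four
+ * encoding bytes above it to the start of the frame data.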
+ */ +typedef struct { + unsigned char version; // EH frame version (always 1) + unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer + unsigned char fde_count_enc; // Encoding of FDE count + unsigned char table_enc; // Encoding of table entries + int32_t eh_frame_ptr; // Pointer to EH frame data + int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) + int32_t from; // Start address of code range + int32_t to; // End address of code range +} EhFrameHeader; + +// ============================================================================= +// DWARF GENERATION UTILITIES +// ============================================================================= + +/* + * Append a null-terminated string to the ELF context buffer + * + * Args: + * ctx: ELF object context + * str: String to append (must be null-terminated) + * + * Returns: Offset from start of buffer where string was written + */ +static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { uint8_t* p = ctx->p; uint32_t ofs = (uint32_t)(p - ctx->startp); + + /* Copy string including null terminator */ do { *p++ = (uint8_t)*str; } while (*str++); + ctx->p = p; return ofs; } -/* Append a SLEB128 value. */ -static void -elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) -{ +/* + * Append a SLEB128 (Signed Little Endian Base 128) value + * + * SLEB128 is a variable-length encoding used extensively in DWARF. + * It efficiently encodes small numbers in fewer bytes. + * + * Args: + * ctx: ELF object context + * v: Signed value to encode + */ +static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { - *p++ = (uint8_t)((v & 0x7f) | 0x80); + *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit } - *p++ = (uint8_t)(v & 0x7f); + *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit + ctx->p = p; } -/* Append a ULEB128 to buffer. */ -static void -elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) -{ +/* + * Append a ULEB128 (Unsigned Little Endian Base 128) value + * + * Similar to SLEB128 but for unsigned values. + * + * Args: + * ctx: ELF object context + * v: Unsigned value to encode + */ +static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ for (; v >= 0x80; v >>= 7) { - *p++ = (char)((v & 0x7f) | 0x80); + *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit } - *p++ = (char)v; + *p++ = (char)v; // Final byte without continuation bit + ctx->p = p; } -/* Shortcuts to generate DWARF structures. */ -#define DWRF_U8(x) (*p++ = (x)) -#define DWRF_I8(x) (*(int8_t*)p = (x), p++) -#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) -#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) -#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) -#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) -#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) -#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) -#define DWRF_ALIGNNOP(s) \ - while ((uintptr_t)p & ((s)-1)) { \ - *p++ = DWRF_CFA_nop; \ +/* + * Macros for generating DWARF structures + * + * These macros provide a convenient way to write various data types + * to the DWARF buffer while automatically advancing the pointer. 
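+ *
+ * For example, DWRF_UV(624485) appends the ULEB128 byte sequence
+ * 0xE5 0x8E 0x26, and DWRF_SV(-2) appends the single SLEB128 byte 0x7E.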
+ */ +#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit +#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit +#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit +#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit +#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address +#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 +#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 +#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string + +/* Align to specified boundary with NOP instructions */ +#define DWRF_ALIGNNOP(s) \ + while ((uintptr_t)p & ((s)-1)) { \ + *p++ = DWRF_CFA_nop; \ } -#define DWRF_SECTION(name, stmt) \ - { \ - uint32_t* szp_##name = (uint32_t*)p; \ - p += 4; \ - stmt; \ - *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ + +/* Write a DWARF section with automatic size calculation */ +#define DWRF_SECTION(name, stmt) \ + { \ + uint32_t* szp_##name = (uint32_t*)p; \ + p += 4; \ + stmt; \ + *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ } -/* Initialize .eh_frame section. */ -static void -elf_init_ehframe(ELFObjectContext* ctx) -{ +// ============================================================================= +// DWARF EH FRAME GENERATION +// ============================================================================= + +/* + * Initialize DWARF .eh_frame section for a code region + * + * The .eh_frame section contains Call Frame Information (CFI) that describes + * how to unwind the stack at any point in the code. This is essential for + * proper profiling as it allows perf to generate accurate call graphs. + * + * The function generates two main components: + * 1. CIE (Common Information Entry) - describes calling conventions + * 2. FDE (Frame Description Entry) - describes specific function unwinding + * + * Args: + * ctx: ELF object context containing code size and buffer pointers + */ +static void elf_init_ehframe(ELFObjectContext* ctx) { uint8_t* p = ctx->p; - uint8_t* framep = p; - - /* Emit DWARF EH CIE. */ - DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */ - DWRF_U8(DWRF_CIE_VERSION); - DWRF_STR("zR"); /* Augmentation. */ - DWRF_UV(1); /* Code alignment factor. */ - DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */ - DWRF_U8(DWRF_REG_RA); /* Return address register. */ - DWRF_UV(1); - DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */ - DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t)); - DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1); - DWRF_ALIGNNOP(sizeof(uintptr_t)); + uint8_t* framep = p; // Remember start of frame data + + /* + * DWARF Unwind Table for Trampoline Function + * + * This section defines DWARF Call Frame Information (CFI) using encoded macros + * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function + * preserves and restores registers. This is used by profiling tools (e.g., `perf`) + * and debuggers for stack unwinding in JIT-compiled code. + * + * ------------------------------------------------- + * TO REGENERATE THIS TABLE FROM GCC OBJECTS: + * ------------------------------------------------- + * + * 1. 
Create a trampoline source file (e.g., `trampoline.c`): + * + * #include <Python.h> + * typedef PyObject* (*py_evaluator)(void*, void*, int); + * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { + * return evaluator(ts, f, throwflag); + * } + * + * 2. Compile to an object file with frame pointer preservation: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * 3. Extract DWARF unwind info from the object file: + * + * readelf -w trampoline.o + * + * Example output from `.eh_frame`: + * + * 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * 00000014 FDE cie=00000000 pc=0..14 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. + * + * ---------------------------------- + * HOW TO TRANSLATE TO DWRF_* MACROS: + * ---------------------------------- + * + * After compiling your trampoline with: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * run: + * + * readelf -w trampoline.o + * + * to inspect the generated `.eh_frame` data. You will see two main components: + * + * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. + * 2. An FDE (Frame Description Entry): function-specific unwind instructions. + * + * --------------------- + * Translating the CIE: + * --------------------- + * From `readelf -w`, you might see: + * + * 00000000 0000000000000010 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * Augmentation data: 1b + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * Map this to: + * + * DWRF_SECTION(CIE, + * DWRF_U32(0); // CIE ID (always 0 for CIEs) + * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 + * DWRF_STR("zR"); // Augmentation string "zR" + * DWRF_UV(4); // Code alignment factor = 4 + * DWRF_SV(-8); // Data alignment factor = -8 + * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) + * DWRF_UV(1); // Augmentation data length = 1 + * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers + * + * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa + * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) + * DWRF_UV(0); // Offset = 0 + * + * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary + * ) + * + * Notes: + * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. + * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
+ *
+ * ---------------------
+ * Translating the FDE:
+ * ---------------------
+ * From `readelf -w`:
+ *
+ * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
+ * DW_CFA_advance_loc: 4
+ * DW_CFA_def_cfa_offset: 16
+ * DW_CFA_offset: r29 at cfa-16
+ * DW_CFA_offset: r30 at cfa-8
+ * DW_CFA_advance_loc: 12
+ * DW_CFA_restore: r30
+ * DW_CFA_restore: r29
+ * DW_CFA_def_cfa_offset: 0
+ *
+ * Map the FDE header and instructions to:
+ *
+ * DWRF_SECTION(FDE,
+ * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here)
+ * DWRF_U32(-0x30); // Initial PC-relative location of the code
+ * DWRF_U32(ctx->code_size); // Code range covered by this FDE
+ * DWRF_U8(0); // Augmentation data length (none)
+ *
+ * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes)
+ * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16
+ * DWRF_UV(16);
+ *
+ * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
+ * DWRF_UV(2); // At offset 2 * 8 = 16 bytes
+ *
+ * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
+ * DWRF_UV(1); // At offset 1 * 8 = 8 bytes
+ *
+ * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes)
+ *
+ * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30
+ * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29
+ *
+ * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP
+ * DWRF_UV(0);
+ * )
+ *
+ * To regenerate:
+ * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
+ * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
+ * the code is in a different address space every time.
+ * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
+ * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
+ * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
+ * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset
+ * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
+ * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
+ * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
+ */

+ /*
+ * Emit DWARF EH CIE (Common Information Entry)
+ *
+ * The CIE describes the calling conventions and basic unwinding rules
+ * that apply to all functions in this compilation unit.
+ */
+ DWRF_SECTION(CIE,
+ DWRF_U32(0); // CIE ID (0 indicates this is a CIE)
+ DWRF_U8(DWRF_CIE_VERSION); // CIE version (1)
+ DWRF_STR("zR"); // Augmentation string: "z" = augmentation data present, "R" = FDE pointer encoding
+ DWRF_UV(1); // Code alignment factor
+ DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
+ DWRF_U8(DWRF_REG_RA); // Return address register number
+ DWRF_UV(1); // Augmentation data length
+ DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding
+
+ /* Initial CFI instructions - describe default calling convention */
+ DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address)
+ DWRF_UV(DWRF_REG_SP); // CFA = SP register
+ DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size
+ DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
+ DWRF_UV(1); // At offset 1 from CFA
+
+ DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary
 )
- ctx->eh_frame_p = p;
-
- /* Emit DWARF EH FDE. */
- DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE.
*/
- DWRF_U32(-0x30); /* Machine code offset relative to .text. */
- DWRF_U32(ctx->code_size); /* Machine code length. */
- DWRF_U8(0); /* Augmentation data. */
- /* Registers saved in CFRAME. */
+ ctx->eh_frame_p = p; // Remember start of FDE data
+
+ /*
+ * Emit DWARF EH FDE (Frame Description Entry)
+ *
+ * The FDE describes unwinding information specific to this function.
+ * It references the CIE and provides function-specific CFI instructions.
+ */
+ DWRF_SECTION(FDE,
+ DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference)
+ DWRF_U32(-0x30); // Machine code offset relative to .text
+ DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length)
+ DWRF_U8(0); // Augmentation data length (none)
+
+ /*
+ * Architecture-specific CFI instructions
+ *
+ * These instructions describe how registers are saved and restored
+ * during function calls. Each architecture has different calling
+ * conventions and register usage patterns.
+ */
 #ifdef __x86_64__
- DWRF_U8(DWRF_CFA_advance_loc | 4);
- DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
- DWRF_U8(DWRF_CFA_advance_loc | 6);
- DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8);
- /* Extra registers saved for JIT-compiled code. */
+ /* x86_64 calling convention unwinding rules */
+# if defined(__CET__) && (__CET__ & 1)
+ DWRF_U8(DWRF_CFA_advance_loc | 8); // Advance location by 8 bytes when CET protection is enabled
+# else
+ DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance location by 4 bytes
+# endif
+ DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset
+ DWRF_UV(16); // New offset: SP + 16
+ DWRF_U8(DWRF_CFA_advance_loc | 6); // Advance location by 6 bytes
+ DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset
+ DWRF_UV(8); // New offset: SP + 8
 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
- DWRF_U8(DWRF_CFA_advance_loc | 1);
- DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
- DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2);
- DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1);
- DWRF_U8(DWRF_CFA_advance_loc | 3);
- DWRF_U8(DWRF_CFA_offset | -(64 - 29));
- DWRF_U8(DWRF_CFA_offset | -(64 - 30));
- DWRF_U8(DWRF_CFA_def_cfa_offset);
- DWRF_UV(0);
+ /* AArch64 calling convention unwinding rules */
+ DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 instruction (stp x29, x30)
+ DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset
+ DWRF_UV(16); // CFA = SP + 16 (stack pointer after push)
+ DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Frame pointer (x29) saved
+ DWRF_UV(2); // At offset 2 from CFA (2 * 8 = 16 bytes)
+ DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Link register (x30) saved
+ DWRF_UV(1); // At offset 1 from CFA (1 * 8 = 8 bytes)
+ DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
+ DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore frame pointer (x29)
+ DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore link register (x30)
+ DWRF_U8(DWRF_CFA_def_cfa_offset); // Final CFA adjustment
+ DWRF_UV(0); // CFA = SP + 0 (stack restored)
+
 #else
 # error "Unsupported target architecture"
 #endif
- DWRF_ALIGNNOP(sizeof(uintptr_t));)
- ctx->p = p;
+ DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary
+ )
+
+ ctx->p = p; // Update context pointer to end of generated data
+}
+
+// =============================================================================
+// JITDUMP INITIALIZATION
+// =============================================================================
+
+/*
+ * Initialize the perf jitdump interface
+ *
+ * This function sets up everything needed to generate jitdump files:
+ * 1. Creates the jitdump file with a unique name
+ * 2. Maps the first page to signal perf that we're using the interface
+ * 3. Writes the jitdump header
+ * 4. Initializes synchronization primitives
+ *
+ * The memory mapping is crucial - perf detects jitdump files by scanning
+ * for processes that have mapped files matching the pattern /tmp/jit-*.dump.
+ *
+ * Returns: Pointer to initialized state, or NULL on failure
+ */
+static void* perf_map_jit_init(void) {
+    char filename[100];
+    int pid = getpid();
+
+    /* Create unique filename based on process ID */
+    snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
+
+    /* Create/open the jitdump file with appropriate permissions */
+    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
+    if (fd == -1) {
+        return NULL; // Failed to create file
+    }
+
+    /* Get system page size for memory mapping */
+    const long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size == -1) {
+        close(fd);
+        return NULL; // Failed to get page size
+    }
+
+    /*
+     * Map the first page of the jitdump file
+     *
+     * This memory mapping serves as a signal to perf that this process
+     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
+     * files that match the jitdump naming pattern.
+     *
+     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
+     */
+    perf_jit_map_state.mapped_buffer = mmap(
+        NULL,                  // Let kernel choose address
+        page_size,             // Map one page
+        PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf)
+        MAP_PRIVATE,           // Private mapping
+        fd,                    // File descriptor
+        0                      // Offset 0 (first page)
+    );
+
+    if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
+        // mmap() signals failure with MAP_FAILED, not NULL
+        perf_jit_map_state.mapped_buffer = NULL;
+        close(fd);
+        return NULL; // Memory mapping failed
+    }
+
+    perf_jit_map_state.mapped_size = page_size;
+
+    /* Convert file descriptor to FILE* for easier I/O operations */
+    perf_jit_map_state.perf_map = fdopen(fd, "w+");
+    if (perf_jit_map_state.perf_map == NULL) {
+        close(fd);
+        return NULL; // Failed to create FILE*
+    }
+
+    /*
+     * Set up file buffering for better performance
+     *
+     * We use a large buffer (2MB) because jitdump files can be written
+     * frequently during program execution. Buffering reduces system call
+     * overhead and improves overall performance.
+     */
+    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
+
+    /* Write the jitdump file header */
+    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
+
+    /*
+     * Initialize thread synchronization lock
+     *
+     * Multiple threads may attempt to write to the jitdump file
+     * simultaneously. This lock ensures thread-safe access to the
+     * global jitdump state.
+ */ + perf_jit_map_state.map_lock = PyThread_allocate_lock(); + if (perf_jit_map_state.map_lock == NULL) { + fclose(perf_jit_map_state.perf_map); + return NULL; // Failed to create lock + } + + /* Initialize code ID counter */ + perf_jit_map_state.code_id = 0; + + /* Configure trampoline API with padding information */ + trampoline_api.code_padding = PERF_JIT_CODE_PADDING; + + return &perf_jit_map_state; } +// ============================================================================= +// MAIN JITDUMP ENTRY WRITING +// ============================================================================= + +/* + * Write a complete jitdump entry for a Python function + * + * This is the main function called by Python's trampoline system whenever + * a new piece of JIT-compiled code needs to be recorded. It writes both + * the unwinding information and the code load event to the jitdump file. + * + * The function performs these steps: + * 1. Initialize jitdump system if not already done + * 2. Extract function name and filename from Python code object + * 3. Generate DWARF unwinding information + * 4. Write unwinding info event to jitdump file + * 5. Write code load event to jitdump file + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * code_addr: Address where the compiled code resides + * code_size: Size of the compiled code in bytes + * co: Python code object containing metadata + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ static void perf_map_jit_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) + unsigned int code_size, PyCodeObject *co) { - + /* Initialize jitdump system on first use */ if (perf_jit_map_state.perf_map == NULL) { void* ret = perf_map_jit_init(); if(ret == NULL){ - return; + return; // Initialization failed, silently abort } } + /* + * Extract function information from Python code object + * + * We create a human-readable function name by combining the qualified + * name (includes class/module context) with the filename. This helps + * developers identify functions in perf reports. + */ const char *entry = ""; if (co->co_qualname != NULL) { entry = PyUnicode_AsUTF8(co->co_qualname); } + const char *filename = ""; if (co->co_filename != NULL) { filename = PyUnicode_AsUTF8(co->co_filename); } - + /* + * Create formatted function name for perf display + * + * Format: "py::<function_name>:<filename>" + * The "py::" prefix helps identify Python functions in mixed-language + * profiles (e.g., when profiling C extensions alongside Python code). + */ size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); if (perf_map_entry == NULL) { - return; + return; // Memory allocation failed } snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); @@ -528,90 +1081,185 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, uword base = (uword)code_addr; uword size = code_size; - // Write the code unwinding info event. - - // Create unwinding information (eh frame) + /* + * Generate DWARF unwinding information + * + * DWARF data is essential for proper stack unwinding during profiling. + * Without it, perf cannot generate accurate call graphs, especially + * in optimized code where frame pointers may be omitted. 
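+ *
+ * One way to sanity-check the CFI emitted below (an illustration, not
+ * part of this patch) is to run perf inject --jit as sketched in the
+ * file header and then dump the frames of one of the resulting DSOs,
+ * e.g. readelf -wf /tmp/jitted-<pid>-<n>.so.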
+ */ ELFObjectContext ctx; - char buffer[1024]; + char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) ctx.code_size = code_size; ctx.startp = ctx.p = (uint8_t*)buffer; + + /* Generate EH frame (Exception Handling frame) data */ elf_init_ehframe(&ctx); int eh_frame_size = ctx.p - ctx.startp; - // Populate the unwind info event for perf + /* + * Write Code Unwinding Information Event + * + * This event must be written before the code load event to ensure + * perf has the unwinding information available when it processes + * the code region. + */ CodeUnwindingInfoEvent ev2; ev2.base.event = PerfUnwindingInfo; ev2.base.time_stamp = get_current_monotonic_ticks(); ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; - // Ensure we have enough space between DSOs when perf maps them + + /* Verify we don't exceed our padding budget */ assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING); + ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); - ev2.mapped_size = round_up(ev2.unwind_data_size, 16); + ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment + + /* Calculate total event size with padding */ int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; - int padding_size = round_up(content_size, 8) - content_size; + int padding_size = round_up(content_size, 8) - content_size; // 8-byte align ev2.base.size = content_size + padding_size; - perf_map_jit_write_fully(&ev2, sizeof(ev2)); + /* Write the unwinding info event header */ + perf_map_jit_write_fully(&ev2, sizeof(ev2)); - // Populate the eh Frame header + /* + * Write EH Frame Header + * + * The EH frame header provides metadata about the DWARF unwinding + * information that follows. It includes pointers and counts that + * help perf navigate the unwinding data efficiently. + */ EhFrameHeader f; f.version = 1; - f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; - f.fde_count_enc = DwarfUData4; - f.table_enc = DwarfSData4 | DwarfDataRel; + f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte + f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count + f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte + + /* Calculate relative offsets for EH frame navigation */ f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); - f.eh_fde_count = 1; + f.eh_fde_count = 1; // We generate exactly one FDE per function f.from = -(round_up(code_size, 8) + eh_frame_size); + int cie_size = ctx.eh_frame_p - ctx.startp; f.to = -(eh_frame_size - cie_size); + /* Write EH frame data and header */ perf_map_jit_write_fully(ctx.startp, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); + /* Write padding to maintain alignment */ char padding_bytes[] = "\0\0\0\0\0\0\0\0"; perf_map_jit_write_fully(&padding_bytes, padding_size); - // Write the code load event. + /* + * Write Code Load Event + * + * This event tells perf about the new code region. 
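round_up() itself is not shown in this hunk; assuming the conventional power-of-two definition, the 16-byte mapped_size and the 8-byte event padding computed above behave like this:

#include <assert.h>
#include <stddef.h>

/* Round v up to the next multiple of align (align must be a power of two). */
static size_t
round_up(size_t v, size_t align)
{
    return (v + align - 1) & ~(align - 1);
}

int main(void)
{
    size_t content_size = 93;                /* event + header + eh_frame */
    size_t padding = round_up(content_size, 8) - content_size;
    assert(padding == 3);                    /* 93 -> 96, pad with 3 bytes */
    assert(round_up(17, 16) == 32);          /* mapped_size-style rounding */
    return 0;
}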
It includes: + * - Memory addresses and sizes + * - Process and thread identification + * - Function name for symbol resolution + * - The actual machine code bytes + */ CodeLoadEvent ev; ev.base.event = PerfLoad; ev.base.size = sizeof(ev) + (name_length+1) + size; ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); - ev.thread_id = syscall(SYS_gettid); - ev.vma = base; - ev.code_address = base; + ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call + ev.vma = base; // Virtual memory address + ev.code_address = base; // Same as VMA for our use case ev.code_size = size; + + /* Assign unique code ID and increment counter */ perf_jit_map_state.code_id += 1; ev.code_id = perf_jit_map_state.code_id; + /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); - perf_map_jit_write_fully(perf_map_entry, name_length+1); - perf_map_jit_write_fully((void*)(base), size); - return; + perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator + perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code + + /* Clean up allocated memory */ + PyMem_RawFree(perf_map_entry); } +// ============================================================================= +// CLEANUP AND FINALIZATION +// ============================================================================= + +/* + * Finalize and cleanup the perf jitdump system + * + * This function is called when Python is shutting down or when the + * perf trampoline system is being disabled. It ensures all resources + * are properly released and all buffered data is flushed to disk. + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * + * Returns: 0 on success + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ static int perf_map_jit_fini(void* state) { + /* + * Close jitdump file with proper synchronization + * + * We need to acquire the lock to ensure no other threads are + * writing to the file when we close it. This prevents corruption + * and ensures all data is properly flushed. + */ if (perf_jit_map_state.perf_map != NULL) { - // close the file PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); - fclose(perf_jit_map_state.perf_map); + fclose(perf_jit_map_state.perf_map); // This also flushes buffers PyThread_release_lock(perf_jit_map_state.map_lock); - // clean up the lock and state + /* Clean up synchronization primitive */ PyThread_free_lock(perf_jit_map_state.map_lock); perf_jit_map_state.perf_map = NULL; } + + /* + * Unmap the memory region + * + * This removes the signal to perf that we were generating JIT code. + * After this point, perf will no longer detect this process as + * having JIT capabilities. + */ if (perf_jit_map_state.mapped_buffer != NULL) { munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); + perf_jit_map_state.mapped_buffer = NULL; } + + /* Clear global state reference */ trampoline_api.state = NULL; - return 0; + + return 0; // Success } +// ============================================================================= +// PUBLIC API EXPORT +// ============================================================================= + +/* + * Python Perf Callbacks Structure + * + * This structure defines the callback interface that Python's trampoline + * system uses to integrate with perf profiling. 
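get_current_monotonic_ticks() is also outside this hunk. Assuming it returns CLOCK_MONOTONIC time in nanoseconds (the clock that "perf record -k mono" pairs jitdump timestamps with), an equivalent sketch is:

#include <stdint.h>
#include <time.h>

/* Monotonic nanosecond timestamp, comparable with perf's samples when
 * the trace is recorded with "perf record -k mono". */
static uint64_t
monotonic_ticks(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

int main(void)
{
    return monotonic_ticks() == 0;  /* trivially exercise the helper */
}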
It contains function + * pointers for initialization, event writing, and cleanup. + * + * CRITICAL: This structure and its contents are part of Python's internal + * API. The function signatures and behavior must remain stable to maintain + * compatibility with the Python interpreter's perf integration system. + * + * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h + */ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { - &perf_map_jit_init, - &perf_map_jit_write_entry, - &perf_map_jit_fini, + &perf_map_jit_init, // Initialization function + &perf_map_jit_write_entry, // Event writing function + &perf_map_jit_fini, // Cleanup function }; -#endif +#endif /* PY_HAVE_PERF_TRAMPOLINE */
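The exported table is an ordinary function-pointer vtable, which is what lets the trampoline code swap jitdump support in and out at runtime. A toy of the same registration shape (the struct here is a simplified stand-in, not the real _PyPerf_Callbacks definition):

#include <stddef.h>

typedef struct {
    void *(*init)(void);
    void  (*write_entry)(void *state, const void *code_addr,
                         unsigned int code_size, void *code_object);
    int   (*fini)(void *state);
} perf_callbacks_sim;

static void *noop_init(void) { return NULL; }
static void noop_write(void *st, const void *a, unsigned int n, void *co)
{ (void)st; (void)a; (void)n; (void)co; }
static int noop_fini(void *st) { (void)st; return 0; }

/* Selecting a backend is a single struct assignment. */
static perf_callbacks_sim active = { &noop_init, &noop_write, &noop_fini };

int main(void)
{
    void *state = active.init();
    active.write_entry(state, NULL, 0, NULL);
    return active.fini(state);
}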
\ No newline at end of file diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 8394245d373..724fda63511 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1283,7 +1283,7 @@ init_interp_main(PyThreadState *tstate) if (is_main_interp) { /* Initialize warnings. */ PyObject *warnoptions; - if (_PySys_GetOptionalAttrString("warnoptions", &warnoptions) < 0) { + if (PySys_GetOptionalAttrString("warnoptions", &warnoptions) < 0) { return _PyStatus_ERR("can't initialize warnings"); } if (warnoptions != NULL && PyList_Check(warnoptions) && @@ -1806,7 +1806,7 @@ flush_std_files(void) PyObject *file; int status = 0; - if (_PySys_GetOptionalAttr(&_Py_ID(stdout), &file) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stdout), &file) < 0) { status = -1; } else if (file != NULL && file != Py_None && !file_is_closed(file)) { @@ -1819,7 +1819,7 @@ flush_std_files(void) } Py_XDECREF(file); - if (_PySys_GetOptionalAttr(&_Py_ID(stderr), &file) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stderr), &file) < 0) { PyErr_Clear(); status = -1; } @@ -3046,7 +3046,7 @@ _Py_FatalError_PrintExc(PyThreadState *tstate) } PyObject *ferr; - if (_PySys_GetOptionalAttr(&_Py_ID(stderr), &ferr) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stderr), &ferr) < 0) { _PyErr_Clear(tstate); } if (ferr == NULL || ferr == Py_None) { diff --git a/Python/pystate.c b/Python/pystate.c index 1ac13440085..0544b15aad1 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -69,7 +69,12 @@ to avoid the expense of doing their own locking). #ifdef HAVE_THREAD_LOCAL +/* The attached thread state for the current thread. */ _Py_thread_local PyThreadState *_Py_tss_tstate = NULL; + +/* The "bound" thread state used by PyGILState_Ensure(), + also known as a "gilstate." */ +_Py_thread_local PyThreadState *_Py_tss_gilstate = NULL; #endif static inline PyThreadState * @@ -118,79 +123,9 @@ _PyThreadState_GetCurrent(void) } -//------------------------------------------------ -// the thread state bound to the current OS thread -//------------------------------------------------ - -static inline int -tstate_tss_initialized(Py_tss_t *key) -{ - return PyThread_tss_is_created(key); -} - -static inline int -tstate_tss_init(Py_tss_t *key) -{ - assert(!tstate_tss_initialized(key)); - return PyThread_tss_create(key); -} - -static inline void -tstate_tss_fini(Py_tss_t *key) -{ - assert(tstate_tss_initialized(key)); - PyThread_tss_delete(key); -} - -static inline PyThreadState * -tstate_tss_get(Py_tss_t *key) -{ - assert(tstate_tss_initialized(key)); - return (PyThreadState *)PyThread_tss_get(key); -} - -static inline int -tstate_tss_set(Py_tss_t *key, PyThreadState *tstate) -{ - assert(tstate != NULL); - assert(tstate_tss_initialized(key)); - return PyThread_tss_set(key, (void *)tstate); -} - -static inline int -tstate_tss_clear(Py_tss_t *key) -{ - assert(tstate_tss_initialized(key)); - return PyThread_tss_set(key, (void *)NULL); -} - -#ifdef HAVE_FORK -/* Reset the TSS key - called by PyOS_AfterFork_Child(). - * This should not be necessary, but some - buggy - pthread implementations - * don't reset TSS upon fork(), see issue #10517. - */ -static PyStatus -tstate_tss_reinit(Py_tss_t *key) -{ - if (!tstate_tss_initialized(key)) { - return _PyStatus_OK(); - } - PyThreadState *tstate = tstate_tss_get(key); - - tstate_tss_fini(key); - if (tstate_tss_init(key) != 0) { - return _PyStatus_NO_MEMORY(); - } - - /* If the thread had an associated auto thread state, reassociate it with - * the new key. 
*/ - if (tstate && tstate_tss_set(key, tstate) != 0) { - return _PyStatus_ERR("failed to re-set autoTSSkey"); - } - return _PyStatus_OK(); -} -#endif - +//--------------------------------------------- +// The thread state used by PyGILState_Ensure() +//--------------------------------------------- /* The stored thread state is set by bind_tstate() (AKA PyThreadState_Bind()). @@ -198,36 +133,23 @@ tstate_tss_reinit(Py_tss_t *key) The GIL does not need to be held for these. */ -#define gilstate_tss_initialized(runtime) \ - tstate_tss_initialized(&(runtime)->autoTSSkey) -#define gilstate_tss_init(runtime) \ - tstate_tss_init(&(runtime)->autoTSSkey) -#define gilstate_tss_fini(runtime) \ - tstate_tss_fini(&(runtime)->autoTSSkey) -#define gilstate_tss_get(runtime) \ - tstate_tss_get(&(runtime)->autoTSSkey) -#define _gilstate_tss_set(runtime, tstate) \ - tstate_tss_set(&(runtime)->autoTSSkey, tstate) -#define _gilstate_tss_clear(runtime) \ - tstate_tss_clear(&(runtime)->autoTSSkey) -#define gilstate_tss_reinit(runtime) \ - tstate_tss_reinit(&(runtime)->autoTSSkey) +static inline PyThreadState * +gilstate_get(void) +{ + return _Py_tss_gilstate; +} static inline void -gilstate_tss_set(_PyRuntimeState *runtime, PyThreadState *tstate) +gilstate_set(PyThreadState *tstate) { - assert(tstate != NULL && tstate->interp->runtime == runtime); - if (_gilstate_tss_set(runtime, tstate) != 0) { - Py_FatalError("failed to set current tstate (TSS)"); - } + assert(tstate != NULL); + _Py_tss_gilstate = tstate; } static inline void -gilstate_tss_clear(_PyRuntimeState *runtime) +gilstate_clear(void) { - if (_gilstate_tss_clear(runtime) != 0) { - Py_FatalError("failed to clear current tstate (TSS)"); - } + _Py_tss_gilstate = NULL; } @@ -253,7 +175,7 @@ bind_tstate(PyThreadState *tstate) assert(tstate_is_alive(tstate) && !tstate->_status.bound); assert(!tstate->_status.unbound); // just in case assert(!tstate->_status.bound_gilstate); - assert(tstate != gilstate_tss_get(tstate->interp->runtime)); + assert(tstate != gilstate_get()); assert(!tstate->_status.active); assert(tstate->thread_id == 0); assert(tstate->native_thread_id == 0); @@ -328,14 +250,13 @@ bind_gilstate_tstate(PyThreadState *tstate) // XXX assert(!tstate->_status.active); assert(!tstate->_status.bound_gilstate); - _PyRuntimeState *runtime = tstate->interp->runtime; - PyThreadState *tcur = gilstate_tss_get(runtime); + PyThreadState *tcur = gilstate_get(); assert(tstate != tcur); if (tcur != NULL) { tcur->_status.bound_gilstate = 0; } - gilstate_tss_set(runtime, tstate); + gilstate_set(tstate); tstate->_status.bound_gilstate = 1; } @@ -347,9 +268,8 @@ unbind_gilstate_tstate(PyThreadState *tstate) assert(tstate_is_bound(tstate)); // XXX assert(!tstate->_status.active); assert(tstate->_status.bound_gilstate); - assert(tstate == gilstate_tss_get(tstate->interp->runtime)); - - gilstate_tss_clear(tstate->interp->runtime); + assert(tstate == gilstate_get()); + gilstate_clear(); tstate->_status.bound_gilstate = 0; } @@ -373,7 +293,7 @@ holds_gil(PyThreadState *tstate) // (and tstate->interp->runtime->ceval.gil.locked).
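The net effect of this pystate.c rewrite is trading a runtime-created TSS key for a C thread-local, which needs no initialization, cannot fail on get/set, and (unlike the buggy pthread TSS implementations noted in the deleted fork comment) carries its value into a forked child without re-registration. The before/after shape in miniature (simulated types; pthreads for the "before" side):

#include <pthread.h>
#include <stddef.h>

typedef struct sim_tstate sim_tstate;   /* stand-in for PyThreadState */

/* Before: a dynamically created key; creation can fail, and the key
 * must be torn down and (on some platforms) re-created after fork(). */
static pthread_key_t gilstate_key;
static int
gilstate_key_init(void)
{
    return pthread_key_create(&gilstate_key, NULL);
}

/* After: one thread-local pointer; access cannot fail. */
static _Thread_local sim_tstate *gilstate = NULL;
static sim_tstate *gilstate_get_sim(void) { return gilstate; }
static void gilstate_set_sim(sim_tstate *ts) { gilstate = ts; }

int main(void)
{
    (void)gilstate_key_init();
    gilstate_set_sim(NULL);
    return gilstate_get_sim() != NULL;
}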
assert(tstate != NULL); /* Must be the tstate for this thread */ - assert(tstate == gilstate_tss_get(tstate->interp->runtime)); + assert(tstate == gilstate_get()); return tstate == current_fast_get(); } @@ -469,16 +389,6 @@ _PyRuntimeState_Init(_PyRuntimeState *runtime) return status; } - if (gilstate_tss_init(runtime) != 0) { - _PyRuntimeState_Fini(runtime); - return _PyStatus_NO_MEMORY(); - } - - if (PyThread_tss_create(&runtime->trashTSSkey) != 0) { - _PyRuntimeState_Fini(runtime); - return _PyStatus_NO_MEMORY(); - } - init_runtime(runtime, open_code_hook, open_code_userdata, audit_hook_head, unicode_next_index); @@ -492,14 +402,7 @@ _PyRuntimeState_Fini(_PyRuntimeState *runtime) /* The count is cleared by _Py_FinalizeRefTotal(). */ assert(runtime->object_state.interpreter_leaks == 0); #endif - - if (gilstate_tss_initialized(runtime)) { - gilstate_tss_fini(runtime); - } - - if (PyThread_tss_is_created(&runtime->trashTSSkey)) { - PyThread_tss_delete(&runtime->trashTSSkey); - } + gilstate_clear(); } #ifdef HAVE_FORK @@ -532,18 +435,6 @@ _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime) _PyTypes_AfterFork(); - PyStatus status = gilstate_tss_reinit(runtime); - if (_PyStatus_EXCEPTION(status)) { - return status; - } - - if (PyThread_tss_is_created(&runtime->trashTSSkey)) { - PyThread_tss_delete(&runtime->trashTSSkey); - } - if (PyThread_tss_create(&runtime->trashTSSkey) != 0) { - return _PyStatus_NO_MEMORY(); - } - _PyThread_AfterFork(&runtime->threads); return _PyStatus_OK(); @@ -676,6 +567,7 @@ init_interpreter(PyInterpreterState *interp, } interp->sys_profile_initialized = false; interp->sys_trace_initialized = false; + interp->_code_object_generation = 0; interp->jit = false; interp->executor_list_head = NULL; interp->executor_deletion_list_head = NULL; @@ -886,6 +778,10 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) for (int t = 0; t < PY_MONITORING_TOOL_IDS; t++) { Py_CLEAR(interp->monitoring_tool_names[t]); } + interp->_code_object_generation = 0; +#ifdef Py_GIL_DISABLED + interp->tlbc_indices.tlbc_generation = 0; +#endif PyConfig_Clear(&interp->config); _PyCodec_Fini(interp); @@ -1393,10 +1289,8 @@ interp_look_up_id(_PyRuntimeState *runtime, int64_t requested_id) { PyInterpreterState *interp = runtime->interpreters.head; while (interp != NULL) { - int64_t id = PyInterpreterState_GetID(interp); - if (id < 0) { - return NULL; - } + int64_t id = interp->id; + assert(id >= 0); if (requested_id == id) { return interp; } @@ -1457,9 +1351,6 @@ tstate_is_alive(PyThreadState *tstate) // lifecycle //---------- -/* Minimum size of data stack chunk */ -#define DATA_STACK_CHUNK_SIZE (16*1024) - static _PyStackChunk* allocate_chunk(int size_in_bytes, _PyStackChunk* previous) { @@ -1671,7 +1562,7 @@ _PyThreadState_NewBound(PyInterpreterState *interp, int whence) bind_tstate(tstate); // This makes sure there's a gilstate tstate bound // as soon as possible. - if (gilstate_tss_get(tstate->interp->runtime) == NULL) { + if (gilstate_get() == NULL) { bind_gilstate_tstate(tstate); } } @@ -1908,9 +1799,14 @@ tstate_delete_common(PyThreadState *tstate, int release_gil) static void zapthreads(PyInterpreterState *interp) { + PyThreadState *tstate; /* No need to lock the mutex here because this should only happen - when the threads are all really dead (XXX famous last words). */ - _Py_FOR_EACH_TSTATE_UNLOCKED(interp, tstate) { + when the threads are all really dead (XXX famous last words). 
+ + Cannot use _Py_FOR_EACH_TSTATE_UNLOCKED because we are freeing + the thread states here. + */ + while ((tstate = interp->threads.head) != NULL) { tstate_verify_not_active(tstate); tstate_delete_common(tstate, 0); free_threadstate((_PyThreadStateImpl *)tstate); @@ -2092,7 +1988,7 @@ tstate_activate(PyThreadState *tstate) assert(!tstate->_status.active); assert(!tstate->_status.bound_gilstate || - tstate == gilstate_tss_get((tstate->interp->runtime))); + tstate == gilstate_get()); if (!tstate->_status.bound_gilstate) { bind_gilstate_tstate(tstate); } @@ -2560,7 +2456,7 @@ _PyThreadState_Bind(PyThreadState *tstate) bind_tstate(tstate); // This makes sure there's a gilstate tstate bound // as soon as possible. - if (gilstate_tss_get(tstate->interp->runtime) == NULL) { + if (gilstate_get() == NULL) { bind_gilstate_tstate(tstate); } } @@ -2762,7 +2658,7 @@ _PyGILState_Init(PyInterpreterState *interp) return _PyStatus_OK(); } _PyRuntimeState *runtime = interp->runtime; - assert(gilstate_tss_get(runtime) == NULL); + assert(gilstate_get() == NULL); assert(runtime->gilstate.autoInterpreterState == NULL); runtime->gilstate.autoInterpreterState = interp; return _PyStatus_OK(); @@ -2798,7 +2694,7 @@ _PyGILState_SetTstate(PyThreadState *tstate) _PyRuntimeState *runtime = tstate->interp->runtime; assert(runtime->gilstate.autoInterpreterState == tstate->interp); - assert(gilstate_tss_get(runtime) == tstate); + assert(gilstate_get() == tstate); assert(tstate->gilstate_counter == 1); #endif } @@ -2814,11 +2710,7 @@ _PyGILState_GetInterpreterStateUnsafe(void) PyThreadState * PyGILState_GetThisThreadState(void) { - _PyRuntimeState *runtime = &_PyRuntime; - if (!gilstate_tss_initialized(runtime)) { - return NULL; - } - return gilstate_tss_get(runtime); + return gilstate_get(); } int @@ -2829,16 +2721,12 @@ PyGILState_Check(void) return 1; } - if (!gilstate_tss_initialized(runtime)) { - return 1; - } - PyThreadState *tstate = current_fast_get(); if (tstate == NULL) { return 0; } - PyThreadState *tcur = gilstate_tss_get(runtime); + PyThreadState *tcur = gilstate_get(); return (tstate == tcur); } @@ -2853,12 +2741,17 @@ PyGILState_Ensure(void) called Py_Initialize(). */ /* Ensure that _PyEval_InitThreads() and _PyGILState_Init() have been - called by Py_Initialize() */ - assert(_PyEval_ThreadsInitialized()); - assert(gilstate_tss_initialized(runtime)); - assert(runtime->gilstate.autoInterpreterState != NULL); + called by Py_Initialize() - PyThreadState *tcur = gilstate_tss_get(runtime); + TODO: This isn't thread-safe. There's no protection here against + concurrent finalization of the interpreter; it's simply a guard + for *after* the interpreter has finalized. 
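The zapthreads() change is a general pattern worth noting: you cannot walk a linked list with a foreach helper while freeing its nodes, but repeatedly re-reading the head is safe because each iteration starts from a node the loop has not yet touched. The same shape in isolation:

#include <stdlib.h>

struct node { struct node *next; };

/* Pop-from-head destruction: *head is re-read every iteration, so the
 * loop never holds a pointer into memory it has already freed. */
static void
zap_all(struct node **head)
{
    struct node *n;
    while ((n = *head) != NULL) {
        *head = n->next;
        free(n);
    }
}

int main(void)
{
    struct node *head = NULL;
    for (int i = 0; i < 3; i++) {
        struct node *n = malloc(sizeof(*n));
        if (n == NULL) return 1;
        n->next = head;
        head = n;
    }
    zap_all(&head);
    return head != NULL;
}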
+ */ + if (!_PyEval_ThreadsInitialized() || runtime->gilstate.autoInterpreterState == NULL) { + PyThread_hang_thread(); + } + + PyThreadState *tcur = gilstate_get(); int has_gil; if (tcur == NULL) { /* Create a new Python thread state for this thread */ @@ -2898,8 +2791,7 @@ PyGILState_Ensure(void) void PyGILState_Release(PyGILState_STATE oldstate) { - _PyRuntimeState *runtime = &_PyRuntime; - PyThreadState *tstate = gilstate_tss_get(runtime); + PyThreadState *tstate = gilstate_get(); if (tstate == NULL) { Py_FatalError("auto-releasing thread-state, " "but no thread-state for this thread"); @@ -3007,7 +2899,7 @@ _PyInterpreterState_HasFeature(PyInterpreterState *interp, unsigned long feature static PyObject ** push_chunk(PyThreadState *tstate, int size) { - int allocate_size = DATA_STACK_CHUNK_SIZE; + int allocate_size = _PY_DATA_STACK_CHUNK_SIZE; while (allocate_size < (int)sizeof(PyObject*)*(size + MINIMUM_OVERHEAD)) { allocate_size *= 2; } diff --git a/Python/pythonrun.c b/Python/pythonrun.c index f67b72aa91f..8f1c78bf831 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -114,7 +114,7 @@ _PyRun_InteractiveLoopObject(FILE *fp, PyObject *filename, PyCompilerFlags *flag } PyObject *v; - if (_PySys_GetOptionalAttr(&_Py_ID(ps1), &v) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(ps1), &v) < 0) { PyErr_Print(); return -1; } @@ -128,7 +128,7 @@ _PyRun_InteractiveLoopObject(FILE *fp, PyObject *filename, PyCompilerFlags *flag } } Py_XDECREF(v); - if (_PySys_GetOptionalAttr(&_Py_ID(ps2), &v) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(ps2), &v) < 0) { PyErr_Print(); return -1; } @@ -206,7 +206,7 @@ pyrun_one_parse_ast(FILE *fp, PyObject *filename, PyObject *encoding_obj = NULL; const char *encoding = NULL; if (fp == stdin) { - if (_PySys_GetOptionalAttr(&_Py_ID(stdin), &attr) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stdin), &attr) < 0) { PyErr_Clear(); } else if (attr != NULL && attr != Py_None) { @@ -226,7 +226,7 @@ pyrun_one_parse_ast(FILE *fp, PyObject *filename, // Get sys.ps1 (as UTF-8) PyObject *ps1_obj = NULL; const char *ps1 = ""; - if (_PySys_GetOptionalAttr(&_Py_ID(ps1), &attr) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(ps1), &attr) < 0) { PyErr_Clear(); } else if (attr != NULL) { @@ -247,7 +247,7 @@ pyrun_one_parse_ast(FILE *fp, PyObject *filename, // Get sys.ps2 (as UTF-8) PyObject *ps2_obj = NULL; const char *ps2 = ""; - if (_PySys_GetOptionalAttr(&_Py_ID(ps2), &attr) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(ps2), &attr) < 0) { PyErr_Clear(); } else if (attr != NULL) { @@ -658,7 +658,7 @@ _Py_HandleSystemExitAndKeyboardInterrupt(int *exitcode_p) } PyObject *sys_stderr; - if (_PySys_GetOptionalAttr(&_Py_ID(stderr), &sys_stderr) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stderr), &sys_stderr) < 0) { PyErr_Clear(); } else if (sys_stderr != NULL && sys_stderr != Py_None) { @@ -722,7 +722,7 @@ _PyErr_PrintEx(PyThreadState *tstate, int set_sys_last_vars) _PyErr_Clear(tstate); } } - if (_PySys_GetOptionalAttr(&_Py_ID(excepthook), &hook) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(excepthook), &hook) < 0) { PyErr_Clear(); } if (_PySys_Audit(tstate, "sys.excepthook", "OOOO", hook ? 
hook : Py_None, @@ -1197,7 +1197,7 @@ void PyErr_Display(PyObject *unused, PyObject *value, PyObject *tb) { PyObject *file; - if (_PySys_GetOptionalAttr(&_Py_ID(stderr), &file) < 0) { + if (PySys_GetOptionalAttr(&_Py_ID(stderr), &file) < 0) { PyObject *exc = PyErr_GetRaisedException(); _PyObject_Dump(value); fprintf(stderr, "lost sys.stderr\n"); @@ -1321,7 +1321,7 @@ static void flush_io_stream(PyThreadState *tstate, PyObject *name) { PyObject *f; - if (_PySys_GetOptionalAttr(name, &f) < 0) { + if (PySys_GetOptionalAttr(name, &f) < 0) { PyErr_Clear(); } if (f != NULL) { diff --git a/Python/remote_debug.h b/Python/remote_debug.h index edc77c30291..6cbf1c8deaa 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -35,7 +35,7 @@ extern "C" { # include <sys/mman.h> #endif -#if defined(__APPLE__) && TARGET_OS_OSX +#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX # include <libproc.h> # include <mach-o/fat.h> # include <mach-o/loader.h> @@ -73,27 +73,101 @@ extern "C" { # define HAVE_PROCESS_VM_READV 0 #endif +#define _set_debug_exception_cause(exception, format, ...) \ + do { \ + if (!PyErr_ExceptionMatches(PyExc_PermissionError)) { \ + PyThreadState *tstate = _PyThreadState_GET(); \ + if (!_PyErr_Occurred(tstate)) { \ + _PyErr_Format(tstate, exception, format, ##__VA_ARGS__); \ + } else { \ + _PyErr_FormatFromCause(exception, format, ##__VA_ARGS__); \ + } \ + } \ + } while (0) + +static inline size_t +get_page_size(void) { + size_t page_size = 0; + if (page_size == 0) { +#ifdef MS_WINDOWS + SYSTEM_INFO si; + GetSystemInfo(&si); + page_size = si.dwPageSize; +#else + page_size = (size_t)getpagesize(); +#endif + } + return page_size; +} + +typedef struct page_cache_entry { + uintptr_t page_addr; // page-aligned base address + char *data; + int valid; + struct page_cache_entry *next; +} page_cache_entry_t; + +#define MAX_PAGES 1024 + // Define a platform-independent process handle structure typedef struct { pid_t pid; -#ifdef MS_WINDOWS +#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX + mach_port_t task; +#elif defined(MS_WINDOWS) HANDLE hProcess; #endif + page_cache_entry_t pages[MAX_PAGES]; + Py_ssize_t page_size; } proc_handle_t; +static void +_Py_RemoteDebug_FreePageCache(proc_handle_t *handle) +{ + for (int i = 0; i < MAX_PAGES; i++) { + PyMem_RawFree(handle->pages[i].data); + handle->pages[i].data = NULL; + handle->pages[i].valid = 0; + } +} + +void +_Py_RemoteDebug_ClearCache(proc_handle_t *handle) +{ + for (int i = 0; i < MAX_PAGES; i++) { + handle->pages[i].valid = 0; + } +} + +#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX +static mach_port_t pid_to_task(pid_t pid); +#endif + // Initialize the process handle static int _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { handle->pid = pid; -#ifdef MS_WINDOWS +#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX + handle->task = pid_to_task(handle->pid); + if (handle->task == 0) { + _set_debug_exception_cause(PyExc_RuntimeError, "Failed to initialize macOS process handle"); + return -1; + } +#elif defined(MS_WINDOWS) handle->hProcess = OpenProcess( PROCESS_VM_READ | PROCESS_VM_WRITE | PROCESS_VM_OPERATION | PROCESS_QUERY_INFORMATION, FALSE, pid); if (handle->hProcess == NULL) { PyErr_SetFromWindowsErr(0); + _set_debug_exception_cause(PyExc_RuntimeError, "Failed to initialize Windows process handle"); return -1; } #endif + handle->page_size = get_page_size(); + for (int i = 0; i < MAX_PAGES; i++) { + handle->pages[i].data = NULL; + 
handle->pages[i].valid = 0; + } return 0; } @@ -107,9 +181,10 @@ _Py_RemoteDebug_CleanupProcHandle(proc_handle_t *handle) { } #endif handle->pid = 0; + _Py_RemoteDebug_FreePageCache(handle); } -#if defined(__APPLE__) && TARGET_OS_OSX +#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX static uintptr_t return_section_address64( @@ -148,8 +223,10 @@ return_section_address64( &object_name ); if (ret != KERN_SUCCESS) { - PyErr_SetString( - PyExc_RuntimeError, "Cannot get any more VM maps.\n"); + PyErr_Format(PyExc_RuntimeError, + "mach_vm_region failed while parsing 64-bit Mach-O binary " + "at base address 0x%lx (kern_return_t: %d)", + base, ret); return 0; } } @@ -169,9 +246,6 @@ return_section_address64( cmd = (struct segment_command_64*)((void*)cmd + cmd->cmdsize); } - // We should not be here, but if we are there, we should say about this - PyErr_SetString( - PyExc_RuntimeError, "Cannot find section address.\n"); return 0; } @@ -212,8 +286,10 @@ return_section_address32( &object_name ); if (ret != KERN_SUCCESS) { - PyErr_SetString( - PyExc_RuntimeError, "Cannot get any more VM maps.\n"); + PyErr_Format(PyExc_RuntimeError, + "mach_vm_region failed while parsing 32-bit Mach-O binary " + "at base address 0x%lx (kern_return_t: %d)", + base, ret); return 0; } } @@ -233,9 +309,6 @@ return_section_address32( cmd = (struct segment_command*)((void*)cmd + cmd->cmdsize); } - // We should not be here, but if we are there, we should say about this - PyErr_SetString( - PyExc_RuntimeError, "Cannot find section address.\n"); return 0; } @@ -253,8 +326,20 @@ return_section_address_fat( int is_abi64; size_t cpu_size = sizeof(cpu), abi64_size = sizeof(is_abi64); - sysctlbyname("hw.cputype", &cpu, &cpu_size, NULL, 0); - sysctlbyname("hw.cpu64bit_capable", &is_abi64, &abi64_size, NULL, 0); + if (sysctlbyname("hw.cputype", &cpu, &cpu_size, NULL, 0) != 0) { + PyErr_Format(PyExc_OSError, + "Failed to determine CPU type via sysctlbyname " + "for fat binary analysis at 0x%lx: %s", + base, strerror(errno)); + return 0; + } + if (sysctlbyname("hw.cpu64bit_capable", &is_abi64, &abi64_size, NULL, 0) != 0) { + PyErr_Format(PyExc_OSError, + "Failed to determine CPU ABI capability via sysctlbyname " + "for fat binary analysis at 0x%lx: %s", + base, strerror(errno)); + return 0; + } cpu |= is_abi64 * CPU_ARCH_ABI64; @@ -285,13 +370,18 @@ return_section_address_fat( return return_section_address64(section, proc_ref, base, (void*)hdr); default: - PyErr_SetString(PyExc_RuntimeError, "Unknown Mach-O magic in fat binary.\n"); + PyErr_Format(PyExc_RuntimeError, + "Unknown Mach-O magic number 0x%x in fat binary architecture %u at base 0x%lx", + hdr->magic, i, base); return 0; } } } - PyErr_SetString(PyExc_RuntimeError, "No matching architecture found in fat binary.\n"); + PyErr_Format(PyExc_RuntimeError, + "No matching architecture found for CPU type 0x%x " + "in fat binary at base 0x%lx (%u architectures examined)", + cpu, base, nfat_arch); return 0; } @@ -300,20 +390,26 @@ search_section_in_file(const char* secname, char* path, uintptr_t base, mach_vm_ { int fd = open(path, O_RDONLY); if (fd == -1) { - PyErr_Format(PyExc_RuntimeError, "Cannot open binary %s\n", path); + PyErr_Format(PyExc_OSError, + "Cannot open binary file '%s' for section '%s' search: %s", + path, secname, strerror(errno)); return 0; } struct stat fs; if (fstat(fd, &fs) == -1) { - PyErr_Format(PyExc_RuntimeError, "Cannot get size of binary %s\n", path); + PyErr_Format(PyExc_OSError, + "Cannot get file size for binary '%s' during section '%s' 
search: %s", + path, secname, strerror(errno)); close(fd); return 0; } void* map = mmap(0, fs.st_size, PROT_READ, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { - PyErr_Format(PyExc_RuntimeError, "Cannot map binary %s\n", path); + PyErr_Format(PyExc_OSError, + "Cannot memory map binary file '%s' (size: %lld bytes) for section '%s' search: %s", + path, (long long)fs.st_size, secname, strerror(errno)); close(fd); return 0; } @@ -335,13 +431,22 @@ search_section_in_file(const char* secname, char* path, uintptr_t base, mach_vm_ result = return_section_address_fat(secname, proc_ref, base, map); break; default: - PyErr_SetString(PyExc_RuntimeError, "Unknown Mach-O magic"); + PyErr_Format(PyExc_RuntimeError, + "Unrecognized Mach-O magic number 0x%x in binary file '%s' for section '%s' search", + magic, path, secname); break; } - munmap(map, fs.st_size); + if (munmap(map, fs.st_size) != 0) { + PyErr_Format(PyExc_OSError, + "Failed to unmap binary file '%s' (size: %lld bytes): %s", + path, (long long)fs.st_size, strerror(errno)); + result = 0; + } if (close(fd) != 0) { - PyErr_SetFromErrno(PyExc_OSError); + PyErr_Format(PyExc_OSError, + "Failed to close binary file '%s': %s", + path, strerror(errno)); result = 0; } return result; @@ -356,7 +461,10 @@ pid_to_task(pid_t pid) result = task_for_pid(mach_task_self(), pid, &task); if (result != KERN_SUCCESS) { - PyErr_Format(PyExc_PermissionError, "Cannot get task for PID %d", pid); + PyErr_Format(PyExc_PermissionError, + "Cannot get task port for PID %d (kern_return_t: %d). " + "This typically requires running as root or having the 'com.apple.system-task-ports' entitlement.", + pid, result); return 0; } return task; @@ -373,13 +481,15 @@ search_map_for_section(proc_handle_t *handle, const char* secname, const char* s mach_port_t proc_ref = pid_to_task(handle->pid); if (proc_ref == 0) { if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_PermissionError, "Cannot get task for PID"); + PyErr_Format(PyExc_PermissionError, + "Cannot get task port for PID %d during section search", + handle->pid); } return 0; } - int match_found = 0; char map_filename[MAXPATHLEN + 1]; + while (mach_vm_region( proc_ref, &address, @@ -389,6 +499,7 @@ search_map_for_section(proc_handle_t *handle, const char* secname, const char* s &count, &object_name) == KERN_SUCCESS) { + if ((region_info.protection & VM_PROT_READ) == 0 || (region_info.protection & VM_PROT_EXECUTE) == 0) { address += size; @@ -409,21 +520,21 @@ search_map_for_section(proc_handle_t *handle, const char* secname, const char* s filename = map_filename; // No path, use the whole string } - if (!match_found && strncmp(filename, substr, strlen(substr)) == 0) { - match_found = 1; - return search_section_in_file( + if (strncmp(filename, substr, strlen(substr)) == 0) { + uintptr_t result = search_section_in_file( secname, map_filename, address, size, proc_ref); + if (result != 0) { + return result; + } } address += size; } - PyErr_SetString(PyExc_RuntimeError, - "mach_vm_region failed to find the section"); return 0; } -#endif // (__APPLE__ && TARGET_OS_OSX) +#endif // (__APPLE__ && defined(TARGET_OS_OSX) && TARGET_OS_OSX) #if defined(__linux__) && HAVE_PROCESS_VM_READV static uintptr_t @@ -442,24 +553,38 @@ search_elf_file_for_section( int fd = open(elf_file, O_RDONLY); if (fd < 0) { - PyErr_SetFromErrno(PyExc_OSError); + PyErr_Format(PyExc_OSError, + "Cannot open ELF file '%s' for section '%s' search: %s", + elf_file, secname, strerror(errno)); goto exit; } struct stat file_stats; if (fstat(fd, &file_stats) != 0) { - 
PyErr_SetFromErrno(PyExc_OSError); + PyErr_Format(PyExc_OSError, + "Cannot get file size for ELF file '%s' during section '%s' search: %s", + elf_file, secname, strerror(errno)); goto exit; } file_memory = mmap(NULL, file_stats.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (file_memory == MAP_FAILED) { - PyErr_SetFromErrno(PyExc_OSError); + PyErr_Format(PyExc_OSError, + "Cannot memory map ELF file '%s' (size: %lld bytes) for section '%s' search: %s", + elf_file, (long long)file_stats.st_size, secname, strerror(errno)); goto exit; } Elf_Ehdr* elf_header = (Elf_Ehdr*)file_memory; + // Validate ELF header + if (elf_header->e_shstrndx >= elf_header->e_shnum) { + PyErr_Format(PyExc_RuntimeError, + "Invalid ELF file '%s': string table index %u >= section count %u", + elf_file, elf_header->e_shstrndx, elf_header->e_shnum); + goto exit; + } + Elf_Shdr* section_header_table = (Elf_Shdr*)(file_memory + elf_header->e_shoff); Elf_Shdr* shstrtab_section = &section_header_table[elf_header->e_shstrndx]; @@ -476,6 +601,10 @@ search_elf_file_for_section( } } + if (section == NULL) { + goto exit; + } + Elf_Phdr* program_header_table = (Elf_Phdr*)(file_memory + elf_header->e_phoff); // Find the first PT_LOAD segment Elf_Phdr* first_load_segment = NULL; @@ -486,18 +615,25 @@ search_elf_file_for_section( } } - if (section != NULL && first_load_segment != NULL) { - uintptr_t elf_load_addr = first_load_segment->p_vaddr - - (first_load_segment->p_vaddr % first_load_segment->p_align); - result = start_address + (uintptr_t)section->sh_addr - elf_load_addr; + if (first_load_segment == NULL) { + PyErr_Format(PyExc_RuntimeError, + "No PT_LOAD segment found in ELF file '%s' (%u program headers examined)", + elf_file, elf_header->e_phnum); + goto exit; } + uintptr_t elf_load_addr = first_load_segment->p_vaddr + - (first_load_segment->p_vaddr % first_load_segment->p_align); + result = start_address + (uintptr_t)section->sh_addr - elf_load_addr; + exit: if (file_memory != NULL) { munmap(file_memory, file_stats.st_size); } if (fd >= 0 && close(fd) != 0) { - PyErr_SetFromErrno(PyExc_OSError); + PyErr_Format(PyExc_OSError, + "Failed to close ELF file '%s': %s", + elf_file, strerror(errno)); result = 0; } return result; @@ -511,7 +647,9 @@ search_linux_map_for_section(proc_handle_t *handle, const char* secname, const c FILE* maps_file = fopen(maps_file_path, "r"); if (maps_file == NULL) { - PyErr_SetFromErrno(PyExc_OSError); + PyErr_Format(PyExc_OSError, + "Cannot open process memory map file '%s' for PID %d section search: %s", + maps_file_path, handle->pid, strerror(errno)); return 0; } @@ -520,11 +658,16 @@ search_linux_map_for_section(proc_handle_t *handle, const char* secname, const c char *line = PyMem_Malloc(linesz); if (!line) { fclose(maps_file); - PyErr_NoMemory(); + _set_debug_exception_cause(PyExc_MemoryError, + "Cannot allocate memory for reading process map file '%s'", + maps_file_path); return 0; } uintptr_t retval = 0; + int lines_processed = 0; + int matches_found = 0; + while (fgets(line + linelen, linesz - linelen, maps_file) != NULL) { linelen = strlen(line); if (line[linelen - 1] != '\n') { @@ -535,7 +678,9 @@ search_linux_map_for_section(proc_handle_t *handle, const char* secname, const c if (!biggerline) { PyMem_Free(line); fclose(maps_file); - PyErr_NoMemory(); + _set_debug_exception_cause(PyExc_MemoryError, + "Cannot reallocate memory while reading process map file '%s' (attempted size: %zu)", + maps_file_path, linesz); return 0; } line = biggerline; @@ -546,6 +691,7 @@
search_linux_map_for_section(proc_handle_t *handle, const char* secname, const c line[linelen - 1] = '\0'; // and prepare to read the next line into the start of the buffer. linelen = 0; + lines_processed++; unsigned long start = 0; unsigned long path_pos = 0; @@ -566,6 +712,7 @@ search_linux_map_for_section(proc_handle_t *handle, const char* secname, const c } if (strstr(filename, substr)) { + matches_found++; retval = search_elf_file_for_section(handle, secname, start, path); if (retval) { break; @@ -575,7 +722,9 @@ search_linux_map_for_section(proc_handle_t *handle, const char* secname, const c PyMem_Free(line); if (fclose(maps_file) != 0) { - PyErr_SetFromErrno(PyExc_OSError); + PyErr_Format(PyExc_OSError, + "Failed to close process map file '%s': %s", + maps_file_path, strerror(errno)); retval = 0; } @@ -591,11 +740,20 @@ static void* analyze_pe(const wchar_t* mod_path, BYTE* remote_base, const char* HANDLE hFile = CreateFileW(mod_path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if (hFile == INVALID_HANDLE_VALUE) { PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + PyErr_Format(PyExc_OSError, + "Cannot open PE file for section '%s' analysis (error %lu)", + secname, error); return NULL; } + HANDLE hMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, 0); if (!hMap) { PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + PyErr_Format(PyExc_OSError, + "Cannot create file mapping for PE file section '%s' analysis (error %lu)", + secname, error); CloseHandle(hFile); return NULL; } @@ -603,6 +761,10 @@ static void* analyze_pe(const wchar_t* mod_path, BYTE* remote_base, const char* BYTE* mapView = (BYTE*)MapViewOfFile(hMap, FILE_MAP_READ, 0, 0, 0); if (!mapView) { PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + PyErr_Format(PyExc_OSError, + "Cannot map view of PE file for section '%s' analysis (error %lu)", + secname, error); CloseHandle(hMap); CloseHandle(hFile); return NULL; @@ -610,7 +772,9 @@ static void* analyze_pe(const wchar_t* mod_path, BYTE* remote_base, const char* IMAGE_DOS_HEADER* pDOSHeader = (IMAGE_DOS_HEADER*)mapView; if (pDOSHeader->e_magic != IMAGE_DOS_SIGNATURE) { - PyErr_SetString(PyExc_RuntimeError, "Invalid DOS signature."); + PyErr_Format(PyExc_RuntimeError, + "Invalid DOS signature (0x%x) in PE file for section '%s' analysis (expected 0x%x)", + pDOSHeader->e_magic, secname, IMAGE_DOS_SIGNATURE); UnmapViewOfFile(mapView); CloseHandle(hMap); CloseHandle(hFile); @@ -619,7 +783,9 @@ static void* analyze_pe(const wchar_t* mod_path, BYTE* remote_base, const char* IMAGE_NT_HEADERS* pNTHeaders = (IMAGE_NT_HEADERS*)(mapView + pDOSHeader->e_lfanew); if (pNTHeaders->Signature != IMAGE_NT_SIGNATURE) { - PyErr_SetString(PyExc_RuntimeError, "Invalid NT signature."); + PyErr_Format(PyExc_RuntimeError, + "Invalid NT signature (0x%lx) in PE file for section '%s' analysis (expected 0x%lx)", + pNTHeaders->Signature, secname, IMAGE_NT_SIGNATURE); UnmapViewOfFile(mapView); CloseHandle(hMap); CloseHandle(hFile); @@ -653,17 +819,27 @@ search_windows_map_for_section(proc_handle_t* handle, const char* secname, const } while (hProcSnap == INVALID_HANDLE_VALUE && GetLastError() == ERROR_BAD_LENGTH); if (hProcSnap == INVALID_HANDLE_VALUE) { - PyErr_SetString(PyExc_PermissionError, "Unable to create module snapshot. 
Check permissions or PID."); + PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + PyErr_Format(PyExc_PermissionError, + "Unable to create module snapshot for PID %d section '%s' " + "search (error %lu). Check permissions or PID validity", + handle->pid, secname, error); return 0; } MODULEENTRY32W moduleEntry; moduleEntry.dwSize = sizeof(moduleEntry); void* runtime_addr = NULL; + int modules_examined = 0; + int matches_found = 0; for (BOOL hasModule = Module32FirstW(hProcSnap, &moduleEntry); hasModule; hasModule = Module32NextW(hProcSnap, &moduleEntry)) { + modules_examined++; + // Look for either python executable or DLL if (wcsstr(moduleEntry.szModule, substr)) { + matches_found++; runtime_addr = analyze_pe(moduleEntry.szExePath, moduleEntry.modBaseAddr, secname); if (runtime_addr != NULL) { break; @@ -672,6 +848,7 @@ search_windows_map_for_section(proc_handle_t* handle, const char* secname, const } CloseHandle(hProcSnap); + return (uintptr_t)runtime_addr; } @@ -689,7 +866,9 @@ _Py_RemoteDebug_GetPyRuntimeAddress(proc_handle_t* handle) if (address == 0) { // Error out: 'python' substring covers both executable and DLL PyObject *exc = PyErr_GetRaisedException(); - PyErr_SetString(PyExc_RuntimeError, "Failed to find the PyRuntime section in the process."); + PyErr_Format(PyExc_RuntimeError, + "Failed to find the PyRuntime section in process %d on Windows platform", + handle->pid); _PyErr_ChainExceptions1(exc); } #elif defined(__linux__) @@ -698,16 +877,28 @@ _Py_RemoteDebug_GetPyRuntimeAddress(proc_handle_t* handle) if (address == 0) { // Error out: 'python' substring covers both executable and DLL PyObject *exc = PyErr_GetRaisedException(); - PyErr_SetString(PyExc_RuntimeError, "Failed to find the PyRuntime section in the process."); + PyErr_Format(PyExc_RuntimeError, + "Failed to find the PyRuntime section in process %d on Linux platform", + handle->pid); _PyErr_ChainExceptions1(exc); } -#elif defined(__APPLE__) && TARGET_OS_OSX +#elif defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX // On macOS, try libpython first, then fall back to python - address = search_map_for_section(handle, "PyRuntime", "libpython"); - if (address == 0) { - // TODO: Differentiate between not found and error + const char* candidates[] = {"libpython", "python", "Python", NULL}; + for (const char** candidate = candidates; *candidate; candidate++) { PyErr_Clear(); - address = search_map_for_section(handle, "PyRuntime", "python"); + address = search_map_for_section(handle, "PyRuntime", *candidate); + if (address != 0) { + break; + } + } + if (address == 0) { + PyObject *exc = PyErr_GetRaisedException(); + PyErr_Format(PyExc_RuntimeError, + "Failed to find the PyRuntime section in process %d " + "on macOS platform (tried both libpython and python)", + handle->pid); + _PyErr_ChainExceptions1(exc); } #else Py_UNREACHABLE(); @@ -726,6 +917,11 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address do { if (!ReadProcessMemory(handle->hProcess, (LPCVOID)(remote_address + result), (char*)dst + result, len - result, &read_bytes)) { PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + _set_debug_exception_cause(PyExc_OSError, + "ReadProcessMemory failed for PID %d at address 0x%lx " + "(size %zu, partial read %zu bytes): Windows error %lu", + handle->pid, remote_address + result, len - result, result, error); return -1; } result += read_bytes; @@ -746,16 +942,20 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address read_bytes = 
process_vm_readv(handle->pid, local, 1, remote, 1, 0); if (read_bytes < 0) { PyErr_SetFromErrno(PyExc_OSError); + _set_debug_exception_cause(PyExc_OSError, + "process_vm_readv failed for PID %d at address 0x%lx " + "(size %zu, partial read %zd bytes): %s", + handle->pid, remote_address + result, len - result, result, strerror(errno)); return -1; } result += read_bytes; } while ((size_t)read_bytes != local[0].iov_len); return 0; -#elif defined(__APPLE__) && TARGET_OS_OSX +#elif defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX Py_ssize_t result = -1; kern_return_t kr = mach_vm_read_overwrite( - pid_to_task(handle->pid), + handle->task, (mach_vm_address_t)remote_address, len, (mach_vm_address_t)dst, @@ -764,13 +964,22 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address if (kr != KERN_SUCCESS) { switch (kr) { case KERN_PROTECTION_FAILURE: - PyErr_SetString(PyExc_PermissionError, "Not enough permissions to read memory"); + PyErr_Format(PyExc_PermissionError, + "Memory protection failure reading from PID %d at address " + "0x%lx (size %zu): insufficient permissions", + handle->pid, remote_address, len); break; case KERN_INVALID_ARGUMENT: - PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_read_overwrite"); + PyErr_Format(PyExc_ValueError, + "Invalid argument to mach_vm_read_overwrite for PID %d at " + "address 0x%lx (size %zu)", + handle->pid, remote_address, len); break; default: - PyErr_SetString(PyExc_RuntimeError, "Unknown error reading memory"); + PyErr_Format(PyExc_RuntimeError, + "mach_vm_read_overwrite failed for PID %d at address 0x%lx " + "(size %zu): kern_return_t %d", + handle->pid, remote_address, len, kr); } return -1; } @@ -780,6 +989,62 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #endif } +int +_Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, + uintptr_t addr, + size_t size, + void *out) +{ + size_t page_size = handle->page_size; + uintptr_t page_base = addr & ~(page_size - 1); + size_t offset_in_page = addr - page_base; + + if (offset_in_page + size > page_size) { + return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); + } + + // Search for valid cached page + for (int i = 0; i < MAX_PAGES; i++) { + page_cache_entry_t *entry = &handle->pages[i]; + if (entry->valid && entry->page_addr == page_base) { + memcpy(out, entry->data + offset_in_page, size); + return 0; + } + } + + // Find reusable slot + for (int i = 0; i < MAX_PAGES; i++) { + page_cache_entry_t *entry = &handle->pages[i]; + if (!entry->valid) { + if (entry->data == NULL) { + entry->data = PyMem_RawMalloc(page_size); + if (entry->data == NULL) { + _set_debug_exception_cause(PyExc_MemoryError, + "Cannot allocate %zu bytes for page cache entry " + "during read from PID %d at address 0x%lx", + page_size, handle->pid, addr); + return -1; + } + } + + if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { + // Try to just copy the exact amount as a fallback + PyErr_Clear(); + goto fallback; + } + + entry->page_addr = page_base; + entry->valid = 1; + memcpy(out, entry->data + offset_in_page, size); + return 0; + } + } + +fallback: + // Cache full: fall back to an uncached read + return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); +} + static int _Py_RemoteDebug_ReadDebugOffsets( proc_handle_t *handle, @@ -789,13 +1054,16 @@ _Py_RemoteDebug_ReadDebugOffsets( *runtime_start_address = _Py_RemoteDebug_GetPyRuntimeAddress(handle); if (!*runtime_start_address) { if
(!PyErr_Occurred()) { - PyErr_SetString( - PyExc_RuntimeError, "Failed to get PyRuntime address"); + PyErr_Format(PyExc_RuntimeError, + "Failed to locate PyRuntime address for PID %d", + handle->pid); } + _set_debug_exception_cause(PyExc_RuntimeError, "PyRuntime address lookup failed during debug offsets initialization"); return -1; } size_t size = sizeof(struct _Py_DebugOffsets); if (0 != _Py_RemoteDebug_ReadRemoteMemory(handle, *runtime_start_address, size, debug_offsets)) { + _set_debug_exception_cause(PyExc_RuntimeError, "Failed to read debug offsets structure from remote process"); return -1; } return 0; diff --git a/Python/specialize.c b/Python/specialize.c index 06995d46d8b..92f79d39d55 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -2904,53 +2904,57 @@ int #endif // Py_STATS Py_NO_INLINE void -_Py_Specialize_ForIter(_PyStackRef iter, _Py_CODEUNIT *instr, int oparg) +_Py_Specialize_ForIter(_PyStackRef iter, _PyStackRef null_or_index, _Py_CODEUNIT *instr, int oparg) { assert(ENABLE_SPECIALIZATION_FT); assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER); PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); PyTypeObject *tp = Py_TYPE(iter_o); + + if (PyStackRef_IsNull(null_or_index)) { #ifdef Py_GIL_DISABLED - // Only specialize for uniquely referenced iterators, so that we know - // they're only referenced by this one thread. This is more limiting - // than we need (even `it = iter(mylist); for item in it:` won't get - // specialized) but we don't have a way to check whether we're the only - // _thread_ who has access to the object. - if (!_PyObject_IsUniquelyReferenced(iter_o)) - goto failure; -#endif - if (tp == &PyListIter_Type) { -#ifdef Py_GIL_DISABLED - _PyListIterObject *it = (_PyListIterObject *)iter_o; - if (!_Py_IsOwnedByCurrentThread((PyObject *)it->it_seq) && - !_PyObject_GC_IS_SHARED(it->it_seq)) { - // Maybe this should just set GC_IS_SHARED in a critical - // section, instead of leaving it to the first iteration? + // Only specialize for uniquely referenced iterators, so that we know + // they're only referenced by this one thread. This is more limiting + // than we need (even `it = iter(mylist); for item in it:` won't get + // specialized) but we don't have a way to check whether we're the only + // _thread_ who has access to the object. + if (!_PyObject_IsUniquelyReferenced(iter_o)) { goto failure; } #endif - specialize(instr, FOR_ITER_LIST); - return; - } - else if (tp == &PyTupleIter_Type) { - specialize(instr, FOR_ITER_TUPLE); - return; - } - else if (tp == &PyRangeIter_Type) { - specialize(instr, FOR_ITER_RANGE); - return; + if (tp == &PyRangeIter_Type) { + specialize(instr, FOR_ITER_RANGE); + return; + } + else if (tp == &PyGen_Type && oparg <= SHRT_MAX) { + // Generators are very much not thread-safe, so don't worry about + // the specialization not being thread-safe. + assert(instr[oparg + INLINE_CACHE_ENTRIES_FOR_ITER + 1].op.code == END_FOR || + instr[oparg + INLINE_CACHE_ENTRIES_FOR_ITER + 1].op.code == INSTRUMENTED_END_FOR + ); + /* Don't specialize if PEP 523 is active */ + if (_PyInterpreterState_GET()->eval_frame) { + goto failure; + } + specialize(instr, FOR_ITER_GEN); + return; + } } - else if (tp == &PyGen_Type && oparg <= SHRT_MAX) { - // Generators are very much not thread-safe, so don't worry about - // the specialization not being thread-safe. 
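The null_or_index parameter reflects the new FOR_ITER model: NULL means a real iterator object is on the stack, while lists and tuples instead keep a tagged integer index there and index the sequence directly. A toy model of such a tagged index (assuming a low-bit tag, which is what the +2 increment in the stackrefs.c hunk below implies):

#include <assert.h>
#include <stdint.h>

/* Toy tagged value: low bit 1 marks a small integer; the payload sits
 * in the upper bits. */
typedef struct { uintptr_t bits; } tagged;

static tagged tag_int(uintptr_t v)   { return (tagged){ (v << 1) | 1 }; }
static uintptr_t untag_int(tagged t) { return t.bits >> 1; }

/* Adding 2 to the representation bumps the payload by 1 and leaves the
 * tag bit untouched, so the loop never needs to retag. */
static tagged increment(tagged t)    { return (tagged){ t.bits + 2 }; }

int main(void)
{
    tagged i = tag_int(0);
    i = increment(i);
    assert(untag_int(i) == 1);
    return 0;
}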
- assert(instr[oparg + INLINE_CACHE_ENTRIES_FOR_ITER + 1].op.code == END_FOR || - instr[oparg + INLINE_CACHE_ENTRIES_FOR_ITER + 1].op.code == INSTRUMENTED_END_FOR - ); - /* Don't specialize if PEP 523 is active */ - if (_PyInterpreterState_GET()->eval_frame) - goto failure; - specialize(instr, FOR_ITER_GEN); - return; + else { + if (tp == &PyList_Type) { +#ifdef Py_GIL_DISABLED + // Only specialize for lists owned by this thread or shared + if (!_Py_IsOwnedByCurrentThread(iter_o) && !_PyObject_GC_IS_SHARED(iter_o)) { + goto failure; + } +#endif + specialize(instr, FOR_ITER_LIST); + return; + } + else if (tp == &PyTuple_Type) { + specialize(instr, FOR_ITER_TUPLE); + return; + } } failure: SPECIALIZATION_FAIL(FOR_ITER, diff --git a/Python/stackrefs.c b/Python/stackrefs.c index 69d4e8b9431..ecc0012ef17 100644 --- a/Python/stackrefs.c +++ b/Python/stackrefs.c @@ -40,6 +40,7 @@ make_table_entry(PyObject *obj, const char *filename, int linenumber) PyObject * _Py_stackref_get_object(_PyStackRef ref) { + assert(!PyStackRef_IsError(ref)); if (ref.index == 0) { return NULL; } @@ -64,6 +65,7 @@ PyStackRef_Is(_PyStackRef a, _PyStackRef b) PyObject * _Py_stackref_close(_PyStackRef ref, const char *filename, int linenumber) { + assert(!PyStackRef_IsError(ref)); PyInterpreterState *interp = PyInterpreterState_Get(); if (ref.index >= interp->next_stackref) { _Py_FatalErrorFormat(__func__, "Invalid StackRef with ID %" PRIu64 " at %s:%d\n", (void *)ref.index, filename, linenumber); @@ -128,6 +130,7 @@ _Py_stackref_create(PyObject *obj, const char *filename, int linenumber) void _Py_stackref_record_borrow(_PyStackRef ref, const char *filename, int linenumber) { + assert(!PyStackRef_IsError(ref)); if (ref.index < INITIAL_STACKREF_INDEX) { return; } @@ -152,6 +155,7 @@ _Py_stackref_record_borrow(_PyStackRef ref, const char *filename, int linenumber void _Py_stackref_associate(PyInterpreterState *interp, PyObject *obj, _PyStackRef ref) { + assert(!PyStackRef_IsError(ref)); assert(ref.index < INITIAL_STACKREF_INDEX); TableEntry *entry = make_table_entry(obj, "builtin-object", 0); if (entry == NULL) { @@ -216,4 +220,12 @@ PyStackRef_IsNullOrInt(_PyStackRef ref) return PyStackRef_IsNull(ref) || PyStackRef_IsTaggedInt(ref); } +_PyStackRef +PyStackRef_IncrementTaggedIntNoOverflow(_PyStackRef ref) +{ + assert(ref.index <= INT_MAX - 2); // No overflow + return (_PyStackRef){ .index = ref.index + 2 }; +} + + #endif diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 41b9a6b276a..e5ae841d195 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -76,12 +76,12 @@ module sys PyObject * -_PySys_GetRequiredAttr(PyObject *name) +PySys_GetAttr(PyObject *name) { if (!PyUnicode_Check(name)) { PyErr_Format(PyExc_TypeError, - "attribute name must be string, not '%.200s'", - Py_TYPE(name)->tp_name); + "attribute name must be string, not '%T'", + name); return NULL; } PyThreadState *tstate = _PyThreadState_GET(); @@ -98,7 +98,7 @@ _PySys_GetRequiredAttr(PyObject *name) } PyObject * -_PySys_GetRequiredAttrString(const char *name) +PySys_GetAttrString(const char *name) { PyThreadState *tstate = _PyThreadState_GET(); PyObject *sysdict = tstate->interp->sysdict; @@ -114,12 +114,12 @@ _PySys_GetRequiredAttrString(const char *name) } int -_PySys_GetOptionalAttr(PyObject *name, PyObject **value) +PySys_GetOptionalAttr(PyObject *name, PyObject **value) { if (!PyUnicode_Check(name)) { PyErr_Format(PyExc_TypeError, - "attribute name must be string, not '%.200s'", - Py_TYPE(name)->tp_name); + "attribute name must be string, not 
'%T'", + name); *value = NULL; return -1; } @@ -133,7 +133,7 @@ _PySys_GetOptionalAttr(PyObject *name, PyObject **value) } int -_PySys_GetOptionalAttrString(const char *name, PyObject **value) +PySys_GetOptionalAttrString(const char *name, PyObject **value) { PyThreadState *tstate = _PyThreadState_GET(); PyObject *sysdict = tstate->interp->sysdict; @@ -773,7 +773,7 @@ sys_displayhook(PyObject *module, PyObject *o) } if (PyObject_SetAttr(builtins, _Py_LATIN1_CHR('_'), Py_None) != 0) return NULL; - outf = _PySys_GetRequiredAttr(&_Py_ID(stdout)); + outf = PySys_GetAttr(&_Py_ID(stdout)); if (outf == NULL) { return NULL; } @@ -1643,6 +1643,7 @@ static PyObject * _sys_getwindowsversion_from_kernel32(void) { #ifndef MS_WINDOWS_DESKTOP + PyErr_SetString(PyExc_OSError, "cannot read version info on this platform"); return NULL; #else HANDLE hKernel32; @@ -2451,26 +2452,58 @@ sys_is_remote_debug_enabled_impl(PyObject *module) #endif } +/*[clinic input] +sys.remote_exec + + pid: int + script: object + +Executes a file containing Python code in a given remote Python process. + +This function returns immediately, and the code will be executed by the +target process's main thread at the next available opportunity, similarly +to how signals are handled. There is no interface to determine when the +code has been executed. The caller is responsible for making sure that +the file still exists whenever the remote process tries to read it and that +it hasn't been overwritten. + +The remote process must be running a CPython interpreter of the same major +and minor version as the local process. If either the local or remote +interpreter is pre-release (alpha, beta, or release candidate) then the +local and remote interpreters must be the same exact version. + +Args: + pid (int): The process ID of the target Python process. + script (str|bytes): The path to a file containing + the Python code to be executed. 
+[clinic start generated code]*/ + static PyObject * -sys_remote_exec_unicode_path(PyObject *module, int pid, PyObject *script) +sys_remote_exec_impl(PyObject *module, int pid, PyObject *script) +/*[clinic end generated code: output=7d94c56afe4a52c0 input=39908ca2c5fe1eb0]*/ { - const char *debugger_script_path = PyUnicode_AsUTF8(script); - if (debugger_script_path == NULL) { + PyObject *path; + const char *debugger_script_path; + + if (PyUnicode_FSConverter(script, &path) == 0) { return NULL; } - + debugger_script_path = PyBytes_AS_STRING(path); #ifdef MS_WINDOWS + PyObject *unicode_path; + if (PyUnicode_FSDecoder(path, &unicode_path) < 0) { + goto error; + } // Use UTF-16 (wide char) version of the path for permission checks - wchar_t *debugger_script_path_w = PyUnicode_AsWideCharString(script, NULL); + wchar_t *debugger_script_path_w = PyUnicode_AsWideCharString(unicode_path, NULL); + Py_DECREF(unicode_path); if (debugger_script_path_w == NULL) { - return NULL; + goto error; } - - // Check file attributes using wide character version (W) instead of ANSI (A) DWORD attr = GetFileAttributesW(debugger_script_path_w); - PyMem_Free(debugger_script_path_w); if (attr == INVALID_FILE_ATTRIBUTES) { DWORD err = GetLastError(); + PyMem_Free(debugger_script_path_w); if (err == ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND) { PyErr_SetString(PyExc_FileNotFoundError, "Script file does not exist"); } @@ -2478,11 +2511,12 @@ sys_remote_exec_unicode_path(PyObject *module, int pid, PyObject *script) PyErr_SetString(PyExc_PermissionError, "Script file cannot be read"); } else { - PyErr_SetFromWindowsErr(0); + PyErr_SetFromWindowsErr(err); } - return NULL; + goto error; } -#else + PyMem_Free(debugger_script_path_w); +#else // MS_WINDOWS if (access(debugger_script_path, F_OK | R_OK) != 0) { switch (errno) { case ENOENT: @@ -2494,54 +2528,19 @@ sys_remote_exec_unicode_path(PyObject *module, int pid, PyObject *script) default: PyErr_SetFromErrno(PyExc_OSError); } - return NULL; + goto error; } -#endif - +#endif // MS_WINDOWS if (_PySysRemoteDebug_SendExec(pid, 0, debugger_script_path) < 0) { - return NULL; + goto error; } + Py_DECREF(path); Py_RETURN_NONE; -} - -/*[clinic input] -sys.remote_exec - - pid: int - script: object - -Executes a file containing Python code in a given remote Python process. - -This function returns immediately, and the code will be executed by the -target process's main thread at the next available opportunity, similarly -to how signals are handled. There is no interface to determine when the -code has been executed. The caller is responsible for making sure that -the file still exists whenever the remote process tries to read it and that -it hasn't been overwritten. -The remote process must be running a CPython interpreter of the same major -and minor version as the local process. If either the local or remote -interpreter is pre-release (alpha, beta, or release candidate) then the -local and remote interpreters must be the same exact version. - -Args: - pid (int): The process ID of the target Python process. - script (str|bytes): The path to a file containing - the Python code to be executed. 
-[clinic start generated code]*/
-
-static PyObject *
-sys_remote_exec_impl(PyObject *module, int pid, PyObject *script)
-/*[clinic end generated code: output=7d94c56afe4a52c0 input=39908ca2c5fe1eb0]*/
-{
-    PyObject *ret = NULL;
-    PyObject *path;
-    if (PyUnicode_FSDecoder(script, &path)) {
-        ret = sys_remote_exec_unicode_path(module, pid, path);
-        Py_DECREF(path);
-    }
-    return ret;
+error:
+    Py_DECREF(path);
+    return NULL;
 }
@@ -3006,7 +3005,7 @@ static PyObject *
 get_warnoptions(PyThreadState *tstate)
 {
     PyObject *warnoptions;
-    if (_PySys_GetOptionalAttr(&_Py_ID(warnoptions), &warnoptions) < 0) {
+    if (PySys_GetOptionalAttr(&_Py_ID(warnoptions), &warnoptions) < 0) {
         return NULL;
     }
     if (warnoptions == NULL || !PyList_Check(warnoptions)) {
@@ -3043,7 +3042,7 @@ PySys_ResetWarnOptions(void)
     }
 
     PyObject *warnoptions;
-    if (_PySys_GetOptionalAttr(&_Py_ID(warnoptions), &warnoptions) < 0) {
+    if (PySys_GetOptionalAttr(&_Py_ID(warnoptions), &warnoptions) < 0) {
         PyErr_Clear();
         return;
     }
@@ -3107,7 +3106,7 @@ PyAPI_FUNC(int)
 PySys_HasWarnOptions(void)
 {
     PyObject *warnoptions;
-    if (_PySys_GetOptionalAttr(&_Py_ID(warnoptions), &warnoptions) < 0) {
+    if (PySys_GetOptionalAttr(&_Py_ID(warnoptions), &warnoptions) < 0) {
         PyErr_Clear();
         return 0;
     }
@@ -3121,7 +3120,7 @@ static PyObject *
 get_xoptions(PyThreadState *tstate)
 {
     PyObject *xoptions;
-    if (_PySys_GetOptionalAttr(&_Py_ID(_xoptions), &xoptions) < 0) {
+    if (PySys_GetOptionalAttr(&_Py_ID(_xoptions), &xoptions) < 0) {
         return NULL;
     }
     if (xoptions == NULL || !PyDict_Check(xoptions)) {
@@ -3374,7 +3373,7 @@ sys_set_flag(PyObject *flags, Py_ssize_t pos, PyObject *value)
 int
 _PySys_SetFlagObj(Py_ssize_t pos, PyObject *value)
 {
-    PyObject *flags = _PySys_GetRequiredAttrString("flags");
+    PyObject *flags = PySys_GetAttrString("flags");
     if (flags == NULL) {
         return -1;
     }
@@ -3936,7 +3935,7 @@ _PySys_UpdateConfig(PyThreadState *tstate)
 #undef COPY_WSTR
 
     // sys.flags
-    PyObject *flags = _PySys_GetRequiredAttrString("flags");
+    PyObject *flags = PySys_GetAttrString("flags");
     if (flags == NULL) {
         return -1;
     }
@@ -4252,7 +4251,7 @@ PySys_SetArgvEx(int argc, wchar_t **argv, int updatepath)
     }
 
     PyObject *sys_path;
-    if (_PySys_GetOptionalAttr(&_Py_ID(path), &sys_path) < 0) {
+    if (PySys_GetOptionalAttr(&_Py_ID(path), &sys_path) < 0) {
        Py_FatalError("can't get sys.path");
     }
     else if (sys_path != NULL) {
@@ -4348,7 +4347,7 @@ sys_write(PyObject *key, FILE *fp, const char *format, va_list va)
 
     PyObject *exc = _PyErr_GetRaisedException(tstate);
     written = PyOS_vsnprintf(buffer, sizeof(buffer), format, va);
-    file = _PySys_GetRequiredAttr(key);
+    file = PySys_GetAttr(key);
     if (sys_pyfile_write(buffer, file) != 0) {
         _PyErr_Clear(tstate);
         fputs(buffer, fp);
@@ -4392,7 +4391,7 @@ sys_format(PyObject *key, FILE *fp, const char *format, va_list va)
     PyObject *exc = _PyErr_GetRaisedException(tstate);
     message = PyUnicode_FromFormatV(format, va);
     if (message != NULL) {
-        file = _PySys_GetRequiredAttr(key);
+        file = PySys_GetAttr(key);
         if (sys_pyfile_write_unicode(message, file) != 0) {
             _PyErr_Clear(tstate);
             utf8 = PyUnicode_AsUTF8(message);
diff --git a/Python/thread.c b/Python/thread.c
index 4ff5f11a348..18c4af7f634 100644
--- a/Python/thread.c
+++ b/Python/thread.c
@@ -39,7 +39,8 @@
 
 const long long PY_TIMEOUT_MAX = PY_TIMEOUT_MAX_VALUE;
 
-static void PyThread__init_thread(void); /* Forward */
+/* Forward declaration */
+static void PyThread__init_thread(void);
 
 #define initialized _PyRuntime.threads.initialized
 
@@ -71,6 +72,79 @@ PyThread_init_thread(void)
 #endif
 
 
+/*
+ * Lock support.
+ */
+
+PyThread_type_lock
+PyThread_allocate_lock(void)
+{
+    if (!initialized) {
+        PyThread_init_thread();
+    }
+
+    PyMutex *lock = (PyMutex *)PyMem_RawMalloc(sizeof(PyMutex));
+    if (lock) {
+        *lock = (PyMutex){0};
+    }
+
+    return (PyThread_type_lock)lock;
+}
+
+void
+PyThread_free_lock(PyThread_type_lock lock)
+{
+    PyMem_RawFree(lock);
+}
+
+PyLockStatus
+PyThread_acquire_lock_timed(PyThread_type_lock lock, PY_TIMEOUT_T microseconds,
+                            int intr_flag)
+{
+    PyTime_t timeout; // relative timeout
+    if (microseconds >= 0) {
+        // bpo-41710: PyThread_acquire_lock_timed() cannot report timeout
+        // overflow to the caller, so clamp the timeout to
+        // [PyTime_MIN, PyTime_MAX].
+        //
+        // PyTime_MAX nanoseconds is around 292.3 years.
+        //
+        // _thread.Lock.acquire() and _thread.RLock.acquire() raise an
+        // OverflowError if microseconds is greater than PY_TIMEOUT_MAX.
+        timeout = _PyTime_FromMicrosecondsClamp(microseconds);
+    }
+    else {
+        timeout = -1;
+    }
+
+    _PyLockFlags flags = _Py_LOCK_DONT_DETACH;
+    if (intr_flag) {
+        flags |= _PY_FAIL_IF_INTERRUPTED;
+    }
+
+    return _PyMutex_LockTimed((PyMutex *)lock, timeout, flags);
+}
+
+void
+PyThread_release_lock(PyThread_type_lock lock)
+{
+    PyMutex_Unlock((PyMutex *)lock);
+}
+
+int
+_PyThread_at_fork_reinit(PyThread_type_lock *lock)
+{
+    _PyMutex_at_fork_reinit((PyMutex *)lock);
+    return 0;
+}
+
+int
+PyThread_acquire_lock(PyThread_type_lock lock, int waitflag)
+{
+    return PyThread_acquire_lock_timed(lock, waitflag ? -1 : 0, /*intr_flag=*/0);
+}
+
+
 /* return the current thread stack size */
 size_t
 PyThread_get_stacksize(void)
@@ -261,11 +335,7 @@ PyThread_GetInfo(void)
 #ifdef HAVE_PTHREAD_STUBS
     value = Py_NewRef(Py_None);
 #elif defined(_POSIX_THREADS)
-#ifdef USE_SEMAPHORES
-    value = PyUnicode_FromString("semaphore");
-#else
-    value = PyUnicode_FromString("mutex+cond");
-#endif
+    value = PyUnicode_FromString("pymutex");
     if (value == NULL) {
         Py_DECREF(threadinfo);
         return NULL;
diff --git a/Python/thread_nt.h b/Python/thread_nt.h
index e078b98be3c..9a29d14ef67 100644
--- a/Python/thread_nt.h
+++ b/Python/thread_nt.h
@@ -300,98 +300,6 @@ PyThread_hang_thread(void)
     }
 }
 
-/*
- * Lock support. It has to be implemented as semaphores.
- * I [Dag] tried to implement it with mutex but I could find a way to
- * tell whether a thread already own the lock or not.
- */
-PyThread_type_lock
-PyThread_allocate_lock(void)
-{
-    PNRMUTEX mutex;
-
-    if (!initialized)
-        PyThread_init_thread();
-
-    mutex = AllocNonRecursiveMutex() ;
-
-    PyThread_type_lock aLock = (PyThread_type_lock) mutex;
-    assert(aLock);
-
-    return aLock;
-}
-
-void
-PyThread_free_lock(PyThread_type_lock aLock)
-{
-    FreeNonRecursiveMutex(aLock) ;
-}
-
-// WaitForSingleObject() accepts timeout in milliseconds in the range
-// [0; 0xFFFFFFFE] (DWORD type). INFINITE value (0xFFFFFFFF) means no
-// timeout. 0xFFFFFFFE milliseconds is around 49.7 days.
-const DWORD TIMEOUT_MS_MAX = 0xFFFFFFFE;
-
-/*
- * Return 1 on success if the lock was acquired
- *
- * and 0 if the lock was not acquired. This means a 0 is returned
- * if the lock has already been acquired by this thread!
- */
-PyLockStatus
-PyThread_acquire_lock_timed(PyThread_type_lock aLock,
-                            PY_TIMEOUT_T microseconds, int intr_flag)
-{
-    assert(aLock);
-
-    /* Fow now, intr_flag does nothing on Windows, and lock acquires are
-     * uninterruptible.
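For orientation, a sketch of the public locking API that the PyMutex-backed functions above now implement; the 50 ms timeout is an arbitrary example value.

#include <Python.h>  // declares the PyThread_* API

static void
lock_demo(void)
{
    PyThread_type_lock lock = PyThread_allocate_lock();
    if (lock == NULL) {
        return;  // PyMem_RawMalloc() failed
    }
    // Wait up to 50 ms; overly large timeouts are silently clamped.
    PyLockStatus st = PyThread_acquire_lock_timed(lock, 50 * 1000,
                                                  /*intr_flag=*/0);
    if (st == PY_LOCK_ACQUIRED) {
        PyThread_release_lock(lock);
    }
    PyThread_free_lock(lock);
}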
-     */
-    PyLockStatus success;
-    PY_TIMEOUT_T milliseconds;
-
-    if (microseconds >= 0) {
-        milliseconds = microseconds / 1000;
-        // Round milliseconds away from zero
-        if (microseconds % 1000 > 0) {
-            milliseconds++;
-        }
-        if (milliseconds > (PY_TIMEOUT_T)TIMEOUT_MS_MAX) {
-            // bpo-41710: PyThread_acquire_lock_timed() cannot report timeout
-            // overflow to the caller, so clamp the timeout to
-            // [0, TIMEOUT_MS_MAX] milliseconds.
-            //
-            // _thread.Lock.acquire() and _thread.RLock.acquire() raise an
-            // OverflowError if microseconds is greater than PY_TIMEOUT_MAX.
-            milliseconds = TIMEOUT_MS_MAX;
-        }
-        assert(milliseconds != INFINITE);
-    }
-    else {
-        milliseconds = INFINITE;
-    }
-
-    if (EnterNonRecursiveMutex((PNRMUTEX)aLock,
-                               (DWORD)milliseconds) == WAIT_OBJECT_0) {
-        success = PY_LOCK_ACQUIRED;
-    }
-    else {
-        success = PY_LOCK_FAILURE;
-    }
-
-    return success;
-}
-int
-PyThread_acquire_lock(PyThread_type_lock aLock, int waitflag)
-{
-    return PyThread_acquire_lock_timed(aLock, waitflag ? -1 : 0, 0);
-}
-
-void
-PyThread_release_lock(PyThread_type_lock aLock)
-{
-    assert(aLock);
-    (void)LeaveNonRecursiveMutex((PNRMUTEX) aLock);
-}
 
 /* minimum/maximum thread stack sizes supported */
 #define THREAD_MIN_STACKSIZE 0x8000 /* 32 KiB */
diff --git a/Python/thread_pthread.h b/Python/thread_pthread.h
index da405824244..13992f95723 100644
--- a/Python/thread_pthread.h
+++ b/Python/thread_pthread.h
@@ -99,16 +99,6 @@
 #undef HAVE_SEM_CLOCKWAIT
 #endif
 
-/* Whether or not to use semaphores directly rather than emulating them with
- * mutexes and condition variables:
- */
-#if (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \
-     (defined(HAVE_SEM_TIMEDWAIT) || defined(HAVE_SEM_CLOCKWAIT)))
-# define USE_SEMAPHORES
-#else
-# undef USE_SEMAPHORES
-#endif
-
 
 /* On platforms that don't use standard POSIX threads pthread_sigmask()
  * isn't present. DEC threads uses sigprocmask() instead as do most
@@ -442,388 +432,6 @@ PyThread_hang_thread(void)
     }
 }
 
-#ifdef USE_SEMAPHORES
-
-/*
- * Lock support.
- */
-
-PyThread_type_lock
-PyThread_allocate_lock(void)
-{
-    sem_t *lock;
-    int status, error = 0;
-
-    if (!initialized)
-        PyThread_init_thread();
-
-    lock = (sem_t *)PyMem_RawMalloc(sizeof(sem_t));
-
-    if (lock) {
-        status = sem_init(lock,0,1);
-        CHECK_STATUS("sem_init");
-
-        if (error) {
-            PyMem_RawFree((void *)lock);
-            lock = NULL;
-        }
-    }
-
-    return (PyThread_type_lock)lock;
-}
-
-void
-PyThread_free_lock(PyThread_type_lock lock)
-{
-    sem_t *thelock = (sem_t *)lock;
-    int status, error = 0;
-
-    (void) error; /* silence unused-but-set-variable warning */
-
-    if (!thelock)
-        return;
-
-    status = sem_destroy(thelock);
-    CHECK_STATUS("sem_destroy");
-
-    PyMem_RawFree((void *)thelock);
-}
-
-/*
- * As of February 2002, Cygwin thread implementations mistakenly report error
- * codes in the return value of the sem_ calls (like the pthread_ functions).
- * Correct implementations return -1 and put the code in errno. This supports
- * either.
- */
-static int
-fix_status(int status)
-{
-    return (status == -1) ? errno : status;
-}
-
-PyLockStatus
-PyThread_acquire_lock_timed(PyThread_type_lock lock, PY_TIMEOUT_T microseconds,
-                            int intr_flag)
-{
-    PyLockStatus success;
-    sem_t *thelock = (sem_t *)lock;
-    int status, error = 0;
-
-    (void) error; /* silence unused-but-set-variable warning */
-
-    PyTime_t timeout; // relative timeout
-    if (microseconds >= 0) {
-        // bpo-41710: PyThread_acquire_lock_timed() cannot report timeout
-        // overflow to the caller, so clamp the timeout to
-        // [PyTime_MIN, PyTime_MAX].
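The Windows code removed above converts a microsecond timeout into the millisecond DWORD that EnterNonRecursiveMutex's WaitForSingleObject-style wait expects. A standalone sketch of that conversion, with illustrative names rather than CPython API:

// Round away from zero so a 1-microsecond wait never degrades into a busy
// poll, then clamp silently: the caller has no way to learn about overflow.
// 0xFFFFFFFE ms is roughly 49.7 days. Assumes us >= 0, as in the original.
typedef long long timeout_us_t;  // stands in for PY_TIMEOUT_T
#define EXAMPLE_MS_MAX 0xFFFFFFFEULL

static unsigned long long
us_to_ms_clamped(timeout_us_t us)
{
    unsigned long long ms = (unsigned long long)(us / 1000);
    if (us % 1000 > 0) {
        ms++;  // e.g. 1500 us -> 2 ms, 1 us -> 1 ms
    }
    if (ms > EXAMPLE_MS_MAX) {
        ms = EXAMPLE_MS_MAX;
    }
    return ms;
}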
-        //
-        // PyTime_MAX nanoseconds is around 292.3 years.
-        //
-        // _thread.Lock.acquire() and _thread.RLock.acquire() raise an
-        // OverflowError if microseconds is greater than PY_TIMEOUT_MAX.
-        timeout = _PyTime_FromMicrosecondsClamp(microseconds);
-    }
-    else {
-        timeout = -1;
-    }
-
-#ifdef HAVE_SEM_CLOCKWAIT
-    struct timespec abs_timeout;
-    // Local scope for deadline
-    {
-        PyTime_t now;
-        // silently ignore error: cannot report error to the caller
-        (void)PyTime_MonotonicRaw(&now);
-        PyTime_t deadline = _PyTime_Add(now, timeout);
-        _PyTime_AsTimespec_clamp(deadline, &abs_timeout);
-    }
-#else
-    PyTime_t deadline = 0;
-    if (timeout > 0 && !intr_flag) {
-        deadline = _PyDeadline_Init(timeout);
-    }
-#endif
-
-    while (1) {
-        if (timeout > 0) {
-#ifdef HAVE_SEM_CLOCKWAIT
-            status = fix_status(sem_clockwait(thelock, CLOCK_MONOTONIC,
-                                              &abs_timeout));
-#else
-            PyTime_t now;
-            // silently ignore error: cannot report error to the caller
-            (void)PyTime_TimeRaw(&now);
-            PyTime_t abs_time = _PyTime_Add(now, timeout);
-
-            struct timespec ts;
-            _PyTime_AsTimespec_clamp(abs_time, &ts);
-            status = fix_status(sem_timedwait(thelock, &ts));
-#endif
-        }
-        else if (timeout == 0) {
-            status = fix_status(sem_trywait(thelock));
-        }
-        else {
-            status = fix_status(sem_wait(thelock));
-        }
-
-        /* Retry if interrupted by a signal, unless the caller wants to be
-           notified. */
-        if (intr_flag || status != EINTR) {
-            break;
-        }
-
-        // sem_clockwait() uses an absolute timeout, there is no need
-        // to recompute the relative timeout.
-#ifndef HAVE_SEM_CLOCKWAIT
-        if (timeout > 0) {
-            /* wait interrupted by a signal (EINTR): recompute the timeout */
-            timeout = _PyDeadline_Get(deadline);
-            if (timeout < 0) {
-                status = ETIMEDOUT;
-                break;
-            }
-        }
-#endif
-    }
-
-    /* Don't check the status if we're stopping because of an interrupt. */
-    if (!(intr_flag && status == EINTR)) {
-        if (timeout > 0) {
-            if (status != ETIMEDOUT) {
-#ifdef HAVE_SEM_CLOCKWAIT
-                CHECK_STATUS("sem_clockwait");
-#else
-                CHECK_STATUS("sem_timedwait");
-#endif
-            }
-        }
-        else if (timeout == 0) {
-            if (status != EAGAIN) {
-                CHECK_STATUS("sem_trywait");
-            }
-        }
-        else {
-            CHECK_STATUS("sem_wait");
-        }
-    }
-
-    if (status == 0) {
-        success = PY_LOCK_ACQUIRED;
-    } else if (intr_flag && status == EINTR) {
-        success = PY_LOCK_INTR;
-    } else {
-        success = PY_LOCK_FAILURE;
-    }
-
-    return success;
-}
-
-void
-PyThread_release_lock(PyThread_type_lock lock)
-{
-    sem_t *thelock = (sem_t *)lock;
-    int status, error = 0;
-
-    (void) error; /* silence unused-but-set-variable warning */
-
-    status = sem_post(thelock);
-    CHECK_STATUS("sem_post");
-}
-
-#else /* USE_SEMAPHORES */
-
-/*
- * Lock support.
- */
-PyThread_type_lock
-PyThread_allocate_lock(void)
-{
-    pthread_lock *lock;
-    int status, error = 0;
-
-    if (!initialized)
-        PyThread_init_thread();
-
-    lock = (pthread_lock *) PyMem_RawCalloc(1, sizeof(pthread_lock));
-    if (lock) {
-        lock->locked = 0;
-
-        status = pthread_mutex_init(&lock->mut, NULL);
-        CHECK_STATUS_PTHREAD("pthread_mutex_init");
-        /* Mark the pthread mutex underlying a Python mutex as
-           pure happens-before. We can't simply mark the
-           Python-level mutex as a mutex because it can be
-           acquired and released in different threads, which
-           will cause errors.
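The retry loop above recomputes the relative timeout only in the sem_timedwait branch, because sem_clockwait already takes an absolute deadline. A generic sketch of that recompute-after-EINTR pattern against a monotonic clock (the helper names are illustrative):

#include <errno.h>
#include <time.h>

// Sketch: retry an interruptible wait until an absolute monotonic deadline.
static long long
monotonic_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static int
wait_until(long long deadline_ns, int (*try_wait)(long long relative_ns))
{
    for (;;) {
        long long remaining = deadline_ns - monotonic_ns();
        if (remaining <= 0) {
            return ETIMEDOUT;  // deadline passed while we were retrying
        }
        int status = try_wait(remaining);
        if (status != EINTR) {
            return status;  // acquired (0), ETIMEDOUT, or a real error
        }
        // A signal interrupted the wait: loop and recompute 'remaining'.
    }
}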
-         */
-        _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(&lock->mut);
-
-        status = _PyThread_cond_init(&lock->lock_released);
-        CHECK_STATUS_PTHREAD("pthread_cond_init");
-
-        if (error) {
-            PyMem_RawFree((void *)lock);
-            lock = 0;
-        }
-    }
-
-    return (PyThread_type_lock) lock;
-}
-
-void
-PyThread_free_lock(PyThread_type_lock lock)
-{
-    pthread_lock *thelock = (pthread_lock *)lock;
-    int status, error = 0;
-
-    (void) error; /* silence unused-but-set-variable warning */
-
-    /* some pthread-like implementations tie the mutex to the cond
-     * and must have the cond destroyed first.
-     */
-    status = pthread_cond_destroy( &thelock->lock_released );
-    CHECK_STATUS_PTHREAD("pthread_cond_destroy");
-
-    status = pthread_mutex_destroy( &thelock->mut );
-    CHECK_STATUS_PTHREAD("pthread_mutex_destroy");
-
-    PyMem_RawFree((void *)thelock);
-}
-
-PyLockStatus
-PyThread_acquire_lock_timed(PyThread_type_lock lock, PY_TIMEOUT_T microseconds,
-                            int intr_flag)
-{
-    PyLockStatus success = PY_LOCK_FAILURE;
-    pthread_lock *thelock = (pthread_lock *)lock;
-    int status, error = 0;
-
-    if (microseconds == 0) {
-        status = pthread_mutex_trylock( &thelock->mut );
-        if (status != EBUSY) {
-            CHECK_STATUS_PTHREAD("pthread_mutex_trylock[1]");
-        }
-    }
-    else {
-        status = pthread_mutex_lock( &thelock->mut );
-        CHECK_STATUS_PTHREAD("pthread_mutex_lock[1]");
-    }
-    if (status != 0) {
-        goto done;
-    }
-
-    if (thelock->locked == 0) {
-        success = PY_LOCK_ACQUIRED;
-        goto unlock;
-    }
-    if (microseconds == 0) {
-        goto unlock;
-    }
-
-    struct timespec abs_timeout;
-    if (microseconds > 0) {
-        _PyThread_cond_after(microseconds, &abs_timeout);
-    }
-    // Continue trying until we get the lock
-
-    // mut must be locked by me -- part of the condition protocol
-    while (1) {
-        if (microseconds > 0) {
-            status = pthread_cond_timedwait(&thelock->lock_released,
-                                            &thelock->mut, &abs_timeout);
-            if (status == 1) {
-                break;
-            }
-            if (status == ETIMEDOUT) {
-                break;
-            }
-            CHECK_STATUS_PTHREAD("pthread_cond_timedwait");
-        }
-        else {
-            status = pthread_cond_wait(
-                &thelock->lock_released,
-                &thelock->mut);
-            CHECK_STATUS_PTHREAD("pthread_cond_wait");
-        }
-
-        if (intr_flag && status == 0 && thelock->locked) {
-            // We were woken up, but didn't get the lock. We probably received
-            // a signal. Return PY_LOCK_INTR to allow the caller to handle
-            // it and retry.
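The block being deleted builds a binary lock out of a mutex, a condition variable, and a locked flag. Stripped of CPython's status checking, the underlying protocol is roughly this sketch; the flag, not the mutex, is the lock, which is why it can be released by a thread other than the one that acquired it (something a plain pthread mutex forbids):

#include <pthread.h>

typedef struct {
    pthread_mutex_t mut;
    pthread_cond_t cv;
    int locked;
} binary_lock;  // illustrative stand-in for pthread_lock

static void
binary_lock_acquire(binary_lock *l)
{
    pthread_mutex_lock(&l->mut);
    while (l->locked) {
        // Spurious wakeups happen: re-check the flag after every wait.
        pthread_cond_wait(&l->cv, &l->mut);
    }
    l->locked = 1;
    pthread_mutex_unlock(&l->mut);
}

static void
binary_lock_release(binary_lock *l)
{
    pthread_mutex_lock(&l->mut);
    l->locked = 0;
    pthread_cond_signal(&l->cv);  // wake one waiter, if any
    pthread_mutex_unlock(&l->mut);
}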
-            success = PY_LOCK_INTR;
-            break;
-        }
-
-        if (status == 0 && !thelock->locked) {
-            success = PY_LOCK_ACQUIRED;
-            break;
-        }
-
-        // Wait got interrupted by a signal: retry
-    }
-
-unlock:
-    if (success == PY_LOCK_ACQUIRED) {
-        thelock->locked = 1;
-    }
-    status = pthread_mutex_unlock( &thelock->mut );
-    CHECK_STATUS_PTHREAD("pthread_mutex_unlock[1]");
-
-done:
-    if (error) {
-        success = PY_LOCK_FAILURE;
-    }
-    return success;
-}
-
-void
-PyThread_release_lock(PyThread_type_lock lock)
-{
-    pthread_lock *thelock = (pthread_lock *)lock;
-    int status, error = 0;
-
-    (void) error; /* silence unused-but-set-variable warning */
-
-    status = pthread_mutex_lock( &thelock->mut );
-    CHECK_STATUS_PTHREAD("pthread_mutex_lock[3]");
-
-    thelock->locked = 0;
-
-    /* wake up someone (anyone, if any) waiting on the lock */
-    status = pthread_cond_signal( &thelock->lock_released );
-    CHECK_STATUS_PTHREAD("pthread_cond_signal");
-
-    status = pthread_mutex_unlock( &thelock->mut );
-    CHECK_STATUS_PTHREAD("pthread_mutex_unlock[3]");
-}
-
-#endif /* USE_SEMAPHORES */
-
-int
-_PyThread_at_fork_reinit(PyThread_type_lock *lock)
-{
-    PyThread_type_lock new_lock = PyThread_allocate_lock();
-    if (new_lock == NULL) {
-        return -1;
-    }
-
-    /* bpo-6721, bpo-40089: The old lock can be in an inconsistent state.
-       fork() can be called in the middle of an operation on the lock done by
-       another thread. So don't call PyThread_free_lock(*lock).
-
-       Leak memory on purpose. Don't release the memory either since the
-       address of a mutex is relevant. Putting two mutexes at the same address
-       can lead to problems. */
-
-    *lock = new_lock;
-    return 0;
-}
-
-int
-PyThread_acquire_lock(PyThread_type_lock lock, int waitflag)
-{
-    return PyThread_acquire_lock_timed(lock, waitflag ? -1 : 0, /*intr_flag=*/0);
-}
 
 /* set the thread stack size.
  * Return 0 if size is valid, -1 if size is invalid,
diff --git a/Python/traceback.c b/Python/traceback.c
index c06cb1a5908..4f674eaf557 100644
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -9,7 +9,6 @@
 #include "pycore_interpframe.h" // _PyFrame_GetCode()
 #include "pycore_pyerrors.h" // _PyErr_GetRaisedException()
 #include "pycore_pystate.h" // _PyThreadState_GET()
-#include "pycore_sysmodule.h" // _PySys_GetOptionalAttr()
 #include "pycore_traceback.h" // EXCEPTION_TB_HEADER
 
 #include "frameobject.h" // PyFrame_New()
@@ -399,7 +398,7 @@ _Py_FindSourceFile(PyObject *filename, char* namebuf, size_t namelen, PyObject *
     taillen = strlen(tail);
 
     PyThreadState *tstate = _PyThreadState_GET();
-    if (_PySys_GetOptionalAttr(&_Py_ID(path), &syspath) < 0) {
+    if (PySys_GetOptionalAttr(&_Py_ID(path), &syspath) < 0) {
         PyErr_Clear();
         goto error;
     }
@@ -777,7 +776,7 @@ _PyTraceBack_Print(PyObject *v, const char *header, PyObject *f)
         PyErr_BadInternalCall();
         return -1;
     }
-    if (_PySys_GetOptionalAttrString("tracebacklimit", &limitv) < 0) {
+    if (PySys_GetOptionalAttrString("tracebacklimit", &limitv) < 0) {
         return -1;
     }
     else if (limitv != NULL && PyLong_Check(limitv)) {
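Throughout this change, the private _PySys_GetRequiredAttr*/_PySys_GetOptionalAttr* helpers give way to the public PySys_GetAttr family. A sketch of the two calling conventions as the diff uses them (error handling abbreviated):

// 1) "Required": returns a strong reference, or NULL with RuntimeError
//    set if the attribute is missing from sys.
PyObject *hook = PySys_GetAttrString("breakpointhook");
if (hook == NULL) {
    return NULL;
}
/* ... */
Py_DECREF(hook);

// 2) "Optional": returns 1 and writes a strong reference on success,
//    0 with *value set to NULL if the attribute is absent (no exception),
//    or -1 with an exception set on error.
PyObject *limitv = NULL;
if (PySys_GetOptionalAttrString("tracebacklimit", &limitv) < 0) {
    return -1;
}
if (limitv != NULL) {
    /* ... use limitv ... */
    Py_DECREF(limitv);
}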