1 files changed, 1029 insertions, 385 deletions
diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c
index 1211e0e9f11..5c7cb5b0a99 100644
--- a/Python/perf_jit_trampoline.c
+++ b/Python/perf_jit_trampoline.c
@@ -1,241 +1,354 @@
+/*
+ * Python Perf Trampoline Support - JIT Dump Implementation
+ *
+ * This file implements the perf jitdump API for Python's performance profiling
+ * integration. It allows perf (Linux performance analysis tool) to understand
+ * and profile dynamically generated Python bytecode by creating JIT dump files
+ * that perf can inject into its analysis.
+ *
+ *
+ * IMPORTANT: This file exports specific callback functions that are part of
+ * Python's internal API. Do not modify the function signatures or behavior
+ * of exported functions without coordinating with the Python core team.
+ *
+ * Usually the binary and libraries are mapped in separate region like below:
+ *
+ *   address ->
+ *    --+---------------------+--//--+---------------------+--
+ *      | .text | .data | ... |      | .text | .data | ... |
+ *    --+---------------------+--//--+---------------------+--
+ *          myprog                      libc.so
+ *
+ * So it'd be easy and straight-forward to find a mapped binary or library from an
+ * address.
+ *
+ * But for JIT code, the code arena only cares about the code section. But the
+ * resulting DSOs (which is generated by perf inject -j) contain ELF headers and
+ * unwind info too. Then it'd generate following address space with synthesized
+ * MMAP events. Let's say it has a sample between address B and C.
+ *
+ *                                                sample
+ *                                                  |
+ *   address ->                         A       B   v   C
+ *   ---------------------------------------------------------------------------------------------------
+ *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
+ *     ...
+ *   ---------------------------------------------------------------------------------------------------
+ *
+ * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
+ * the unwind info. If it maps both .text section and unwind sections, the sample
+ * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
+ * which one is right. So to make perf happy we have non-overlapping ranges for each
+ * DSO:
+ *
+ *   address ->
+ *   -------------------------------------------------------------------------------------------------------
+ *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
+ *   /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
+ *     ...
+ *   -------------------------------------------------------------------------------------------------------
+ *
+ * As the trampolines are constant, we add a constant padding but in general the padding needs to have the
+ * size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
+ */
+
+
+
 #include "Python.h"
 #include "pycore_ceval.h"         // _PyPerf_Callbacks
 #include "pycore_frame.h"
 #include "pycore_interp.h"
 #include "pycore_runtime.h"       // _PyRuntime
 
-
 #ifdef PY_HAVE_PERF_TRAMPOLINE
 
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>             // mmap()
-#include <sys/types.h>
-#include <unistd.h>               // sysconf()
-#include <sys/time.h>           // gettimeofday()
-#include <sys/syscall.h>
-
-// ----------------------------------
-//         Perf jitdump API
-// ----------------------------------
-
-typedef struct {
-    FILE* perf_map;
-    PyThread_type_lock map_lock;
-    void* mapped_buffer;
-    size_t mapped_size;
-    int code_id;
-} PerfMapJitState;
-
-static PerfMapJitState perf_jit_map_state;
+/* Standard library includes for perf jitdump implementation */
+#include <elf.h>                  // ELF architecture constants
+#include <fcntl.h>                // File control operations
+#include <stdio.h>                // Standard I/O operations
+#include <stdlib.h>               // Standard library functions
+#include <sys/mman.h>             // Memory mapping functions (mmap)
+#include <sys/types.h>            // System data types
+#include <unistd.h>               // System calls (sysconf, getpid)
+#include <sys/time.h>             // Time functions (gettimeofday)
+#include <sys/syscall.h>          // System call interface
+
+// =============================================================================
+//                           CONSTANTS AND CONFIGURATION
+// =============================================================================
 
 /*
-Usually the binary and libraries are mapped in separate region like below:
-
-  address ->
-   --+---------------------+--//--+---------------------+--
-     | .text | .data | ... |      | .text | .data | ... |
-   --+---------------------+--//--+---------------------+--
-         myprog                      libc.so
-
-So it'd be easy and straight-forward to find a mapped binary or library from an
-address.
-
-But for JIT code, the code arena only cares about the code section. But the
-resulting DSOs (which is generated by perf inject -j) contain ELF headers and
-unwind info too. Then it'd generate following address space with synthesized
-MMAP events. Let's say it has a sample between address B and C.
-
-                                               sample
-                                                 |
-  address ->                         A       B   v   C
-  ---------------------------------------------------------------------------------------------------
-  /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
-  /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
-  /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
-    ...
-  ---------------------------------------------------------------------------------------------------
-
-If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
-the unwind info. If it maps both .text section and unwind sections, the sample
-could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
-which one is right. So to make perf happy we have non-overlapping ranges for each
-DSO:
-
-  address ->
-  -------------------------------------------------------------------------------------------------------
-  /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
-  /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
-  /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
-    ...
-  -------------------------------------------------------------------------------------------------------
-
-As the trampolines are constant, we add a constant padding but in general the padding needs to have the
-size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
+ * Memory layout considerations for perf jitdump:
+ *
+ * Perf expects non-overlapping memory regions for each JIT-compiled function.
+ * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
+ * Shared Object) files that contain:
+ * - ELF headers
+ * - .text section (actual machine code)
+ * - Unwind information (for stack traces)
+ *
+ * To ensure proper address space layout, we add padding between code regions.
+ * This prevents address conflicts when perf maps the synthesized DSOs.
+ *
+ * Memory layout example:
+ * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
+ * /tmp/jitted-PID-1.so:                                       [headers][.text][unwind_info][padding]
+ *
+ * The padding size (0x100) is chosen to accommodate typical unwind info sizes
+ * while maintaining 16-byte alignment requirements.
  */
-
 #define PERF_JIT_CODE_PADDING 0x100
-#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
-
-typedef uint64_t uword;
-typedef const char* CodeComments;
 
-#define Pd "d"
-#define MB (1024 * 1024)
-
-#define EM_386      3
-#define EM_X86_64   62
-#define EM_ARM      40
-#define EM_AARCH64  183
-#define EM_RISCV    243
+/* Convenient access to the global trampoline API state */
+#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
 
-#define TARGET_ARCH_IA32   0
-#define TARGET_ARCH_X64    0
-#define TARGET_ARCH_ARM    0
-#define TARGET_ARCH_ARM64  0
-#define TARGET_ARCH_RISCV32 0
-#define TARGET_ARCH_RISCV64 0
+/* Type aliases for clarity and portability */
+typedef uint64_t uword;                    // Word-sized unsigned integer
+typedef const char* CodeComments;          // Code comment strings
 
-#define FLAG_generate_perf_jitdump 0
-#define FLAG_write_protect_code 0
-#define FLAG_write_protect_vm_isolate 0
-#define FLAG_code_comments 0
+/* Memory size constants */
+#define MB (1024 * 1024)                   // 1 Megabyte for buffer sizing
 
-#define UNREACHABLE()
+// =============================================================================
+//                        ARCHITECTURE-SPECIFIC DEFINITIONS
+// =============================================================================
 
-static uword GetElfMachineArchitecture(void) {
-#if TARGET_ARCH_IA32
-    return EM_386;
-#elif TARGET_ARCH_X64
+/*
+ * Returns the ELF machine architecture constant for the current platform.
+ * This is required for the jitdump header to correctly identify the target
+ * architecture for perf processing.
+ *
+ */
+static uint64_t GetElfMachineArchitecture(void) {
+#if defined(__x86_64__) || defined(_M_X64)
     return EM_X86_64;
-#elif TARGET_ARCH_ARM
-    return EM_ARM;
-#elif TARGET_ARCH_ARM64
+#elif defined(__i386__) || defined(_M_IX86)
+    return EM_386;
+#elif defined(__aarch64__)
     return EM_AARCH64;
-#elif TARGET_ARCH_RISCV32 || TARGET_ARCH_RISCV64
+#elif defined(__arm__) || defined(_M_ARM)
+    return EM_ARM;
+#elif defined(__riscv)
     return EM_RISCV;
 #else
-    UNREACHABLE();
+    Py_UNREACHABLE();  // Unsupported architecture - should never reach here
     return 0;
 #endif
 }
 
+// =============================================================================
+//                           PERF JITDUMP DATA STRUCTURES
+// =============================================================================
+
+/*
+ * Perf jitdump file format structures
+ *
+ * These structures define the binary format that perf expects for JIT dump files.
+ * The format is documented in the Linux perf tools source code and must match
+ * exactly for proper perf integration.
+ */
+
+/*
+ * Jitdump file header - written once at the beginning of each jitdump file
+ * Contains metadata about the process and jitdump format version
+ */
 typedef struct {
-    uint32_t magic;
-    uint32_t version;
-    uint32_t size;
-    uint32_t elf_mach_target;
-    uint32_t reserved;
-    uint32_t process_id;
-    uint64_t time_stamp;
-    uint64_t flags;
+    uint32_t magic;              // Magic number (0x4A695444 = "JiTD")
+    uint32_t version;            // Jitdump format version (currently 1)
+    uint32_t size;               // Size of this header structure
+    uint32_t elf_mach_target;    // Target architecture (from GetElfMachineArchitecture)
+    uint32_t reserved;           // Reserved field (must be 0)
+    uint32_t process_id;         // Process ID of the JIT compiler
+    uint64_t time_stamp;         // Timestamp when jitdump was created
+    uint64_t flags;              // Feature flags (currently unused)
 } Header;
 
- enum PerfEvent {
-    PerfLoad = 0,
-    PerfMove = 1,
-    PerfDebugInfo = 2,
-    PerfClose = 3,
-    PerfUnwindingInfo = 4
+/*
+ * Perf event types supported by the jitdump format
+ * Each event type has a corresponding structure format
+ */
+enum PerfEvent {
+    PerfLoad = 0,           // Code load event (new JIT function)
+    PerfMove = 1,           // Code move event (function relocated)
+    PerfDebugInfo = 2,      // Debug information event
+    PerfClose = 3,          // JIT session close event
+    PerfUnwindingInfo = 4   // Stack unwinding information event
 };
 
+/*
+ * Base event structure - common header for all perf events
+ * Every event in the jitdump file starts with this structure
+ */
 struct BaseEvent {
-    uint32_t event;
-    uint32_t size;
-    uint64_t time_stamp;
-  };
+    uint32_t event;         // Event type (from PerfEvent enum)
+    uint32_t size;          // Total size of this event including payload
+    uint64_t time_stamp;    // Timestamp when event occurred
+};
 
+/*
+ * Code load event - indicates a new JIT-compiled function is available
+ * This is the most important event type for Python profiling
+ */
 typedef struct {
-    struct BaseEvent base;
-    uint32_t process_id;
-    uint32_t thread_id;
-    uint64_t vma;
-    uint64_t code_address;
-    uint64_t code_size;
-    uint64_t code_id;
+    struct BaseEvent base;   // Common event header
+    uint32_t process_id;     // Process ID where code was generated
+    uint32_t thread_id;      // Thread ID where code was generated
+    uint64_t vma;            // Virtual memory address where code is loaded
+    uint64_t code_address;   // Address of the actual machine code
+    uint64_t code_size;      // Size of the machine code in bytes
+    uint64_t code_id;        // Unique identifier for this code region
+    /* Followed by:
+     * - null-terminated function name string
+     * - raw machine code bytes
+     */
 } CodeLoadEvent;
 
+/*
+ * Code unwinding information event - provides DWARF data for stack traces
+ * Essential for proper stack unwinding during profiling
+ */
 typedef struct {
-    struct BaseEvent base;
-    uint64_t unwind_data_size;
-    uint64_t eh_frame_hdr_size;
-    uint64_t mapped_size;
+    struct BaseEvent base;      // Common event header
+    uint64_t unwind_data_size;  // Size of the unwinding data
+    uint64_t eh_frame_hdr_size; // Size of the EH frame header
+    uint64_t mapped_size;       // Total mapped size (with padding)
+    /* Followed by:
+     * - EH frame header
+     * - DWARF unwinding information
+     * - Padding to alignment boundary
+     */
 } CodeUnwindingInfoEvent;
 
-static const intptr_t nanoseconds_per_second = 1000000000;
-
-// Dwarf encoding constants
+// =============================================================================
+//                              GLOBAL STATE MANAGEMENT
+// =============================================================================
 
-static const uint8_t DwarfUData4 = 0x03;
-static const uint8_t DwarfSData4 = 0x0b;
-static const uint8_t DwarfPcRel = 0x10;
-static const uint8_t DwarfDataRel = 0x30;
-// static uint8_t DwarfOmit = 0xff;
+/*
+ * Global state for the perf jitdump implementation
+ *
+ * This structure maintains all the state needed for generating jitdump files.
+ * It's designed as a singleton since there's typically only one jitdump file
+ * per Python process.
+ */
 typedef struct {
-    unsigned char version;
-    unsigned char eh_frame_ptr_enc;
-    unsigned char fde_count_enc;
-    unsigned char table_enc;
-    int32_t eh_frame_ptr;
-    int32_t eh_fde_count;
-    int32_t from;
-    int32_t to;
-} EhFrameHeader;
+    FILE* perf_map;          // File handle for the jitdump file
+    PyThread_type_lock map_lock;  // Thread synchronization lock
+    void* mapped_buffer;     // Memory-mapped region (signals perf we're active)
+    size_t mapped_size;      // Size of the mapped region
+    int code_id;             // Counter for unique code region identifiers
+} PerfMapJitState;
+
+/* Global singleton instance */
+static PerfMapJitState perf_jit_map_state;
+
+// =============================================================================
+//                              TIME UTILITIES
+// =============================================================================
 
+/* Time conversion constant */
+static const intptr_t nanoseconds_per_second = 1000000000;
+
+/*
+ * Get current monotonic time in nanoseconds
+ *
+ * Monotonic time is preferred for event timestamps because it's not affected
+ * by system clock adjustments. This ensures consistent timing relationships
+ * between events even if the system clock is changed.
+ *
+ * Returns: Current monotonic time in nanoseconds since an arbitrary epoch
+ */
 static int64_t get_current_monotonic_ticks(void) {
     struct timespec ts;
     if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
-        UNREACHABLE();
+        Py_UNREACHABLE();  // Should never fail on supported systems
         return 0;
     }
-    // Convert to nanoseconds.
+
+    /* Convert to nanoseconds for maximum precision */
     int64_t result = ts.tv_sec;
     result *= nanoseconds_per_second;
     result += ts.tv_nsec;
     return result;
 }
 
+/*
+ * Get current wall clock time in microseconds
+ *
+ * Used for the jitdump file header timestamp. Unlike monotonic time,
+ * this represents actual wall clock time that can be correlated with
+ * other system events.
+ *
+ * Returns: Current time in microseconds since Unix epoch
+ */
 static int64_t get_current_time_microseconds(void) {
-  // gettimeofday has microsecond resolution.
-  struct timeval tv;
-  if (gettimeofday(&tv, NULL) < 0) {
-    UNREACHABLE();
-    return 0;
-  }
-  return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec;
+    struct timeval tv;
+    if (gettimeofday(&tv, NULL) < 0) {
+        Py_UNREACHABLE();  // Should never fail on supported systems
+        return 0;
+    }
+    return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec;
 }
 
+// =============================================================================
+//                              UTILITY FUNCTIONS
+// =============================================================================
 
+/*
+ * Round up a value to the next multiple of a given number
+ *
+ * This is essential for maintaining proper alignment requirements in the
+ * jitdump format. Many structures need to be aligned to specific boundaries
+ * (typically 8 or 16 bytes) for efficient processing by perf.
+ *
+ * Args:
+ *   value: The value to round up
+ *   multiple: The multiple to round up to
+ *
+ * Returns: The smallest value >= input that is a multiple of 'multiple'
+ */
 static size_t round_up(int64_t value, int64_t multiple) {
     if (multiple == 0) {
-        // Avoid division by zero
-        return value;
+        return value;  // Avoid division by zero
     }
 
     int64_t remainder = value % multiple;
     if (remainder == 0) {
-        // Value is already a multiple of 'multiple'
-        return value;
+        return value;  // Already aligned
     }
 
-    // Calculate the difference to the next multiple
+    /* Calculate how much to add to reach the next multiple */
     int64_t difference = multiple - remainder;
-
-    // Add the difference to the value
     int64_t rounded_up_value = value + difference;
 
     return rounded_up_value;
 }
 
+// =============================================================================
+//                              FILE I/O UTILITIES
+// =============================================================================
 
+/*
+ * Write data to the jitdump file with error handling
+ *
+ * This function ensures that all data is written to the file, handling
+ * partial writes that can occur with large buffers or when the system
+ * is under load.
+ *
+ * Args:
+ *   buffer: Pointer to data to write
+ *   size: Number of bytes to write
+ */
 static void perf_map_jit_write_fully(const void* buffer, size_t size) {
     FILE* out_file = perf_jit_map_state.perf_map;
     const char* ptr = (const char*)(buffer);
+
     while (size > 0) {
         const size_t written = fwrite(ptr, 1, size, out_file);
         if (written == 0) {
-            UNREACHABLE();
+            Py_UNREACHABLE();  // Write failure - should be very rare
             break;
         }
         size -= written;
@@ -243,284 +356,720 @@ static void perf_map_jit_write_fully(const void* buffer, size_t size) {
     }
 }
 
+/*
+ * Write the jitdump file header
+ *
+ * The header must be written exactly once at the beginning of each jitdump
+ * file. It provides metadata that perf uses to parse the rest of the file.
+ *
+ * Args:
+ *   pid: Process ID to include in the header
+ *   out_file: File handle to write to (currently unused, uses global state)
+ */
 static void perf_map_jit_write_header(int pid, FILE* out_file) {
     Header header;
-    header.magic = 0x4A695444;
-    header.version = 1;
-    header.size = sizeof(Header);
-    header.elf_mach_target = GetElfMachineArchitecture();
-    header.process_id = pid;
-    header.time_stamp = get_current_time_microseconds();
-    header.flags = 0;
-    perf_map_jit_write_fully(&header, sizeof(header));
-}
 
-static void* perf_map_jit_init(void) {
-    char filename[100];
-    int pid = getpid();
-    snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
-    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
-    if (fd == -1) {
-        return NULL;
-    }
+    /* Initialize header with required values */
+    header.magic = 0x4A695444;                    // "JiTD" magic number
+    header.version = 1;                           // Current jitdump version
+    header.size = sizeof(Header);                 // Header size for validation
+    header.elf_mach_target = GetElfMachineArchitecture();  // Target architecture
+    header.process_id = pid;                      // Process identifier
+    header.time_stamp = get_current_time_microseconds();   // Creation time
+    header.flags = 0;                             // No special flags currently used
 
-    const long page_size = sysconf(_SC_PAGESIZE);  // NOLINT(runtime/int)
-    if (page_size == -1) {
-        close(fd);
-        return NULL;
-    }
-
-    // The perf jit interface forces us to map the first page of the file
-    // to signal that we are using the interface.
-    perf_jit_map_state.mapped_buffer = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0);
-    if (perf_jit_map_state.mapped_buffer == NULL) {
-        close(fd);
-        return NULL;
-    }
-    perf_jit_map_state.mapped_size = page_size;
-    perf_jit_map_state.perf_map = fdopen(fd, "w+");
-    if (perf_jit_map_state.perf_map == NULL) {
-        close(fd);
-        return NULL;
-    }
-    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
-    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
-
-    perf_jit_map_state.map_lock = PyThread_allocate_lock();
-    if (perf_jit_map_state.map_lock == NULL) {
-        fclose(perf_jit_map_state.perf_map);
-        return NULL;
-    }
-    perf_jit_map_state.code_id = 0;
-
-    trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
-    return &perf_jit_map_state;
+    perf_map_jit_write_fully(&header, sizeof(header));
 }
 
-/* DWARF definitions. */
+// =============================================================================
+//                              DWARF CONSTANTS AND UTILITIES
+// =============================================================================
+
+/*
+ * DWARF (Debug With Arbitrary Record Formats) constants
+ *
+ * DWARF is a debugging data format used to provide stack unwinding information.
+ * These constants define the various encoding types and opcodes used in
+ * DWARF Call Frame Information (CFI) records.
+ */
 
+/* DWARF Call Frame Information version */
 #define DWRF_CIE_VERSION 1
 
+/* DWARF CFA (Call Frame Address) opcodes */
 enum {
-    DWRF_CFA_nop = 0x0,
-    DWRF_CFA_offset_extended = 0x5,
-    DWRF_CFA_def_cfa = 0xc,
-    DWRF_CFA_def_cfa_offset = 0xe,
-    DWRF_CFA_offset_extended_sf = 0x11,
-    DWRF_CFA_advance_loc = 0x40,
-    DWRF_CFA_offset = 0x80
+    DWRF_CFA_nop = 0x0,                    // No operation
+    DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
+    DWRF_CFA_def_cfa = 0xc,               // Define CFA rule
+    DWRF_CFA_def_cfa_offset = 0xe,        // Define CFA offset
+    DWRF_CFA_offset_extended_sf = 0x11,   // Extended signed offset
+    DWRF_CFA_advance_loc = 0x40,          // Advance location counter
+    DWRF_CFA_offset = 0x80                // Simple offset instruction
 };
 
-enum
-  {
-    DWRF_EH_PE_absptr = 0x00,
-    DWRF_EH_PE_omit = 0xff,
-
-    /* FDE data encoding.  */
-    DWRF_EH_PE_uleb128 = 0x01,
-    DWRF_EH_PE_udata2 = 0x02,
-    DWRF_EH_PE_udata4 = 0x03,
-    DWRF_EH_PE_udata8 = 0x04,
-    DWRF_EH_PE_sleb128 = 0x09,
-    DWRF_EH_PE_sdata2 = 0x0a,
-    DWRF_EH_PE_sdata4 = 0x0b,
-    DWRF_EH_PE_sdata8 = 0x0c,
-    DWRF_EH_PE_signed = 0x08,
-
-    /* FDE flags.  */
-    DWRF_EH_PE_pcrel = 0x10,
-    DWRF_EH_PE_textrel = 0x20,
-    DWRF_EH_PE_datarel = 0x30,
-    DWRF_EH_PE_funcrel = 0x40,
-    DWRF_EH_PE_aligned = 0x50,
-
-    DWRF_EH_PE_indirect = 0x80
-  };
+/* DWARF Exception Handling pointer encodings */
+enum {
+    DWRF_EH_PE_absptr = 0x00,             // Absolute pointer
+    DWRF_EH_PE_omit = 0xff,               // Omitted value
+
+    /* Data type encodings */
+    DWRF_EH_PE_uleb128 = 0x01,            // Unsigned LEB128
+    DWRF_EH_PE_udata2 = 0x02,             // Unsigned 2-byte
+    DWRF_EH_PE_udata4 = 0x03,             // Unsigned 4-byte
+    DWRF_EH_PE_udata8 = 0x04,             // Unsigned 8-byte
+    DWRF_EH_PE_sleb128 = 0x09,            // Signed LEB128
+    DWRF_EH_PE_sdata2 = 0x0a,             // Signed 2-byte
+    DWRF_EH_PE_sdata4 = 0x0b,             // Signed 4-byte
+    DWRF_EH_PE_sdata8 = 0x0c,             // Signed 8-byte
+    DWRF_EH_PE_signed = 0x08,             // Signed flag
+
+    /* Reference type encodings */
+    DWRF_EH_PE_pcrel = 0x10,              // PC-relative
+    DWRF_EH_PE_textrel = 0x20,            // Text-relative
+    DWRF_EH_PE_datarel = 0x30,            // Data-relative
+    DWRF_EH_PE_funcrel = 0x40,            // Function-relative
+    DWRF_EH_PE_aligned = 0x50,            // Aligned
+    DWRF_EH_PE_indirect = 0x80            // Indirect
+};
 
+/* Additional DWARF constants for debug information */
 enum { DWRF_TAG_compile_unit = 0x11 };
-
 enum { DWRF_children_no = 0, DWRF_children_yes = 1 };
+enum {
+    DWRF_AT_name = 0x03,         // Name attribute
+    DWRF_AT_stmt_list = 0x10,    // Statement list
+    DWRF_AT_low_pc = 0x11,       // Low PC address
+    DWRF_AT_high_pc = 0x12       // High PC address
+};
+enum {
+    DWRF_FORM_addr = 0x01,       // Address form
+    DWRF_FORM_data4 = 0x06,      // 4-byte data
+    DWRF_FORM_string = 0x08      // String form
+};
 
-enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 };
-
-enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 };
-
-enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 };
+/* Line number program opcodes */
+enum {
+    DWRF_LNS_extended_op = 0,    // Extended opcode
+    DWRF_LNS_copy = 1,           // Copy operation
+    DWRF_LNS_advance_pc = 2,     // Advance program counter
+    DWRF_LNS_advance_line = 3    // Advance line number
+};
 
-enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 };
+/* Line number extended opcodes */
+enum {
+    DWRF_LNE_end_sequence = 1,   // End of sequence
+    DWRF_LNE_set_address = 2     // Set address
+};
 
+/*
+ * Architecture-specific DWARF register numbers
+ *
+ * These constants define the register numbering scheme used by DWARF
+ * for each supported architecture. The numbers must match the ABI
+ * specification for proper stack unwinding.
+ */
 enum {
 #ifdef __x86_64__
-    /* Yes, the order is strange, but correct. */
-    DWRF_REG_AX,
-    DWRF_REG_DX,
-    DWRF_REG_CX,
-    DWRF_REG_BX,
-    DWRF_REG_SI,
-    DWRF_REG_DI,
-    DWRF_REG_BP,
-    DWRF_REG_SP,
-    DWRF_REG_8,
-    DWRF_REG_9,
-    DWRF_REG_10,
-    DWRF_REG_11,
-    DWRF_REG_12,
-    DWRF_REG_13,
-    DWRF_REG_14,
-    DWRF_REG_15,
-    DWRF_REG_RA,
+    /* x86_64 register numbering (note: order is defined by x86_64 ABI) */
+    DWRF_REG_AX,    // RAX
+    DWRF_REG_DX,    // RDX
+    DWRF_REG_CX,    // RCX
+    DWRF_REG_BX,    // RBX
+    DWRF_REG_SI,    // RSI
+    DWRF_REG_DI,    // RDI
+    DWRF_REG_BP,    // RBP
+    DWRF_REG_SP,    // RSP
+    DWRF_REG_8,     // R8
+    DWRF_REG_9,     // R9
+    DWRF_REG_10,    // R10
+    DWRF_REG_11,    // R11
+    DWRF_REG_12,    // R12
+    DWRF_REG_13,    // R13
+    DWRF_REG_14,    // R14
+    DWRF_REG_15,    // R15
+    DWRF_REG_RA,    // Return address (RIP)
 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-    DWRF_REG_SP = 31,
-    DWRF_REG_RA = 30,
+    /* AArch64 register numbering */
+    DWRF_REG_FP = 29,  // Frame Pointer
+    DWRF_REG_RA = 30,  // Link register (return address)
+    DWRF_REG_SP = 31,  // Stack pointer
 #else
 #    error "Unsupported target architecture"
 #endif
 };
 
-typedef struct ELFObjectContext
-{
-    uint8_t* p; /* Pointer to next address in obj.space. */
-    uint8_t* startp; /* Pointer to start address in obj.space. */
-    uint8_t* eh_frame_p; /* Pointer to start address in obj.space. */
-    uint32_t code_size; /* Size of machine code. */
+/* DWARF encoding constants used in EH frame headers */
+static const uint8_t DwarfUData4 = 0x03;     // Unsigned 4-byte data
+static const uint8_t DwarfSData4 = 0x0b;     // Signed 4-byte data
+static const uint8_t DwarfPcRel = 0x10;      // PC-relative encoding
+static const uint8_t DwarfDataRel = 0x30;    // Data-relative encoding
+
+// =============================================================================
+//                              ELF OBJECT CONTEXT
+// =============================================================================
+
+/*
+ * Context for building ELF/DWARF structures
+ *
+ * This structure maintains state while constructing DWARF unwind information.
+ * It acts as a simple buffer manager with pointers to track current position
+ * and important landmarks within the buffer.
+ */
+typedef struct ELFObjectContext {
+    uint8_t* p;            // Current write position in buffer
+    uint8_t* startp;       // Start of buffer (for offset calculations)
+    uint8_t* eh_frame_p;   // Start of EH frame data (for relative offsets)
+    uint32_t code_size;    // Size of the code being described
 } ELFObjectContext;
 
-/* Append a null-terminated string. */
-static uint32_t
-elfctx_append_string(ELFObjectContext* ctx, const char* str)
-{
+/*
+ * EH Frame Header structure for DWARF unwinding
+ *
+ * This structure provides metadata about the DWARF unwinding information
+ * that follows. It's required by the perf jitdump format to enable proper
+ * stack unwinding during profiling.
+ */
+typedef struct {
+    unsigned char version;           // EH frame version (always 1)
+    unsigned char eh_frame_ptr_enc;  // Encoding of EH frame pointer
+    unsigned char fde_count_enc;     // Encoding of FDE count
+    unsigned char table_enc;         // Encoding of table entries
+    int32_t eh_frame_ptr;           // Pointer to EH frame data
+    int32_t eh_fde_count;           // Number of FDEs (Frame Description Entries)
+    int32_t from;                   // Start address of code range
+    int32_t to;                     // End address of code range
+} EhFrameHeader;
+
+// =============================================================================
+//                              DWARF GENERATION UTILITIES
+// =============================================================================
+
+/*
+ * Append a null-terminated string to the ELF context buffer
+ *
+ * Args:
+ *   ctx: ELF object context
+ *   str: String to append (must be null-terminated)
+ *
+ * Returns: Offset from start of buffer where string was written
+ */
+static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
     uint8_t* p = ctx->p;
     uint32_t ofs = (uint32_t)(p - ctx->startp);
+
+    /* Copy string including null terminator */
     do {
         *p++ = (uint8_t)*str;
     } while (*str++);
+
     ctx->p = p;
     return ofs;
 }
 
-/* Append a SLEB128 value. */
-static void
-elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v)
-{
+/*
+ * Append a SLEB128 (Signed Little Endian Base 128) value
+ *
+ * SLEB128 is a variable-length encoding used extensively in DWARF.
+ * It efficiently encodes small numbers in fewer bytes.
+ *
+ * Args:
+ *   ctx: ELF object context
+ *   v: Signed value to encode
+ */
+static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
     uint8_t* p = ctx->p;
+
+    /* Encode 7 bits at a time, with continuation bit in MSB */
     for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) {
-        *p++ = (uint8_t)((v & 0x7f) | 0x80);
+        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
     }
-    *p++ = (uint8_t)(v & 0x7f);
+    *p++ = (uint8_t)(v & 0x7f);  // Final byte without continuation bit
+
     ctx->p = p;
 }
 
-/* Append a ULEB128 to buffer. */
-static void
-elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v)
-{
+/*
+ * Append a ULEB128 (Unsigned Little Endian Base 128) value
+ *
+ * Similar to SLEB128 but for unsigned values.
+ *
+ * Args:
+ *   ctx: ELF object context
+ *   v: Unsigned value to encode
+ */
+static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
     uint8_t* p = ctx->p;
+
+    /* Encode 7 bits at a time, with continuation bit in MSB */
     for (; v >= 0x80; v >>= 7) {
-        *p++ = (char)((v & 0x7f) | 0x80);
+        *p++ = (char)((v & 0x7f) | 0x80);  // Set continuation bit
     }
-    *p++ = (char)v;
+    *p++ = (char)v;  // Final byte without continuation bit
+
     ctx->p = p;
 }
 
-/* Shortcuts to generate DWARF structures. */
-#define DWRF_U8(x) (*p++ = (x))
-#define DWRF_I8(x) (*(int8_t*)p = (x), p++)
-#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)
-#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)
-#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))
-#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)
-#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)
-#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)
-#define DWRF_ALIGNNOP(s)                                                                                \
-    while ((uintptr_t)p & ((s)-1)) {                                                                    \
-        *p++ = DWRF_CFA_nop;                                                                            \
+/*
+ * Macros for generating DWARF structures
+ *
+ * These macros provide a convenient way to write various data types
+ * to the DWARF buffer while automatically advancing the pointer.
+ */
+#define DWRF_U8(x) (*p++ = (x))                                    // Write unsigned 8-bit
+#define DWRF_I8(x) (*(int8_t*)p = (x), p++)                       // Write signed 8-bit
+#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)                 // Write unsigned 16-bit
+#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)                 // Write unsigned 32-bit
+#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address
+#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128
+#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128
+#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string
+
+/* Align to specified boundary with NOP instructions */
+#define DWRF_ALIGNNOP(s)                                          \
+    while ((uintptr_t)p & ((s)-1)) {                              \
+        *p++ = DWRF_CFA_nop;                                       \
     }
-#define DWRF_SECTION(name, stmt)                                                                        \
-    {                                                                                                   \
-        uint32_t* szp_##name = (uint32_t*)p;                                                            \
-        p += 4;                                                                                         \
-        stmt;                                                                                           \
-        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4);                                       \
+
+/* Write a DWARF section with automatic size calculation */
+#define DWRF_SECTION(name, stmt)                                  \
+    {                                                             \
+        uint32_t* szp_##name = (uint32_t*)p;                      \
+        p += 4;                                                   \
+        stmt;                                                     \
+        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
     }
 
-/* Initialize .eh_frame section. */
-static void
-elf_init_ehframe(ELFObjectContext* ctx)
-{
+// =============================================================================
+//                              DWARF EH FRAME GENERATION
+// =============================================================================
+
+/*
+ * Initialize DWARF .eh_frame section for a code region
+ *
+ * The .eh_frame section contains Call Frame Information (CFI) that describes
+ * how to unwind the stack at any point in the code. This is essential for
+ * proper profiling as it allows perf to generate accurate call graphs.
+ *
+ * The function generates two main components:
+ * 1. CIE (Common Information Entry) - describes calling conventions
+ * 2. FDE (Frame Description Entry) - describes specific function unwinding
+ *
+ * Args:
+ *   ctx: ELF object context containing code size and buffer pointers
+ */
+static void elf_init_ehframe(ELFObjectContext* ctx) {
     uint8_t* p = ctx->p;
-    uint8_t* framep = p;
-
-    /* Emit DWARF EH CIE. */
-    DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */
-                 DWRF_U8(DWRF_CIE_VERSION);
-                 DWRF_STR("zR"); /* Augmentation. */
-                 DWRF_UV(1); /* Code alignment factor. */
-                 DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */
-                 DWRF_U8(DWRF_REG_RA); /* Return address register. */
-                 DWRF_UV(1);
-                 DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */
-                 DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t));
-                 DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1);
-                 DWRF_ALIGNNOP(sizeof(uintptr_t));
+    uint8_t* framep = p;  // Remember start of frame data
+
+    /*
+    * DWARF Unwind Table for Trampoline Function
+    *
+    * This section defines DWARF Call Frame Information (CFI) using encoded macros
+    * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function
+    * preserves and restores registers. This is used by profiling tools (e.g., `perf`)
+    * and debuggers for stack unwinding in JIT-compiled code.
+    *
+    * -------------------------------------------------
+    * TO REGENERATE THIS TABLE FROM GCC OBJECTS:
+    * -------------------------------------------------
+    *
+    * 1. Create a trampoline source file (e.g., `trampoline.c`):
+    *
+    *      #include <Python.h>
+    *      typedef PyObject* (*py_evaluator)(void*, void*, int);
+    *      PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) {
+    *          return evaluator(ts, f, throwflag);
+    *      }
+    *
+    * 2. Compile to an object file with frame pointer preservation:
+    *
+    *      gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
+    *
+    * 3. Extract DWARF unwind info from the object file:
+    *
+    *      readelf -w trampoline.o
+    *
+    *    Example output from `.eh_frame`:
+    *
+    *      00000000 CIE
+    *        Version:               1
+    *        Augmentation:          "zR"
+    *        Code alignment factor: 4
+    *        Data alignment factor: -8
+    *        Return address column: 30
+    *        DW_CFA_def_cfa: r31 (sp) ofs 0
+    *
+    *      00000014 FDE cie=00000000 pc=0..14
+    *        DW_CFA_advance_loc: 4
+    *        DW_CFA_def_cfa_offset: 16
+    *        DW_CFA_offset: r29 at cfa-16
+    *        DW_CFA_offset: r30 at cfa-8
+    *        DW_CFA_advance_loc: 12
+    *        DW_CFA_restore: r30
+    *        DW_CFA_restore: r29
+    *        DW_CFA_def_cfa_offset: 0
+    *
+    * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`.
+    *
+    * ----------------------------------
+    * HOW TO TRANSLATE TO DWRF_* MACROS:
+    * ----------------------------------
+    *
+    * After compiling your trampoline with:
+    *
+    *     gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
+    *
+    * run:
+    *
+    *     readelf -w trampoline.o
+    *
+    * to inspect the generated `.eh_frame` data. You will see two main components:
+    *
+    *     1. A CIE (Common Information Entry): shared configuration used by all FDEs.
+    *     2. An FDE (Frame Description Entry): function-specific unwind instructions.
+    *
+    * ---------------------
+    * Translating the CIE:
+    * ---------------------
+    * From `readelf -w`, you might see:
+    *
+    *   00000000 0000000000000010 00000000 CIE
+    *     Version:               1
+    *     Augmentation:          "zR"
+    *     Code alignment factor: 4
+    *     Data alignment factor: -8
+    *     Return address column: 30
+    *     Augmentation data:     1b
+    *     DW_CFA_def_cfa: r31 (sp) ofs 0
+    *
+    * Map this to:
+    *
+    *     DWRF_SECTION(CIE,
+    *         DWRF_U32(0);                             // CIE ID (always 0 for CIEs)
+    *         DWRF_U8(DWRF_CIE_VERSION);              // Version: 1
+    *         DWRF_STR("zR");                         // Augmentation string "zR"
+    *         DWRF_UV(4);                             // Code alignment factor = 4
+    *         DWRF_SV(-8);                            // Data alignment factor = -8
+    *         DWRF_U8(DWRF_REG_RA);                   // Return address register (e.g., x30 = 30)
+    *         DWRF_UV(1);                             // Augmentation data length = 1
+    *         DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers
+    *
+    *         DWRF_U8(DWRF_CFA_def_cfa);              // DW_CFA_def_cfa
+    *         DWRF_UV(DWRF_REG_SP);                   // Register: SP (r31)
+    *         DWRF_UV(0);                             // Offset = 0
+    *
+    *         DWRF_ALIGNNOP(sizeof(uintptr_t));       // Align to pointer size boundary
+    *     )
+    *
+    * Notes:
+    *   - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128.
+    *   - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants.
+    *
+    * ---------------------
+    * Translating the FDE:
+    * ---------------------
+    * From `readelf -w`:
+    *
+    *   00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
+    *     DW_CFA_advance_loc: 4
+    *     DW_CFA_def_cfa_offset: 16
+    *     DW_CFA_offset: r29 at cfa-16
+    *     DW_CFA_offset: r30 at cfa-8
+    *     DW_CFA_advance_loc: 12
+    *     DW_CFA_restore: r30
+    *     DW_CFA_restore: r29
+    *     DW_CFA_def_cfa_offset: 0
+    *
+    * Map the FDE header and instructions to:
+    *
+    *     DWRF_SECTION(FDE,
+    *         DWRF_U32((uint32_t)(p - framep));       // Offset to CIE (relative from here)
+    *         DWRF_U32(-0x30);                        // Initial PC-relative location of the code
+    *         DWRF_U32(ctx->code_size);               // Code range covered by this FDE
+    *         DWRF_U8(0);                             // Augmentation data length (none)
+    *
+    *         DWRF_U8(DWRF_CFA_advance_loc | 1);      // Advance location by 1 unit (1 * 4 = 4 bytes)
+    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP + 16
+    *         DWRF_UV(16);
+    *
+    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
+    *         DWRF_UV(2);                             // At offset 2 * 8 = 16 bytes
+    *
+    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
+    *         DWRF_UV(1);                             // At offset 1 * 8 = 8 bytes
+    *
+    *         DWRF_U8(DWRF_CFA_advance_loc | 3);      // Advance location by 3 units (3 * 4 = 12 bytes)
+    *
+    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30
+    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29
+    *
+    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP
+    *         DWRF_UV(0);
+    *     )
+    *
+    * To regenerate:
+    *   1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
+    *   2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
+    *      the code is in a different address space every time.
+    *   3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
+    *        - `DW_CFA_def_cfa_offset`     → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
+    *        - `DW_CFA_offset: rX`         → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
+    *        - `DW_CFA_restore: rX`        → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset
+    *        - `DW_CFA_advance_loc: N`     → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
+    *   4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
+    *   5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
+    */
+
+    /*
+     * Emit DWARF EH CIE (Common Information Entry)
+     *
+     * The CIE describes the calling conventions and basic unwinding rules
+     * that apply to all functions in this compilation unit.
+     */
+    DWRF_SECTION(CIE,
+        DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
+        DWRF_U8(DWRF_CIE_VERSION);            // CIE version (1)
+        DWRF_STR("zR");                       // Augmentation string ("zR" = has LSDA)
+        DWRF_UV(1);                           // Code alignment factor
+        DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
+        DWRF_U8(DWRF_REG_RA);                 // Return address register number
+        DWRF_UV(1);                           // Augmentation data length
+        DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding
+
+        /* Initial CFI instructions - describe default calling convention */
+        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
+        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
+        DWRF_UV(sizeof(uintptr_t));           // CFA = SP + pointer_size
+        DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
+        DWRF_UV(1);                           // At offset 1 from CFA
+
+        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
     )
 
-    ctx->eh_frame_p = p;
-
-    /* Emit DWARF EH FDE. */
-    DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE. */
-                 DWRF_U32(-0x30); /* Machine code offset relative to .text. */
-                 DWRF_U32(ctx->code_size); /* Machine code length. */
-                 DWRF_U8(0); /* Augmentation data. */
-    /* Registers saved in CFRAME. */
+    ctx->eh_frame_p = p;  // Remember start of FDE data
+
+    /*
+     * Emit DWARF EH FDE (Frame Description Entry)
+     *
+     * The FDE describes unwinding information specific to this function.
+     * It references the CIE and provides function-specific CFI instructions.
+     */
+    DWRF_SECTION(FDE,
+        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
+        DWRF_U32(-0x30);                      // Machine code offset relative to .text
+        DWRF_U32(ctx->code_size);             // Address range covered by this FDE (code lenght)
+        DWRF_U8(0);                           // Augmentation data length (none)
+
+        /*
+         * Architecture-specific CFI instructions
+         *
+         * These instructions describe how registers are saved and restored
+         * during function calls. Each architecture has different calling
+         * conventions and register usage patterns.
+         */
 #ifdef __x86_64__
-                 DWRF_U8(DWRF_CFA_advance_loc | 4);
-                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
-                 DWRF_U8(DWRF_CFA_advance_loc | 6);
-                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8);
-    /* Extra registers saved for JIT-compiled code. */
+        /* x86_64 calling convention unwinding rules */
+        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance location by 4 bytes
+        DWRF_U8(DWRF_CFA_def_cfa_offset);     // Redefine CFA offset
+        DWRF_UV(16);                          // New offset: SP + 16
+        DWRF_U8(DWRF_CFA_advance_loc | 6);    // Advance location by 6 bytes
+        DWRF_U8(DWRF_CFA_def_cfa_offset);     // Redefine CFA offset
+        DWRF_UV(8);                           // New offset: SP + 8
 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-                 DWRF_U8(DWRF_CFA_advance_loc | 1);
-                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
-                 DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2);
-                 DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1);
-                 DWRF_U8(DWRF_CFA_advance_loc | 3);
-                 DWRF_U8(DWRF_CFA_offset | -(64 - 29));
-                 DWRF_U8(DWRF_CFA_offset | -(64 - 30));
-                 DWRF_U8(DWRF_CFA_def_cfa_offset);
-                 DWRF_UV(0);
+        /* AArch64 calling convention unwinding rules */
+        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance location by 1 instruction (stp x29, x30)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Redefine CFA offset
+        DWRF_UV(16);                              // CFA = SP + 16 (stack pointer after push)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Frame pointer (x29) saved
+        DWRF_UV(2);                               // At offset 2 from CFA (2 * 8 = 16 bytes)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Link register (x30) saved
+        DWRF_UV(1);                               // At offset 1 from CFA (1 * 8 = 8 bytes)
+        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Restore frame pointer (x29)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Restore link register (x30)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Final CFA adjustment
+        DWRF_UV(0);                               // CFA = SP + 0 (stack restored)
+
 #else
 #    error "Unsupported target architecture"
 #endif
-                 DWRF_ALIGNNOP(sizeof(uintptr_t));)
 
-    ctx->p = p;
+        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
+    )
+
+    ctx->p = p;  // Update context pointer to end of generated data
+}
+
+// =============================================================================
+//                              JITDUMP INITIALIZATION
+// =============================================================================
+
+/*
+ * Initialize the perf jitdump interface
+ *
+ * This function sets up everything needed to generate jitdump files:
+ * 1. Creates the jitdump file with a unique name
+ * 2. Maps the first page to signal perf that we're using the interface
+ * 3. Writes the jitdump header
+ * 4. Initializes synchronization primitives
+ *
+ * The memory mapping is crucial - perf detects jitdump files by scanning
+ * for processes that have mapped files matching the pattern /tmp/jit-*.dump
+ *
+ * Returns: Pointer to initialized state, or NULL on failure
+ */
+static void* perf_map_jit_init(void) {
+    char filename[100];
+    int pid = getpid();
+
+    /* Create unique filename based on process ID */
+    snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
+
+    /* Create/open the jitdump file with appropriate permissions */
+    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
+    if (fd == -1) {
+        return NULL;  // Failed to create file
+    }
+
+    /* Get system page size for memory mapping */
+    const long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size == -1) {
+        close(fd);
+        return NULL;  // Failed to get page size
+    }
+
+    /*
+     * Map the first page of the jitdump file
+     *
+     * This memory mapping serves as a signal to perf that this process
+     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
+     * files that match the jitdump naming pattern.
+     *
+     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
+     */
+    perf_jit_map_state.mapped_buffer = mmap(
+        NULL,                    // Let kernel choose address
+        page_size,               // Map one page
+        PROT_READ | PROT_EXEC,   // Read and execute permissions (required by perf)
+        MAP_PRIVATE,             // Private mapping
+        fd,                      // File descriptor
+        0                        // Offset 0 (first page)
+    );
+
+    if (perf_jit_map_state.mapped_buffer == NULL) {
+        close(fd);
+        return NULL;  // Memory mapping failed
+    }
+
+    perf_jit_map_state.mapped_size = page_size;
+
+    /* Convert file descriptor to FILE* for easier I/O operations */
+    perf_jit_map_state.perf_map = fdopen(fd, "w+");
+    if (perf_jit_map_state.perf_map == NULL) {
+        close(fd);
+        return NULL;  // Failed to create FILE*
+    }
+
+    /*
+     * Set up file buffering for better performance
+     *
+     * We use a large buffer (2MB) because jitdump files can be written
+     * frequently during program execution. Buffering reduces system call
+     * overhead and improves overall performance.
+     */
+    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
+
+    /* Write the jitdump file header */
+    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
+
+    /*
+     * Initialize thread synchronization lock
+     *
+     * Multiple threads may attempt to write to the jitdump file
+     * simultaneously. This lock ensures thread-safe access to the
+     * global jitdump state.
+     */
+    perf_jit_map_state.map_lock = PyThread_allocate_lock();
+    if (perf_jit_map_state.map_lock == NULL) {
+        fclose(perf_jit_map_state.perf_map);
+        return NULL;  // Failed to create lock
+    }
+
+    /* Initialize code ID counter */
+    perf_jit_map_state.code_id = 0;
+
+    /* Configure trampoline API with padding information */
+    trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
+
+    return &perf_jit_map_state;
 }
 
+// =============================================================================
+//                              MAIN JITDUMP ENTRY WRITING
+// =============================================================================
+
+/*
+ * Write a complete jitdump entry for a Python function
+ *
+ * This is the main function called by Python's trampoline system whenever
+ * a new piece of JIT-compiled code needs to be recorded. It writes both
+ * the unwinding information and the code load event to the jitdump file.
+ *
+ * The function performs these steps:
+ * 1. Initialize jitdump system if not already done
+ * 2. Extract function name and filename from Python code object
+ * 3. Generate DWARF unwinding information
+ * 4. Write unwinding info event to jitdump file
+ * 5. Write code load event to jitdump file
+ *
+ * Args:
+ *   state: Jitdump state (currently unused, uses global state)
+ *   code_addr: Address where the compiled code resides
+ *   code_size: Size of the compiled code in bytes
+ *   co: Python code object containing metadata
+ *
+ * IMPORTANT: This function signature is part of Python's internal API
+ * and must not be changed without coordinating with core Python development.
+ */
 static void perf_map_jit_write_entry(void *state, const void *code_addr,
-                         unsigned int code_size, PyCodeObject *co)
+                                    unsigned int code_size, PyCodeObject *co)
 {
-
+    /* Initialize jitdump system on first use */
     if (perf_jit_map_state.perf_map == NULL) {
         void* ret = perf_map_jit_init();
         if(ret == NULL){
-            return;
+            return;  // Initialization failed, silently abort
         }
     }
 
+    /*
+     * Extract function information from Python code object
+     *
+     * We create a human-readable function name by combining the qualified
+     * name (includes class/module context) with the filename. This helps
+     * developers identify functions in perf reports.
+     */
     const char *entry = "";
     if (co->co_qualname != NULL) {
         entry = PyUnicode_AsUTF8(co->co_qualname);
     }
+
     const char *filename = "";
     if (co->co_filename != NULL) {
         filename = PyUnicode_AsUTF8(co->co_filename);
     }
 
-
+    /*
+     * Create formatted function name for perf display
+     *
+     * Format: "py::<function_name>:<filename>"
+     * The "py::" prefix helps identify Python functions in mixed-language
+     * profiles (e.g., when profiling C extensions alongside Python code).
+     */
     size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
     char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
     if (perf_map_entry == NULL) {
-        return;
+        return;  // Memory allocation failed
     }
     snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
 
@@ -528,90 +1077,185 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
     uword base = (uword)code_addr;
     uword size = code_size;
 
-    // Write the code unwinding info event.
-
-    // Create unwinding information (eh frame)
+    /*
+     * Generate DWARF unwinding information
+     *
+     * DWARF data is essential for proper stack unwinding during profiling.
+     * Without it, perf cannot generate accurate call graphs, especially
+     * in optimized code where frame pointers may be omitted.
+     */
     ELFObjectContext ctx;
-    char buffer[1024];
+    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
     ctx.code_size = code_size;
     ctx.startp = ctx.p = (uint8_t*)buffer;
+
+    /* Generate EH frame (Exception Handling frame) data */
     elf_init_ehframe(&ctx);
     int eh_frame_size = ctx.p - ctx.startp;
 
-    // Populate the unwind info event for perf
+    /*
+     * Write Code Unwinding Information Event
+     *
+     * This event must be written before the code load event to ensure
+     * perf has the unwinding information available when it processes
+     * the code region.
+     */
     CodeUnwindingInfoEvent ev2;
     ev2.base.event = PerfUnwindingInfo;
     ev2.base.time_stamp = get_current_monotonic_ticks();
     ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
-    // Ensure we have enough space between DSOs when perf maps them
+
+    /* Verify we don't exceed our padding budget */
     assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);
+
     ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
-    ev2.mapped_size = round_up(ev2.unwind_data_size, 16);
+    ev2.mapped_size = round_up(ev2.unwind_data_size, 16);  // 16-byte alignment
+
+    /* Calculate total event size with padding */
     int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size;
-    int padding_size = round_up(content_size, 8) - content_size;
+    int padding_size = round_up(content_size, 8) - content_size;  // 8-byte align
     ev2.base.size = content_size + padding_size;
-    perf_map_jit_write_fully(&ev2, sizeof(ev2));
 
+    /* Write the unwinding info event header */
+    perf_map_jit_write_fully(&ev2, sizeof(ev2));
 
-    // Populate the eh Frame header
+    /*
+     * Write EH Frame Header
+     *
+     * The EH frame header provides metadata about the DWARF unwinding
+     * information that follows. It includes pointers and counts that
+     * help perf navigate the unwinding data efficiently.
+     */
     EhFrameHeader f;
     f.version = 1;
-    f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel;
-    f.fde_count_enc = DwarfUData4;
-    f.table_enc = DwarfSData4 | DwarfDataRel;
+    f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel;  // PC-relative signed 4-byte
+    f.fde_count_enc = DwarfUData4;                  // Unsigned 4-byte count
+    f.table_enc = DwarfSData4 | DwarfDataRel;       // Data-relative signed 4-byte
+
+    /* Calculate relative offsets for EH frame navigation */
     f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char));
-    f.eh_fde_count = 1;
+    f.eh_fde_count = 1;  // We generate exactly one FDE per function
     f.from = -(round_up(code_size, 8) + eh_frame_size);
+
     int cie_size = ctx.eh_frame_p - ctx.startp;
     f.to = -(eh_frame_size - cie_size);
 
+    /* Write EH frame data and header */
     perf_map_jit_write_fully(ctx.startp, eh_frame_size);
     perf_map_jit_write_fully(&f, sizeof(f));
 
+    /* Write padding to maintain alignment */
     char padding_bytes[] = "\0\0\0\0\0\0\0\0";
     perf_map_jit_write_fully(&padding_bytes, padding_size);
 
-    // Write the code load event.
+    /*
+     * Write Code Load Event
+     *
+     * This event tells perf about the new code region. It includes:
+     * - Memory addresses and sizes
+     * - Process and thread identification
+     * - Function name for symbol resolution
+     * - The actual machine code bytes
+     */
     CodeLoadEvent ev;
     ev.base.event = PerfLoad;
     ev.base.size = sizeof(ev) + (name_length+1) + size;
     ev.base.time_stamp = get_current_monotonic_ticks();
     ev.process_id = getpid();
-    ev.thread_id = syscall(SYS_gettid);
-    ev.vma = base;
-    ev.code_address = base;
+    ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
+    ev.vma = base;                       // Virtual memory address
+    ev.code_address = base;              // Same as VMA for our use case
     ev.code_size = size;
+
+    /* Assign unique code ID and increment counter */
     perf_jit_map_state.code_id += 1;
     ev.code_id = perf_jit_map_state.code_id;
 
+    /* Write code load event and associated data */
     perf_map_jit_write_fully(&ev, sizeof(ev));
-    perf_map_jit_write_fully(perf_map_entry, name_length+1);
-    perf_map_jit_write_fully((void*)(base), size);
-    return;
+    perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
+    perf_map_jit_write_fully((void*)(base), size);           // Copy actual machine code
+
+    /* Clean up allocated memory */
+    PyMem_RawFree(perf_map_entry);
 }
 
+// =============================================================================
+//                              CLEANUP AND FINALIZATION
+// =============================================================================
+
+/*
+ * Finalize and cleanup the perf jitdump system
+ *
+ * This function is called when Python is shutting down or when the
+ * perf trampoline system is being disabled. It ensures all resources
+ * are properly released and all buffered data is flushed to disk.
+ *
+ * Args:
+ *   state: Jitdump state (currently unused, uses global state)
+ *
+ * Returns: 0 on success
+ *
+ * IMPORTANT: This function signature is part of Python's internal API
+ * and must not be changed without coordinating with core Python development.
+ */
 static int perf_map_jit_fini(void* state) {
+    /*
+     * Close jitdump file with proper synchronization
+     *
+     * We need to acquire the lock to ensure no other threads are
+     * writing to the file when we close it. This prevents corruption
+     * and ensures all data is properly flushed.
+     */
     if (perf_jit_map_state.perf_map != NULL) {
-        // close the file
         PyThread_acquire_lock(perf_jit_map_state.map_lock, 1);
-        fclose(perf_jit_map_state.perf_map);
+        fclose(perf_jit_map_state.perf_map);  // This also flushes buffers
         PyThread_release_lock(perf_jit_map_state.map_lock);
 
-        // clean up the lock and state
+        /* Clean up synchronization primitive */
         PyThread_free_lock(perf_jit_map_state.map_lock);
         perf_jit_map_state.perf_map = NULL;
     }
+
+    /*
+     * Unmap the memory region
+     *
+     * This removes the signal to perf that we were generating JIT code.
+     * After this point, perf will no longer detect this process as
+     * having JIT capabilities.
+     */
     if (perf_jit_map_state.mapped_buffer != NULL) {
         munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
+        perf_jit_map_state.mapped_buffer = NULL;
     }
+
+    /* Clear global state reference */
     trampoline_api.state = NULL;
-    return 0;
+
+    return 0;  // Success
 }
 
+// =============================================================================
+//                              PUBLIC API EXPORT
+// =============================================================================
+
+/*
+ * Python Perf Callbacks Structure
+ *
+ * This structure defines the callback interface that Python's trampoline
+ * system uses to integrate with perf profiling. It contains function
+ * pointers for initialization, event writing, and cleanup.
+ *
+ * CRITICAL: This structure and its contents are part of Python's internal
+ * API. The function signatures and behavior must remain stable to maintain
+ * compatibility with the Python interpreter's perf integration system.
+ *
+ * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
+ */
 _PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
-    &perf_map_jit_init,
-    &perf_map_jit_write_entry,
-    &perf_map_jit_fini,
+    &perf_map_jit_init,        // Initialization function
+    &perf_map_jit_write_entry, // Event writing function
+    &perf_map_jit_fini,        // Cleanup function
 };
 
-#endif
+#endif /* PY_HAVE_PERF_TRAMPOLINE */
+\ No newline at end of file