Python/perf_jit_trampoline.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261

/*
 * Python Perf Trampoline Support - JIT Dump Implementation
 *
 * This file implements the perf jitdump API for Python's performance profiling
 * integration. It allows perf (Linux performance analysis tool) to understand
 * and profile dynamically generated Python bytecode by creating JIT dump files
 * that perf can inject into its analysis.
 *
 *
 * IMPORTANT: This file exports specific callback functions that are part of
 * Python's internal API. Do not modify the function signatures or behavior
 * of exported functions without coordinating with the Python core team.
 *
 * Usually the binary and libraries are mapped in separate region like below:
 *
 *   address ->
 *    --+---------------------+--//--+---------------------+--
 *      | .text | .data | ... |      | .text | .data | ... |
 *    --+---------------------+--//--+---------------------+--
 *          myprog                      libc.so
 *
 * So it'd be easy and straight-forward to find a mapped binary or library from an
 * address.
 *
 * For JIT code, however, the code arena only cares about the code section, while
 * the resulting DSOs (which are generated by perf inject -j) contain ELF headers and
 * unwind info too. perf would then generate the following address space with
 * synthesized MMAP events. Let's say it has a sample between address B and C.
 *
 *                                                sample
 *                                                  |
 *   address ->                         A       B   v   C
 *   ---------------------------------------------------------------------------------------------------
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
 *     ...
 *   ---------------------------------------------------------------------------------------------------
 *
 * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
 * the unwind info. If it maps both .text section and unwind sections, the sample
 * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
 * which one is right. So to make perf happy we have non-overlapping ranges for each
 * DSO:
 *
 *   address ->
 *   -------------------------------------------------------------------------------------------------------
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
 *     ...
 *   -------------------------------------------------------------------------------------------------------
 *
 * As the trampolines are constant, we add a constant padding, but in general the padding needs to have the
 * size of the unwind info rounded up to 16 bytes. For our trampolines this is 0x50.
 */


#include "Python.h"
#include "pycore_ceval.h"         // _PyPerf_Callbacks
#include "pycore_frame.h"
#include "pycore_interp.h"
#include "pycore_runtime.h"       // _PyRuntime

#ifdef PY_HAVE_PERF_TRAMPOLINE

/* Standard library includes for perf jitdump implementation */
#include <elf.h>                  // ELF architecture constants
#include <fcntl.h>                // File control operations
#include <stdio.h>                // Standard I/O operations
#include <stdlib.h>               // Standard library functions
#include <sys/mman.h>             // Memory mapping functions (mmap)
#include <sys/types.h>            // System data types
#include <unistd.h>               // System calls (sysconf, getpid)
#include <sys/time.h>             // Time functions (gettimeofday)
#include <sys/syscall.h>          // System call interface

// =============================================================================
//                           CONSTANTS AND CONFIGURATION
// =============================================================================

/*
 * Memory layout considerations for perf jitdump:
 *
 * Perf expects non-overlapping memory regions for each JIT-compiled function.
 * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
 * Shared Object) files that contain:
 * - ELF headers
 * - .text section (actual machine code)
 * - Unwind information (for stack traces)
 *
 * To ensure proper address space layout, we add padding between code regions.
 * This prevents address conflicts when perf maps the synthesized DSOs.
 *
 * Memory layout example:
 * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
 * /tmp/jitted-PID-1.so:                                       [headers][.text][unwind_info][padding]
 *
 * The padding size (0x100) is chosen to accommodate typical unwind info sizes
 * while maintaining 16-byte alignment requirements.
 */
#define PERF_JIT_CODE_PADDING 0x100

/* Convenient access to the global trampoline API state */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Type aliases for clarity and portability */
typedef uint64_t uword;                    // Word-sized unsigned integer
typedef const char* CodeComments;          // Code comment strings

/* Memory size constants */
#define MB (1024 * 1024)                   // 1 Megabyte for buffer sizing

// =============================================================================
//                        ARCHITECTURE-SPECIFIC DEFINITIONS
// =============================================================================

/*
 * Map the compile-time target to its ELF machine constant (EM_* from <elf.h>).
 *
 * The result is stored in the jitdump header so that perf knows which
 * architecture the dumped machine code belongs to. The selection happens
 * entirely in the preprocessor; nothing is probed at run time.
 *
 * Returns: The EM_* value for the target this file was compiled for
 */
static uint64_t GetElfMachineArchitecture(void) {
    uint64_t elf_machine;

#if defined(__x86_64__) || defined(_M_X64)
    elf_machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    elf_machine = EM_386;
#elif defined(__aarch64__)
    elf_machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    elf_machine = EM_ARM;
#elif defined(__riscv)
    elf_machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // No EM_* constant is mapped for this target
    elf_machine = 0;
#endif

    return elf_machine;
}

// =============================================================================
//                           PERF JITDUMP DATA STRUCTURES
// =============================================================================

/*
 * Perf jitdump file format structures
 *
 * These structures define the binary format that perf expects for JIT dump files.
 * The format is documented in the Linux perf tools source code and must match
 * exactly for proper perf integration.
 */

/*
 * Jitdump file header - written once at the beginning of each jitdump file
 * Contains metadata about the process and jitdump format version
 *
 * perf_map_jit_write_header() fills in an instance and hands it to
 * perf_map_jit_write_fully() as raw bytes (sizeof(Header)), so the in-memory
 * layout of this struct is the on-disk layout: six uint32_t fields followed
 * by two uint64_t fields. Do not reorder or resize the members.
 */
typedef struct {
    uint32_t magic;              // Magic number (0x4A695444 = "JiTD")
    uint32_t version;            // Jitdump format version (currently 1)
    uint32_t size;               // Size of this header structure
    uint32_t elf_mach_target;    // Target architecture (from GetElfMachineArchitecture)
    uint32_t reserved;           // Reserved field (must be 0)
    uint32_t process_id;         // Process ID of the JIT compiler
    uint64_t time_stamp;         // Timestamp when jitdump was created
    uint64_t flags;              // Feature flags (currently unused)
} Header;

/*
 * Perf event types supported by the jitdump format
 * Each event type has a corresponding structure format
 *
 * The numeric value is what goes into BaseEvent.event, so the explicit
 * values below are part of the file format and must not be renumbered.
 */
enum PerfEvent {
    PerfLoad = 0,           // Code load event (new JIT function)
    PerfMove = 1,           // Code move event (function relocated)
    PerfDebugInfo = 2,      // Debug information event
    PerfClose = 3,          // JIT session close event
    PerfUnwindingInfo = 4   // Stack unwinding information event
};

/*
 * Base event structure - common header for all perf events
 * Every event in the jitdump file starts with this structure
 *
 * It is embedded as the first member (`base`) of CodeLoadEvent and
 * CodeUnwindingInfoEvent.
 */
struct BaseEvent {
    uint32_t event;         // Event type (from PerfEvent enum)
    uint32_t size;          // Total size of this event including payload
    uint64_t time_stamp;    // Timestamp when event occurred
};

/*
 * Code load event - indicates a new JIT-compiled function is available
 * This is the most important event type for Python profiling
 *
 * Only the fixed-size part of the record is described by this struct; the
 * variable-length payload listed at the end follows it in the file.
 */
typedef struct {
    struct BaseEvent base;   // Common event header
    uint32_t process_id;     // Process ID where code was generated
    uint32_t thread_id;      // Thread ID where code was generated
    uint64_t vma;            // Virtual memory address where code is loaded
    uint64_t code_address;   // Address of the actual machine code
    uint64_t code_size;      // Size of the machine code in bytes
    uint64_t code_id;        // Unique identifier for this code region
                             // (64-bit here; PerfMapJitState.code_id is an int)
    /* Followed by:
     * - null-terminated function name string
     * - raw machine code bytes
     */
} CodeLoadEvent;

/*
 * Code unwinding information event - provides DWARF data for stack traces
 * Essential for proper stack unwinding during profiling
 *
 * As with CodeLoadEvent, this struct covers only the fixed-size part of the
 * record; the unwind payload listed at the end follows it in the file.
 */
typedef struct {
    struct BaseEvent base;      // Common event header
    uint64_t unwind_data_size;  // Size of the unwinding data
    uint64_t eh_frame_hdr_size; // Size of the EH frame header
    uint64_t mapped_size;       // Total mapped size (with padding)
    /* Followed by:
     * - EH frame header
     * - DWARF unwinding information
     * - Padding to alignment boundary
     */
} CodeUnwindingInfoEvent;

// =============================================================================
//                              GLOBAL STATE MANAGEMENT
// =============================================================================

/*
 * Global state for the perf jitdump implementation
 *
 * This structure maintains all the state needed for generating jitdump files.
 * It's designed as a singleton since there's typically only one jitdump file
 * per Python process.
 */
typedef struct {
    FILE* perf_map;          // File handle for the jitdump file; this is the
                             // stream perf_map_jit_write_fully() writes to
    PyThread_type_lock map_lock;  // Thread synchronization lock
    void* mapped_buffer;     // Memory-mapped region (signals perf we're active)
    size_t mapped_size;      // Size of the mapped region
    int code_id;             // Counter for unique code region identifiers
                             // (CodeLoadEvent.code_id is a uint64_t)
} PerfMapJitState;

/* Global singleton instance (file-scope static, so it starts zero-initialized) */
static PerfMapJitState perf_jit_map_state;

// =============================================================================
//                              TIME UTILITIES
// =============================================================================

/* Time conversion constant */
static const intptr_t nanoseconds_per_second = 1000000000;

/*
 * Read CLOCK_MONOTONIC and express it as a single nanosecond count
 *
 * A monotonic clock is used for event timestamps because it is not affected
 * by adjustments to the system clock, so the ordering and spacing of events
 * stay consistent.
 *
 * Returns: tv_sec * 1e9 + tv_nsec as reported by clock_gettime()
 */
static int64_t get_current_monotonic_ticks(void) {
    struct timespec now;
    const int rc = clock_gettime(CLOCK_MONOTONIC, &now);

    if (rc != 0) {
        Py_UNREACHABLE();  // Should never fail on supported systems
        return 0;
    }

    /* Widen the seconds first so the multiplication is done in 64 bits */
    const int64_t seconds_as_ns = (int64_t)now.tv_sec * nanoseconds_per_second;
    return seconds_as_ns + now.tv_nsec;
}

/*
 * Read the wall clock via gettimeofday() as a single microsecond count
 *
 * This value is used for the jitdump file header timestamp. Being wall clock
 * time, it can be correlated with other system events, unlike the monotonic
 * clock used for individual events.
 *
 * Returns: tv_sec * 1e6 + tv_usec as reported by gettimeofday()
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;

    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE();  // Should never fail on supported systems
        return 0;
    }

    int64_t microseconds = (int64_t)(now.tv_sec);
    microseconds *= 1000000;
    microseconds += now.tv_usec;
    return microseconds;
}

// =============================================================================
//                              UTILITY FUNCTIONS
// =============================================================================

/*
 * Round a value up to the next multiple of a given number
 *
 * Used to keep sizes on the alignment boundaries that the jitdump records
 * require.
 *
 * Args:
 *   value: The value to round up
 *   multiple: The multiple to round up to; 0 means "no rounding"
 *
 * Returns: `value` unchanged when `multiple` is 0 or `value` is already a
 *          multiple of it, otherwise `value` plus the distance to the next
 *          multiple
 */
static size_t round_up(int64_t value, int64_t multiple) {
    if (multiple != 0) {
        /* A zero multiple is skipped above, so the modulo cannot divide by zero */
        const int64_t excess = value % multiple;
        if (excess != 0) {
            value += multiple - excess;
        }
    }
    return value;
}

// =============================================================================
//                              FILE I/O UTILITIES
// =============================================================================

/*
 * Write data to the jitdump file, continuing after short writes
 *
 * fwrite() is called repeatedly until all `size` bytes have been handed to
 * the stream stored in perf_jit_map_state.perf_map, or until fwrite() makes
 * no progress at all.
 *
 * A write that makes no progress (disk full, I/O error, ...) is a condition
 * that can really happen at run time, so it is handled as best-effort: the
 * remaining bytes are dropped and the function returns. No error is reported
 * to the caller because the function returns void.
 *
 * Args:
 *   buffer: Pointer to data to write
 *   size: Number of bytes to write
 */
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
    FILE* out_file = perf_jit_map_state.perf_map;
    const char* ptr = (const char*)(buffer);

    while (size > 0) {
        const size_t written = fwrite(ptr, 1, size, out_file);
        if (written == 0) {
            /*
             * Do not mark this path with Py_UNREACHABLE(): the macro is meant
             * for code that cannot be reached by design, and a failing write
             * can be reached. Where the macro expands to a compiler hint, the
             * check itself may be optimized away and the loop would then
             * never terminate, since `size` stops shrinking.
             */
            break;
        }
        size -= written;
        ptr += written;
    }
}

/*
 * Write the jitdump file header
 *
 * The header must be written exactly once at the beginning of each jitdump
 * file. It provides metadata that perf uses to parse the rest of the file.
 *
 * Args:
 *   pid: Process ID to include in the header
 *   out_file: File handle to write to (currently unused, uses global state)
 */
static void perf_map_jit_write_header(int pid, FILE* out_file) {
    Header header;

    /* Initialize header with required values */
    header.magic = 0x4A695444;                    // "JiTD" magic number
    header.version = 1;                           // Current jitdump version
    header.size = sizeof(Header);                 // Header size for validation
    header.elf_mach_target = GetElfMachineArchitecture();  // Target architecture
    header.process_id = pid;                      // Process identifier
    header.time_stamp = get_current_time_microseconds();   // Creation time
    header.flags = 0;                             // No special flags currently used

    perf_map_jit_write_fully(&header, sizeof(header));
}

// =============================================================================
//                              DWARF CONSTANTS AND UTILITIES
// =============================================================================

/*
 * DWARF (Debug With Arbitrary Record Formats) constants
 *
 * DWARF is a debugging data format used to provide stack unwinding information.
 * These constants define the various encoding types and opcodes used in
 * DWARF Call Frame Information (CFI) records.
 */

/* DWARF Call Frame Information version */
#define DWRF_CIE_VERSION 1

/*
 * DWARF call frame instruction opcodes (the DW_CFA_* family)
 *
 * NOTE(review): in the DWARF CFI encoding, advance_loc (0x40) and offset
 * (0x80) are opcodes that occupy the top two bits of the byte, with their
 * operand packed into the low six bits - confirm against the code that
 * emits them before changing how they are used.
 */
enum {
    DWRF_CFA_nop = 0x0,                    // No operation
    DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
    DWRF_CFA_def_cfa = 0xc,               // Define CFA rule
    DWRF_CFA_def_cfa_offset = 0xe,        // Define CFA offset
    DWRF_CFA_offset_extended_sf = 0x11,   // Extended signed offset
    DWRF_CFA_advance_loc = 0x40,          // Advance location counter
    DWRF_CFA_offset = 0x80                // Simple offset instruction
};

/*
 * DWARF Exception Handling pointer encodings
 *
 * The constants fall into two groups that occupy different bits: a data
 * format in the low nibble (0x01..0x0c) and a reference base in the high
 * nibble (0x10..0x50), plus the 0x80 indirect flag.
 * NOTE(review): presumably one value from each group is OR-ed together to
 * form an encoding byte - confirm against the code that builds the
 * EhFrameHeader.
 */
enum {
    DWRF_EH_PE_absptr = 0x00,             // Absolute pointer
    DWRF_EH_PE_omit = 0xff,               // Omitted value

    /* Data type encodings */
    DWRF_EH_PE_uleb128 = 0x01,            // Unsigned LEB128
    DWRF_EH_PE_udata2 = 0x02,             // Unsigned 2-byte
    DWRF_EH_PE_udata4 = 0x03,             // Unsigned 4-byte
    DWRF_EH_PE_udata8 = 0x04,             // Unsigned 8-byte
    DWRF_EH_PE_sleb128 = 0x09,            // Signed LEB128
    DWRF_EH_PE_sdata2 = 0x0a,             // Signed 2-byte
    DWRF_EH_PE_sdata4 = 0x0b,             // Signed 4-byte
    DWRF_EH_PE_sdata8 = 0x0c,             // Signed 8-byte
    DWRF_EH_PE_signed = 0x08,             // Signed flag

    /* Reference type encodings */
    DWRF_EH_PE_pcrel = 0x10,              // PC-relative
    DWRF_EH_PE_textrel = 0x20,            // Text-relative
    DWRF_EH_PE_datarel = 0x30,            // Data-relative
    DWRF_EH_PE_funcrel = 0x40,            // Function-relative
    DWRF_EH_PE_aligned = 0x50,            // Aligned
    DWRF_EH_PE_indirect = 0x80            // Indirect
};

/*
 * Additional DWARF constants for debug information: one DIE tag, the
 * has-children flag, attribute names (DWRF_AT_*) and attribute forms
 * (DWRF_FORM_*).
 * NOTE(review): no use of these constants is visible in this part of the
 * file - check for users before removing or extending them.
 */
enum { DWRF_TAG_compile_unit = 0x11 };
enum { DWRF_children_no = 0, DWRF_children_yes = 1 };
enum {
    DWRF_AT_name = 0x03,         // Name attribute
    DWRF_AT_stmt_list = 0x10,    // Statement list
    DWRF_AT_low_pc = 0x11,       // Low PC address
    DWRF_AT_high_pc = 0x12       // High PC address
};
enum {
    DWRF_FORM_addr = 0x01,       // Address form
    DWRF_FORM_data4 = 0x06,      // 4-byte data
    DWRF_FORM_string = 0x08      // String form
};

/* Line number program standard opcodes (DWRF_LNS_*) */
enum {
    DWRF_LNS_extended_op = 0,    // Extended opcode (introduces a DWRF_LNE_* opcode)
    DWRF_LNS_copy = 1,           // Copy operation
    DWRF_LNS_advance_pc = 2,     // Advance program counter
    DWRF_LNS_advance_line = 3    // Advance line number
};

/* Line number extended opcodes (DWRF_LNE_*) */
enum {
    DWRF_LNE_end_sequence = 1,   // End of sequence
    DWRF_LNE_set_address = 2     // Set address
};

/*
 * Architecture-specific DWARF register numbers
 *
 * These constants define the register numbering scheme used by DWARF
 * for each supported architecture. The numbers must match the ABI
 * specification for proper stack unwinding.
 *
 * Only two targets are accepted here: x86_64, and little-endian AArch64
 * that is not ILP32. Every other target stops the build with #error, even
 * though GetElfMachineArchitecture() also has branches for i386, ARM and
 * RISC-V.
 *
 * In the x86_64 branch the values are implicit (0, 1, 2, ...), so the ORDER
 * of the enumerators is what defines the numbers - do not reorder or insert
 * entries. The AArch64 branch assigns its three values explicitly.
 */
enum {
#ifdef __x86_64__
    /* x86_64 register numbering (note: order is defined by x86_64 ABI) */
    DWRF_REG_AX,    // RAX
    DWRF_REG_DX,    // RDX
    DWRF_REG_CX,    // RCX
    DWRF_REG_BX,    // RBX
    DWRF_REG_SI,    // RSI
    DWRF_REG_DI,    // RDI
    DWRF_REG_BP,    // RBP
    DWRF_REG_SP,    // RSP
    DWRF_REG_8,     // R8
    DWRF_REG_9,     // R9
    DWRF_REG_10,    // R10
    DWRF_REG_11,    // R11
    DWRF_REG_12,    // R12
    DWRF_REG_13,    // R13
    DWRF_REG_14,    // R14
    DWRF_REG_15,    // R15
    DWRF_REG_RA,    // Return address (RIP)
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
    /* AArch64 register numbering */
    DWRF_REG_FP = 29,  // Frame Pointer
    DWRF_REG_RA = 30,  // Link register (return address)
    DWRF_REG_SP = 31,  // Stack pointer
#else
#    error "Unsupported target architecture"
#endif
};

/*
 * DWARF encoding constants used in EH frame headers
 *
 * These carry the same values as DWRF_EH_PE_udata4, DWRF_EH_PE_sdata4,
 * DWRF_EH_PE_pcrel and DWRF_EH_PE_datarel from the pointer-encoding enum,
 * typed as uint8_t.
 */
static const uint8_t DwarfUData4 = 0x03;     // Unsigned 4-byte data
static const uint8_t DwarfSData4 = 0x0b;     // Signed 4-byte data
static const uint8_t DwarfPcRel = 0x10;      // PC-relative encoding
static const uint8_t DwarfDataRel = 0x30;    // Data-relative encoding

// =============================================================================
//                              ELF OBJECT CONTEXT
// =============================================================================

/*
 * Context for building ELF/DWARF structures
 *
 * This structure maintains state while constructing DWARF unwind information.
 * It acts as a simple buffer manager with pointers to track current position
 * and important landmarks within the buffer.
 *
 * The context records no buffer capacity: the elfctx_append_* helpers write
 * at `p` and advance it without any bounds check, so whoever sets up the
 * buffer must make it large enough for everything that will be emitted.
 */
typedef struct ELFObjectContext {
    uint8_t* p;            // Current write position in buffer
    uint8_t* startp;       // Start of buffer (for offset calculations)
    uint8_t* eh_frame_p;   // Start of EH frame data (for relative offsets)
    uint32_t code_size;    // Size of the code being described
} ELFObjectContext;

/*
 * EH Frame Header structure for DWARF unwinding
 *
 * This structure provides metadata about the DWARF unwinding information
 * that follows. It's required by the perf jitdump format to enable proper
 * stack unwinding during profiling.
 *
 * Layout: four one-byte fields followed by four int32_t fields.
 * NOTE(review): the three *_enc bytes presumably hold DWRF_EH_PE_* /
 * Dwarf* encoding values - confirm where the header is filled in.
 */
typedef struct {
    unsigned char version;           // EH frame version (always 1)
    unsigned char eh_frame_ptr_enc;  // Encoding of EH frame pointer
    unsigned char fde_count_enc;     // Encoding of FDE count
    unsigned char table_enc;         // Encoding of table entries
    int32_t eh_frame_ptr;           // Pointer to EH frame data
    int32_t eh_fde_count;           // Number of FDEs (Frame Description Entries)
    int32_t from;                   // Start address of code range
    int32_t to;                     // End address of code range
} EhFrameHeader;

// =============================================================================
//                              DWARF GENERATION UTILITIES
// =============================================================================

/*
 * Copy a C string, terminator included, to the context's write position
 *
 * The write position (ctx->p) is advanced past the copied terminator. No
 * bounds check is performed; the buffer must have room for the string.
 *
 * Args:
 *   ctx: ELF object context
 *   str: String to append (must be null-terminated)
 *
 * Returns: Offset of the string's first byte relative to ctx->startp
 */
static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
    uint8_t* const first = ctx->p;
    const uint32_t offset = (uint32_t)(first - ctx->startp);
    uint8_t* out = first;

    for (;;) {
        const char ch = *str++;
        *out++ = (uint8_t)ch;
        if (ch == '\0') {
            break;  // The terminator has been copied as well
        }
    }

    ctx->p = out;
    return offset;
}

/*
 * Append a SLEB128 (Signed Little Endian Base 128) value
 *
 * SLEB128 is a variable-length encoding used extensively in DWARF.
 * It efficiently encodes small numbers in fewer bytes: seven payload bits
 * per byte, least significant group first, with the top bit of each byte
 * set when more bytes follow.
 *
 * The write position (ctx->p) is advanced past the emitted bytes. No bounds
 * check is performed; the buffer must have room for the encoding.
 *
 * Args:
 *   ctx: ELF object context
 *   v: Signed value to encode
 */
static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
    uint8_t* p = ctx->p;

    /*
     * Keep emitting while v is outside [-0x40, 0x3f], i.e. while it does not
     * fit in the seven bits of a final byte. The range test is done in
     * unsigned arithmetic: adding 0x40 to v as a signed int would overflow
     * (undefined behavior) for values close to INT32_MAX.
     *
     * NOTE: `v >>= 7` relies on right shift of a negative value being an
     * arithmetic shift, which is implementation-defined in C.
     */
    for (; ((uint32_t)v + 0x40u) >= 0x80u; v >>= 7) {
        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
    }
    *p++ = (uint8_t)(v & 0x7f);  // Final byte without continuation bit

    ctx->p = p;
}

/*
 * Append a ULEB128 (Unsigned Little Endian Base 128) value
 *
 * Similar to SLEB128 but for unsigned values: seven payload bits per byte,
 * least significant group first, with the top bit of each byte set when
 * more bytes follow.
 *
 * The write position (ctx->p) is advanced past the emitted bytes. No bounds
 * check is performed; the buffer must have room for the encoding.
 *
 * Args:
 *   ctx: ELF object context
 *   v: Unsigned value to encode
 */
static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
    uint8_t* p = ctx->p;

    /*
     * The bytes are cast to uint8_t, the element type of the buffer and the
     * cast used by elfctx_append_sleb128(). Casting values up to 0xff to a
     * plain `char` is implementation-defined when char is signed.
     */
    for (; v >= 0x80; v >>= 7) {
        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
    }
    *p++ = (uint8_t)v;  // Final byte without continuation bit (v < 0x80 here)

    ctx->p = p;
}

/*
 * Macros for generating DWARF structures
 *
 * These macros provide a convenient way to write various data types
 * to the DWARF buffer while automatically advancing the pointer.
 *
 * They are not self-contained: every macro expects a local byte pointer
 * named `p` (the write cursor) to be in scope, and DWRF_UV / DWRF_SV /
 * DWRF_STR additionally expect a local `ctx` (ELFObjectContext*). Those
 * three publish `p` to ctx->p, call the elfctx_append_* helper, and reload
 * `p` from ctx->p afterwards.
 *
 * The fixed-width writers (I8, U16, U32, ADDR) store directly through a cast
 * of `p`; they make no alignment adjustment of their own.
 */
#define DWRF_U8(x) (*p++ = (x))                                    // Write unsigned 8-bit
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)                       // Write signed 8-bit
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)                 // Write unsigned 16-bit
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)                 // Write unsigned 32-bit
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string

/*
 * Pad with DWRF_CFA_nop bytes until the ADDRESS held in `p` is a multiple
 * of `s`. The test masks with (s)-1, so it is only a correct alignment
 * check when `s` is a power of two.
 */
#define DWRF_ALIGNNOP(s)                                          \
    while ((uintptr_t)p & ((s)-1)) {                              \
        *p++ = DWRF_CFA_nop;                                       \
    }

/*
 * Write a DWARF section with automatic size calculation
 *
 * Reserves a 4-byte length field at `p`, runs `stmt` (which emits the
 * section contents through `p`), then back-patches the length field with
 * the number of bytes `stmt` emitted - the length field itself is not
 * counted. `name` only serves to make the temporary pointer's identifier
 * unique (szp_<name>).
 */
#define DWRF_SECTION(name, stmt)                                  \
    {                                                             \
        uint32_t* szp_##name = (uint32_t*)p;                      \
        p += 4;                                                   \
        stmt;                                                     \
        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
    }

// =============================================================================
//                              DWARF EH FRAME GENERATION
// =============================================================================

/*
 * Initialize DWARF .eh_frame section for a code region
 *
 * The .eh_frame section contains Call Frame Information (CFI) that describes
 * how to unwind the stack at any point in the code. This is essential for
 * proper profiling as it allows perf to generate accurate call graphs.
 *
 * The function generates two main components:
 * 1. CIE (Common Information Entry) - describes calling conventions
 * 2. FDE (Frame Description Entry) - describes specific function unwinding
 *
 * Args:
 *   ctx: ELF object context containing code size and buffer pointers
 */
static void elf_init_ehframe(ELFObjectContext* ctx) {
    uint8_t* p = ctx->p;  // Local write cursor used by all DWRF_* macros
    uint8_t* framep = p;  // Remember start of frame data (start of the CIE record)

    /*
    * DWARF Unwind Table for Trampoline Function
    *
    * This section defines DWARF Call Frame Information (CFI) using encoded macros
    * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function
    * preserves and restores registers. This is used by profiling tools (e.g., `perf`)
    * and debuggers for stack unwinding in JIT-compiled code.
    *
    * -------------------------------------------------
    * TO REGENERATE THIS TABLE FROM GCC OBJECTS:
    * -------------------------------------------------
    *
    * 1. Create a trampoline source file (e.g., `trampoline.c`):
    *
    *      #include <Python.h>
    *      typedef PyObject* (*py_evaluator)(void*, void*, int);
    *      PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) {
    *          return evaluator(ts, f, throwflag);
    *      }
    *
    * 2. Compile to an object file with frame pointer preservation:
    *
    *      gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
    *
    * 3. Extract DWARF unwind info from the object file:
    *
    *      readelf -w trampoline.o
    *
    *    Example output from `.eh_frame`:
    *
    *      00000000 CIE
    *        Version:               1
    *        Augmentation:          "zR"
    *        Code alignment factor: 4
    *        Data alignment factor: -8
    *        Return address column: 30
    *        DW_CFA_def_cfa: r31 (sp) ofs 0
    *
    *      00000014 FDE cie=00000000 pc=0..14
    *        DW_CFA_advance_loc: 4
    *        DW_CFA_def_cfa_offset: 16
    *        DW_CFA_offset: r29 at cfa-16
    *        DW_CFA_offset: r30 at cfa-8
    *        DW_CFA_advance_loc: 12
    *        DW_CFA_restore: r30
    *        DW_CFA_restore: r29
    *        DW_CFA_def_cfa_offset: 0
    *
    * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`.
    *
    * ----------------------------------
    * HOW TO TRANSLATE TO DWRF_* MACROS:
    * ----------------------------------
    *
    * After compiling your trampoline with:
    *
    *     gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
    *
    * run:
    *
    *     readelf -w trampoline.o
    *
    * to inspect the generated `.eh_frame` data. You will see two main components:
    *
    *     1. A CIE (Common Information Entry): shared configuration used by all FDEs.
    *     2. An FDE (Frame Description Entry): function-specific unwind instructions.
    *
    * ---------------------
    * Translating the CIE:
    * ---------------------
    * From `readelf -w`, you might see:
    *
    *   00000000 0000000000000010 00000000 CIE
    *     Version:               1
    *     Augmentation:          "zR"
    *     Code alignment factor: 4
    *     Data alignment factor: -8
    *     Return address column: 30
    *     Augmentation data:     1b
    *     DW_CFA_def_cfa: r31 (sp) ofs 0
    *
    * Map this to:
    *
    *     DWRF_SECTION(CIE,
    *         DWRF_U32(0);                             // CIE ID (always 0 for CIEs)
    *         DWRF_U8(DWRF_CIE_VERSION);              // Version: 1
    *         DWRF_STR("zR");                         // Augmentation string "zR"
    *         DWRF_UV(4);                             // Code alignment factor = 4
    *         DWRF_SV(-8);                            // Data alignment factor = -8
    *         DWRF_U8(DWRF_REG_RA);                   // Return address register (e.g., x30 = 30)
    *         DWRF_UV(1);                             // Augmentation data length = 1
    *         DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers
    *
    *         DWRF_U8(DWRF_CFA_def_cfa);              // DW_CFA_def_cfa
    *         DWRF_UV(DWRF_REG_SP);                   // Register: SP (r31)
    *         DWRF_UV(0);                             // Offset = 0
    *
    *         DWRF_ALIGNNOP(sizeof(uintptr_t));       // Align to pointer size boundary
    *     )
    *
    * Notes:
    *   - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128.
    *   - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants.
    *   - NOTE(review): the mapping above is an illustration of the readelf output.
    *     The CIE actually emitted further down writes a code alignment factor of 1
    *     and an initial CFA of SP + sizeof(uintptr_t) on every architecture.
    *
    * ---------------------
    * Translating the FDE:
    * ---------------------
    * From `readelf -w`:
    *
    *   00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
    *     DW_CFA_advance_loc: 4
    *     DW_CFA_def_cfa_offset: 16
    *     DW_CFA_offset: r29 at cfa-16
    *     DW_CFA_offset: r30 at cfa-8
    *     DW_CFA_advance_loc: 12
    *     DW_CFA_restore: r30
    *     DW_CFA_restore: r29
    *     DW_CFA_def_cfa_offset: 0
    *
    * Map the FDE header and instructions to:
    *
    *     DWRF_SECTION(FDE,
    *         DWRF_U32((uint32_t)(p - framep));       // Offset to CIE (relative from here)
    *         DWRF_U32(-0x30);                        // Initial PC-relative location of the code
    *         DWRF_U32(ctx->code_size);               // Code range covered by this FDE
    *         DWRF_U8(0);                             // Augmentation data length (none)
    *
    *         DWRF_U8(DWRF_CFA_advance_loc | 1);      // Advance location by 1 unit (1 * 4 = 4 bytes)
    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP + 16
    *         DWRF_UV(16);
    *
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
    *         DWRF_UV(2);                             // At offset 2 * 8 = 16 bytes
    *
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
    *         DWRF_UV(1);                             // At offset 1 * 8 = 8 bytes
    *
    *         DWRF_U8(DWRF_CFA_advance_loc | 3);      // Advance location by 3 units (3 * 4 = 12 bytes)
    *
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 (see NOTE(review) below)
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 (see NOTE(review) below)
    *
    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP
    *         DWRF_UV(0);
    *     )
    *
    * To regenerate:
    *   1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
    *   2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
    *      the code is in a different address space every time.
    *   3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
    *        - `DW_CFA_def_cfa_offset`     → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
    *        - `DW_CFA_offset: rX`         → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
    *        - `DW_CFA_restore: rX`        → DWRF_U8(DWRF_CFA_offset | reg) // this is what the code below emits
    *        - `DW_CFA_advance_loc: N`     → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
    *   4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
    *   5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
    *
    * NOTE(review): in the DWARF specification DW_CFA_restore and DW_CFA_offset are
    * different opcodes, and DW_CFA_offset is always followed by a ULEB128 operand.
    * Emitting `DWRF_CFA_offset | reg` with no operand for a "restore" therefore
    * looks like it makes a consumer read the next byte as the offset - confirm
    * against the DWRF_CFA_* enum and the DWARF CFI opcode table before relying on it.
    */

    /*
     * Emit DWARF EH CIE (Common Information Entry)
     *
     * The CIE describes the calling conventions and basic unwinding rules
     * that apply to all functions in this compilation unit.
     */
    DWRF_SECTION(CIE,
        DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
        DWRF_U8(DWRF_CIE_VERSION);            // CIE version (1)
        DWRF_STR("zR");                       // Augmentation string: 'z' = augmentation data follows, 'R' = it holds the FDE pointer encoding
        DWRF_UV(1);                           // Code alignment factor (1 on every architecture here)
        DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
        DWRF_U8(DWRF_REG_RA);                 // Return address register number
        DWRF_UV(1);                           // Augmentation data length
        DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding

        /* Initial CFI instructions - describe default calling convention */
        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Canonical Frame Address)
        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
        DWRF_UV(sizeof(uintptr_t));           // CFA = SP + pointer_size
        DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
        DWRF_UV(1);                           // At offset 1 from CFA (scaled by the data alignment factor)

        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
    )

    ctx->eh_frame_p = p;  // Remember start of FDE data

    /*
     * Emit DWARF EH FDE (Frame Description Entry)
     *
     * The FDE describes unwinding information specific to this function.
     * It references the CIE and provides function-specific CFI instructions.
     */
    DWRF_SECTION(FDE,
        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
        DWRF_U32(-0x30);                      // Machine code offset relative to .text (NOTE(review): fixed constant; confirm it matches the jitdump layout)
        DWRF_U32(ctx->code_size);             // Address range covered by this FDE (code length)
        DWRF_U8(0);                           // Augmentation data length (none)

        /*
         * Architecture-specific CFI instructions
         *
         * These instructions describe how registers are saved and restored
         * during function calls. Each architecture has different calling
         * conventions and register usage patterns.
         */
#ifdef __x86_64__
        /* x86_64 calling convention unwinding rules */
        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance location by 4 bytes
        DWRF_U8(DWRF_CFA_def_cfa_offset);     // Redefine CFA offset
        DWRF_UV(16);                          // New offset: SP + 16
        DWRF_U8(DWRF_CFA_advance_loc | 6);    // Advance location by 6 bytes
        DWRF_U8(DWRF_CFA_def_cfa_offset);     // Redefine CFA offset
        DWRF_UV(8);                           // New offset: SP + 8
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
        /*
         * AArch64 calling convention unwinding rules
         *
         * NOTE(review): the advance_loc deltas below are commented in units
         * of instructions, but the CIE above declares a code alignment
         * factor of 1, so a consumer would scale them by 1 byte rather than
         * 4 - confirm which one is intended.
         */
        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance location by 1 instruction (stp x29, x30)
        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Redefine CFA offset
        DWRF_UV(16);                              // CFA = SP + 16 (stack pointer after push)
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Frame pointer (x29) saved
        DWRF_UV(2);                               // At offset 2 from CFA (2 * 8 = 16 bytes)
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Link register (x30) saved
        DWRF_UV(1);                               // At offset 1 from CFA (1 * 8 = 8 bytes)
        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
        /* NOTE(review): the next two bytes reuse the DWRF_CFA_offset opcode
         * without an operand to express a "restore"; DWARF defines a separate
         * DW_CFA_restore opcode for that - confirm this is decoded as intended. */
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Restore frame pointer (x29)
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Restore link register (x30)
        DWRF_U8(DWRF_CFA_def_cfa_offset);         // Final CFA adjustment
        DWRF_UV(0);                               // CFA = SP + 0 (stack restored)

#else
#    error "Unsupported target architecture"
#endif

        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
    )

    ctx->p = p;  // Update context pointer to end of generated data
}

// =============================================================================
//                              JITDUMP INITIALIZATION
// =============================================================================

/*
 * Initialize the perf jitdump interface
 *
 * This function sets up everything needed to generate jitdump files:
 * 1. Creates the jitdump file with a unique name
 * 2. Maps the first page to signal perf that we're using the interface
 * 3. Writes the jitdump header
 * 4. Initializes synchronization primitives
 *
 * The memory mapping is crucial - perf detects jitdump files by scanning
 * for processes that have mapped files matching the pattern /tmp/jit-*.dump
 *
 * Returns: Pointer to initialized state, or NULL on failure
 */
static void* perf_map_jit_init(void) {
    /*
     * Set up the global jitdump state (perf_jit_map_state).
     *
     * Returns a pointer to the global state on success, or NULL on failure.
     * On every failure path all resources acquired so far are released and
     * the corresponding state fields are reset to NULL, so a later call can
     * safely retry and perf_map_jit_fini() never sees stale handles.
     */
    char filename[100];
    int pid = getpid();

    /* Create unique filename based on process ID (snprintf always
     * NUL-terminates within the size it is given). */
    snprintf(filename, sizeof(filename), "/tmp/jit-%d.dump", pid);

    /* Create/open the jitdump file with appropriate permissions */
    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
    if (fd == -1) {
        return NULL;  // Failed to create file
    }

    /* Get system page size for memory mapping */
    const long page_size = sysconf(_SC_PAGESIZE);
    if (page_size == -1) {
        close(fd);
        return NULL;  // Failed to get page size
    }

    /*
     * Map the first page of the jitdump file
     *
     * This memory mapping serves as a signal to perf that this process
     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
     * files that match the jitdump naming pattern.
     *
     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
     */
    perf_jit_map_state.mapped_buffer = mmap(
        NULL,                    // Let kernel choose address
        page_size,               // Map one page
        PROT_READ | PROT_EXEC,   // Read and execute permissions (required by perf)
        MAP_PRIVATE,             // Private mapping
        fd,                      // File descriptor
        0                        // Offset 0 (first page)
    );

    /* mmap() reports failure with MAP_FAILED, never with NULL. */
    if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
        perf_jit_map_state.mapped_buffer = NULL;
        close(fd);
        return NULL;  // Memory mapping failed
    }

    perf_jit_map_state.mapped_size = page_size;

    /* Convert file descriptor to FILE* for easier I/O operations */
    perf_jit_map_state.perf_map = fdopen(fd, "w+");
    if (perf_jit_map_state.perf_map == NULL) {
        /* The descriptor is still ours when fdopen() fails. */
        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
        perf_jit_map_state.mapped_buffer = NULL;
        close(fd);
        return NULL;  // Failed to create FILE*
    }

    /*
     * Set up file buffering for better performance
     *
     * We use a large buffer (2MB) because jitdump files can be written
     * frequently during program execution. Buffering reduces system call
     * overhead and improves overall performance.
     */
    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);

    /* Write the jitdump file header */
    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);

    /*
     * Initialize thread synchronization lock
     *
     * Multiple threads may attempt to write to the jitdump file
     * simultaneously. This lock ensures thread-safe access to the
     * global jitdump state.
     */
    perf_jit_map_state.map_lock = PyThread_allocate_lock();
    if (perf_jit_map_state.map_lock == NULL) {
        /* fclose() also closes the underlying descriptor. Reset perf_map so
         * callers that test it against NULL do not use a closed stream. */
        fclose(perf_jit_map_state.perf_map);
        perf_jit_map_state.perf_map = NULL;
        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
        perf_jit_map_state.mapped_buffer = NULL;
        return NULL;  // Failed to create lock
    }

    /* Initialize code ID counter */
    perf_jit_map_state.code_id = 0;

    /* Configure trampoline API with padding information */
    trampoline_api.code_padding = PERF_JIT_CODE_PADDING;

    return &perf_jit_map_state;
}

// =============================================================================
//                              MAIN JITDUMP ENTRY WRITING
// =============================================================================

/*
 * Write a complete jitdump entry for a Python function
 *
 * This is the main function called by Python's trampoline system whenever
 * a new piece of JIT-compiled code needs to be recorded. It writes both
 * the unwinding information and the code load event to the jitdump file.
 *
 * The function performs these steps:
 * 1. Initialize jitdump system if not already done
 * 2. Extract function name and filename from Python code object
 * 3. Generate DWARF unwinding information
 * 4. Write unwinding info event to jitdump file
 * 5. Write code load event to jitdump file
 *
 * Args:
 *   state: Jitdump state (currently unused, uses global state)
 *   code_addr: Address where the compiled code resides
 *   code_size: Size of the compiled code in bytes
 *   co: Python code object containing metadata
 *
 * IMPORTANT: This function signature is part of Python's internal API
 * and must not be changed without coordinating with core Python development.
 */
static void perf_map_jit_write_entry(void *state, const void *code_addr,
                                    unsigned int code_size, PyCodeObject *co)
{
    /* `state` is not read; all bookkeeping lives in perf_jit_map_state. */

    /* Initialize jitdump system on first use */
    if (perf_jit_map_state.perf_map == NULL) {
        void* ret = perf_map_jit_init();
        if(ret == NULL){
            return;  // Initialization failed, silently abort
        }
    }

    /*
     * Extract function information from Python code object
     *
     * We create a human-readable function name by combining the qualified
     * name (includes class/module context) with the filename. This helps
     * developers identify functions in perf reports.
     *
     * PyUnicode_AsUTF8() returns NULL on failure; passing NULL to a "%s"
     * conversion is undefined behaviour, so fall back to an empty string.
     * NOTE(review): a failed conversion may leave a Python exception set;
     * this function does not touch the error indicator - confirm whether
     * the caller expects it to be cleared.
     */
    const char *entry = "";
    if (co->co_qualname != NULL) {
        entry = PyUnicode_AsUTF8(co->co_qualname);
        if (entry == NULL) {
            entry = "";
        }
    }

    const char *filename = "";
    if (co->co_filename != NULL) {
        filename = PyUnicode_AsUTF8(co->co_filename);
        if (filename == NULL) {
            filename = "";
        }
    }

    /*
     * Create formatted function name for perf display
     *
     * Format: "py::<function_name>:<filename>"
     * The "py::" prefix helps identify Python functions in mixed-language
     * profiles (e.g., when profiling C extensions alongside Python code).
     */
    const int formatted_length = snprintf(NULL, 0, "py::%s:%s", entry, filename);
    if (formatted_length < 0) {
        return;  // Formatting failed; nothing has been written yet
    }
    size_t perf_map_entry_size = (size_t)formatted_length + 1;
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
    if (perf_map_entry == NULL) {
        return;  // Memory allocation failed
    }
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);

    const size_t name_length = strlen(perf_map_entry);
    uword base = (uword)code_addr;
    uword size = code_size;

    /*
     * Generate DWARF unwinding information
     *
     * DWARF data is essential for proper stack unwinding during profiling.
     * Without it, perf cannot generate accurate call graphs, especially
     * in optimized code where frame pointers may be omitted.
     */
    ELFObjectContext ctx;
    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
    ctx.code_size = code_size;
    ctx.startp = ctx.p = (uint8_t*)buffer;

    /* Generate EH frame (Exception Handling frame) data */
    elf_init_ehframe(&ctx);
    int eh_frame_size = ctx.p - ctx.startp;

    /*
     * Write Code Unwinding Information Event
     *
     * This event must be written before the code load event to ensure
     * perf has the unwinding information available when it processes
     * the code region.
     */
    CodeUnwindingInfoEvent ev2;
    ev2.base.event = PerfUnwindingInfo;
    ev2.base.time_stamp = get_current_monotonic_ticks();
    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;

    /* Verify we don't exceed our padding budget */
    assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);

    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
    ev2.mapped_size = round_up(ev2.unwind_data_size, 16);  // 16-byte alignment

    /* Calculate total event size with padding */
    int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size;
    int padding_size = round_up(content_size, 8) - content_size;  // 8-byte align
    ev2.base.size = content_size + padding_size;

    /* Write the unwinding info event header */
    perf_map_jit_write_fully(&ev2, sizeof(ev2));

    /*
     * Write EH Frame Header
     *
     * The EH frame header provides metadata about the DWARF unwinding
     * information that follows. It includes pointers and counts that
     * help perf navigate the unwinding data efficiently.
     */
    EhFrameHeader f;
    f.version = 1;
    f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel;  // PC-relative signed 4-byte
    f.fde_count_enc = DwarfUData4;                  // Unsigned 4-byte count
    f.table_enc = DwarfSData4 | DwarfDataRel;       // Data-relative signed 4-byte

    /* Calculate relative offsets for EH frame navigation */
    f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char));
    f.eh_fde_count = 1;  // We generate exactly one FDE per function
    f.from = -(round_up(code_size, 8) + eh_frame_size);

    /* ctx.eh_frame_p was set by elf_init_ehframe() to the start of the FDE,
     * so this difference is the size of the CIE record. */
    int cie_size = ctx.eh_frame_p - ctx.startp;
    f.to = -(eh_frame_size - cie_size);

    /* Write EH frame data and header */
    perf_map_jit_write_fully(ctx.startp, eh_frame_size);
    perf_map_jit_write_fully(&f, sizeof(f));

    /* Write padding to maintain alignment (padding_size is always < 8) */
    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
    perf_map_jit_write_fully(&padding_bytes, padding_size);

    /*
     * Write Code Load Event
     *
     * This event tells perf about the new code region. It includes:
     * - Memory addresses and sizes
     * - Process and thread identification
     * - Function name for symbol resolution
     * - The actual machine code bytes
     */
    CodeLoadEvent ev;
    ev.base.event = PerfLoad;
    ev.base.size = sizeof(ev) + (name_length+1) + size;
    ev.base.time_stamp = get_current_monotonic_ticks();
    ev.process_id = getpid();
    ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
    ev.vma = base;                       // Virtual memory address
    ev.code_address = base;              // Same as VMA for our use case
    ev.code_size = size;

    /* Assign unique code ID and increment counter */
    perf_jit_map_state.code_id += 1;
    ev.code_id = perf_jit_map_state.code_id;

    /* Write code load event and associated data */
    perf_map_jit_write_fully(&ev, sizeof(ev));
    perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
    perf_map_jit_write_fully((void*)(base), size);           // Copy actual machine code

    /* Clean up allocated memory */
    PyMem_RawFree(perf_map_entry);
}

// =============================================================================
//                              CLEANUP AND FINALIZATION
// =============================================================================

/*
 * Finalize and cleanup the perf jitdump system
 *
 * This function is called when Python is shutting down or when the
 * perf trampoline system is being disabled. It ensures all resources
 * are properly released and all buffered data is flushed to disk.
 *
 * Args:
 *   state: Jitdump state (currently unused, uses global state)
 *
 * Returns: 0 on success
 *
 * IMPORTANT: This function signature is part of Python's internal API
 * and must not be changed without coordinating with core Python development.
 */
static int perf_map_jit_fini(void* state) {
    /* `state` is not read; all resources live in perf_jit_map_state. */
    (void)state;

    /*
     * Close jitdump file with proper synchronization
     *
     * We need to acquire the lock to ensure no other threads are
     * writing to the file when we close it. This prevents corruption
     * and ensures all data is properly flushed.
     */
    if (perf_jit_map_state.perf_map != NULL) {
        /* The lock can be missing if initialization stopped half-way. */
        const int have_lock = (perf_jit_map_state.map_lock != NULL);

        if (have_lock) {
            PyThread_acquire_lock(perf_jit_map_state.map_lock, 1);
        }
        fclose(perf_jit_map_state.perf_map);  // This also flushes buffers
        perf_jit_map_state.perf_map = NULL;   // Cleared before the lock is dropped
        if (have_lock) {
            PyThread_release_lock(perf_jit_map_state.map_lock);

            /* Clean up synchronization primitive and drop the stale handle */
            PyThread_free_lock(perf_jit_map_state.map_lock);
            perf_jit_map_state.map_lock = NULL;
        }
    }

    /*
     * Unmap the memory region
     *
     * This removes the signal to perf that we were generating JIT code.
     * After this point, perf will no longer detect this process as
     * having JIT capabilities.
     *
     * MAP_FAILED is what mmap() stores on failure; it must never be
     * handed to munmap().
     */
    if (perf_jit_map_state.mapped_buffer != NULL) {
        if (perf_jit_map_state.mapped_buffer != MAP_FAILED) {
            munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
        }
        perf_jit_map_state.mapped_buffer = NULL;
    }

    /* Clear global state reference */
    trampoline_api.state = NULL;

    return 0;  // Success
}

// =============================================================================
//                              PUBLIC API EXPORT
// =============================================================================

/*
 * Python Perf Callbacks Structure
 *
 * This structure defines the callback interface that Python's trampoline
 * system uses to integrate with perf profiling. It contains function
 * pointers for initialization, event writing, and cleanup.
 *
 * CRITICAL: This structure and its contents are part of Python's internal
 * API. The function signatures and behavior must remain stable to maintain
 * compatibility with the Python interpreter's perf integration system.
 *
 * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
 */
/* Callback table for the jitdump backend; entries are positional:
 * initialization, per-entry event writer, cleanup. */
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
    perf_map_jit_init,         /* set up the jitdump state */
    perf_map_jit_write_entry,  /* record one code region */
    perf_map_jit_fini,         /* release the jitdump state */
};

#endif /* PY_HAVE_PERF_TRAMPOLINE */