diff options
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/cases_generator/generators_common.py | 11 | ||||
-rw-r--r-- | Tools/cases_generator/optimizer_generator.py | 195 | ||||
-rw-r--r-- | Tools/inspection/benchmark_external_inspection.py | 20 | ||||
-rw-r--r-- | Tools/jit/_optimizers.py | 319 | ||||
-rw-r--r-- | Tools/jit/_stencils.py | 67 | ||||
-rw-r--r-- | Tools/jit/_targets.py | 68 | ||||
-rw-r--r-- | Tools/requirements-hypothesis.txt | 2 | ||||
-rw-r--r-- | Tools/scripts/summarize_stats.py | 2 |
8 files changed, 579 insertions, 105 deletions
diff --git a/Tools/cases_generator/generators_common.py b/Tools/cases_generator/generators_common.py index 47de205c0e9..4c210fbf8d2 100644 --- a/Tools/cases_generator/generators_common.py +++ b/Tools/cases_generator/generators_common.py @@ -106,8 +106,9 @@ class Emitter: out: CWriter labels: dict[str, Label] _replacers: dict[str, ReplacementFunctionType] + cannot_escape: bool - def __init__(self, out: CWriter, labels: dict[str, Label]): + def __init__(self, out: CWriter, labels: dict[str, Label], cannot_escape: bool = False): self._replacers = { "EXIT_IF": self.exit_if, "DEOPT_IF": self.deopt_if, @@ -127,6 +128,7 @@ class Emitter: } self.out = out self.labels = labels + self.cannot_escape = cannot_escape def dispatch( self, @@ -238,7 +240,8 @@ class Emitter: next(tkn_iter) self._print_storage("DECREF_INPUTS", storage) try: - storage.close_inputs(self.out) + if not self.cannot_escape: + storage.close_inputs(self.out) except StackError as ex: raise analysis_error(ex.args[0], tkn) except Exception as ex: @@ -476,7 +479,7 @@ class Emitter: reachable = True tkn = stmt.contents[-1] try: - if stmt in uop.properties.escaping_calls: + if stmt in uop.properties.escaping_calls and not self.cannot_escape: escape = uop.properties.escaping_calls[stmt] if escape.kills is not None: self.stackref_kill(escape.kills, storage, True) @@ -513,7 +516,7 @@ class Emitter: self.out.emit(tkn) else: self.out.emit(tkn) - if stmt in uop.properties.escaping_calls: + if stmt in uop.properties.escaping_calls and not self.cannot_escape: self.emit_reload(storage) return reachable, None, storage except StackError as ex: diff --git a/Tools/cases_generator/optimizer_generator.py b/Tools/cases_generator/optimizer_generator.py index 3b4fe64b02a..81ae534bdda 100644 --- a/Tools/cases_generator/optimizer_generator.py +++ b/Tools/cases_generator/optimizer_generator.py @@ -12,6 +12,8 @@ from analyzer import ( analyze_files, StackItem, analysis_error, + CodeSection, + Label, ) from generators_common import ( DEFAULT_INPUT, @@ -19,6 +21,7 @@ from generators_common import ( write_header, Emitter, TokenIterator, + always_true, ) from cwriter import CWriter from typing import TextIO @@ -75,6 +78,9 @@ def type_name(var: StackItem) -> str: return "JitOptRef *" return "JitOptRef " +def stackref_type_name(var: StackItem) -> str: + assert not var.is_array(), "Unsafe to convert a symbol to an array-like StackRef." + return "_PyStackRef " def declare_variables(uop: Uop, out: CWriter, skip_inputs: bool) -> None: variables = {"unused"} @@ -135,6 +141,12 @@ def emit_default(out: CWriter, uop: Uop, stack: Stack) -> None: class OptimizerEmitter(Emitter): + def __init__(self, out: CWriter, labels: dict[str, Label], original_uop: Uop, stack: Stack): + super().__init__(out, labels) + self._replacers["REPLACE_OPCODE_IF_EVALUATES_PURE"] = self.replace_opcode_if_evaluates_pure + self.original_uop = original_uop + self.stack = stack + def emit_save(self, storage: Storage) -> None: storage.flush(self.out) @@ -145,6 +157,186 @@ class OptimizerEmitter(Emitter): self.out.emit(goto) self.out.emit(label) + def replace_opcode_if_evaluates_pure( + self, + tkn: Token, + tkn_iter: TokenIterator, + uop: CodeSection, + storage: Storage, + inst: Instruction | None, + ) -> bool: + assert isinstance(uop, Uop) + input_identifiers = [] + for token in tkn_iter: + if token.kind == "IDENTIFIER": + input_identifiers.append(token) + if token.kind == "SEMI": + break + + if len(input_identifiers) == 0: + raise analysis_error( + "To evaluate an operation as pure, it must have at least 1 input", + tkn + ) + # Check that the input identifiers belong to the uop's + # input stack effect + uop_stack_effect_input_identifers = {inp.name for inp in uop.stack.inputs} + for input_tkn in input_identifiers: + if input_tkn.text not in uop_stack_effect_input_identifers: + raise analysis_error(f"{input_tkn.text} referenced in " + f"REPLACE_OPCODE_IF_EVALUATES_PURE but does not " + f"exist in the base uop's input stack effects", + input_tkn) + input_identifiers_as_str = {tkn.text for tkn in input_identifiers} + used_stack_inputs = [inp for inp in uop.stack.inputs if inp.name in input_identifiers_as_str] + assert len(used_stack_inputs) > 0 + emitter = OptimizerConstantEmitter(self.out, {}, self.original_uop, self.stack.copy()) + emitter.emit("if (\n") + for inp in used_stack_inputs[:-1]: + emitter.emit(f"sym_is_safe_const(ctx, {inp.name}) &&\n") + emitter.emit(f"sym_is_safe_const(ctx, {used_stack_inputs[-1].name})\n") + emitter.emit(') {\n') + # Declare variables, before they are shadowed. + for inp in used_stack_inputs: + if inp.used: + emitter.emit(f"{type_name(inp)}{inp.name}_sym = {inp.name};\n") + # Shadow the symbolic variables with stackrefs. + for inp in used_stack_inputs: + if inp.is_array(): + raise analysis_error("Pure evaluation cannot take array-like inputs.", tkn) + if inp.used: + emitter.emit(f"{stackref_type_name(inp)}{inp.name} = sym_get_const_as_stackref(ctx, {inp.name}_sym);\n") + # Rename all output variables to stackref variant. + for outp in self.original_uop.stack.outputs: + if outp.is_array(): + raise analysis_error( + "Array output StackRefs not supported for evaluating pure ops.", + self.original_uop.body.open + ) + emitter.emit(f"_PyStackRef {outp.name}_stackref;\n") + + + storage = Storage.for_uop(self.stack, self.original_uop, CWriter.null(), check_liveness=False) + # No reference management of outputs needed. + for var in storage.outputs: + var.in_local = True + emitter.emit("/* Start of uop copied from bytecodes for constant evaluation */\n") + emitter.emit_tokens(self.original_uop, storage, inst=None, emit_braces=False) + self.out.start_line() + emitter.emit("/* End of uop copied from bytecodes for constant evaluation */\n") + # Finally, assign back the output stackrefs to symbolics. + for outp in self.original_uop.stack.outputs: + # All new stackrefs are created from new references. + # That's how the stackref contract works. + if not outp.peek: + emitter.emit(f"{outp.name} = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal({outp.name}_stackref));\n") + else: + emitter.emit(f"{outp.name} = sym_new_const(ctx, PyStackRef_AsPyObjectBorrow({outp.name}_stackref));\n") + storage.flush(self.out) + emitter.emit("break;\n") + emitter.emit("}\n") + return True + +class OptimizerConstantEmitter(OptimizerEmitter): + def __init__(self, out: CWriter, labels: dict[str, Label], original_uop: Uop, stack: Stack): + super().__init__(out, labels, original_uop, stack) + # Replace all outputs to point to their stackref versions. + overrides = { + outp.name: self.emit_stackref_override for outp in self.original_uop.stack.outputs + } + self._replacers = {**self._replacers, **overrides} + self.cannot_escape = True + + def emit_to_with_replacement( + self, + out: CWriter, + tkn_iter: TokenIterator, + end: str, + uop: CodeSection, + storage: Storage, + inst: Instruction | None + ) -> Token: + parens = 0 + for tkn in tkn_iter: + if tkn.kind == end and parens == 0: + return tkn + if tkn.kind == "LPAREN": + parens += 1 + if tkn.kind == "RPAREN": + parens -= 1 + if tkn.text in self._replacers: + self._replacers[tkn.text](tkn, tkn_iter, uop, storage, inst) + else: + out.emit(tkn) + raise analysis_error(f"Expecting {end}. Reached end of file", tkn) + + def emit_stackref_override( + self, + tkn: Token, + tkn_iter: TokenIterator, + uop: CodeSection, + storage: Storage, + inst: Instruction | None, + ) -> bool: + self.out.emit(tkn) + self.out.emit("_stackref ") + return True + + def deopt_if( + self, + tkn: Token, + tkn_iter: TokenIterator, + uop: CodeSection, + storage: Storage, + inst: Instruction | None, + ) -> bool: + self.out.start_line() + self.out.emit("if (") + lparen = next(tkn_iter) + assert lparen.kind == "LPAREN" + first_tkn = tkn_iter.peek() + self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) + self.emit(") {\n") + next(tkn_iter) # Semi colon + # We guarantee this will deopt in real-world code + # via constants analysis. So just bail. + self.emit("ctx->done = true;\n") + self.emit("break;\n") + self.emit("}\n") + return not always_true(first_tkn) + + exit_if = deopt_if + + def error_if( + self, + tkn: Token, + tkn_iter: TokenIterator, + uop: CodeSection, + storage: Storage, + inst: Instruction | None, + ) -> bool: + lparen = next(tkn_iter) + assert lparen.kind == "LPAREN" + first_tkn = tkn_iter.peek() + unconditional = always_true(first_tkn) + if unconditional: + next(tkn_iter) + next(tkn_iter) # RPAREN + self.out.start_line() + else: + self.out.emit_at("if ", tkn) + self.emit(lparen) + self.emit_to_with_replacement(self.out, tkn_iter, "RPAREN", uop, storage, inst) + self.out.emit(") {\n") + next(tkn_iter) # Semi colon + storage.clear_inputs("at ERROR_IF") + + self.out.emit("goto error;\n") + if not unconditional: + self.out.emit("}\n") + return not unconditional + + def write_uop( override: Uop | None, uop: Uop, @@ -175,13 +367,14 @@ def write_uop( cast = f"uint{cache.size*16}_t" out.emit(f"{type}{cache.name} = ({cast})this_instr->operand0;\n") if override: - emitter = OptimizerEmitter(out, {}) + emitter = OptimizerEmitter(out, {}, uop, stack.copy()) # No reference management of inputs needed. for var in storage.inputs: # type: ignore[possibly-undefined] var.in_local = False _, storage = emitter.emit_tokens(override, storage, None, False) out.start_line() storage.flush(out) + out.start_line() else: emit_default(out, uop, stack) out.start_line() diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py index 62182194c1a..0ac7ac4d385 100644 --- a/Tools/inspection/benchmark_external_inspection.py +++ b/Tools/inspection/benchmark_external_inspection.py @@ -174,6 +174,7 @@ def benchmark(unwinder, duration_seconds=10): total_work_time = 0.0 start_time = time.perf_counter() end_time = start_time + duration_seconds + total_attempts = 0 colors = get_colors(can_colorize()) @@ -183,6 +184,7 @@ def benchmark(unwinder, duration_seconds=10): try: while time.perf_counter() < end_time: + total_attempts += 1 work_start = time.perf_counter() try: stack_trace = unwinder.get_stack_trace() @@ -194,7 +196,6 @@ def benchmark(unwinder, duration_seconds=10): work_end = time.perf_counter() total_work_time += work_end - work_start - total_attempts = sample_count + fail_count if total_attempts % 10000 == 0: avg_work_time_us = (total_work_time / total_attempts) * 1e6 work_rate = ( @@ -221,7 +222,6 @@ def benchmark(unwinder, duration_seconds=10): actual_end_time = time.perf_counter() wall_time = actual_end_time - start_time - total_attempts = sample_count + fail_count # Return final statistics return { @@ -346,6 +346,13 @@ Available code examples: help="Code example to benchmark (default: basic)", ) + parser.add_argument( + "--threads", + choices=["all", "main", "only_active"], + default="all", + help="Which threads to include in the benchmark (default: all)", + ) + return parser.parse_args() @@ -419,8 +426,15 @@ def main(): # Create unwinder and run benchmark print(f"{colors.BLUE}Initializing unwinder...{colors.RESET}") try: + kwargs = {} + if args.threads == "all": + kwargs["all_threads"] = True + elif args.threads == "main": + kwargs["all_threads"] = False + elif args.threads == "only_active": + kwargs["only_active_thread"] = True unwinder = _remote_debugging.RemoteUnwinder( - process.pid, all_threads=True + process.pid, **kwargs ) results = benchmark(unwinder, duration_seconds=args.duration) finally: diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py new file mode 100644 index 00000000000..1077e4106fd --- /dev/null +++ b/Tools/jit/_optimizers.py @@ -0,0 +1,319 @@ +"""Low-level optimization of textual assembly.""" + +import dataclasses +import pathlib +import re +import typing + +# Same as saying "not string.startswith('')": +_RE_NEVER_MATCH = re.compile(r"(?!)") +# Dictionary mapping branch instructions to their inverted branch instructions. +# If a branch cannot be inverted, the value is None: +_X86_BRANCHES = { + # https://www.felixcloutier.com/x86/jcc + "ja": "jna", + "jae": "jnae", + "jb": "jnb", + "jbe": "jnbe", + "jc": "jnc", + "jcxz": None, + "je": "jne", + "jecxz": None, + "jg": "jng", + "jge": "jnge", + "jl": "jnl", + "jle": "jnle", + "jo": "jno", + "jp": "jnp", + "jpe": "jpo", + "jrcxz": None, + "js": "jns", + "jz": "jnz", + # https://www.felixcloutier.com/x86/loop:loopcc + "loop": None, + "loope": None, + "loopne": None, + "loopnz": None, + "loopz": None, +} +# Update with all of the inverted branches, too: +_X86_BRANCHES |= {v: k for k, v in _X86_BRANCHES.items() if v} + + +@dataclasses.dataclass +class _Block: + label: str | None = None + # Non-instruction lines like labels, directives, and comments: + noninstructions: list[str] = dataclasses.field(default_factory=list) + # Instruction lines: + instructions: list[str] = dataclasses.field(default_factory=list) + # If this block ends in a jump, where to? + target: typing.Self | None = None + # The next block in the linked list: + link: typing.Self | None = None + # Whether control flow can fall through to the linked block above: + fallthrough: bool = True + # Whether this block can eventually reach the next uop (_JIT_CONTINUE): + hot: bool = False + + def resolve(self) -> typing.Self: + """Find the first non-empty block reachable from this one.""" + block = self + while block.link and not block.instructions: + block = block.link + return block + + +@dataclasses.dataclass +class Optimizer: + """Several passes of analysis and optimization for textual assembly.""" + + path: pathlib.Path + _: dataclasses.KW_ONLY + # prefix used to mangle symbols on some platforms: + prefix: str = "" + # The first block in the linked list: + _root: _Block = dataclasses.field(init=False, default_factory=_Block) + _labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict) + # No groups: + _re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile( + r"\s*(?:\.|#|//|$)" + ) + # One group (label): + _re_label: typing.ClassVar[re.Pattern[str]] = re.compile( + r'\s*(?P<label>[\w."$?@]+):' + ) + # Override everything that follows in subclasses: + _alignment: typing.ClassVar[int] = 1 + _branches: typing.ClassVar[dict[str, str | None]] = {} + # Two groups (instruction and target): + _re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH + # One group (target): + _re_jump: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH + # No groups: + _re_return: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH + + def __post_init__(self) -> None: + # Split the code into a linked list of basic blocks. A basic block is an + # optional label, followed by zero or more non-instruction lines, + # followed by zero or more instruction lines (only the last of which may + # be a branch, jump, or return): + text = self._preprocess(self.path.read_text()) + block = self._root + for line in text.splitlines(): + # See if we need to start a new block: + if match := self._re_label.match(line): + # Label. New block: + block.link = block = self._lookup_label(match["label"]) + block.noninstructions.append(line) + continue + if self._re_noninstructions.match(line): + if block.instructions: + # Non-instruction lines. New block: + block.link = block = _Block() + block.noninstructions.append(line) + continue + if block.target or not block.fallthrough: + # Current block ends with a branch, jump, or return. New block: + block.link = block = _Block() + block.instructions.append(line) + if match := self._re_branch.match(line): + # A block ending in a branch has a target and fallthrough: + block.target = self._lookup_label(match["target"]) + assert block.fallthrough + elif match := self._re_jump.match(line): + # A block ending in a jump has a target and no fallthrough: + block.target = self._lookup_label(match["target"]) + block.fallthrough = False + elif self._re_return.match(line): + # A block ending in a return has no target and fallthrough: + assert not block.target + block.fallthrough = False + + def _preprocess(self, text: str) -> str: + # Override this method to do preprocessing of the textual assembly: + return text + + @classmethod + def _invert_branch(cls, line: str, target: str) -> str | None: + match = cls._re_branch.match(line) + assert match + inverted = cls._branches.get(match["instruction"]) + if not inverted: + return None + (a, b), (c, d) = match.span("instruction"), match.span("target") + # Before: + # je FOO + # After: + # jne BAR + return "".join([line[:a], inverted, line[b:c], target, line[d:]]) + + @classmethod + def _update_jump(cls, line: str, target: str) -> str: + match = cls._re_jump.match(line) + assert match + a, b = match.span("target") + # Before: + # jmp FOO + # After: + # jmp BAR + return "".join([line[:a], target, line[b:]]) + + def _lookup_label(self, label: str) -> _Block: + if label not in self._labels: + self._labels[label] = _Block(label) + return self._labels[label] + + def _blocks(self) -> typing.Generator[_Block, None, None]: + block: _Block | None = self._root + while block: + yield block + block = block.link + + def _body(self) -> str: + lines = [] + hot = True + for block in self._blocks(): + if hot != block.hot: + hot = block.hot + # Make it easy to tell at a glance where cold code is: + lines.append(f"# JIT: {'HOT' if hot else 'COLD'} ".ljust(80, "#")) + lines.extend(block.noninstructions) + lines.extend(block.instructions) + return "\n".join(lines) + + def _predecessors(self, block: _Block) -> typing.Generator[_Block, None, None]: + # This is inefficient, but it's never wrong: + for pre in self._blocks(): + if pre.target is block or pre.fallthrough and pre.link is block: + yield pre + + def _insert_continue_label(self) -> None: + # Find the block with the last instruction: + for end in reversed(list(self._blocks())): + if end.instructions: + break + # Before: + # jmp FOO + # After: + # jmp FOO + # .balign 8 + # _JIT_CONTINUE: + # This lets the assembler encode _JIT_CONTINUE jumps at build time! + align = _Block() + align.noninstructions.append(f"\t.balign\t{self._alignment}") + continuation = self._lookup_label(f"{self.prefix}_JIT_CONTINUE") + assert continuation.label + continuation.noninstructions.append(f"{continuation.label}:") + end.link, align.link, continuation.link = align, continuation, end.link + + def _mark_hot_blocks(self) -> None: + # Start with the last block, and perform a DFS to find all blocks that + # can eventually reach it: + todo = list(self._blocks())[-1:] + while todo: + block = todo.pop() + block.hot = True + todo.extend(pre for pre in self._predecessors(block) if not pre.hot) + + def _invert_hot_branches(self) -> None: + for branch in self._blocks(): + link = branch.link + if link is None: + continue + jump = link.resolve() + # Before: + # je HOT + # jmp COLD + # After: + # jne COLD + # jmp HOT + if ( + # block ends with a branch to hot code... + branch.target + and branch.fallthrough + and branch.target.hot + # ...followed by a jump to cold code with no other predecessors: + and jump.target + and not jump.fallthrough + and not jump.target.hot + and len(jump.instructions) == 1 + and list(self._predecessors(jump)) == [branch] + ): + assert jump.target.label + assert branch.target.label + inverted = self._invert_branch( + branch.instructions[-1], jump.target.label + ) + # Check to see if the branch can even be inverted: + if inverted is None: + continue + branch.instructions[-1] = inverted + jump.instructions[-1] = self._update_jump( + jump.instructions[-1], branch.target.label + ) + branch.target, jump.target = jump.target, branch.target + jump.hot = True + + def _remove_redundant_jumps(self) -> None: + # Zero-length jumps can be introduced by _insert_continue_label and + # _invert_hot_branches: + for block in self._blocks(): + # Before: + # jmp FOO + # FOO: + # After: + # FOO: + if ( + block.target + and block.link + and block.target.resolve() is block.link.resolve() + ): + block.target = None + block.fallthrough = True + block.instructions.pop() + + def run(self) -> None: + """Run this optimizer.""" + self._insert_continue_label() + self._mark_hot_blocks() + self._invert_hot_branches() + self._remove_redundant_jumps() + self.path.write_text(self._body()) + + +class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods + """aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu""" + + # TODO: @diegorusso + _alignment = 8 + # https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch- + _re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)") + + +class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods + """i686-pc-windows-msvc/x86_64-apple-darwin/x86_64-unknown-linux-gnu""" + + _branches = _X86_BRANCHES + _re_branch = re.compile( + rf"\s*(?P<instruction>{'|'.join(_X86_BRANCHES)})\s+(?P<target>[\w.]+)" + ) + # https://www.felixcloutier.com/x86/jmp + _re_jump = re.compile(r"\s*jmp\s+(?P<target>[\w.]+)") + # https://www.felixcloutier.com/x86/ret + _re_return = re.compile(r"\s*ret\b") + + +class OptimizerX8664Windows(OptimizerX86): # pylint: disable = too-few-public-methods + """x86_64-pc-windows-msvc""" + + def _preprocess(self, text: str) -> str: + text = super()._preprocess(text) + # Before: + # rex64 jmpq *__imp__JIT_CONTINUE(%rip) + # After: + # jmp _JIT_CONTINUE + far_indirect_jump = ( + rf"rex64\s+jmpq\s+\*__imp_(?P<target>{self.prefix}_JIT_\w+)\(%rip\)" + ) + return re.sub(far_indirect_jump, r"jmp\t\g<target>", text) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 03b0ba647b0..1d82f5366f6 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -17,8 +17,6 @@ class HoleValue(enum.Enum): # The base address of the machine code for the current uop (exposed as _JIT_ENTRY): CODE = enum.auto() - # The base address of the machine code for the next uop (exposed as _JIT_CONTINUE): - CONTINUE = enum.auto() # The base address of the read-only data for this uop: DATA = enum.auto() # The address of the current executor (exposed as _JIT_EXECUTOR): @@ -97,7 +95,6 @@ _PATCH_FUNCS = { # Translate HoleValues to C expressions: _HOLE_EXPRS = { HoleValue.CODE: "(uintptr_t)code", - HoleValue.CONTINUE: "(uintptr_t)code + sizeof(code_body)", HoleValue.DATA: "(uintptr_t)data", HoleValue.EXECUTOR: "(uintptr_t)executor", # These should all have been turned into DATA values by process_relocations: @@ -209,64 +206,6 @@ class Stencil: self.disassembly.append(f"{offset:x}: {' '.join(['00'] * padding)}") self.body.extend([0] * padding) - def add_nops(self, nop: bytes, alignment: int) -> None: - """Add NOPs until there is alignment. Fail if it is not possible.""" - offset = len(self.body) - nop_size = len(nop) - - # Calculate the gap to the next multiple of alignment. - gap = -offset % alignment - if gap: - if gap % nop_size == 0: - count = gap // nop_size - self.body.extend(nop * count) - else: - raise ValueError( - f"Cannot add nops of size '{nop_size}' to a body with " - f"offset '{offset}' to align with '{alignment}'" - ) - - def remove_jump(self) -> None: - """Remove a zero-length continuation jump, if it exists.""" - hole = max(self.holes, key=lambda hole: hole.offset) - match hole: - case Hole( - offset=offset, - kind="IMAGE_REL_AMD64_REL32", - value=HoleValue.GOT, - symbol="_JIT_CONTINUE", - addend=-4, - ) as hole: - # jmp qword ptr [rip] - jump = b"\x48\xff\x25\x00\x00\x00\x00" - offset -= 3 - case Hole( - offset=offset, - kind="IMAGE_REL_I386_REL32" | "R_X86_64_PLT32" | "X86_64_RELOC_BRANCH", - value=HoleValue.CONTINUE, - symbol=None, - addend=addend, - ) as hole if ( - _signed(addend) == -4 - ): - # jmp 5 - jump = b"\xe9\x00\x00\x00\x00" - offset -= 1 - case Hole( - offset=offset, - kind="R_AARCH64_JUMP26", - value=HoleValue.CONTINUE, - symbol=None, - addend=0, - ) as hole: - # b #4 - jump = b"\x00\x00\x00\x14" - case _: - return - if self.body[offset:] == jump: - self.body = self.body[:offset] - self.holes.remove(hole) - @dataclasses.dataclass class StencilGroup: @@ -284,9 +223,7 @@ class StencilGroup: _got: dict[str, int] = dataclasses.field(default_factory=dict, init=False) _trampolines: set[int] = dataclasses.field(default_factory=set, init=False) - def process_relocations( - self, known_symbols: dict[str, int], *, alignment: int = 1, nop: bytes = b"" - ) -> None: + def process_relocations(self, known_symbols: dict[str, int]) -> None: """Fix up all GOT and internal relocations for this stencil group.""" for hole in self.code.holes.copy(): if ( @@ -306,8 +243,6 @@ class StencilGroup: self._trampolines.add(ordinal) hole.addend = ordinal hole.symbol = None - self.code.remove_jump() - self.code.add_nops(nop=nop, alignment=alignment) self.data.pad(8) for stencil in [self.code, self.data]: for hole in stencil.holes: diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index b383e39da19..728f48128ce 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -13,6 +13,7 @@ import typing import shlex import _llvm +import _optimizers import _schema import _stencils import _writer @@ -41,8 +42,8 @@ class _Target(typing.Generic[_S, _R]): triple: str condition: str _: dataclasses.KW_ONLY - alignment: int = 1 args: typing.Sequence[str] = () + optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer prefix: str = "" stable: bool = False debug: bool = False @@ -121,8 +122,9 @@ class _Target(typing.Generic[_S, _R]): async def _compile( self, opname: str, c: pathlib.Path, tempdir: pathlib.Path ) -> _stencils.StencilGroup: + s = tempdir / f"{opname}.s" o = tempdir / f"{opname}.o" - args = [ + args_s = [ f"--target={self.triple}", "-DPy_BUILD_CORE_MODULE", "-D_DEBUG" if self.debug else "-DNDEBUG", @@ -135,8 +137,16 @@ class _Target(typing.Generic[_S, _R]): f"-I{CPYTHON / 'Include' / 'internal' / 'mimalloc'}", f"-I{CPYTHON / 'Python'}", f"-I{CPYTHON / 'Tools' / 'jit'}", - "-O3", - "-c", + # -O2 and -O3 include some optimizations that make sense for + # standalone functions, but not for snippets of code that are going + # to be laid out end-to-end (like ours)... common examples include + # passes like tail-duplication, or aligning jump targets with nops. + # -Os is equivalent to -O2 with many of these problematic passes + # disabled. Based on manual review, for *our* purposes it usually + # generates better code than -O2 (and -O2 usually generates better + # code than -O3). As a nice benefit, it uses less memory too: + "-Os", + "-S", # Shorten full absolute file paths in the generated code (like the # __FILE__ macro and assert failure messages) for reproducibility: f"-ffile-prefix-map={CPYTHON}=.", @@ -155,13 +165,16 @@ class _Target(typing.Generic[_S, _R]): "-fno-stack-protector", "-std=c11", "-o", - f"{o}", + f"{s}", f"{c}", *self.args, # Allow user-provided CFLAGS to override any defaults *shlex.split(self.cflags), ] - await _llvm.run("clang", args, echo=self.verbose) + await _llvm.run("clang", args_s, echo=self.verbose) + self.optimizer(s, prefix=self.prefix).run() + args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"] + await _llvm.run("clang", args_o, echo=self.verbose) return await self._parse(o) async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: @@ -190,11 +203,7 @@ class _Target(typing.Generic[_S, _R]): tasks.append(group.create_task(coro, name=opname)) stencil_groups = {task.get_name(): task.result() for task in tasks} for stencil_group in stencil_groups.values(): - stencil_group.process_relocations( - known_symbols=self.known_symbols, - alignment=self.alignment, - nop=self._get_nop(), - ) + stencil_group.process_relocations(self.known_symbols) return stencil_groups def build( @@ -524,42 +533,43 @@ class _MachO( def get_target(host: str) -> _COFF | _ELF | _MachO: """Build a _Target for the given host "triple" and options.""" + optimizer: type[_optimizers.Optimizer] target: _COFF | _ELF | _MachO if re.fullmatch(r"aarch64-apple-darwin.*", host): condition = "defined(__aarch64__) && defined(__APPLE__)" - target = _MachO(host, condition, alignment=8, prefix="_") + optimizer = _optimizers.OptimizerAArch64 + target = _MachO(host, condition, optimizer=optimizer, prefix="_") elif re.fullmatch(r"aarch64-pc-windows-msvc", host): args = ["-fms-runtime-lib=dll", "-fplt"] condition = "defined(_M_ARM64)" - target = _COFF(host, condition, alignment=8, args=args) + optimizer = _optimizers.OptimizerAArch64 + target = _COFF(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"aarch64-.*-linux-gnu", host): - args = [ - "-fpic", - # On aarch64 Linux, intrinsics were being emitted and this flag - # was required to disable them. - "-mno-outline-atomics", - ] + # -mno-outline-atomics: Keep intrinsics from being emitted. + args = ["-fpic", "-mno-outline-atomics"] condition = "defined(__aarch64__) && defined(__linux__)" - target = _ELF(host, condition, alignment=8, args=args) + optimizer = _optimizers.OptimizerAArch64 + target = _ELF(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"i686-pc-windows-msvc", host): - args = [ - "-DPy_NO_ENABLE_SHARED", - # __attribute__((preserve_none)) is not supported - "-Wno-ignored-attributes", - ] + # -Wno-ignored-attributes: __attribute__((preserve_none)) is not supported here. + args = ["-DPy_NO_ENABLE_SHARED", "-Wno-ignored-attributes"] + optimizer = _optimizers.OptimizerX86 condition = "defined(_M_IX86)" - target = _COFF(host, condition, args=args, prefix="_") + target = _COFF(host, condition, args=args, optimizer=optimizer, prefix="_") elif re.fullmatch(r"x86_64-apple-darwin.*", host): condition = "defined(__x86_64__) && defined(__APPLE__)" - target = _MachO(host, condition, prefix="_") + optimizer = _optimizers.OptimizerX86 + target = _MachO(host, condition, optimizer=optimizer, prefix="_") elif re.fullmatch(r"x86_64-pc-windows-msvc", host): args = ["-fms-runtime-lib=dll"] condition = "defined(_M_X64)" - target = _COFF(host, condition, args=args) + optimizer = _optimizers.OptimizerX8664Windows + target = _COFF(host, condition, args=args, optimizer=optimizer) elif re.fullmatch(r"x86_64-.*-linux-gnu", host): args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"] condition = "defined(__x86_64__) && defined(__linux__)" - target = _ELF(host, condition, args=args) + optimizer = _optimizers.OptimizerX86 + target = _ELF(host, condition, args=args, optimizer=optimizer) else: raise ValueError(host) return target diff --git a/Tools/requirements-hypothesis.txt b/Tools/requirements-hypothesis.txt index 66898885c0a..e5deac497fb 100644 --- a/Tools/requirements-hypothesis.txt +++ b/Tools/requirements-hypothesis.txt @@ -1,4 +1,4 @@ # Requirements file for hypothesis that # we use to run our property-based tests in CI. -hypothesis==6.111.2 +hypothesis==6.135.26 diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 68cfad3f92c..905af9dcfd8 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -492,7 +492,7 @@ class Stats: ): (trace_too_long, attempts), Doc( "Trace too short", - "A potential trace is abandoned because it it too short.", + "A potential trace is abandoned because it is too short.", ): (trace_too_short, attempts), Doc( "Inner loop found", "A trace is truncated because it has an inner loop" |