From 904ba7ceaf4bc29cad00bfb23380eb3404148609 Mon Sep 17 00:00:00 2001 From: Marius Wachtler Date: Sat, 14 Mar 2026 17:19:20 +0100 Subject: [PATCH 1/2] JIT: port the x86_64 Linux stencil JIT to DynASM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the copy-and-patch relocation engine with a DynASM-based pipeline. Instead of manually copying pre-compiled stencil blobs and patching GOT entries / trampolines at runtime, Clang-generated assembly is converted at build time into DynASM .dasc source, which is then compiled into a C header (jit_stencils.h). At runtime the DynASM assembler encodes native x86-64 directly, resolving all labels, jumps, and data references in a single pass. Key changes: Build pipeline (Tools/jit/): - _asm_to_dasc.py: New peephole optimizer that converts Clang AT&T asm to DynASM Intel-syntax .dasc. Uses typed operand classes (Reg, Mem, Imm) with Python 3.10+ match/case for pattern matching. Includes 15+ optimization patterns (immediate narrowing, test-self elimination, indexed memory folding, ALU immediate folding, redundant stack reload elimination, dead label removal, etc.). - _dasc_writer.py: Generates jit_stencils.h with DynASM preamble, emit helpers (emit_mov_imm, emit_call_ext, emit_cmp_reg_imm, emit_test/and/or/xor_reg_imm), and per-stencil emit functions. - _targets.py: Reworked to drive the DynASM pipeline — compiles stencils, converts asm, generates .dasc, runs the DynASM preprocessor, and produces the final header. - _stencils.py: Adds COLD_CODE HoleValue for hot/cold section splitting. - _optimizers.py: Extended with stencil frame-size tracking and frame-group merging infrastructure. - build.py: Adds --peephole-stats flag for optimization statistics. - test_peephole.py: unit tests covering peephole patterns and the line classification infrastructure. - Lib/test/test_jit_peephole.py: Hooks peephole tests into make test. 
Runtime (Python/jit.c): - Complete rewrite of _PyJIT_Compile: uses DynASM dasm_init / dasm_setup / per-stencil emit / dasm_link / dasm_encode instead of memcpy+patch. - Hot/cold code splitting: cold (error) paths are placed in a separate DynASM section after the hot code, improving i-cache locality. - Frame merging: stencils share a single prologue/epilogue, eliminating redundant rsp adjustments. - SET_IP delta encoding: incremental IP updates avoid redundant full address loads. - Hint-based mmap: jit_alloc() places JIT code near the CPython text segment for short (±2 GB) RIP-relative calls and LEAs. - jit_shrink(): releases unused pages at the end of each compiled trace. - emit_call_ext: emits direct RIP-relative call when target is within ±2 GB, otherwise falls back to indirect call through register. - emit_mov_imm: picks the shortest encoding (xor/mov32/mov64/lea rip) based on the runtime value. Freelist inlining (Tools/jit/jit.h + template.c): - Macro overrides redirect float/int allocation and deallocation to JIT-inlined versions that directly access the thread-state freelists, avoiding function call overhead for the most common object types. - _PyJIT_FloatFromDouble / _PyJIT_FloatDealloc: inline float freelist. - _PyJIT_LongDealloc / _PyJIT_FastDealloc: inline int/generic dealloc. - _PyJIT_CompactLong_{Add,Subtract,Multiply}: inline compact long ops. - PyStackRef_CLOSE / Py_DECREF overrides use the fast dealloc path. LuaJIT submodule: - Added as Tools/jit/LuaJIT for the DynASM assembler (dynasm/ only used at build time; no LuaJIT runtime code is linked). This is an experimental port, currently tested on x86_64 Linux only. The approach is a hybrid between Pyston's fully hand-written DynASM JIT (https://github.com/pyston/pyston/blob/pyston_main/Python/aot_ceval_jit.c) and CPython's Clang-generated stencils: Clang produces the stencil assembly, and DynASM handles encoding and relocation at runtime. 
--- .gitmodules | 3 + Lib/test/test_jit_peephole.py | 33 + Makefile.pre.in | 5 +- Python/jit.c | 798 ++++-------- Python/optimizer.c | 26 +- Tools/jit/LuaJIT | 1 + Tools/jit/_asm_to_dasc.py | 2093 +++++++++++++++++++++++++++++++ Tools/jit/_asm_to_dasc_amd64.py | 1464 +++++++++++++++++++++ Tools/jit/_dasc_writer.py | 448 +++++++ Tools/jit/_optimizers.py | 533 +++++--- Tools/jit/_schema.py | 1 + Tools/jit/_stencils.py | 28 +- Tools/jit/_targets.py | 269 +++- Tools/jit/_writer.py | 2 +- Tools/jit/build.py | 7 + Tools/jit/jit_fold_pass.cpp | 682 ++++++++++ Tools/jit/template.c | 412 +++++- Tools/jit/test_optimizers.py | 86 ++ Tools/jit/test_peephole.py | 687 ++++++++++ 19 files changed, 6797 insertions(+), 781 deletions(-) create mode 100644 .gitmodules create mode 100644 Lib/test/test_jit_peephole.py create mode 160000 Tools/jit/LuaJIT create mode 100644 Tools/jit/_asm_to_dasc.py create mode 100644 Tools/jit/_asm_to_dasc_amd64.py create mode 100644 Tools/jit/_dasc_writer.py create mode 100644 Tools/jit/jit_fold_pass.cpp create mode 100644 Tools/jit/test_optimizers.py create mode 100644 Tools/jit/test_peephole.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000000000..671c95aead403d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Tools/jit/LuaJIT"] + path = Tools/jit/LuaJIT + url = https://github.com/LuaJIT/LuaJIT.git diff --git a/Lib/test/test_jit_peephole.py b/Lib/test/test_jit_peephole.py new file mode 100644 index 00000000000000..bc0ee72d84b6c8 --- /dev/null +++ b/Lib/test/test_jit_peephole.py @@ -0,0 +1,33 @@ +"""Wrapper to run the JIT peephole optimizer tests via 'make test'. + +The actual tests live in Tools/jit/test_peephole.py. This module +adds Tools/jit to sys.path and imports the test cases so they are +discovered by the standard test runner. +""" + +import os +import sys +import unittest + +# Tools/jit is not on the default path — add it so test_peephole can +# import _asm_to_dasc. 
+_jit_tools_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "Tools", "jit", +) + +# Skip entirely if Tools/jit doesn't exist (e.g. minimal install). +if not os.path.isfile(os.path.join(_jit_tools_dir, "test_peephole.py")): + raise unittest.SkipTest("Tools/jit/test_peephole.py not found") + +_saved_path = sys.path[:] +try: + if _jit_tools_dir not in sys.path: + sys.path.insert(0, _jit_tools_dir) + # Import all test classes from the real test module. + from test_peephole import * # noqa: F401,F403 +finally: + sys.path[:] = _saved_path + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile.pre.in b/Makefile.pre.in index f4119abf324fca..1285a177768e3d 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -3173,6 +3173,7 @@ Python/emscripten_trampoline_wasm.c: Python/emscripten_trampoline_inner.wasm JIT_DEPS = \ $(srcdir)/Tools/jit/*.c \ $(srcdir)/Tools/jit/*.py \ + $(srcdir)/Tools/jit/LuaJIT/dynasm \ $(srcdir)/Python/executor_cases.c.h \ pyconfig.h @@ -3180,7 +3181,7 @@ jit_stencils.h @JIT_STENCILS_H@: $(JIT_DEPS) @REGEN_JIT_COMMAND@ Python/jit.o: $(srcdir)/Python/jit.c @JIT_STENCILS_H@ - $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< + $(CC) -c $(PY_CORE_CFLAGS) -I$(srcdir)/Tools/jit/LuaJIT/dynasm -o $@ $< .PHONY: regen-jit regen-jit: @@ -3305,7 +3306,7 @@ clean-profile: clean-retain-profile clean-bolt # gh-141808: The JIT stencils are deliberately kept in clean-profile .PHONY: clean-jit-stencils clean-jit-stencils: - -rm -f jit_stencils*.h + -rm -f jit_stencils*.h jit_stencils*.dasc .PHONY: clean clean: clean-profile clean-jit-stencils diff --git a/Python/jit.c b/Python/jit.c index 3e0a0aa8bfcc81..31b95755757b5a 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -30,6 +30,7 @@ #include "pycore_unicodeobject.h" #include "pycore_jit.h" +#include "pycore_uop_metadata.h" // Memory management stuff: //////////////////////////////////////////////////// @@ -103,13 +104,15 @@ _PyJIT_AddressInJitCode(PyInterpreterState 
*interp, uintptr_t addr) return 0; } +// Next mmap hint address for placing JIT code near CPython text. +// File-scope so jit_shrink() can rewind it when releasing unused pages. +#if defined(__linux__) && defined(__x86_64__) +static uintptr_t jit_next_hint = 0; +#endif + static unsigned char * jit_alloc(size_t size) { - if (size > PY_MAX_JIT_CODE_SIZE) { - jit_error("code too big; refactor bytecodes.c to keep uop size down, or reduce maximum trace length."); - return NULL; - } assert(size); assert(size % get_page_size() == 0); #ifdef MS_WINDOWS @@ -119,8 +122,30 @@ jit_alloc(size_t size) #else int flags = MAP_ANONYMOUS | MAP_PRIVATE; int prot = PROT_READ | PROT_WRITE; - unsigned char *memory = mmap(NULL, size, prot, flags, -1, 0); + void *hint = NULL; +#if defined(__linux__) && defined(__x86_64__) + // Allocate JIT code near CPython text so emit_call_ext and emit_mov_imm + // can use short RIP-relative encodings (within ±2GB). + { + if (jit_next_hint == 0) { + size_t page_size = get_page_size(); + extern char _end[]; + // Start 25MB after the end of CPython text, rounded up to the next page. + jit_next_hint = ((uintptr_t)_end + 25000000 + page_size - 1) & ~(uintptr_t)(page_size - 1); + } + hint = (void *)jit_next_hint; + } +#endif + unsigned char *memory = mmap(hint, size, prot, flags, -1, 0); + if (memory == MAP_FAILED && hint != NULL) { + memory = mmap(NULL, size, prot, flags, -1, 0); + } int failed = memory == MAP_FAILED; +#if defined(__linux__) && defined(__x86_64__) + if (!failed) { + jit_next_hint = (uintptr_t)memory + size; + } +#endif if (!failed) { (void)_PyAnnotateMemoryMap(memory, size, "cpython:jit"); } @@ -132,6 +157,30 @@ jit_alloc(size_t size) return memory; } +// Shrink a JIT allocation by releasing unused tail pages back to the OS. +// Updates jit_next_hint so the next allocation continues right after the +// trimmed region (avoids leaving gaps in the address space). 
+static void +jit_shrink(unsigned char *memory, size_t alloc_size, size_t used_size) +{ + assert(used_size <= alloc_size); + assert(used_size % get_page_size() == 0); + assert(alloc_size % get_page_size() == 0); + if (used_size < alloc_size) { +#ifdef MS_WINDOWS + VirtualFree(memory + used_size, alloc_size - used_size, MEM_DECOMMIT); +#else + munmap(memory + used_size, alloc_size - used_size); +#endif +#if defined(__linux__) && defined(__x86_64__) + // Rewind hint so the next allocation fills the gap we just freed. + if (jit_next_hint == (uintptr_t)memory + alloc_size) { + jit_next_hint = (uintptr_t)memory + used_size; + } +#endif + } +} + static int jit_free(unsigned char *memory, size_t size) { @@ -178,592 +227,257 @@ mark_executable(unsigned char *memory, size_t size) } // JIT compiler stuff: ///////////////////////////////////////////////////////// +// +// DynASM-based JIT: We use Clang to compile each uop template to optimized +// assembly at build time, convert the assembly to DynASM directives via +// _asm_to_dasc.py, and run the DynASM preprocessor (dynasm.lua) to produce +// jit_stencils.h containing an action list and per-uop emit functions. +// +// At runtime, DynASM's tiny encoding engine (dasm_x86.h) assembles the trace +// by replaying the action list with concrete operand values, resolving labels +// and branches automatically. This replaces the entire copy-and-patch +// relocation layer: no more patch_* functions, no trampolines, no GOT. + +#include "dasm_proto.h" + +// DynASM configuration: Dst is always dasm_State** passed as first argument +// to emit functions. 
+#define Dst_DECL dasm_State **Dst +#define Dst_REF (*Dst) + +#include "dasm_x86.h" +#include "jit_stencils.h" -#define GOT_SLOT_SIZE sizeof(uintptr_t) -#define SYMBOL_MASK_WORDS 8 - -typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS]; - -typedef struct { - unsigned char *mem; - symbol_mask mask; - size_t size; -} symbol_state; - -typedef struct { - symbol_state trampolines; - symbol_state got_symbols; - uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; -} jit_state; - -// Warning! AArch64 requires you to get your hands dirty. These are your gloves: - -// value[value_start : value_start + len] -static uint32_t -get_bits(uint64_t value, uint8_t value_start, uint8_t width) -{ - assert(width <= 32); - return (value >> value_start) & ((1ULL << width) - 1); -} - -// *loc[loc_start : loc_start + width] = value[value_start : value_start + width] +// Compiles executor in-place using DynASM. +// +// The DynASM flow: +// 1. Initialize DynASM state and pre-allocate PC labels for all uops +// plus their internal branch targets. +// 2. Emit each uop stencil via the generated emit_*() functions. These +// call dasm_put() to append encoded instructions to the action buffer, +// using PC labels for inter-uop jumps and DynASM sections for hot/cold +// code separation. +// 3. Append a _FATAL_ERROR sentinel after the last uop to catch overruns. +// 4. dasm_link() computes the final code layout and resolves all labels. +// 5. Allocate executable memory (page-aligned) and dasm_encode() into it. +// 6. Mark memory executable and shrink unused pages. +// +// This replaces the old copy-and-patch approach and eliminates all manual +// relocation patching, GOT/trampoline generation. + +/* Emit all uop stencils (Phase 3-4) into the DynASM state. + * + * Handles _SET_IP delta encoding, shared trace cleanup stubs, and the + * _FATAL_ERROR sentinel. 
+ */ static void -set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, - uint8_t width) -{ - assert(loc_start + width <= 32); - uint32_t temp_val; - // Use memcpy to safely read the value, avoiding potential alignment - // issues and strict aliasing violations. - memcpy(&temp_val, loc, sizeof(temp_val)); - // Clear the bits we're about to patch: - temp_val &= ~(((1ULL << width) - 1) << loc_start); - assert(get_bits(temp_val, loc_start, width) == 0); - // Patch the bits: - temp_val |= get_bits(value, value_start, width) << loc_start; - assert(get_bits(temp_val, loc_start, width) == get_bits(value, value_start, width)); - // Safely write the modified value back to memory. - memcpy(loc, &temp_val, sizeof(temp_val)); -} - -// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions -// for instruction encodings: -#define IS_AARCH64_ADD_OR_SUB(I) (((I) & 0x11C00000) == 0x11000000) -#define IS_AARCH64_ADRP(I) (((I) & 0x9F000000) == 0x90000000) -#define IS_AARCH64_BRANCH(I) (((I) & 0x7C000000) == 0x14000000) -#define IS_AARCH64_BRANCH_COND(I) (((I) & 0x7C000000) == 0x54000000) -#define IS_AARCH64_BRANCH_ZERO(I) (((I) & 0x7E000000) == 0x34000000) -#define IS_AARCH64_TEST_AND_BRANCH(I) (((I) & 0x7E000000) == 0x36000000) -#define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000) -#define IS_AARCH64_MOV(I) (((I) & 0x9F800000) == 0x92800000) - -// LLD is a great reference for performing relocations... just keep in -// mind that Tools/jit/build.py does filtering and preprocessing for us! 
-// Here's a good place to start for each platform: -// - aarch64-apple-darwin: -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h -// - aarch64-pc-windows-msvc: -// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp -// - aarch64-unknown-linux-gnu: -// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp -// - i686-pc-windows-msvc: -// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp -// - x86_64-apple-darwin: -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp -// - x86_64-pc-windows-msvc: -// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp -// - x86_64-unknown-linux-gnu: -// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp - - -// Get the symbol slot memory location for a given symbol ordinal. -static unsigned char * -get_symbol_slot(int ordinal, symbol_state *state, int size) -{ - const uint32_t symbol_mask = 1U << (ordinal % 32); - const uint32_t state_mask = state->mask[ordinal / 32]; - assert(symbol_mask & state_mask); - - // Count the number of set bits in the symbol mask lower than ordinal - size_t index = _Py_popcount32(state_mask & (symbol_mask - 1)); - for (int i = 0; i < ordinal / 32; i++) { - index += _Py_popcount32(state->mask[i]); - } - - unsigned char *slot = state->mem + index * size; - assert((size_t)(index + 1) * size <= state->size); - return slot; -} - -// Return the address of the GOT slot for the requested symbol ordinal. 
-static uintptr_t -got_symbol_address(int ordinal, jit_state *state) -{ - return (uintptr_t)get_symbol_slot(ordinal, &state->got_symbols, GOT_SLOT_SIZE); -} - -// Many of these patches are "relaxing", meaning that they can rewrite the -// code they're patching to be more efficient (like turning a 64-bit memory -// load into a 32-bit immediate load). These patches have an "x" in their name. -// Relative patches have an "r" in their name. - -// 32-bit absolute address. -void -patch_32(unsigned char *location, uint64_t value) -{ - // Check that we're not out of range of 32 unsigned bits: - assert(value < (1ULL << 32)); - uint32_t final_value = (uint32_t)value; - memcpy(location, &final_value, sizeof(final_value)); -} - -// 32-bit relative address. -void -patch_32r(unsigned char *location, uint64_t value) +emit_trace(dasm_State **Dst, + const _PyUOpInstruction *trace, size_t length) { - value -= (uintptr_t)location; - // Check that we're not out of range of 32 signed bits: - assert((int64_t)value >= -(1LL << 31)); - assert((int64_t)value < (1LL << 31)); - uint32_t final_value = (uint32_t)value; - memcpy(location, &final_value, sizeof(final_value)); -} - -// 64-bit absolute address. -void -patch_64(unsigned char *location, uint64_t value) -{ - memcpy(location, &value, sizeof(value)); -} + int sentinel_label = (int)length; + int label_base = sentinel_label + 1; + uintptr_t last_ip = 0; // track last _SET_IP value for delta encoding -// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r -// (below). -void -patch_aarch64_12(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); - // There might be an implicit shift encoded in the instruction: - uint8_t shift = 0; - if (IS_AARCH64_LDR_OR_STR(*loc32)) { - shift = (uint8_t)get_bits(*loc32, 30, 2); - // If both of these are set, the shift is supposed to be 4. 
- // That's pretty weird, and it's never actually been observed... - assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0); - } - value = get_bits(value, 0, 12); - assert(get_bits(value, 0, shift) == 0); - set_bits(loc32, 10, value, shift, 12); -} - -// Relaxable 12-bit low part of an absolute address. Pairs nicely with -// patch_aarch64_21rx (below). -void -patch_aarch64_12x(unsigned char *location, uint64_t value) -{ - // This can *only* be relaxed if it occurs immediately before a matching - // patch_aarch64_21rx. If that happens, the JIT build step will replace both - // calls with a single call to patch_aarch64_33rx. Otherwise, we end up - // here, and the instruction is patched normally: - patch_aarch64_12(location, value); -} - -// 16-bit low part of an absolute address. -void -patch_aarch64_16a(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 0 of 3"): - assert(get_bits(*loc32, 21, 2) == 0); - set_bits(loc32, 5, value, 0, 16); -} + emit_trace_entry_frame(Dst); -// 16-bit middle-low part of an absolute address. -void -patch_aarch64_16b(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 1 of 3"): - assert(get_bits(*loc32, 21, 2) == 1); - set_bits(loc32, 5, value, 16, 16); -} - -// 16-bit middle-high part of an absolute address. -void -patch_aarch64_16c(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 2 of 3"): - assert(get_bits(*loc32, 21, 2) == 2); - set_bits(loc32, 5, value, 32, 16); -} - -// 16-bit high part of an absolute address. 
-void -patch_aarch64_16d(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 3 of 3"): - assert(get_bits(*loc32, 21, 2) == 3); - set_bits(loc32, 5, value, 48, 16); -} - -// 21-bit count of pages between this page and an absolute address's page... I -// know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above). -void -patch_aarch64_21r(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - value = (value >> 12) - ((uintptr_t)location >> 12); - // Check that we're not out of range of 21 signed bits: - assert((int64_t)value >= -(1 << 20)); - assert((int64_t)value < (1 << 20)); - // value[0:2] goes in loc[29:31]: - set_bits(loc32, 29, value, 0, 2); - // value[2:21] goes in loc[5:26]: - set_bits(loc32, 5, value, 2, 19); -} - -// Relaxable 21-bit count of pages between this page and an absolute address's -// page. Pairs nicely with patch_aarch64_12x (above). -void -patch_aarch64_21rx(unsigned char *location, uint64_t value) -{ - // This can *only* be relaxed if it occurs immediately before a matching - // patch_aarch64_12x. If that happens, the JIT build step will replace both - // calls with a single call to patch_aarch64_33rx. Otherwise, we end up - // here, and the instruction is patched normally: - patch_aarch64_21r(location, value); -} - -// 21-bit relative branch. -void -patch_aarch64_19r(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_BRANCH_COND(*loc32) || IS_AARCH64_BRANCH_ZERO(*loc32)); - value -= (uintptr_t)location; - // Check that we're not out of range of 21 signed bits: - assert((int64_t)value >= -(1 << 20)); - assert((int64_t)value < (1 << 20)); - // Since instructions are 4-byte aligned, only use 19 bits: - assert(get_bits(value, 0, 2) == 0); - set_bits(loc32, 5, value, 2, 19); -} - -// 28-bit relative branch. 
-void -patch_aarch64_26r(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_BRANCH(*loc32)); - value -= (uintptr_t)location; - // Check that we're not out of range of 28 signed bits: - assert((int64_t)value >= -(1 << 27)); - assert((int64_t)value < (1 << 27)); - // Since instructions are 4-byte aligned, only use 26 bits: - assert(get_bits(value, 0, 2) == 0); - set_bits(loc32, 0, value, 2, 26); -} - -// A pair of patch_aarch64_21rx and patch_aarch64_12x. -void -patch_aarch64_33rx(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - // Try to relax the pair of GOT loads into an immediate value: - assert(IS_AARCH64_ADRP(*loc32)); - unsigned char reg = get_bits(loc32[0], 0, 5); - assert(IS_AARCH64_LDR_OR_STR(loc32[1])); - // There should be only one register involved: - assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. - assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. 
- uint64_t relaxed = *(uint64_t *)value; - if (relaxed < (1UL << 16)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - loc32[1] = 0xD503201F; - return; - } - if (relaxed < (1ULL << 32)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; - return; - } - int64_t page_delta = (relaxed >> 12) - ((uintptr_t)location >> 12); - if (page_delta >= -(1L << 20) && - page_delta < (1L << 20)) - { - // adrp reg, AAA; ldr reg, [reg + BBB] -> adrp reg, AAA; add reg, reg, BBB - patch_aarch64_21rx(location, relaxed); - loc32[1] = 0x91000000 | get_bits(relaxed, 0, 12) << 10 | reg << 5 | reg; - return; - } - relaxed = value - (uintptr_t)location; - if ((relaxed & 0x3) == 0 && - (int64_t)relaxed >= -(1L << 19) && - (int64_t)relaxed < (1L << 19)) - { - // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop - loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; - loc32[1] = 0xD503201F; - return; - } - // Couldn't do it. Just patch the two instructions normally: - patch_aarch64_21rx(location, value); - patch_aarch64_12x(location + 4, value); -} - -// Relaxable 32-bit relative address. 
-void -patch_x86_64_32rx(unsigned char *location, uint64_t value) -{ - uint8_t *loc8 = (uint8_t *)location; - // Try to relax the GOT load into an immediate value: - uint64_t relaxed; - memcpy(&relaxed, (void *)(value + 4), sizeof(relaxed)); - relaxed -= 4; - - if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && - (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) - { - if (loc8[-2] == 0x8B) { - // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] - loc8[-2] = 0x8D; - value = relaxed; + for (size_t i = 0; i < length; i++) { + const _PyUOpInstruction *instruction = &trace[i]; + int uop_label = (int)i; + int continue_label = (int)(i + 1); + + int opcode = instruction->opcode; + if ((opcode == _SET_IP_r00 || opcode == _SET_IP_r11 + || opcode == _SET_IP_r22 || opcode == _SET_IP_r33) + && last_ip != 0) + { + uintptr_t new_ip = (uintptr_t)instruction->operand0; + intptr_t delta = (intptr_t)(new_ip - last_ip); + if (delta != 0 + && delta >= INT32_MIN && delta <= INT32_MAX) + { + emit_set_ip_delta(Dst, uop_label, delta); + label_base += jit_internal_label_count(opcode); + last_ip = new_ip; + // SET_IP delta only modifies [r13+56], preserves rax + continue; + } } - else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { - // call qword ptr [rip + AAA] -> nop; call XXX - loc8[-2] = 0x90; - loc8[-1] = 0xE8; - value = relaxed; + + jit_emit_one(Dst, instruction->opcode, instruction, + uop_label, continue_label, label_base); + label_base += jit_internal_label_count(instruction->opcode); + if (opcode == _SET_IP_r00 || opcode == _SET_IP_r11 + || opcode == _SET_IP_r22 || opcode == _SET_IP_r33) + { + last_ip = (uintptr_t)instruction->operand0; } - else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { - // jmp qword ptr [rip + AAA] -> nop; jmp XXX - loc8[-2] = 0x90; - loc8[-1] = 0xE9; - value = relaxed; + else if (jit_invalidates_ip(opcode)) { + last_ip = 0; } } - patch_32r(location, value); -} -void patch_got_symbol(jit_state *state, int ordinal); -void 
patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state); -void patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state); - -#include "jit_stencils.h" - -#if defined(__aarch64__) || defined(_M_ARM64) - #define TRAMPOLINE_SIZE 16 - #define DATA_ALIGN 8 -#elif defined(__x86_64__) && defined(__APPLE__) - // LLVM 20 on macOS x86_64 debug builds: GOT entries may exceed ±2GB PC-relative - // range. - #define TRAMPOLINE_SIZE 16 // 14 bytes + 2 bytes padding for alignment - #define DATA_ALIGN 8 -#else - #define TRAMPOLINE_SIZE 0 - #define DATA_ALIGN 1 -#endif - -// Populate the GOT entry for the given symbol ordinal with its resolved address. -void -patch_got_symbol(jit_state *state, int ordinal) -{ - uint64_t value = (uintptr_t)symbols_map[ordinal]; - unsigned char *location = (unsigned char *)get_symbol_slot(ordinal, &state->got_symbols, GOT_SLOT_SIZE); - patch_64(location, value); -} - -// Generate and patch AArch64 trampolines. The symbols to jump to are stored -// in the jit_stencils.h in the symbols_map. -void -patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) -{ - - uint64_t value = (uintptr_t)symbols_map[ordinal]; - int64_t range = value - (uintptr_t)location; - - // If we are in range of 28 signed bits, we patch the instruction with - // the address of the symbol. 
- if (range >= -(1 << 27) && range < (1 << 27)) { - patch_aarch64_26r(location, (uintptr_t)value); - return; + // Emit _FATAL_ERROR sentinel after the last uop to catch overruns + { + _PyUOpInstruction sentinel = {0}; + sentinel.opcode = _FATAL_ERROR_r00; + int sentinel_continue = sentinel_label; + jit_emit_one(Dst, _FATAL_ERROR_r00, &sentinel, + sentinel_label, sentinel_continue, label_base); } - - // Out of range - need a trampoline - uint32_t *p = (uint32_t *)get_symbol_slot(ordinal, &state->trampolines, TRAMPOLINE_SIZE); - - /* Generate the trampoline - 0: 58000048 ldr x8, 8 - 4: d61f0100 br x8 - 8: 00000000 // The next two words contain the 64-bit address to jump to. - c: 00000000 - */ - p[0] = 0x58000048; - p[1] = 0xD61F0100; - p[2] = value & 0xffffffff; - p[3] = value >> 32; - - patch_aarch64_26r(location, (uintptr_t)p); -} - -// Generate and patch x86_64 trampolines. -void -patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state) -{ - uint64_t value = (uintptr_t)symbols_map[ordinal]; - int64_t range = (int64_t)value - 4 - (int64_t)location; - - // If we are in range of 32 signed bits, we can patch directly - if (range >= -(1LL << 31) && range < (1LL << 31)) { - patch_32r(location, value - 4); - return; - } - - // Out of range - need a trampoline - unsigned char *trampoline = get_symbol_slot(ordinal, &state->trampolines, TRAMPOLINE_SIZE); - - /* Generate the trampoline (14 bytes, padded to 16): - 0: ff 25 00 00 00 00 jmp *(%rip) - 6: XX XX XX XX XX XX XX XX (64-bit target address) - - Reference: https://wiki.osdev.org/X86-64_Instruction_Encoding#FF (JMP r/m64) - */ - trampoline[0] = 0xFF; - trampoline[1] = 0x25; - memset(trampoline + 2, 0, 4); - memcpy(trampoline + 6, &value, 8); - - // Patch the call site to call the trampoline instead - patch_32r(location, (uintptr_t)trampoline - 4); } +/* Initialize a DynASM state for trace compilation. 
*/ static void -combine_symbol_mask(const symbol_mask src, symbol_mask dest) +init_dasm(dasm_State **Dst, int total_labels) { - // Calculate the union of the trampolines required by each StencilGroup - for (size_t i = 0; i < SYMBOL_MASK_WORDS; i++) { - dest[i] |= src[i]; - } + dasm_init(Dst, DASM_MAXSECTION); + dasm_setup(Dst, jit_actionlist); + dasm_growpc(Dst, total_labels); } -// Compiles executor in-place. Don't forget to call _PyJIT_Free later! int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length) { - const StencilGroup *group; - // Loop once to find the total compiled size: - size_t code_size = 0; - size_t data_size = 0; - jit_state state = {0}; + // Phase 1: Count total PC labels needed. + // Labels [0..length-1] are uop entry points; additional labels are + // allocated for internal branch targets within each stencil. + int total_labels = (int)length; for (size_t i = 0; i < length; i++) { - const _PyUOpInstruction *instruction = &trace[i]; - group = &stencil_groups[instruction->opcode]; - state.instruction_starts[i] = code_size; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); - combine_symbol_mask(group->got_mask, state.got_symbols.mask); - } - group = &stencil_groups[_FATAL_ERROR_r00]; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); - combine_symbol_mask(group->got_mask, state.got_symbols.mask); - // Calculate the size of the trampolines required by the whole trace - for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) { - state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE; - } - for (size_t i = 0; i < Py_ARRAY_LENGTH(state.got_symbols.mask); i++) { - state.got_symbols.size += _Py_popcount32(state.got_symbols.mask[i]) * GOT_SLOT_SIZE; - } - // Round up to the nearest page: + total_labels += 
jit_internal_label_count(trace[i].opcode); + } + // One extra label for the _FATAL_ERROR sentinel. + total_labels += 1; + // Extra internal labels for _FATAL_ERROR + total_labels += jit_internal_label_count(_FATAL_ERROR_r00); + + // Phase 2–6: Single-pass JIT compilation. + // + // Allocate PY_MAX_JIT_CODE_SIZE up front. Since jit_alloc() places + // code near CPython text (via mmap hints on Linux x86-64), the real + // allocation address is always usable as jit_code_base — emit_mov_imm() + // and emit_call_ext() will use short RIP-relative encodings. + // + // After encoding, unused tail pages are released back to the OS and + // jit_next_hint is rewound so the next allocation fills the gap. + dasm_State *d; + size_t code_size; + int status; + size_t page_size = get_page_size(); assert((page_size & (page_size - 1)) == 0); - size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1)); - size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size) & (page_size - 1)); - size_t total_size = code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size + padding; - unsigned char *memory = jit_alloc(total_size); + size_t alloc_size = (PY_MAX_JIT_CODE_SIZE + page_size - 1) & ~(page_size - 1); + unsigned char *memory = jit_alloc(alloc_size); if (memory == NULL) { return -1; } + + jit_code_base = (uintptr_t)memory; + + init_dasm(&d, total_labels); + emit_trace(&d, trace, length); + status = dasm_link(&d, &code_size); + if (status != DASM_S_OK) { + jit_free(memory, alloc_size); + dasm_free(&d); + PyErr_Format(PyExc_RuntimeWarning, + "JIT DynASM link failed (status %d)", status); + return -1; + } + if (code_size > PY_MAX_JIT_CODE_SIZE) { + // Trace too large — give up on this trace. 
+ jit_free(memory, alloc_size); + dasm_free(&d); + jit_error("code too big; refactor bytecodes.c to keep uop size down, or reduce maximum trace length."); + return -1; + } + if (code_size > alloc_size) { + // Trace too large — give up on this trace. + jit_free(memory, alloc_size); + dasm_free(&d); + PyErr_Format(PyExc_RuntimeWarning, + "JIT code too large (%zu bytes)", code_size); + return -1; + } + + // Phase 7: Encode — writes final machine code into memory. + status = dasm_encode(&d, memory); + if (status != DASM_S_OK) { + jit_free(memory, alloc_size); + dasm_free(&d); + PyErr_Format(PyExc_RuntimeWarning, + "JIT DynASM encode failed (status %d)", status); + return -1; + } + + dasm_free(&d); + + // Release unused tail pages and rewind jit_next_hint. + size_t total_size = (code_size + page_size - 1) & ~(page_size - 1); + jit_shrink(memory, alloc_size, total_size); + // Collect memory stats OPT_STAT_ADD(jit_total_memory_size, total_size); OPT_STAT_ADD(jit_code_size, code_size); - OPT_STAT_ADD(jit_trampoline_size, state.trampolines.size); - OPT_STAT_ADD(jit_data_size, data_size); - OPT_STAT_ADD(jit_got_size, state.got_symbols.size); - OPT_STAT_ADD(jit_padding_size, padding); + OPT_STAT_ADD(jit_padding_size, total_size - code_size); OPT_HIST(total_size, trace_total_memory_hist); - // Update the offsets of each instruction: - for (size_t i = 0; i < length; i++) { - state.instruction_starts[i] += (uintptr_t)memory; - } - // Loop again to emit the code: - unsigned char *code = memory; - state.trampolines.mem = memory + code_size; - unsigned char *data = memory + code_size + state.trampolines.size + code_padding; - assert(trace[0].opcode == _START_EXECUTOR_r00 || trace[0].opcode == _COLD_EXIT_r00 || trace[0].opcode == _COLD_DYNAMIC_EXIT_r00); - state.got_symbols.mem = data + data_size; - for (size_t i = 0; i < length; i++) { - const _PyUOpInstruction *instruction = &trace[i]; - group = &stencil_groups[instruction->opcode]; - group->emit(code, data, executor, instruction, 
&state); - code += group->code_size; - data += group->data_size; - } - // Protect against accidental buffer overrun into data: - group = &stencil_groups[_FATAL_ERROR_r00]; - group->emit(code, data, executor, NULL, &state); - code += group->code_size; - data += group->data_size; - assert(code == memory + code_size); - assert(data == memory + code_size + state.trampolines.size + code_padding + data_size); + if (mark_executable(memory, total_size)) { jit_free(memory, total_size); return -1; } + executor->jit_code = memory; executor->jit_size = total_size; return 0; } -/* One-off compilation of the jit entry shim - * We compile this once only as it effectively a normal - * function, but we need to use the JIT because it needs - * to understand the jit-specific calling convention. - * Don't forget to call _PyJIT_Fini later! +/* One-off compilation of the jit entry shim. + * + * The shim bridges the native C calling convention to the JIT's internal + * calling convention. It is compiled once and shared across all traces. + * Uses DynASM just like trace compilation, but with a single emit_shim() + * call instead of a loop over uops. */ static _PyJitEntryFuncPtr compile_shim(void) { - _PyExecutorObject dummy; - const StencilGroup *group; - size_t code_size = 0; - size_t data_size = 0; - jit_state state = {0}; - group = &shim; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); - combine_symbol_mask(group->got_mask, state.got_symbols.mask); - // Round up to the nearest page: + int total_labels = 1 + jit_internal_label_count_shim(); + dasm_State *d; + size_t code_size; + int status; + + // The shim is tiny (~100 bytes). Allocate one page, compile once. 
size_t page_size = get_page_size(); - assert((page_size & (page_size - 1)) == 0); - size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1)); - size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size) & (page_size - 1)); - size_t total_size = code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size + padding; - unsigned char *memory = jit_alloc(total_size); + size_t alloc_size = page_size; + unsigned char *memory = jit_alloc(alloc_size); if (memory == NULL) { return NULL; } - unsigned char *code = memory; - state.trampolines.mem = memory + code_size; - unsigned char *data = memory + code_size + state.trampolines.size + code_padding; - state.got_symbols.mem = data + data_size; - // Compile the shim, which handles converting between the native - // calling convention and the calling convention used by jitted code - // (which may be different for efficiency reasons). 
- group = &shim; - group->emit(code, data, &dummy, NULL, &state); - code += group->code_size; - data += group->data_size; - assert(code == memory + code_size); - assert(data == memory + code_size + state.trampolines.size + code_padding + data_size); - if (mark_executable(memory, total_size)) { - jit_free(memory, total_size); + + jit_code_base = (uintptr_t)memory; + + init_dasm(&d, total_labels); + emit_shim(&d, 0, 1); + status = dasm_link(&d, &code_size); + if (status != DASM_S_OK) { + jit_free(memory, alloc_size); + dasm_free(&d); + return NULL; + } + assert(code_size <= alloc_size); + + status = dasm_encode(&d, memory); + dasm_free(&d); + if (status != DASM_S_OK) { + jit_free(memory, alloc_size); + return NULL; + } + + if (mark_executable(memory, alloc_size)) { + jit_free(memory, alloc_size); return NULL; } - _Py_jit_shim_size = total_size; + _Py_jit_shim_size = alloc_size; return (_PyJitEntryFuncPtr)memory; } diff --git a/Python/optimizer.c b/Python/optimizer.c index 466729b158d345..09936cafbb0a98 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1469,10 +1469,19 @@ stack_allocate(_PyUOpInstruction *buffer, _PyUOpInstruction *output, int length) if (uop == _NOP) { continue; } + if (uop <= 0 || uop > MAX_UOP_ID) { + return 0; + } int new_depth = _PyUop_Caching[uop].best[depth]; + if (new_depth < 0 || new_depth > MAX_CACHED_REGISTER) { + return 0; + } if (new_depth != depth) { - write->opcode = _PyUop_SpillsAndReloads[depth][new_depth]; - assert(write->opcode != 0); + uint16_t spill_reload = _PyUop_SpillsAndReloads[depth][new_depth]; + if (spill_reload == 0 || spill_reload > MAX_UOP_REGS_ID) { + return 0; + } + write->opcode = spill_reload; write->format = UOP_FORMAT_TARGET; write->oparg = 0; write->target = 0; @@ -1481,10 +1490,16 @@ stack_allocate(_PyUOpInstruction *buffer, _PyUOpInstruction *output, int length) } *write = buffer[i]; uint16_t new_opcode = _PyUop_Caching[uop].entries[depth].opcode; - assert(new_opcode != 0); + if (new_opcode == 0 || 
new_opcode > MAX_UOP_REGS_ID) { + return 0; + } write->opcode = new_opcode; write++; - depth = _PyUop_Caching[uop].entries[depth].output; + int output_depth = _PyUop_Caching[uop].entries[depth].output; + if (output_depth < 0 || output_depth > MAX_CACHED_REGISTER) { + return 0; + } + depth = output_depth; } return (int)(write - output); } @@ -1542,6 +1557,9 @@ uop_optimize( OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist); _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[0]; length = stack_allocate(buffer, output, length); + if (length <= 0) { + return 0; + } buffer = output; length = prepare_for_execution(buffer, length); assert(length <= UOP_MAX_TRACE_LENGTH); diff --git a/Tools/jit/LuaJIT b/Tools/jit/LuaJIT new file mode 160000 index 00000000000000..659a61693aa3b8 --- /dev/null +++ b/Tools/jit/LuaJIT @@ -0,0 +1 @@ +Subproject commit 659a61693aa3b87661864ad0f12eee14c865cd7f diff --git a/Tools/jit/_asm_to_dasc.py b/Tools/jit/_asm_to_dasc.py new file mode 100644 index 00000000000000..8062fe1b7c770f --- /dev/null +++ b/Tools/jit/_asm_to_dasc.py @@ -0,0 +1,2093 @@ +"""Convert Intel-syntax x86-64 assembly (from Clang) to DynASM .dasc format. + +This module transforms the optimized .s files produced by Clang (Intel syntax, +medium code model, -fno-pic -fno-plt) into DynASM directives suitable for the +DynASM Lua preprocessor (dynasm.lua). + +All labels (uop entry points, internal branch targets, JIT jump/error targets) +use DynASM PC labels (=>N), which are dynamically allocated. The label +numbering scheme is: + + [0 .. trace_len-1] : uop entry point labels + [trace_len .. trace_len+K-1] : internal stencil labels (allocated per-emit) + +External symbol references (function pointers, type addresses) use +``emit_call_ext()`` for direct calls and ``emit_mov_imm()`` for address loads, +both of which generate optimal encodings at JIT compile time. 
+ +JIT Register Roles +~~~~~~~~~~~~~~~~~~ + +The preserve_none calling convention assigns fixed register roles (see +REG_FRAME, REG_STACK_PTR, REG_TSTATE, REG_EXECUTOR constants below). +Frame struct offsets (FRAME_IP_OFFSET, FRAME_STACKPOINTER_OFFSET) are +also defined as constants to avoid hardcoded magic numbers. + +Peephole Optimization +~~~~~~~~~~~~~~~~~~~~~ + +After converting each stencil to DynASM assembly, a multi-pass peephole +optimizer folds emit_mov_imm sequences with subsequent instructions. +Since emit_mov_imm values are C expressions evaluated at JIT compile time, +folding allows moving work from runtime to compile time. + +Two categories of patterns: + + 1. **emit_mov_imm chain patterns** (Patterns 1-15): Start from an + emit_mov_imm call and attempt to fold the loaded value into subsequent + instructions (truncation, arithmetic, branch elimination, memory + indexing, ALU folding, etc.). Handled by ``_fold_mov_imm()``. + + 2. **Standalone patterns** (SP0-SP3): Independent patterns that operate + on raw DynASM assembly lines: + SP0 — Preserve flags across immediate loads + SP1 — Store-reload elimination (hot jcc fallthrough) + SP2 — Cold-path reload insertion (for __del__ safety) + SP3 — Inverted store-reload deferral (hot jcc jump-to-merge) + Registered in ``_STANDALONE_PATTERNS``. + +Use ``--peephole-stats`` in ``build.py`` to see how often each fires. + +Cross-Stencil Optimizations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some optimizations span stencil boundaries and are handled at JIT compile +time (in jit.c) rather than at build time in this module: + + - **Frame merging**: Consecutive stencils with matching frame sizes + can elide the epilogue/prologue pair. Managed by ``emit_trace()`` + in jit.c. + + - **SET_IP delta encoding**: When consecutive SET_IP values are close, + emit ``add qword [frame+56], delta`` instead of a full mov. 
+""" + +from __future__ import annotations + +import dataclasses +import enum +import re +import typing + + +# ── Register name mapping ─────────────────────────────────────────────── +# REX-prefix byte registers that DynASM doesn't natively understand. +# We teach DynASM these names via .define directives in the .dasc header +# (see _dasc_writer.py), so we keep them as-is in the assembly output +# for readability — no Rb(N) substitution needed. +_REX_BYTE_REGS = frozenset({ + "spl", "bpl", "sil", "dil", + "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b", +}) + +# Mapping from 64-bit register name → human-readable register index +# constant name (used for emit_mov_imm calls). Prefixed with JREG_ +# to avoid collisions with system headers (e.g. ucontext.h REG_R8). +_REG_IDX_NAME: dict[str, str] = { + "rax": "JREG_RAX", "rcx": "JREG_RCX", "rdx": "JREG_RDX", "rbx": "JREG_RBX", + "rsp": "JREG_RSP", "rbp": "JREG_RBP", "rsi": "JREG_RSI", "rdi": "JREG_RDI", + "r8": "JREG_R8", "r9": "JREG_R9", "r10": "JREG_R10", "r11": "JREG_R11", + "r12": "JREG_R12", "r13": "JREG_R13", "r14": "JREG_R14", "r15": "JREG_R15", +} + + +# ── _JIT_* symbol → C expression ─────────────────────────────────────── +_JIT_SYMBOL_EXPR: dict[str, str] = { + # The stencil template uses PATCH_VALUE(TYPE, NAME, ALIAS) which + # expands to ``TYPE NAME = (TYPE)(uintptr_t)&ALIAS;``. The compiler + # generates ``movabs REG, offset _JIT_*`` to load the symbol's + # address directly into a register — there is NO dereference. The + # original stencil JIT patches the movabs immediate with the VALUE + # itself (not a pointer), so here we emit the value too. 
+ "_JIT_OPERAND0": "instruction->operand0", + "_JIT_OPERAND1": "instruction->operand1", + "_JIT_OPARG": "instruction->oparg", + "_JIT_OPARG_16": "instruction->oparg", + "_JIT_OPERAND0_16": "instruction->operand0", + "_JIT_OPERAND0_32": "instruction->operand0", + "_JIT_OPERAND1_16": "instruction->operand1", + "_JIT_OPERAND1_32": "instruction->operand1", + "_JIT_TARGET": "instruction->target", +} + +# Map 64-bit register name → 32-bit register name. +_REG64_TO_REG32: dict[str, str] = { + "rax": "eax", "rbx": "ebx", "rcx": "ecx", "rdx": "edx", + "rsi": "esi", "rdi": "edi", "rbp": "ebp", "rsp": "esp", + "r8": "r8d", "r9": "r9d", "r10": "r10d", "r11": "r11d", + "r12": "r12d", "r13": "r13d", "r14": "r14d", "r15": "r15d", +} + +# Map 64-bit register name → DynASM register index for Rq()/Rd() macros. +_REG64_TO_IDX: dict[str, int] = { + "rax": 0, "rcx": 1, "rdx": 2, "rbx": 3, + "rsp": 4, "rbp": 5, "rsi": 6, "rdi": 7, + "r8": 8, "r9": 9, "r10": 10, "r11": 11, + "r12": 12, "r13": 13, "r14": 14, "r15": 15, +} + +# Map any register name (64-bit, 32-bit, 16-bit) to DynASM index +_ANY_REG_TO_IDX: dict[str, int] = {**_REG64_TO_IDX} +# Map any register name to the human-readable REG_* constant name +_ANY_REG_TO_NAME: dict[str, str] = {**_REG_IDX_NAME} +# 16-bit register names +_REG16_NAMES: dict[str, str] = { + "rax": "ax", "rbx": "bx", "rcx": "cx", "rdx": "dx", + "rsi": "si", "rdi": "di", "rbp": "bp", "rsp": "sp", + "r8": "r8w", "r9": "r9w", "r10": "r10w", "r11": "r11w", + "r12": "r12w", "r13": "r13w", "r14": "r14w", "r15": "r15w", +} +for _r64, _idx in list(_REG64_TO_IDX.items()): + _r32 = _REG64_TO_REG32[_r64] + _r16 = _REG16_NAMES[_r64] + _ANY_REG_TO_IDX[_r32] = _idx + _ANY_REG_TO_IDX[_r16] = _idx + _name = _REG_IDX_NAME[_r64] + _ANY_REG_TO_NAME[_r32] = _name + _ANY_REG_TO_NAME[_r16] = _name + +# Map register index → set of all alias names (for liveness analysis) +_IDX_TO_ALL_NAMES: dict[int, set[str]] = {} +for _name, _idx in _ANY_REG_TO_IDX.items(): + 
_IDX_TO_ALL_NAMES.setdefault(_idx, set()).add(_name) +# Add 8-bit register names manually +_8BIT_NAMES: dict[int, list[str]] = { + 0: ["al", "ah"], 1: ["cl", "ch"], 2: ["dl", "dh"], 3: ["bl", "bh"], + 4: ["spl"], 5: ["bpl"], 6: ["sil"], 7: ["dil"], + 8: ["r8b"], 9: ["r9b"], 10: ["r10b"], 11: ["r11b"], + 12: ["r12b"], 13: ["r13b"], 14: ["r14b"], 15: ["r15b"], +} +for _idx, _names in _8BIT_NAMES.items(): + for _n in _names: + _IDX_TO_ALL_NAMES.setdefault(_idx, set()).add(_n) + _ANY_REG_TO_IDX[_n] = _idx + +# ── Compiled regexes ─────────────────────────────────────────────────── + +# movabs REG, offset SYMBOL or movabs REG, offset SYMBOL+N +_RE_MOVABS = re.compile( + r"^\s*movabs\s+(\w+),\s*offset\s+([\w.]+)(?:\+(\d+))?\s*(?:#.*)?$" +) + +# movabs REG, IMM (plain integer immediate, no "offset" keyword) +_RE_MOVABS_IMM = re.compile( + r"^\s*movabs\s+(\w+),\s*(-?\d+)\s*(?:#.*)?$" +) + +# call/jmp qword ptr [rip + SYM@GOTPCREL] +_RE_GOTPCREL_CALL = re.compile( + r"^\s*(call|jmp)\s+qword\s+ptr\s+\[rip\s*\+\s*([\w.]+)@GOTPCREL\]\s*(?:#.*)?$" +) + +# Generic instruction with GOTPCREL in a memory operand +_RE_GOTPCREL_MEM = re.compile( + r"^(\s*\w+\s+)(.*?)(byte|word|dword|qword)\s+ptr\s+" + r"\[rip\s*\+\s*([\w.]+)@GOTPCREL\](.*?)$" +) + +# jmp/jcc to _JIT_JUMP_TARGET or _JIT_ERROR_TARGET +_RE_JIT_BRANCH = re.compile( + r"^\s*(j\w+)\s+(_JIT_JUMP_TARGET|_JIT_ERROR_TARGET)\s*(?:#.*)?$" +) + +# jmp/jcc to _JIT_CONTINUE or .L_JIT_CONTINUE +_RE_JIT_CONTINUE = re.compile( + r"^\s*(j\w+)\s+(?:\.L)?_JIT_CONTINUE\s*(?:#.*)?$" +) + +# Pattern for recognized local labels from LLVM (broad match for first pass) +# This matches any label-like definition that is NOT a _JIT_* special symbol. 
+_RE_ANY_LABEL_DEF = re.compile(r"^([\w.]+):\s*(?:#.*)?$")
+
+# Local branch: jmp/jcc/call to a non-_JIT_ label (matched dynamically)
+# (Compiled after the first pass discovers which labels are local)
+
+# Local label definitions (compiled after the first pass)
+# These label regexes are re-used later as local_map lookups
+
+_RE_ENTRY = re.compile(r"^_JIT_ENTRY:\s*(?:#.*)?$")
+_RE_CONTINUE_LABEL = re.compile(r"^(?:\.L)?_JIT_CONTINUE:\s*(?:#.*)?$")
+_RE_FUNC_END = re.compile(r"^\.Lfunc_end\d+:\s*$")
+
+# Section directives (hot text vs. LLVM-split cold text vs. rodata)
+_RE_COLD_SECTION = re.compile(r'^\s*\.section\s+(?:\.text\.cold|__llvm_cold)')
+_RE_TEXT_SECTION = re.compile(r"^\s*\.text\s*$")
+_RE_RODATA_SECTION = re.compile(r"^\s*\.section\s+\.l?rodata")
+
+# Data definitions inside the rodata section
+_RE_ASCIZ = re.compile(r'^\s*\.asciz\s+"(.*?)"')
+_RE_DATA_LABEL = re.compile(r"^(\.L[\w.]+):\s*(?:#.*)?$")
+_RE_BYTE_DATA = re.compile(r"^\s*\.(byte|short|long|quad)\s+(.*)")
+
+# Directives to skip entirely. NOTE(review): this also matches .p2align, so _RE_ALIGN below only fires if tried before _RE_SKIP — confirm scan order.
+_RE_SKIP = re.compile(
+    r"^\s*\.(file|globl|type|size|addrsig|addrsig_sym|hidden|ident|"
+    r"intel_syntax|section\s+\"\.note|p2align|cfi_\w+)\b"
+)
+
+_RE_BLANK = re.compile(r"^\s*(?:#.*)?$")
+_RE_ALIGN = re.compile(r"^\s*\.p2align\s+(\d+)")
+
+# LLVM JIT fold pass inline-asm markers (emitted as annotated nops).
+# Format: nop # @@JIT_MOV_IMM %reg, @@
+_RE_JIT_MARKER = re.compile(
+    r"^\s*nop\s+#\s*@@(JIT_MOV_IMM|JIT_TEST|JIT_CMP|JIT_FRAME_ANCHOR)(?:\s+(%?\w+)(?:,\s*(.+?))?)?@@\s*$"
+)
+
+# ── Peephole optimization patterns ────────────────────────────────
+
+# emit_mov_imm(Dst, REG_NAME_OR_IDX, EXPR);
+# emit_mov_imm_preserve_flags(Dst, REG_NAME_OR_IDX, EXPR);
+_RE_EMIT_MOV_IMM = re.compile(
+    r"^(\s*)emit_mov_imm(?:_preserve_flags)?\(Dst,\s*(\w+),\s*(.+?)\);$"
+)
+
+# ── Regexes for parse_line() ───────────────────────────────────────────
+#
+# These patterns are used by parse_line() to classify DynASM output lines
+# into typed Line objects (Asm, CCall, Label, Section, FuncDef, Blank).
+
+# C helper calls: emit_mov_imm(Dst, REG, EXPR);
+# emit_mov_imm_preserve_flags(Dst, REG, EXPR);
+# Like _RE_EMIT_MOV_IMM above, but a distinct pattern whose groups are (indent, helper name, args). NOTE(review): parse_line() matches against text.strip(), so the indent group is always empty there.
+_RE_C_CALL_MOV_IMM = re.compile(
+    r"^(\s*)(emit_mov_imm(?:_preserve_flags)?)\(Dst,\s*(.+)\);$"
+)
+# C helper calls: emit_call_ext(Dst, ARGS);
+_RE_C_CALL_EXT = re.compile(r"^(\s*)emit_call_ext\(Dst,\s*(.+)\);$")
+# C helper calls: emit_cmp_reg_imm(Dst, ARGS);
+_RE_C_CALL_CMP = re.compile(r"^(\s*)emit_cmp_reg_imm\(Dst,\s*(.+)\);$")
+# C helper calls: emit_cmp_mem64_imm(Dst, ARGS);
+_RE_C_CALL_CMP_MEM64 = re.compile(r"^(\s*)emit_cmp_mem64_imm\(Dst,\s*(.+)\);$")
+# C helper calls: emit_{test,and,or,xor}_reg_imm(Dst, ARGS);
+_RE_C_CALL_ALU = re.compile(
+    r"^(\s*)(emit_(?:test|and|or|xor)_reg_imm)\(Dst,\s*(.+)\);$"
+)
+
+# DynASM label definition: |=>LABEL_NAME:
+_RE_DASC_LABEL = re.compile(r"^\s*\|\s*=>\s*(.+?)\s*:\s*$")
+# DynASM section directive: |.code, |.cold, |.data
+_RE_DASC_SECTION = re.compile(r"^\s*\|\s*\.(code|cold|data)\b")
+# DynASM assembly instruction: | mnemonic [operands]
+_RE_ASM_LINE = re.compile(r"^\s*\|\s*(\w+)(?:\s+(.+))?\s*$")
+# Function definition: static void emit_OPNAME(...)
+_RE_DASC_FUNC_DEF = re.compile(r"^\s*static\s+void\s+emit_\w+\s*\(")
+
+# ── Typed operand and line classification ──────────────────────────────
+#
+# Instead of parsing lines into flat strings and then re-matching with
+# per-pattern regexes, we parse each line *once* into typed objects:
+#
+# Operand types: Reg, Mem, Imm (what instructions operate on)
+# Line types: Asm, CCall, Label, Section, FuncDef, Blank, CCode
+#
+# Pattern functions use Python 3.10+ structural pattern matching
+# (match/case) to destructure these objects directly. For example:
+#
+# match parse_line(text):
+# case Asm("mov", dst=Reg(name="rax"), src=Mem(size="qword")):
+# ... # handle mov rax, qword [...]
+# case CCall(kind=CCallKind.CALL_EXT):
+# ... # handle emit_call_ext(...)
+# +# This replaces the old LineKind enum + monolithic Line dataclass with +# proper typing that enables exhaustive matching and IDE autocompletion. +# +# Design principles: +# - Operand types are frozen (immutable, hashable) for safe matching +# - Each line type carries only the fields relevant to that type +# - Raw text preserved in every line type for output generation +# - Helper functions (is_call, is_branch, etc.) work across types + +# ── Operand types ────────────────────────────────────────────────────── + + +@dataclasses.dataclass(frozen=True, slots=True) +class Reg: + """Register operand (e.g., rax, eax, al, r14d). + + Attributes: + name: Register name as it appears in the assembly (case-preserved). + """ + + name: str + + @property + def idx(self) -> int | None: + """Canonical register index (0=rax, 1=rcx, ..., 15=r15).""" + return _ANY_REG_TO_IDX.get(self.name.lower()) + + @property + def bits(self) -> int: + """Register width in bits (8, 16, 32, or 64).""" + return _reg_bits(self.name) + + @property + def jreg(self) -> str | None: + """JREG_* constant name (e.g., "JREG_RAX"), or None.""" + return _ANY_REG_TO_NAME.get(self.name.lower()) + + def __str__(self) -> str: + return self.name + + +@dataclasses.dataclass(frozen=True, slots=True) +class Mem: + """Memory operand (e.g., qword [r14 + 8], byte [rax]). + + Attributes: + size: Size prefix ("qword", "dword", "word", "byte") or None. + base: Base register name, or None for complex addressing. + offset: Displacement (default 0). + index: Index register name, or None. + scale: Scale factor (1, 2, 4, 8), or None. + expr: Full bracket expression for output (e.g., "[r14 + 8]"). 
+ """ + + size: str | None + base: str | None + offset: int = 0 + index: str | None = None + scale: int | None = None + expr: str = "" + + def __str__(self) -> str: + return f"{self.size} {self.expr}" if self.size else self.expr + + +@dataclasses.dataclass(frozen=True, slots=True) +class Imm: + """Immediate operand (e.g., 42, -1, 0xff). + + Attributes: + value: Numeric value of the immediate. + text: Original text representation (preserved for output). + """ + + value: int + text: str = "" + + def __str__(self) -> str: + return self.text if self.text else str(self.value) + + +# Union of all operand types, for type annotations. +Op = Reg | Mem | Imm + + +# ── Line types ───────────────────────────────────────────────────────── + + +@enum.unique +class CCallKind(enum.Enum): + """Sub-classification for C helper calls.""" + + MOV_IMM = "emit_mov_imm" + CALL_EXT = "emit_call_ext" + CMP_REG_IMM = "emit_cmp_reg_imm" + CMP_MEM64_IMM = "emit_cmp_mem64_imm" + ALU_REG_IMM = "emit_alu_reg_imm" # test/and/or/xor_reg_imm + OTHER = "other" + + +@dataclasses.dataclass(slots=True) +class Asm: + """Assembly instruction (e.g., ``| mov rax, qword [r14 + 8]``). + + Attributes: + mnemonic: Instruction mnemonic (e.g., "mov", "cmp", "je"). + dst: First (destination) operand as typed Reg/Mem/Imm, or None. + src: Second (source) operand as typed Reg/Mem/Imm, or None. + target: Branch target for jmp/jcc (e.g., "=>L(3)"), or None. + raw: Original line text (preserved for output). + """ + + mnemonic: str + dst: Op | None = None + src: Op | None = None + target: str | None = None + raw: str = "" + + def __str__(self) -> str: + return self.raw + + +@dataclasses.dataclass(slots=True) +class CCall: + """C helper call (e.g., ``emit_call_ext(Dst, ...)``). + + Attributes: + kind: Which helper (MOV_IMM, CALL_EXT, CMP_REG_IMM). + helper: Helper function name as emitted in the C source. + args: Raw argument string inside parentheses. + argv: Parsed argument tokens split at top-level commas. 
+ indent: Leading whitespace (for replacement line generation). + raw: Original line text. + """ + + kind: CCallKind + helper: str = "" + args: str = "" + argv: tuple[str, ...] = () + indent: str = "" + raw: str = "" + + +@dataclasses.dataclass(slots=True) +class Label: + """Label definition (e.g., ``|=>L(3):``). + + Attributes: + name: Label identifier (e.g., "L(3)", "uop_label"). + raw: Original line text. + """ + + name: str + raw: str = "" + + +@dataclasses.dataclass(slots=True) +class Section: + """Section directive (e.g., ``|.code``, ``|.cold``). + + Attributes: + name: Section name ("code", "cold", or "data"). + raw: Original line text. + """ + + name: str + raw: str = "" + + +@dataclasses.dataclass(slots=True) +class FuncDef: + """Function definition (e.g., ``static void emit_BINARY_OP_...``). + + Attributes: + raw: Original line text. + """ + + raw: str = "" + + +@dataclasses.dataclass(slots=True) +class Blank: + """Empty line or comment. + + Attributes: + raw: Original line text. + """ + + raw: str = "" + + +@dataclasses.dataclass(slots=True) +class CCode: + """C code line (if/else/braces/etc.). + + Attributes: + raw: Original line text. + """ + + raw: str = "" + + +# Union of all line types — use as type annotation for parsed lines. +Line = Asm | CCall | Label | Section | FuncDef | Blank | CCode + + +# ── Operand parsing ─────────────────────────────────────────────────── + +_SIZE_PREFIXES = frozenset(("qword", "dword", "word", "byte")) + +_ALL_REGS = frozenset(_ANY_REG_TO_IDX.keys()) + +_RE_MEM_TERM_SCALED = re.compile(r"^(\d+)\s*\*\s*(\w+)$") +_RE_MEM_TERM_SCALED_REV = re.compile(r"^(\w+)\s*\*\s*(\d+)$") + + +def _parse_mem_expr(inner: str) -> tuple[str | None, int, str | None, int | None]: + """Parse the expression inside memory brackets. 
+ + Examples: + "r14" → ("r14", 0, None, None) + "r14 + 8" → ("r14", 8, None, None) + "r14 - 8" → ("r14", -8, None, None) + "rdi + rcx*4" → ("rdi", 0, "rcx", 4) + "rdi + 4*rcx + 8" → ("rdi", 8, "rcx", 4) + "rcx*8+0" → (None, 0, "rcx", 8) + """ + base = None + offset = 0 + index = None + scale = None + + # Split on + and - while preserving the sign operator + terms = re.split(r"\s*([+-])\s*", inner.strip()) + sign = 1 + for term in terms: + term = term.strip() + if not term: + continue + if term == "+": + sign = 1 + continue + if term == "-": + sign = -1 + continue + # scale*register (e.g., "4*rcx") + m = _RE_MEM_TERM_SCALED.match(term) + if m: + scale = int(m.group(1)) + index = m.group(2) + continue + # register*scale (e.g., "rcx*4") + m = _RE_MEM_TERM_SCALED_REV.match(term) + if m: + index = m.group(1) + scale = int(m.group(2)) + continue + # Numeric displacement + try: + offset += sign * int(term, 0) + continue + except ValueError: + pass + # Register — assign to base or index + if base is None: + base = term + elif index is None: + index = term + scale = 1 + + return base, offset, index, scale + + +def _parse_operand(text: str) -> Op: + """Parse a single operand string into a typed Reg, Mem, or Imm. + + Examples: + "rax" → Reg("rax") + "qword [r14 + 8]" → Mem(size="qword", base="r14", offset=8, ...) + "[rax]" → Mem(size=None, base="rax", ...) 
+ "42" → Imm(42, "42") + "-1" → Imm(-1, "-1") + """ + text = text.strip() + + # Memory operand: contains brackets + if "[" in text: + size = None + rest = text + for s in _SIZE_PREFIXES: + if rest.lower().startswith(s + " "): + size = s + rest = rest[len(s) :].strip() + break + # Extract bracket contents + bracket_start = rest.index("[") + bracket_end = rest.rindex("]") + inner = rest[bracket_start + 1 : bracket_end].strip() + expr = rest[bracket_start : bracket_end + 1] + base, offset, index, scale = _parse_mem_expr(inner) + return Mem( + size=size, + base=base, + offset=offset, + index=index, + scale=scale, + expr=expr, + ) + + # Register: check all known register names + if text.lower() in _ALL_REGS: + return Reg(text) + + # Immediate: try parsing as integer + try: + return Imm(int(text, 0), text) + except ValueError: + pass + + # DynASM label reference (e.g., "=>L(3)") — treat as Imm-like + # (used by branch targets, but normally handled via target field) + return Imm(0, text) + + +def _split_operands(operands: str) -> list[str]: + """Split operand string on commas, respecting brackets. + + ``"qword [r13 + 64], r14"`` → ``["qword [r13 + 64]", "r14"]`` + ``"rax"`` → ``["rax"]`` + ``"qword [rax + rbx*8 + 16]"`` → ``["qword [rax + rbx*8 + 16]"]`` + """ + parts: list[str] = [] + depth = 0 + start = 0 + for j, ch in enumerate(operands): + if ch == "[": + depth += 1 + elif ch == "]": + depth -= 1 + elif ch == "," and depth == 0: + parts.append(operands[start:j]) + start = j + 1 + parts.append(operands[start:]) + return parts + + +def _split_call_args(args: str) -> tuple[str, ...]: + """Split C helper arguments on top-level commas. + + Unlike ``_split_operands()``, this helper understands parentheses as well + as brackets so expressions like ``(uintptr_t)&PyType_Type`` stay intact. 
+ """ + parts: list[str] = [] + depth = 0 + start = 0 + for j, ch in enumerate(args): + if ch in "([": + depth += 1 + elif ch in ")]": + depth = max(0, depth - 1) + elif ch == "," and depth == 0: + parts.append(args[start:j].strip()) + start = j + 1 + parts.append(args[start:].strip()) + return tuple(part for part in parts if part) + + +# ── DynASM assembly line parser (produces typed Asm / CCall / etc.) ──── + +_BRANCH_MNEMONICS = frozenset(( + "jmp", "je", "jne", "jz", "jnz", "ja", "jae", "jb", "jbe", + "jg", "jge", "jl", "jle", "js", "jns", "jo", "jno", "jp", "jnp", +)) + + +def parse_line(text: str) -> Line: + """Parse a DynASM output line into a typed Line object. + + Returns one of: Asm, CCall, Label, Section, FuncDef, Blank, CCode. + Each type carries only the fields relevant to that line kind, with + structured operands (Reg/Mem/Imm) for assembly instructions. + + Classification priority: + 1. C helper calls (emit_mov_imm, emit_call_ext, emit_cmp_reg_imm) + 2. DynASM labels (|=>NAME:) + 3. DynASM section directives (|.code, |.cold) + 4. DynASM assembly instructions (| mnemonic ...) + 5. Function definitions (static void emit_...) + 6. Blanks / comments + 7. 
Everything else (C code) + """ + stripped = text.strip() + + # ── C helper calls ── + m = _RE_C_CALL_MOV_IMM.match(stripped) + if m: + args = m.group(3) + return CCall( + kind=CCallKind.MOV_IMM, + helper=m.group(2), + indent=m.group(1), + args=args, + argv=_split_call_args(args), + raw=text, + ) + m = _RE_C_CALL_EXT.match(stripped) + if m: + args = m.group(2) + return CCall( + kind=CCallKind.CALL_EXT, + helper="emit_call_ext", + indent=m.group(1), + args=args, + argv=_split_call_args(args), + raw=text, + ) + m = _RE_C_CALL_CMP.match(stripped) + if m: + args = m.group(2) + return CCall( + kind=CCallKind.CMP_REG_IMM, + helper="emit_cmp_reg_imm", + indent=m.group(1), + args=args, + argv=_split_call_args(args), + raw=text, + ) + m = _RE_C_CALL_CMP_MEM64.match(stripped) + if m: + args = m.group(2) + return CCall( + kind=CCallKind.CMP_MEM64_IMM, + helper="emit_cmp_mem64_imm", + indent=m.group(1), + args=args, + argv=_split_call_args(args), + raw=text, + ) + m = _RE_C_CALL_ALU.match(stripped) + if m: + args = m.group(3) + return CCall( + kind=CCallKind.ALU_REG_IMM, + helper=m.group(2), + indent=m.group(1), + args=args, + argv=_split_call_args(args), + raw=text, + ) + + # ── DynASM labels ── + m = _RE_DASC_LABEL.match(stripped) + if m: + return Label(name=m.group(1), raw=text) + + # ── DynASM section directives ── + m = _RE_DASC_SECTION.match(stripped) + if m: + return Section(name=m.group(1), raw=text) + + # ── DynASM assembly instructions ── + m = _RE_ASM_LINE.match(stripped) + if m: + mnemonic = m.group(1) + operands_str = m.group(2) + dst: Op | None = None + src: Op | None = None + target: str | None = None + + if operands_str: + # Branch instructions: operand is a target label, not a dst/src + if mnemonic in _BRANCH_MNEMONICS: + target = operands_str.strip() + else: + parts = _split_operands(operands_str) + if parts: + dst = _parse_operand(parts[0]) + if len(parts) > 1: + src = _parse_operand(parts[1]) + + return Asm(mnemonic=mnemonic, dst=dst, src=src, + 
target=target, raw=text)

    # ── Function definitions ──
    if _RE_DASC_FUNC_DEF.match(stripped):
        return FuncDef(raw=text)

    # ── Blanks / comments ──
    if not stripped or stripped.startswith("//"):
        return Blank(raw=text)

    # ── Everything else (C code) ──
    return CCode(raw=text)


def parse_lines(lines: list[str]) -> list[Line]:
    """Batch-parse a list of DynASM output lines into typed objects."""
    return [parse_line(text) for text in lines]


# ── Query helpers (work across the Line type hierarchy) ────────────────


def is_call(line: Line) -> bool:
    """Is this line a function call (ASM 'call' or C emit_call_ext)?"""
    match line:
        case CCall(kind=CCallKind.CALL_EXT):
            return True
        case Asm(mnemonic="call"):
            return True
    return False


def is_branch(line: Line) -> bool:
    """Is this a conditional jump (jne, je, jae, etc.)?"""
    # Any j* mnemonic except the unconditional "jmp" counts as a branch.
    match line:
        case Asm(mnemonic=m) if m.startswith("j") and m != "jmp":
            return True
    return False


def is_jump(line: Line) -> bool:
    """Is this an unconditional jmp?"""
    return isinstance(line, Asm) and line.mnemonic == "jmp"


def branch_target(line: Line) -> str | None:
    """Extract branch/jump target label (e.g., "=>L(3)"), or None."""
    match line:
        case Asm(target=t) if t and t.startswith("=>"):
            return t
    return None


def line_raw(line: Line) -> str:
    """Get the original text of any Line type."""
    return line.raw


@dataclasses.dataclass(frozen=True, slots=True)
class _LineEffect:
    """Structured dataflow summary for one parsed line."""

    # Canonical register indices read / fully written / partially written
    # by the instruction; partial writes (e.g. 8/16-bit sub-registers)
    # keep the rest of the register live.
    reads: frozenset[int] = dataclasses.field(default_factory=frozenset)
    full_writes: frozenset[int] = dataclasses.field(default_factory=frozenset)
    partial_writes: frozenset[int] = dataclasses.field(default_factory=frozenset)
    uses_flags: bool = False
    writes_flags: bool = False


@dataclasses.dataclass(frozen=True, slots=True)
class _BasicBlock:
    """Basic block inside one emitted stencil function."""

    # Half-open line range [start, end) into the parsed line list.
    start: int
    end: int
    labels: tuple[str, ...] = ()
    successors: tuple[int, ...] = ()


@dataclasses.dataclass(frozen=True, slots=True)
class _PeepholeFunction:
    """Structured view of one emitted stencil function."""

    start: int
    end: int
    blocks: tuple[_BasicBlock, ...] = ()


def _operand_regs(op: Op | None) -> frozenset[int]:
    """Registers read while evaluating an operand."""
    regs: set[int] = set()
    match op:
        case Reg(idx=idx) if idx is not None:
            regs.add(idx)
        case Mem(base=base, index=index):
            # Both the base and the index register contribute a read.
            for name in (base, index):
                if name is None:
                    continue
                idx = Reg(name).idx
                if idx is not None:
                    regs.add(idx)
    return frozenset(regs)


def _mem_uses_reg(mem: Mem, reg_idx: int) -> bool:
    """Does a memory address depend on the given canonical register?"""
    return reg_idx in _operand_regs(mem)


def _compute_c_depth(lines: list[Line]) -> list[int]:
    """Track inline-C nesting depth for each emitted line.

    A line ending in "{" opens a block; "}" closes one. "} else {" both
    closes and reopens, so its net effect on depth is zero.
    """
    c_depth = [0] * len(lines)
    depth = 0
    for i, line in enumerate(lines):
        stripped = line.raw.strip()
        if stripped.endswith("{"):
            depth += 1
        c_depth[i] = depth
        if stripped == "}" or stripped == "} else {":
            depth = max(0, depth - 1)
            c_depth[i] = depth
    return c_depth


def _build_blocks(
    parsed: list[Line],
    label_to_line: dict[str, int],
    start: int,
    end: int,
) -> tuple[_BasicBlock, ...]:
    """Split one stencil function into coarse basic blocks."""

    if start >= end:
        return ()

    # Block leaders: the function entry, every label/section line, and
    # every line following a branch, jump, or ret.
    starts = {start}
    for i in range(start + 1, end):
        if isinstance(parsed[i], (Label, Section)):
            starts.add(i)
        prev = parsed[i - 1]
        if isinstance(prev, Asm) and (is_branch(prev) or is_jump(prev) or prev.mnemonic == "ret"):
            starts.add(i)

    ordered = sorted(starts)
    blocks: list[_BasicBlock] = []
    for idx, block_start in enumerate(ordered):
        block_end = ordered[idx + 1] if idx + 1 < len(ordered) else end
        labels: list[str] = []
        j = block_start
        while j < block_end and isinstance(parsed[j], Label):
            labels.append(parsed[j].name)
            j += 1

        # Successor edges depend on the block's terminating instruction:
        # branch → fall-through + target, jmp → target only,
        # ret → none, anything else → fall-through.
        successors: list[int] = []
        last = parsed[block_end - 1]
        if isinstance(last, Asm):
            target = branch_target(last)
            if is_branch(last):
                if block_end < end:
                    successors.append(block_end)
                if target and target.startswith("=>"):
                    target_idx = label_to_line.get(target[2:])
                    if target_idx is not None:
                        successors.append(target_idx)
            elif is_jump(last):
                if target and target.startswith("=>"):
                    target_idx = label_to_line.get(target[2:])
                    if target_idx is not None:
                        successors.append(target_idx)
            elif last.mnemonic != "ret" and block_end < end:
                successors.append(block_end)
        elif block_end < end:
            successors.append(block_end)

        blocks.append(
            _BasicBlock(
                start=block_start,
                end=block_end,
                labels=tuple(labels),
                # dict.fromkeys deduplicates while preserving order.
                successors=tuple(dict.fromkeys(successors)),
            )
        )
    return tuple(blocks)


@dataclasses.dataclass(slots=True)
class _PeepholeProgram:
    """Parsed view of the current emitted DynASM function bodies."""

    lines: list[str]                       # raw text lines
    parsed: list[Line]                     # typed lines, parallel to `lines`
    c_depth: list[int]                     # inline-C nesting depth per line
    label_to_line: dict[str, int]          # label name → defining line index
    effects: list[_LineEffect]             # dataflow summary per line
    functions: tuple[_PeepholeFunction, ...]
    function_starts: frozenset[int]        # line indices of FuncDef lines
    function_end_by_line: list[int]        # enclosing function end per line

    @classmethod
    def from_lines(cls, lines: list[str]) -> "_PeepholeProgram":
        parsed = parse_lines(lines)
        c_depth = _compute_c_depth(parsed)
        label_to_line = {
            line.name: i for i, line in enumerate(parsed) if isinstance(line, Label)
        }
        effects = [_line_effect(line) for line in parsed]

        # Partition the line list into per-function ranges; with no
        # FuncDef lines the whole input is treated as one function.
        func_starts = [i for i, line in enumerate(parsed) if isinstance(line, FuncDef)]
        if not func_starts:
            func_starts = [0]
        ranges = [
            (
                start,
                func_starts[idx + 1]
                if idx + 1 < len(func_starts)
                else len(lines),
            )
            for idx, start in enumerate(func_starts)
        ]

        functions = tuple(
            _PeepholeFunction(
                start=start,
                end=end,
                blocks=_build_blocks(parsed, label_to_line, start, end),
            )
            for start, end in ranges
        )
        function_end_by_line = [len(lines)] * len(lines)
        for start, end in ranges:
            for i in range(start, end):
                function_end_by_line[i] = end

        return cls(
            lines=lines,
            parsed=parsed,
            c_depth=c_depth,
            label_to_line=label_to_line,
            effects=effects,
            functions=functions,
            function_starts=frozenset(func_starts),
            function_end_by_line=function_end_by_line,
        )

    def reg_dead_after(self, start: int, reg_idx: int) -> bool:
        """Control-flow-aware deadness query using structured line effects.

        Walks forward from ``start`` along all reachable paths within the
        enclosing function. The register is dead iff no path reads it (or
        partially writes it) before fully overwriting it.
        """
        if start >= len(self.lines):
            return True
        if reg_idx not in _IDX_TO_ALL_NAMES:
            return False

        func_end = self.function_end_by_line[start]
        start_depth = self.c_depth[start]
        visited: set[int] = set()
        worklist = [start]

        while worklist:
            pos = worklist.pop()
            while pos < func_end:
                if pos in visited:
                    break
                visited.add(pos)

                effect = self.effects[pos]
                # A read (or a partial write, which preserves the upper
                # bits) means the old value is still needed → live.
                if reg_idx in effect.reads or reg_idx in effect.partial_writes:
                    return False
                # A full write kills the old value on this path.
                if reg_idx in effect.full_writes:
                    break

                successors = self.successors(pos, start_depth)
                if not successors:
                    break
                fallthrough = pos + 1
                if len(successors) == 1 and successors[0] == fallthrough:
                    pos = fallthrough
                    continue
                # Queue the extra successors; keep walking the first one.
                for succ in successors[1:]:
                    if succ < func_end:
                        worklist.append(succ)
                pos = successors[0]

        return True

    def successors(self, pos: int, start_depth: int) -> tuple[int, ...]:
        """Reachable successors from one line in a deadness query.

        When the line sits inside a deeper inline-C block than the query
        origin (``in_c_block``), the surrounding C condition may skip it,
        so the plain fall-through is conservatively kept as a successor
        even after jmp/ret.
        """
        if pos >= len(self.parsed):
            return ()
        line = self.parsed[pos]
        func_end = self.function_end_by_line[pos]
        next_pos = pos + 1
        in_c_block = self.c_depth[pos] > start_depth

        match line:
            case Asm(mnemonic="jmp", target=target):
                if target and target.startswith("=>"):
                    target_idx = self.label_to_line.get(target[2:])
                    if target_idx is not None:
                        # +1 skips the label definition line itself.
                        if in_c_block and next_pos < func_end:
                            return (next_pos, target_idx + 1)
                        return (target_idx + 1,)
                if in_c_block and next_pos < func_end:
                    return (next_pos,)
                return ()
            case Asm(mnemonic="ret"):
                if in_c_block and next_pos < func_end:
                    return (next_pos,)
                return ()
            case Asm() if is_branch(line):
                succs: list[int] = []
                if next_pos < func_end:
                    succs.append(next_pos)
                target = branch_target(line)
                if target and target.startswith("=>"):
                    target_idx = self.label_to_line.get(target[2:])
                    if target_idx is not None:
                        succs.append(target_idx + 1)
                return tuple(succs)
            case _:
                if next_pos < func_end:
                    return (next_pos,)
                return ()


def fmt_op(op: Op) -> str:
    """Format a typed operand back to assembly text."""
    match op:
        case Reg(name=n):
            return n
        case Mem(size=s, expr=e):
            return f"{s} {e}" if s else e
        case Imm(text=t) if t:
            return t
        case Imm(value=v):
            return str(v)


# Operand size in bits based on register name (for cast selection)
def _reg_bits(name: str) -> int:
    """Return the operand size in bits for a register name."""
    name = name.lower()
    # 8-bit: classic byte registers plus r8b-r15b (".endswith('b')").
    if name in ("al", "ah", "bl", "bh", "cl", "ch", "dl", "dh",
                "spl", "bpl", "sil", "dil") or name.endswith("b"):
        return 8
    # 16-bit: classic word registers plus r8w-r15w.
    if name in ("ax", "bx", "cx", "dx", "si", "di", "bp", "sp") \
            or name.endswith("w"):
        return 16
    # 32-bit: eax-edi plus r8d-r15d.
    if name.startswith("e") or (name.startswith("r") and name.endswith("d")):
        return 32
    return 64


# ── Peephole optimizer — pattern-based architecture ────────────────────
#
# Inspired by _optimizers.py's clean separation of concerns, the peephole
# operates as a *pattern registry* with a simple driver loop. Each pattern
# is a small, self-contained function that examines lines at position ``i``
# and returns a ``_Match`` (consumed lines + output lines) or ``None``.
#
# Two categories of patterns:
#
# 1. **emit_mov_imm chain patterns** — These fire when the current line
#    is ``emit_mov_imm(Dst, REG, EXPR);`` and try to fold subsequent
#    instructions into the immediate expression. They compose: e.g.
#    mov_imm → movzx → shl chains. Handled by ``_fold_mov_imm()``.
#
# 2. **Standalone patterns** — Independent patterns that operate on raw
#    DynASM assembly lines (e.g. store-reload elimination). Each is a
#    function registered in ``_STANDALONE_PATTERNS``.
#
# Adding a new pattern: write a function matching the ``_PatternFn``
# signature, add it to ``_STANDALONE_PATTERNS``, done.


@dataclasses.dataclass
class _Match:
    """Result of a successful pattern match."""
    consumed: int       # number of input lines consumed
    output: list[str]   # replacement lines to emit


@dataclasses.dataclass
class _FoldCtx:
    """Shared context for emit_mov_imm fold patterns.

    Bundles the parameters that every ``_try_*`` function needs, eliminating
    the 5 different call signatures that previously required a dispatch table
    in ``_fold_mov_imm``. All terminal ``_try_*`` functions now take a single
    ``ctx: _FoldCtx`` argument and return ``_Match | None``.

    The ``parsed`` list contains typed Line objects (Asm, CCall, etc.)
    parallel to ``lines``. Pattern functions use ``ctx.cur`` to get the
    current typed instruction and ``match``/``case`` for destructuring.
    """
    program: _PeepholeProgram  # structured program for effects/CFG queries
    lines: list[str]           # all input lines (raw text)
    parsed: list[Line]         # typed Line objects (parallel to lines)
    i: int                     # current look-ahead position
    src_idx: int               # register index (0=RAX, 1=RCX, etc.)
    src_name: str              # "JREG_RAX" etc.
    indent: str                # indentation from the emit_mov_imm
    expr: str                  # current expression (mutated by modifier phases)

    @property
    def cur(self) -> Line:
        """The parsed Line at the current look-ahead position."""
        return self.parsed[self.i]


@dataclasses.dataclass
class _PeepholeState:
    """Mutable state carried across a single peephole pass.

    Reset at function boundaries (``static void emit_...``) to avoid
    cross-stencil interference.
    """

    # Merge labels where we eliminated a stackpointer reload;
    # cold-path jumps back to these need a reload inserted.
    need_reload_before_jmp: set[str] = dataclasses.field(default_factory=set)

    def reset(self) -> None:
        """Reset per-function state at stencil boundaries."""
        self.need_reload_before_jmp.clear()


# Signature of one pipeline step: (program, position, output-so-far,
# state) → lines consumed, or None when the pass does not fire.
_PeepholePassFn = typing.Callable[
    [_PeepholeProgram, int, list[str], _PeepholeState],
    int | None,
]


@dataclasses.dataclass(frozen=True, slots=True)
class _PeepholePass:
    """One pass step in the peephole pipeline."""

    name: str
    apply: _PeepholePassFn


# ── Peephole statistics ────────────────────────────────────────────────
#
# Each pattern increments its counter when it fires. The stats are
# printed at the end of stencil conversion if PEEPHOLE_STATS is True.

PEEPHOLE_STATS = False  # set True or use --peephole-stats to see counts

_peephole_counts: dict[str, int] = {
    "P6_indexed_mem": 0,
    "P8_alu_imm_fold": 0,
    "P12_store_imm": 0,
    "P13_dead_null_check": 0,
    "P14_test_memory_fold": 0,
    "P15_shift_fold": 0,
    "P16_two_mov_add": 0,
    "P17_lea_fold": 0,
    "P18_dead_frame_anchor": 0,
    "P19_inverse_mov_restore": 0,
    "SP0_preserve_flags_mov_imm": 0,
    "SP1_store_reload_elim": 0,
    "SP2_cold_reload_insert": 0,
    "SP3_inverted_store_reload": 0,
    "dead_label_elim": 0,
    "LLVM_fold_marker": 0,
}


def _stat(name: str) -> None:
    """Increment a peephole pattern counter."""
    _peephole_counts[name] = _peephole_counts.get(name, 0) + 1


def get_peephole_stats() -> dict[str, int]:
    """Return current peephole statistics (pattern name → fire count)."""
    return dict(_peephole_counts)


def reset_peephole_stats() -> None:
    """Reset all peephole counters to zero."""
    for key in _peephole_counts:
        _peephole_counts[key] = 0


def print_peephole_stats() -> None:
    """Print peephole statistics to stderr."""
    import sys

    total = sum(_peephole_counts.values())
    if total == 0:
        return
    print(f"\nPeephole optimization statistics ({total} total):", file=sys.stderr)
    for name, count in sorted(_peephole_counts.items()):
        if count > 0:
            print(f"  {name:30s}: {count:5d}", file=sys.stderr)


# ── x86-64 specific imports ────────────────────────────────────────────
# Import architecture-specific peephole patterns, instruction effects,
# and calling convention constants from the x86-64 module.
# These are re-exported for backward compatibility with existing consumers
# (build.py, _dasc_writer.py, _targets.py, test_peephole.py).
#
# This import is placed here (after all generic definitions) because the
# amd64 module imports types and infrastructure from this file. By this
# point all those symbols are defined, so the circular import resolves
# cleanly.
from _asm_to_dasc_amd64 import (  # noqa: E402
    # Calling convention
    REG_FRAME, REG_STACK_PTR, REG_TSTATE, REG_EXECUTOR,
    FRAME_IP_OFFSET, FRAME_STACKPOINTER_OFFSET,
    _C, _SP_STORE, _SP_RELOAD, _SP_RELOAD_LINE,
    # JREG mapping
    _JREG_TO_IDX, _IDX_TO_JREG, _parse_jreg, _jreg_arg_index,
    # Instruction effects
    _line_effect, _reg_write_sets,
    # Pattern helpers
    _reg_dead_after,
    uses_reg, is_store_sp, is_reload_sp,
    _is_flag_writer, _is_flag_consumer,
    _preserve_flags_mov_imm, _parse_emit_mov_imm_call,
    # Pass registry
    _PEEPHOLE_PASSES,
)


# ── Driver ─────────────────────────────────────────────────────────────


def _peephole_pass(lines: list[str]) -> tuple[list[str], bool]:
    """Single pass of peephole optimization. Returns (result, changed).

    Parses all input lines once into a structured program with typed lines,
    helper-call arguments, block boundaries, and line effects. Registered
    passes then match against that program rather than raw-text regex state.
    """
    changed = False
    result: list[str] = []
    state = _PeepholeState()
    program = _PeepholeProgram.from_lines(lines)
    i = 0
    while i < len(lines):
        # Reset per-function state at stencil boundaries
        if i in program.function_starts:
            state.reset()

        matched = False
        for peephole_pass in _PEEPHOLE_PASSES:
            advance = peephole_pass.apply(program, i, result, state)
            if advance is not None:
                # Pass consumed `advance` input lines (output already
                # appended to `result` by the pass itself).
                i += advance
                changed = True
                matched = True
                break
        if matched:
            continue

        # No pattern matched — pass through
        result.append(lines[i])
        i += 1
    return result, changed


def _peephole_optimize(lines: list[str]) -> list[str]:
    """Apply peephole optimizations until fixpoint (max 5 passes).

    Multi-pass iteration enables chained optimizations: e.g. Pattern 10
    creates a new emit_mov_imm that Pattern 6 can then consume.
    """
    for _pass in range(5):
        result, changed = _peephole_pass(lines)
        if not changed:
            break
        lines = result
    else:
        lines = result
    return _eliminate_dead_labels(lines)


def _eliminate_dead_labels(lines: list[str]) -> list[str]:
    """Remove labels that are defined but never jumped to.

    Many stencils contain structural labels (e.g. |=>L(2):) that serve as
    fall-through targets but are never referenced by any jump instruction.
    These dead labels clutter the assembly output and make it harder to read.

    Uses ``parse_lines()`` to identify labels by type, avoiding ad-hoc regex
    duplication.
    """
    parsed = parse_lines(lines)
    # Collect all L(N) references (non-definition uses).
    # A reference is any occurrence of =>L(N) that is NOT a label def.
    # Join with newlines so the end of one line cannot fuse with the start
    # of the next: with plain concatenation a line ending in "=>L(N)"
    # followed by a line starting with ":" would wrongly be classified as
    # a definition by the (?!:) lookahead and the label dropped.
    text = "\n".join(lines)
    referenced = set(re.findall(r"=>L\((\d+)\)(?!:)", text))
    result: list[str] = []
    removed = 0
    for line_obj in parsed:
        # Only remove L(N) labels; leave uop_label, continue_label etc.
        if isinstance(line_obj, Label):
            m = re.match(r"L\((\d+)\)", line_obj.name)
            if m and m.group(1) not in referenced:
                removed += 1
                continue
        result.append(line_obj.raw)
    for _ in range(removed):
        _stat("dead_label_elim")
    return result


# ── Data structures ────────────────────────────────────────────────────


@dataclasses.dataclass
class DataItem:
    """A data blob from a .rodata section (e.g. an assert filename string)."""

    label: str
    data: bytearray = dataclasses.field(default_factory=bytearray)


@dataclasses.dataclass
class ConvertedStencil:
    """Result of converting one stencil to DynASM lines."""

    opname: str
    lines: list[str]
    # Number of internal PC labels needed by this stencil (excludes GOT)
    num_internal_labels: int
    # Data items from .rodata sections
    data_items: list[DataItem]
    # Stack frame size requested by the stencil's function prologue
    # (push rbp; [mov rbp, rsp;] sub rsp, N), or 0 if the stencil does not
    # use a standard entry frame. The runtime allocates the maximum such
    # frame once in the shim and the per-stencil prologue/epilogue is then
    # stripped from the emitted DynASM.
    frame_size: int = 0


# ── Helpers ─────────────────────────────────────────────────────────────

# x86 register names used to distinguish scale*reg from scale*immediate
_X86_REGS = frozenset([
    "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
    "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
    "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
    "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
    "ax", "cx", "dx", "bx", "sp", "bp", "si", "di",
])

def _swap_scale_reg(m: re.Match) -> str:
    """Swap scale*register to register*scale in SIB memory operands."""
    scale, name = m.group(1), m.group(2)
    if name.lower() in _X86_REGS:
        return f"{name}*{scale}"
    # Not a register name (scale*immediate) — leave the text untouched.
    return m.group(0)


def _fix_syntax(text: str) -> str:
    """Apply DynASM syntax adjustments to an instruction."""
    # Convert tabs to spaces
    text = text.replace("\t", " ")
    # Strip inline comments (# ...)
    text = re.sub(r"\s*#.*$", "", text)
    # Convert xmmword to oword (DynASM uses oword for 128-bit)
    text = re.sub(r"\bxmmword\b", "oword", text)
    # Remove 'ptr' from size specifiers
    text = re.sub(
        r"\b(byte|word|dword|qword|tword|oword)\s+ptr\b", r"\1", text
    )
    # REX-prefix byte register names (sil, dil, bpl, spl, r8b-r15b) are
    # handled by .define directives in the .dasc header — keep them as-is
    # for readability.
    # Remove instruction suffixes (callq→call etc.)
    text = re.sub(r"\b(call|ret|push|pop|jmp)q\b", r"\1", text)
    # Convert negative byte immediates to unsigned (test byte [...], -128 → 0x80)
    text = _fix_negative_byte_imm(text)
    # DynASM requires explicit shift count: shr reg → shr reg, 1
    text = re.sub(
        r"\b(shr|shl|sar|sal|ror|rol|rcr|rcl)\s+(\w+)\s*$", r"\1 \2, 1", text
    )
    # DynASM requires register*scale, not scale*register in memory operands
    # e.g.
[8*rcx] → [rcx*8], [rdi + 8*rax + 80] → [rdi + rax*8 + 80] + text = re.sub(r"(\d+)\*(\w+)", _swap_scale_reg, text) + # DynASM can't encode SIB-only addressing [reg*scale] without a base register. + # Add explicit +0 displacement: [reg*N] → [reg*N+0] + text = re.sub(r"\[(\w+\*\d+)\]", r"[\1+0]", text) + # DynASM uses "movd" for both 32-bit and 64-bit GPR<->XMM transfers. + # When the GPR is 64-bit (rax, rcx, etc.), DynASM infers REX.W from the + # register name, producing the correct movq encoding. + # E.g. "movq rax, xmm1" → "movd rax, xmm1" + text = re.sub(r"\bmovq\s+(r\w+),\s*(xmm\d+)", r"movd \1, \2", text) + text = re.sub(r"\bmovq\s+(xmm\d+),\s*(r\w+)", r"movd \1, \2", text) + # Normalize whitespace + text = " ".join(text.split()) + return text + + +def _fix_negative_byte_imm(text: str) -> str: + """Convert negative immediates in byte operations to unsigned form. + + DynASM requires unsigned immediates for byte-sized operations. + E.g. ``test byte [...], -128`` → ``test byte [...], 128`` + """ + m = re.match(r"^(.+\bbyte\b.+,\s*)(-\d+)\s*$", text) + if m: + val = int(m.group(2)) + if -128 <= val < 0: + return f"{m.group(1)}{val & 0xFF}" + return text + + +def _jit_expr(symbol: str, offset: int = 0) -> str: + """C expression for a _JIT_* symbol value.""" + base = _JIT_SYMBOL_EXPR.get(symbol) + if base is None: + raise ValueError(f"Unknown _JIT_ symbol: {symbol}") + if offset: + return f"((uintptr_t)({base}) + {offset})" + return f"(uintptr_t)({base})" + + +# ── Main conversion ──────────────────────────────────────────────────── + + +def convert_stencil(opname: str, assembly: str, *, is_shim: bool = False) -> ConvertedStencil: + """Convert one stencil's optimized Intel-syntax .s to DynASM lines. + + Internal branch targets use PC labels relative to ``label_base``. 
+ """ + lines: list[str] = [] + # Map local label name → internal index (0-based) + local_map: dict[str, int] = {} + data_items: list[DataItem] = [] + counter = 0 + + in_rodata = False + cur_data: DataItem | None = None + + def _local(name: str) -> int: + nonlocal counter + if name not in local_map: + local_map[name] = counter + counter += 1 + return local_map[name] + + def _flush_data(): + nonlocal cur_data + if cur_data and cur_data.data: + data_items.append(cur_data) + cur_data = None + + # Special labels we handle separately (not local) + _SPECIAL_LABELS = { + "_JIT_ENTRY", "_JIT_CONTINUE", ".L_JIT_CONTINUE", + } + + # First pass: discover all label definitions and branch/call targets + # to determine which ones are local (internal to this stencil) + all_label_defs: set[str] = set() + all_branch_targets: set[str] = set() + for raw in assembly.splitlines(): + if m := _RE_ANY_LABEL_DEF.match(raw): + label = m.group(1) + if label not in _SPECIAL_LABELS: + all_label_defs.add(label) + # Find branch targets (labels only, not register names) + m_br = re.match(r"^\s*(j\w+|call)\s+([\w.]+)\s*(?:#.*)?$", raw) + if m_br: + target = m_br.group(2) + if target not in _SPECIAL_LABELS and not target.startswith("_JIT_"): + all_branch_targets.add(target) + + # Local labels are those with actual definitions in this stencil. + # Only include branch targets that have matching definitions — + # targets without definitions are register-indirect branches + # (e.g., "call rax", "jmp r11") and not real label references. 
+ local_labels = all_label_defs + for label in sorted(local_labels): + _local(label) + + # Second pass: emit DynASM lines + _FRAME_ANCHOR_MARKER = " // __JIT_FRAME_ANCHOR__" + cur_section = "code" # track current section for data entry restoration + for raw in assembly.splitlines(): + if _RE_BLANK.match(raw): + continue + if _RE_SKIP.match(raw): + continue + + # ── rodata collection ── + if _RE_RODATA_SECTION.match(raw): + in_rodata = True + continue + + if in_rodata: + if _RE_TEXT_SECTION.match(raw) or _RE_COLD_SECTION.match(raw): + _flush_data() + in_rodata = False + # fall through to handle section switch + elif m := _RE_DATA_LABEL.match(raw): + _flush_data() + cur_data = DataItem(label=m.group(1)) + continue + elif m := _RE_ASCIZ.match(raw): + if cur_data is not None: + s = ( + m.group(1) + .encode("raw_unicode_escape") + .decode("unicode_escape") + ) + cur_data.data.extend(s.encode("utf-8")) + cur_data.data.append(0) + continue + elif m := _RE_BYTE_DATA.match(raw): + if cur_data is not None: + kind = m.group(1) + for v in m.group(2).split(","): + # Strip inline comments (# ...) 
+ v = re.sub(r"\s*#.*$", "", v).strip() + if not v: + continue + n = int(v, 0) + sz = {"byte": 1, "short": 2, "long": 4, "quad": 8}[kind] + cur_data.data.extend( + n.to_bytes(sz, "little", signed=(n < 0)) + ) + continue + else: + continue # skip unknown rodata lines + + # ── section switches ── + if _RE_COLD_SECTION.match(raw): + cur_section = "cold" + lines.append("") + lines.append(" // ---- cold path ----") + lines.append(" |.cold") + continue + if _RE_TEXT_SECTION.match(raw): + cur_section = "code" + lines.append(" |.code") + continue + + # ── special labels ── + if _RE_ENTRY.match(raw): + lines.append(" |=>uop_label:") + continue + if _RE_CONTINUE_LABEL.match(raw): + continue # handled by caller via continue_label + + # ── alignment ── + if m := _RE_ALIGN.match(raw): + lines.append(f" | .align {1 << int(m.group(1))}") + continue + + # ── local label definitions (any label we discovered in pass 1) ── + if m := _RE_ANY_LABEL_DEF.match(raw): + label = m.group(1) + if label in local_map: + idx = local_map[label] + lines.append(f" |=>L({idx}):") + continue + # Skip other label defs we don't recognize + continue + + # ── LLVM JIT fold pass markers ── + # Inline asm markers injected by jit_fold_pass.so: + # nop # @@JIT_MOV_IMM %rax, @@ + # The register name has a % prefix from AT&T syntax in inline asm. + if m := _RE_JIT_MARKER.match(raw): + kind, reg, expr = m.groups() + # Strip AT&T % prefix from register name. 
+ if reg is not None: + reg = reg.lstrip("%") + reg_name = _REG_IDX_NAME.get(reg.lower()) if reg else None + if kind == "JIT_MOV_IMM": + if reg_name is not None: + lines.append( + f" emit_mov_imm_preserve_flags(Dst, {reg_name}, {expr});" + ) + else: + lines.append(f" | mov64 {reg}, {expr}") + elif kind == "JIT_TEST": + if reg_name is not None: + lines.append( + f" emit_test_reg_imm(Dst, {reg_name}, JREG_RAX, {expr});" + ) + elif kind == "JIT_CMP": + if reg_name is not None: + lines.append( + f" emit_cmp_reg_imm(Dst, {reg_name}, JREG_RAX, {expr});" + ) + elif kind == "JIT_FRAME_ANCHOR": + if reg is not None and lines: + anchor_re = re.compile( + rf"^\s*\|\s*(?:" + rf"lea\s+{re.escape(reg)},\s*\[(?:rbp|rsp)(?:\s*[+-]\s*\d+)?\]" + rf"|mov\s+{re.escape(reg)},\s*(?:rbp|rsp)" + rf")\s*$" + ) + if anchor_re.match(lines[-1]): + lines.pop() + lines.append(_FRAME_ANCHOR_MARKER) + _stat("LLVM_fold_marker") + continue + + # ── movabs REG, offset SYMBOL[+N] ── + if m := _RE_MOVABS.match(raw): + reg, sym, off_s = m.groups() + off = int(off_s) if off_s else 0 + if sym.startswith("_JIT_"): + expr = _jit_expr(sym, off) + reg_name = _REG_IDX_NAME.get(reg.lower()) + if reg_name is not None: + # Use emit_mov_imm which picks optimal encoding at + # JIT compile time (xor for 0, mov32 for ≤UINT32_MAX, + # mov64 otherwise). + lines.append(f" emit_mov_imm(Dst, {reg_name}, {expr});") + else: + lines.append(f" | mov64 {reg}, {expr}") + elif sym.startswith(".L"): + safe = sym.replace(".", "_") + reg_name = _REG_IDX_NAME.get(reg.lower()) + if reg_name is not None: + lines.append( + f" emit_mov_imm(Dst, {reg_name}, (uintptr_t)jit_data_{opname}_{safe});" + ) + else: + lines.append( + f" | mov64 {reg}, (uintptr_t)jit_data_{opname}_{safe}" + ) + else: + # External symbol: load address via emit_mov_imm which + # picks the optimal encoding at JIT time (xor/mov32/lea/mov64). 
+ expr = ( + f"((uintptr_t)&{sym} + {off})" + if off + else f"(uintptr_t)&{sym}" + ) + reg_name = _REG_IDX_NAME.get(reg.lower()) + if reg_name is not None: + lines.append(f" emit_mov_imm(Dst, {reg_name}, {expr});") + else: + lines.append(f" | mov64 {reg}, {expr}") + continue + + # ── movabs REG, IMM (plain integer) ── + if m := _RE_MOVABS_IMM.match(raw): + reg, imm = m.groups() + # Use unsigned form for mov64 + val = int(imm) + if val < 0: + val = val & 0xFFFFFFFFFFFFFFFF + lines.append(f" | mov64 {reg}, {val}ULL") + continue + + # ── call/jmp via GOTPCREL ── + if m := _RE_GOTPCREL_CALL.match(raw): + instr, sym = m.groups() + if sym.startswith("_JIT_"): + # _JIT_* symbols are runtime values — emit optimal mov then + # indirect call/jmp through rax. + expr = _jit_expr(sym) + lines.append(f" emit_mov_imm(Dst, JREG_RAX, {expr});") + lines.append(f" | {instr} rax") + else: + # External function: emit a direct relative call using + # DynASM's &addr syntax (5-byte E8 rel32). Like Pyston's + # emit_call_ext_func. Falls back to mov64+call for + # targets beyond ±2GB. + lines.append(f" emit_call_ext(Dst, (void *)&{sym});") + if instr == "jmp": + # Tail call: after the callee returns, we need to exit + # the trace. Emit a ret which the epilogue rewriting + # will convert to jmp =>cleanup_ret_label. + lines.append(" | ret") + continue + + # ── generic instruction with GOTPCREL memory operand ── + if m := _RE_GOTPCREL_MEM.match(raw): + prefix, before, size, sym, after = m.groups() + instr = prefix.strip().split()[0] + dest = before.strip().rstrip(",").strip() + + if instr == "mov" and not after.strip(): + # mov REG, qword ptr [rip + SYM@GOTPCREL] + # Load the symbol address directly via emit_mov_imm. 
+ if sym.startswith("_JIT_"): + expr = _jit_expr(sym) + else: + expr = f"(uintptr_t)&{sym}" + reg_name = _ANY_REG_TO_NAME.get(dest.lower()) + if reg_name is not None: + lines.append(f" emit_mov_imm(Dst, {reg_name}, {expr});") + else: + lines.append(f" | mov64 {dest}, {expr}") + continue + + if instr == "movzx" and not after.strip(): + # movzx REG, word/byte ptr [rip + SYM@GOTPCREL] + # Load the symbol value with appropriate truncation. + if sym.startswith("_JIT_"): + expr = _jit_expr(sym) + else: + expr = f"(uintptr_t)&{sym}" + _SIZE_CAST = {"word": "uint16_t", "byte": "uint8_t"} + cast = _SIZE_CAST.get(size, None) + if cast: + expr = f"({cast})({expr})" + reg_name = _ANY_REG_TO_NAME.get(dest.lower()) + if reg_name is not None: + lines.append(f" emit_mov_imm(Dst, {reg_name}, {expr});") + else: + lines.append(f" | mov64 {dest}, {expr}") + continue + + if instr == "call": + # External function: direct relative call via emit_call_ext. + if not sym.startswith("_JIT_"): + lines.append(f" emit_call_ext(Dst, (void *)&{sym});") + else: + expr = _jit_expr(sym) + lines.append(f" emit_mov_imm(Dst, JREG_RAX, {expr});") + lines.append(f" | call rax") + continue + + # Other instructions (cmp, test, etc.) with GOTPCREL memory. + # Emit the symbol address into a per-instruction data section + # entry, then reference it with a RIP-relative memory operand. 
+ if sym.startswith("_JIT_"): + expr = _jit_expr(sym) + else: + expr = f"(uintptr_t)&{sym}" + data_label = counter + counter += 1 + lines.append(f" |.data") + lines.append(f" |=>L({data_label}):") + lines.append(f" | .qword {expr}") + lines.append(f" |.{cur_section}") + new_line = f"{prefix}{before}qword [=>L({data_label})]{after}" + new_line = _fix_syntax(new_line.strip()) + lines.append(f" | {new_line}") + continue + + # ── JIT branch targets ── + if m := _RE_JIT_BRANCH.match(raw): + instr, target = m.groups() + field = ( + "jump_target" + if target == "_JIT_JUMP_TARGET" + else "error_target" + ) + lines.append(f" | {instr} =>instruction->{field}") + continue + + # ── JIT continue ── + if m := _RE_JIT_CONTINUE.match(raw): + lines.append(f" | {m.group(1)} =>continue_label") + continue + + # ── local branches / calls ── + # Match jmp/jcc/call to a label we discovered in pass 1 + m_br = re.match(r"^\s*(j\w+|call)\s+([\w.]+)\s*(?:#.*)?$", raw) + if m_br: + instr, label = m_br.groups() + if label in local_map: + idx = local_map[label] + if instr == "call": + lines.append(f" | call =>L({idx})") + else: + lines.append(f" | {instr} =>L({idx})") + continue + + # ── default: plain instruction ── + stripped = raw.strip() + if not stripped or stripped.startswith(".section"): + continue + + fixed = _fix_syntax(stripped) + if "@" in fixed: + raise ValueError( + f"Unhandled @ symbol in stencil {opname}: {raw!r}" + ) + lines.append(f" | {fixed}") + + _flush_data() + + # Strip the frame-anchor marker emitted by template.c. + # With the simplified approach (no ForceFrameCall), the marker stands alone. + stripped_lines: list[str] = [] + for line in lines: + if line == _FRAME_ANCHOR_MARKER: + continue + stripped_lines.append(line) + lines = stripped_lines + + # Find the FIRST |.cold (the hot-cold boundary). There may be multiple + # |.cold directives (e.g. when the optimizer appends cold blocks after + # LLVM's own cold section), but the first one marks the end of hot code. 
+ hot_end = len(lines) + for i in range(len(lines)): + stripped = lines[i].strip() + if stripped == "|.cold": + hot_end = i + break + + # Strip the stencil's outer function prologue/epilogue and record its + # requested frame size. The shared JIT shim recreates one canonical + # frame before calling into the trace, so individual stencils no longer + # need their function entry/exit stack manipulation. + frame_size = 0 + _RE_PUSH_RBP = re.compile(r"^\s*\|\s*push rbp\s*$") + _RE_MOV_RBP_RSP = re.compile(r"^\s*\|\s*mov rbp, rsp\s*$") + _RE_SUB_RSP = re.compile(r"^\s*\|\s*sub rsp,\s*(\d+)\s*$") + _RE_ADD_RSP = re.compile(r"^\s*\|\s*add rsp,\s*(\d+)\s*$") + _RE_LEA_RSP = re.compile(r"^\s*\|\s*lea rsp, \[rsp \+ (\d+)\]\s*$") + _RE_POP_RBP = re.compile(r"^\s*\|\s*pop rbp\s*$") + + # Detect entry prologue: |=>uop_label: then | push rbp, an optional + # | mov rbp, rsp, and finally | sub rsp, N. + prologue_push_idx = -1 + prologue_mov_idx = -1 + prologue_sub_idx = -1 + for i in range(len(lines)): + if lines[i].strip() == "|=>uop_label:": + if i + 2 < len(lines) and _RE_PUSH_RBP.match(lines[i + 1]): + prologue_push_idx = i + 1 + sub_idx = i + 2 + if sub_idx < len(lines) and _RE_MOV_RBP_RSP.match( + lines[sub_idx] + ): + prologue_mov_idx = sub_idx + sub_idx += 1 + if sub_idx < len(lines) and ( + m_sub := _RE_SUB_RSP.match(lines[sub_idx]) + ): + frame_size = int(m_sub.group(1)) + prologue_sub_idx = sub_idx + break + + total_push_rbp = sum(1 for line in lines if _RE_PUSH_RBP.match(line)) + + if frame_size > 0 and prologue_push_idx >= 0 and prologue_sub_idx >= 0: + lines[prologue_push_idx] = "" + if prologue_mov_idx >= 0: + lines[prologue_mov_idx] = "" + lines[prologue_sub_idx] = "" + + # When the entry prologue is the only push rbp in the stencil, every + # pop rbp belongs to the outer function epilogue even if the compiler + # hoists the shared add rsp, N before a branch. Strip all such outer + # unwinds and let the shim own the frame instead. 
+ if total_push_rbp == 1: + for i in range(len(lines)): + if _RE_POP_RBP.match(lines[i]): + lines[i] = "" + continue + m_add = _RE_ADD_RSP.match(lines[i]) or _RE_LEA_RSP.match( + lines[i] + ) + if m_add and int(m_add.group(1)) == frame_size: + lines[i] = "" + else: + # Fall back to the conservative adjacent add/lea + pop pattern + # when the stencil contains other push rbp saves of its own. + stripped_any_epilogue = False + for i in range(len(lines) - 1): + if not _RE_POP_RBP.match(lines[i + 1]): + continue + m_add = _RE_ADD_RSP.match(lines[i]) or _RE_LEA_RSP.match( + lines[i] + ) + if m_add and int(m_add.group(1)) == frame_size: + stripped_any_epilogue = True + lines[i] = "" + lines[i + 1] = "" + if not stripped_any_epilogue: + frame_size = 0 + else: + frame_size = 0 + + # Inline trace exit sequences: tear down the shared trace frame + # (mov rsp, rbp; pop rbp) and then exit via the original instruction. + # The shim has its own prologue/epilogue and must not be rewritten. + if not is_shim: + _TRACE_EXIT_REWRITES = { + "| ret": [ + " | mov rsp, rbp", + " | pop rbp", + " | ret", + ], + "| jmp rax": [ + " | mov rsp, rbp", + " | pop rbp", + " | jmp rax", + ], + "| jmp rcx": [ + " | mov rsp, rbp", + " | pop rbp", + " | jmp rcx", + ], + "| jmp qword [rax + 48]": [ + " | mov rsp, rbp", + " | pop rbp", + " | jmp qword [rax + 48]", + ], + } + has_internal_push_rbp = total_push_rbp > 1 and frame_size > 0 + # In stencils with internal push/pop rbp pairs (subroutines like + # _Py_Dealloc called via `call =>L(N)`), exit instructions reachable + # after a `pop rbp` within the same basic block are internal subroutine + # exits, not trace exits. Track per-block state to skip rewriting. 
+ _RE_LABEL_LINE = re.compile(r"^\s*\|=>") + in_internal_exit_block = False + new_lines = [] + for line in lines: + stripped = line.strip() + if has_internal_push_rbp: + if _RE_LABEL_LINE.match(stripped): + in_internal_exit_block = False + if _RE_POP_RBP.match(line): + in_internal_exit_block = True + if in_internal_exit_block and stripped in _TRACE_EXIT_REWRITES: + new_lines.append(line) + continue + replacement = _TRACE_EXIT_REWRITES.get(stripped) + if replacement is not None: + new_lines.extend(replacement) + else: + new_lines.append(line) + lines = new_lines + + # Apply peephole optimizations (fuse emit_mov_imm + movzx/or) + lines = _peephole_optimize(lines) + + # Eliminate trampoline jumps: when a label contains only a single + # `jmp` to another target, rewrite all branches to that label to + # jump directly to the final target. + _RE_LABEL_DEF = re.compile(r"^\s*\|=>L\((\d+)\):\s*$") + _RE_TRAMPOLINE_JMP = re.compile( + r"^\s*\|\s*jmp\s+=>(instruction->(?:jump_target|error_target)" + r"|L\(\d+\)|continue_label)\s*$" + ) + _RE_BRANCH_TO_LABEL = re.compile( + r"^(\s*\|\s*j\w+)\s+=>(L\(\d+\))\s*$" + ) + # Pass 1: find trampoline labels (label followed immediately by lone jmp) + trampoline_targets: dict[str, str] = {} # "L(N)" → target + for i in range(len(lines) - 1): + m_label = _RE_LABEL_DEF.match(lines[i]) + if not m_label: + continue + # Skip blank/comment/section lines to find the next real instruction + j = i + 1 + while j < len(lines) and ( + not lines[j].strip() + or lines[j].strip().startswith("//") + ): + j += 1 + if j >= len(lines): + continue + m_jmp = _RE_TRAMPOLINE_JMP.match(lines[j]) + if not m_jmp: + continue + label_name = f"L({m_label.group(1)})" + trampoline_targets[label_name] = m_jmp.group(1) + + # Resolve chains: L(8) → L(1) → instruction->jump_target + changed = True + while changed: + changed = False + for src, dst in list(trampoline_targets.items()): + if dst in trampoline_targets: + trampoline_targets[src] = trampoline_targets[dst] + 
changed = True + + if trampoline_targets: + # Collect line indices of trampoline labels and their jmp targets + dead_lines: set[int] = set() + for i in range(len(lines)): + m_label = _RE_LABEL_DEF.match(lines[i]) + if not m_label: + continue + label_name = f"L({m_label.group(1)})" + if label_name not in trampoline_targets: + continue + dead_lines.add(i) + # Find and mark the jmp line (skipping blanks/comments) + for j in range(i + 1, len(lines)): + if _RE_TRAMPOLINE_JMP.match(lines[j]): + dead_lines.add(j) + break + if lines[j].strip() and not lines[j].strip().startswith("//"): + break + + new_lines = [] + for i, line in enumerate(lines): + if i in dead_lines: + continue + # Rewrite branches to trampolines → direct branches + m_branch = _RE_BRANCH_TO_LABEL.match(line) + if m_branch: + target_label = m_branch.group(2) + if target_label in trampoline_targets: + branch_instr = m_branch.group(1) + new_lines.append( + f"{branch_instr} =>{trampoline_targets[target_label]}" + ) + continue + new_lines.append(line) + lines = new_lines + + # Remove blank lines and trailing whitespace from the output (clang emits + # blank lines between basic blocks; some peephole patterns add trailing \n). + lines = [l.rstrip() for l in lines if l.strip()] + + return ConvertedStencil( + opname=opname, + lines=lines, + num_internal_labels=counter, + data_items=data_items, + frame_size=frame_size, + ) diff --git a/Tools/jit/_asm_to_dasc_amd64.py b/Tools/jit/_asm_to_dasc_amd64.py new file mode 100644 index 00000000000000..8062c99457abee --- /dev/null +++ b/Tools/jit/_asm_to_dasc_amd64.py @@ -0,0 +1,1464 @@ +"""x86-64 specific peephole optimizations for the DynASM JIT backend. 

This module contains all x86-64 architecture-specific code for the peephole
optimizer:

- JIT calling convention (register roles, frame offsets)
- Instruction effect analysis (which registers/flags each instruction touches)
- Peephole optimization patterns (23 patterns, organized by category)
- Pattern registry

Architecture-generic infrastructure (types, parsing, pass management, dead
label elimination) lives in ``_asm_to_dasc.py``. A future ARM64 backend would
create ``_asm_to_dasc_aarch64.py`` following the same structure.
"""

from __future__ import annotations

import typing

from _asm_to_dasc import (
    # Operand types
    Reg, Mem, Imm, Op,
    # Line types
    Asm, CCall, CCallKind, Label, Section, FuncDef, Blank, CCode, Line,
    # Infrastructure
    _Match, _FoldCtx, _PeepholeState, _PeepholePass,
    _PeepholeProgram, _LineEffect,
    # Functions
    parse_line, parse_lines,
    is_call, is_branch, is_jump, branch_target, line_raw, fmt_op,
    _stat,
    # Data
    _ANY_REG_TO_IDX, _ANY_REG_TO_NAME, _IDX_TO_ALL_NAMES,
    _REG_IDX_NAME, _REG64_TO_REG32, _BRANCH_MNEMONICS, _RE_EMIT_MOV_IMM,
    # Helpers
    _operand_regs, _mem_uses_reg, _reg_bits,
)

# ── JIT calling convention: register roles ──────────────────────────────
# The preserve_none calling convention assigns these fixed roles:
#   r13 = frame pointer (_PyInterpreterFrame *)
#   r14 = cached stack pointer (frame->stackpointer)
#   r15 = thread state (PyThreadState *)
#   r12 = current executor (_PyExecutorObject *)
# These constants are used by the peephole optimizer to recognize
# store/reload patterns without hardcoding register names.
REG_FRAME = "r13"      # _PyInterpreterFrame *frame
REG_STACK_PTR = "r14"  # _PyStackRef *stack_pointer (cached)
REG_TSTATE = "r15"     # PyThreadState *tstate
REG_EXECUTOR = "r12"   # _PyExecutorObject *executor

# Frame struct field offsets (from _PyInterpreterFrame in
# Include/internal/pycore_interpframe_structs.h).
# Used by the peephole optimizer to match store/reload patterns.
FRAME_IP_OFFSET = 56            # offsetof(_PyInterpreterFrame, instr_ptr)
FRAME_STACKPOINTER_OFFSET = 64  # offsetof(_PyInterpreterFrame, stackpointer)


class _C:
    """Constants usable as value patterns in match/case statements.

    In Python structural pattern matching, only dotted names (like ``_C.SP``)
    are treated as value patterns that compare against the constant. Simple
    names (like ``REG_STACK_PTR``) would be treated as capture patterns.
    """
    FRAME = REG_FRAME                        # "r13"
    SP = REG_STACK_PTR                       # "r14"
    TSTATE = REG_TSTATE                      # "r15"
    EXECUTOR = REG_EXECUTOR                  # "r12"
    FRAME_SP_OFS = FRAME_STACKPOINTER_OFFSET # 64
    FRAME_IP_OFS = FRAME_IP_OFFSET           # 56

# Derived patterns used by store/reload elimination.
# Store:  mov qword [r13 + 64], r14   (frame->stackpointer = stack_pointer)
# Reload: mov r14, qword [r13 + 64]   (stack_pointer = frame->stackpointer)
_SP_STORE = f"| mov qword [{REG_FRAME} + {FRAME_STACKPOINTER_OFFSET}], {REG_STACK_PTR}"
_SP_RELOAD = f"| mov {REG_STACK_PTR}, qword [{REG_FRAME} + {FRAME_STACKPOINTER_OFFSET}]"
# NOTE(review): the leading whitespace inside this literal may have been wider
# in the original source (it was mangled in transit) — confirm against VCS.
_SP_RELOAD_LINE = f" {_SP_RELOAD}\n"

# ── JREG name ↔ index mapping (used by emit_mov_imm patterns) ─────────

_JREG_TO_IDX: dict[str, int] = {
    "JREG_RAX": 0, "JREG_RCX": 1, "JREG_RDX": 2, "JREG_RBX": 3,
    "JREG_RSP": 4, "JREG_RBP": 5, "JREG_RSI": 6, "JREG_RDI": 7,
    "JREG_R8": 8, "JREG_R9": 9, "JREG_R10": 10, "JREG_R11": 11,
    "JREG_R12": 12, "JREG_R13": 13, "JREG_R14": 14, "JREG_R15": 15,
}
_IDX_TO_JREG: dict[int, str] = {v: k for k, v in _JREG_TO_IDX.items()}

# ── SysV ABI register classification ──────────────────────────────────
# Used by _is_dead_before_any_call to determine how opaque calls
# (emit_call_ext / | call) interact with each register.

# Callee-saved registers: preserved across function calls.
_CALLEE_SAVED_REGS = frozenset({3, 5, 12, 13, 14, 15})  # rbx rbp r12 r13 r14 r15

# SysV integer argument registers: may be read by function calls.
_SYSV_ARGUMENT_REGS = frozenset({7, 6, 2, 1, 8, 9})  # rdi rsi rdx rcx r8 r9

# Caller-saved registers: clobbered by function calls (SysV ABI).
_CALLER_SAVED_REGS = frozenset({0, 1, 2, 6, 7, 8, 9, 10, 11})
# rax rcx rdx rsi rdi r8 r9 r10 r11


def _parse_jreg(token: str) -> tuple[int, str]:
    """Parse a JREG_* name or integer → (index, name)."""
    if token in _JREG_TO_IDX:
        return _JREG_TO_IDX[token], token
    # Not a symbolic name: treat the token as a decimal register index.
    idx = int(token)
    return idx, _IDX_TO_JREG.get(idx, str(idx))


def _jreg_arg_index(token: str) -> int | None:
    """Best-effort parse of a helper argument naming a JIT register.

    Returns the register index, or None when the argument is not a
    JREG_* name or integer literal (e.g. an arbitrary C expression).
    """
    token = token.strip()
    if token in _JREG_TO_IDX:
        return _JREG_TO_IDX[token]
    try:
        return int(token, 0)
    except ValueError:
        return None


def _reg_write_sets(reg: Reg) -> tuple[frozenset[int], frozenset[int]]:
    """Return (full_writes, partial_writes) for a register destination.

    A 32-bit (or wider) destination counts as a full write; narrower
    destinations only partially overwrite the 64-bit register.
    """
    if reg.idx is None:
        return frozenset(), frozenset()
    if reg.bits >= 32:
        return frozenset({reg.idx}), frozenset()
    return frozenset(), frozenset({reg.idx})


def _line_effect(line: Line) -> _LineEffect:
    """Summarize register and flags effects for one parsed line."""
    match line:
        case Blank() | Label() | Section() | FuncDef():
            return _LineEffect()
        case CCode():
            return _LineEffect()
        case CCall(kind=CCallKind.MOV_IMM, argv=argv):
            idx = _jreg_arg_index(argv[0]) if argv else None
            return _LineEffect(
                full_writes=frozenset({idx})
                if idx is not None
                else frozenset()
            )
        case CCall(kind=CCallKind.CALL_EXT):
            # Model the emitted call as clobbering all caller-saved regs
            # per SysV ABI. We intentionally do NOT model argument registers
            # as reads: the peephole patterns never eliminate register writes
            # that feed call arguments (they only fold emit_mov_imm into
            # memory addressing or ALU patterns), so the risk is negligible.
            return _LineEffect(
                full_writes=_CALLER_SAVED_REGS,
                writes_flags=True,
            )
        case CCall(kind=CCallKind.CMP_REG_IMM, argv=argv):
            reads = frozenset({_jreg_arg_index(argv[0])}) if len(argv) >= 1 else frozenset()
            scratch = _jreg_arg_index(argv[1]) if len(argv) >= 2 else None
            return _LineEffect(
                reads=frozenset(idx for idx in reads if idx is not None),
                full_writes=frozenset({scratch}) if scratch is not None else frozenset(),
                writes_flags=True,
            )
        case CCall(kind=CCallKind.CMP_MEM64_IMM, argv=argv):
            base = _jreg_arg_index(argv[0]) if len(argv) >= 1 else None
            scratch = _jreg_arg_index(argv[2]) if len(argv) >= 3 else None
            return _LineEffect(
                reads=frozenset({base}) if base is not None else frozenset(),
                full_writes=frozenset({scratch}) if scratch is not None else frozenset(),
                writes_flags=True,
            )
        case CCall(kind=CCallKind.ALU_REG_IMM, helper=helper, argv=argv):
            reg_idx = _jreg_arg_index(argv[0]) if len(argv) >= 1 else None
            scratch = _jreg_arg_index(argv[1]) if len(argv) >= 2 else None
            reads = frozenset({reg_idx}) if reg_idx is not None else frozenset()
            full_writes = set()
            if scratch is not None:
                full_writes.add(scratch)
            # These helpers write their result back into the first register;
            # cmp/test-style helpers only set flags.
            if helper in {"emit_and_reg_imm", "emit_or_reg_imm", "emit_xor_reg_imm",
                          "emit_add_reg_imm", "emit_sub_reg_imm"}:
                if reg_idx is not None:
                    full_writes.add(reg_idx)
            return _LineEffect(
                reads=reads,
                full_writes=frozenset(full_writes),
                writes_flags=True,
            )
        case Asm(mnemonic=mnemonic, dst=dst, src=src):
            reads = set(_operand_regs(src))
            full_writes: set[int] = set()
            partial_writes: set[int] = set()
            uses_flags = False
            writes_flags = False

            match mnemonic:
                case "jmp" | "ret":
                    return _LineEffect()
                case _ if is_branch(line):
                    return _LineEffect(uses_flags=True)
                case "call":
                    # Reads the target operand (address computation),
                    # clobbers all caller-saved registers per SysV ABI.
                    reads |= set(_operand_regs(dst))
                    return _LineEffect(
                        reads=frozenset(reads),
                        full_writes=_CALLER_SAVED_REGS,
                        writes_flags=True,
                    )
                case _ if mnemonic.startswith("set"):
                    uses_flags = True
                    if isinstance(dst, Reg):
                        full, partial = _reg_write_sets(dst)
                        full_writes |= full
                        partial_writes |= partial
                case _ if mnemonic.startswith("cmov"):
                    # cmov conditionally writes, so the old value of dst is
                    # also a read (it may survive).
                    uses_flags = True
                    reads |= set(_operand_regs(dst))
                    reads |= set(_operand_regs(src))
                    if isinstance(dst, Reg):
                        full, partial = _reg_write_sets(dst)
                        full_writes |= full
                        partial_writes |= partial
                case "mov":
                    if isinstance(dst, Mem):
                        reads |= set(_operand_regs(dst))
                    elif isinstance(dst, Reg):
                        full, partial = _reg_write_sets(dst)
                        full_writes |= full
                        partial_writes |= partial
                case "movzx" | "movsxd" | "lea":
                    if isinstance(dst, Reg):
                        full, partial = _reg_write_sets(dst)
                        full_writes |= full
                        partial_writes |= partial
                    reads |= set(_operand_regs(src))
                case "cmp" | "test" | "bt" | "ucomisd":
                    reads |= set(_operand_regs(dst))
                    reads |= set(_operand_regs(src))
                    writes_flags = True
                case "pop":
                    # pop writes to the destination register (from stack),
                    # it does NOT read the register.
                    if isinstance(dst, Reg):
                        full, partial = _reg_write_sets(dst)
                        full_writes |= full
                        partial_writes |= partial
                case "push":
                    # push reads the source register to put it on the stack,
                    # but does not write to any GP register.
                    reads |= set(_operand_regs(dst))
                case "neg" | "not" | "inc" | "dec":
                    reads |= set(_operand_regs(dst))
                    if isinstance(dst, Reg):
                        full, partial = _reg_write_sets(dst)
                        full_writes |= full
                        partial_writes |= partial
                    writes_flags = True
                case "xor" if (
                    isinstance(dst, Reg)
                    and isinstance(src, Reg)
                    and (dst.idx is not None and dst.idx == src.idx)
                ):
                    # xor reg, reg is a zeroing idiom — no read dependency.
                    reads.discard(dst.idx)
                    full, partial = _reg_write_sets(dst)
                    full_writes |= full
                    partial_writes |= partial
                    writes_flags = True
                case _:
                    # Conservative default: read both operands, write a
                    # register destination, and flag the common ALU ops.
                    reads |= set(_operand_regs(dst))
                    reads |= set(_operand_regs(src))
                    if isinstance(dst, Reg):
                        full, partial = _reg_write_sets(dst)
                        full_writes |= full
                        partial_writes |= partial
                    writes_flags = mnemonic in {
                        "add", "and", "or", "sub", "xor", "shl", "shr",
                        "sar", "sal", "rol", "ror", "rcl", "rcr",
                    }

            return _LineEffect(
                reads=frozenset(reads),
                full_writes=frozenset(full_writes),
                partial_writes=frozenset(partial_writes),
                uses_flags=uses_flags,
                writes_flags=writes_flags,
            )
    return _LineEffect()


def uses_reg(line: Line, reg_idx: int) -> bool:
    """Does this line reference the given register (by index)?

    Uses structured operand/helper effects where possible, then falls back to
    the raw text for unclassified C lines.
    """
    effect = _line_effect(line)
    if (
        reg_idx in effect.reads
        or reg_idx in effect.full_writes
        or reg_idx in effect.partial_writes
    ):
        return True
    # Textual fallback: any alias of the register (rax/eax/ax/al...) in raw.
    names = _IDX_TO_ALL_NAMES.get(reg_idx, set())
    raw = line.raw
    return any(name in raw for name in names)


def is_store_sp(line: Line) -> bool:
    """Matches: ``| mov qword [r13 + 64], r14`` (store stack pointer)."""
    return line.raw.strip() == _SP_STORE


def is_reload_sp(line: Line) -> bool:
    """Matches: ``| mov r14, qword [r13 + 64]`` (reload stack pointer)."""
    return line.raw.strip() == _SP_RELOAD


def _preserve_flags_mov_imm(line: str) -> str:
    """Rewrite emit_mov_imm(...)
    to emit_mov_imm_preserve_flags(...)."""
    return line.replace("emit_mov_imm(", "emit_mov_imm_preserve_flags(", 1)


def _is_flag_writer(line: Line) -> bool:
    """Does this instruction define flags for a later consumer?"""
    return _line_effect(line).writes_flags


def _is_flag_consumer(line: Line) -> bool:
    """Does this instruction consume previously computed flags?"""
    return _line_effect(line).uses_flags


def _parse_emit_mov_imm_call(line: str) -> tuple[int, str, str] | None:
    """Parse an emit_mov_imm* helper call into (reg_idx, reg_name, expr)."""
    match parse_line(line):
        case CCall(kind=CCallKind.MOV_IMM, argv=(reg, expr, *_)):
            reg_idx, reg_name = _parse_jreg(reg)
            return reg_idx, reg_name, expr
    return None


# Map jcc mnemonic → C comparison operator for unsigned cmp folding.
# Given "cmp REG, IMM; jcc label", the branch is taken when the comparison
# "REG <op> IMM" holds for the operator associated with the jcc.
# NOTE(review): the operator in this sentence and the map itself appear to
# have been lost in this revision — confirm against the original source.


def _reg_dead_after(
    program_or_lines: _PeepholeProgram | list[str],
    start: int,
    reg_idx: int,
) -> bool:
    """Control-flow-aware deadness query backed by structured effects."""
    if isinstance(program_or_lines, _PeepholeProgram):
        program = program_or_lines
    else:
        program = _PeepholeProgram.from_lines(program_or_lines)
    return program.reg_dead_after(start, reg_idx)


def _is_dead_before_any_call(
    program: _PeepholeProgram,
    start: int,
    reg_idx: int,
) -> bool:
    """Check that *reg_idx* is dead AND safe to eliminate.

    This performs a CFG-aware scan from *start* verifying that on EVERY
    reachable path, *reg_idx* is fully overwritten before:
      1. an opaque call (``emit_call_ext`` / ``| call``) that might read
         it — only relevant for SysV argument registers (rdi, rsi, rdx,
         rcx, r8, r9), OR
      2. the end of the function or a ``.cold`` section boundary.

    Callee-saved registers (rbx, rbp, r12–r15) are preserved across calls
    by the SysV ABI, so calls are transparent for them. Caller-saved
    non-argument registers (rax, r10, r11) are clobbered by calls, which
    counts as a full write.

    Condition (2) is needed because in the JIT stencil system, registers
    that are live at the end of a stencil function are inter-stencil
    outputs consumed by the next stencil. DynASM ``.cold`` sections are
    placed in a separate memory region, so linear fallthrough across a
    section switch does not exist at runtime.
    """
    if not program.reg_dead_after(start, reg_idx):
        return False

    # Callee-saved registers are preserved across calls.
    callee_saved = reg_idx in _CALLEE_SAVED_REGS
    # Argument registers may be read by calls as function arguments.
    is_argument_reg = reg_idx in _SYSV_ARGUMENT_REGS
    # Other caller-saved regs (rax, r10, r11) are clobbered by calls.

    func_end = program.function_end_by_line[start]
    start_depth = program.c_depth[start]
    parsed = program.parsed
    visited: set[int] = set()
    worklist = [start + 1]

    while worklist:
        pos = worklist.pop()
        while pos < func_end:
            if pos in visited:
                break
            visited.add(pos)
            pj = parsed[pos]

            is_opaque_call = (
                isinstance(pj, CCall) and pj.kind == CCallKind.CALL_EXT
            ) or (isinstance(pj, Asm) and is_call(pj))

            if is_opaque_call:
                if is_argument_reg:
                    # Call might read this register — unsafe.
                    return False
                if not callee_saved:
                    # Caller-saved non-argument reg (rax, r10, r11):
                    # the call clobbers it → effectively a full write.
                    break
                # Callee-saved: call preserves it, continue scanning.

            # .cold section boundary — treat as function end. The
            # register hasn't been written on the hot path so it may
            # be an inter-stencil output. (.code sections are benign;
            # they just confirm we are already in the hot section.)
            if isinstance(pj, Section) and pj.name == "cold":
                return False

            # Register fully overwritten — this path is safe.
            eff = program.effects[pos]
            if reg_idx in eff.full_writes:
                break
            # Follow the same successor logic as reg_dead_after
            successors = program.successors(pos, start_depth)
            if not successors:
                # No successors and no write — the register reaches function
                # end alive, so it may be an inter-stencil output register.
                return False
            fallthrough = pos + 1
            if len(successors) == 1 and successors[0] == fallthrough:
                pos = fallthrough
                continue
            for succ in successors[1:]:
                if succ < func_end:
                    worklist.append(succ)
            pos = successors[0]
        else:
            # Inner while exited because pos >= func_end without a
            # full_write — the register reaches function end alive.
            return False

    return True


# ── emit_mov_imm chain patterns ─────────────────────────────────────
#
# These patterns all start from a parsed emit_mov_imm line and attempt
# to fold subsequent instructions. They share (expr, consumed, src_idx)
# state and compose in sequence: Pattern 1 can modify expr, then Pattern
# 2 refines it further, etc.


def _try_indexed_mem(ctx: _FoldCtx) -> _Match | None:
    """Pattern 6: Fold indexed memory loads/stores with computed index.

    When emit_mov_imm loads a value that's used as an index in a
    memory access [base + REG*scale + disp], precompute the offset
    at JIT compile time. This eliminates the index register and the
    scaled addressing mode, replacing it with a simple [base + const].

    Example — index into PyObject array:
        emit_mov_imm(Dst, JREG_RCX, instruction->oparg);
        | mov rax, qword [rbx + rcx*8 + 48]
    →
        | mov rax, qword [rbx + ((int)(instruction->oparg) * 8 + 48)]

    Multiple consecutive accesses using the same index are all folded:
        emit_mov_imm(Dst, JREG_RCX, instruction->oparg);
        | mov rax, qword [r14 + rcx*8 + 0]
        | mov rdx, qword [r14 + rcx*8 + 8]
    →
        | mov rax, qword [r14 + ((int)(instruction->oparg) * 8 + 0)]
        | mov rdx, qword [r14 + ((int)(instruction->oparg) * 8 + 8)]

    Safety: the index register must be either overwritten by the load
    destination or dead after all folded accesses.
    """
    lines, i, src_idx, expr = ctx.lines, ctx.i, ctx.src_idx, ctx.expr
    parsed = ctx.parsed
    folded: list[str] = []
    scan = i
    while scan < len(lines):
        p = parsed[scan]
        # Match instructions with indexed memory operand containing our reg
        mem_op: Mem | None = None
        is_load = False
        match p:
            # Load: | mov rax, qword [base + idx*scale + disp]
            # Exclude LEA (handled by P7a/P7b).
            case Asm(mnemonic=mn, dst=Reg(), src=Mem(index=idx, scale=sc)) if (
                mn != "lea" and idx and sc and Reg(idx).idx == src_idx
            ):
                mem_op = p.src
                is_load = True
            # Store: | mov qword [base + idx*scale + disp], reg
            case Asm(
                mnemonic="mov", dst=Mem(index=idx, scale=sc), src=Reg()
            ) if idx and sc and Reg(idx).idx == src_idx:
                mem_op = p.dst
                is_load = False
            case _:
                break
        if mem_op is None or mem_op.base is None:
            break
        # Reconstruct with computed offset
        computed = f"(int)({expr}) * {mem_op.scale} + {mem_op.offset}"
        new_mem = f"[{mem_op.base} + ({computed})]"
        if is_load:
            new_line = (
                f" | {p.mnemonic} {fmt_op(p.dst)}, {mem_op.size} {new_mem}"
                if mem_op.size
                else f" | {p.mnemonic} {fmt_op(p.dst)}, {new_mem}"
            )
        else:
            new_line = (
                f" | {p.mnemonic} {mem_op.size} {new_mem}, {fmt_op(p.src)}"
                if mem_op.size
                else f" | {p.mnemonic} {new_mem}, {fmt_op(p.src)}"
            )
        folded.append(new_line)
        scan += 1
    if not folded:
        return None
    # Safety: index reg overwritten by load dest, or dead after
    first = parsed[i]
    dest_idx = (
        first.dst.idx
        if isinstance(first, Asm) and isinstance(first.dst, Reg)
        else None
    )
    if dest_idx == src_idx or _reg_dead_after(ctx.program, scan, src_idx):
        _stat("P6_indexed_mem")
        return _Match(scan - i, folded)
    return None


def _try_two_mov_add(ctx: _FoldCtx) -> _Match | None:
    """Pattern 16: Combine two immediate loads followed by add.

    (The stat key is ``P16_two_mov_add``; pattern 15 is the shift fold.)
    """
    lines, i, parsed, indent, expr = (
        ctx.lines, ctx.i, ctx.parsed, ctx.indent, ctx.expr)
    mov_info = _parse_emit_mov_imm_call(lines[i])
    if mov_info is None or i + 1 >= len(lines):
        return None
    dst_idx, dst_name, rhs_expr = mov_info
    match parsed[i + 1]:
        case Asm(
            mnemonic="add", dst=Reg(name=add_dst), src=Reg(name=add_src)
        ) if Reg(add_dst).idx == dst_idx and Reg(add_src).idx == ctx.src_idx:
            pass
        case _:
            return None
    if not _reg_dead_after(ctx.program, i + 2, ctx.src_idx):
        return None
    bits = _reg_bits(add_dst)
    if bits <= 32:
        # Narrow destinations: wrap to the destination width in C.
        combined = f"(uint{bits}_t)(({rhs_expr}) + ({expr}))"
    else:
        combined = f"({rhs_expr}) + ({expr})"
    _stat("P16_two_mov_add")
    return _Match(
        2,
        [
            f"{indent}emit_mov_imm(Dst, {dst_name}, {combined});",
        ],
    )


def _try_alu_imm(ctx: _FoldCtx) -> _Match | None:
    """Pattern 8: Fold ALU instruction's register operand into immediate.

    When emit_mov_imm loads a value into a register and the next
    instruction uses that register as the second operand of an ALU
    operation (cmp, test, and, or, xor, add, sub), replace the register
    with the immediate value directly. This eliminates the mov entirely.

    Example — compare with runtime constant:
        emit_mov_imm(Dst, JREG_RAX, instruction->operand0);
        | cmp rcx, rax
    → (if value fits in sign-extended imm32)
        | cmp rcx, (int)(instruction->operand0)
    → (if value does NOT fit in imm32, falls back)
        emit_mov_imm(Dst, JREG_RAX, instruction->operand0);
        | cmp rcx, rax

    Example — OR with type tag:
        emit_mov_imm(Dst, JREG_RDX, instruction->operand0);
        | or qword [rbx + 16], rdx
    → (32-bit value fits)
        | or qword [rbx + 16], (int)(instruction->operand0)

    For 64-bit operands, emits a runtime range check (if/else) to use
    the immediate form when possible, falling back to the register form.
    For 32-bit and 16-bit operands, the immediate always fits.

    The source register must be dead after the ALU instruction (since
    we're eliminating the load).
    """
    _ALU_OPS = {"cmp", "test", "and", "or", "xor", "add", "sub"}
    # Only test is safe for commutative swap because it doesn't write to
    # a register. For and/or/xor/add, swapping would change which register
    # receives the result, corrupting program state.
    _COMMUTATIVE_OPS = {"test"}
    lines, i, src_idx, indent, src_name, expr = (
        ctx.lines, ctx.i, ctx.src_idx, ctx.indent, ctx.src_name, ctx.expr)
    cur = ctx.cur
    if not isinstance(cur, Asm) or cur.mnemonic not in _ALU_OPS:
        return None
    alu_op = cur.mnemonic
    alu_reg = None
    dst_op = None
    # Standard order: ALU dst, src — where src is our emit_mov_imm register.
    if isinstance(cur.src, Reg) and Reg(cur.src.name).idx == src_idx:
        alu_reg = cur.src.name
        dst_op = cur.dst
    # Commutative swap: ALU dst, src — where dst is our register.
    elif (
        alu_op in _COMMUTATIVE_OPS
        and isinstance(cur.dst, Reg)
        and Reg(cur.dst.name).idx == src_idx
    ):
        alu_reg = cur.dst.name
        dst_op = cur.src
    else:
        return None
    # Format the first operand text for output
    alu_first = fmt_op(dst_op)
    # Don't fold if first operand is also the same register
    match dst_op:
        case Reg(name=first_reg) if Reg(first_reg).idx == src_idx:
            return None
        case Mem() as mem if _mem_uses_reg(mem, src_idx):
            return None
    if not _reg_dead_after(ctx.program, i + 1, src_idx):
        return None
    _stat("P8_alu_imm_fold")
    bits = Reg(alu_reg).bits
    if bits == 32:
        return _Match(
            1,
            [
                f"{indent}| {alu_op} {alu_first}, (int)({expr})",
            ],
        )
    if bits == 64:
        # For register first operand, use emit_{op}_reg_imm helpers.
        # These emit the shortest encoding: imm32 when it fits, otherwise
        # scratch register + reg-reg form.
        if isinstance(dst_op, Reg):
            first_idx_name = _REG_IDX_NAME.get(dst_op.name.lower())
            if first_idx_name:
                return _Match(
                    1,
                    [
                        f"{indent}emit_{alu_op}_reg_imm(Dst, {first_idx_name}, {src_name}, (uintptr_t)({expr}));",
                    ],
                )
        # Simple qword memory compare: route through a dedicated helper so we
        # do not have to emit a multiline if/else template at each call site.
        if (
            alu_op == "cmp"
            and isinstance(dst_op, Mem)
            and dst_op.size == "qword"
            and dst_op.base is not None
            and dst_op.index is None
        ):
            base_name = Reg(dst_op.base).jreg
            if base_name is not None:
                return _Match(
                    1,
                    [
                        f"{indent}emit_cmp_mem64_imm(Dst, {base_name}, {dst_op.offset}, {src_name}, (uintptr_t)({expr}));",
                    ],
                )
        # Memory first operand: fallback inline runtime range check for other
        # cases that still cannot use a dedicated helper.
        c64 = f"(int64_t)({expr})"
        c32 = f"(int32_t)({expr})"
        return _Match(
            1,
            [
                f"{indent}if ({c64} == {c32}) {{",
                f"{indent}| {alu_op} {alu_first}, (int)({expr})",
                f"{indent}}} else {{",
                f"{indent}emit_mov_imm(Dst, {src_name}, {expr});",
                f"{indent}| {alu_op} {alu_first}, {alu_reg}",
                f"{indent}}}",
            ],
        )
    if bits == 16:
        return _Match(
            1,
            [
                f"{indent}| {alu_op} {alu_first}, (short)({expr})",
            ],
        )
    return None


def _try_store_imm(ctx: _FoldCtx) -> _Match | None:
    """Pattern 12: Fold register store into immediate store to memory.

    When emit_mov_imm loads a value into a register and the next
    instruction stores that register to memory, replace with a direct
    immediate-to-memory store (eliminating the register load entirely).

    Example — store byte:
        emit_mov_imm(Dst, JREG_RAX, instruction->oparg);
        | mov byte [rbx + 42], al
    →
        | mov byte [rbx + 42], (char)(instruction->oparg)

    Example — store qword (needs range check):
        emit_mov_imm(Dst, JREG_RCX, instruction->operand0);
        | mov qword [r14 + 8], rcx
    →
        if ((int64_t)(instruction->operand0) ==
                (int32_t)(instruction->operand0)) {
            | mov qword [r14 + 8], (int)(instruction->operand0)
        } else {
            emit_mov_imm(Dst, JREG_RCX, instruction->operand0);
            | mov qword [r14 + 8], rcx
        }

    For byte/word/dword stores the immediate always fits. For qword
    stores, x86_64 only supports sign-extended imm32, so we emit a
    runtime range check with a fallback.

    The source register must be dead after the store.
    """
    lines, i, src_idx, indent, src_name, expr = (
        ctx.lines, ctx.i, ctx.src_idx, ctx.indent, ctx.src_name, ctx.expr)
    match ctx.cur:
        case Asm(
            mnemonic="mov",
            dst=Mem(size=size, expr=mem_expr),
            src=Reg(name=reg),
        ) if (
            size in ("qword", "dword", "word", "byte")
            and Reg(reg).idx == src_idx
        ):
            pass  # fall through to shared logic
        case _:
            return None
    mem = typing.cast(Mem, ctx.cur.dst)
    if _mem_uses_reg(mem, src_idx):
        return None
    if not _reg_dead_after(ctx.program, i + 1, src_idx):
        return None
    _stat("P12_store_imm")
    _SIZE_CAST = {"byte": "char", "word": "short", "dword": "int"}
    if size in _SIZE_CAST:
        cast = _SIZE_CAST[size]
        return _Match(
            1,
            [
                f"{indent}| mov {size} {mem_expr}, ({cast})({expr})",
            ],
        )
    # qword: use emit_store_mem64_imm for simple [base + offset] forms,
    # fall back to inline if/else for complex addressing modes.
    if mem.base and not mem.index:
        from _asm_to_dasc import Reg as _Reg

        base_reg = _Reg(mem.base)
        base_jreg = base_reg.jreg
        if base_jreg is not None:
            return _Match(
                1,
                [
                    f"{indent}emit_store_mem64_imm(Dst, {base_jreg},"
                    f" {mem.offset}, {src_name}, {expr});",
                ],
            )
    # Complex addressing: inline if/else fallback
    c64 = f"(int64_t)({expr})"
    c32 = f"(int32_t)({expr})"
    return _Match(
        1,
        [
            f"{indent}if ({c64} == {c32}) {{",
            f"{indent}| mov qword {mem_expr}, (int)({expr})",
            f"{indent}}} else {{",
            f"{indent}emit_mov_imm(Dst, {src_name}, {expr});",
            f"{indent}| mov qword {mem_expr}, {reg}",
            f"{indent}}}",
        ],
    )


def _try_shift_fold(ctx: _FoldCtx) -> _Match | None:
    """Fold shift of an emit_mov_imm register into the immediate expression.

    When emit_mov_imm loads a value into a register and the next instruction
    shifts that same register by an immediate amount, absorb the shift into
    the emit_mov_imm expression.

    Example (LOAD_GLOBAL_MODULE — dict key lookup):
        emit_mov_imm(Dst, JREG_RAX, (uint16_t)(instruction->operand1));
        | shl rax, 4
    →
        emit_mov_imm(Dst, JREG_RAX, (uintptr_t)((uint16_t)(instruction->operand1)) << 4);
    """
    match ctx.cur:
        case Asm(
            mnemonic="shl", dst=Reg(name=reg), src=Imm(text=shift_str)
        ) if Reg(reg).idx == ctx.src_idx:
            pass
        case _:
            return None
    _stat("P15_shift_fold")
    return _Match(
        1,
        [
            f"{ctx.indent}emit_mov_imm(Dst, {ctx.src_name},"
            f" (uintptr_t)({ctx.expr}) << {shift_str});",
        ],
    )


def _try_lea_fold(ctx: _FoldCtx) -> _Match | None:
    """Fold emit_mov_imm + lea [base + reg*scale] into lea [base + disp].

    When emit_mov_imm loads a JIT-time value into a register that is then
    only used as a scaled index in a lea, the scaled product can be computed
    at JIT emit time and used as a 32-bit displacement instead.

    Example (stack pointer adjustment):
        emit_mov_imm(Dst, JREG_RBP, (0 - (uint16_t)(instruction->oparg)));
        | lea rdi, [r14 + rbp*8]
    →
        | lea rdi, [r14 + (int)((intptr_t)(0 - (uint16_t)(instruction->oparg)) * 8)]

    Also handles the no-base form [reg*scale+0]:
        emit_mov_imm(Dst, JREG_R15, expr);
        | lea r12, [r15*8+0]
    →
        emit_mov_imm(Dst, JREG_R12, (uintptr_t)(expr) * 8);

    Conditions:
    - The emit_mov_imm register is used as a scaled index in the lea
    - The register is dead after the lea (or overwritten by it)
    - The expression * scale fits in int32_t (guaranteed for oparg-based
      expressions: max |oparg|=65535, max scale=8, product ≤ 524,280)
    """
    import re as _re

    match ctx.cur:
        case Asm(mnemonic="lea", dst=Reg(name=dst_reg), src=Mem() as mem_op):
            pass
        case _:
            return None

    # Check that the emit_mov_imm register is used as a scaled index
    src_reg = _IDX_TO_ALL_NAMES.get(ctx.src_idx, ())
    if not src_reg:
        return None

    mem_text = fmt_op(mem_op)
    reg_alt = "|".join(_re.escape(r) for r in src_reg)

    # Pattern 1: [base + reg*scale]
    m_based = _re.search(
r"\[(\w+)\s*\+\s*(" + reg_alt + r")\*(\d+)\]", mem_text + ) + # Pattern 2: [reg*scale+0] (no base register) + m_nobase = _re.search(r"\[(" + reg_alt + r")\*(\d+)\+0\]", mem_text) + + if m_based: + base_reg = m_based.group(1) + scale = int(m_based.group(3)) + + # Check if the src register's original value is needed after the lea. + # The lea itself reads src_reg (scaled index), but we're replacing that + # with a displacement, so the lea's read doesn't count. + # If the lea's destination overwrites src_reg, the value is dead anyway. + lea_dst_idx = Reg(dst_reg).idx + if lea_dst_idx != ctx.src_idx: + if not _reg_dead_after(ctx.program, ctx.i + 1, ctx.src_idx): + return None + + _stat("P17_lea_fold") + return _Match( + 1, + [ + f"{ctx.indent}| lea {dst_reg}," + f" [{base_reg} + (int)((intptr_t)({ctx.expr}) * {scale})]", + ], + ) + elif m_nobase: + scale = int(m_nobase.group(2)) + + # For [reg*scale+0], the result is just expr * scale. + # We emit a new emit_mov_imm into the lea's destination register. + dst_idx = Reg(dst_reg).idx + dst_name = ( + _IDX_TO_JREG.get(dst_idx, str(dst_idx)) + if dst_idx is not None + else dst_reg + ) + + # Check src register is dead after the lea + lea_dst_idx = Reg(dst_reg).idx + if lea_dst_idx != ctx.src_idx: + if not _reg_dead_after(ctx.program, ctx.i + 1, ctx.src_idx): + return None + + _stat("P17_lea_fold") + return _Match( + 1, + [ + f"{ctx.indent}emit_mov_imm(Dst, {dst_name}," + f" (uintptr_t)({ctx.expr}) * {scale});", + ], + ) + + return None + + +def _fold_mov_imm( + program: _PeepholeProgram, + i: int, + result: list[str], +) -> int | None: + """Try to fold an emit_mov_imm with subsequent instructions. + + Returns the number of lines consumed (advancing past ``i``), or + None if no fold was possible and the caller should try other patterns. 
+ """ + lines = program.lines + parsed = program.parsed + m_mov = _RE_EMIT_MOV_IMM.match(lines[i]) + if not m_mov or i + 1 >= len(lines): + return None + # Guard against re-folding inside "} else {" fallback blocks + if result and result[-1].rstrip().endswith("} else {"): + return None + + indent = m_mov.group(1) + src_idx, src_name = _parse_jreg(m_mov.group(2)) + expr = m_mov.group(3) + consumed = 1 + + # Build shared context for all _try_* functions + ctx = _FoldCtx( + program=program, + lines=lines, + parsed=parsed, + i=i + 1, + src_idx=src_idx, + src_name=src_name, + indent=indent, + expr=expr, + ) + + # Try each fold pattern in priority order (all take ctx: _FoldCtx) + ctx.i = i + consumed + if ctx.i < len(lines): + for try_fn in ( + _try_two_mov_add, + _try_indexed_mem, + _try_alu_imm, + _try_store_imm, + _try_shift_fold, + _try_lea_fold, + ): + match = try_fn(ctx) + if match: + result.extend(match.output) + return consumed + match.consumed + + return None + + +# ── Standalone patterns ──────────────────────────────────────────────── +# +# Each function: (lines, i, result, state) → int | None +# Returns lines consumed on match, or None. May append to ``result`` +# and mutate ``state``. 
+ + +def _pattern_preserve_flags_mov_imm( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Preserve flags across immediate loads inserted before setcc/cmov/jcc.""" + del state # unused + lines = program.lines + if "emit_mov_imm(" not in lines[i] or i == 0: + return None + prev = program.parsed[i - 1] + if isinstance(prev, CCall) and prev.kind == CCallKind.MOV_IMM: + return None + if not _is_flag_writer(prev): + return None + j = i + output: list[str] = [] + while j < len(lines) and "emit_mov_imm(" in lines[j]: + cur = program.parsed[j] + if not (isinstance(cur, CCall) and cur.kind == CCallKind.MOV_IMM): + break + output.append(_preserve_flags_mov_imm(lines[j])) + j += 1 + if not output or j >= len(lines): + return None + if not _is_flag_consumer(program.parsed[j]): + return None + _stat("SP0_preserve_flags_mov_imm") + result.extend(output) + return len(output) + + +def _pattern_store_reload_elim( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Eliminate redundant stackpointer reloads on the hot path. + + Matches: + | mov qword [r13 + 64], r14 (store) + | test/cmp REG, IMM + | jcc =>COLD_LABEL + |=>MERGE_LABEL: + | mov r14, qword [r13 + 64] (reload — eliminated) + + The hot path never modifies r14 or [r13+64]. The cold dealloc path + may modify [r13+64], so we insert a reload there before jumping back. + + Uses structural pattern matching on typed ``Line`` objects for dispatch. 
+ """ + lines = program.lines + if i + 4 >= len(lines): + return None + window = [program.parsed[i + k] for k in range(5)] + match window: + case [ + # mov qword [r13 + FRAME_STACKPOINTER_OFFSET], r14 + Asm( + mnemonic="mov", + dst=Mem(base=_C.FRAME, offset=_C.FRAME_SP_OFS), + src=Reg(name=_C.SP), + ), + # test/cmp REG, IMM + Asm(mnemonic=op), + # jcc =>LABEL + Asm(target=branch_tgt), + # =>MERGE_LABEL: + Label(name=merge_name), + # mov r14, qword [r13 + FRAME_STACKPOINTER_OFFSET] + Asm( + mnemonic="mov", + dst=Reg(name=_C.SP), + src=Mem(base=_C.FRAME, offset=_C.FRAME_SP_OFS), + ), + ] if op in ("test", "cmp") and branch_tgt: + merge_lbl = f"=>{merge_name}" + for k in range(4): + result.append(lines[i + k]) + state.need_reload_before_jmp.add(merge_lbl) + _stat("SP1_store_reload_elim") + return 5 + return None + + +def _pattern_cold_reload_insert( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Insert stackpointer reload before cold-path jump back to merge label. + + After store-reload elimination, the dealloc cold path needs to reload + r14 from [r13+64] before jumping back (since _Py_Dealloc may have + modified [r13+64] via __del__). 
+ """ + if not state.need_reload_before_jmp: + return None + lines = program.lines + cur = program.parsed[i] + # Must be a jump or conditional branch + match cur: + case Asm(mnemonic=m, target=target_lbl) if target_lbl and ( + m == "jmp" or m in _BRANCH_MNEMONICS + ): + pass # fall through + case _: + return None + if target_lbl not in state.need_reload_before_jmp: + return None + # Only insert reload if we're after a call (dealloc path) + for prev in reversed(result): + prev_line = parse_line(prev) + match prev_line: + case Blank(): + continue + case CCall() | Asm(mnemonic="call"): + _stat("SP2_cold_reload_insert") + result.append(_SP_RELOAD_LINE) + case Asm(mnemonic="add", dst=Reg(name="rsp")): + _stat("SP2_cold_reload_insert") + result.append(_SP_RELOAD_LINE) + break + # Don't consume extra lines — just insert before the current line + return None + + +def _pattern_inverted_store_reload( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Defer stackpointer store to cold path when the hot branch skips it. + + Some stencils (e.g. _POP_TOP_r10) have an "inverted" pattern where the + conditional branch jumps to the merge point (hot path) and the + fallthrough goes to the cold path. The store/reload pair is redundant + on the hot path since r14 is preserved by callee-saved convention + across any C calls on the cold path. + + Matches the pattern: + | mov qword [r13 + 64], r14 ← store (line i) + | test/cmp REG, IMM ← branch condition (line i+1) + | jcc =>L(MERGE) ← hot: jump to merge (line i+2) + | jmp =>L(COLD) ← cold path redirect (line i+3) + ... intermediate comeback code (labels + instructions) ... 
+ |=>L(MERGE): ← merge label + | mov r14, qword [r13 + 64] ← reload (eliminated on hot path) + + Transforms to: + | test/cmp REG, IMM ← condition (moved up past store) + | jcc =>L(MERGE) ← hot: jump past store+reload + | mov qword [r13 + 64], r14 ← store (deferred, cold-only) + | jmp =>L(COLD) ← cold path redirect + ... intermediate comeback code ... + | mov r14, qword [r13 + 64] ← reload (moved before merge label) + |=>L(MERGE): ← merge point (hot enters here) + + Hot path saves 14 bytes (7-byte store + 7-byte reload) and 2 memory + accesses. The cold comeback path still gets the reload. + + Uses structural pattern matching for robust store_sp, reload_sp, branch, + and jump detection. + """ + lines = program.lines + if i + 5 >= len(lines): + return None + + # First 4 lines: store, test/cmp, jcc, jmp + w = [program.parsed[i + k] for k in range(4)] + match w: + case [ + Asm( + mnemonic="mov", + dst=Mem(base=_C.FRAME, offset=_C.FRAME_SP_OFS), + src=Reg(name=_C.SP), + ), + Asm(mnemonic=cmp_op), + Asm(target=merge_target), + Asm(mnemonic="jmp"), + ] if cmp_op in ("test", "cmp") and merge_target: + pass # fall through + case _: + return None + + # Scan forward to find the merge label and reload + merge_label_str = ( + f"|=>{merge_target[2:]}:" if merge_target.startswith("=>") else None + ) + if not merge_label_str: + return None + merge_idx = None + for j in range(i + 4, min(i + 20, len(lines))): + stripped = lines[j].strip() + if stripped.replace(" ", "").startswith( + merge_label_str.replace(" ", "") + ): + merge_idx = j + break + if merge_idx is None or merge_idx + 1 >= len(lines): + return None + match program.parsed[merge_idx + 1]: + case Asm( + mnemonic="mov", + dst=Reg(name=_C.SP), + src=Mem(base=_C.FRAME, offset=_C.FRAME_SP_OFS), + ): + pass # confirmed reload + case _: + return None + + # Pattern matched! Build the transformed output. 
+ result.append(lines[i + 1]) # test/cmp + result.append(lines[i + 2]) # jcc =>L(MERGE) + result.append(lines[i]) # store (now cold-only) + result.append(lines[i + 3]) # jmp =>L(COLD) + for k in range(i + 4, merge_idx): + result.append(lines[k]) + result.append(lines[merge_idx + 1]) # reload (before merge label) + result.append(lines[merge_idx]) # |=>L(MERGE): + _stat("SP3_inverted_store_reload") + return (merge_idx + 1) - i + 1 + + +def _pattern_test_memory_fold( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Pattern 14: Fold mov+test into test [mem] when loaded register is dead. + + When a mov loads a value from memory solely to test a bit/byte + and the register is dead after the conditional branch, we can + test the memory location directly, eliminating the mov. + + Matches (3 consecutive lines): + | mov REG, qword/dword [MEM] + | test REG_LOW, REG_LOW (or: test REG_LOW, IMM) + | jcc =>TARGET + + When REG is dead after the jcc, transforms to: + | cmp byte [MEM], 0 (for test REG, REG form) + | test byte [MEM], IMM (for test REG, IMM form) + | jcc =>TARGET + + Example (TIER2_RESUME_CHECK — very hot path): + Before: + | mov rax, qword [r15 + 24] + | test al, al + | jne =>instruction->jump_target + After: + | cmp byte [r15 + 24], 0 + | jne =>instruction->jump_target + """ + del state # unused + lines = program.lines + if i + 2 >= len(lines): + return None + window = [program.parsed[i + k] for k in range(3)] + match window: + # mov REG, qword/dword [MEM]; test REG_LOW, REG_LOW; jcc + case [ + Asm(mnemonic="mov", dst=Reg() as mov_dst, src=Mem() as mov_src), + Asm(mnemonic="test", dst=Reg() as test_dst, src=Reg() as test_src), + Asm(target=branch_target), + ] if ( + branch_target + and mov_dst.idx is not None + and test_dst.idx == mov_dst.idx + and test_src.idx == mov_dst.idx + and mov_src.size in ("qword", "dword") + ): + # Start deadness check from the jcc (i+2), not i+3, so + # both the fall-through AND 
the branch target are checked. + if not _reg_dead_after(program, i + 2, mov_dst.idx): + return None + mem_expr = mov_src.expr + _stat("P14_test_memory_fold") + result.append(f" | cmp byte {mem_expr}, 0\n") + result.append(lines[i + 2]) + return 3 + + # mov REG, qword/dword [MEM]; test REG_LOW, IMM; jcc + case [ + Asm(mnemonic="mov", dst=Reg() as mov_dst, src=Mem() as mov_src), + Asm(mnemonic="test", dst=Reg() as test_dst, src=Imm() as test_imm), + Asm(target=branch_target), + ] if ( + branch_target + and mov_dst.idx is not None + and test_dst.idx == mov_dst.idx + and mov_src.size in ("qword", "dword") + ): + if not _reg_dead_after(program, i + 2, mov_dst.idx): + return None + mem_expr = mov_src.expr + _stat("P14_test_memory_fold") + result.append(f" | test byte {mem_expr}, {test_imm.text}\n") + result.append(lines[i + 2]) + return 3 + return None + + +def _pattern_dead_null_check( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Pattern 13: Remove dead NULL check after PyStackRef tag creation. + + When creating a tagged _PyStackRef from a raw pointer, Clang emits a + NULL check that is actually dead code: the preceding ``movzx`` already + dereferences the pointer (at offset 6), so a NULL pointer would + segfault before the check could ever fire. + + Matches (5 consecutive lines): + | movzx edi, word [rax + 6] ← dereferences rax (proves non-NULL) + | and edi, 1 ← extract ob_flags deferred bit + | or rdi, rax ← create tagged ref: ptr | flag + | cmp rdi, 1 ← dead: rax!=NULL so rdi!=1 + | je =>L(N) ← dead branch (error path) + + Emits only the first 3 lines (tag creation), removing the dead check. 
+ + Example (from _BINARY_OP_MULTIPLY_FLOAT after freelist allocation): + Before: + | movzx edi, word [rax + 6] + | and edi, 1 + | or rdi, rax + | cmp rdi, 1 + | je =>L(3) ← removed (dead) + After: + | movzx edi, word [rax + 6] + | and edi, 1 + | or rdi, rax + + Uses structural pattern matching for robust operand checking. + """ + lines = program.lines + if i + 4 >= len(lines): + return None + window = [program.parsed[i + k] for k in range(5)] + match window: + case [ + # movzx edi, word [REG + 6] — dereferences REG (proves non-NULL) + Asm(mnemonic="movzx", src=Mem(size="word", base=deref_reg)), + # and edi, 1 — extract ob_flags deferred bit + Asm(mnemonic="and", src=Imm(value=1)), + # or rdi, REG — create tagged ref: ptr | flag + Asm(mnemonic="or", src=Reg(name=tagged_reg)), + # cmp rdi, 1 — dead NULL check + Asm(mnemonic="cmp", src=Imm(value=1)), + # je =>L(N) — dead branch (error path) + Asm(mnemonic="je", target=branch_target), + ] if deref_reg == tagged_reg.lower() and branch_target: + # Emit only the tag creation (first 3 lines), skip cmp + je + for k in range(3): + result.append(lines[i + k]) + _stat("P13_dead_null_check") + return 5 + return None + + +def _pattern_dead_frame_anchor( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Remove dead lea anchors introduced to force canonical stack frames. + + The JIT template intentionally forces Clang to materialize a fixed stack + frame. That can leave behind dead instructions like ``lea rax, [rbp-144]`` + whose only purpose was to keep the frame allocated. Those writes must not + leak into the stitched trace, since they clobber live cross-stencil + registers such as ``rax``. 
+ """ + del state + match program.parsed[i]: + case Asm( + mnemonic="lea", + dst=Reg() as dst, + src=Mem(base="rbp", index=None, scale=None, offset=offset), + ): + if offset >= 0 or dst.idx is None: + return None + if i + 1 < len(program.parsed): + next_effect = _line_effect(program.parsed[i + 1]) + if dst.idx in next_effect.reads: + return None + if not _is_dead_before_any_call(program, i, dst.idx): + return None + _stat("P18_dead_frame_anchor") + return 1 + return None + + +def _pattern_inverse_mov_restore( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Drop the redundant second move in ``mov A, B`` / ``mov B, A`` pairs. + + The first move already preserves the original value of ``B`` in ``A`` while + leaving ``B`` unchanged, so the immediate inverse move is a no-op. + """ + del state + if i + 1 >= len(program.parsed): + return None + match program.parsed[i], program.parsed[i + 1]: + case ( + Asm(mnemonic="mov", dst=Reg() as dst1, src=Reg() as src1), + Asm(mnemonic="mov", dst=Reg() as dst2, src=Reg() as src2), + ): + if ( + dst1.idx is None + or src1.idx is None + or dst2.idx is None + or src2.idx is None + ): + return None + if ( + dst1.idx != src2.idx + or src1.idx != dst2.idx + or dst1.bits != src1.bits + or dst2.bits != src2.bits + or dst1.bits != dst2.bits + ): + return None + result.append(program.lines[i]) + _stat("P19_inverse_mov_restore") + return 2 + return None + + +def _pass_fold_mov_imm( + program: _PeepholeProgram, + i: int, + result: list[str], + state: _PeepholeState, +) -> int | None: + """Driver wrapper for the emit_mov_imm fold family.""" + del state + if ( + isinstance(program.parsed[i], CCall) + and program.parsed[i].kind == CCallKind.MOV_IMM + ): + return _fold_mov_imm(program, i, result) + return None + + +# Pass registry — order matters for priority +_PEEPHOLE_PASSES = ( + _PeepholePass("mov_imm_folds", _pass_fold_mov_imm), + _PeepholePass( + "SP0_preserve_flags_mov_imm", 
_pattern_preserve_flags_mov_imm + ), + _PeepholePass("P13_dead_null_check", _pattern_dead_null_check), + _PeepholePass("P18_dead_frame_anchor", _pattern_dead_frame_anchor), + _PeepholePass("P19_inverse_mov_restore", _pattern_inverse_mov_restore), + _PeepholePass("P14_test_memory_fold", _pattern_test_memory_fold), + _PeepholePass("SP1_store_reload_elim", _pattern_store_reload_elim), + _PeepholePass("SP2_cold_reload_insert", _pattern_cold_reload_insert), + _PeepholePass("SP3_inverted_store_reload", _pattern_inverted_store_reload), +) diff --git a/Tools/jit/_dasc_writer.py b/Tools/jit/_dasc_writer.py new file mode 100644 index 00000000000000..93e613f7ae3344 --- /dev/null +++ b/Tools/jit/_dasc_writer.py @@ -0,0 +1,448 @@ +"""Generate jit_stencils.h via DynASM from converted stencil assembly. + +This module replaces _writer.py for the DynASM-based JIT pipeline. +It generates a .dasc file from converted stencils, runs the DynASM Lua +preprocessor on it, and produces a complete jit_stencils.h header. +""" + +from __future__ import annotations + +import pathlib +import re +import subprocess +import typing + +import _asm_to_dasc + + +# Path to the DynASM Lua preprocessor +_DYNASM_DIR = pathlib.Path(__file__).resolve().parent / "LuaJIT" / "dynasm" +_DYNASM_LUA = _DYNASM_DIR / "dynasm.lua" + + +def _generate_dasc_content( + stencils: dict[str, _asm_to_dasc.ConvertedStencil], + shim: _asm_to_dasc.ConvertedStencil | None = None, +) -> typing.Iterator[str]: + """Generate the contents of the .dasc file. + + This produces a C file with embedded DynASM directives that, when + processed by dynasm.lua, yields a header with action lists and + dasm_put() calls for each stencil. + """ + max_frame_size = max( + (stencil.frame_size for stencil in stencils.values()), default=0 + ) + + # Deduplicate static data blobs: group identical content under a + # single declaration, mapping per-stencil names → shared names. 
+ _data_by_content: dict[bytes, str] = {} # content → shared name + _data_name_map: dict[str, str] = {} # old per-stencil name → shared name + all_stencils = list(stencils.values()) + if shim: + all_stencils.append(shim) + for stencil in all_stencils: + for item in stencil.data_items: + safe = item.label.replace(".", "_") + old_name = f"jit_data_{stencil.opname}_{safe}" + content = bytes(item.data) + if content not in _data_by_content: + shared_name = f"jit_data_{len(_data_by_content)}" + _data_by_content[content] = shared_name + _data_name_map[old_name] = _data_by_content[content] + + # Simple identifier-based regex for data name substitution. + # Matches "jit_data_" followed by an identifier and looks it up + # in the map — much faster than a regex with 11K alternatives. + _DATA_RE = re.compile(r"\bjit_data_\w+") if _data_name_map else None + + def _resolve_data_names(lines: list[str]) -> list[str]: + """Replace per-stencil jit_data_OPNAME_LABEL with shared jit_data_N.""" + if _DATA_RE is None: + return lines + resolved = [] + for line in lines: + if "jit_data_" in line: + line = _DATA_RE.sub( + lambda m: _data_name_map.get(m.group(0), m.group(0)), + line, + ) + resolved.append(line) + return resolved + + yield "// Auto-generated by Tools/jit/_dasc_writer.py — DO NOT EDIT" + yield "// This file is processed by DynASM (dynasm.lua -D X64)" + yield "" + + # DynASM architecture and section definitions + yield "|.arch x64" + yield "|.section code, cold, data" + yield "|.actionlist jit_actionlist" + yield "" + + # Teach DynASM the standard x86-64 byte register names for registers + # 4-7 (spl, bpl, sil, dil). DynASM auto-generates r4b-r7b for these + # but the standard names are more readable. r8b-r15b are already + # known to DynASM natively. + yield "|.define spl, Rb(4)" + yield "|.define bpl, Rb(5)" + yield "|.define sil, Rb(6)" + yield "|.define dil, Rb(7)" + yield "" + + # Shorthand macro for label references — keeps emitted code compact + # and human-readable. 
L(n) references internal stencil labels. + yield "#define L(n) (label_base + (n))" + yield "" + + # Named register indices for emit_mov_imm() — human-readable + # alternative to raw integer indices. Prefixed with JREG_ to avoid + # collisions with system headers (e.g. ucontext.h REG_R8). + yield "#define JREG_RAX 0" + yield "#define JREG_RCX 1" + yield "#define JREG_RDX 2" + yield "#define JREG_RBX 3" + yield "#define JREG_RSP 4" + yield "#define JREG_RBP 5" + yield "#define JREG_RSI 6" + yield "#define JREG_RDI 7" + yield "#define JREG_R8 8" + yield "#define JREG_R9 9" + yield "#define JREG_R10 10" + yield "#define JREG_R11 11" + yield "#define JREG_R12 12" + yield "#define JREG_R13 13" + yield "#define JREG_R14 14" + yield "#define JREG_R15 15" + yield "" + + # jit_code_base is set by _PyJIT_Compile to the real allocation address. + # Always valid — jit_alloc() places code near CPython text via mmap hints. + yield "static uintptr_t jit_code_base;" + yield "" + + # Cross-stencil untag reuse: GUARD_TOS/NOS_FLOAT stencils compute + # rax = src_reg & -2 (pointer untagging). If the immediately following + # BINARY_OP_*_FLOAT stencil needs the same untag, it can reuse rax + yield "" + + # Runtime-optimal immediate load: picks the shortest encoding based + # on the actual value at JIT compile time. Like Pyston's emit_mov_imm. + # + # Encoding priority (shortest first): + # val == 0: xor Rd, Rd (2 bytes) + # val <= UINT32_MAX: mov Rd, imm32 (5 bytes) + # val near JIT code: lea Rq, [rip+disp32] (7 bytes) + # otherwise: mov64 Rq, imm64 (10 bytes) + # + # jit_code_base is always the real allocation address, so the LEA + # path is used whenever the value is within ±2GB of any point in the JIT code. 
+ yield "static void emit_mov_imm(dasm_State **Dst, int r, uintptr_t val) {" + yield " if (val == 0) {" + yield " | xor Rd(r), Rd(r)" + yield " } else if (val <= UINT32_MAX) {" + yield " | mov Rd(r), (unsigned int)val" + yield " } else {" + yield " intptr_t delta = (intptr_t)(val - jit_code_base);" + yield " intptr_t safe_radius = 0x7FFFFFFFLL - PY_MAX_JIT_CODE_SIZE - 15;" + yield " if (delta >= -safe_radius && delta <= safe_radius) {" + yield " | lea Rq(r), [&((void*)(uintptr_t)val)]" + yield " } else {" + yield " | mov64 Rq(r), (unsigned long)val" + yield " }" + yield " }" + yield "}" + yield "" + + # Flag-preserving immediate load. Same codegen as emit_mov_imm() except + # that the zero-immediate case must use mov instead of xor so condition + # codes remain intact for a following setcc/cmov/jcc. + yield "static void emit_mov_imm_preserve_flags(dasm_State **Dst, int r, uintptr_t val) {" + yield " if (val == 0) {" + yield " | mov Rd(r), 0" + yield " } else {" + yield " emit_mov_imm(Dst, r, val);" + yield " }" + yield "}" + yield "" + + # Direct relative call to an external function. Uses DynASM's &addr + # syntax which emits a 5-byte E8 rel32 instruction. + # + # Falls back to mov+call for targets beyond ±2GB (e.g. shared + # library functions when JIT code is mapped far from them). + yield "static void emit_call_ext(dasm_State **Dst, void *addr) {" + yield " intptr_t delta = (intptr_t)((uintptr_t)addr - (uintptr_t)jit_code_base);" + yield " intptr_t safe_radius = 0x7FFFFFFFLL - PY_MAX_JIT_CODE_SIZE - 15;" + yield " if (delta >= -safe_radius && delta <= safe_radius) {" + yield " | call qword &addr // 5-byte E8 rel32" + yield " } else {" + yield " emit_mov_imm(Dst, JREG_RAX, (unsigned long)(uintptr_t)addr);" + yield " | call rax" + yield " }" + yield "}" + yield "" + + # Generalized ALU register-vs-immediate helper. Handles all commutative + # and comparison ALU operations: cmp, test, and, or, xor. 
+ # When the immediate fits in sign-extended imm32, uses the direct form; + # otherwise loads into scratch register first. + for alu_op in ("cmp", "test", "and", "or", "xor"): + func_name = f"emit_{alu_op}_reg_imm" + yield f"__attribute__((unused)) // may be unused; suppresses a compiler warning" + yield f"static void {func_name}(dasm_State **Dst, int r, int scratch, uintptr_t val) {{" + yield " if ((int64_t)val == (int32_t)val) {" + yield f" | {alu_op} Rq(r), (int)val" + yield " } else {" + yield " emit_mov_imm(Dst, scratch, val);" + yield f" | {alu_op} Rq(r), Rq(scratch)" + yield " }" + yield "}" + yield "" + + # 64-bit memory-vs-immediate compare helper for simple [base + offset] + # operands. This replaces the old multiline inline if/else fallback in + # _asm_to_dasc.py with one helper call at the use site. + yield "__attribute__((unused)) // may be unused; suppresses a compiler warning" + yield "static void emit_cmp_mem64_imm(" + yield " dasm_State **Dst, int r_mem, long offset, int scratch, uintptr_t val" + yield ") {" + yield " if ((int64_t)val == (int32_t)val) {" + yield " | cmp qword [Rq(r_mem)+ offset], (int)val" + yield " } else {" + yield " emit_mov_imm(Dst, scratch, val);" + yield " | cmp qword [Rq(r_mem)+ offset], Rq(scratch)" + yield " }" + yield "}" + yield "" + + # 64-bit memory store with immediate value. When the value fits in + # sign-extended imm32, uses a direct mov qword [mem], imm32. + # Otherwise loads into scratch register first.
+ yield "__attribute__((unused)) // may be unused; suppresses a compiler warning" + yield "static void emit_store_mem64_imm(" + yield " dasm_State **Dst, int r_mem, long offset, int scratch, uintptr_t val" + yield ") {" + yield " if ((int64_t)val == (int32_t)val) {" + yield " | mov qword [Rq(r_mem)+ offset], (int)val" + yield " } else {" + yield " emit_mov_imm(Dst, scratch, val);" + yield " | mov qword [Rq(r_mem)+ offset], Rq(scratch)" + yield " }" + yield "}" + yield "" + + # _SET_IP delta helper: replace movabs+store (14 bytes) with + # add qword [frame+IP_OFFSET], delta (8 bytes) for subsequent _SET_IP ops. + yield "static void emit_set_ip_delta(dasm_State **Dst, int uop_label, intptr_t delta) {" + yield " |.code" + yield " |=>uop_label:" + yield ( + " | add qword [r13 + " + + str(_asm_to_dasc.FRAME_IP_OFFSET) + + "], (int)(delta)" + ) + yield "}" + yield "" + + yield "static int jit_max_frame_size(void) {" + yield f" return {max_frame_size};" + yield "}" + yield "" + + yield "static void emit_trace_entry_frame(dasm_State **Dst) {" + yield " |.code" + yield " | push rbp" + yield " | mov rbp, rsp" + yield " | sub rsp, jit_max_frame_size()" + yield "}" + yield "" + + # Deduplicated static data blobs (assert strings, etc.)
+ for content, shared_name in sorted( + _data_by_content.items(), key=lambda x: x[1] + ): + vals = ", ".join(str(b) for b in content) + yield f"static const char {shared_name}[] = {{{vals}}};" + yield "" + + # Emit function for each stencil + for opname, stencil in sorted(stencils.items()): + yield f"static void emit_{opname}(" + yield " dasm_State **Dst," + yield " const _PyUOpInstruction *instruction," + yield " int uop_label," + yield " int continue_label," + yield " int label_base" + yield ") {" + stencil_lines = _resolve_data_names(stencil.lines) + for line in stencil_lines: + yield line + yield "}" + yield "" + + # Shim emit function (not rewritten — has its own prologue/epilogue) + if shim: + yield "static void emit_shim(dasm_State **Dst, int uop_label, int label_base) {" + for line in _resolve_data_names(shim.lines): + yield line + yield "}" + yield "" + + # Shim internal-label-count helper + if shim: + yield f"static int jit_internal_label_count_shim(void) {{ return {shim.num_internal_labels}; }}" + else: + yield "static int jit_internal_label_count_shim(void) { return 0; }" + yield "" + + # Emit function type + yield "typedef void (*jit_emit_fn)(" + yield " dasm_State **Dst," + yield " const _PyUOpInstruction *instruction," + yield " int uop_label," + yield " int continue_label," + yield " int label_base" + yield ");" + yield "" + + # Stencil descriptor table: function pointer + label count + whether the + # stencil invalidates the tracked frame->ip value on the hot path. + yield "static const struct {" + yield " jit_emit_fn emit;" + yield " int label_count;" + yield " int invalidates_ip;" # stencil writes r13 or frame->ip on hot path + yield f"}} jit_stencil_table[MAX_UOP_REGS_ID + 1] = {{" + for opname, stencil in sorted(stencils.items()): + # Detect if the stencil invalidates our tracked frame->ip value. + # Two cases (both checked on the hot path only): + # 1. Writes to r13 directly (frame pointer changes) + # 2. 
Writes to [r13 + ] (frame->ip modified by the stencil) + invalidates_ip = 0 + _ip_mem = f"r13 + {_asm_to_dasc.FRAME_IP_OFFSET}" + for line in stencil.lines: + if ".cold" in line: + break + stripped = line.strip() + # Case 1: "mov r13, " (not "mov qword [r13+...]") + if "mov r13," in stripped: + before_r13 = stripped.split("r13,")[0] + if "[" not in before_r13: + invalidates_ip = 1 + break + # Case 2: write to [r13 + ] (frame->ip) + if _ip_mem in stripped: + for op in ( + f"mov qword [{_ip_mem}]", + f"add qword [{_ip_mem}]", + f"sub qword [{_ip_mem}]", + ): + if op in stripped: + invalidates_ip = 1 + break + if invalidates_ip: + break + yield ( + f" [{opname}] = {{ emit_{opname}, " + f"{stencil.num_internal_labels}, {invalidates_ip} }}," + ) + yield "};" + yield "" + + # Thin wrappers used by jit.c + yield "static int jit_internal_label_count(int opcode) {" + yield " return jit_stencil_table[opcode].label_count;" + yield "}" + yield "" + yield "static int jit_invalidates_ip(int opcode) {" + yield " return jit_stencil_table[opcode].invalidates_ip;" + yield "}" + yield "" + yield "static void jit_emit_one(" + yield " dasm_State **Dst," + yield " int opcode," + yield " const _PyUOpInstruction *instruction," + yield " int uop_label," + yield " int continue_label," + yield " int label_base" + yield ") {" + yield " jit_stencil_table[opcode].emit(Dst, instruction, uop_label, continue_label, label_base);" + yield "}" + + +def write_dasc( + dasc_path: pathlib.Path, + stencils: dict[str, _asm_to_dasc.ConvertedStencil], + shim: _asm_to_dasc.ConvertedStencil | None = None, +) -> None: + """Write the .dasc file to disk.""" + with dasc_path.open("w") as f: + for line in _generate_dasc_content(stencils, shim): + f.write(line) + f.write("\n") + + +def run_dynasm( + dasc_path: pathlib.Path, + output_path: pathlib.Path, + *, + luajit: str = "luajit", +) -> None: + """Run the DynASM Lua preprocessor to generate the C header. 
+ + Args: + dasc_path: Path to the .dasc file + output_path: Path for the generated .h output + luajit: Path to luajit binary + """ + if not _DYNASM_LUA.exists(): + raise FileNotFoundError( + f"DynASM preprocessor not found at {_DYNASM_LUA}.\n" + f"Ensure the LuaJIT submodule is initialized:\n" + f" git submodule update --init Tools/jit/LuaJIT" + ) + cmd = [ + luajit, + str(_DYNASM_LUA), + "-D", + "X64", + "-o", + str(output_path), + str(dasc_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError( + f"DynASM preprocessing failed:\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}\n" + f"Command: {' '.join(cmd)}" + ) + + +def dump_header( + stencils: dict[str, _asm_to_dasc.ConvertedStencil], + shim: _asm_to_dasc.ConvertedStencil | None = None, + *, + dasc_path: pathlib.Path, + luajit: str = "luajit", +) -> str: + """Generate jit_stencils.h content via DynASM. + + 1. Writes a .dasc file + 2. Runs dynasm.lua to produce a .h file + 3. 
Returns the .h file content + + Args: + stencils: Dict mapping opname to ConvertedStencil + shim: Optional shim stencil + dasc_path: Path for the intermediate .dasc file + luajit: Path to luajit binary + + Returns: + The generated header content as a string + """ + output_path = dasc_path.with_suffix(".h") + write_dasc(dasc_path, stencils, shim) + run_dynasm(dasc_path, output_path, luajit=luajit) + return output_path.read_text() diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py index 83c878d8fe205b..0561ef824cce05 100644 --- a/Tools/jit/_optimizers.py +++ b/Tools/jit/_optimizers.py @@ -63,42 +63,40 @@ "hi": "ls", "ls": "hi", } -# MyPy doesn't understand that a invariant variable can be initialized by a covariant value -CUSTOM_AARCH64_BRANCH19: str | None = "CUSTOM_AARCH64_BRANCH19" - _AARCH64_SHORT_BRANCHES = { "tbz": "tbnz", "tbnz": "tbz", } -# Branches are either b.{cond}, bc.{cond}, cbz, cbnz, tbz or tbnz +# Branches are either b.{cond}, bc.{cond}, cbz, cbnz, tbz or tbnz. +# Second tuple element unused (was for relocation fixup, now handled by DynASM). _AARCH64_BRANCHES: dict[str, tuple[str | None, str | None]] = ( { - "b." + cond: (("b." + inverse if inverse else None), CUSTOM_AARCH64_BRANCH19) + "b." + cond: (("b." + inverse if inverse else None), None) for (cond, inverse) in _AARCH64_COND_CODES.items() } | { - "bc." + cond: (("bc." + inverse if inverse else None), CUSTOM_AARCH64_BRANCH19) + "bc." + cond: (("bc." 
+ inverse if inverse else None), None) for (cond, inverse) in _AARCH64_COND_CODES.items() } | { - "cbz": ("cbnz", CUSTOM_AARCH64_BRANCH19), - "cbnz": ("cbz", CUSTOM_AARCH64_BRANCH19), + "cbz": ("cbnz", None), + "cbnz": ("cbz", None), + } + | { + cond: (inverse, None) + for (cond, inverse) in _AARCH64_SHORT_BRANCHES.items() } - | {cond: (inverse, None) for (cond, inverse) in _AARCH64_SHORT_BRANCHES.items()} ) @enum.unique class InstructionKind(enum.Enum): - JUMP = enum.auto() LONG_BRANCH = enum.auto() SHORT_BRANCH = enum.auto() CALL = enum.auto() RETURN = enum.auto() - SMALL_CONST_1 = enum.auto() - SMALL_CONST_2 = enum.auto() OTHER = enum.auto() @@ -110,12 +108,18 @@ class Instruction: target: str | None def is_branch(self) -> bool: - return self.kind in (InstructionKind.LONG_BRANCH, InstructionKind.SHORT_BRANCH) + return self.kind in ( + InstructionKind.LONG_BRANCH, + InstructionKind.SHORT_BRANCH, + ) def update_target(self, target: str) -> "Instruction": assert self.target is not None return Instruction( - self.kind, self.name, self.text.replace(self.target, target), target + self.kind, + self.name, + self.text.replace(self.target, target), + target, ) def update_name_and_target(self, name: str, target: str) -> "Instruction": @@ -164,7 +168,9 @@ class Optimizer: re_global: re.Pattern[str] # The first block in the linked list: _root: _Block = dataclasses.field(init=False, default_factory=_Block) - _labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict) + _labels: dict[str, _Block] = dataclasses.field( + init=False, default_factory=dict + ) # No groups: _re_noninstructions: typing.ClassVar[re.Pattern[str]] = re.compile( r"\s*(?:\.|#|//|;|$)" @@ -174,8 +180,6 @@ class Optimizer: r'\s*(?P