diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f0c67752..64dcf17a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -69,4 +69,5 @@ jobs:
       - name: Run test suite (parallel)
         env:
           RSPEC_QUIET_OUTPUT: "1"
+          RSPEC_TIMEOUT: "20"
         run: bundle exec rake pspec
diff --git a/.gitignore b/.gitignore
index 83ff9f09..ddb75c9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,8 +49,22 @@ lib/rhdl/codegen/ir/sim/ir_compiler/*.json
 .arcilator_build/
 .arcilator_gpu_build/
 
+# Metal backend build artifacts
+.metal_build/
+.metal_probe/
+
 # HDL build artifacts (Verilator/Arcilator for example systems)
 .hdl_build/
+/examples/8bit/.gem_metal_cpu8bit/
+/examples/apple2/.gem_metal_apple2/
+/examples/riscv/.gem_metal_riscv/
+
+# Submodule trees
+/examples/apple2/reference/
+/examples/gameboy/reference/
+/examples/riscv/software/xv6/
+/examples/riscv/software/linux/
+/external/GEM/
 
 # Web simulator generated artifacts
 web/assets/fixtures/*
@@ -95,3 +109,12 @@ web/build/verilator/*
 /examples/riscv/software/.buildroot.defconfig
 /examples/riscv/software/.docker-config*/
 /examples/riscv/software/.tmp_*/
+
+# AO486 local scratch/import artifacts
+/examples/ao486/reference/
+/examples/ao486/tmp/
+/examples/ao486/software/tmp/
+/examples/ao486/hdl/tmp/
+/examples/ao486/hdl/reports/
+/examples/ao486/hdl/vendor/source_hdl/
+/examples/ao486/hdl/lib/hdl/modules/frontend_blackbox_stubs__*.rb
diff --git a/.gitmodules b/.gitmodules
index 3952e987..d13da972 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,12 +1,20 @@
 [submodule "examples/apple2/reference"]
 	path = examples/apple2/reference
 	url = https://github.com/zf3/neoapple2
+	ignore = all
 [submodule "examples/gameboy/reference"]
 	path = examples/gameboy/reference
 	url = https://github.com/MiSTer-devel/Gameboy_MiSTer.git
+	ignore = all
 [submodule "examples/riscv/software/xv6"]
 	path = examples/riscv/software/xv6
 	url = https://github.com/michaelengel/xv6-rv32.git
+	ignore = all
 [submodule "examples/riscv/software/linux"]
 	path = examples/riscv/software/linux
 	url = https://github.com/torvalds/linux.git
+	ignore = all
+[submodule "external/GEM"]
+	path = external/GEM
+	url = https://github.com/skryl/GEM
+	ignore = all
diff --git a/README.md b/README.md
index 126d5795..bb3bc7da 100644
--- a/README.md
+++ b/README.md
@@ -613,14 +613,20 @@ RHDL includes benchmarking tasks to measure simulation performance across backen
 
 ```bash
 rake bench:native[gates]             # Benchmark gate-level simulation
-rake bench:native[cpu8bit,cycles]    # Benchmark 8-bit CPU FastHarness (compiler vs arcilator_gpu)
+rake bench:native[cpu8bit,cycles]    # Benchmark 8-bit CPU FastHarness (compiler, arcilator_gpu, GemMetal)
+RHDL_BENCH_BACKENDS=gem_metal rake bench:native[cpu8bit,cycles] # Benchmark only GemMetal
 rake bench:native[mos6502,cycles]    # Benchmark MOS 6502 CPU
-rake bench:native[apple2,cycles]     # Benchmark Apple II full system
+rake bench:native[apple2,cycles]     # Benchmark Apple II full system (including GemMetal)
+RHDL_BENCH_BACKENDS=gem_metal rake bench:native[apple2,cycles] # Benchmark Apple II with only GemMetal
+rake bench:native[riscv,cycles]      # Benchmark RISC-V single-cycle core (includes GemMetal)
+RHDL_BENCH_BACKENDS=gem_metal rake bench:native[riscv,cycles] # Benchmark only RISC-V GemMetal
 rake bench:native[gameboy,frames]    # Benchmark GameBoy with Prince of Persia
 rake bench:web[apple2,cycles] # Benchmark Apple II web WASM backends
 rake bench:web[riscv,cycles]  # Benchmark RISC-V web WASM backends
 ```
 
+GemMetal is part of the default native backend set for CPU8bit, Apple II, and RISC-V. Use `RHDL_BENCH_BACKENDS` to restrict the runners, for example `RHDL_BENCH_BACKENDS=gem_metal bundle exec rake bench:native[cpu8bit,5000000]`, `RHDL_BENCH_BACKENDS=compiler,gem_metal bundle exec rake bench:native[apple2,2000000]`, or `RHDL_BENCH_BACKENDS=gem_metal bundle exec rake bench:native[riscv,500000]`. `gem` remains an alias for `gem_metal`. On RISC-V, the GemMetal path benchmarks an MMU-off synthesized core netlist via `metal_dummy_test`; it does not load xv6 images. For CPU8bit ArcilatorGPU throughput mode, set `RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES=<n>` or the benchmark-wide fallback `RHDL_BENCH_ARCILATOR_GPU_INSTANCES=<n>`.
+
 **Sample Results (1M cycles):**
 
 | System | JIT | Compiler | Verilator | Compiler Speedup |
@@ -650,7 +656,13 @@ bundle exec rake pspec[riscv]          # Run RISC-V specs in parallel
 bundle exec rake spec:bench[riscv,20]  # Benchmark 20 RISC-V spec files
 bundle exec rake bench:native[ir,5000000]     # Benchmark IR runners
 bundle exec rake bench:native[gates]          # Benchmark gate-level simulation
-bundle exec rake bench:native[cpu8bit,5000000] # Benchmark 8-bit CPU compiler vs arcilator_gpu
+bundle exec rake bench:native[cpu8bit,5000000] # Benchmark 8-bit CPU compiler vs arcilator_gpu vs GemMetal
+RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES=8 bundle exec rake bench:native[cpu8bit,5000000] # Benchmark CPU8bit ArcilatorGPU with 8 mirrored instances
+bundle exec rake bench:native[apple2,2000000]  # Benchmark Apple II including GemMetal
+bundle exec rake bench:native[riscv,500000]    # Benchmark RISC-V including GemMetal
+RHDL_BENCH_BACKENDS=gem_metal bundle exec rake bench:native[cpu8bit,5000000] # Benchmark only CPU8bit GemMetal
+RHDL_BENCH_BACKENDS=gem_metal bundle exec rake bench:native[apple2,2000000]  # Benchmark only Apple II GemMetal
+RHDL_BENCH_BACKENDS=gem_metal bundle exec rake bench:native[riscv,500000]    # Benchmark only RISC-V GemMetal
 bundle exec rake bench:web[apple2]     # Benchmark Apple II web compiler vs arcilator vs verilator
 bundle exec rake bench:web[riscv]      # Benchmark RISC-V web compiler vs arcilator vs verilator
 
diff --git a/docs/simulation.md b/docs/simulation.md
index c740a343..9883a536 100644
--- a/docs/simulation.md
+++ b/docs/simulation.md
@@ -644,7 +644,9 @@ RHDL_BENCH_LANES=128 RHDL_BENCH_CYCLES=1000000 rake bench:native[gates]
 
 # 8-bit CPU FastHarness benchmark
 rake bench:native[cpu8bit]
-rake bench:native[cpu8bit,5000000]  # 5M cycles
+rake bench:native[cpu8bit,5000000]  # 5M cycles, includes GemMetal by default
+RHDL_BENCH_BACKENDS=gem_metal rake bench:native[cpu8bit,5000000]  # GemMetal only
+RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES=8 rake bench:native[cpu8bit,5000000]  # ArcilatorGPU throughput mode
 
 # MOS6502 CPU benchmark
 rake bench:native[mos6502]
@@ -652,7 +654,13 @@ rake bench:native[mos6502,5000000]  # 5M cycles
 
 # Apple II full system benchmark
 rake bench:native[apple2]
-rake bench:native[apple2,2000000]  # 2M cycles
+rake bench:native[apple2,2000000]  # 2M cycles, includes GemMetal by default
+RHDL_BENCH_BACKENDS=gem_metal rake bench:native[apple2,2000000]  # GemMetal only
+
+# RISC-V single-cycle benchmark
+rake bench:native[riscv]
+rake bench:native[riscv,500000]  # includes GemMetal by default
+RHDL_BENCH_BACKENDS=gem_metal rake bench:native[riscv,500000]  # GemMetal only
 
 # Legacy alias removed (use bench scopes)
 
@@ -667,6 +675,8 @@ rake spec:bench:timing      # Per-file timing
 rake spec:bench:quick       # Category summary
 ```
 
+GemMetal is part of the default native backend set for CPU8bit, Apple II, and RISC-V. Set `RHDL_BENCH_BACKENDS` to restrict the runners; `gem` remains an alias for `gem_metal`. On RISC-V, the GemMetal path benchmarks an MMU-off synthesized core netlist via `metal_dummy_test`; it does not load xv6 images.
+
 ### Programmatic Benchmarking
 
 ```ruby
diff --git a/examples/8bit/hdl/cpu/harness.rb b/examples/8bit/hdl/cpu/harness.rb
index 9f881b6a..808539fd 100644
--- a/examples/8bit/hdl/cpu/harness.rb
+++ b/examples/8bit/hdl/cpu/harness.rb
@@ -77,6 +77,28 @@ def self.arcilator_gpu_status
           }
         end
 
+        # Report whether the plain arcilator runner can be used. If the runner
+        # file or its constants cannot be loaded, a not-ready status hash that
+        # carries the load error is returned instead of raising.
+        def self.arcilator_status
+          require_relative '../../utilities/runners/arcilator_runner'
+          RHDL::Examples::CPU8Bit::ArcilatorRunner.status
+        rescue LoadError, NameError => e
+          reason = "arcilator runner unavailable: #{e.message}"
+          { ready: false, missing_tools: [], missing_capabilities: [reason] }
+        end
+
+        # Report whether the verilator runner can be used. If the runner file
+        # or its constants cannot be loaded, a not-ready status hash that
+        # carries the load error is returned instead of raising.
+        def self.verilator_status
+          require_relative '../../utilities/runners/verilator_runner'
+          RHDL::Examples::CPU8Bit::VerilatorRunner.status
+        rescue LoadError, NameError => e
+          reason = "verilator runner unavailable: #{e.message}"
+          { ready: false, missing_tools: [], missing_capabilities: [reason] }
+        end
+
         def self.ensure_arcilator_gpu_available!
           status = arcilator_gpu_status
           return true if status[:ready]
@@ -89,6 +111,30 @@ def self.ensure_arcilator_gpu_available!
             "Install an ArcToGPU-enabled arcilator build plus Metal/SPIR-V toolchain support."
         end
 
+        # Return true when arcilator is usable; otherwise raise ArgumentError naming what is missing.
+        def self.ensure_arcilator_available!
+          report = arcilator_status
+          return true if report[:ready]
+
+          details = %i[missing_tools missing_capabilities].map do |key|
+            "#{key.to_s.tr('_', ' ')}: #{report[key].join(', ')}" unless report[key].empty?
+          end.compact
+          raise ArgumentError, "arcilator backend unavailable (#{details.join('; ')}). " \
+            "Install an arcilator/firtool-enabled toolchain."
+        end
+
+        # Return true when verilator is usable; otherwise raise ArgumentError naming what is missing.
+        def self.ensure_verilator_available!
+          report = verilator_status
+          return true if report[:ready]
+
+          details = %i[missing_tools missing_capabilities].map do |key|
+            "#{key.to_s.tr('_', ' ')}: #{report[key].join(', ')}" unless report[key].empty?
+          end.compact
+          raise ArgumentError, "verilator backend unavailable (#{details.join('; ')}). " \
+            "Install a verilator-enabled native build toolchain."
+        end
+
         def initialize(external_memory = nil, sim: :compile)
           require 'rhdl/codegen'
 
@@ -96,10 +142,23 @@ def initialize(external_memory = nil, sim: :compile)
           @halted = false
           @sim_backend = normalize_sim_backend(sim)
 
-          if arcilator_gpu_mode?
-            self.class.ensure_arcilator_gpu_available!
-            require_relative '../../utilities/runners/arcilator_gpu_runner'
-            @sim = RHDL::Examples::CPU8Bit::ArcilatorGpuRunner.new
+          if runner_backend_mode?
+            @sim = case @sim_backend
+            when :arcilator_gpu
+              self.class.ensure_arcilator_gpu_available!
+              require_relative '../../utilities/runners/arcilator_gpu_runner'
+              RHDL::Examples::CPU8Bit::ArcilatorGpuRunner.new
+            when :arcilator
+              self.class.ensure_arcilator_available!
+              require_relative '../../utilities/runners/arcilator_runner'
+              RHDL::Examples::CPU8Bit::ArcilatorRunner.new
+            when :verilator
+              self.class.ensure_verilator_available!
+              require_relative '../../utilities/runners/verilator_runner'
+              RHDL::Examples::CPU8Bit::VerilatorRunner.new
+            else
+              raise ArgumentError, "Unsupported runner backend: #{@sim_backend.inspect}"
+            end
             @memory = RunnerMemory64K.new(@sim)
             ensure_runner_cpu8bit_mode!
           else
@@ -132,7 +191,17 @@ def native?
         end
 
         def backend
-          arcilator_gpu_mode? ? :arcilator_gpu : @sim.backend
+          return @sim_backend if runner_backend_mode?
+
+          @sim.backend
+        end
+
+        # Parallel instance count reported by the runner (never less than 1).
+        # Non-runner backends and runners without a count report 1.
+        def parallel_instances
+          return 1 unless runner_backend_mode? && @sim.respond_to?(:runner_parallel_instances)
+
+          [@sim.runner_parallel_instances.to_i, 1].max
+        end
 
         # Read CPU state
@@ -172,7 +241,7 @@ def reset
           @halted = false
           @cycle_count = 0
 
-          if arcilator_gpu_mode?
+          if runner_backend_mode?
             @sim.poke('rst', 1)
             run_runner_cycles(1)
             @sim.poke('rst', 0)
@@ -189,7 +258,7 @@ def reset
         end
 
         def clock_cycle
-          if arcilator_gpu_mode?
+          if runner_backend_mode?
             run_runner_cycles(1)
             @halted = true if @sim.peek('halted') == 1
             return
@@ -224,7 +293,7 @@ def run_cycles(count, batch_size: 1024)
           n = count.to_i
           return 0 if n <= 0
 
-          unless arcilator_gpu_mode?
+          unless runner_backend_mode?
             ran = 0
             n.times do
               break if @halted
@@ -236,22 +305,9 @@ def run_cycles(count, batch_size: 1024)
             return ran
           end
 
-          remaining = n
-          ran = 0
-          batch = [batch_size.to_i, 1].max
-          while remaining.positive?
-            step = [remaining, batch].min
-            batch_ran = run_runner_cycles(step)
-            break if batch_ran <= 0
-
-            ran += batch_ran
-            remaining -= batch_ran
-            if @sim.peek('halted') == 1
-              @halted = true
-              break
-            end
-          end
-
+          # Native runner backends handle internal batching/scheduling.
+          # Keep host-side execution as a single call to avoid extra sync points.
+          ran = run_runner_cycles(n)
           @cycle_count += ran
           @halted = true if @sim.peek('halted') == 1
           ran
@@ -291,6 +347,8 @@ def get_input(name)
         def normalize_sim_backend(sim)
           sym = sim.to_sym
           return :compile if sym == :compiler
+          return :arcilator_gpu if sym == :arc_to_gpu
+          return :arcilator if sym == :arc
 
           sym
         end
@@ -303,11 +361,23 @@ def arcilator_gpu_mode?
           @sim_backend == :arcilator_gpu
         end
 
+        def arcilator_mode?
+          @sim_backend == :arcilator
+        end
+
+        def verilator_mode?
+          @sim_backend == :verilator
+        end
+
+        def runner_backend_mode?
+          arcilator_gpu_mode? || arcilator_mode? || verilator_mode?
+        end
+
         def ensure_runner_cpu8bit_mode!
           return if @sim.runner_mode? && @sim.runner_kind == :cpu8bit
 
           raise ArgumentError,
-            "arcilator_gpu backend requires native cpu8bit runner mode " \
+            "#{@sim_backend} backend requires native cpu8bit runner mode " \
             "(runner_mode=#{@sim.runner_mode?}, runner_kind=#{@sim.runner_kind.inspect})"
         end
 
diff --git a/examples/8bit/utilities/runners/arcilator_gpu_runner.rb b/examples/8bit/utilities/runners/arcilator_gpu_runner.rb
index 3f989b7c..a9358932 100644
--- a/examples/8bit/utilities/runners/arcilator_gpu_runner.rb
+++ b/examples/8bit/utilities/runners/arcilator_gpu_runner.rb
@@ -19,6 +19,7 @@ module CPU8Bit
       #   -> clang/llc object -> C++ shim .so/.dylib -> Fiddle
       class ArcilatorGpuRunner
         BUILD_DIR = File.expand_path('../../.arcilator_gpu_build', __dir__)
+        MAX_INSTANCE_COUNT = 1024
 
         REQUIRED_TOOLS = %w[firtool arcilator mlir-opt spirv-cross].freeze
         GPU_OPTION_PATTERNS = [
@@ -43,7 +44,7 @@ class ArcilatorGpuRunner
           zero_flag_out
         ].freeze
 
-        attr_reader :backend
+        attr_reader :backend, :instance_count
 
         def self.status
           missing_tools = REQUIRED_TOOLS.reject { |tool| command_available?(tool) }
@@ -53,9 +54,6 @@ def self.status
           arcilator_help = command_output(%w[arcilator --help])
           gpu_option_tokens = detect_gpu_option_tokens(arcilator_help)
           missing_capabilities = []
-          if gpu_option_tokens.nil? || gpu_option_tokens.empty?
-            missing_capabilities << 'ArcToGPU arcilator lowering flag'
-          end
 
           if macos_host?
             missing_tools << 'xcrun' unless command_available?('xcrun')
@@ -64,10 +62,10 @@ def self.status
           end
 
           {
-            ready: missing_tools.empty? && missing_capabilities.empty?,
+            ready: missing_tools.empty?,
             missing_tools: missing_tools.uniq,
             missing_capabilities: missing_capabilities,
-            gpu_option_tokens: gpu_option_tokens || []
+            gpu_option_tokens: gpu_option_tokens
           }
         end
 
@@ -81,7 +79,7 @@ def self.ensure_available!
 
           raise ArgumentError,
             "arcilator_gpu backend unavailable (#{details.join('; ')}). " \
-            "Install an ArcToGPU-enabled arcilator build and required Metal/SPIR-V toolchain tools."
+            "Install required arcilator/firtool and Metal/SPIR-V toolchain tools."
         end
 
         def self.detect_gpu_option_tokens(help_text)
@@ -89,7 +87,7 @@ def self.detect_gpu_option_tokens(help_text)
           return Shellwords.split(env_value) unless env_value.empty?
 
           text = help_text.to_s
-          return nil if text.empty?
+          return [] if text.empty?
 
           GPU_OPTION_PATTERNS.each do |opt|
             return [opt] if text.include?(opt)
@@ -102,11 +100,12 @@ def self.detect_gpu_option_tokens(help_text)
             return [token] if token
           end
 
-          nil
+          []
         end
 
-        def initialize
+        def initialize(instances: nil)
           @backend = :arcilator_gpu
+          @instance_count = normalize_instance_count(instances)
           @gpu_info = self.class.ensure_available!
           @gpu_option_tokens = @gpu_info[:gpu_option_tokens]
           build_simulation
@@ -126,6 +125,10 @@ def runner_kind
           :cpu8bit
         end
 
+        def runner_parallel_instances
+          @instance_count
+        end
+
         def evaluate
           @fn_sim_eval.call(@ctx)
           nil
@@ -215,6 +218,13 @@ def command_available?(tool)
           self.class.command_available?(tool)
         end
 
+        # Pick the instance count from the argument or, failing that, the env
+        # overrides, then bound it to 1..MAX_INSTANCE_COUNT.
+        def normalize_instance_count(instances)
+          requested = instances || ENV['RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES'] || ENV['RHDL_BENCH_ARCILATOR_GPU_INSTANCES']
+          requested.to_i.clamp(1, MAX_INSTANCE_COUNT)
+        end
+
         def build_simulation
           FileUtils.mkdir_p(BUILD_DIR)
 
@@ -224,15 +234,30 @@ def build_simulation
           state_file = File.join(BUILD_DIR, 'cpu8bit_state.json')
           obj_file = File.join(BUILD_DIR, 'cpu8bit_arcgpu.o')
           wrapper_file = File.join(BUILD_DIR, 'cpu8bit_arcgpu_wrapper.cpp')
+          runner_file = __FILE__
+
+          firrtl_changed = write_file_if_changed(
+            fir_file,
+            RHDL::Codegen::CIRCT::FIRRTL.generate(RHDL::HDL::CPU::CPU.to_flat_ir(top_name: 'cpu8bit'))
+          )
 
-          needs_rebuild = !File.exist?(shared_lib_path)
+          codegen_needed = firrtl_changed
+          codegen_needed ||= !File.exist?(mlir_file) || !File.exist?(ll_file) || !File.exist?(state_file) || !File.exist?(obj_file)
+          codegen_needed ||= File.exist?(obj_file) && File.mtime(runner_file) > File.mtime(obj_file)
 
-          if needs_rebuild
-            export_firrtl(fir_file)
+          if codegen_needed
             compile_with_arcilator(fir_file, mlir_file, ll_file, state_file, obj_file)
-            write_wrapper(wrapper_file, state_file)
-            link_shared_library(wrapper_file, obj_file, shared_lib_path)
           end
+
+          wrapper_changed = write_wrapper(wrapper_file, state_file)
+
+          needs_link = !File.exist?(shared_lib_path)
+          needs_link ||= codegen_needed || wrapper_changed
+          needs_link ||= File.mtime(obj_file) > File.mtime(shared_lib_path) if File.exist?(shared_lib_path)
+          needs_link ||= File.mtime(wrapper_file) > File.mtime(shared_lib_path) if File.exist?(shared_lib_path)
+          needs_link ||= File.mtime(runner_file) > File.mtime(shared_lib_path) if File.exist?(shared_lib_path)
+
+          link_shared_library(wrapper_file, obj_file, shared_lib_path) if needs_link
         end
 
         def export_firrtl(path)
@@ -328,7 +353,9 @@ def write_wrapper(path, state_path)
           raise LoadError, "Missing required CPU8bit signals in arcilator state: #{missing.join(', ')}" unless missing.empty?
 
           defines = []
+          defines << "#define MAX_INSTANCE_COUNT #{MAX_INSTANCE_COUNT}"
           defines << "#define STATE_SIZE #{mod.fetch('numStateBytes')}"
+          defines << '#define MEMORY_SIZE 65536'
           offsets.each do |name, offset|
             defines << "#define #{offset_define(name)} #{offset}"
           end
@@ -349,14 +376,16 @@ def write_wrapper(path, state_path)
             #include <cstdint>
             #include <cstring>
             #include <cstdlib>
+            #include <cstddef>
 
             extern "C" void #{mod.fetch('name')}_eval(void* state);
 
             #{defines.join("\n")}
 
             struct SimContext {
-              uint8_t state[STATE_SIZE];
-              uint8_t memory[65536];
+              unsigned int instance_count;
+              uint8_t* states;
+              uint8_t* memories;
             };
 
             static inline void set_u8(uint8_t* s, int o, uint8_t v) { s[o] = v; }
@@ -367,45 +396,69 @@ def write_wrapper(path, state_path)
             static inline uint32_t get_u32(uint8_t* s, int o) { uint32_t v; memcpy(&v, &s[o], 4); return v; }
             static inline void set_bit(uint8_t* s, int o, uint8_t v) { s[o] = v & 1; }
             static inline uint8_t get_bit(uint8_t* s, int o) { return s[o] & 1; }
+            static inline unsigned int clamp_instance_count(unsigned int requested) {
+              if (requested == 0) return 1;
+              if (requested > MAX_INSTANCE_COUNT) return MAX_INSTANCE_COUNT;
+              return requested;
+            }
+            static inline uint8_t* state_for(SimContext* ctx, unsigned int instance_index) {
+              return ctx->states + (static_cast<size_t>(instance_index) * STATE_SIZE);
+            }
+            static inline uint8_t* memory_for(SimContext* ctx, unsigned int instance_index) {
+              return ctx->memories + (static_cast<size_t>(instance_index) * MEMORY_SIZE);
+            }
 
             extern "C" {
-            void* sim_create(void) {
+            void* sim_create(unsigned int requested_instances) {
               SimContext* ctx = new SimContext();
-              memset(ctx->state, 0, sizeof(ctx->state));
-              memset(ctx->memory, 0, sizeof(ctx->memory));
-              set_bit(ctx->state, OFF_CLK, 0);
-              set_bit(ctx->state, OFF_RST, 1);
-              #{mod.fetch('name')}_eval(ctx->state);
+              ctx->instance_count = clamp_instance_count(requested_instances);
+              ctx->states = new uint8_t[static_cast<size_t>(ctx->instance_count) * STATE_SIZE];
+              ctx->memories = new uint8_t[static_cast<size_t>(ctx->instance_count) * MEMORY_SIZE];
+              for (unsigned int inst_i = 0; inst_i < ctx->instance_count; ++inst_i) {
+                uint8_t* state = state_for(ctx, inst_i);
+                uint8_t* memory = memory_for(ctx, inst_i);
+                memset(state, 0, STATE_SIZE);
+                memset(memory, 0, MEMORY_SIZE);
+                set_bit(state, OFF_CLK, 0);
+                set_bit(state, OFF_RST, 1);
+                #{mod.fetch('name')}_eval(state);
+              }
               return ctx;
             }
 
             void sim_destroy(void* sim) {
-              delete static_cast<SimContext*>(sim);
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              if (!ctx) return;
+              delete[] ctx->states;
+              delete[] ctx->memories;
+              delete ctx;
             }
 
             void sim_eval(void* sim) {
               SimContext* ctx = static_cast<SimContext*>(sim);
-              #{mod.fetch('name')}_eval(ctx->state);
+              for (unsigned int inst_i = 0; inst_i < ctx->instance_count; ++inst_i) {
+                #{mod.fetch('name')}_eval(state_for(ctx, inst_i));
+              }
             }
 
-            static inline unsigned int run_cycles_internal(SimContext* ctx, unsigned int n) {
+            static inline unsigned int run_cycles_internal(uint8_t* state, uint8_t* memory, unsigned int n) {
               for (unsigned int i = 0; i < n; ++i) {
-                if (get_bit(ctx->state, OFF_HALTED)) {
+                if (get_bit(state, OFF_HALTED)) {
                   return i;
                 }
 
-                set_bit(ctx->state, OFF_CLK, 0);
-                #{mod.fetch('name')}_eval(ctx->state);
+                set_bit(state, OFF_CLK, 0);
+                #{mod.fetch('name')}_eval(state);
 
-                uint16_t addr = get_u16(ctx->state, OFF_MEM_ADDR) & 0xFFFF;
-                if (get_bit(ctx->state, OFF_MEM_WRITE_EN)) {
-                  ctx->memory[addr] = get_u8(ctx->state, OFF_MEM_DATA_OUT);
+                uint16_t addr = get_u16(state, OFF_MEM_ADDR) & 0xFFFF;
+                if (get_bit(state, OFF_MEM_WRITE_EN)) {
+                  memory[addr] = get_u8(state, OFF_MEM_DATA_OUT);
                 }
-                set_u8(ctx->state, OFF_MEM_DATA_IN, ctx->memory[addr]);
+                set_u8(state, OFF_MEM_DATA_IN, memory[addr]);
 
-                set_bit(ctx->state, OFF_CLK, 1);
-                #{mod.fetch('name')}_eval(ctx->state);
-                if (get_bit(ctx->state, OFF_HALTED)) {
+                set_bit(state, OFF_CLK, 1);
+                #{mod.fetch('name')}_eval(state);
+                if (get_bit(state, OFF_HALTED)) {
                   return i + 1;
                 }
               }
@@ -414,55 +467,74 @@ def write_wrapper(path, state_path)
 
             void sim_reset(void* sim) {
               SimContext* ctx = static_cast<SimContext*>(sim);
-              set_bit(ctx->state, OFF_RST, 1);
-              run_cycles_internal(ctx, 1);
-              set_bit(ctx->state, OFF_RST, 0);
-              #{mod.fetch('name')}_eval(ctx->state);
+              for (unsigned int inst_i = 0; inst_i < ctx->instance_count; ++inst_i) {
+                uint8_t* state = state_for(ctx, inst_i);
+                set_bit(state, OFF_RST, 1);
+                run_cycles_internal(state, memory_for(ctx, inst_i), 1);
+                set_bit(state, OFF_RST, 0);
+                #{mod.fetch('name')}_eval(state);
+              }
             }
 
             void sim_poke(void* sim, const char* name, unsigned int value) {
               SimContext* ctx = static_cast<SimContext*>(sim);
-              #{poke_cases.join("\n  else ")}
+              for (unsigned int inst_i = 0; inst_i < ctx->instance_count; ++inst_i) {
+                uint8_t* state = state_for(ctx, inst_i);
+                #{poke_cases.join("\n    else ")}
+              }
             }
 
             unsigned int sim_peek(void* sim, const char* name) {
               SimContext* ctx = static_cast<SimContext*>(sim);
+              uint8_t* state = state_for(ctx, 0);
               #{peek_cases.join("\n  ")}
               return 0;
             }
 
             unsigned int sim_runner_load_memory(void* sim, const unsigned char* data, unsigned int len, unsigned int offset) {
               SimContext* ctx = static_cast<SimContext*>(sim);
-              unsigned int loaded = 0;
-              for (unsigned int i = 0; i < len; ++i) {
-                unsigned int addr = (offset + i) & 0xFFFF;
-                ctx->memory[addr] = data[i];
-                loaded++;
+              for (unsigned int inst_i = 0; inst_i < ctx->instance_count; ++inst_i) {
+                uint8_t* memory = memory_for(ctx, inst_i);
+                for (unsigned int i = 0; i < len; ++i) {
+                  unsigned int addr = (offset + i) & 0xFFFF;
+                  memory[addr] = data[i];
+                }
               }
-              return loaded;
+              return len;
             }
 
             unsigned int sim_runner_read_memory(void* sim, unsigned int offset, unsigned int len, unsigned char* out) {
               SimContext* ctx = static_cast<SimContext*>(sim);
+              uint8_t* memory = memory_for(ctx, 0);
               for (unsigned int i = 0; i < len; ++i) {
                 unsigned int addr = (offset + i) & 0xFFFF;
-                out[i] = ctx->memory[addr];
+                out[i] = memory[addr];
               }
               return len;
             }
 
             unsigned int sim_runner_write_memory(void* sim, unsigned int offset, const unsigned char* data, unsigned int len) {
               SimContext* ctx = static_cast<SimContext*>(sim);
-              for (unsigned int i = 0; i < len; ++i) {
-                unsigned int addr = (offset + i) & 0xFFFF;
-                ctx->memory[addr] = data[i];
+              for (unsigned int inst_i = 0; inst_i < ctx->instance_count; ++inst_i) {
+                uint8_t* memory = memory_for(ctx, inst_i);
+                for (unsigned int i = 0; i < len; ++i) {
+                  unsigned int addr = (offset + i) & 0xFFFF;
+                  memory[addr] = data[i];
+                }
               }
               return len;
             }
 
             unsigned int sim_runner_run_cycles(void* sim, unsigned int n) {
               SimContext* ctx = static_cast<SimContext*>(sim);
-              return run_cycles_internal(ctx, n);
+              unsigned int completed = n;
+              for (unsigned int inst_i = 0; inst_i < ctx->instance_count; ++inst_i) {
+                unsigned int inst_completed = run_cycles_internal(state_for(ctx, inst_i), memory_for(ctx, inst_i), n);
+                if (inst_i == 0 || inst_completed < completed) {
+                  completed = inst_completed;
+                }
+              }
+              return completed;
             }
             }
           CPP
@@ -476,31 +548,31 @@ def offset_define(name)
 
         def setter_expr(offset_macro, width_bits, source_value)
           if width_bits <= 1
-            "set_bit(ctx->state, #{offset_macro}, #{source_value})"
+            "set_bit(state, #{offset_macro}, #{source_value})"
           elsif width_bits <= 8
-            "set_u8(ctx->state, #{offset_macro}, static_cast<uint8_t>(#{source_value}))"
+            "set_u8(state, #{offset_macro}, static_cast<uint8_t>(#{source_value}))"
           elsif width_bits <= 16
-            "set_u16(ctx->state, #{offset_macro}, static_cast<uint16_t>(#{source_value}))"
+            "set_u16(state, #{offset_macro}, static_cast<uint16_t>(#{source_value}))"
           else
-            "set_u32(ctx->state, #{offset_macro}, static_cast<uint32_t>(#{source_value}))"
+            "set_u32(state, #{offset_macro}, static_cast<uint32_t>(#{source_value}))"
           end
         end
 
         def getter_expr(offset_macro, width_bits)
           if width_bits <= 1
-            "get_bit(ctx->state, #{offset_macro})"
+            "get_bit(state, #{offset_macro})"
           elsif width_bits <= 8
-            "get_u8(ctx->state, #{offset_macro})"
+            "get_u8(state, #{offset_macro})"
           elsif width_bits <= 16
-            "get_u16(ctx->state, #{offset_macro})"
+            "get_u16(state, #{offset_macro})"
           else
-            "get_u32(ctx->state, #{offset_macro})"
+            "get_u32(state, #{offset_macro})"
           end
         end
 
         def load_library
           @lib = Fiddle.dlopen(shared_lib_path)
-          @fn_sim_create = Fiddle::Function.new(@lib['sim_create'], [], Fiddle::TYPE_VOIDP)
+          @fn_sim_create = Fiddle::Function.new(@lib['sim_create'], [Fiddle::TYPE_UINT], Fiddle::TYPE_VOIDP)
           @fn_sim_destroy = Fiddle::Function.new(@lib['sim_destroy'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
           @fn_sim_eval = Fiddle::Function.new(@lib['sim_eval'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
           @fn_sim_reset = Fiddle::Function.new(@lib['sim_reset'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
@@ -527,7 +599,9 @@ def load_library
             Fiddle::TYPE_UINT
           )
 
-          @ctx = @fn_sim_create.call
+          @ctx = @fn_sim_create.call(@instance_count)
+          raise LoadError, 'CPU8bit ArcilatorGPU simulation context initialization failed' if !@ctx || @ctx.to_i.zero?
+
           ObjectSpace.define_finalizer(self, self.class.finalizer(@fn_sim_destroy, @ctx))
         end
 
@@ -557,6 +631,15 @@ def normalize_payload(data)
           end
         end
 
+        def write_file_if_changed(path, content)
+          if File.exist?(path) && File.read(path) == content
+            return false
+          end
+
+          File.write(path, content)
+          true
+        end
+
         def last_log_lines(path, count: 8)
           return 'unknown error' unless File.exist?(path)
 
diff --git a/examples/8bit/utilities/runners/arcilator_runner.rb b/examples/8bit/utilities/runners/arcilator_runner.rb
new file mode 100644
index 00000000..cf4e346b
--- /dev/null
+++ b/examples/8bit/utilities/runners/arcilator_runner.rb
@@ -0,0 +1,509 @@
+# frozen_string_literal: true
+
+require 'fileutils'
+require 'fiddle'
+require 'json'
+require 'open3'
+require 'rbconfig'
+require 'rhdl/codegen'
+require_relative '../../hdl/cpu/cpu'
+
+module RHDL
+  module Examples
+    module CPU8Bit
+      # Native runner for 8-bit CPU using arcilator lowering.
+      #
+      # Pipeline:
+      #   RHDL CPU -> FIRRTL -> firtool (HW MLIR) -> arcilator (LLVM IR)
+      #   -> clang/llc object -> C++ shim .so/.dylib -> Fiddle
+      class ArcilatorRunner
+        BUILD_DIR = File.expand_path('../../.arcilator_build', __dir__)
+
+        REQUIRED_TOOLS = %w[firtool arcilator].freeze
+
+        REQUIRED_SIGNAL_NAMES = %w[
+          clk
+          rst
+          mem_addr
+          mem_data_in
+          mem_data_out
+          mem_write_en
+          halted
+          pc_out
+          acc_out
+          sp_out
+          state_out
+          zero_flag_out
+        ].freeze
+
+        attr_reader :backend
+
+        def self.status
+          missing_tools = REQUIRED_TOOLS.reject { |tool| command_available?(tool) }
+          missing_tools << 'llc/clang' unless command_available?('llc') || command_available?('clang')
+          missing_tools << 'c++/clang++/g++' unless command_available?('c++') || command_available?('clang++') || command_available?('g++')
+
+          {
+            ready: missing_tools.empty?,
+            missing_tools: missing_tools.uniq,
+            missing_capabilities: []
+          }
+        end
+
+        def self.ensure_available!
+          info = status
+          return info if info[:ready]
+
+          details = []
+          details << "missing tools: #{info[:missing_tools].join(', ')}" unless info[:missing_tools].empty?
+          details << "missing capabilities: #{info[:missing_capabilities].join(', ')}" unless info[:missing_capabilities].empty?
+
+          raise ArgumentError,
+            "arcilator backend unavailable (#{details.join('; ')}). " \
+            "Install required arcilator/firtool toolchain tools."
+        end
+
+        def initialize
+          @backend = :arcilator
+          self.class.ensure_available!
+          build_simulation
+          load_library
+          reset
+        end
+
+        def native?
+          true
+        end
+
+        def runner_mode?
+          true
+        end
+
+        def runner_kind
+          :cpu8bit
+        end
+
+        def evaluate
+          @fn_sim_eval.call(@ctx)
+          nil
+        end
+
+        def reset
+          @fn_sim_reset.call(@ctx)
+          nil
+        end
+
+        def poke(name, value)
+          @fn_sim_poke.call(@ctx, name.to_s, value.to_i & 0xFFFF_FFFF)
+          true
+        end
+
+        def peek(name)
+          @fn_sim_peek.call(@ctx, name.to_s) & 0xFFFF_FFFF
+        end
+
+        def runner_load_memory(data, offset = 0, _is_rom = false)
+          payload = normalize_payload(data)
+          return false if payload.empty?
+
+          ptr = Fiddle::Pointer[payload]
+          loaded = @fn_runner_load_memory.call(@ctx, ptr, payload.bytesize, offset.to_i & 0xFFFF)
+          loaded.to_i.positive?
+        end
+
+        def runner_read_memory(offset, length, mapped: true)
+          _ = mapped
+          len = [length.to_i, 0].max
+          return [] if len.zero?
+
+          out = Fiddle::Pointer.malloc(len)
+          read_len = @fn_runner_read_memory.call(@ctx, offset.to_i & 0xFFFF, len, out).to_i
+          return [] if read_len <= 0
+
+          out[0, read_len].unpack('C*')
+        end
+
+        def runner_write_memory(offset, data, mapped: true)
+          _ = mapped
+          payload = normalize_payload(data)
+          return 0 if payload.empty?
+
+          ptr = Fiddle::Pointer[payload]
+          @fn_runner_write_memory.call(@ctx, offset.to_i & 0xFFFF, ptr, payload.bytesize).to_i
+        end
+
+        def runner_run_cycles(n, _key_data = 0, _key_ready = false)
+          cycles = @fn_runner_run_cycles.call(@ctx, [n.to_i, 0].max).to_i
+          {
+            text_dirty: false,
+            key_cleared: false,
+            cycles_run: cycles,
+            speaker_toggles: 0
+          }
+        end
+
+        private
+
+        def self.command_available?(tool)
+          ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |path|
+            File.executable?(File.join(path, tool))
+          end
+        end
+
+        def command_available?(tool)
+          self.class.command_available?(tool)
+        end
+
+        def build_simulation
+          FileUtils.mkdir_p(BUILD_DIR)
+
+          fir_file = File.join(BUILD_DIR, 'cpu8bit.fir')
+          mlir_file = File.join(BUILD_DIR, 'cpu8bit_hw.mlir')
+          ll_file = File.join(BUILD_DIR, 'cpu8bit_arcilator.ll')
+          state_file = File.join(BUILD_DIR, 'cpu8bit_state.json')
+          obj_file = File.join(BUILD_DIR, 'cpu8bit_arcilator.o')
+          wrapper_file = File.join(BUILD_DIR, 'cpu8bit_arcilator_wrapper.cpp')
+
+          needs_rebuild = !File.exist?(shared_lib_path)
+
+          if needs_rebuild
+            export_firrtl(fir_file)
+            compile_with_arcilator(fir_file, mlir_file, ll_file, state_file, obj_file)
+            write_wrapper(wrapper_file, state_file)
+            link_shared_library(wrapper_file, obj_file, shared_lib_path)
+          end
+        end
+
+        def export_firrtl(path)
+          ir = RHDL::HDL::CPU::CPU.to_flat_ir(top_name: 'cpu8bit')
+          firrtl = RHDL::Codegen::CIRCT::FIRRTL.generate(ir)
+          File.write(path, firrtl)
+        end
+
+        def compile_with_arcilator(fir_file, mlir_file, ll_file, state_file, obj_file)
+          log_file = File.join(BUILD_DIR, 'cpu8bit_arcilator.log')
+          File.delete(log_file) if File.exist?(log_file)
+
+          run_or_raise(%W[firtool #{fir_file} --ir-hw -o #{mlir_file}], 'firtool HW lowering', log_file)
+
+          arcilator_cmd = ['arcilator', mlir_file, "--state-file=#{state_file}", '-o', ll_file]
+          run_or_raise(arcilator_cmd, 'arcilator lowering', log_file)
+
+          if command_available?('clang')
+            compile_object_with_clang(ll_file: ll_file, obj_file: obj_file, log_file: log_file)
+            return
+          end
+
+          compile_object_with_llc(ll_file: ll_file, obj_file: obj_file, log_file: log_file)
+        end
+
+        def run_or_raise(cmd, step_name, log_file)
+          out, status = Open3.capture2e(*cmd)
+          File.write(log_file, out, mode: 'a')
+          return if status.success?
+
+          raise LoadError, "#{step_name} failed: #{last_log_lines(log_file)}"
+        end
+
+        def compile_object_with_clang(ll_file:, obj_file:, log_file:)
+          cmd = ['clang', '-c', '-O2', '-fPIC']
+          if (target = llc_target_triple)
+            cmd += ['-target', target]
+          end
+          cmd += [ll_file, '-o', obj_file]
+          run_or_raise(cmd, 'clang compile', log_file)
+        end
+
+        def compile_object_with_llc(ll_file:, obj_file:, log_file:)
+          cmd = ['llc', '-filetype=obj', '-O2', '-relocation-model=pic']
+          if (triple = llc_target_triple)
+            cmd << "-mtriple=#{triple}"
+          end
+          cmd += [ll_file, '-o', obj_file]
+          run_or_raise(cmd, 'llc compile', log_file)
+        end
+
+        def llc_target_triple(host_os: RbConfig::CONFIG['host_os'], host_cpu: RbConfig::CONFIG['host_cpu'])
+          return nil unless host_os.to_s.downcase.include?('darwin')
+
+          cpu = host_cpu.to_s.downcase
+          arch = if cpu.include?('arm64') || cpu.include?('aarch64')
+            'arm64'
+          elsif cpu.include?('x86_64') || cpu.include?('amd64')
+            'x86_64'
+          end
+          return nil unless arch
+
+          "#{arch}-apple-macosx"
+        end
+
+        def link_shared_library(wrapper_file, obj_file, output_file)
+          cxx = if command_available?('clang++')
+            'clang++'
+          elsif command_available?('g++')
+            'g++'
+          else
+            'c++'
+          end
+
+          cmd = [cxx, '-shared', '-fPIC', '-O2', '-o', output_file, wrapper_file, obj_file]
+          run_or_raise(cmd, 'C++ link', File.join(BUILD_DIR, 'cpu8bit_arcilator.log'))
+        end
+
+        def write_wrapper(path, state_path)
+          state = JSON.parse(File.read(state_path))
+          mod = state[0]
+          states = mod.fetch('states', [])
+
+          offsets = {}
+          widths = {}
+          states.each do |entry|
+            name = entry.fetch('name')
+            offsets[name] = entry.fetch('offset')
+            widths[name] = entry.fetch('numBits', 32).to_i
+          end
+
+          missing = REQUIRED_SIGNAL_NAMES.reject { |name| offsets.key?(name) }
+          raise LoadError, "Missing required CPU8bit signals in arcilator state: #{missing.join(', ')}" unless missing.empty?
+
+          defines = []
+          defines << "#define STATE_SIZE #{mod.fetch('numStateBytes')}"
+          offsets.each do |name, offset|
+            defines << "#define #{offset_define(name)} #{offset}"
+          end
+
+          poke_cases = []
+          %w[clk rst mem_data_in pc_reg__q].each do |name|
+            next unless offsets.key?(name)
+
+            poke_cases << "if (!strcmp(name, \"#{name}\")) { #{setter_expr(offset_define(name), widths[name], 'value')}; return; }"
+          end
+
+          peek_cases = []
+          %w[mem_addr mem_data_out mem_write_en halted pc_out acc_out sp_out state_out zero_flag_out].each do |name|
+            peek_cases << "if (!strcmp(name, \"#{name}\")) return #{getter_expr(offset_define(name), widths[name])};"
+          end
+
+          wrapper = <<~CPP
+            #include <cstdint>
+            #include <cstring>
+            #include <cstdlib>
+
+            extern "C" void #{mod.fetch('name')}_eval(void* state);
+
+            #{defines.join("\n")}
+
+            struct SimContext {
+              uint8_t state[STATE_SIZE];
+              uint8_t memory[65536];
+            };
+
+            static inline void set_u8(uint8_t* s, int o, uint8_t v) { s[o] = v; }
+            static inline uint8_t get_u8(uint8_t* s, int o) { return s[o]; }
+            static inline void set_u16(uint8_t* s, int o, uint16_t v) { memcpy(&s[o], &v, 2); }
+            static inline uint16_t get_u16(uint8_t* s, int o) { uint16_t v; memcpy(&v, &s[o], 2); return v; }
+            static inline void set_u32(uint8_t* s, int o, uint32_t v) { memcpy(&s[o], &v, 4); }
+            static inline uint32_t get_u32(uint8_t* s, int o) { uint32_t v; memcpy(&v, &s[o], 4); return v; }
+            static inline void set_bit(uint8_t* s, int o, uint8_t v) { s[o] = v & 1; }
+            static inline uint8_t get_bit(uint8_t* s, int o) { return s[o] & 1; }
+
+            extern "C" {
+            void* sim_create(void) {
+              SimContext* ctx = new SimContext();
+              memset(ctx->state, 0, sizeof(ctx->state));
+              memset(ctx->memory, 0, sizeof(ctx->memory));
+              set_bit(ctx->state, OFF_CLK, 0);
+              set_bit(ctx->state, OFF_RST, 1);
+              #{mod.fetch('name')}_eval(ctx->state);
+              return ctx;
+            }
+
+            void sim_destroy(void* sim) {
+              delete static_cast<SimContext*>(sim);
+            }
+
+            void sim_eval(void* sim) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              #{mod.fetch('name')}_eval(ctx->state);
+            }
+
+            static inline unsigned int run_cycles_internal(SimContext* ctx, unsigned int n) {
+              for (unsigned int i = 0; i < n; ++i) {
+                if (get_bit(ctx->state, OFF_HALTED)) {
+                  return i;
+                }
+
+                set_bit(ctx->state, OFF_CLK, 0);
+                #{mod.fetch('name')}_eval(ctx->state);
+
+                uint16_t addr = get_u16(ctx->state, OFF_MEM_ADDR) & 0xFFFF;
+                if (get_bit(ctx->state, OFF_MEM_WRITE_EN)) {
+                  ctx->memory[addr] = get_u8(ctx->state, OFF_MEM_DATA_OUT);
+                }
+                set_u8(ctx->state, OFF_MEM_DATA_IN, ctx->memory[addr]);
+
+                set_bit(ctx->state, OFF_CLK, 1);
+                #{mod.fetch('name')}_eval(ctx->state);
+                if (get_bit(ctx->state, OFF_HALTED)) {
+                  return i + 1;
+                }
+              }
+              return n;
+            }
+
+            void sim_reset(void* sim) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              set_bit(ctx->state, OFF_RST, 1);
+              run_cycles_internal(ctx, 1);
+              set_bit(ctx->state, OFF_RST, 0);
+              #{mod.fetch('name')}_eval(ctx->state);
+            }
+
+            void sim_poke(void* sim, const char* name, unsigned int value) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              #{poke_cases.join("\n  else ")}
+            }
+
+            unsigned int sim_peek(void* sim, const char* name) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              #{peek_cases.join("\n  ")}
+              return 0;
+            }
+
+            unsigned int sim_runner_load_memory(void* sim, const unsigned char* data, unsigned int len, unsigned int offset) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              unsigned int loaded = 0;
+              for (unsigned int i = 0; i < len; ++i) {
+                unsigned int addr = (offset + i) & 0xFFFF;
+                ctx->memory[addr] = data[i];
+                loaded++;
+              }
+              return loaded;
+            }
+
+            unsigned int sim_runner_read_memory(void* sim, unsigned int offset, unsigned int len, unsigned char* out) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              for (unsigned int i = 0; i < len; ++i) {
+                unsigned int addr = (offset + i) & 0xFFFF;
+                out[i] = ctx->memory[addr];
+              }
+              return len;
+            }
+
+            unsigned int sim_runner_write_memory(void* sim, unsigned int offset, const unsigned char* data, unsigned int len) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              for (unsigned int i = 0; i < len; ++i) {
+                unsigned int addr = (offset + i) & 0xFFFF;
+                ctx->memory[addr] = data[i];
+              }
+              return len;
+            }
+
+            unsigned int sim_runner_run_cycles(void* sim, unsigned int n) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              return run_cycles_internal(ctx, n);
+            }
+            }
+          CPP
+
+          File.write(path, wrapper)
+        end
+
+        def offset_define(name)
+          "OFF_#{name.to_s.upcase.gsub(/[^A-Z0-9]/, '_')}"
+        end
+
+        def setter_expr(offset_macro, width_bits, source_value)
+          if width_bits <= 1
+            "set_bit(ctx->state, #{offset_macro}, #{source_value})"
+          elsif width_bits <= 8
+            "set_u8(ctx->state, #{offset_macro}, static_cast<uint8_t>(#{source_value}))"
+          elsif width_bits <= 16
+            "set_u16(ctx->state, #{offset_macro}, static_cast<uint16_t>(#{source_value}))"
+          else
+            "set_u32(ctx->state, #{offset_macro}, static_cast<uint32_t>(#{source_value}))"
+          end
+        end
+
+        def getter_expr(offset_macro, width_bits)
+          if width_bits <= 1
+            "get_bit(ctx->state, #{offset_macro})"
+          elsif width_bits <= 8
+            "get_u8(ctx->state, #{offset_macro})"
+          elsif width_bits <= 16
+            "get_u16(ctx->state, #{offset_macro})"
+          else
+            "get_u32(ctx->state, #{offset_macro})"
+          end
+        end
+
+        def load_library
+          @lib = Fiddle.dlopen(shared_lib_path)
+          @fn_sim_create = Fiddle::Function.new(@lib['sim_create'], [], Fiddle::TYPE_VOIDP)
+          @fn_sim_destroy = Fiddle::Function.new(@lib['sim_destroy'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
+          @fn_sim_eval = Fiddle::Function.new(@lib['sim_eval'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
+          @fn_sim_reset = Fiddle::Function.new(@lib['sim_reset'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
+          @fn_sim_poke = Fiddle::Function.new(@lib['sim_poke'], [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT], Fiddle::TYPE_VOID)
+          @fn_sim_peek = Fiddle::Function.new(@lib['sim_peek'], [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP], Fiddle::TYPE_UINT)
+          @fn_runner_load_memory = Fiddle::Function.new(
+            @lib['sim_runner_load_memory'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_UINT
+          )
+          @fn_runner_read_memory = Fiddle::Function.new(
+            @lib['sim_runner_read_memory'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_UINT
+          )
+          @fn_runner_write_memory = Fiddle::Function.new(
+            @lib['sim_runner_write_memory'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_UINT
+          )
+          @fn_runner_run_cycles = Fiddle::Function.new(
+            @lib['sim_runner_run_cycles'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_UINT
+          )
+
+          @ctx = @fn_sim_create.call
+          ObjectSpace.define_finalizer(self, self.class.finalizer(@fn_sim_destroy, @ctx))
+        end
+
+        def self.finalizer(destroy_fn, ctx)
+          proc { destroy_fn.call(ctx) if ctx && !ctx.to_i.zero? }
+        end
+
+        def shared_lib_path
+          ext = if RbConfig::CONFIG['host_os'] =~ /darwin/
+            '.dylib'
+          elsif RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
+            '.dll'
+          else
+            '.so'
+          end
+          File.join(BUILD_DIR, "libcpu8bit_arcilator_sim#{ext}")
+        end
+
+        def normalize_payload(data)
+          case data
+          when String
+            data.b
+          when Array
+            data.pack('C*')
+          else
+            Array(data).pack('C*')
+          end
+        end
+
+        def last_log_lines(path, count: 8)
+          return 'unknown error' unless File.exist?(path)
+
+          File.read(path).lines.last(count).join.strip
+        rescue StandardError
+          'unknown error'
+        end
+      end
+    end
+  end
+end
diff --git a/examples/8bit/utilities/runners/verilator_runner.rb b/examples/8bit/utilities/runners/verilator_runner.rb
new file mode 100644
index 00000000..1991495c
--- /dev/null
+++ b/examples/8bit/utilities/runners/verilator_runner.rb
@@ -0,0 +1,410 @@
+# frozen_string_literal: true
+
+require 'fileutils'
+require 'fiddle'
+require 'open3'
+require 'rbconfig'
+require 'rhdl/codegen'
+require 'rhdl/codegen/verilog/sim/verilog_simulator'
+require_relative '../../hdl/cpu/cpu'
+
+module RHDL
+  module Examples
+    module CPU8Bit
+      # Native runner for 8-bit CPU using Verilator.
+      #
+      # Pipeline:
+      #   RHDL CPU -> Verilog -> Verilator C++ -> shared library -> Fiddle
+      class VerilatorRunner
+        BUILD_DIR = File.expand_path('../../.verilator_build', __dir__)
+        LIB_BASENAME = 'cpu8bit_verilator_sim'
+        TOP_MODULE = 'cpu8bit'
+        VERILATOR_PREFIX = 'Vcpu8bit'
+        REQUIRED_TOOLS = %w[firtool verilator make].freeze
+
+        attr_reader :backend
+
+        def self.status
+          missing_tools = REQUIRED_TOOLS.reject { |tool| command_available?(tool) }
+          missing_tools << 'c++/clang++/g++' unless command_available?('c++') || command_available?('clang++') || command_available?('g++')
+          {
+            ready: missing_tools.empty?,
+            missing_tools: missing_tools.uniq,
+            missing_capabilities: []
+          }
+        end
+
+        def self.ensure_available!
+          info = status
+          return info if info[:ready]
+
+          details = []
+          details << "missing tools: #{info[:missing_tools].join(', ')}" unless info[:missing_tools].empty?
+          details << "missing capabilities: #{info[:missing_capabilities].join(', ')}" unless info[:missing_capabilities].empty?
+          raise ArgumentError,
+            "verilator backend unavailable (#{details.join('; ')}). " \
+            "Install required verilator/make/C++ toolchain tools."
+        end
+
+        def initialize
+          @backend = :verilator
+          self.class.ensure_available!
+          build_simulation
+          load_library
+          reset
+        end
+
+        def native?
+          true
+        end
+
+        def runner_mode?
+          true
+        end
+
+        def runner_kind
+          :cpu8bit
+        end
+
+        def evaluate
+          @fn_sim_eval.call(@ctx)
+          nil
+        end
+
+        def reset
+          @fn_sim_reset.call(@ctx)
+          nil
+        end
+
+        def poke(name, value)
+          @fn_sim_poke.call(@ctx, name.to_s, value.to_i & 0xFFFF_FFFF)
+          true
+        end
+
+        def peek(name)
+          @fn_sim_peek.call(@ctx, name.to_s) & 0xFFFF_FFFF
+        end
+
+        def runner_load_memory(data, offset = 0, _is_rom = false)
+          payload = normalize_payload(data)
+          return false if payload.empty?
+
+          ptr = Fiddle::Pointer[payload]
+          loaded = @fn_runner_load_memory.call(@ctx, ptr, payload.bytesize, offset.to_i & 0xFFFF)
+          loaded.to_i.positive?
+        end
+
+        def runner_read_memory(offset, length, mapped: true)
+          _ = mapped
+          len = [length.to_i, 0].max
+          return [] if len.zero?
+
+          out = Fiddle::Pointer.malloc(len)
+          read_len = @fn_runner_read_memory.call(@ctx, offset.to_i & 0xFFFF, len, out).to_i
+          return [] if read_len <= 0
+
+          out[0, read_len].unpack('C*')
+        end
+
+        def runner_write_memory(offset, data, mapped: true)
+          _ = mapped
+          payload = normalize_payload(data)
+          return 0 if payload.empty?
+
+          ptr = Fiddle::Pointer[payload]
+          @fn_runner_write_memory.call(@ctx, offset.to_i & 0xFFFF, ptr, payload.bytesize).to_i
+        end
+
+        def runner_run_cycles(n, _key_data = 0, _key_ready = false)
+          cycles = @fn_runner_run_cycles.call(@ctx, [n.to_i, 0].max).to_i
+          {
+            text_dirty: false,
+            key_cleared: false,
+            cycles_run: cycles,
+            speaker_toggles: 0
+          }
+        end
+
+        private
+
+        def self.command_available?(tool)
+          ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |path|
+            File.executable?(File.join(path, tool))
+          end
+        end
+
+        def verilog_simulator
+          @verilog_simulator ||= RHDL::Codegen::Verilog::VerilogSimulator.new(
+            backend: :verilator,
+            build_dir: BUILD_DIR,
+            library_basename: LIB_BASENAME,
+            top_module: TOP_MODULE,
+            verilator_prefix: VERILATOR_PREFIX,
+            x_assign: '0',
+            x_initial: '0'
+          )
+        end
+
+        def build_simulation
+          verilog_simulator.ensure_backend_available!
+          verilog_simulator.prepare_build_dirs!
+
+          fir_file = File.join(BUILD_DIR, 'cpu8bit.fir')
+          verilog_file = File.join(verilog_simulator.verilog_dir, 'cpu8bit.v')
+          wrapper_file = File.join(verilog_simulator.verilog_dir, 'sim_wrapper.cpp')
+          header_file = File.join(verilog_simulator.verilog_dir, 'sim_wrapper.h')
+          lib_file = verilog_simulator.shared_library_path
+          log_file = File.join(BUILD_DIR, 'cpu8bit_verilator.log')
+
+          firrtl_changed = verilog_simulator.write_file_if_changed(
+            fir_file,
+            RHDL::Codegen::CIRCT::FIRRTL.generate(RHDL::HDL::CPU::CPU.to_flat_ir(top_name: TOP_MODULE))
+          )
+          verilog_regen_needed = firrtl_changed || !File.exist?(verilog_file)
+          verilog_regen_needed ||= File.exist?(verilog_file) && File.mtime(fir_file) > File.mtime(verilog_file)
+          if verilog_regen_needed
+            run_or_raise(%W[firtool #{fir_file} --verilog -o #{verilog_file}], 'firtool Verilog lowering', log_file)
+          end
+
+          wrapper_changed = create_cpp_wrapper(wrapper_file, header_file)
+
+          needs_build = !File.exist?(lib_file)
+          if File.exist?(lib_file)
+            needs_build ||= File.mtime(verilog_file) > File.mtime(lib_file)
+            needs_build ||= File.mtime(wrapper_file) > File.mtime(lib_file)
+            needs_build ||= File.mtime(__FILE__) > File.mtime(lib_file)
+          end
+          needs_build ||= verilog_regen_needed || wrapper_changed
+
+          if needs_build
+            verilog_simulator.compile_backend(
+              verilog_file: verilog_file,
+              wrapper_file: wrapper_file,
+              log_file: log_file
+            )
+          end
+        end
+
+        def create_cpp_wrapper(cpp_file, header_file)
+          header_content = <<~HEADER
+            #ifndef SIM_WRAPPER_H
+            #define SIM_WRAPPER_H
+
+            #ifdef __cplusplus
+            extern "C" {
+            #endif
+
+            void* sim_create(void);
+            void sim_destroy(void* sim);
+            void sim_eval(void* sim);
+            void sim_reset(void* sim);
+            void sim_poke(void* sim, const char* name, unsigned int value);
+            unsigned int sim_peek(void* sim, const char* name);
+            unsigned int sim_runner_load_memory(void* sim, const unsigned char* data, unsigned int len, unsigned int offset);
+            unsigned int sim_runner_read_memory(void* sim, unsigned int offset, unsigned int len, unsigned char* out);
+            unsigned int sim_runner_write_memory(void* sim, unsigned int offset, const unsigned char* data, unsigned int len);
+            unsigned int sim_runner_run_cycles(void* sim, unsigned int n);
+
+            #ifdef __cplusplus
+            }
+            #endif
+
+            #endif
+          HEADER
+
+          cpp_content = <<~CPP
+            #include "#{VERILATOR_PREFIX}.h"
+            #include "verilated.h"
+            #include "sim_wrapper.h"
+            #include <cstdint>
+            #include <cstring>
+
+            double sc_time_stamp() { return 0; }
+
+            struct SimContext {
+              #{VERILATOR_PREFIX}* dut;
+              std::uint8_t memory[65536];
+            };
+
+            static inline unsigned int run_cycles_internal(SimContext* ctx, unsigned int n) {
+              for (unsigned int i = 0; i < n; ++i) {
+                if (ctx->dut->halted) {
+                  return i;
+                }
+
+                ctx->dut->clk = 0;
+                ctx->dut->eval();
+
+                unsigned int addr = ctx->dut->mem_addr & 0xFFFFu;
+                if (ctx->dut->mem_write_en) {
+                  ctx->memory[addr] = static_cast<std::uint8_t>(ctx->dut->mem_data_out & 0xFFu);
+                }
+                ctx->dut->mem_data_in = ctx->memory[addr];
+                ctx->dut->eval();
+
+                ctx->dut->clk = 1;
+                ctx->dut->eval();
+                if (ctx->dut->halted) {
+                  return i + 1;
+                }
+              }
+              return n;
+            }
+
+            extern "C" {
+
+            void* sim_create(void) {
+              const char* empty_args[] = {""};
+              Verilated::commandArgs(1, empty_args);
+              Verilated::randReset(0);
+              SimContext* ctx = new SimContext();
+              ctx->dut = new #{VERILATOR_PREFIX}();
+              std::memset(ctx->memory, 0, sizeof(ctx->memory));
+              ctx->dut->clk = 0;
+              ctx->dut->rst = 1;
+              ctx->dut->mem_data_in = 0;
+              ctx->dut->eval();
+              return ctx;
+            }
+
+            void sim_destroy(void* sim) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              delete ctx->dut;
+              delete ctx;
+            }
+
+            void sim_eval(void* sim) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              ctx->dut->eval();
+            }
+
+            void sim_reset(void* sim) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              ctx->dut->rst = 1;
+              run_cycles_internal(ctx, 1);
+              ctx->dut->rst = 0;
+              ctx->dut->eval();
+            }
+
+            void sim_poke(void* sim, const char* name, unsigned int value) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              if (std::strcmp(name, "clk") == 0) ctx->dut->clk = value & 1u;
+              else if (std::strcmp(name, "rst") == 0) ctx->dut->rst = value & 1u;
+              else if (std::strcmp(name, "mem_data_in") == 0) ctx->dut->mem_data_in = value & 0xFFu;
+            }
+
+            unsigned int sim_peek(void* sim, const char* name) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              if (std::strcmp(name, "mem_addr") == 0) return ctx->dut->mem_addr;
+              if (std::strcmp(name, "mem_data_out") == 0) return ctx->dut->mem_data_out;
+              if (std::strcmp(name, "mem_write_en") == 0) return ctx->dut->mem_write_en;
+              if (std::strcmp(name, "halted") == 0) return ctx->dut->halted;
+              if (std::strcmp(name, "pc_out") == 0) return ctx->dut->pc_out;
+              if (std::strcmp(name, "acc_out") == 0) return ctx->dut->acc_out;
+              if (std::strcmp(name, "sp_out") == 0) return ctx->dut->sp_out;
+              if (std::strcmp(name, "state_out") == 0) return ctx->dut->state_out;
+              if (std::strcmp(name, "zero_flag_out") == 0) return ctx->dut->zero_flag_out;
+              return 0;
+            }
+
+            unsigned int sim_runner_load_memory(void* sim, const unsigned char* data, unsigned int len, unsigned int offset) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              unsigned int loaded = 0;
+              for (unsigned int i = 0; i < len; ++i) {
+                unsigned int addr = (offset + i) & 0xFFFFu;
+                ctx->memory[addr] = data[i];
+                loaded++;
+              }
+              return loaded;
+            }
+
+            unsigned int sim_runner_read_memory(void* sim, unsigned int offset, unsigned int len, unsigned char* out) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              for (unsigned int i = 0; i < len; ++i) {
+                unsigned int addr = (offset + i) & 0xFFFFu;
+                out[i] = ctx->memory[addr];
+              }
+              return len;
+            }
+
+            unsigned int sim_runner_write_memory(void* sim, unsigned int offset, const unsigned char* data, unsigned int len) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              for (unsigned int i = 0; i < len; ++i) {
+                unsigned int addr = (offset + i) & 0xFFFFu;
+                ctx->memory[addr] = data[i];
+              }
+              return len;
+            }
+
+            unsigned int sim_runner_run_cycles(void* sim, unsigned int n) {
+              SimContext* ctx = static_cast<SimContext*>(sim);
+              return run_cycles_internal(ctx, n);
+            }
+
+            }
+          CPP
+
+          header_changed = verilog_simulator.write_file_if_changed(header_file, header_content)
+          cpp_changed = verilog_simulator.write_file_if_changed(cpp_file, cpp_content)
+          header_changed || cpp_changed
+        end
+
+        def load_library
+          @lib = verilog_simulator.load_library!(verilog_simulator.shared_library_path)
+          @fn_sim_create = Fiddle::Function.new(@lib['sim_create'], [], Fiddle::TYPE_VOIDP)
+          @fn_sim_destroy = Fiddle::Function.new(@lib['sim_destroy'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
+          @fn_sim_eval = Fiddle::Function.new(@lib['sim_eval'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
+          @fn_sim_reset = Fiddle::Function.new(@lib['sim_reset'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)
+          @fn_sim_poke = Fiddle::Function.new(@lib['sim_poke'], [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT], Fiddle::TYPE_VOID)
+          @fn_sim_peek = Fiddle::Function.new(@lib['sim_peek'], [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP], Fiddle::TYPE_UINT)
+          @fn_runner_load_memory = Fiddle::Function.new(
+            @lib['sim_runner_load_memory'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_UINT
+          )
+          @fn_runner_read_memory = Fiddle::Function.new(
+            @lib['sim_runner_read_memory'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_UINT
+          )
+          @fn_runner_write_memory = Fiddle::Function.new(
+            @lib['sim_runner_write_memory'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_UINT
+          )
+          @fn_runner_run_cycles = Fiddle::Function.new(
+            @lib['sim_runner_run_cycles'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_UINT
+          )
+
+          @ctx = @fn_sim_create.call
+          ObjectSpace.define_finalizer(self, self.class.finalizer(@fn_sim_destroy, @ctx))
+        end
+
+        def run_or_raise(cmd, step_name, log_file)
+          out, status = Open3.capture2e(*cmd)
+          File.write(log_file, out, mode: 'a')
+          return if status.success?
+
+          raise LoadError, "#{step_name} failed"
+        end
+
+        def self.finalizer(destroy_fn, ctx)
+          proc { destroy_fn.call(ctx) if ctx && !ctx.to_i.zero? }
+        end
+
+        def normalize_payload(data)
+          case data
+          when String
+            data.b
+          when Array
+            data.pack('C*')
+          else
+            Array(data).pack('C*')
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/examples/apple2/bin/apple2 b/examples/apple2/bin/apple2
index 436c3689..50791e2a 100755
--- a/examples/apple2/bin/apple2
+++ b/examples/apple2/bin/apple2
@@ -58,7 +58,7 @@ parser = OptionParser.new do |opts|
     options[:audio] = true
   end
 
-  opts.on("-m", "--mode TYPE", [:ruby, :ir, :netlist, :verilog, :circt], "Simulation mode: ruby (default), ir, netlist (gate-level), verilog (Verilator), circt (Arcilator)") do |v|
+  opts.on("-m", "--mode TYPE", [:ruby, :ir, :netlist, :verilog, :circt, :arcilator_gpu], "Simulation mode: ruby (default), ir, netlist (gate-level), verilog (Verilator), circt (Arcilator), arcilator_gpu (ArcToGPU Metal)") do |v|
     options[:mode] = v
   end
 
@@ -126,7 +126,7 @@ if options[:sim].nil?
   options[:sim] = case options[:mode]
                   when :ruby then :ruby
                   when :ir, :netlist then :compile
-                  when :verilog, :circt then :ruby
+                  when :verilog, :circt, :arcilator_gpu then :ruby
                   else :ruby
                   end
 end
@@ -149,7 +149,7 @@ if options[:speed].nil?
                       when :interpret then 1_000
                       else 1_000
                       end
-                    when :verilog, :circt
+                    when :verilog, :circt, :arcilator_gpu
                       33_333
                     else
                       100
diff --git a/examples/apple2/utilities/runners/arcilator_gpu_runner.rb b/examples/apple2/utilities/runners/arcilator_gpu_runner.rb
new file mode 100644
index 00000000..8f6aaafa
--- /dev/null
+++ b/examples/apple2/utilities/runners/arcilator_gpu_runner.rb
@@ -0,0 +1,716 @@
+# frozen_string_literal: true
+
+# Apple II Arcilator GPU Runner
+#
+# Pipeline:
+#   RHDL -> FIRRTL -> firtool (HW MLIR) -> arcilator (--until-after=arc-opt)
+#   -> ArcToGpuLowering(profile: :apple2) -> Metal shader -> native Metal executor
+#
+# This runner preserves the Apple II runner ABI exposed by ArcilatorRunner
+# (`sim_*` functions) while executing circuit eval on the generated ArcToGPU Metal kernel.
+
+require_relative 'arcilator_runner'
+require 'rhdl/codegen/firrtl/firrtl'
+require 'rhdl/codegen/firrtl/arc_to_gpu_lowering'
+require 'fileutils'
+require 'json'
+require 'open3'
+require 'rbconfig'
+
+module RHDL
+  module Examples
+    module Apple2
+      class ArcilatorGpuRunner < ArcilatorRunner
+        BUILD_DIR = File.expand_path('../../../.arcilator_gpu_build', __dir__)
+
+        REQUIRED_TOOLS = %w[firtool arcilator].freeze
+
+        # Reports whether the host can build and run this backend.
+        #
+        # @return [Hash] :ready (Boolean) and :missing_tools (Array<String>, de-duplicated)
+        def self.status
+          missing = REQUIRED_TOOLS.reject { |tool| command_available?(tool) }
+          # Either llc or clang satisfies this requirement.
+          missing << 'llc/clang' unless command_available?('llc') || command_available?('clang')
+
+          if macos_host?
+            missing << 'xcrun' unless command_available?('xcrun')
+            # metal/metallib are resolved through `xcrun -f` instead of a PATH lookup.
+            %w[metal metallib].each do |sdk_tool|
+              missing << sdk_tool unless command_success?(['xcrun', '-f', sdk_tool])
+            end
+            missing << 'clang++/c++' unless command_available?('clang++') || command_available?('c++')
+          else
+            # Non-macOS hosts are reported as lacking the whole Metal toolchain.
+            missing << 'macOS Metal toolchain'
+          end
+
+          { ready: missing.empty?, missing_tools: missing.uniq }
+        end
+
+        def self.available? # the :ready flag of a freshly computed status
+          status.fetch(:ready)
+        end
+
+        # Returns the status hash when usable; otherwise raises ArgumentError listing the missing tools.
+        def self.ensure_available!
+          status.tap do |report|
+            next if report[:ready]
+            raise ArgumentError, "arcilator_gpu backend unavailable (missing tools: " \
+                                 "#{report[:missing_tools].join(', ')}). Install CIRCT tools and the macOS Metal toolchain."
+          end
+        end
+
+        attr_reader :instance_count
+
+        # Checks the toolchain, builds and loads the Metal-backed simulation, then initializes runner state.
+        # sub_cycles is clamped to 1..14; instances goes through normalize_instance_count.
+        def initialize(sub_cycles: 14, instances: nil)
+          @sub_cycles = sub_cycles.clamp(1, 14)
+          @instance_count = normalize_instance_count(instances)
+          self.class.ensure_available!
+
+          puts 'Initializing Apple2 ArcilatorGPU simulation...'
+          started = Process.clock_gettime(Process::CLOCK_MONOTONIC) # unaffected by wall-clock adjustments
+          build_arcilator_gpu_simulation
+          elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
+          puts "  ArcilatorGPU simulation built in #{elapsed.round(2)}s"
+          puts "  Pipeline: #{pipeline}"
+          puts "  Sub-cycles: #{@sub_cycles} (#{@sub_cycles == 14 ? 'full accuracy' : 'fast mode'})"
+          puts "  Instances: #{@instance_count}"
+
+          @cycles = 0
+          @halted = false
+          @text_page_dirty = false
+          @ram = Array.new(48 * 1024, 0)
+          @rom = Array.new(12 * 1024, 0)
+          @ps2_encoder = PS2Encoder.new
+          @speaker = Speaker.new
+          @prev_speaker_state = 0
+        end
+
+        def simulator_type # backend identifier symbol reported by this runner
+          :hdl_arcilator_gpu
+        end
+
+        def dry_run_info # fixed description of this runner plus the configured instance count
+          {
+            mode: :arcilator_gpu,
+            simulator_type: :hdl_arcilator_gpu,
+            native: true,
+            instances: @instance_count
+          }
+        end
+
+        private
+
+        def build_dir # directory that build_arcilator_gpu_simulation writes its artifacts into
+          BUILD_DIR
+        end
+
+        def pipeline # pipeline identifier; printed by initialize after the build
+          :arc_to_gpu
+        end
+
+        def pipeline_slug # string form used in generated artifact file names ("apple2_<slug>...")
+          'arc_to_gpu'
+        end
+
+        def shared_library_basename # library file name without extension; shared_lib_path appends the extension
+          'libapple2_arcilator_gpu_sim'
+        end
+
+        def build_arcilator_gpu_simulation # full build: FIRRTL export -> CIRCT tools -> Metal shader -> wrapper library -> load
+          FileUtils.mkdir_p(build_dir)
+
+          pipeline_prefix = "apple2_#{pipeline_slug}"
+
+          fir_file = File.join(build_dir, 'apple2.fir')
+          hw_mlir_file = File.join(build_dir, 'apple2_hw.mlir')
+          arc_mlir_file = File.join(build_dir, 'apple2_arc.mlir')
+          gpu_mlir_file = File.join(build_dir, "#{pipeline_prefix}.mlir")
+          gpu_meta_file = File.join(build_dir, "#{pipeline_prefix}.json")
+          metal_source_file = File.join(build_dir, "#{pipeline_prefix}.metal")
+          metal_air_file = File.join(build_dir, "#{pipeline_prefix}.air")
+          metal_lib_file = File.join(build_dir, "#{pipeline_prefix}.metallib")
+          wrapper_file = File.join(build_dir, "#{pipeline_prefix}_wrapper.mm")
+          log_file = File.join(build_dir, "#{pipeline_prefix}.log")
+
+          File.delete(log_file) if File.exist?(log_file) # fresh log per build; run_or_raise appends to it
+          # Stage 1: export FIRRTL, lower it to HW MLIR with firtool, then emit Arc MLIR with arcilator.
+          export_firrtl(fir_file)
+          run_or_raise(%W[firtool #{fir_file} --ir-hw -o #{hw_mlir_file}], 'firtool HW lowering', log_file)
+          run_or_raise(
+            ['arcilator', hw_mlir_file, '--emit-mlir', '--until-after=arc-opt', '-o', arc_mlir_file],
+            'arcilator Arc emission',
+            log_file
+          )
+          # Stage 2: ArcToGpuLowering turns the Arc MLIR into GPU MLIR, a JSON metadata file and Metal source.
+          RHDL::Codegen::FIRRTL::ArcToGpuLowering.lower(
+            arc_mlir_path: arc_mlir_file,
+            gpu_mlir_path: gpu_mlir_file,
+            metadata_path: gpu_meta_file,
+            metal_source_path: metal_source_file,
+            profile: :apple2
+          )
+          # Stage 3: compile the Metal source to AIR and link a .metallib; the module cache dir is recreated empty.
+          module_cache_dir = File.join(build_dir, 'clang_module_cache')
+          FileUtils.rm_rf(module_cache_dir)
+          FileUtils.mkdir_p(module_cache_dir)
+          run_or_raise(
+            [
+              'xcrun', '-sdk', 'macosx', 'metal', '-c', '-O3',
+              "-fmodules-cache-path=#{module_cache_dir}",
+              metal_source_file, '-o', metal_air_file
+            ],
+            'metal shader compile',
+            log_file
+          )
+          run_or_raise(
+            ['xcrun', '-sdk', 'macosx', 'metallib', metal_air_file, '-o', metal_lib_file],
+            'metallib link',
+            log_file
+          )
+          # Stage 4: generate the Objective-C++ wrapper from the metadata, link it, and load the library.
+          write_wrapper(
+            path: wrapper_file,
+            metadata_path: gpu_meta_file,
+            metallib_path: metal_lib_file,
+            instance_count: @instance_count
+          )
+          link_shared_library(wrapper_file, shared_lib_path, log_file: log_file)
+          load_shared_library(shared_lib_path)
+          if !@sim_ctx || (@sim_ctx.respond_to?(:to_i) && @sim_ctx.to_i.zero?) # NOTE(review): @sim_ctx is presumably set by the inherited load_shared_library — confirm
+            raise LoadError,
+              'ArcilatorGPU simulation context initialization failed (sim_create returned null). ' \
+              'Check Metal pipeline/toolchain compatibility for generated ArcToGPU kernel.'
+          end
+        end
+
+        # Writes the Apple II design as a single FIRRTL document to `path`.
+        #
+        # Every listed component is converted with `to_ir`, the resulting
+        # definitions are combined by FIRRTL.generate_hierarchy under the top
+        # name 'apple2_apple2', and the text is written out in one call.
+        def export_firrtl(path)
+          module_defs = [
+            TimingGenerator, VideoGenerator, CharacterROM, SpeakerToggle,
+            CPU6502, DiskII, DiskIIROM,
+            Keyboard, PS2Controller, Apple2
+          ].map(&:to_ir)
+
+          File.write(
+            path,
+            RHDL::Codegen::FIRRTL.generate_hierarchy(module_defs, top_name: 'apple2_apple2')
+          )
+        end
+
+        # Compiles the generated Objective-C++ wrapper into a dynamic library.
+        #
+        # @param wrapper_file [String] path of the generated wrapper source
+        # @param output_file [String] path of the library to produce
+        # @param log_file [String] build log that run_or_raise appends to
+        # @raise [LoadError] when the compiler exits non-zero (raised by run_or_raise)
+        def link_shared_library(wrapper_file, output_file, log_file:)
+          # Prefer clang++ when command_available? reports it; otherwise use the generic c++ driver.
+          compiler = command_available?('clang++') ? 'clang++' : 'c++'
+
+          # -x objective-c++ selects the source language explicitly;
+          # -fobjc-arc enables automatic reference counting for the wrapper.
+          language_flags = ['-std=c++17', '-x', 'objective-c++', '-fobjc-arc']
+          output_flags = ['-dynamiclib', '-O2', '-o', output_file]
+          frameworks = %w[Foundation Metal].flat_map { |framework| ['-framework', framework] }
+
+          command = [
+            compiler,
+            *language_flags,
+            *output_flags,
+            wrapper_file,
+            *frameworks
+          ]
+          run_or_raise(command, 'Objective-C++ link', log_file)
+        end
+
+        # Generates the Objective-C++ wrapper exposing the `sim_*` C ABI on top of the Metal kernel.
+        # Struct fields, poke/peek dispatch and reset defaults are derived from the ArcToGPU metadata JSON.
+        # Input pokes/peeks address instance 0; RAM/ROM writes are replicated to every instance.
+        # @raise [LoadError] when the metadata lacks the kernel entry name or a positive state_count
+        def write_wrapper(path:, metadata_path:, metallib_path:, instance_count:)
+          metadata = JSON.parse(File.read(metadata_path))
+          state_count = metadata.dig('metal', 'state_count').to_i
+          state_scalar_bits = metadata.dig('metal', 'state_scalar_bits').to_i
+          state_scalar_bits = 32 if state_scalar_bits <= 0 # missing/invalid width falls back to 32-bit state slots
+          state_scalar_cpp_type = state_scalar_bits > 32 ? 'uint64_t' : 'uint32_t'
+          kernel_name = metadata.dig('metal', 'entry').to_s
+          input_layout = Array(metadata['top_input_layout'])
+          output_layout = Array(metadata['top_output_layout'])
+
+          raise LoadError, 'ArcToGPU metadata missing metal entry' if kernel_name.empty?
+          raise LoadError, 'ArcToGPU metadata missing state_count' if state_count <= 0
+
+          input_field_names = input_layout.map { |entry| cpp_ident(entry.fetch('name')) }
+
+          struct_input_fields = input_layout.map { |entry| "  uint32_t #{cpp_ident(entry.fetch('name'))};" }
+          struct_output_fields = output_layout.map { |entry| "  uint32_t #{cpp_ident(entry.fetch('name'))};" }
+
+          poke_cases = input_layout.map do |entry|
+            name = entry.fetch('name')
+            field = cpp_ident(name)
+            width = entry.fetch('width').to_i
+            <<~CPP
+              if (!strcmp(name, "#{name}")) {
+                io[0].#{field} = mask_width(value, #{width}u);
+                return;
+              }
+            CPP
+          end
+
+          peek_cases = (input_layout + output_layout).map do |entry|
+            name = entry.fetch('name')
+            field = cpp_ident(name)
+            width = entry.fetch('width').to_i
+            <<~CPP
+              if (!strcmp(name, "#{name}")) {
+                return io[0].#{field} & #{mask_literal(width)};
+              }
+            CPP
+          end
+
+          default_lines = []
+          default_lines << 'io->reset = 1u;' if input_field_names.include?(cpp_ident('reset'))
+          default_lines << 'io->clk_14m = 0u;' if input_field_names.include?(cpp_ident('clk_14m'))
+          default_lines << 'io->flash_clk = 0u;' if input_field_names.include?(cpp_ident('flash_clk'))
+          default_lines << 'io->ps2_clk = 1u;' if input_field_names.include?(cpp_ident('ps2_clk'))
+          default_lines << 'io->ps2_data = 1u;' if input_field_names.include?(cpp_ident('ps2_data'))
+          default_lines << 'io->pause = 0u;' if input_field_names.include?(cpp_ident('pause'))
+          default_lines << 'io->ram_do = 0u;' if input_field_names.include?(cpp_ident('ram_do'))
+          default_lines << 'io->pd = 0u;' if input_field_names.include?(cpp_ident('pd'))
+          default_lines << 'io->gameport = 0u;' if input_field_names.include?(cpp_ident('gameport'))
+
+          reset_assert_line = input_field_names.include?(cpp_ident('reset')) ? 'io[0].reset = 1u;' : '' # '' is dropped by indent_cpp
+          reset_deassert_line = input_field_names.include?(cpp_ident('reset')) ? 'io[0].reset = 0u;' : ''
+
+          wrapper = <<~CPP
+            #import <Foundation/Foundation.h>
+            #import <Metal/Metal.h>
+            #include <CoreFoundation/CoreFoundation.h>
+            #include <cstdint>
+            #include <cstring>
+            #include <cstdlib>
+            #include <cstdio>
+
+            static const uint32_t STATE_COUNT = #{state_count}u;
+            static const uint32_t STATE_SCALAR_BITS = #{state_scalar_bits}u;
+            static const uint32_t INSTANCE_COUNT = #{instance_count}u;
+            static const uint32_t RAM_SIZE = 65536u;
+            static const uint32_t ROM_SIZE = 12288u;
+            static NSString* const kMetallibPath = @#{metallib_path.dump};
+            static NSString* const kKernelName = @#{kernel_name.dump};
+            using RhdlStateScalar = #{state_scalar_cpp_type};
+
+            struct RhdlArcGpuIo {
+              uint32_t cycle_budget;
+              uint32_t cycles_ran;
+              uint32_t last_clock;
+              uint32_t prev_speaker;
+              uint32_t speaker_toggles;
+              uint32_t text_dirty;
+            #{struct_input_fields.join("\n")}
+            #{struct_output_fields.join("\n")}
+            };
+
+            static inline uint32_t mask_width(uint32_t value, uint32_t width) {
+              if (width >= 32u) {
+                return value;
+              }
+              return value & ((1u << width) - 1u);
+            }
+
+            @interface RhdlApple2MetalSim : NSObject
+            @property(nonatomic, strong) id<MTLDevice> device;
+            @property(nonatomic, strong) id<MTLCommandQueue> queue;
+            @property(nonatomic, strong) id<MTLLibrary> library;
+            @property(nonatomic, strong) id<MTLComputePipelineState> pipeline;
+            @property(nonatomic, strong) id<MTLBuffer> stateBuffer;
+            @property(nonatomic, strong) id<MTLBuffer> ramBuffer;
+            @property(nonatomic, strong) id<MTLBuffer> romBuffer;
+            @property(nonatomic, strong) id<MTLBuffer> ioBuffer;
+            @property(nonatomic, assign) uint32_t instanceCount;
+            - (instancetype)initWithMetallibPath:(NSString*)metallibPath kernelName:(NSString*)kernelName stateCount:(uint32_t)stateCount instanceCount:(uint32_t)instanceCount;
+            - (BOOL)dispatchKernel;
+            - (RhdlArcGpuIo*)io;
+            - (RhdlStateScalar*)stateSlots;
+            - (uint8_t*)ram;
+            - (uint8_t*)rom;
+            @end
+
+            @implementation RhdlApple2MetalSim
+            - (instancetype)initWithMetallibPath:(NSString*)metallibPath kernelName:(NSString*)kernelName stateCount:(uint32_t)stateCount instanceCount:(uint32_t)instanceCount {
+              self = [super init];
+              if (!self) {
+                return nil;
+              }
+              self.instanceCount = instanceCount;
+
+              self.device = MTLCreateSystemDefaultDevice();
+              if (!self.device) {
+                fprintf(stderr, "[apple2-arcilator-gpu] init failed: no MTL device\\n");
+                return nil;
+              }
+
+              self.queue = [self.device newCommandQueue];
+              if (!self.queue) {
+                fprintf(stderr, "[apple2-arcilator-gpu] init failed: no command queue\\n");
+                return nil;
+              }
+
+              NSError* error = nil;
+              self.library = [self.device newLibraryWithFile:metallibPath error:&error];
+              if (!self.library) {
+                fprintf(stderr, "[apple2-arcilator-gpu] init failed: newLibraryWithFile %s\\n",
+                        error ? [[error localizedDescription] UTF8String] : "unknown");
+                return nil;
+              }
+
+              id<MTLFunction> fn = [self.library newFunctionWithName:kernelName];
+              if (!fn) {
+                fprintf(stderr, "[apple2-arcilator-gpu] init failed: missing kernel function %s\\n", [kernelName UTF8String]);
+                return nil;
+              }
+
+              self.pipeline = [self.device newComputePipelineStateWithFunction:fn error:&error];
+              if (!self.pipeline) {
+                fprintf(stderr, "[apple2-arcilator-gpu] init failed: pipeline creation %s\\n",
+                        error ? [[error localizedDescription] UTF8String] : "unknown");
+                return nil;
+              }
+
+              self.stateBuffer = [self.device newBufferWithLength:sizeof(RhdlStateScalar) * stateCount * instanceCount options:MTLResourceStorageModeShared];
+              self.ramBuffer = [self.device newBufferWithLength:RAM_SIZE * instanceCount options:MTLResourceStorageModeShared];
+              self.romBuffer = [self.device newBufferWithLength:ROM_SIZE * instanceCount options:MTLResourceStorageModeShared];
+              self.ioBuffer = [self.device newBufferWithLength:sizeof(RhdlArcGpuIo) * instanceCount options:MTLResourceStorageModeShared];
+              if (!self.stateBuffer || !self.ramBuffer || !self.romBuffer || !self.ioBuffer) {
+                fprintf(stderr, "[apple2-arcilator-gpu] init failed: buffer allocation\\n");
+                return nil;
+              }
+
+              memset([self.stateBuffer contents], 0, sizeof(RhdlStateScalar) * stateCount * instanceCount);
+              memset([self.ramBuffer contents], 0, RAM_SIZE * instanceCount);
+              memset([self.romBuffer contents], 0, ROM_SIZE * instanceCount);
+              memset([self.ioBuffer contents], 0, sizeof(RhdlArcGpuIo) * instanceCount);
+
+              return self;
+            }
+
+            - (BOOL)dispatchKernel {
+              id<MTLCommandBuffer> commandBuffer = [self.queue commandBuffer];
+              if (!commandBuffer) {
+                return NO;
+              }
+              id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
+              if (!encoder) {
+                return NO;
+              }
+
+              [encoder setComputePipelineState:self.pipeline];
+              [encoder setBuffer:self.stateBuffer offset:0 atIndex:0];
+              [encoder setBuffer:self.ramBuffer offset:0 atIndex:1];
+              [encoder setBuffer:self.romBuffer offset:0 atIndex:2];
+              [encoder setBuffer:self.ioBuffer offset:0 atIndex:3];
+
+              MTLSize grid = MTLSizeMake(self.instanceCount, 1, 1);
+              MTLSize tg = MTLSizeMake(1, 1, 1);
+              [encoder dispatchThreads:grid threadsPerThreadgroup:tg];
+              [encoder endEncoding];
+
+              [commandBuffer commit];
+              [commandBuffer waitUntilCompleted];
+              if (commandBuffer.status != MTLCommandBufferStatusCompleted) {
+                NSError* cbError = commandBuffer.error;
+                if (cbError) {
+                  fprintf(stderr, "[apple2-arcilator-gpu] command buffer error: %s\\n", [[cbError localizedDescription] UTF8String]);
+                }
+                return NO;
+              }
+              return YES;
+            }
+
+            - (RhdlArcGpuIo*)io { return reinterpret_cast<RhdlArcGpuIo*>([self.ioBuffer contents]); }
+            - (RhdlStateScalar*)stateSlots { return reinterpret_cast<RhdlStateScalar*>([self.stateBuffer contents]); }
+            - (uint8_t*)ram { return reinterpret_cast<uint8_t*>([self.ramBuffer contents]); }
+            - (uint8_t*)rom { return reinterpret_cast<uint8_t*>([self.romBuffer contents]); }
+            @end
+
+            static inline RhdlApple2MetalSim* as_sim(void* sim) {
+              return (__bridge RhdlApple2MetalSim*)sim;
+            }
+
+            static inline unsigned int run_cycles_internal(RhdlApple2MetalSim* sim, unsigned int n, unsigned int* dirty_out) {
+              if (!sim) {
+                if (dirty_out) { *dirty_out = 0u; }
+                return 0u;
+              }
+              RhdlArcGpuIo* io = [sim io];
+              for (uint32_t i = 0; i < sim.instanceCount; ++i) {
+                io[i].cycle_budget = n;
+                io[i].speaker_toggles = 0u;
+                io[i].text_dirty = 0u;
+              }
+              if (![sim dispatchKernel]) {
+                io[0].cycles_ran = 0u;
+                if (dirty_out) { *dirty_out = 0u; }
+                return 0u;
+              }
+              if (dirty_out) {
+                uint32_t any_dirty = 0u;
+                for (uint32_t i = 0; i < sim.instanceCount; ++i) {
+                  any_dirty |= (io[i].text_dirty & 1u);
+                }
+                *dirty_out = any_dirty;
+              }
+              return io[0].speaker_toggles;
+            }
+
+            extern "C" {
+            void* sim_create(void) {
+              @autoreleasepool {
+                RhdlApple2MetalSim* sim = [[RhdlApple2MetalSim alloc] initWithMetallibPath:kMetallibPath kernelName:kKernelName stateCount:STATE_COUNT instanceCount:INSTANCE_COUNT];
+                if (!sim) {
+                  fprintf(stderr, "[apple2-arcilator-gpu] sim_create failed during init\\n");
+                  return nullptr;
+                }
+                RhdlArcGpuIo* io = [sim io];
+            #{indent_cpp(default_lines.map { |line| line.sub('io->', 'io[0].') })}
+                for (uint32_t i = 0; i < sim.instanceCount; ++i) {
+                  io[i] = io[0];
+                }
+                io[0].cycle_budget = 0u;
+                io[0].last_clock = io[0].#{cpp_ident('clk_14m')};
+                if (![sim dispatchKernel]) {
+                  fprintf(stderr, "[apple2-arcilator-gpu] sim_create failed initial dispatch\\n");
+                  return nullptr;
+                }
+                return (__bridge_retained void*)sim;
+              }
+            }
+
+            void sim_destroy(void* sim) {
+              if (!sim) {
+                return;
+              }
+              @autoreleasepool {
+                CFBridgingRelease(sim);
+              }
+            }
+
+            void sim_eval(void* sim) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s) {
+                return;
+              }
+              RhdlArcGpuIo* io = [s io];
+              for (uint32_t i = 0; i < s.instanceCount; ++i) { io[i].cycle_budget = 0u; } // zero-cycle eval on every instance
+              (void)[s dispatchKernel];
+            }
+
+            void sim_reset(void* sim) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s) {
+                return;
+              }
+              RhdlArcGpuIo* io = [s io];
+            #{indent_cpp([reset_assert_line])}
+              unsigned int dirty = 0u;
+              (void)run_cycles_internal(s, 14u, &dirty);
+            #{indent_cpp([reset_deassert_line])}
+              (void)run_cycles_internal(s, 140u, &dirty);
+              for (uint32_t i = 0; i < s.instanceCount; ++i) { io[i].cycle_budget = 0u; } // no stale budget left on any instance
+              io[0].speaker_toggles = 0u;
+              io[0].text_dirty = 0u;
+            }
+
+            void sim_poke(void* sim, const char* name, unsigned int value) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s || !name) {
+                return;
+              }
+              RhdlArcGpuIo* io = [s io];
+            #{indent_cpp(poke_cases)}
+            }
+
+            unsigned int sim_peek(void* sim, const char* name) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s || !name) {
+                return 0u;
+              }
+              RhdlArcGpuIo* io = [s io];
+            #{indent_cpp(peek_cases)}
+              if (!strcmp(name, "cycle_budget")) { return io[0].cycle_budget; }
+              if (!strcmp(name, "cycles_ran")) { return io[0].cycles_ran; }
+              if (!strcmp(name, "last_clock")) { return io[0].last_clock; }
+              if (!strcmp(name, "speaker_toggles")) { return io[0].speaker_toggles; }
+              if (!strcmp(name, "text_dirty")) { return io[0].text_dirty; }
+              return 0u;
+            }
+
+            void sim_write_ram(void* sim, unsigned int addr, unsigned char value) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s) {
+                return;
+              }
+              if (addr < RAM_SIZE) {
+                uint8_t* ram = [s ram];
+                for (uint32_t i = 0; i < s.instanceCount; ++i) {
+                  ram[(i * RAM_SIZE) + addr] = value;
+                }
+              }
+            }
+
+            unsigned char sim_read_ram(void* sim, unsigned int addr) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s) {
+                return 0u;
+              }
+              if (addr < RAM_SIZE) {
+                return [s ram][addr];
+              }
+              return 0u;
+            }
+
+            void sim_write_rom(void* sim, unsigned int offset, unsigned char value) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s) {
+                return;
+              }
+              if (offset < ROM_SIZE) {
+                uint8_t* rom = [s rom];
+                uint8_t* ram = [s ram];
+                for (uint32_t i = 0; i < s.instanceCount; ++i) {
+                  rom[(i * ROM_SIZE) + offset] = value;
+                  ram[(i * RAM_SIZE) + 0xD000u + offset] = value;
+                }
+              }
+            }
+
+            unsigned int sim_run_cycles(void* sim, unsigned int n, unsigned int* dirty) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              return run_cycles_internal(s, n, dirty);
+            }
+
+            void sim_load_ram(void* sim, const unsigned char* data, unsigned int offset, unsigned int len) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s || !data) {
+                return;
+              }
+              uint8_t* ram = [s ram];
+              for (uint32_t inst = 0; inst < s.instanceCount; ++inst) {
+                uint32_t base = inst * RAM_SIZE;
+                for (unsigned int i = 0; i < len; ++i) {
+                  unsigned int addr = offset + i;
+                  if (addr >= RAM_SIZE) {
+                    break;
+                  }
+                  ram[base + addr] = data[i];
+                }
+              }
+            }
+
+            void sim_load_rom(void* sim, const unsigned char* data, unsigned int len) {
+              RhdlApple2MetalSim* s = as_sim(sim);
+              if (!s || !data) {
+                return;
+              }
+              uint8_t* rom = [s rom];
+              uint8_t* ram = [s ram];
+              unsigned int n = (len < ROM_SIZE) ? len : ROM_SIZE;
+              for (uint32_t inst = 0; inst < s.instanceCount; ++inst) {
+                uint32_t rom_base = inst * ROM_SIZE;
+                uint32_t ram_base = inst * RAM_SIZE;
+                for (unsigned int i = 0; i < n; ++i) {
+                  rom[rom_base + i] = data[i];
+                  ram[ram_base + 0xD000u + i] = data[i];
+                }
+              }
+            }
+            } // extern "C"
+          CPP
+
+          File.write(path, wrapper)
+        end
+
+        def run_or_raise(cmd, step_name, log_file) # raises LoadError quoting the log tail when cmd fails
+          output, exit_status = Open3.capture2e(*cmd)
+          File.write(log_file, output, mode: 'a') # append: the log accumulates every build step
+          unless exit_status.success?
+            raise LoadError, "#{step_name} failed: #{last_log_lines(log_file)}"
+          end
+        end
+
+        # Returns the last `count` lines of the log at `path`, stripped; a missing
+        # file or any StandardError while reading yields 'unknown error' instead.
+        def last_log_lines(path, count: 8)
+          File.exist?(path) ? File.readlines(path).last(count).join.strip : 'unknown error'
+        rescue StandardError
+          'unknown error'
+        end
+
+        # Path of the wrapper library inside build_dir; the extension is picked from
+        # RbConfig's host_os (.dylib on darwin, .dll on mswin/mingw, .so otherwise).
+        def shared_lib_path
+          extension = case RbConfig::CONFIG['host_os']
+                      when /darwin/ then '.dylib'
+                      when /mswin|mingw/ then '.dll'
+                      else '.so'
+                      end
+          File.join(build_dir, "#{shared_library_basename}#{extension}")
+        end
+
+        def cpp_ident(name) # every character outside [A-Za-z0-9_] becomes '_' for use in generated C++ member names
+          name.to_s.tr('^A-Za-z0-9_', '_')
+        end
+
+        def mask_literal(width) # C hex literal masking the low `width` bits; non-positive widths mask to 0x0u
+          w = width.to_i
+          return '0xFFFFFFFFu' if w >= 32
+          format('0x%Xu', w.positive? ? (1 << w) - 1 : 0) # a negative shift would format as an invalid literal
+        end
+
+        # Prefixes each non-empty entry with `spaces` spaces and joins the results
+        # with newlines. Only the start of each entry is prefixed (later lines of a
+        # multi-line entry are left as they are); nothing to emit yields ''.
+        def indent_cpp(lines, spaces: 12)
+          kept = Array(lines).reject { |entry| entry.to_s.empty? }
+          kept.map { |entry| (' ' * spaces) + entry.to_s }.join("\n")
+        end
+
+        # Instance count from the argument, else RHDL_APPLE2_ARCILATOR_GPU_INSTANCES; forced into 1..1024.
+        def normalize_instance_count(instances)
+          requested = (instances || ENV['RHDL_APPLE2_ARCILATOR_GPU_INSTANCES']).to_i
+          # Anything that converts to zero or less collapses to a single instance.
+          requested.clamp(1, 1024)
+        end
+
+        class << self
+          private
+
+          # True when some PATH directory holds an executable regular file named `tool`
+          # (File.executable? alone would also accept a directory of that name).
+          def command_available?(tool)
+            candidates = ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).map { |dir| File.join(dir, tool) }
+            candidates.any? { |candidate| File.file?(candidate) && File.executable?(candidate) }
+          end
+
+          def command_success?(cmd) # true when cmd runs and exits successfully; any StandardError counts as failure
+            _out, status = Open3.capture2e(*cmd)
+            status.success?
+          rescue StandardError
+            false
+          end
+
+          def macos_host? # based on RUBY_PLATFORM containing 'darwin'
+            RUBY_PLATFORM.include?('darwin')
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/examples/apple2/utilities/runners/headless_runner.rb b/examples/apple2/utilities/runners/headless_runner.rb
index 98db237e..0765b327 100644
--- a/examples/apple2/utilities/runners/headless_runner.rb
+++ b/examples/apple2/utilities/runners/headless_runner.rb
@@ -15,7 +15,7 @@ class HeadlessRunner
       attr_reader :runner, :mode, :sim_backend
 
       # Create a headless runner with the specified options
-      # @param mode [Symbol] Simulation mode: :ruby, :ir, :netlist, :verilog, :circt
+      # @param mode [Symbol] Simulation mode: :ruby, :ir, :netlist, :verilog, :circt, :arcilator_gpu
       # @param sim [Symbol] Simulator backend for :ir/:netlist: :interpret, :jit, :compile
       # @param sub_cycles [Integer] Sub-cycles per CPU cycle (for IR backends)
       def initialize(mode: :ruby, sim: nil, sub_cycles: 14)
@@ -39,8 +39,11 @@ def initialize(mode: :ruby, sim: nil, sub_cycles: 14)
                   when :circt
                     require_relative 'arcilator_runner'
                     RHDL::Examples::Apple2::ArcilatorRunner.new(sub_cycles: sub_cycles)
+                  when :arcilator_gpu
+                    require_relative 'arcilator_gpu_runner'
+                    RHDL::Examples::Apple2::ArcilatorGpuRunner.new(sub_cycles: sub_cycles)
                   else
-                    raise ArgumentError, "Unknown mode: #{mode}. Valid modes: ruby, ir, netlist, verilog, circt"
+                    raise ArgumentError, "Unknown mode: #{mode}. Valid modes: ruby, ir, netlist, verilog, circt, arcilator_gpu"
                   end
       end
 
@@ -191,7 +194,7 @@ def backend
           @sim_backend
         when :netlist
           @sim_backend
-        when :verilog, :circt
+        when :verilog, :circt, :arcilator_gpu
           nil
         else
           @sim_backend
@@ -289,9 +292,9 @@ def default_backend(mode)
         case mode
         when :ruby then :ruby
         when :ir, :netlist then :compile
-        when :verilog, :circt then nil
+        when :verilog, :circt, :arcilator_gpu then nil
         else
-          raise ArgumentError, "Unknown mode: #{mode}. Valid modes: ruby, ir, netlist, verilog, arcilator"
+          raise ArgumentError, "Unknown mode: #{mode}. Valid modes: ruby, ir, netlist, verilog, circt, arcilator_gpu"
         end
       end
     end
diff --git a/examples/apple2/utilities/tasks/run_task.rb b/examples/apple2/utilities/tasks/run_task.rb
index 910415d1..7a9013b5 100644
--- a/examples/apple2/utilities/tasks/run_task.rb
+++ b/examples/apple2/utilities/tasks/run_task.rb
@@ -226,6 +226,7 @@ def run
                       when :netlist then "Netlist (gate-level)"
                       when :verilog then "Verilog (Verilator RTL)"
                       when :circt then "CIRCT (Arcilator RTL)"
+                      when :arcilator_gpu then "Arcilator GPU (ArcToGPU Metal)"
                       else @sim_mode.to_s
                       end
           puts "Starting Apple II emulator in #{mode_name} mode..."
diff --git a/examples/riscv/bin/riscv b/examples/riscv/bin/riscv
index d100ff71..595df488 100755
--- a/examples/riscv/bin/riscv
+++ b/examples/riscv/bin/riscv
@@ -43,7 +43,7 @@ module RHDL
             opts.separator "RISC-V Core Runner"
             opts.separator ""
 
-            opts.on("-m", "--mode TYPE", %i[ruby ir netlist verilog circt], "Simulation mode: ir (default), ruby, netlist, verilog, circt") do |v|
+            opts.on("-m", "--mode TYPE", %i[ruby ir netlist verilog circt arcilator_gpu], "Simulation mode: ir (default), ruby, netlist, verilog, circt, arcilator_gpu") do |v|
               options[:mode] = v
             end
 
@@ -177,7 +177,7 @@ module RHDL
           options[:sim] = case options[:mode]
                           when :ruby then :ruby
                           when :ir, :netlist then :compile
-                          when :verilog, :circt then :ruby
+                          when :verilog, :circt, :arcilator_gpu then :ruby
                           else :ruby
                           end
         end
@@ -200,7 +200,7 @@ module RHDL
                               when :interpret then 1_000
                               else 1_000
                               end
-                            when :verilog, :circt
+                            when :verilog, :circt, :arcilator_gpu
                               100_000
                             else
                               1_000
diff --git a/examples/riscv/utilities/runners/arcilator_gpu_runner.rb b/examples/riscv/utilities/runners/arcilator_gpu_runner.rb
new file mode 100644
index 00000000..6a9f8688
--- /dev/null
+++ b/examples/riscv/utilities/runners/arcilator_gpu_runner.rb
@@ -0,0 +1,1173 @@
+# frozen_string_literal: true
+
+# RV32I Arcilator GPU Runner - Native RTL simulation via ArcToGPU + Metal
+
+require_relative 'arcilator_runner'
+require 'fileutils'
+require 'json'
+require 'open3'
+require 'rbconfig'
+require 'rhdl/codegen/firrtl/firrtl'
+require 'rhdl/codegen/firrtl/arc_to_gpu_lowering'
+
+module RHDL
+  module Examples
+    module RISCV
+      class ArcilatorGpuRunner < ArcilatorRunner
+        BUILD_BASE = File.expand_path('../../.hdl_build', __dir__)
+        REQUIRED_TOOLS = %w[firtool circt-opt arcilator].freeze
+        MAX_INSTANCE_COUNT = 1024
+        DEFAULT_ARC_TO_GPU_PROFILE = :riscv
+        DEFAULT_BUILD_VARIANT = 'arcilator_gpu'
+        DEFAULT_SHARED_LIB_NAME = 'libriscv_arcilator_gpu_sim.so'
+        DEFAULT_BACKEND_SYMBOL = :arcilator_gpu
+        DEFAULT_SIMULATOR_TYPE = :hdl_arcilator_gpu
+        ARC_TO_GPU_BUILD_ENV_VARS = %w[
+          RHDL_ARC_TO_GPU_RISCV_CORE_SPECIALIZE
+        ].freeze
+
+        class << self # singleton methods: toolchain probes callable on the class itself
+          def status # => { ready: Boolean, missing_tools: Array<String> }
+            missing_tools = []
+            REQUIRED_TOOLS.each { |tool| missing_tools << tool unless command_available?(tool) }
+
+            unless command_available?('clang++') || command_available?('c++') # either compiler driver is enough
+              missing_tools << 'clang++/c++'
+            end
+
+            if macos_host?
+              missing_tools << 'xcrun' unless command_available?('xcrun')
+              missing_tools << 'metal' unless command_success?(%w[xcrun -f metal]) # `xcrun -f metal` must succeed
+              missing_tools << 'metallib' unless command_success?(%w[xcrun -f metallib])
+            else
+              missing_tools << 'macOS Metal toolchain' # any non-darwin host is reported as not ready
+            end
+
+            {
+              ready: missing_tools.empty?,
+              missing_tools: missing_tools.uniq
+            }
+          end
+
+          def available? # true when status reports no missing tools
+            status[:ready]
+          end
+
+          def ensure_available! # returns the status hash when ready; otherwise raises LoadError naming the gaps
+            info = status
+            return info if info[:ready]
+
+            raise LoadError,
+              "arcilator_gpu backend unavailable (missing tools: #{info[:missing_tools].join(', ')}). " \
+              'Install CIRCT tools and the macOS Metal toolchain.'
+          end
+
+          private
+
+          def command_available?(tool) # true if some PATH entry contains an executable entry named `tool`
+            ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |path|
+              File.executable?(File.join(path, tool))
+            end
+          end
+
+          def command_success?(cmd) # runs cmd (argv array); false on unsuccessful exit or any StandardError
+            _out, status = Open3.capture2e(*cmd)
+            status.success?
+          rescue StandardError
+            false
+          end
+
+          def macos_host? # decided by RUBY_PLATFORM containing 'darwin'
+            RUBY_PLATFORM.include?('darwin')
+          end
+        end
+
+        attr_reader :instance_count
+
+        def initialize(
+          mem_size: Memory::DEFAULT_SIZE,
+          instances: nil,
+          core_specialize: nil,
+          arc_to_gpu_profile: DEFAULT_ARC_TO_GPU_PROFILE,
+          build_variant: DEFAULT_BUILD_VARIANT,
+          shared_lib_name: DEFAULT_SHARED_LIB_NAME,
+          backend_symbol: DEFAULT_BACKEND_SYMBOL,
+          simulator_type_symbol: DEFAULT_SIMULATOR_TYPE
+        )
+          normalized_mem_size = normalize_mem_size(mem_size) # non-positive -> default; otherwise rounded up to a power of two
+          @instance_count = normalize_instance_count(instances) # clamped to 1..MAX_INSTANCE_COUNT
+          @core_specialize = normalize_core_specialize(core_specialize)
+          @arc_to_gpu_profile = arc_to_gpu_profile.to_sym
+          @build_variant = build_variant.to_s # names the build sub-directory (see build_dir)
+          @shared_lib_name = shared_lib_name.to_s
+          @backend_symbol = backend_symbol.to_sym
+          @simulator_type_symbol = simulator_type_symbol.to_sym
+          env_overrides = { # settings handed over through process environment variables
+            'RHDL_RISCV_ARCILATOR_GPU_INSTANCES_RUNTIME' => @instance_count.to_s,
+            'RHDL_ARC_TO_GPU_RISCV_CORE_SPECIALIZE' => (@core_specialize ? '1' : '0'),
+            'RHDL_RISCV_ARCILATOR_GPU_CORE_SPECIALIZE_RUNTIME' => (@core_specialize ? '1' : '0')
+          }
+          previous_env = env_overrides.to_h { |key, _value| [key, ENV[key]] } # snapshot so ensure can restore
+          env_overrides.each { |key, value| ENV[key] = value } # in effect only while the backend initializes
+          initialize_backend_runner( # not defined in this file; presumably inherited from ArcilatorRunner
+            backend_sym: @backend_symbol,
+            simulator_type_sym: @simulator_type_symbol,
+            mem_size: normalized_mem_size
+          )
+        ensure # always put ENV back, even when initialization raised
+          if previous_env # nil when an error happened before the snapshot was taken
+            previous_env.each do |key, value|
+              if value.nil? # variable was unset before: remove it again
+                ENV.delete(key)
+              else
+                ENV[key] = value
+              end
+            end
+          end
+        end
+
+        def read_pc # program counter; the native-hook result is masked to 32 bits
+          if @sim_read_pc_fn
+            @sim_read_pc_fn.call(@sim_ctx).to_i & 0xFFFF_FFFF # mask also folds a negative C int into 0..2**32-1
+          else
+            eval_cpu # no native hook: evaluate first, then use the superclass reader
+            super
+          end
+        rescue StandardError # failure in either branch: retry through eval_cpu + superclass
+          eval_cpu
+          super
+        end
+
+        def read_reg(index) # register value; the native-hook result is masked to 32 bits
+          idx = index.to_i & 0x1F # only the low five bits of the index are used
+          return 0 if idx.zero? # register 0 always reads as 0 here
+
+          if @sim_read_reg_fn
+            @sim_read_reg_fn.call(@sim_ctx, idx).to_i & 0xFFFF_FFFF
+          else
+            super
+          end
+        rescue StandardError # any failure falls back to the superclass implementation
+          super
+        end
+
+        def current_inst # current instruction word; the native-hook result is masked to 32 bits
+          if @sim_read_inst_fn
+            @sim_read_inst_fn.call(@sim_ctx).to_i & 0xFFFF_FFFF
+          else
+            eval_cpu # no native hook: evaluate first, then use the superclass reader
+            super
+          end
+        rescue StandardError # failure in either branch: retry through eval_cpu + superclass
+          eval_cpu
+          super
+        end
+
+        def dispatch_count # Integer from the native sim_dispatch_count hook, or nil if unavailable
+          return nil unless @sim_dispatch_count_fn
+
+          @sim_dispatch_count_fn.call(@sim_ctx).to_i
+        rescue StandardError # a failing native call is reported as nil, not raised
+          nil
+        end
+
+        def wait_count # Integer from the native sim_wait_count hook, or nil if unavailable
+          return nil unless @sim_wait_count_fn
+
+          @sim_wait_count_fn.call(@sim_ctx).to_i
+        rescue StandardError # a failing native call is reported as nil, not raised
+          nil
+        end
+
+        def fast_dispatch_count # Integer from the native sim_fast_dispatch_count hook, or nil if unavailable
+          return nil unless @sim_fast_dispatch_count_fn
+
+          @sim_fast_dispatch_count_fn.call(@sim_ctx).to_i
+        rescue StandardError # a failing native call is reported as nil, not raised
+          nil
+        end
+
+        def fallback_dispatch_count # Integer from the native sim_fallback_dispatch_count hook, or nil if unavailable
+          return nil unless @sim_fallback_dispatch_count_fn
+
+          @sim_fallback_dispatch_count_fn.call(@sim_ctx).to_i
+        rescue StandardError # a failing native call is reported as nil, not raised
+          nil
+        end
+
+        private
+
+        def load_shared_library # extends the superclass loader with optional hooks and a context check
+          super
+          load_optional_metrics_symbols # binds optional hooks; leaves them all nil if any is missing
+          validate_sim_context! # raises LoadError when @sim_ctx is missing or null
+        end
+
+        def load_optional_metrics_symbols # binds optional native introspection/metrics entry points from @lib
+          @sim_read_pc_fn = Fiddle::Function.new(
+            @lib['sim_read_pc'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @sim_read_reg_fn = Fiddle::Function.new(
+            @lib['sim_read_reg'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_INT], # called by read_reg with (@sim_ctx, register index)
+            Fiddle::TYPE_INT
+          )
+          @sim_read_inst_fn = Fiddle::Function.new(
+            @lib['sim_read_inst'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @sim_dispatch_count_fn = Fiddle::Function.new(
+            @lib['sim_dispatch_count'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @sim_wait_count_fn = Fiddle::Function.new(
+            @lib['sim_wait_count'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @sim_fast_dispatch_count_fn = Fiddle::Function.new(
+            @lib['sim_fast_dispatch_count'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @sim_fallback_dispatch_count_fn = Fiddle::Function.new(
+            @lib['sim_fallback_dispatch_count'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+        rescue Fiddle::DLError, NameError # all-or-nothing: one failed lookup clears every hook below
+          @sim_read_pc_fn = nil
+          @sim_read_reg_fn = nil
+          @sim_read_inst_fn = nil
+          @sim_dispatch_count_fn = nil
+          @sim_wait_count_fn = nil
+          @sim_fast_dispatch_count_fn = nil
+          @sim_fallback_dispatch_count_fn = nil
+        end
+
+        def normalize_mem_size(mem_size) # returns a positive power-of-two size as Integer
+          size = mem_size.to_i
+          size = Memory::DEFAULT_SIZE if size <= 0 # zero/negative input falls back to the default
+          return size if power_of_two?(size)
+
+          next_pow2 = 1
+          next_pow2 <<= 1 while next_pow2 < size # smallest power of two that is >= size
+          warn "ArcilatorGpuRunner mem_size #{size} is not power-of-two; rounding up to #{next_pow2}."
+          next_pow2
+        end
+
+        def power_of_two?(value) # true only for positive integers with a single bit set
+          value > 0 && (value & (value - 1)).zero?
+        end
+
+        def normalize_instance_count(instances) # explicit argument wins, then the two env vars, in this order
+          raw = instances || ENV['RHDL_RISCV_ARCILATOR_GPU_INSTANCES'] || ENV['RHDL_BENCH_ARCILATOR_GPU_INSTANCES']
+          value = raw.to_i # nil and non-numeric strings become 0
+          value = 1 if value <= 0 # at least one instance
+          [value, MAX_INSTANCE_COUNT].min # capped at MAX_INSTANCE_COUNT
+        end
+
+        def normalize_core_specialize(core_specialize) # a non-nil argument is returned unchanged
+          return core_specialize unless core_specialize.nil?
+
+          raw = ENV['RHDL_RISCV_ARCILATOR_GPU_CORE_SPECIALIZE']
+          return false if raw.nil? # unset variable means specialization is off
+
+          !%w[0 false no off].include?(raw.to_s.strip.downcase) # any other value (even '') turns it on
+        end
+
+        def check_tools_available! # delegates to the class-level check, which raises LoadError when tools are missing
+          self.class.ensure_available!
+        end
+
+        def build_dir # memoized: BUILD_BASE/<build_variant>
+          @build_dir ||= File.join(BUILD_BASE, @build_variant)
+        end
+
+        def shared_lib_path # full path of the shared library inside build_dir
+          File.join(build_dir, @shared_lib_name)
+        end
+
+        def build_simulation # (re)builds the Metal-backed shared library when needed and sets @lib_path
+          FileUtils.mkdir_p(build_dir)
+
+          fir_file = File.join(build_dir, 'riscv_cpu.fir')
+          parsed_mlir_file = File.join(build_dir, 'riscv_cpu_parsed.mlir')
+          lowered_mlir_file = File.join(build_dir, 'riscv_cpu_lowered.mlir')
+          hw_mlir_file = File.join(build_dir, 'riscv_cpu_hw.mlir')
+          arc_mlir_file = File.join(build_dir, 'riscv_cpu_arc.mlir')
+          gpu_mlir_file = File.join(build_dir, 'riscv_cpu_arc_to_gpu.mlir')
+          gpu_meta_file = File.join(build_dir, 'riscv_cpu_arc_to_gpu.json')
+          metal_source_file = File.join(build_dir, 'riscv_cpu_arc_to_gpu.metal')
+          metal_air_file = File.join(build_dir, 'riscv_cpu_arc_to_gpu.air')
+          metal_lib_file = File.join(build_dir, 'riscv_cpu_arc_to_gpu.metallib')
+          wrapper_file = File.join(build_dir, 'riscv_arcgpu_wrapper.mm')
+          build_config_file = File.join(build_dir, 'riscv_metal_build_config.json')
+          log_file = File.join(build_dir, 'riscv_metal.log')
+          lib_file = shared_lib_path
+          expected_build_config = build_config_signature
+
+          needs_rebuild = !File.exist?(lib_file)
+          outputs = [gpu_meta_file, metal_source_file, metal_lib_file, wrapper_file, lib_file, build_config_file]
+          needs_rebuild ||= outputs.any? { |path| !File.exist?(path) } # any missing artifact forces a rebuild
+
+          unless needs_rebuild # all artifacts exist: check freshness and configuration
+            deps = [
+              __FILE__,
+              File.expand_path('../../hdl/cpu.rb', __dir__),
+              File.expand_path('../../../../lib/rhdl/codegen/firrtl/firrtl.rb', __dir__),
+              File.expand_path('../../../../lib/rhdl/codegen/firrtl/arc_to_gpu_lowering.rb', __dir__),
+              File.expand_path(
+                "../../../../lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/#{@arc_to_gpu_profile}.rb",
+                __dir__
+              )
+            ].select { |path| File.exist?(path) } # dependency files that do not exist are ignored
+
+            newest_dep = deps.map { |path| File.mtime(path) }.max
+            oldest_output = outputs.map { |path| File.mtime(path) }.min
+            needs_rebuild = newest_dep && oldest_output && newest_dep > oldest_output # stale if a dep is newer
+            needs_rebuild ||= read_build_config(build_config_file) != expected_build_config # config changed
+          end
+
+          if needs_rebuild
+            File.delete(log_file) if File.exist?(log_file) # each rebuild starts with a fresh log
+            export_firrtl(fir_file)
+            run_or_raise(%W[firtool #{fir_file} --parse-only -o #{parsed_mlir_file}], 'firtool parse', log_file)
+            run_or_raise(
+              ['circt-opt', parsed_mlir_file, "--pass-pipeline=#{firrtl_pipeline_without_comb_check}", '-o', lowered_mlir_file],
+              'circt-opt FIRRTL pipeline',
+              log_file
+            )
+            run_or_raise(
+              ['firtool', '--format=mlir', lowered_mlir_file, '--ir-hw', '-o', hw_mlir_file],
+              'firtool HW lowering',
+              log_file
+            )
+            emit_gpu_input_mlir(
+              hw_mlir_file: hw_mlir_file,
+              arc_mlir_file: arc_mlir_file,
+              log_file: log_file
+            )
+
+            RHDL::Codegen::FIRRTL::ArcToGpuLowering.lower(
+              arc_mlir_path: arc_mlir_file,
+              gpu_mlir_path: gpu_mlir_file,
+              metadata_path: gpu_meta_file,
+              metal_source_path: metal_source_file,
+              profile: @arc_to_gpu_profile
+            )
+
+            compile_metal_shader(
+              metal_source_file: metal_source_file,
+              metal_air_file: metal_air_file,
+              metal_lib_file: metal_lib_file,
+              log_file: log_file
+            )
+
+            write_wrapper(
+              path: wrapper_file,
+              metadata_path: gpu_meta_file,
+              metallib_path: metal_lib_file
+            )
+            link_shared_library(wrapper_file, lib_file, log_file: log_file)
+            File.write(build_config_file, JSON.pretty_generate(expected_build_config)) # written last, after all steps succeeded
+          end
+
+          @lib_path = lib_file
+        end
+
+        def export_firrtl(path) # writes FIRRTL generated from the flattened CPU IR to `path`
+          flat_ir = CPU.to_flat_ir(top_name: 'riscv_cpu')
+          firrtl = RHDL::Codegen::FIRRTL.generate(flat_ir)
+          File.write(path, firrtl)
+        end
+
+        def build_config_signature # Hash compared by build_simulation against the stored build config JSON
+          {
+            'format' => 4, # NOTE(review): presumably a schema version bumped to invalidate old builds - confirm
+            'arc_to_gpu_profile' => @arc_to_gpu_profile.to_s,
+            'build_dir' => build_dir,
+            'shared_lib_path' => shared_lib_path,
+            'arc_to_gpu_env' => ARC_TO_GPU_BUILD_ENV_VARS.to_h { |name| [name, ENV[name].to_s] } # '' when unset
+          }
+        end
+
+        def read_build_config(path) # parsed JSON content of `path`, or nil when missing or unparsable
+          return nil unless File.exist?(path)
+
+          JSON.parse(File.read(path))
+        rescue JSON::ParserError # a corrupt config file is treated like a missing one
+          nil
+        end
+
+        def compile_metal_shader(metal_source_file:, metal_air_file:, metal_lib_file:, log_file:) # .metal -> .air -> .metallib
+          module_cache_dir = File.join(build_dir, 'clang_module_cache')
+          FileUtils.rm_rf(module_cache_dir) # the module cache is recreated empty for every compile
+          FileUtils.mkdir_p(module_cache_dir)
+          run_or_raise(
+            [
+              'xcrun', '-sdk', 'macosx', 'metal', '-c', '-O3',
+              "-fmodules-cache-path=#{module_cache_dir}",
+              metal_source_file, '-o', metal_air_file
+            ],
+            'metal shader compile',
+            log_file
+          )
+          run_or_raise(
+            ['xcrun', '-sdk', 'macosx', 'metallib', metal_air_file, '-o', metal_lib_file],
+            'metallib link',
+            log_file
+          )
+        end
+
+        def link_shared_library(wrapper_file, output_file, log_file:) # builds the wrapper into a dynamic library
+          cxx = command_available?('clang++') ? 'clang++' : 'c++' # NOTE(review): instance-level call; the command_available? in this file is a private singleton method - presumably ArcilatorRunner supplies an instance version, confirm
+          cmd = [
+            cxx,
+            '-std=c++17',
+            '-x',
+            'objective-c++', # the wrapper source is compiled as Objective-C++
+            '-fobjc-arc',
+            '-dynamiclib',
+            '-O2',
+            '-o',
+            output_file,
+            wrapper_file,
+            '-framework',
+            'Foundation',
+            '-framework',
+            'Metal'
+          ]
+          run_or_raise(cmd, 'Objective-C++ link', log_file)
+        end
+
+        def run_or_raise(cmd, step_name, log_file) # runs cmd (argv array); returns nil on success, raises LoadError otherwise
+          out, status = Open3.capture2e(*cmd) # stdout and stderr are captured together
+          File.write(log_file, out, mode: 'a') # append, so one log collects every build step
+          return if status.success?
+
+          raise LoadError, "#{step_name} failed for RISC-V Metal runner: #{last_log_lines(log_file)}"
+        end
+
+        def emit_gpu_input_mlir(
+          hw_mlir_file:,
+          arc_mlir_file:,
+          log_file:
+        )
+          run_or_raise( # arcilator reads hw_mlir_file and writes MLIR to arc_mlir_file; raises LoadError on failure
+            ['arcilator', hw_mlir_file, '--emit-mlir', '--until-after=arc-opt', '-o', arc_mlir_file],
+            'arcilator Arc emission',
+            log_file
+          )
+        end
+
+        def last_log_lines(log_file, count = 20) # last `count` lines of the log as one stripped String
+          return 'no log output' unless File.exist?(log_file) # placeholder text when no log file exists
+
+          File.readlines(log_file).last(count).join.strip
+        end
+
+        def validate_sim_context! # raises LoadError when @sim_ctx is nil/false or converts to integer zero
+          return unless !@sim_ctx || (@sim_ctx.respond_to?(:to_i) && @sim_ctx.to_i.zero?)
+
+          raise LoadError,
+            'ArcilatorGPU simulation context initialization failed (sim_create returned null). ' \
+            'Check the generated Metal library path and GPU toolchain compatibility.'
+        end
+
+        def cpp_ident(name) # replaces every character outside A-Z, a-z, 0-9 and _ with an underscore
+          name.to_s.gsub(/[^A-Za-z0-9_]/, '_')
+        end
+
+        def write_wrapper(path:, metadata_path:, metallib_path:)
+          metadata = JSON.parse(File.read(metadata_path))
+          state_count = metadata.dig('metal', 'state_count').to_i
+          state_scalar_bits = metadata.dig('metal', 'state_scalar_bits').to_i
+          state_scalar_bits = 32 if state_scalar_bits <= 0
+          state_scalar_bytes = state_scalar_bits > 32 ? 8 : 4
+          state_scalar_cpp_type = state_scalar_bits > 32 ? 'uint64_t' : 'uint32_t'
+          kernel_name = metadata.dig('metal', 'entry').to_s
+          input_layout = metadata.dig('metal', 'runtime_input_layout')
+          input_layout = metadata['top_input_layout'] if input_layout.nil?
+          input_layout = Array(input_layout)
+          output_layout = metadata.dig('metal', 'runtime_output_layout')
+          output_layout = metadata['top_output_layout'] if output_layout.nil?
+          output_layout = Array(output_layout)
+          state_layout = Array(metadata['state_layout'])
+          introspection = metadata.dig('metal', 'introspection') || {}
+
+          raise LoadError, 'ArcToGPU metadata missing metal entry' if kernel_name.empty?
+          raise LoadError, 'ArcToGPU metadata missing state_count' if state_count <= 0
+
+          pc_slot = introspection['pc_slot']
+          pc_width = introspection['pc_width']
+          if pc_slot.nil?
+            pc_slot_entry = state_layout.find { |entry| entry.fetch('result_ref', '').include?('pc_reg__pc') }
+            pc_slot_entry ||= state_layout.find { |entry| entry.fetch('result_ref', '').include?('pc_reg__q') }
+            pc_slot = pc_slot_entry ? pc_slot_entry.fetch('index').to_i : -1
+            pc_width = pc_slot_entry ? pc_slot_entry.fetch('width').to_i : 32
+          end
+          pc_slot = pc_slot.to_i
+          pc_width = pc_width.to_i
+
+          regfile_base_slot = introspection['regfile_base_slot']
+          regfile_length = introspection['regfile_length']
+          if regfile_base_slot.nil? || regfile_length.nil?
+            regfile_entry = state_layout.find do |entry|
+              entry.fetch('kind', '') == 'arc_memory' &&
+                entry.fetch('length', 0).to_i == 32 &&
+                entry.fetch('index_width', 0).to_i == 5 &&
+                entry.fetch('width', 0).to_i == 32 &&
+                entry.fetch('slots_per_element', 1).to_i == 1
+            end
+            regfile_base_slot = regfile_entry ? regfile_entry.fetch('index').to_i : -1
+            regfile_length = regfile_entry ? regfile_entry.fetch('length').to_i : 0
+          end
+          regfile_base_slot = regfile_base_slot.to_i
+          regfile_length = regfile_length.to_i
+
+          struct_input_fields = input_layout.map { |entry| "  uint32_t #{cpp_ident(entry.fetch('name'))};" }
+          struct_output_fields = output_layout.map { |entry| "  uint32_t #{cpp_ident(entry.fetch('name'))};" }
+
+          poke_cases = input_layout.map do |entry|
+            name = entry.fetch('name')
+            field = cpp_ident(name)
+            width = entry.fetch('width').to_i
+            <<~CPP
+              if (!strcmp(name, "#{name}")) {
+                uint32_t masked = mask_width(value, #{width}u);
+                for (uint32_t inst = 0u; inst < ctx->sim.instanceCount; ++inst) {
+                  RhdlArcGpuIo* io_inst = [ctx->sim ioAtIndex:inst];
+                  if (io_inst) {
+                    io_inst->#{field} = masked;
+                  }
+                }
+                return;
+              }
+            CPP
+          end
+
+          peek_cases = (input_layout + output_layout).map do |entry|
+            name = entry.fetch('name')
+            field = cpp_ident(name)
+            width = entry.fetch('width').to_i
+            <<~CPP
+              if (!strcmp(name, "#{name}")) {
+                return io->#{field} & mask_width(0xFFFFFFFFu, #{width}u);
+              }
+            CPP
+          end
+
+          input_field_set = input_layout.each_with_object({}) { |entry, acc| acc[cpp_ident(entry.fetch('name'))] = true }
+
+          init_defaults = []
+          init_defaults << 'io->cycle_budget = 0u;'
+          init_defaults << 'io->cycles_ran = 0u;'
+          init_defaults << 'io->mem_mask = memMask;'
+          init_defaults << 'io->clk = 0u;' if input_field_set[cpp_ident('clk')]
+          init_defaults << 'io->rst = 1u;' if input_field_set[cpp_ident('rst')]
+          init_defaults << 'io->irq_software = 0u;' if input_field_set[cpp_ident('irq_software')]
+          init_defaults << 'io->irq_timer = 0u;' if input_field_set[cpp_ident('irq_timer')]
+          init_defaults << 'io->irq_external = 0u;' if input_field_set[cpp_ident('irq_external')]
+          init_defaults << 'io->inst_data = 0u;' if input_field_set[cpp_ident('inst_data')]
+          init_defaults << 'io->data_rdata = 0u;' if input_field_set[cpp_ident('data_rdata')]
+          init_defaults << 'io->inst_ptw_pte0 = 0u;' if input_field_set[cpp_ident('inst_ptw_pte0')]
+          init_defaults << 'io->inst_ptw_pte1 = 0u;' if input_field_set[cpp_ident('inst_ptw_pte1')]
+          init_defaults << 'io->data_ptw_pte0 = 0u;' if input_field_set[cpp_ident('data_ptw_pte0')]
+          init_defaults << 'io->data_ptw_pte1 = 0u;' if input_field_set[cpp_ident('data_ptw_pte1')]
+          init_defaults << 'io->debug_reg_addr = 0u;' if input_field_set[cpp_ident('debug_reg_addr')]
+
+          reset_assert_line = input_field_set[cpp_ident('rst')] ? 'io->rst = 1u;' : ''
+          reset_deassert_line = input_field_set[cpp_ident('rst')] ? 'io->rst = 0u;' : ''
+          reset_clk_low_line = input_field_set[cpp_ident('clk')] ? 'io->clk = 0u;' : ''
+          reset_clk_high_line = input_field_set[cpp_ident('clk')] ? 'io->clk = 1u;' : ''
+          objc_sim_class = "RhdlRiscvMetalSim_#{cpp_ident(@build_variant)}"
+
+          wrapper = <<~CPP
+            #import <Foundation/Foundation.h>
+            #import <Metal/Metal.h>
+            #include <CoreFoundation/CoreFoundation.h>
+            #include <dlfcn.h>
+            #include <cstdint>
+            #include <cstring>
+            #include <cstdlib>
+            #include <cstdio>
+
+            static const uint32_t STATE_COUNT = #{state_count}u;
+            static const uint32_t STATE_SCALAR_BITS = #{state_scalar_bits}u;
+            static const uint32_t STATE_SCALAR_BYTES = #{state_scalar_bytes}u;
+            static const uint32_t MAX_INSTANCE_COUNT = #{MAX_INSTANCE_COUNT}u;
+            static const int32_t PC_SLOT_INDEX = #{pc_slot};
+            static const uint32_t PC_SLOT_WIDTH = #{pc_width}u;
+            static const int32_t REGFILE_BASE_SLOT = #{regfile_base_slot};
+            static const uint32_t REGFILE_LENGTH = #{regfile_length}u;
+            static NSString* const kMetallibFilename = @#{File.basename(metallib_path).dump};
+            static NSString* const kMetallibFallbackPath = @#{metallib_path.dump};
+            static NSString* const kKernelName = @#{kernel_name.dump};
+            using RhdlStateScalar = #{state_scalar_cpp_type};
+
+            struct RhdlArcGpuIo {
+              uint32_t cycle_budget;
+              uint32_t cycles_ran;
+              uint32_t mem_mask;
+              uint32_t _reserved;
+            #{struct_input_fields.join("\n")}
+            #{struct_output_fields.join("\n")}
+            };
+
+            static inline uint32_t mask_width(uint32_t value, uint32_t width) {
+              if (width >= 32u) {
+                return value;
+              }
+              if (width == 0u) {
+                return 0u;
+              }
+              return value & ((1u << width) - 1u);
+            }
+
+            static inline uint32_t resolve_instance_count() {
+              const char* raw = getenv("RHDL_RISCV_ARCILATOR_GPU_INSTANCES_RUNTIME");
+              if (!raw || *raw == '\\0') {
+                return 1u;
+              }
+              char* end = nullptr;
+              unsigned long parsed = strtoul(raw, &end, 10);
+              if (end == raw || parsed == 0ul) {
+                return 1u;
+              }
+              if (parsed > (unsigned long)MAX_INSTANCE_COUNT) {
+                return MAX_INSTANCE_COUNT;
+              }
+              return (uint32_t)parsed;
+            }
+
+            static inline uint32_t read_word_le(const uint8_t* mem, uint32_t mask, uint32_t addr) {
+              uint32_t a = addr & mask;
+              return (uint32_t)mem[a] |
+                ((uint32_t)mem[(a + 1u) & mask] << 8u) |
+                ((uint32_t)mem[(a + 2u) & mask] << 16u) |
+                ((uint32_t)mem[(a + 3u) & mask] << 24u);
+            }
+
+            static NSString* resolveMetallibPath() {
+              Dl_info info;
+              if (dladdr((const void*)&resolveMetallibPath, &info) != 0 && info.dli_fname) {
+                NSString* dylibPath = [NSString stringWithUTF8String:info.dli_fname];
+                NSString* candidate = [[dylibPath stringByDeletingLastPathComponent]
+                  stringByAppendingPathComponent:kMetallibFilename];
+                if ([[NSFileManager defaultManager] fileExistsAtPath:candidate]) {
+                  return candidate;
+                }
+              }
+              return kMetallibFallbackPath;
+            }
+
+            @interface #{objc_sim_class} : NSObject
+            @property(nonatomic, strong) id<MTLDevice> device;
+            @property(nonatomic, strong) id<MTLCommandQueue> queue;
+            @property(nonatomic, strong) id<MTLLibrary> library;
+            @property(nonatomic, strong) id<MTLComputePipelineState> pipeline;
+            @property(nonatomic, strong) id<MTLBuffer> stateBuffer;
+            @property(nonatomic, strong) id<MTLBuffer> instBuffer;
+            @property(nonatomic, strong) id<MTLBuffer> dataBuffer;
+            @property(nonatomic, strong) id<MTLBuffer> ioBuffer;
+            @property(nonatomic, assign) uint32_t memSize;
+            @property(nonatomic, assign) uint32_t memMask;
+            @property(nonatomic, assign) uint32_t instanceCount;
+            @property(nonatomic, assign) uint32_t threadgroupWidth;
+            @property(nonatomic, assign) uint32_t dispatchCount;
+            @property(nonatomic, assign) uint32_t waitCount;
+            @property(nonatomic, assign) uint32_t fastDispatchCount;
+            - (instancetype)initWithMetallibPath:(NSString*)metallibPath kernelName:(NSString*)kernelName stateCount:(uint32_t)stateCount stateScalarBytes:(uint32_t)stateScalarBytes memSize:(uint32_t)memSize instanceCount:(uint32_t)instanceCount;
+            - (BOOL)dispatchKernelWithBudget:(uint32_t)budget;
+            - (RhdlArcGpuIo*)io;
+            - (RhdlStateScalar*)stateSlots;
+            - (uint8_t*)instMem;
+            - (uint8_t*)dataMem;
+            - (RhdlArcGpuIo*)ioAtIndex:(uint32_t)index;
+            - (RhdlStateScalar*)stateSlotsAtIndex:(uint32_t)index;
+            - (uint8_t*)instMemAtIndex:(uint32_t)index;
+            - (uint8_t*)dataMemAtIndex:(uint32_t)index;
+            @end
+
+            @implementation #{objc_sim_class}
+            - (instancetype)initWithMetallibPath:(NSString*)metallibPath kernelName:(NSString*)kernelName stateCount:(uint32_t)stateCount stateScalarBytes:(uint32_t)stateScalarBytes memSize:(uint32_t)memSize instanceCount:(uint32_t)instanceCount {
+              self = [super init];
+              if (!self) {
+                return nil;
+              }
+
+              self.memSize = memSize;
+              self.memMask = (memSize > 0u) ? (memSize - 1u) : 0u;
+              self.instanceCount = (instanceCount > 0u) ? instanceCount : 1u;
+              self.threadgroupWidth = 1u;
+              self.dispatchCount = 0u;
+              self.waitCount = 0u;
+              self.fastDispatchCount = 0u;
+
+              self.device = MTLCreateSystemDefaultDevice();
+              if (!self.device) {
+                fprintf(stderr, "[riscv-arcilator-gpu] init failed: no MTL device\\n");
+                return nil;
+              }
+
+              self.queue = [self.device newCommandQueue];
+              if (!self.queue) {
+                fprintf(stderr, "[riscv-arcilator-gpu] init failed: no command queue\\n");
+                return nil;
+              }
+
+              NSError* error = nil;
+              NSURL* libURL = [NSURL fileURLWithPath:metallibPath];
+              self.library = [self.device newLibraryWithURL:libURL error:&error];
+              if (!self.library) {
+                fprintf(
+                  stderr,
+                  "[riscv-arcilator-gpu] failed to load metallib %s: %s\\n",
+                  metallibPath.UTF8String,
+                  error.localizedDescription.UTF8String
+                );
+                return nil;
+              }
+
+              id<MTLFunction> fn = [self.library newFunctionWithName:kernelName];
+              if (!fn) {
+                fprintf(stderr, "[riscv-arcilator-gpu] kernel not found: %s\\n", kernelName.UTF8String);
+                return nil;
+              }
+
+              self.pipeline = [self.device newComputePipelineStateWithFunction:fn error:&error];
+              if (!self.pipeline) {
+                fprintf(stderr, "[riscv-arcilator-gpu] failed to build pipeline: %s\\n", error.localizedDescription.UTF8String);
+                return nil;
+              }
+
+              uint32_t executionWidth = (uint32_t)self.pipeline.threadExecutionWidth;
+              uint32_t maxThreads = (uint32_t)self.pipeline.maxTotalThreadsPerThreadgroup;
+              uint32_t preferredTg = executionWidth > 0u ? executionWidth : 1u;
+              if (maxThreads > 0u && preferredTg > maxThreads) {
+                preferredTg = maxThreads;
+              }
+              if (preferredTg == 0u) {
+                preferredTg = 1u;
+              }
+              if (preferredTg > self.instanceCount) {
+                preferredTg = self.instanceCount;
+              }
+              self.threadgroupWidth = preferredTg;
+
+              uint64_t stateBytes = (uint64_t)stateCount * (uint64_t)stateScalarBytes * (uint64_t)self.instanceCount;
+              uint64_t memBytes = (uint64_t)memSize * (uint64_t)self.instanceCount;
+              uint64_t ioBytes = (uint64_t)sizeof(RhdlArcGpuIo) * (uint64_t)self.instanceCount;
+              self.stateBuffer = [self.device newBufferWithLength:stateBytes options:MTLResourceStorageModeShared];
+              self.instBuffer = [self.device newBufferWithLength:memBytes options:MTLResourceStorageModeShared];
+              // Unified memory model: instruction and data views alias the same buffer.
+              self.dataBuffer = self.instBuffer;
+              self.ioBuffer = [self.device newBufferWithLength:ioBytes options:MTLResourceStorageModeShared];
+
+              if (!self.stateBuffer || !self.instBuffer || !self.ioBuffer) {
+                fprintf(stderr, "[riscv-arcilator-gpu] failed to allocate GPU buffers\\n");
+                return nil;
+              }
+
+              memset(self.stateBuffer.contents, 0, stateBytes);
+              memset(self.instBuffer.contents, 0, memBytes);
+              memset(self.ioBuffer.contents, 0, ioBytes);
+
+              RhdlArcGpuIo* io = [self ioAtIndex:0u];
+              if (io) {
+                uint32_t memMask = self.memMask;
+            #{init_defaults.join("\n")}
+                for (uint32_t i = 1u; i < self.instanceCount; ++i) {
+                  RhdlArcGpuIo* ioInst = [self ioAtIndex:i];
+                  if (ioInst) {
+                    *ioInst = *io;
+                  }
+                }
+              }
+
+              return self;
+            }
+
+            - (BOOL)dispatchKernelWithBudget:(uint32_t)budget {
+              id<MTLCommandBuffer> commandBuffer = [self.queue commandBuffer];
+              if (!commandBuffer) {
+                return NO;
+              }
+
+              id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
+              if (!encoder) {
+                return NO;
+              }
+
+              RhdlArcGpuIo* io0 = [self ioAtIndex:0u];
+              if (!io0) {
+                [encoder endEncoding];
+                return NO;
+              }
+
+              for (uint32_t i = 0u; i < self.instanceCount; ++i) {
+                RhdlArcGpuIo* io = [self ioAtIndex:i];
+                if (!io) {
+                  continue;
+                }
+                io->cycle_budget = budget;
+                io->cycles_ran = 0u;
+              }
+
+              [encoder setComputePipelineState:self.pipeline];
+              [encoder setBuffer:self.stateBuffer offset:0 atIndex:0];
+              [encoder setBuffer:self.instBuffer offset:0 atIndex:1];
+              [encoder setBuffer:self.dataBuffer offset:0 atIndex:2];
+              [encoder setBuffer:self.ioBuffer offset:0 atIndex:3];
+              MTLSize grid = MTLSizeMake(self.instanceCount, 1, 1);
+              uint32_t tgWidth = self.threadgroupWidth > 0u ? self.threadgroupWidth : 1u;
+              if (tgWidth > self.instanceCount) {
+                tgWidth = self.instanceCount;
+              }
+              MTLSize tg = MTLSizeMake(tgWidth, 1, 1);
+              [encoder dispatchThreads:grid threadsPerThreadgroup:tg];
+              [encoder endEncoding];
+
+              self.dispatchCount = self.dispatchCount + 1u;
+              self.fastDispatchCount = self.fastDispatchCount + 1u;
+              [commandBuffer commit];
+              [commandBuffer waitUntilCompleted];
+              self.waitCount = self.waitCount + 1u;
+              if (commandBuffer.status != MTLCommandBufferStatusCompleted) {
+                return NO;
+              }
+              return YES;
+            }
+
+            - (RhdlArcGpuIo*)io {
+              return (RhdlArcGpuIo*)self.ioBuffer.contents;
+            }
+
+            - (RhdlStateScalar*)stateSlots {
+              return (RhdlStateScalar*)self.stateBuffer.contents;
+            }
+
+            - (uint8_t*)instMem {
+              return (uint8_t*)self.instBuffer.contents;
+            }
+
+            - (uint8_t*)dataMem {
+              return (uint8_t*)self.instBuffer.contents;
+            }
+
+            - (RhdlArcGpuIo*)ioAtIndex:(uint32_t)index {
+              if (index >= self.instanceCount) {
+                return nullptr;
+              }
+              return ((RhdlArcGpuIo*)self.ioBuffer.contents) + index;
+            }
+
+            - (RhdlStateScalar*)stateSlotsAtIndex:(uint32_t)index {
+              if (index >= self.instanceCount) {
+                return nullptr;
+              }
+              return ((RhdlStateScalar*)self.stateBuffer.contents) + ((size_t)index * (size_t)STATE_COUNT);
+            }
+
+            - (uint8_t*)instMemAtIndex:(uint32_t)index {
+              if (index >= self.instanceCount) {
+                return nullptr;
+              }
+              return ((uint8_t*)self.instBuffer.contents) + ((size_t)index * (size_t)self.memSize);
+            }
+
+            - (uint8_t*)dataMemAtIndex:(uint32_t)index {
+              if (index >= self.instanceCount) {
+                return nullptr;
+              }
+              return ((uint8_t*)self.instBuffer.contents) + ((size_t)index * (size_t)self.memSize);
+            }
+            @end
+
+            struct SimContext {
+              __strong #{objc_sim_class}* sim;
+            };
+
+            static inline SimContext* ctx_cast(void* raw) {
+              return static_cast<SimContext*>(raw);
+            }
+
+            extern "C" {
+
+            void* sim_create(unsigned int mem_size) {
+              @autoreleasepool {
+                uint32_t resolvedMemSize = mem_size > 0u ? mem_size : 1u;
+                uint32_t resolvedInstanceCount = resolve_instance_count();
+                SimContext* ctx = new SimContext();
+                ctx->sim = [[#{objc_sim_class} alloc]
+                  initWithMetallibPath:resolveMetallibPath()
+                           kernelName:kKernelName
+                           stateCount:STATE_COUNT
+                     stateScalarBytes:STATE_SCALAR_BYTES
+                              memSize:resolvedMemSize
+                        instanceCount:resolvedInstanceCount];
+                if (!ctx->sim) {
+                  delete ctx;
+                  return nullptr;
+                }
+                return ctx;
+              }
+            }
+
+            void sim_destroy(void* sim) {
+              if (!sim) {
+                return;
+              }
+              @autoreleasepool {
+                SimContext* ctx = ctx_cast(sim);
+                ctx->sim = nil;
+                delete ctx;
+              }
+            }
+
+            void sim_reset(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return;
+              }
+              @autoreleasepool {
+                RhdlArcGpuIo* io = [ctx->sim io];
+                if (!io) {
+                  return;
+                }
+                RhdlArcGpuIo base = io[0];
+                #{reset_assert_line.sub('io->', 'base.')}
+                #{reset_clk_low_line.sub('io->', 'base.')}
+                for (uint32_t i = 0u; i < ctx->sim.instanceCount; ++i) {
+                  io[i] = base;
+                }
+                [ctx->sim dispatchKernelWithBudget:0u];
+                #{reset_clk_high_line.sub('io->', 'base.')}
+                for (uint32_t i = 0u; i < ctx->sim.instanceCount; ++i) {
+                  io[i] = base;
+                }
+                [ctx->sim dispatchKernelWithBudget:0u];
+                #{reset_clk_low_line.sub('io->', 'base.')}
+                #{reset_deassert_line.sub('io->', 'base.')}
+                for (uint32_t i = 0u; i < ctx->sim.instanceCount; ++i) {
+                  io[i] = base;
+                }
+                [ctx->sim dispatchKernelWithBudget:0u];
+              }
+            }
+
+            void sim_eval(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return;
+              }
+              @autoreleasepool {
+                [ctx->sim dispatchKernelWithBudget:0u];
+              }
+            }
+
+            void sim_poke(void* sim, const char* name, unsigned int value) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim || !name) {
+                return;
+              }
+              RhdlArcGpuIo* io = [ctx->sim ioAtIndex:0u];
+              if (!io) {
+                return;
+              }
+            #{poke_cases.join("\n")}
+            }
+
+            unsigned int sim_peek(void* sim, const char* name) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim || !name) {
+                return 0u;
+              }
+              RhdlArcGpuIo* io = [ctx->sim ioAtIndex:0u];
+              if (!io) {
+                return 0u;
+              }
+            #{peek_cases.join("\n")}
+              return 0u;
+            }
+
+            unsigned int sim_read_pc(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              if (PC_SLOT_INDEX < 0) {
+                return 0u;
+              }
+              RhdlStateScalar* slots = [ctx->sim stateSlotsAtIndex:0u];
+              if (!slots) {
+                return 0u;
+              }
+              uint32_t value = (uint32_t)slots[PC_SLOT_INDEX];
+              return mask_width(value, PC_SLOT_WIDTH);
+            }
+
+            unsigned int sim_read_reg(void* sim, unsigned int index) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              uint32_t reg_index = index & 0x1Fu;
+              if (reg_index == 0u) {
+                return 0u;
+              }
+              if (REGFILE_BASE_SLOT < 0 || reg_index >= REGFILE_LENGTH) {
+                return 0u;
+              }
+              RhdlStateScalar* slots = [ctx->sim stateSlotsAtIndex:0u];
+              if (!slots) {
+                return 0u;
+              }
+              return (uint32_t)slots[REGFILE_BASE_SLOT + (int32_t)reg_index];
+            }
+
+            unsigned int sim_read_inst(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              const uint8_t* inst = [ctx->sim instMemAtIndex:0u];
+              if (!inst) {
+                return 0u;
+              }
+              uint32_t pc = sim_read_pc(sim);
+              return read_word_le(inst, ctx->sim.memMask, pc);
+            }
+
+            void sim_write_pc(void* sim, unsigned int value) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return;
+              }
+              if (PC_SLOT_INDEX >= 0) {
+                for (uint32_t i = 0u; i < ctx->sim.instanceCount; ++i) {
+                  RhdlStateScalar* slots = [ctx->sim stateSlotsAtIndex:i];
+                  if (slots) {
+                    slots[PC_SLOT_INDEX] = (RhdlStateScalar)mask_width(value, PC_SLOT_WIDTH);
+                  }
+                }
+              }
+              [ctx->sim dispatchKernelWithBudget:0u];
+            }
+
+            void sim_load_mem(void* sim, int mem_type, const unsigned char* data, unsigned int size, unsigned int base_addr) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim || !data || size == 0u) {
+                return;
+              }
+              uint32_t mask = ctx->sim.memMask;
+              for (uint32_t inst = 0u; inst < ctx->sim.instanceCount; ++inst) {
+                uint8_t* target = mem_type == 0 ? [ctx->sim instMemAtIndex:inst] : [ctx->sim dataMemAtIndex:inst];
+                if (!target) {
+                  continue;
+                }
+                for (uint32_t i = 0u; i < size; ++i) {
+                  target[(base_addr + i) & mask] = data[i];
+                }
+              }
+            }
+
+            unsigned int sim_read_mem_word(void* sim, int mem_type, unsigned int addr) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              const uint8_t* target = mem_type == 0 ? [ctx->sim instMemAtIndex:0u] : [ctx->sim dataMemAtIndex:0u];
+              return read_word_le(target, ctx->sim.memMask, addr);
+            }
+
+            void sim_run_cycles(void* sim, unsigned int n_cycles) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return;
+              }
+              [ctx->sim dispatchKernelWithBudget:n_cycles];
+            }
+
+            unsigned int sim_dispatch_count(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              return ctx->sim.dispatchCount;
+            }
+
+            unsigned int sim_wait_count(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              return ctx->sim.waitCount;
+            }
+
+            unsigned int sim_fast_dispatch_count(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              return ctx->sim.fastDispatchCount;
+            }
+
+            unsigned int sim_fallback_dispatch_count(void* sim) {
+              SimContext* ctx = ctx_cast(sim);
+              if (!ctx || !ctx->sim) {
+                return 0u;
+              }
+              return 0u;
+            }
+
+            void sim_uart_rx_push(void* sim, const unsigned char* data, unsigned int len) {
+              (void)sim;
+              (void)data;
+              (void)len;
+            }
+
+            unsigned int sim_uart_tx_len(void* sim) {
+              (void)sim;
+              return 0u;
+            }
+
+            unsigned int sim_uart_tx_copy(void* sim, unsigned char* out, unsigned int max_len) {
+              (void)sim;
+              (void)out;
+              (void)max_len;
+              return 0u;
+            }
+
+            void sim_uart_tx_clear(void* sim) {
+              (void)sim;
+            }
+
+            unsigned int sim_disk_load(void* sim, const unsigned char* data, unsigned int size, unsigned int base_addr) {
+              (void)sim;
+              (void)data;
+              (void)size;
+              (void)base_addr;
+              return 0u;
+            }
+
+            unsigned int sim_disk_read_byte(void* sim, unsigned int offset) {
+              (void)sim;
+              (void)offset;
+              return 0u;
+            }
+
+            } // extern "C"
+          CPP
+
+          File.write(path, wrapper)
+        end
+      end
+    end
+  end
+end
diff --git a/examples/riscv/utilities/runners/headless_runner.rb b/examples/riscv/utilities/runners/headless_runner.rb
index 2994fc4d..a7261399 100644
--- a/examples/riscv/utilities/runners/headless_runner.rb
+++ b/examples/riscv/utilities/runners/headless_runner.rb
@@ -55,6 +55,13 @@ def initialize(mode: :ir, sim: nil, core: :single, mem_size: nil)
                    end
                    require_relative 'arcilator_runner'
                    ArcilatorRunner.new(mem_size: resolved_mem_size)
+                 when :arcilator_gpu
+                   if @core != :single
+                     warn "ArcilatorGPU mode only supports single-cycle core; overriding core=#{@core} to single."
+                     @core = :single
+                   end
+                   require_relative 'arcilator_gpu_runner'
+                   ArcilatorGpuRunner.new(mem_size: resolved_mem_size)
                  else
                    raise ArgumentError, "Unsupported mode #{@effective_mode.inspect}"
                  end
@@ -239,13 +246,13 @@ def safe_cpu_inst
 
         def normalize_mode(mode)
           case mode
-          when :ruby, :ir, :verilog, :circt
+          when :ruby, :ir, :verilog, :circt, :arcilator_gpu
             mode
           when :netlist
             warn "Mode #{mode.inspect} is not implemented for RISC-V yet; falling back to :ir."
             :ir
           else
-            raise ArgumentError, "Unsupported mode #{mode.inspect}. Use ruby, ir, netlist, verilog, or circt."
+            raise ArgumentError, "Unsupported mode #{mode.inspect}. Use ruby, ir, netlist, verilog, circt, or arcilator_gpu."
           end
         end
 
@@ -263,14 +270,14 @@ def current_pc
         end
 
         def xv6_capable?
-          return true if %i[verilog circt].include?(@effective_mode)
+          return true if %i[verilog circt arcilator_gpu].include?(@effective_mode)
           return true if native? && @cpu.respond_to?(:sim) && @cpu.sim.respond_to?(:runner_kind) && @cpu.sim.runner_kind == :riscv
 
           false
         end
 
         def linux_capable?
-          return true if %i[verilog circt].include?(@effective_mode)
+          return true if %i[verilog circt arcilator_gpu].include?(@effective_mode)
           return false unless native? && @cpu.respond_to?(:sim) && @cpu.sim.respond_to?(:runner_kind)
 
           %i[riscv hdl].include?(@cpu.sim.runner_kind)
@@ -305,10 +312,10 @@ def default_backend(mode)
             :ruby
           when :ir, :netlist
             :compile
-          when :verilog, :circt
+          when :verilog, :circt, :arcilator_gpu
             :ruby
           else
-            raise "Unknown mode: #{mode}. Valid modes: ruby, ir, netlist, verilog, circt"
+            raise "Unknown mode: #{mode}. Valid modes: ruby, ir, netlist, verilog, circt, arcilator_gpu"
           end
         end
 
diff --git a/examples/riscv/utilities/tasks/run_task.rb b/examples/riscv/utilities/tasks/run_task.rb
index 93b447ef..44c94ddc 100644
--- a/examples/riscv/utilities/tasks/run_task.rb
+++ b/examples/riscv/utilities/tasks/run_task.rb
@@ -680,7 +680,7 @@ def default_sim_backend(mode)
               :ruby
             when :ir, :netlist
               :compile
-            when :verilog, :circt
+            when :verilog, :circt, :arcilator_gpu
               :ruby
             else
               :compile
diff --git a/external/GEM b/external/GEM
new file mode 160000
index 00000000..9524cfe0
--- /dev/null
+++ b/external/GEM
@@ -0,0 +1 @@
+Subproject commit 9524cfe0663cf770a3eaac2a2b51e654808a93d0
diff --git a/lib/rhdl/cli/tasks/benchmark_task.rb b/lib/rhdl/cli/tasks/benchmark_task.rb
index c47b3700..9973cf2d 100644
--- a/lib/rhdl/cli/tasks/benchmark_task.rb
+++ b/lib/rhdl/cli/tasks/benchmark_task.rb
@@ -215,6 +215,8 @@ def benchmark_cpu8bit
             .split(',')
             .map { |name| name.strip.downcase.to_sym }
             .map { |name| name == :gpu ? :arcilator_gpu : name }
+            .map { |name| name == :arc ? :arcilator_gpu : name }
+            .map { |name| name == :gem ? :gem_metal : name }
             .reject(&:empty?)
 
           puts_header("8-bit CPU FastHarness Benchmark")
@@ -228,22 +230,36 @@ def benchmark_cpu8bit
           runners = [
             {
               name: 'Compiler',
+              backend: :compiler,
               sim: :compile,
               filter_key: :compiler,
               available: RHDL::Codegen::IR::IR_COMPILER_AVAILABLE
             },
             {
               name: 'ArcilatorGPU',
+              backend: :arcilator_gpu,
               sim: :arcilator_gpu,
               filter_key: :arcilator_gpu,
               available: RHDL::HDL::CPU::FastHarness.arcilator_gpu_status[:ready]
             }
           ]
+          runners << {
+            name: 'GemMetal',
+            backend: :gem_metal,
+            filter_key: :gem_metal
+          }
           runners.select! { |runner| runner_filter.include?(runner[:filter_key]) } unless runner_filter.empty?
 
           results = []
 
           runners.each do |runner|
+            if runner[:backend] == :gem_metal
+              print "\n#{runner[:name]}: "
+              $stdout.flush
+              results << benchmark_gem_metal_cpu8bit(cycles: cycles, standalone: false)
+              next
+            end
+
             unless runner[:available]
               puts "\n#{runner[:name]}: SKIPPED (not available)"
               results << { name: runner[:name], status: :skipped }
@@ -269,12 +285,18 @@ def benchmark_cpu8bit
               run_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - run_start
 
               cycles_per_sec = cycles_run / run_elapsed
+              parallel_instances = sim.parallel_instances
+              effective_cycles_per_sec = cycles_per_sec * parallel_instances
               final_pc = sim.pc
 
               puts 'done'
               puts "  Init time:  #{format('%.3f', init_elapsed)}s"
               puts "  Run time:   #{format('%.3f', run_elapsed)}s"
               puts "  Cycles run: #{cycles_run}"
+              if parallel_instances > 1
+                puts "  Instances:  #{parallel_instances}"
+                puts "  Effective:  #{format('%.0f', effective_cycles_per_sec)} cycles/s"
+              end
               puts "  Final PC:   0x#{final_pc.to_s(16).upcase}"
 
               results << {
@@ -283,6 +305,8 @@ def benchmark_cpu8bit
                 init_time: init_elapsed,
                 run_time: run_elapsed,
                 cycles_per_sec: cycles_per_sec,
+                parallel_instances: parallel_instances,
+                effective_cycles_per_sec: effective_cycles_per_sec,
                 final_pc: final_pc
               }
             rescue => e
@@ -296,6 +320,661 @@ def benchmark_cpu8bit
           print_benchmark_summary(results, cycles)
         end
 
+        # Benchmark external GEM Metal binary on the 8-bit CPU workload.
+        # This builds (or reuses) a CPU8bit AIGPDK-mapped netlist and gemparts.
+        def benchmark_gem_metal_cpu8bit(cycles: options[:cycles] || 5_000, standalone: true)
+          require 'fileutils'
+          require 'open3'
+          require_relative '../../../../examples/8bit/hdl/cpu/cpu'
+
+          runner_name = 'GemMetal'
+          num_blocks = (options[:blocks] || ENV.fetch('RHDL_GEM_METAL_CPU8BIT_BLOCKS', '5')).to_i
+          num_blocks = 1 if num_blocks <= 0
+          top_module = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_TOP', 'cpu8bit')
+          force_rebuild = truthy_env?(ENV.fetch('RHDL_GEM_METAL_CPU8BIT_REBUILD', '0'))
+
+          project_root = File.expand_path('../../../..', __dir__)
+          gem_root = File.join(project_root, 'external', 'GEM')
+          build_dir = File.expand_path(
+            ENV.fetch('RHDL_GEM_METAL_CPU8BIT_BUILD_DIR', File.join(project_root, 'examples/8bit/.gem_metal_cpu8bit'))
+          )
+          FileUtils.mkdir_p(build_dir)
+
+          rtl_path = File.join(build_dir, 'cpu8bit_rtl.v')
+          yosys_script_path = File.join(build_dir, 'cpu8bit_gem.ys')
+          yosys_log_path = File.join(build_dir, 'cpu8bit_yosys.log')
+          cut_map_log_path = File.join(build_dir, 'cpu8bit_cut_map.log')
+          metal_log_path = File.join(build_dir, 'cpu8bit_metal_dummy.log')
+
+          netlist_path = resolve_path_for_bench(
+            ENV['RHDL_GEM_METAL_CPU8BIT_NETLIST'],
+            File.join(build_dir, 'cpu8bit_gatelevel.gv'),
+            project_root
+          )
+          gemparts_path = resolve_path_for_bench(
+            ENV['RHDL_GEM_METAL_CPU8BIT_GEMPARTS'],
+            File.join(build_dir, 'cpu8bit.gemparts'),
+            project_root
+          )
+
+          level_split = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_LEVEL_SPLIT', '').strip
+          max_stage_degrad = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_MAX_STAGE_DEGRAD', '').strip
+
+          if standalone
+            puts_header('External GEM Metal Benchmark (CPU8bit)')
+            puts "Cycles per run: #{cycles}"
+            puts "Blocks: #{num_blocks}"
+            puts "Top module: #{top_module}"
+            puts "GEM root: #{gem_root}"
+            puts "Build dir: #{build_dir}"
+            puts "Netlist: #{netlist_path}"
+            puts "Gemparts: #{gemparts_path}"
+            puts "Force rebuild: #{force_rebuild}"
+            puts
+          else
+            print 'initializing... '
+            $stdout.flush
+          end
+
+          init_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+
+          unless Dir.exist?(gem_root)
+            message = "external GEM repo not found at #{gem_root}"
+            puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+            return { name: runner_name, status: :skipped }
+          end
+
+          unless command_available?('cargo')
+            message = 'cargo not found in PATH'
+            puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+            return { name: runner_name, status: :skipped }
+          end
+
+          need_netlist = force_rebuild || !File.exist?(netlist_path)
+          need_gemparts = force_rebuild || !File.exist?(gemparts_path)
+
+          if need_netlist
+            unless command_available?('yosys')
+              message = 'yosys not found in PATH (required to synthesize CPU8bit AIGPDK netlist)'
+              puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+              puts "Set RHDL_GEM_METAL_CPU8BIT_NETLIST to a prebuilt netlist or install yosys, then retry." if standalone
+              return { name: runner_name, status: :skipped }
+            end
+
+            aigpdk_nomem_lib = File.join(gem_root, 'aigpdk', 'aigpdk_nomem.lib')
+            unless File.exist?(aigpdk_nomem_lib)
+              message = "missing AIGPDK library at #{aigpdk_nomem_lib}"
+              puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+              return { name: runner_name, status: :skipped }
+            end
+
+            puts 'Generating CPU8bit Verilog hierarchy...' if standalone
+            FileUtils.mkdir_p(File.dirname(rtl_path))
+            File.write(rtl_path, RHDL::HDL::CPU::CPU.to_verilog_hierarchy(top_name: top_module))
+
+            puts 'Synthesizing/mapping CPU8bit netlist with yosys...' if standalone
+            yosys_script = <<~YOSYS
+              read_verilog "#{rtl_path}"
+              hierarchy -check -top #{top_module}
+              synth -flatten
+              delete t:\$print
+              dfflibmap -liberty "#{aigpdk_nomem_lib}"
+              opt_clean -purge
+              abc -liberty "#{aigpdk_nomem_lib}"
+              opt_clean -purge
+              write_verilog "#{netlist_path}"
+            YOSYS
+            File.write(yosys_script_path, yosys_script)
+
+            yosys_cmd = ['yosys', '-q', '-s', yosys_script_path]
+            yosys_out, yosys_status = Open3.capture2e(*yosys_cmd)
+            File.write(yosys_log_path, yosys_out)
+            unless yosys_status.success?
+              puts yosys_out if standalone
+              puts if standalone
+              message = "yosys synthesis exited with #{yosys_status.exitstatus}"
+              puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+              puts "Yosys log: #{yosys_log_path}" if standalone
+              return { name: runner_name, status: :failed, error: message }
+            end
+          elsif standalone
+            puts "Reusing existing netlist at #{netlist_path}"
+          end
+
+          if need_gemparts
+            puts 'Generating GEM partition (.gemparts) via cut_map_interactive...' if standalone
+            cut_map_cmd = ['cargo', 'run', '--release', '--features', 'metal', '--bin', 'cut_map_interactive', '--',
+                           netlist_path]
+            cut_map_cmd += ['--top-module', top_module]
+            cut_map_cmd += ['--level-split', level_split] unless level_split.empty?
+            cut_map_cmd += ['--max-stage-degrad', max_stage_degrad] unless max_stage_degrad.empty?
+            cut_map_cmd << gemparts_path
+
+            cut_map_out, cut_map_status = Open3.capture2e(*cut_map_cmd, chdir: gem_root)
+            File.write(cut_map_log_path, cut_map_out)
+            unless cut_map_status.success?
+              puts cut_map_out if standalone
+              puts if standalone
+              message = "cut_map_interactive exited with #{cut_map_status.exitstatus}"
+              puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+              puts "Cut-map log: #{cut_map_log_path}" if standalone
+              return { name: runner_name, status: :failed, error: message }
+            end
+          elsif standalone
+            puts "Reusing existing gemparts at #{gemparts_path}"
+          end
+
+          unless File.exist?(netlist_path)
+            message = "netlist not found at #{netlist_path}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            return { name: runner_name, status: :failed, error: message }
+          end
+          unless File.exist?(gemparts_path)
+            message = "gemparts not found at #{gemparts_path}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          init_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - init_start
+
+          cmd = [
+            'cargo', 'run', '--release', '--features', 'metal', '--bin', 'metal_dummy_test', '--',
+            netlist_path, gemparts_path, num_blocks.to_s, cycles.to_s
+          ]
+
+          print(standalone ? "Running: #{cmd.join(' ')}\n" : "running #{cycles} cycles... ")
+          $stdout.flush
+          run_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+          out, status = Open3.capture2e(*cmd, chdir: gem_root)
+          run_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - run_start
+          File.write(metal_log_path, out)
+
+          unless status.success?
+            puts out if standalone
+            puts if standalone
+            message = "external GEM CPU8bit benchmark command exited with #{status.exitstatus}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            puts "Metal log: #{metal_log_path}" if standalone
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          summary = parse_gem_metal_summary(out)
+          unless summary && summary[:cycles_per_sec]
+            message = 'could not parse metal_dummy_test summary line'
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            puts "Metal log: #{metal_log_path}" if standalone
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          puts 'done'
+          puts "  Init time: #{format('%.3f', init_elapsed)}s"
+          puts "  Run time:  #{format('%.3f', run_elapsed)}s"
+          puts "  Logical dispatches: #{summary[:logical_dispatches]}"
+          puts "  GPU dispatches: #{summary[:gpu_dispatches]}"
+          puts "  Reported total: #{format('%.3f', summary[:total_ms] || 0.0)}ms"
+          puts "  Cycles/s: #{format('%.2f', summary[:cycles_per_sec])}"
+
+          {
+            name: runner_name,
+            status: :success,
+            init_time: init_elapsed,
+            run_time: run_elapsed,
+            cycles_per_sec: summary[:cycles_per_sec],
+            logical_dispatches: summary[:logical_dispatches],
+            gpu_dispatches: summary[:gpu_dispatches],
+            total_ms: summary[:total_ms]
+          }
+        rescue => e
+          puts 'FAILED'
+          puts "  Error: #{e.message}"
+          puts "  #{e.backtrace.first(3).join("\n  ")}" if options[:verbose]
+          { name: runner_name, status: :failed, error: e.message }
+        end
+
+        # Benchmarks the external GEM Metal `metal_dummy_test` binary on the Apple2 design; returns a result hash with :name and :status (:success, :skipped or :failed).
+        # Builds (or reuses) an Apple2 AIGPDK-mapped netlist via yosys and a .gemparts partition via cut_map_interactive; RHDL_GEM_METAL_APPLE2_* env vars override the defaults.
+        def benchmark_gem_metal_apple2(cycles: options[:cycles] || 5_000, standalone: true)
+          require 'fileutils'
+          require 'open3'
+          require_relative '../../../../examples/apple2/hdl'
+
+          runner_name = 'GemMetal'  # label stored under :name in every result hash
+          num_blocks = (options[:blocks] || ENV.fetch('RHDL_GEM_METAL_APPLE2_BLOCKS', '5')).to_i  # options[:blocks] wins over the env var
+          num_blocks = 1 if num_blocks <= 0  # also catches non-numeric strings, which to_i turns into 0
+          top_module = ENV.fetch('RHDL_GEM_METAL_APPLE2_TOP', 'apple2_apple2')
+          force_rebuild = truthy_env?(ENV.fetch('RHDL_GEM_METAL_APPLE2_REBUILD', '0'))
+
+          project_root = File.expand_path('../../../..', __dir__)  # four levels above this file's directory
+          gem_root = File.join(project_root, 'external', 'GEM')
+          build_dir = File.expand_path(
+            ENV.fetch('RHDL_GEM_METAL_APPLE2_BUILD_DIR', File.join(project_root, 'examples/apple2/.gem_metal_apple2'))
+          )
+          FileUtils.mkdir_p(build_dir)
+
+          rtl_path = File.join(build_dir, 'apple2_rtl.v')
+          yosys_script_path = File.join(build_dir, 'apple2_gem.ys')
+          yosys_log_path = File.join(build_dir, 'apple2_yosys.log')
+          cut_map_log_path = File.join(build_dir, 'apple2_cut_map.log')
+          metal_log_path = File.join(build_dir, 'apple2_metal_dummy.log')
+
+          netlist_path = resolve_path_for_bench(
+            ENV['RHDL_GEM_METAL_APPLE2_NETLIST'],
+            File.join(build_dir, 'apple2_gatelevel.gv'),
+            project_root
+          )
+          gemparts_path = resolve_path_for_bench(
+            ENV['RHDL_GEM_METAL_APPLE2_GEMPARTS'],
+            File.join(build_dir, 'apple2.gemparts'),
+            project_root
+          )
+
+          level_split = ENV.fetch('RHDL_GEM_METAL_APPLE2_LEVEL_SPLIT', '').strip  # empty means the flag is not passed to cut_map_interactive
+          max_stage_degrad = ENV.fetch('RHDL_GEM_METAL_APPLE2_MAX_STAGE_DEGRAD', '').strip  # empty means the flag is not passed
+
+          if standalone  # standalone runs print a full header; embedded runs print compact one-line progress
+            puts_header('External GEM Metal Benchmark (Apple2)')
+            puts "Cycles per run: #{cycles}"
+            puts "Blocks: #{num_blocks}"
+            puts "Top module: #{top_module}"
+            puts "GEM root: #{gem_root}"
+            puts "Build dir: #{build_dir}"
+            puts "Netlist: #{netlist_path}"
+            puts "Gemparts: #{gemparts_path}"
+            puts "Force rebuild: #{force_rebuild}"
+            puts
+          else
+            print 'initializing... '
+            $stdout.flush
+          end
+
+          init_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)  # init time covers tool checks plus any netlist/gemparts build
+
+          unless Dir.exist?(gem_root)
+            message = "external GEM repo not found at #{gem_root}"
+            puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+            return { name: runner_name, status: :skipped }  # missing prerequisites are reported as skipped, not failed
+          end
+
+          unless command_available?('cargo')
+            message = 'cargo not found in PATH'
+            puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+            return { name: runner_name, status: :skipped }
+          end
+
+          need_netlist = force_rebuild || !File.exist?(netlist_path)  # build when forced or when the file is missing
+          need_gemparts = force_rebuild || !File.exist?(gemparts_path)  # NOTE(review): not tied to need_netlist, so an existing gemparts is reused after a netlist rebuild
+
+          if need_netlist
+            unless command_available?('yosys')
+              message = 'yosys not found in PATH (required to synthesize Apple2 AIGPDK netlist)'
+              puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+              puts "Set RHDL_GEM_METAL_APPLE2_NETLIST to a prebuilt netlist or install yosys, then retry." if standalone
+              return { name: runner_name, status: :skipped }
+            end
+
+            aigpdk_nomem_lib = File.join(gem_root, 'aigpdk', 'aigpdk_nomem.lib')  # liberty file handed to dfflibmap and abc below
+            unless File.exist?(aigpdk_nomem_lib)
+              message = "missing AIGPDK library at #{aigpdk_nomem_lib}"
+              puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+              return { name: runner_name, status: :skipped }
+            end
+
+            puts 'Generating Apple2 Verilog hierarchy...' if standalone
+            FileUtils.mkdir_p(File.dirname(rtl_path))
+            rtl = RHDL::Examples::Apple2::Apple2.to_verilog_hierarchy(top_name: top_module)
+            File.write(rtl_path, rtl)
+
+            puts 'Synthesizing/mapping Apple2 netlist with yosys...' if standalone
+            yosys_script = <<~YOSYS  # interpolating heredoc: `\$print` below reaches yosys as `$print`
+              read_verilog "#{rtl_path}"
+              hierarchy -check -top #{top_module}
+              synth -flatten
+              delete t:\$print
+              dfflibmap -liberty "#{aigpdk_nomem_lib}"
+              opt_clean -purge
+              abc -liberty "#{aigpdk_nomem_lib}"
+              opt_clean -purge
+              write_verilog "#{netlist_path}"
+            YOSYS
+            File.write(yosys_script_path, yosys_script)
+
+            yosys_cmd = ['yosys', '-q', '-s', yosys_script_path]
+            yosys_out, yosys_status = Open3.capture2e(*yosys_cmd)  # stdout and stderr are captured together
+            File.write(yosys_log_path, yosys_out)  # log is written whether or not yosys succeeded
+            unless yosys_status.success?
+              puts yosys_out if standalone
+              puts if standalone
+              message = "yosys synthesis exited with #{yosys_status.exitstatus}"
+              puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+              puts "Yosys log: #{yosys_log_path}" if standalone
+              return { name: runner_name, status: :failed, error: message }
+            end
+          elsif standalone
+            puts "Reusing existing netlist at #{netlist_path}"
+          end
+
+          if need_gemparts
+            puts 'Generating GEM partition (.gemparts) via cut_map_interactive...' if standalone
+            cut_map_cmd = ['cargo', 'run', '--release', '--features', 'metal', '--bin', 'cut_map_interactive', '--',
+                           netlist_path]
+            cut_map_cmd += ['--top-module', top_module]
+            cut_map_cmd += ['--level-split', level_split] unless level_split.empty?
+            cut_map_cmd += ['--max-stage-degrad', max_stage_degrad] unless max_stage_degrad.empty?
+            cut_map_cmd << gemparts_path  # gemparts path goes last on the command line
+
+            cut_map_out, cut_map_status = Open3.capture2e(*cut_map_cmd, chdir: gem_root)  # cargo runs from the GEM checkout
+            File.write(cut_map_log_path, cut_map_out)
+            unless cut_map_status.success?
+              puts cut_map_out if standalone
+              puts if standalone
+              message = "cut_map_interactive exited with #{cut_map_status.exitstatus}"
+              puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+              puts "Cut-map log: #{cut_map_log_path}" if standalone
+              return { name: runner_name, status: :failed, error: message }
+            end
+          elsif standalone
+            puts "Reusing existing gemparts at #{gemparts_path}"
+          end
+
+          unless File.exist?(netlist_path)  # final existence check, whether the file was built or reused
+            message = "netlist not found at #{netlist_path}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            return { name: runner_name, status: :failed, error: message }
+          end
+          unless File.exist?(gemparts_path)
+            message = "gemparts not found at #{gemparts_path}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          init_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - init_start
+
+          cmd = [
+            'cargo', 'run', '--release', '--features', 'metal', '--bin', 'metal_dummy_test', '--',
+            netlist_path, gemparts_path, num_blocks.to_s, cycles.to_s
+          ]
+
+          print(standalone ? "Running: #{cmd.join(' ')}\n" : "running #{cycles} cycles... ")
+          $stdout.flush
+          run_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+          out, status = Open3.capture2e(*cmd, chdir: gem_root)  # run_elapsed times the whole `cargo run` invocation
+          run_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - run_start
+          File.write(metal_log_path, out)
+
+          unless status.success?
+            puts out if standalone
+            puts if standalone
+            message = "external GEM Apple2 benchmark command exited with #{status.exitstatus}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            puts "Metal log: #{metal_log_path}" if standalone
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          summary = parse_gem_metal_summary(out)  # metrics are extracted from the captured command output
+          unless summary && summary[:cycles_per_sec]  # a summary without cycles_per_sec counts as a failure
+            message = 'could not parse metal_dummy_test summary line'
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            puts "Metal log: #{metal_log_path}" if standalone
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          puts 'done'
+          puts "  Init time: #{format('%.3f', init_elapsed)}s"
+          puts "  Run time:  #{format('%.3f', run_elapsed)}s"
+          puts "  Logical dispatches: #{summary[:logical_dispatches]}"
+          puts "  GPU dispatches: #{summary[:gpu_dispatches]}"
+          puts "  Reported total: #{format('%.3f', summary[:total_ms] || 0.0)}ms"  # prints 0.000 when total_ms is absent
+          puts "  Cycles/s: #{format('%.2f', summary[:cycles_per_sec])}"
+
+          {
+            name: runner_name,
+            status: :success,
+            init_time: init_elapsed,
+            run_time: run_elapsed,
+            cycles_per_sec: summary[:cycles_per_sec],
+            logical_dispatches: summary[:logical_dispatches],
+            gpu_dispatches: summary[:gpu_dispatches],
+            total_ms: summary[:total_ms]
+          }
+        rescue => e  # any StandardError raised above becomes a :failed result instead of propagating
+          puts 'FAILED'
+          puts "  Error: #{e.message}"
+          puts "  #{e.backtrace.first(3).join("\n  ")}" if options[:verbose]  # first three backtrace frames, verbose mode only
+          { name: runner_name, status: :failed, error: e.message }  # NOTE(review): runner_name is nil if the error was raised before its assignment (e.g. from the requires)
+        end
+
+        # Benchmarks the external GEM Metal `metal_dummy_test` binary on the RISC-V core; returns a result hash with :name and :status (:success, :skipped or :failed).
+        # Builds (or reuses) an MMU-off RISC-V AIGPDK-mapped netlist and gemparts; a JSON build-config file in the build dir forces a rebuild when the recorded settings differ.
+        def benchmark_gem_metal_riscv(cycles: options[:cycles] || 5_000, standalone: true)
+          require 'fileutils'
+          require 'json'
+          require 'open3'
+          require_relative '../../../../examples/riscv/hdl/cpu'
+
+          runner_name = 'GemMetal'  # label stored under :name in every result hash
+          num_blocks = (options[:blocks] || ENV.fetch('RHDL_GEM_METAL_RISCV_BLOCKS', '5')).to_i  # options[:blocks] wins over the env var
+          num_blocks = 1 if num_blocks <= 0  # also catches non-numeric strings, which to_i turns into 0
+          top_module = ENV.fetch('RHDL_GEM_METAL_RISCV_TOP', 'riscv_cpu')
+          force_rebuild = truthy_env?(ENV.fetch('RHDL_GEM_METAL_RISCV_REBUILD', '0'))
+
+          project_root = File.expand_path('../../../..', __dir__)  # four levels above this file's directory
+          gem_root = File.join(project_root, 'external', 'GEM')
+          build_dir = File.expand_path(
+            ENV.fetch('RHDL_GEM_METAL_RISCV_BUILD_DIR', File.join(project_root, 'examples/riscv/.gem_metal_riscv'))
+          )
+          FileUtils.mkdir_p(build_dir)
+
+          rtl_path = File.join(build_dir, 'riscv_rtl.v')
+          yosys_script_path = File.join(build_dir, 'riscv_gem.ys')
+          yosys_log_path = File.join(build_dir, 'riscv_yosys.log')
+          cut_map_log_path = File.join(build_dir, 'riscv_cut_map.log')
+          metal_log_path = File.join(build_dir, 'riscv_metal_dummy.log')
+          build_config_path = File.join(build_dir, 'riscv_gem_build_config.json')  # records the settings used for the cached artifacts
+
+          netlist_path = resolve_path_for_bench(
+            ENV['RHDL_GEM_METAL_RISCV_NETLIST'],
+            File.join(build_dir, 'riscv_gatelevel.gv'),
+            project_root
+          )
+          gemparts_path = resolve_path_for_bench(
+            ENV['RHDL_GEM_METAL_RISCV_GEMPARTS'],
+            File.join(build_dir, 'riscv.gemparts'),
+            project_root
+          )
+
+          level_split = ENV.fetch('RHDL_GEM_METAL_RISCV_LEVEL_SPLIT', '').strip  # empty means the flag is not passed to cut_map_interactive
+          max_stage_degrad = ENV.fetch('RHDL_GEM_METAL_RISCV_MAX_STAGE_DEGRAD', '').strip  # empty means the flag is not passed
+          expected_build_config = {  # string keys so the hash compares equal to the JSON.parse result below
+            'format' => 1,  # presumably a schema version for this file — confirm
+            'top_module' => top_module,
+            'mmu_disabled' => true,
+            'level_split' => level_split,
+            'max_stage_degrad' => max_stage_degrad
+          }
+
+          if standalone  # standalone runs print a full header; embedded runs print compact one-line progress
+            puts_header('External GEM Metal Benchmark (RISC-V)')
+            puts "Cycles per run: #{cycles}"
+            puts "Blocks: #{num_blocks}"
+            puts "Top module: #{top_module}"
+            puts "GEM root: #{gem_root}"
+            puts "Build dir: #{build_dir}"
+            puts "Netlist: #{netlist_path}"
+            puts "Gemparts: #{gemparts_path}"
+            puts "Force rebuild: #{force_rebuild}"
+            puts
+          else
+            print 'initializing... '
+            $stdout.flush
+          end
+
+          init_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)  # init time covers tool checks plus any netlist/gemparts build
+
+          unless Dir.exist?(gem_root)
+            message = "external GEM repo not found at #{gem_root}"
+            puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+            return { name: runner_name, status: :skipped }  # missing prerequisites are reported as skipped, not failed
+          end
+
+          unless command_available?('cargo')
+            message = 'cargo not found in PATH'
+            puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+            return { name: runner_name, status: :skipped }
+          end
+
+          build_config = begin  # nil when the config file is missing or is not valid JSON
+            if File.exist?(build_config_path)
+              JSON.parse(File.read(build_config_path))
+            end
+          rescue JSON::ParserError
+            nil
+          end
+          need_netlist = force_rebuild || !File.exist?(netlist_path) || build_config != expected_build_config  # a missing or differing config forces a rebuild
+          need_gemparts = force_rebuild || !File.exist?(gemparts_path) || need_netlist  # a rebuilt netlist always triggers a new partition
+
+          if need_netlist
+            unless command_available?('yosys')
+              message = 'yosys not found in PATH (required to synthesize RISC-V AIGPDK netlist)'
+              puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+              puts "Set RHDL_GEM_METAL_RISCV_NETLIST to a prebuilt netlist or install yosys, then retry." if standalone  # NOTE(review): a prebuilt netlist still lands here (and is presumably overwritten) while the build config is missing or differs — confirm intended
+              return { name: runner_name, status: :skipped }
+            end
+
+            aigpdk_nomem_lib = File.join(gem_root, 'aigpdk', 'aigpdk_nomem.lib')  # liberty file handed to dfflibmap and abc below
+            unless File.exist?(aigpdk_nomem_lib)
+              message = "missing AIGPDK library at #{aigpdk_nomem_lib}"
+              puts(standalone ? "Error: #{message}" : "SKIPPED (#{message})")
+              return { name: runner_name, status: :skipped }
+            end
+
+            puts 'Generating RISC-V Verilog hierarchy...' if standalone
+            FileUtils.mkdir_p(File.dirname(rtl_path))
+            rtl = RHDL::Examples::RISCV::CPU.to_verilog_hierarchy(top_name: top_module)
+            File.write(rtl_path, disable_riscv_mmu_for_gem_rtl(rtl))  # the RTL is passed through disable_riscv_mmu_for_gem_rtl before it is written
+
+            puts 'Synthesizing/mapping RISC-V netlist with yosys...' if standalone
+            yosys_script = <<~YOSYS  # interpolating heredoc: `\$print` below reaches yosys as `$print`
+              read_verilog "#{rtl_path}"
+              hierarchy -check -top #{top_module}
+              synth -flatten
+              delete t:\$print
+              dfflibmap -liberty "#{aigpdk_nomem_lib}"
+              opt_clean -purge
+              abc -liberty "#{aigpdk_nomem_lib}"
+              opt_clean -purge
+              write_verilog "#{netlist_path}"
+            YOSYS
+            File.write(yosys_script_path, yosys_script)
+
+            yosys_cmd = ['yosys', '-q', '-s', yosys_script_path]
+            yosys_out, yosys_status = Open3.capture2e(*yosys_cmd)  # stdout and stderr are captured together
+            File.write(yosys_log_path, yosys_out)  # log is written whether or not yosys succeeded
+            unless yosys_status.success?
+              puts yosys_out if standalone
+              puts if standalone
+              message = "yosys synthesis exited with #{yosys_status.exitstatus}"
+              puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+              puts "Yosys log: #{yosys_log_path}" if standalone
+              return { name: runner_name, status: :failed, error: message }
+            end
+          elsif standalone
+            puts "Reusing existing netlist at #{netlist_path}"
+          end
+
+          if need_gemparts
+            puts 'Generating GEM partition (.gemparts) via cut_map_interactive...' if standalone
+            cut_map_cmd = ['cargo', 'run', '--release', '--features', 'metal', '--bin', 'cut_map_interactive', '--',
+                           netlist_path]
+            cut_map_cmd += ['--top-module', top_module]
+            cut_map_cmd += ['--level-split', level_split] unless level_split.empty?
+            cut_map_cmd += ['--max-stage-degrad', max_stage_degrad] unless max_stage_degrad.empty?
+            cut_map_cmd << gemparts_path  # gemparts path goes last on the command line
+
+            cut_map_out, cut_map_status = Open3.capture2e(*cut_map_cmd, chdir: gem_root)  # cargo runs from the GEM checkout
+            File.write(cut_map_log_path, cut_map_out)
+            unless cut_map_status.success?
+              puts cut_map_out if standalone
+              puts if standalone
+              message = "cut_map_interactive exited with #{cut_map_status.exitstatus}"
+              puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+              puts "Cut-map log: #{cut_map_log_path}" if standalone
+              return { name: runner_name, status: :failed, error: message }
+            end
+          elsif standalone
+            puts "Reusing existing gemparts at #{gemparts_path}"
+          end
+
+          unless File.exist?(netlist_path)  # final existence check, whether the file was built or reused
+            message = "netlist not found at #{netlist_path}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            return { name: runner_name, status: :failed, error: message }
+          end
+          unless File.exist?(gemparts_path)
+            message = "gemparts not found at #{gemparts_path}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          if need_netlist || need_gemparts || !File.exist?(build_config_path)  # config is persisted only after both artifacts are confirmed present
+            File.write(build_config_path, JSON.pretty_generate(expected_build_config))
+          end
+
+          init_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - init_start
+
+          cmd = [
+            'cargo', 'run', '--release', '--features', 'metal', '--bin', 'metal_dummy_test', '--',
+            netlist_path, gemparts_path, num_blocks.to_s, cycles.to_s
+          ]
+
+          print(standalone ? "Running: #{cmd.join(' ')}\n" : "running #{cycles} cycles... ")
+          $stdout.flush
+          run_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+          out, status = Open3.capture2e(*cmd, chdir: gem_root)  # run_elapsed times the whole `cargo run` invocation
+          run_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - run_start
+          File.write(metal_log_path, out)
+
+          unless status.success?
+            puts out if standalone
+            puts if standalone
+            message = "external GEM RISC-V benchmark command exited with #{status.exitstatus}"
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            puts "Metal log: #{metal_log_path}" if standalone
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          summary = parse_gem_metal_summary(out)  # metrics are extracted from the captured command output
+          unless summary && summary[:cycles_per_sec]  # a summary without cycles_per_sec counts as a failure
+            message = 'could not parse metal_dummy_test summary line'
+            puts(standalone ? "FAILED: #{message}" : "FAILED (#{message})")
+            puts "Metal log: #{metal_log_path}" if standalone
+            return { name: runner_name, status: :failed, error: message }
+          end
+
+          puts 'done'
+          puts "  Init time: #{format('%.3f', init_elapsed)}s"
+          puts "  Run time:  #{format('%.3f', run_elapsed)}s"
+          puts "  Logical dispatches: #{summary[:logical_dispatches]}"
+          puts "  GPU dispatches: #{summary[:gpu_dispatches]}"
+          puts "  Reported total: #{format('%.3f', summary[:total_ms] || 0.0)}ms"  # prints 0.000 when total_ms is absent
+          puts "  Cycles/s: #{format('%.2f', summary[:cycles_per_sec])}"
+
+          {
+            name: runner_name,
+            status: :success,
+            init_time: init_elapsed,
+            run_time: run_elapsed,
+            cycles_per_sec: summary[:cycles_per_sec],
+            logical_dispatches: summary[:logical_dispatches],
+            gpu_dispatches: summary[:gpu_dispatches],
+            total_ms: summary[:total_ms]
+          }
+        rescue => e  # any StandardError raised above becomes a :failed result instead of propagating
+          puts 'FAILED'
+          puts "  Error: #{e.message}"
+          puts "  #{e.backtrace.first(3).join("\n  ")}" if options[:verbose]  # first three backtrace frames, verbose mode only
+          { name: runner_name, status: :failed, error: e.message }  # NOTE(review): runner_name is nil if the error was raised before its assignment (e.g. from the requires)
+        end
+
         # Benchmark MOS6502 CPU IR with memory bridging (like karateka tests)
         def benchmark_mos6502
           require 'rhdl/codegen'
@@ -524,55 +1203,20 @@ def benchmark_mos6502
         def benchmark_apple2
           require 'rhdl/codegen'
 
-          # Paths to ROM and memory dump
-          rom_path = File.expand_path('../../../../examples/apple2/software/roms/appleiigo.rom', __dir__)
-          karateka_path = File.expand_path('../../../../examples/apple2/software/disks/karateka_mem.bin', __dir__)
-
-          unless File.exist?(rom_path)
-            puts "Error: AppleIIgo ROM not found at #{rom_path}"
-            puts "Please ensure the ROM file exists."
-            return
-          end
-
-          unless File.exist?(karateka_path)
-            puts "Error: Karateka memory dump not found at #{karateka_path}"
-            puts "Please ensure the memory dump file exists."
-            return
-          end
-
-          # Load ROM and memory data
-          rom_data = File.binread(rom_path).bytes
-          karateka_mem = File.binread(karateka_path).bytes
-
-          # Modify ROM reset vector to point to game entry ($B82A)
-          karateka_rom = rom_data.dup
-          karateka_rom[0x2FFC] = 0x2A  # low byte of $B82A
-          karateka_rom[0x2FFD] = 0xB8  # high byte of $B82A
-
           cycles = options[:cycles] || 100_000
           compiler_sub_cycles = 14
           runner_filter = (ENV['RHDL_BENCH_BACKENDS'] || '')
             .split(',')
             .map { |name| name.strip.downcase.to_sym }
+            .map { |name| name == :gem ? :gem_metal : name }
             .reject(&:empty?)
 
           puts_header("Apple2 Full System IR Benchmark - Karateka Game Code")
           puts "Cycles per run: #{cycles}"
           puts "Compiler sub-cycles: #{compiler_sub_cycles} (fixed)"
-          puts "ROM: #{rom_path}"
-          puts "Memory dump: #{karateka_path}"
           puts
 
-          # Generate IR once for all runners
-          print "Generating Apple2 IR... "
-          $stdout.flush
-
           require_relative '../../../../examples/apple2/hdl'
-          ir_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-          ir = RHDL::Examples::Apple2::Apple2.to_flat_ir
-          ir_json = RHDL::Codegen::IR::IRToJson.convert(ir)
-          ir_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - ir_start
-          puts "done (#{format('%.3f', ir_elapsed)}s)"
 
           # Define runners to benchmark
           runners = [
@@ -580,13 +1224,66 @@ def benchmark_apple2
             { name: 'JIT', backend: :jit, available_const: :IR_JIT_AVAILABLE },
             { name: 'Compiler', backend: :compiler, available_const: :IR_COMPILER_AVAILABLE },
             { name: 'Verilator', backend: :verilator },
-            { name: 'Arcilator', backend: :arcilator }
+            { name: 'Arcilator', backend: :arcilator },
+            { name: 'ArcilatorGPU', backend: :arcilator_gpu }
           ]
+          runners << { name: 'GemMetal', backend: :gem_metal }
           runners.select! { |runner| runner_filter.include?(runner[:backend]) } unless runner_filter.empty?
 
+          workload_backends = runners.reject { |runner| runner[:backend] == :gem_metal }
+          ir_json = nil
+          karateka_rom = nil
+          karateka_mem = nil
+
+          unless workload_backends.empty?
+            # Paths to ROM and memory dump
+            rom_path = File.expand_path('../../../../examples/apple2/software/roms/appleiigo.rom', __dir__)
+            karateka_path = File.expand_path('../../../../examples/apple2/software/disks/karateka_mem.bin', __dir__)
+
+            unless File.exist?(rom_path)
+              puts "Error: AppleIIgo ROM not found at #{rom_path}"
+              puts "Please ensure the ROM file exists."
+              return
+            end
+
+            unless File.exist?(karateka_path)
+              puts "Error: Karateka memory dump not found at #{karateka_path}"
+              puts "Please ensure the memory dump file exists."
+              return
+            end
+
+            puts "ROM: #{rom_path}"
+            puts "Memory dump: #{karateka_path}"
+
+            # Load ROM and memory data
+            rom_data = File.binread(rom_path).bytes
+            karateka_mem = File.binread(karateka_path).bytes
+
+            # Modify ROM reset vector to point to game entry ($B82A)
+            karateka_rom = rom_data.dup
+            karateka_rom[0x2FFC] = 0x2A  # low byte of $B82A
+            karateka_rom[0x2FFD] = 0xB8  # high byte of $B82A
+
+            # Generate IR once for all non-GEM runners
+            print "Generating Apple2 IR... "
+            $stdout.flush
+            ir_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+            ir = RHDL::Examples::Apple2::Apple2.to_flat_ir
+            ir_json = RHDL::Codegen::IR::IRToJson.convert(ir)
+            ir_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - ir_start
+            puts "done (#{format('%.3f', ir_elapsed)}s)"
+          end
+
           results = []
 
           runners.each do |runner|
+            if runner[:backend] == :gem_metal
+              print "\n#{runner[:name]}: "
+              $stdout.flush
+              results << benchmark_gem_metal_apple2(cycles: cycles, standalone: false)
+              next
+            end
+
             # Skip Interpreter for large cycle counts (too slow)
             if runner[:backend] == :interpreter && cycles > 100_000
               puts "\n#{runner[:name]}: SKIPPED (cycles > 100K, too slow)"
@@ -601,6 +1298,9 @@ def benchmark_apple2
               available = verilator_available?
             elsif runner[:backend] == :arcilator
               available = arcilator_available?
+            elsif runner[:backend] == :arcilator_gpu
+              require_relative '../../../../examples/apple2/utilities/runners/arcilator_gpu_runner'
+              available = RHDL::Examples::Apple2::ArcilatorGpuRunner.available?
             else
               available = false
             end
@@ -614,7 +1314,7 @@ def benchmark_apple2
             print "\n#{runner[:name]}: "
             $stdout.flush
 
-            is_hdl_runner = runner[:backend] == :verilator || runner[:backend] == :arcilator
+            is_hdl_runner = %i[verilator arcilator arcilator_gpu].include?(runner[:backend])
 
             begin
               # Create simulator
@@ -635,6 +1335,9 @@ def benchmark_apple2
               when :arcilator
                 require_relative '../../../../examples/apple2/utilities/runners/arcilator_runner'
                 RHDL::Examples::Apple2::ArcilatorRunner.new(sub_cycles: 14)
+              when :arcilator_gpu
+                require_relative '../../../../examples/apple2/utilities/runners/arcilator_gpu_runner'
+                RHDL::Examples::Apple2::ArcilatorGpuRunner.new(sub_cycles: 14)
               end
 
               init_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - init_start
@@ -880,6 +1583,7 @@ def benchmark_riscv
           runner_filter = (ENV['RHDL_BENCH_BACKENDS'] || '')
             .split(',')
             .map { |name| name.strip.downcase.to_sym }
+            .map { |name| name == :gem ? :gem_metal : name }
             .reject(&:empty?)
 
           puts_header("RISC-V Single-Cycle CPU Benchmark - xv6 Boot")
@@ -893,13 +1597,22 @@ def benchmark_riscv
           runners = [
             { name: 'IR Compiler', mode: :ir, sim: :compile, filter_key: :compiler },
             { name: 'Verilator', mode: :verilog, sim: nil, filter_key: :verilator },
-            { name: 'CIRCT', mode: :circt, sim: nil, filter_key: :circt }
+            { name: 'CIRCT', mode: :circt, sim: nil, filter_key: :circt },
+            { name: 'ArcilatorGPU', mode: :arcilator_gpu, sim: nil, filter_key: :arcilator_gpu },
+            { name: 'GemMetal', backend: :gem_metal, filter_key: :gem_metal }
           ]
           runners.select! { |r| runner_filter.include?(r[:filter_key]) } unless runner_filter.empty?
 
           results = []
 
           runners.each do |runner_config|
+            if runner_config[:backend] == :gem_metal
+              print "\n#{runner_config[:name]}: "
+              $stdout.flush
+              results << benchmark_gem_metal_riscv(cycles: cycles, standalone: false)
+              next
+            end
+
             # Check availability
             available = case runner_config[:mode]
                         when :ir
@@ -913,6 +1626,9 @@ def benchmark_riscv
                           verilator_available?
                         when :circt
                           arcilator_available?
+                        when :arcilator_gpu
+                          require_relative '../../../../examples/riscv/utilities/runners/arcilator_gpu_runner'
+                          RHDL::Examples::RISCV::ArcilatorGpuRunner.available?
                         end
 
             unless available
@@ -951,9 +1667,12 @@ def benchmark_riscv
               runner.run_steps(cycles)
               run_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - run_start
 
-              cycles_per_sec = cycles / run_elapsed
               state = runner.cpu_state
-              pc = state[:pc]
+              pc = state[:pc].to_i
+              if pc.zero?
+                raise "#{runner_config[:name]} did not initialize xv6 correctly (PC remained 0x0)"
+              end
+              cycles_per_sec = cycles / run_elapsed
 
               puts "done"
               puts "  Init time: #{format('%.3f', init_elapsed)}s"
@@ -1156,16 +1875,34 @@ def benchmark_web_riscv
         def print_benchmark_summary(results, cycles)
           puts
           puts_header("Summary")
-          puts "#{'Runner'.ljust(15)} #{'Status'.ljust(10)} #{'Init'.rjust(10)} #{'Run'.rjust(10)}"
+          show_instances = results.any? { |r| r[:parallel_instances].to_i > 1 }
+          if show_instances
+            puts "#{'Runner'.ljust(15)} #{'Status'.ljust(10)} #{'Inst'.rjust(6)} #{'Init'.rjust(10)} #{'Run'.rjust(10)}"
+          else
+            puts "#{'Runner'.ljust(15)} #{'Status'.ljust(10)} #{'Init'.rjust(10)} #{'Run'.rjust(10)}"
+          end
           puts_separator
 
           results.each do |r|
             if r[:status] == :success
-              puts "#{r[:name].ljust(15)} #{'OK'.ljust(10)} #{format('%8.3f', r[:init_time])}s #{format('%8.3f', r[:run_time])}s"
+              if show_instances
+                inst = r[:parallel_instances].to_i
+                puts "#{r[:name].ljust(15)} #{'OK'.ljust(10)} #{inst.to_s.rjust(6)} #{format('%8.3f', r[:init_time])}s #{format('%8.3f', r[:run_time])}s"
+              else
+                puts "#{r[:name].ljust(15)} #{'OK'.ljust(10)} #{format('%8.3f', r[:init_time])}s #{format('%8.3f', r[:run_time])}s"
+              end
             elsif r[:status] == :skipped
-              puts "#{r[:name].ljust(15)} #{'SKIP'.ljust(10)} #{'-'.rjust(10)} #{'-'.rjust(10)}"
+              if show_instances
+                puts "#{r[:name].ljust(15)} #{'SKIP'.ljust(10)} #{'-'.rjust(6)} #{'-'.rjust(10)} #{'-'.rjust(10)}"
+              else
+                puts "#{r[:name].ljust(15)} #{'SKIP'.ljust(10)} #{'-'.rjust(10)} #{'-'.rjust(10)}"
+              end
             else
-              puts "#{r[:name].ljust(15)} #{'FAIL'.ljust(10)} #{'-'.rjust(10)} #{'-'.rjust(10)}"
+              if show_instances
+                puts "#{r[:name].ljust(15)} #{'FAIL'.ljust(10)} #{'-'.rjust(6)} #{'-'.rjust(10)} #{'-'.rjust(10)}"
+              else
+                puts "#{r[:name].ljust(15)} #{'FAIL'.ljust(10)} #{'-'.rjust(10)} #{'-'.rjust(10)}"
+              end
             end
           end
 
@@ -1183,6 +1920,21 @@ def print_benchmark_summary(results, cycles)
               puts "  #{r[:name]} vs #{base[:name]}: #{format('%.3f', ratio)}x"
             end
           end
+
+          effective_enabled = successful.any? { |r| r[:parallel_instances].to_i > 1 }
+          if effective_enabled && successful.length >= 2
+            puts
+            puts "Effective Performance Ratios (instances-adjusted):"
+            base = compiler || successful.first
+            base_eff = base[:effective_cycles_per_sec] || base[:cycles_per_sec]
+            successful.each do |r|
+              next if r[:name] == base[:name]
+
+              eff = r[:effective_cycles_per_sec] || r[:cycles_per_sec]
+              ratio = eff / base_eff
+              puts "  #{r[:name]} vs #{base[:name]}: #{format('%.3f', ratio)}x"
+            end
+          end
         end
 
         def rspec_cmd
@@ -1210,6 +1962,75 @@ def command_available?(cmd)
           end
         end
 
+        # Interprets an environment-style flag value as a boolean.
+        #
+        # @param raw [#to_s] raw value; nil behaves like an empty string
+        # @return [Boolean] true only for 1/true/yes/y/on, compared after trimming
+        #   surrounding whitespace and lower-casing; false for anything else
+        def truthy_env?(raw)
+          %w[1 true yes y on].include?(raw.to_s.strip.downcase)
+        end
+
+        # Chooses the path a benchmark should use. A nil or blank override yields
+        # default_path; an override beginning with '/' is returned trimmed but
+        # otherwise verbatim; any other override is expanded against project_root.
+        def resolve_path_for_bench(raw_path, default_path, project_root)
+          override = raw_path&.strip
+          return default_path if override.nil? || override.empty?
+          override.start_with?('/') ? override : File.expand_path(override, project_root)
+        end
+
+        # Parses the last line of `output` containing the metal_dummy_test summary
+        # marker; returns nil without one, else a hash (missing fields map to nil).
+        def parse_gem_metal_summary(output)
+          report = output.lines.reverse_each.find { |row| row.include?('metal_dummy_test: logical_dispatches=') }
+          return nil unless report
+          field = ->(pattern) { report[pattern, 1] }
+          { logical_dispatches: field.call(/logical_dispatches=(\d+)/)&.to_i,
+            gpu_dispatches: field.call(/gpu_dispatches=(\d+)/)&.to_i,
+            total_ms: field.call(/total_ms=([0-9.]+)/)&.to_f,
+            cycles_per_sec: field.call(/cycles_per_sec=([0-9.]+)/)&.to_f }
+        end
+
+        # GEM currently requires strictly acyclic combinational logic, so RISC-V
+        # benchmarking runs an MMU-off RTL variant: satp translation is forced off and
+        # the Sv32 TLB instances (itlb, dtlb) are replaced by constant assigns.
+        def disable_riscv_mmu_for_gem_rtl(rtl) # returns a patched copy of the RTL text; `rtl` itself is not modified
+          patched = rtl.dup
+          # sub! returns nil when nothing matched, which the `missing` check below relies on.
+          satp_rewritten = patched.sub!(
+            /^\s*assign\s+satp_translate\s*=.*?;\s*$/m,
+            "  assign satp_translate = 1'b0;\n"
+          )
+          # Swap each itlb/dtlb instantiation for tie-off assigns, recording which instances were seen.
+          replaced_instances = []
+          patched = patched.gsub(
+            /^\s*riscv_sv32_tlb\s+(itlb|dtlb)\s*\(\n(?:.*?\n)*?^\s*\);\n/m
+          ) do
+            inst = Regexp.last_match(1)
+            replaced_instances << inst
+            <<~VERILOG
+                assign #{inst}__hit = 1'b0;
+                assign #{inst}__ppn = 20'd0;
+                assign #{inst}__perm_r = 1'b0;
+                assign #{inst}__perm_w = 1'b0;
+                assign #{inst}__perm_x = 1'b0;
+                assign #{inst}__perm_u = 1'b0;
+
+            VERILOG
+          end
+          # Raise (RuntimeError) unless all three edits were applied.
+          missing = []
+          missing << 'satp_translate assignment' unless satp_rewritten
+          missing << 'itlb instance' unless replaced_instances.include?('itlb')
+          missing << 'dtlb instance' unless replaced_instances.include?('dtlb')
+          unless missing.empty?
+            raise "Failed to apply RISC-V MMU-off RTL transform (missing: #{missing.join(', ')})"
+          end
+
+          patched
+        end
+
         # Build/locate WASM backends for the web Apple II benchmark.
         # Returns an array of { name:, wasm_path:, ir_json_path: } hashes.
         def prepare_web_wasm_backends(runner_filter)
diff --git a/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering.rb b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering.rb
new file mode 100644
index 00000000..cfaeb926
--- /dev/null
+++ b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering.rb
@@ -0,0 +1,5946 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'open3'
+require 'set'
+require 'tempfile'
+require_relative 'gpu_lowering_delegate'
+
+module RHDL
+  module Codegen
+    module FIRRTL
+      # Local Arc -> ArcToGPU lowering stage.
+      #
+      # Upstream arcilator currently does not expose an ArcToGPU lowering pass.
+      # This stage consumes Arc MLIR emitted by arcilator (typically at
+      # `--until-after=arc-opt`) and produces deterministic ArcToGPU artifacts:
+      # - metadata JSON (operation/support summary + state/ABI map)
+      # - GPU dialect MLIR skeleton for pipeline traceability
+      # - Metal shader source implementing cycle execution on GPU
+      module ArcToGpuLowering
+        class LoweringError < StandardError; end
+
+        # Descriptor for a parsed type. `kind` is the discriminator tested by the
+        # predicates below (:scalar, :array, :memory). Members that are not passed
+        # to the keyword initializer stay nil, and fetch treats such members as
+        # missing.
+        TypeRef = Struct.new(:kind, :width, :length, :element, :index_width, keyword_init: true) do
+          # Strict member reader in the spirit of Hash#fetch without a default.
+          #
+          # @param key [Symbol, String] member name
+          # @return [Object] the member's value
+          # @raise [KeyError] when that member is nil
+          def fetch(key)
+            public_send(key).tap { |member| raise KeyError, "missing #{key}" if member.nil? }
+          end
+
+          # Defines scalar?, array? and memory?; each reports whether `kind` equals
+          # the symbol of the same name.
+          %i[scalar array memory].each do |tag|
+            define_method(:"#{tag}?") { kind == tag }
+          end
+        end
+
+        REQUIRED_TOP_OUTPUTS = %w[
+          mem_data_out
+          mem_addr
+          mem_write_en
+          mem_read_en
+          pc_out
+          acc_out
+          sp_out
+          halted
+          state_out
+          zero_flag_out
+        ].freeze
+
+        REQUIRED_TOP_INPUTS = %w[
+          clk
+          rst
+          mem_data_in
+        ].freeze
+
+        REQUIRED_APPLE2_OUTPUTS = %w[
+          ram_addr
+          ram_we
+          d
+          speaker
+          pc_debug
+          a_debug
+          x_debug
+          y_debug
+          p_debug
+        ].freeze
+
+        REQUIRED_APPLE2_INPUTS = %w[
+          clk_14m
+          reset
+          ram_do
+          ps2_clk
+          ps2_data
+          gameport
+          pause
+        ].freeze
+
+        REQUIRED_RISCV_INPUTS = %w[
+          clk
+          rst
+          irq_software
+          irq_timer
+          irq_external
+          inst_data
+          inst_ptw_pte1
+          inst_ptw_pte0
+          data_rdata
+          data_ptw_pte1
+          data_ptw_pte0
+          debug_reg_addr
+        ].freeze
+
+        REQUIRED_RISCV_OUTPUTS = %w[
+          inst_addr
+          inst_ptw_addr1
+          inst_ptw_addr0
+          data_addr
+          data_wdata
+          data_we
+          data_re
+          data_funct3
+          data_ptw_addr1
+          data_ptw_addr0
+          debug_pc
+          debug_inst
+          debug_x1
+          debug_x2
+          debug_x10
+          debug_x11
+          debug_reg_data
+        ].freeze
+
+        REQUIRED_RISCV_LOOP_OUTPUTS = %w[
+          inst_addr
+          inst_ptw_addr1
+          inst_ptw_addr0
+          data_addr
+          data_wdata
+          data_we
+          data_re
+          data_funct3
+          data_ptw_addr1
+          data_ptw_addr0
+        ].freeze
+
+        REQUIRED_RISCV_HIGH_LOOP_OUTPUTS = %w[
+          inst_addr
+          inst_ptw_addr1
+          inst_ptw_addr0
+          data_addr
+          data_re
+          data_funct3
+          data_ptw_addr1
+          data_ptw_addr0
+        ].freeze
+
+        REQUIRED_RISCV_FAST_LOOP_OUTPUTS = %w[
+          inst_addr
+          data_addr
+          data_wdata
+          data_we
+          data_re
+        ].freeze
+
+        REQUIRED_RISCV_FAST_LOOP_OUTPUTS_NO_ADDR = %w[
+          inst_addr
+          data_wdata
+          data_we
+          data_re
+        ].freeze
+
+        REQUIRED_RISCV_FAST_LOOP_ADDR_OUTPUTS = %w[
+          data_addr
+        ].freeze
+
+        REQUIRED_RISCV_FAST_LOOP_OUTPUTS_NO_WDATA = %w[
+          inst_addr
+          data_addr
+          data_we
+          data_re
+        ].freeze
+
+        REQUIRED_RISCV_FAST_LOOP_WDATA_OUTPUTS = %w[
+          data_wdata
+        ].freeze
+
+        REQUIRED_RISCV_FAST_HIGH_LOOP_OUTPUTS = %w[
+          inst_addr
+          data_addr
+          data_re
+        ].freeze
+
+        REQUIRED_RISCV_FAST_HIGH_LOOP_OUTPUTS_NO_ADDR = %w[
+          inst_addr
+          data_re
+        ].freeze
+
+        REQUIRED_RISCV_FAST_HIGH_LOOP_ADDR_OUTPUTS = %w[
+          data_addr
+        ].freeze
+
+        RUNTIME_RISCV_OUTPUTS = [].freeze
+
+        SUPPORTED_OPS = %w[
+          arc.call
+          arc.define
+          arc.memory
+          arc.memory_read_port
+          arc.memory_write_port
+          arc.output
+          arc.state
+          seq.firreg
+          seq.firmem
+          seq.firmem.read_port
+          seq.firmem.write_port
+          synth.aig.and_inv
+          comb.add
+          comb.and
+          comb.concat
+          comb.divu
+          comb.extract
+          comb.icmp
+          comb.modu
+          comb.mul
+          comb.mux
+          comb.or
+          comb.replicate
+          comb.shl
+          comb.shru
+          comb.sub
+          comb.xor
+          func.func
+          hw.array
+          hw.array_create
+          hw.array_get
+          hw.aggregate_constant
+          hw.constant
+          hw.output
+          hw.module
+          rhdl.alias
+          seq.to_clock
+        ].freeze
+
+        DEFAULT_SCALAR_WIDTH_BITS = 32
+
+        module_function
+
+        # Resolves a profile name to its Profiles::* module, requiring the profile's
+        # source file before the constant is looked up.
+        #
+        # @param profile [#to_sym] :cpu8bit, :apple2, :riscv or :riscv_netlist
+        # @return [Module] the matching profile module
+        # @raise [LoweringError] for any other profile name
+        def profile_module_for(profile)
+          feature, const_name = {
+            cpu8bit: ['arc_to_gpu_lowering/profiles/cpu8bit', :Cpu8bit],
+            apple2: ['arc_to_gpu_lowering/profiles/apple2', :Apple2],
+            riscv: ['arc_to_gpu_lowering/profiles/riscv', :Riscv],
+            riscv_netlist: ['arc_to_gpu_lowering/profiles/riscv_netlist', :RiscvNetlist]
+          }[profile.to_sym]
+          raise LoweringError, "Unsupported ArcToGPU profile: #{profile.inspect}" unless feature
+
+          require_relative feature
+          Profiles.const_get(const_name, false)
+        end
+
+        # Lowers one Arc MLIR file. The file's text, this module (as lowerer), the
+        # parse_arc_mlir method (as parser) and the fixed ArcToGPU options below are
+        # handed to GpuLoweringDelegate.lower, whose result is returned unchanged.
+        #
+        # @param arc_mlir_path [String] file read with File.read
+        # @param profile [Symbol] forwarded as-is, like the three output paths
+        def lower(arc_mlir_path:, gpu_mlir_path:, metadata_path: nil, metal_source_path: nil, profile: :cpu8bit)
+          delegate_options = {
+            lowerer: self,
+            source_text: File.read(arc_mlir_path),
+            parser: method(:parse_arc_mlir),
+            gpu_mlir_path: gpu_mlir_path,
+            metadata_path: metadata_path,
+            metal_source_path: metal_source_path,
+            profile: profile,
+            gem_kernel_interpreter: false,
+            require_arc_define: true,
+            metadata_version: 'ArcToGpuLoweringV2',
+            lowering_label: 'ArcToGpuLowering'
+          }
+          GpuLoweringDelegate.lower(**delegate_options)
+        end
+
+        # Condenses a parsed program into the summary hash: top-module name and port
+        # names, arc.define count, state-op and arc.call tallies, sorted op counts.
+        def summarize(parsed)
+          top = parsed.fetch(:top_module)
+          tally = ->(scope, *kinds) { scope.fetch(:ops).count { |op| kinds.include?(op.fetch(:kind)) } }
+          { top_module: top.fetch(:name),
+            top_inputs: top.fetch(:inputs).map { |port| port.fetch(:name) },
+            top_outputs: top.fetch(:outputs).map { |port| port.fetch(:name) },
+            arc_define_count: parsed.fetch(:functions).length,
+            arc_state_count: tally.(top, :arc_state, :seq_firreg),
+            arc_call_count: parsed.fetch(:functions).values.sum { |fn| tally.(fn, :arc_call) } + tally.(top, :arc_call),
+            ops: parsed.fetch(:op_counts).sort.to_h }
+        end
+
+        # Locates the state a RISC-V runtime inspects: the slot and width of the
+        # 'debug_pc' output, plus the first :arc_memory layout entry shaped like a
+        # 32-entry, 32-bit memory with a 5-bit index and one slot per element.
+        # Missing pieces are reported as -1 (slots) or 0 (length, slots per element);
+        # pc_width falls back to 32 when there is no 'debug_pc' output.
+        def riscv_runtime_introspection(parsed, state_layout, output_state_slots)
+          outputs = parsed.fetch(:top_module).fetch(:outputs)
+          widths_by_output = outputs.map { |port| [port.fetch(:name), port.fetch(:type).fetch(:width).to_i] }.to_h
+          pc_slot = output_state_slots['debug_pc']
+          regfile_shape = { length: [32, 0], index_width: [5, 0], width: [32, 0], slots_per_element: [1, 1] }
+          regfile = state_layout.find do |entry|
+            entry.fetch(:kind) == :arc_memory &&
+              regfile_shape.all? { |field, (wanted, fallback)| entry.fetch(field, fallback).to_i == wanted }
+          end
+
+          {
+            pc_slot: pc_slot ? pc_slot.to_i : -1,
+            pc_width: widths_by_output.fetch('debug_pc', 32),
+            regfile_base_slot: regfile ? regfile.fetch(:index).to_i : -1,
+            regfile_length: regfile ? regfile.fetch(:length).to_i : 0,
+            regfile_slots_per_element: regfile ? regfile.fetch(:slots_per_element, 1).to_i : 0
+          }
+        end
+
+        # Verifies that a parsed program can be lowered. Checks run in this order and
+        # the first one that fails raises LoweringError:
+        #   1. the summary names a top module;
+        #   2. at least one arc.define exists (only when require_arc_define);
+        #   3. the top module contains at least one state register op;
+        #   4. every required input, then every required output, is a top-level port;
+        #   5. every callee named by a top-level arc_call / arc_state /
+        #      arc_memory_write_port op, then by any arc_call inside a function,
+        #      is a key of parsed[:functions].
+        #
+        # @param parsed [Hash] parse result providing :functions and :top_module
+        # @param summary [Hash] summary hash as built by summarize
+        # @param required_inputs [Array<String>] input port names that must exist
+        # @param required_outputs [Array<String>] output port names that must exist
+        # @param require_arc_define [Boolean] whether check 2 applies
+        # @raise [LoweringError] describing the first failed check
+        def validate_top_module!(parsed, summary, required_inputs: [], required_outputs: [], require_arc_define: true)
+          top_name = summary[:top_module]
+          if top_name.nil? || top_name.empty?
+            raise LoweringError, 'ArcToGPU lowering could not find top hw.module in Arc MLIR'
+          end
+
+          if require_arc_define && summary[:arc_define_count].zero?
+            raise LoweringError, 'ArcToGPU lowering expected at least one arc.define function'
+          end
+
+          if summary[:arc_state_count].zero?
+            raise LoweringError, 'ArcToGPU lowering expected state register operations in top module'
+          end
+
+          absent = required_inputs - summary[:top_inputs]
+          unless absent.empty?
+            raise LoweringError, "ArcToGPU lowering top module missing required inputs: #{absent.join(', ')}"
+          end
+
+          absent = required_outputs - summary[:top_outputs]
+          unless absent.empty?
+            raise LoweringError, "ArcToGPU lowering top module missing required outputs: #{absent.join(', ')}"
+          end
+
+          functions = parsed.fetch(:functions)
+          resolve_all = lambda do |ops, calling_kinds|
+            ops.each do |op|
+              next unless calling_kinds.include?(op.fetch(:kind))
+              next if functions.key?(op.fetch(:callee))
+
+              raise LoweringError, "ArcToGPU lowering could not resolve callee @#{op.fetch(:callee)}"
+            end
+          end
+          resolve_all.call(parsed.fetch(:top_module).fetch(:ops), %i[arc_call arc_state arc_memory_write_port])
+          parsed.fetch(:functions).each_value { |fn| resolve_all.call(fn.fetch(:ops), %i[arc_call]) }
+        end
+
+        # Renders the placeholder GPU-dialect MLIR for a summarized module: three
+        # header comment lines (generator, source module, op counts) and an empty
+        # kernel "<top_module>_arc_to_gpu_eval" inside gpu.module "<top_module>_gpu".
+        # Returns the MLIR text as a String.
+        def emit_gpu_mlir(summary, lowering_label:)
+          module_name = summary[:top_module]
+          kernel_name = "#{module_name}_arc_to_gpu_eval"
+          counts = %w[arc_define arc_state arc_call].map { |tag| "#{tag}=#{summary[:"#{tag}_count"]}" }.join(', ')
+
+          <<~MLIR
+            // Auto-generated by RHDL::Codegen::FIRRTL::#{lowering_label}.
+            // Source module: #{module_name}
+            // Summary: #{counts}
+            module attributes {rhdl.arc_to_gpu.version = "v2", rhdl.arc_to_gpu.module = "#{module_name}"} {
+              gpu.module @#{module_name}_gpu {
+                gpu.func @#{kernel_name}() kernel {
+                  gpu.return
+                }
+              }
+            }
+          MLIR
+        end
+
+        # Best-effort clean-up of Arc MLIR text with circt-opt (--canonicalize, --cse,
+        # --symbol-dce). The input is handed back unchanged when
+        # RHDL_ARC_TO_GPU_DISABLE_OPT is '1', when circt-opt is not available, when
+        # the tool exits unsuccessfully or writes nothing, or when any StandardError
+        # is raised along the way.
+        #
+        # @param source [String] Arc MLIR text
+        # @return [String] the optimized MLIR text, or `source` on any fallback
+        def optimize_arc_mlir_source(source)
+          disabled = ENV['RHDL_ARC_TO_GPU_DISABLE_OPT'] == '1'
+          return source if disabled || !command_available?('circt-opt')
+
+          Tempfile.create(%w[rhdl_arc_to_gpu_input .mlir]) do |infile|
+            Tempfile.create(%w[rhdl_arc_to_gpu_output .mlir]) do |outfile|
+              infile.write(source)
+              infile.flush
+
+              passes = %w[--canonicalize --cse --symbol-dce]
+              _log, status = Open3.capture2e('circt-opt', infile.path, *passes, '-o', outfile.path)
+              next source unless status.success?
+
+              optimized = File.read(outfile.path)
+              optimized.empty? ? source : optimized
+            end
+          end
+        rescue StandardError
+          source
+        end
+
+        # Pipes MLIR text through `circt-opt <pass_args>` via temp files; any failure
+        # (tool unavailable, non-zero exit, empty output, StandardError) returns `source`.
+        def run_circt_opt_pipeline(source, pass_args:)
+          return source unless command_available?('circt-opt')
+
+          Tempfile.create(%w[rhdl_arc_to_gpu_pipeline_input .mlir]) do |infile|
+            Tempfile.create(%w[rhdl_arc_to_gpu_pipeline_output .mlir]) do |outfile|
+              infile.write(source)
+              infile.flush
+              _log, status = Open3.capture2e('circt-opt', infile.path, *Array(pass_args), '-o', outfile.path)
+              next source unless status.success?
+
+              rewritten = File.read(outfile.path)
+              rewritten.empty? ? source : rewritten
+            end
+          end
+        rescue StandardError
+          source
+        end
+
+        def flatten_simple_arc_calls(parsed, max_ops: 12, max_depth: 2) # inlines calls to small arc.define functions
+          return parsed if ENV['RHDL_ARC_TO_GPU_DISABLE_INLINE'] == '1'
+          # Work on deep copies; `parsed` itself is returned when nothing was inlined.
+          functions = deep_copy(parsed.fetch(:functions))
+          top_module = deep_copy(parsed.fetch(:top_module))
+          changed_any = false
+          # At most max_depth rounds; candidates are re-selected from the current function bodies each round.
+          max_depth.times do
+            candidates = functions.select do |name, fn|
+              next false if fn.fetch(:ops).empty?
+              next false if fn.fetch(:ops).length > max_ops
+              next false if fn.fetch(:ops).any? { |op| op.fetch(:kind) == :arc_state }
+              next false if fn.fetch(:ops).any? { |op| op.fetch(:kind) == :arc_call && op.fetch(:callee) == name } # calls itself
+
+              true
+            end
+            break if candidates.empty?
+
+            changed_this_round = false
+            candidates_set = candidates.keys.to_set
+            # Function bodies are rewritten in place first, so later expansions see already-expanded callees.
+            functions.each_value do |fn|
+              inlined_ops, changed = inline_calls_in_ops(
+                ops: fn.fetch(:ops),
+                functions: functions,
+                candidates: candidates_set
+              )
+              fn[:ops] = inlined_ops
+              changed_this_round ||= changed
+            end
+
+            top_inlined_ops, top_changed = inline_calls_in_ops(
+              ops: top_module.fetch(:ops),
+              functions: functions,
+              candidates: candidates_set
+            )
+            top_module[:ops] = top_inlined_ops
+            changed_this_round ||= top_changed
+
+            changed_any ||= changed_this_round
+            break unless changed_this_round # stop early once a round expands nothing
+          end
+
+          return parsed unless changed_any
+          # Inlined functions stay in :functions; only their call sites were replaced.
+          {
+            functions: functions,
+            top_module: top_module,
+            op_counts: recompute_op_counts(functions: functions, top_module: top_module)
+          }
+        end
+
+        # Fold constant-index array_get patterns produced by Arc/HW lowering:
+        # - hw.array_get(hw.array_create(...), cst_idx) -> rhdl.alias(selected_operand)
+        # - hw.array_get(hw.aggregate_constant(...), cst_idx) -> hw.constant(selected_value)
+        # Only producers and constants defined earlier in the same op list are considered.
+        # This removes temporary array structs and array indexing branches in hot paths.
+        def fold_constant_array_gets(parsed)
+          functions = deep_copy(parsed.fetch(:functions))
+          top_module = deep_copy(parsed.fetch(:top_module))
+          changed_any = false
+
+          fold_ops = lambda do |ops|
+            producers = {} # result ref -> op that defines it
+            constant_values = {} # result ref -> known constant value
+            transformed = []
+
+            ops.each do |op|
+              replacement = nil
+
+              if op.fetch(:kind) == :array_get
+                idx_value = constant_values[op.fetch(:index_ref)]
+                array_producer = producers[op.fetch(:array_ref)]
+
+                if !idx_value.nil? && array_producer
+                  array_len = op.fetch(:array_type).fetch(:length).to_i
+                  array_idx = idx_value.to_i
+                  array_idx = 0 if array_idx >= array_len # out-of-range constant index is folded as index 0
+                  array_idx = 0 if array_idx.negative? # NOTE(review): a constant stored as signed -1 would fold to 0 here - confirm the parser's representation
+                  producer_idx = (array_len - 1) - array_idx # index 0 selects the producer's last operand/value
+
+                  case array_producer.fetch(:kind)
+                  when :array_create
+                    source_ref = array_producer.fetch(:operands).fetch(producer_idx)
+                    replacement = {
+                      kind: :alias,
+                      op_name: 'rhdl.alias',
+                      result_refs: op.fetch(:result_refs),
+                      result_types: op.fetch(:result_types),
+                      source_ref: source_ref
+                    }
+                  when :aggregate_constant
+                    folded_value = array_producer.fetch(:values).fetch(producer_idx)
+                    replacement = {
+                      kind: :constant,
+                      op_name: 'hw.constant',
+                      result_refs: op.fetch(:result_refs),
+                      result_types: op.fetch(:result_types),
+                      value: folded_value
+                    }
+                  end
+                end
+              end
+
+              current = replacement || op
+              changed_any ||= !replacement.nil?
+              transformed << current
+              # Record the (possibly replaced) op so later array_gets can fold through it.
+              current.fetch(:result_refs).each do |ref|
+                producers[ref] = current
+              end
+
+              if current.fetch(:kind) == :constant
+                current.fetch(:result_refs).each { |ref| constant_values[ref] = current.fetch(:value) }
+              elsif current.fetch(:kind) == :alias
+                src = current.fetch(:source_ref)
+                current.fetch(:result_refs).each do |ref|
+                  if constant_values.key?(src) # an alias of a known constant is itself a known constant
+                    constant_values[ref] = constant_values[src]
+                  else
+                    constant_values.delete(ref)
+                  end
+                end
+              else
+                current.fetch(:result_refs).each { |ref| constant_values.delete(ref) }
+              end
+            end
+
+            transformed
+          end
+
+          functions.each_value do |fn|
+            fn[:ops] = fold_ops.call(fn.fetch(:ops))
+          end
+          top_module[:ops] = fold_ops.call(top_module.fetch(:ops))
+
+          return parsed unless changed_any # hand back the untouched input when nothing folded
+
+          {
+            functions: functions,
+            top_module: top_module,
+            op_counts: recompute_op_counts(functions: functions, top_module: top_module)
+          }
+        end
+
+        # Drops arc.define functions that cannot be reached from the top module.
+        #
+        # Roots are the callees of top-level arc_call, arc_state and
+        # arc_memory_write_port ops; the same three kinds are followed inside every
+        # reached function. Callee names with no entry in :functions are ignored.
+        # The walk uses a worklist and visits each function at most once.
+        #
+        # @param parsed [Hash] parse result with :functions and :top_module
+        # @return [Hash] `parsed` itself when there are no functions or all of them
+        #   are reachable; otherwise a new hash holding deep copies of the surviving
+        #   functions and of the top module, plus recomputed :op_counts
+        def prune_unreachable_functions(parsed)
+          functions = parsed.fetch(:functions)
+          return parsed if functions.empty?
+
+          # Names referenced by the ops in `ops` that carry a callee.
+          callee_names = lambda do |ops|
+            calling = ops.select { |op| %i[arc_call arc_state arc_memory_write_port].include?(op.fetch(:kind)) }
+            calling.map { |op| op[:callee] }.select(&:itself)
+          end
+
+          live = Set.new
+          pending = callee_names.call(parsed.fetch(:top_module).fetch(:ops))
+          until pending.empty?
+            callee = pending.pop
+            fn = functions[callee]
+            next if live.include?(callee) || !fn
+
+            live << callee
+            pending.concat(callee_names.call(fn.fetch(:ops)))
+          end
+
+          return parsed if live.length == functions.length
+
+          survivors = functions.select { |fn_name, _fn| live.include?(fn_name) }
+          survivors = survivors.transform_values { |fn| deep_copy(fn) }
+          top_module = deep_copy(parsed.fetch(:top_module))
+
+          # Same result shape as the parser: functions, top module, op histogram.
+
+          {
+            functions: survivors,
+            top_module: top_module,
+            op_counts: recompute_op_counts(functions: survivors, top_module: top_module)
+          }
+        end
+
+        # Expands, in order, every arc_call in `ops` whose callee is named in
+        # `candidates` and defined in `functions`; all other ops pass through as-is.
+        #
+        # @return [Array(Array<Hash>, Boolean)] the new op list and whether any call
+        #   was expanded
+        def inline_calls_in_ops(ops:, functions:, candidates:)
+          changed = false
+
+          expanded = ops.flat_map do |op|
+            inlinable = op.fetch(:kind) == :arc_call && candidates.include?(op.fetch(:callee))
+            callee_fn = inlinable ? functions[op.fetch(:callee)] : nil
+            next [op] unless callee_fn
+
+            changed = true
+            inline_arc_call_op(call_op: op, callee_fn: callee_fn)
+          end
+
+          [expanded, changed]
+        end
+
+        # Expands one arc.call into a renamed copy of the callee's body.
+        #
+        # Every cloned op gets fresh result names of the form "%inl<N>_<name>" (N is a
+        # counter kept on this module, bumped once per expansion). Operand refs are
+        # rewritten: callee arguments become the call's actual arguments, refs to
+        # earlier cloned results become their fresh names, anything else is kept.
+        # One rhdl.alias per callee output then rebinds the call's own result refs.
+        #
+        # :alias ops in the callee body (an earlier expansion or constant array_get
+        # folding leaves them behind) have their source_ref rewritten too; without
+        # that the copy would still name the callee-local value, which does not
+        # exist under that name in the caller.
+        #
+        # NOTE(review): op kinds not listed in the case are cloned with operands
+        # untouched - confirm none of them can appear inside an arc.define body.
+        #
+        # @param call_op [Hash] the :arc_call op being replaced
+        # @param callee_fn [Hash] parsed function with :args, :ops and :output_refs
+        # @return [Array<Hash>] ops to splice in at the call site
+        def inline_arc_call_op(call_op:, callee_fn:)
+          @inline_counter ||= 0
+          call_id = @inline_counter
+          @inline_counter += 1
+
+          arg_ref_map = {}
+          callee_fn.fetch(:args).each_with_index do |arg, idx|
+            arg_ref_map[arg.fetch(:ref)] = call_op.fetch(:args).fetch(idx)
+          end
+          inner_result_map = {}
+          map_ref = ->(ref) { inner_result_map.fetch(ref, arg_ref_map.fetch(ref, ref)) }
+
+          inlined_ops = callee_fn.fetch(:ops).map do |inner_op|
+            cloned = deep_copy(inner_op)
+            # Rename results first; the operand rewriting below then sees the updated map.
+            cloned[:result_refs] = cloned.fetch(:result_refs).map do |ref|
+              inner_result_map[ref] = "%inl#{call_id}_#{sanitize_ident(ref.sub('%', ''))}"
+            end
+
+            case cloned.fetch(:kind)
+            when :to_clock, :extract, :replicate
+              cloned[:input] = map_ref.call(cloned.fetch(:input))
+            when :arc_call
+              cloned[:args] = cloned.fetch(:args).map(&map_ref)
+            when :arc_state
+              cloned[:args] = cloned.fetch(:args).map(&map_ref)
+              cloned[:clock_ref] = map_ref.call(cloned.fetch(:clock_ref))
+              cloned[:enable_ref] = map_ref.call(cloned.fetch(:enable_ref)) if cloned.fetch(:enable_ref)
+              cloned[:reset_ref] = map_ref.call(cloned.fetch(:reset_ref)) if cloned.fetch(:reset_ref)
+            when :array_create, :icmp, :concat, :mux, :comb, :synth_aig_and_inv
+              cloned[:operands] = cloned.fetch(:operands).map(&map_ref)
+            when :array_get
+              cloned[:array_ref] = map_ref.call(cloned.fetch(:array_ref))
+              cloned[:index_ref] = map_ref.call(cloned.fetch(:index_ref))
+            when :alias
+              cloned[:source_ref] = map_ref.call(cloned.fetch(:source_ref))
+            end
+
+            cloned
+          end
+
+          callee_fn.fetch(:output_refs).each_with_index do |out_ref, idx|
+            source_ref = map_ref.call(out_ref)
+            target_ref = call_op.fetch(:result_refs).fetch(idx)
+            next if source_ref == target_ref
+
+            inlined_ops << {
+              kind: :alias, op_name: 'rhdl.alias', result_refs: [target_ref],
+              result_types: [call_op.fetch(:result_types).fetch(idx)], source_ref: source_ref
+            }
+          end
+
+          inlined_ops
+        end
+
+        # Rebuilds the op-name histogram from scratch: arc.define and func.func are
+        # set to the number of functions, hw.module and hw.output to 1, arc.output
+        # counts one per function, and every op adds one under its :op_name.
+        def recompute_op_counts(functions:, top_module:)
+          histogram = Hash.new(0)
+          histogram.update('arc.define' => functions.length, 'func.func' => functions.length)
+          histogram.update('hw.module' => 1, 'hw.output' => 1)
+          functions.each_value do |fn|
+            fn.fetch(:ops).each { |op| histogram[op.fetch(:op_name)] += 1 }
+            histogram['arc.output'] += 1
+          end
+          top_module.fetch(:ops).each { |op| histogram[op.fetch(:op_name)] += 1 }
+
+          histogram
+        end
+
+        def deep_copy(value) # Marshal round-trip: returns a copy of `value` that shares no nested objects with it
+          Marshal.load(Marshal.dump(value))
+        end
+
+        # Splits Arc MLIR text into its arc.define functions and top hw.module.
+        #
+        # A line starting with 'arc.define @' or 'hw.module @' opens a body that runs
+        # to the next line consisting solely of '}'; the header and the body's
+        # non-empty lines go to parse_define / parse_top_module. When several
+        # hw.module blocks are present the last one parsed becomes the top module.
+        # Lines starting with 'func.func' are only counted.
+        #
+        # @param text [String] Arc MLIR source
+        # @return [Hash] { functions: {name => fn}, top_module:, op_counts: }
+        # @raise [LoweringError] when no hw.module was found
+        def parse_arc_mlir(text)
+          lines = text.lines
+          functions = {}
+          top_module = nil
+          op_counts = Hash.new(0)
+          cursor = 0
+
+          # Gathers the body that follows the header at `cursor` and leaves `cursor`
+          # on the closing '}' line (or past the end when there is none).
+          take_body = lambda do
+            body = []
+            cursor += 1
+            while cursor < lines.length && (inner = clean_line(lines[cursor]).strip) != '}'
+              body << inner unless inner.empty?
+              cursor += 1
+            end
+            body
+          end
+
+          while cursor < lines.length
+            header = clean_line(lines[cursor]).strip
+            if header.start_with?('arc.define @')
+              fn = parse_define(header, take_body.call, op_counts)
+              functions[fn.fetch(:name)] = fn
+            elsif header.start_with?('hw.module @')
+              top_module = parse_top_module(header, take_body.call, op_counts)
+            elsif header.start_with?('func.func')
+              op_counts['func.func'] += 1
+            end
+            cursor += 1
+          end
+
+          raise LoweringError, 'ArcToGPU lowering could not find top hw.module in Arc MLIR' unless top_module
+
+          {
+            functions: functions,
+            top_module: top_module,
+            op_counts: op_counts
+          }
+        end
+
+        def parse_define(header, body_lines, op_counts)
+          match = header.match(/\Aarc\.define\s+@([A-Za-z0-9_.$-]+)\((.*)\)\s*->\s*(.+?)\s*\{\z/)
+          raise LoweringError, "Could not parse arc.define header: #{header}" unless match
+
+          name = match[1]
+          args = parse_arg_list(match[2])
+          return_types = parse_return_types(match[3])
+
+          ops = []
+          output_refs = nil
+
+          body_lines.each do |line|
+            if line.start_with?('arc.output ')
+              refs_raw, type_raw = line.sub('arc.output ', '').split(':', 2)
+              refs = split_top_level(refs_raw).map(&:strip)
+              types = parse_return_types(type_raw.to_s.strip)
+              output_refs = refs
+              op_counts['arc.output'] += 1
+              next
+            end
+
+            op = parse_assignment(line)
+            ops << op
+            op_counts[op.fetch(:op_name)] += 1
+          end
+
+          raise LoweringError, "arc.define @#{name} missing arc.output" unless output_refs
+
+          {
+            name: name,
+            args: args,
+            return_types: return_types,
+            ops: ops,
+            output_refs: output_refs
+          }
+        end
+
+        def parse_top_module(header, body_lines, op_counts)
+          name, inputs, outputs = parse_hw_module_signature(header)
+          raise LoweringError, "Could not parse hw.module signature: #{header}" if name.nil?
+
+          ops = []
+          hw_output_refs = nil
+
+          body_lines.each do |line|
+            if line.start_with?('hw.output ')
+              refs_raw, _type_raw = line.sub('hw.output ', '').split(':', 2)
+              hw_output_refs = split_top_level(refs_raw).map(&:strip)
+              op_counts['hw.output'] += 1
+              next
+            end
+
+            if line.start_with?('arc.memory_write_port ')
+              op = parse_memory_write_port(line)
+              ops << op
+              op_counts[op.fetch(:op_name)] += 1
+              next
+            end
+
+            if line.start_with?('seq.firmem.write_port ')
+              op = parse_seq_firmem_write_port(line)
+              ops << op
+              op_counts[op.fetch(:op_name)] += 1
+              next
+            end
+
+            op = parse_assignment(line, allow_arc_state: true)
+            ops << op
+            op_counts[op.fetch(:op_name)] += 1
+          end
+
+          raise LoweringError, 'Top hw.module missing hw.output' unless hw_output_refs
+          op_counts['hw.module'] += 1
+
+          {
+            name: name,
+            inputs: inputs,
+            outputs: outputs,
+            ops: ops,
+            hw_output_refs: hw_output_refs
+          }
+        end
+
+        def parse_assignment(line, allow_arc_state: false) # Parses one `%lhs = <op> ...` line into an op hash; raises LoweringError otherwise.
+          match = line.match(/\A(%[A-Za-z0-9_.$#-]+(?::\d+)?)\s*=\s*(.+)\z/) # lhs may carry a `:N` multi-result suffix
+          raise LoweringError, "Could not parse assignment: #{line}" unless match
+
+          lhs = match[1]
+          rhs = match[2]
+          result_refs = expand_lhs_result_refs(lhs)
+
+          if (m = rhs.match(/\Ahw\.constant\s+(.+?)(?:\s*:\s*(.+))?\z/))
+            value_raw = m[1].strip
+            type = m[2] ? parse_type(m[2].strip) : TypeRef.new(kind: :scalar, width: 1) # no `: type` suffix => 1-bit scalar
+            value = parse_constant_literal(value_raw, type)
+            return {
+              kind: :constant,
+              op_name: 'hw.constant',
+              result_refs: result_refs,
+              result_types: [type],
+              value: value
+            }
+          end
+
+          if (m = rhs.match(/\Aseq\.to_clock\s+(%[A-Za-z0-9_.$#-]+)\z/))
+            return {
+              kind: :to_clock,
+              op_name: 'seq.to_clock',
+              result_refs: result_refs,
+              result_types: [TypeRef.new(kind: :scalar, width: 1)], # the clock result is typed as a 1-bit scalar
+              input: m[1]
+            }
+          end
+
+          if (m = rhs.match(/\Aarc\.call\s+@([A-Za-z0-9_.$-]+)\((.*)\)\s*:\s*\((.*)\)\s*->\s*(.+)\z/))
+            callee = m[1]
+            args = parse_value_list(m[2])
+            return_types = parse_return_types(m[4]) # m[3] (argument types) is matched but unused
+            return {
+              kind: :arc_call,
+              op_name: 'arc.call',
+              result_refs: result_refs,
+              result_types: return_types,
+              callee: callee,
+              args: args
+            }
+          end
+
+          if allow_arc_state && (m = rhs.match(/\Aarc\.state\s+@([A-Za-z0-9_.$-]+)\((.*)\)\s+clock\s+(%[A-Za-z0-9_.$#-]+)(?:\s+enable\s+(%[A-Za-z0-9_.$#-]+))?(?:\s+reset\s+(%[A-Za-z0-9_.$#-]+))?\s+latency\s+(\d+)\s*:\s*\((.*)\)\s*->\s*(.+)\z/)) # state/memory forms are recognized only with allow_arc_state
+            callee = m[1]
+            args = parse_value_list(m[2])
+            clock_ref = m[3]
+            enable_ref = m[4] # nil when the optional `enable` operand is absent
+            reset_ref = m[5] # nil when the optional `reset` operand is absent
+            latency = m[6].to_i
+            return_types = parse_return_types(m[8].strip) # m[7] (argument types) is matched but unused
+            if return_types.length != result_refs.length
+              raise LoweringError,
+                "arc.state result arity mismatch: refs=#{result_refs.length}, types=#{return_types.length}"
+            end
+            return {
+              kind: :arc_state,
+              op_name: 'arc.state',
+              result_refs: result_refs,
+              result_types: return_types,
+              callee: callee,
+              args: args,
+              clock_ref: clock_ref,
+              enable_ref: enable_ref,
+              reset_ref: reset_ref,
+              latency: latency
+            }
+          end
+
+          if allow_arc_state && (m = rhs.match(/\Aseq\.firreg\s+(%[A-Za-z0-9_.$#-]+)\s+clock\s+(%[A-Za-z0-9_.$#-]+)\s+reset\s+(?:sync|async)\s+(%[A-Za-z0-9_.$#-]+)\s*,\s*(%[A-Za-z0-9_.$#-]+)(?:\s+\{[^{}]*\})?\s*:\s*(.+)\z/)) # firreg with reset; the sync/async keyword is not captured
+            source_ref = m[1]
+            clock_ref = m[2]
+            reset_ref = m[3]
+            reset_value_ref = m[4]
+            result_type = parse_type(m[5].strip)
+            return {
+              kind: :seq_firreg,
+              op_name: 'seq.firreg',
+              result_refs: result_refs,
+              result_types: [result_type],
+              source_ref: source_ref,
+              clock_ref: clock_ref,
+              reset_ref: reset_ref,
+              reset_value_ref: reset_value_ref
+            }
+          end
+
+          if allow_arc_state && (m = rhs.match(/\Aseq\.firreg\s+(%[A-Za-z0-9_.$#-]+)\s+clock\s+(%[A-Za-z0-9_.$#-]+)(?:\s+\{[^{}]*\})?\s*:\s*(.+)\z/)) # firreg without a reset clause
+            source_ref = m[1]
+            clock_ref = m[2]
+            result_type = parse_type(m[3].strip)
+            return {
+              kind: :seq_firreg,
+              op_name: 'seq.firreg',
+              result_refs: result_refs,
+              result_types: [result_type],
+              source_ref: source_ref,
+              clock_ref: clock_ref,
+              reset_ref: nil,
+              reset_value_ref: nil
+            }
+          end
+
+          if allow_arc_state && (m = rhs.match(/\Aseq\.firmem\s+.+\s*:\s*<\s*(\d+)\s*x\s*(\d+)\s*>\z/))
+            memory_type = parse_seq_firmem_type(length_text: m[1], width_text: m[2])
+            return {
+              kind: :arc_memory, # same kind as arc.memory; only op_name differs
+              op_name: 'seq.firmem',
+              result_refs: result_refs,
+              result_types: [memory_type],
+              memory_type: memory_type
+            }
+          end
+
+          if allow_arc_state && (m = rhs.match(/\Aseq\.firmem\.read_port\s+(%[A-Za-z0-9_.$#-]+)\[(%[A-Za-z0-9_.$#-]+)\]\s*,\s*clock\s+(%[A-Za-z0-9_.$#-]+)(?:\s+\{[^{}]*\})?\s*:\s*<\s*(\d+)\s*x\s*(\d+)\s*>\z/))
+            memory_ref = m[1]
+            index_ref = m[2]
+            memory_type = parse_seq_firmem_type(length_text: m[4], width_text: m[5]) # m[3] (clock ref) is matched but not stored
+            elem_type = memory_type.fetch(:element)
+            return {
+              kind: :arc_memory_read_port, # same kind as arc.memory_read_port; only op_name differs
+              op_name: 'seq.firmem.read_port',
+              result_refs: result_refs,
+              result_types: [elem_type],
+              memory_ref: memory_ref,
+              index_ref: index_ref,
+              memory_type: memory_type,
+              index_type: TypeRef.new(kind: :scalar, width: memory_type.fetch(:index_width))
+            }
+          end
+
+          if allow_arc_state && (m = rhs.match(/\Aarc\.memory\s+(.+)\z/))
+            memory_type = parse_type(m[1].strip)
+            unless memory_type.memory?
+              raise LoweringError, "arc.memory requires memory type, got #{m[1].strip}"
+            end
+            return {
+              kind: :arc_memory,
+              op_name: 'arc.memory',
+              result_refs: result_refs,
+              result_types: [memory_type],
+              memory_type: memory_type
+            }
+          end
+
+          if allow_arc_state && (m = rhs.match(/\Aarc\.memory_read_port\s+(%[A-Za-z0-9_.$#-]+)\[(%[A-Za-z0-9_.$#-]+)\]\s*:\s*(.+)\z/))
+            memory_ref = m[1]
+            index_ref = m[2]
+            memory_type = parse_type(m[3].strip)
+            unless memory_type.memory?
+              raise LoweringError, "arc.memory_read_port requires memory type, got #{m[3].strip}"
+            end
+            elem_type = memory_type.fetch(:element)
+            return {
+              kind: :arc_memory_read_port,
+              op_name: 'arc.memory_read_port',
+              result_refs: result_refs,
+              result_types: [elem_type],
+              memory_ref: memory_ref,
+              index_ref: index_ref,
+              memory_type: memory_type,
+              index_type: TypeRef.new(kind: :scalar, width: memory_type.fetch(:index_width))
+            }
+          end
+
+          if (m = rhs.match(/\Ahw\.array_create\s+(.+)\s*:\s*(.+)\z/))
+            operands = parse_value_list(m[1])
+            elem_type = parse_type(m[2].strip)
+            array_type = TypeRef.new(kind: :array, length: operands.length, element: elem_type) # length comes from the operand count
+            return {
+              kind: :array_create,
+              op_name: 'hw.array_create',
+              result_refs: result_refs,
+              result_types: [array_type],
+              operands: operands,
+              element_type: elem_type
+            }
+          end
+
+          if (m = rhs.match(/\Ahw\.array_get\s+(%[A-Za-z0-9_.$#-]+)\[(%[A-Za-z0-9_.$#-]+)\]\s*:\s*(!hw\.array<[^>]+>),\s*(.+)\z/)) # NOTE(review): `[^>]+` stops at the first `>`, so a nested array type would not match - confirm
+            array_ref = m[1]
+            index_ref = m[2]
+            array_type = parse_type(m[3])
+            index_type = parse_type(m[4])
+            elem_type = array_type.fetch(:element)
+            return {
+              kind: :array_get,
+              op_name: 'hw.array_get',
+              result_refs: result_refs,
+              result_types: [elem_type],
+              array_ref: array_ref,
+              index_ref: index_ref,
+              array_type: array_type,
+              index_type: index_type
+            }
+          end
+
+          if (m = rhs.match(/\Ahw\.aggregate_constant\s+\[(.*)\]\s*:\s*(.+)\z/))
+            values_raw = split_top_level(m[1])
+            out_type = parse_type(m[2].strip)
+            unless out_type.array?
+              raise LoweringError, 'hw.aggregate_constant currently only supported for !hw.array outputs'
+            end
+
+            elem_type = out_type.fetch(:element)
+            values = values_raw.map do |entry| # each entry is `value : type`
+              vm = entry.strip.match(/\A(.+?)\s*:\s*(.+)\z/)
+              raise LoweringError, "Could not parse hw.aggregate_constant element: #{entry}" unless vm
+
+              value = parse_constant_literal(vm[1].strip, elem_type)
+              value_type = parse_type(vm[2].strip)
+              unless value_type.scalar? && value_type.fetch(:width) == elem_type.fetch(:width) # entry width must equal the array element width
+                raise LoweringError,
+                  "hw.aggregate_constant element type mismatch: expected i#{elem_type.fetch(:width)}, got #{vm[2].strip}"
+              end
+              value
+            end
+
+            unless values.length == out_type.fetch(:length)
+              raise LoweringError,
+                "hw.aggregate_constant length mismatch: expected #{out_type.fetch(:length)}, got #{values.length}"
+            end
+
+            return {
+              kind: :aggregate_constant,
+              op_name: 'hw.aggregate_constant',
+              result_refs: result_refs,
+              result_types: [out_type],
+              values: values
+            }
+          end
+
+          if (m = rhs.match(/\Acomb\.icmp(?:\s+bin)?\s+([A-Za-z_]+)\s+(.+)\s*:\s*(.+)\z/)) # an optional `bin` keyword is accepted and dropped
+            predicate = m[1]
+            operands = parse_value_list(m[2])
+            return {
+              kind: :icmp,
+              op_name: 'comb.icmp',
+              result_refs: result_refs,
+              result_types: [TypeRef.new(kind: :scalar, width: 1)], # a comparison yields a 1-bit scalar
+              predicate: predicate,
+              operands: operands,
+              operand_types: parse_return_types(m[3])
+            }
+          end
+
+          if (m = rhs.match(/\Acomb\.concat\s+(.+)\s*:\s*(.+)\z/))
+            operands = parse_value_list(m[1])
+            operand_types = parse_return_types(m[2])
+            total_width = operand_types.sum { |t| t.fetch(:width) } # result width is the sum of the operand widths
+            return {
+              kind: :concat,
+              op_name: 'comb.concat',
+              result_refs: result_refs,
+              result_types: [TypeRef.new(kind: :scalar, width: total_width)],
+              operands: operands,
+              operand_types: operand_types
+            }
+          end
+
+          if (m = rhs.match(/\Acomb\.extract\s+(%[A-Za-z0-9_.$#-]+)(?:\s+\{[^{}]*\})?\s+from\s+(\d+)(?:\s+\{[^{}]*\})?\s*:\s*\((.+)\)\s*->\s*(.+)\z/)) # optional `{...}` attribute blocks are skipped
+            input = m[1]
+            from = m[2].to_i
+            input_type = parse_type(m[3].strip)
+            result_type = parse_type(m[4].strip)
+            return {
+              kind: :extract,
+              op_name: 'comb.extract',
+              result_refs: result_refs,
+              result_types: [result_type],
+              input: input,
+              from: from,
+              input_type: input_type
+            }
+          end
+
+          if (m = rhs.match(/\Acomb\.replicate\s+(%[A-Za-z0-9_.$#-]+)\s*:\s*\((.+)\)\s*->\s*(.+)\z/))
+            input = m[1]
+            input_type = parse_type(m[2].strip)
+            result_type = parse_type(m[3].strip)
+            return {
+              kind: :replicate,
+              op_name: 'comb.replicate',
+              result_refs: result_refs,
+              result_types: [result_type],
+              input: input,
+              input_type: input_type
+            }
+          end
+
+          if (m = rhs.match(/\Acomb\.(add|sub|mul|divu|modu|shl|shru|xor|or|and|mux)(?:\s+bin)?\s+(.+)\s*:\s*(.+)\z/)) # only the listed comb ops are accepted
+            op = m[1]
+            operands = parse_value_list(m[2])
+            result_type = parse_type(m[3].strip)
+            kind = if op == 'mux' # mux gets its own kind; every other op here shares :comb
+              :mux
+            else
+              :comb
+            end
+            return {
+              kind: kind,
+              op_name: "comb.#{op}",
+              comb_op: op,
+              result_refs: result_refs,
+              result_types: [result_type],
+              operands: operands
+            }
+          end
+
+          if (m = rhs.match(/\Asynth\.aig\.and_inv\s+(.+)\s*:\s*(.+)\z/))
+            operand_refs = []
+            invert_flags = [] # parallel to operand_refs: true where the operand was written `not %x`
+            split_top_level(m[1]).map(&:strip).each do |entry|
+              if (nm = entry.match(/\Anot\s+(%[A-Za-z0-9_.$#-]+)\b/))
+                operand_refs << nm[1]
+                invert_flags << true
+              elsif (vm = entry.match(/\A(%[A-Za-z0-9_.$#-]+)\b/))
+                operand_refs << vm[1]
+                invert_flags << false
+              else
+                raise LoweringError, "Could not parse synth.aig.and_inv operand: #{entry}"
+              end
+            end
+            result_type = parse_type(m[2].strip)
+            return {
+              kind: :synth_aig_and_inv,
+              op_name: 'synth.aig.and_inv',
+              result_refs: result_refs,
+              result_types: [result_type],
+              operands: operand_refs,
+              invert_flags: invert_flags
+            }
+          end
+
+          op_token = rhs.split(/\s+/, 2).first.to_s # first word of the rhs, used only for the error message
+          if op_token.include?('.')
+            raise LoweringError, "ArcToGPU lowering does not support ops: #{op_token}" # dotted token: report it as an unsupported op
+          end
+
+          raise LoweringError, "Unsupported Arc operation line: #{line}"
+        end
+
+        def parse_memory_write_port(line)
+          m = line.match(/\Aarc\.memory_write_port\s+(%[A-Za-z0-9_.$#-]+)\s*,\s*@([A-Za-z0-9_.$-]+)\((.*)\)\s+clock\s+(%[A-Za-z0-9_.$#-]+)(?:\s+(enable))?\s+latency\s+(\d+)\s*:\s*(.+)\z/)
+          raise LoweringError, "Could not parse arc.memory_write_port: #{line}" unless m
+
+          memory_ref = m[1]
+          callee = m[2]
+          args = parse_value_list(m[3])
+          clock_ref = m[4]
+          has_enable = !m[5].nil?
+          latency = m[6].to_i
+          type_parts = split_top_level(m[7])
+          raise LoweringError, "arc.memory_write_port missing type list: #{line}" if type_parts.empty?
+
+          memory_type = parse_type(type_parts.first.strip)
+          unless memory_type.memory?
+            raise LoweringError,
+              "arc.memory_write_port expected memory type first, got #{type_parts.first.strip}"
+          end
+
+          write_result_types = type_parts[1..].to_a.map { |part| parse_type(part.strip) }
+          if write_result_types.length < 3
+            raise LoweringError,
+              "arc.memory_write_port expects at least addr/data/we tuple, got #{write_result_types.length}"
+          end
+
+          {
+            kind: :arc_memory_write_port,
+            op_name: 'arc.memory_write_port',
+            result_refs: [],
+            result_types: [],
+            memory_ref: memory_ref,
+            memory_type: memory_type,
+            callee: callee,
+            args: args,
+            clock_ref: clock_ref,
+            has_enable: has_enable,
+            latency: latency,
+            write_result_types: write_result_types
+          }
+        end
+
+        def parse_seq_firmem_write_port(line)
+          m = line.match(
+            /\Aseq\.firmem\.write_port\s+(%[A-Za-z0-9_.$#-]+)\[(%[A-Za-z0-9_.$#-]+)\]\s*=\s*(%[A-Za-z0-9_.$#-]+)\s*,\s*clock\s+(%[A-Za-z0-9_.$#-]+)(?:\s+enable\s+(%[A-Za-z0-9_.$#-]+))?(?:\s+\{[^{}]*\})?\s*:\s*<\s*(\d+)\s*x\s*(\d+)\s*>\z/
+          )
+          raise LoweringError, "Could not parse seq.firmem.write_port: #{line}" unless m
+
+          memory_ref = m[1]
+          addr_ref = m[2]
+          data_ref = m[3]
+          clock_ref = m[4]
+          enable_ref = m[5]
+          memory_type = parse_seq_firmem_type(length_text: m[6], width_text: m[7])
+          {
+            kind: :seq_memory_write_port,
+            op_name: 'seq.firmem.write_port',
+            result_refs: [],
+            result_types: [],
+            memory_ref: memory_ref,
+            addr_ref: addr_ref,
+            data_ref: data_ref,
+            clock_ref: clock_ref,
+            enable_ref: enable_ref,
+            memory_type: memory_type
+          }
+        end
+
+        def parse_seq_firmem_type(length_text:, width_text:)
+          length = length_text.to_i
+          width = width_text.to_i
+          raise LoweringError, "Invalid seq.firmem length: #{length_text}" if length <= 0
+          raise LoweringError, "Invalid seq.firmem element width: #{width_text}" if width <= 0
+
+          TypeRef.new(
+            kind: :memory,
+            length: length,
+            element: TypeRef.new(kind: :scalar, width: width),
+            index_width: index_width_for_length(length)
+          )
+        end
+
+        def index_width_for_length(length)
+          return 1 if length <= 1
+
+          width = 0
+          value = length - 1
+          while value > 0
+            value >>= 1
+            width += 1
+          end
+          width
+        end
+
+        def parse_hw_module_signature(line)
+          match = line.match(/\Ahw\.module\s+@([A-Za-z0-9_.$-]+)\((.*)\)\s*\{\z/)
+          return [nil, [], []] unless match
+
+          name = match[1]
+          ports_raw = match[2]
+          inputs = []
+          outputs = []
+
+          split_top_level(ports_raw).each do |port|
+            if (m = port.match(/\Ain\s+%([A-Za-z0-9_.$-]+)\s*:\s*(.+)\z/))
+              inputs << { name: m[1], type: parse_type(m[2].strip) }
+              next
+            end
+            if (m = port.match(/\Aout\s+([A-Za-z0-9_.$-]+)\s*:\s*(.+)\z/))
+              outputs << { name: m[1], type: parse_type(m[2].strip) }
+            end
+          end
+
+          [name, inputs, outputs]
+        end
+
+        def parse_type(text)
+          t = text.strip
+          if (m = t.match(/\Ai(\d+)\z/))
+            return TypeRef.new(kind: :scalar, width: m[1].to_i)
+          end
+
+          return TypeRef.new(kind: :scalar, width: 1) if t == '!seq.clock'
+
+          if (m = t.match(/\A!hw\.array<(\d+)x(.+)>\z/))
+            len = m[1].to_i
+            elem = parse_type(m[2].strip)
+            return TypeRef.new(kind: :array, length: len, element: elem)
+          end
+
+          if (m = t.match(/\A(?:!arc\.memory)?<\s*(\d+)\s*x\s*(.+)\s*,\s*(i\d+)\s*>\z/))
+            len = m[1].to_i
+            elem = parse_type(m[2].strip)
+            idx = parse_type(m[3].strip)
+            unless idx.scalar?
+              raise LoweringError, "arc.memory index type must be scalar: #{m[3].strip}"
+            end
+            return TypeRef.new(kind: :memory, length: len, element: elem, index_width: idx.fetch(:width))
+          end
+
+          raise LoweringError, "Unsupported type in ArcToGPU lowering: #{text}"
+        end
+
+        def parse_return_types(text)
+          raw = text.to_s.strip
+          return [] if raw.empty?
+
+          if raw.start_with?('(') && raw.end_with?(')')
+            inner = raw[1..-2]
+            split_top_level(inner).map { |entry| parse_type(entry.strip) }
+          elsif split_top_level(raw).length > 1
+            split_top_level(raw).map { |entry| parse_type(entry.strip) }
+          else
+            [parse_type(raw)]
+          end
+        end
+
+        def parse_arg_list(text)
+          return [] if text.to_s.strip.empty?
+
+          split_top_level(text).map do |entry|
+            m = entry.strip.match(/\A(%[A-Za-z0-9_.$#-]+)\s*:\s*(.+)\z/)
+            raise LoweringError, "Could not parse arg entry: #{entry}" unless m
+
+            {
+              ref: m[1],
+              type: parse_type(m[2].strip)
+            }
+          end
+        end
+
+        def parse_value_list(text)
+          value_text = text.to_s.strip
+          return [] if value_text.empty?
+
+          split_top_level(value_text).map do |entry|
+            parse_value_ref(entry)
+          end
+        end
+
+        def parse_value_ref(entry)
+          m = entry.to_s.strip.match(/\A(%[A-Za-z0-9_.$#-]+)\b/)
+          raise LoweringError, "Could not parse SSA value reference: #{entry}" unless m
+
+          m[1]
+        end
+
+        def parse_constant_literal(value_raw, type)
+          token = value_raw.strip
+          return 1 if token == 'true'
+          return 0 if token == 'false'
+
+          value = begin
+            Integer(token, 0)
+          rescue ArgumentError
+            token.to_i
+          end
+          return mask_value(value, type.fetch(:width)) if type&.scalar?
+
+          value
+        end
+
+        def expand_lhs_result_refs(lhs)
+          if (m = lhs.match(/\A(%[A-Za-z0-9_.$#-]+):(\d+)\z/))
+            base = m[1]
+            count = m[2].to_i
+            return Array.new(count) { |idx| "#{base}##{idx}" }
+          end
+
+          [lhs]
+        end
+
+        def with_scalar_config(bits, pack_wide_scalars: false, narrow_scalar_types: nil)
+          prev = @scalar_width_bits
+          prev_pack = @pack_wide_scalars
+          prev_narrow = @narrow_scalar_types
+          @scalar_width_bits = bits.to_i > 32 ? 64 : DEFAULT_SCALAR_WIDTH_BITS
+          @pack_wide_scalars = !!pack_wide_scalars
+          @narrow_scalar_types = if narrow_scalar_types.nil?
+            ENV['RHDL_ARC_TO_GPU_NARROW_TYPES'] == '1'
+          else
+            !!narrow_scalar_types
+          end
+          yield
+        ensure
+          @scalar_width_bits = prev
+          @pack_wide_scalars = prev_pack
+          @narrow_scalar_types = prev_narrow
+        end
+
+        def scalar_width_bits
+          bits = @scalar_width_bits || DEFAULT_SCALAR_WIDTH_BITS
+          bits > 32 ? 64 : 32
+        end
+
+        def scalar_msl_type # Scalar type name interpolated into the generated Metal source.
+          scalar_width_bits > 32 ? 'ulong' : 'uint' # 'ulong' for the 64-bit configuration, 'uint' otherwise
+        end
+
+        def pack_wide_scalars? # True when @pack_wide_scalars is currently set to a truthy value.
+          !!@pack_wide_scalars # coerce to a strict boolean; an unset ivar reads as false
+        end
+
+        def scalar_zero_literal # Zero literal text, suffixed to match the active scalar width.
+          scalar_width_bits > 32 ? '0ul' : '0u' # 'ul' suffix for the 64-bit configuration, 'u' otherwise
+        end
+
+        def scalar_one_literal # One literal text, suffixed to match the active scalar width.
+          scalar_width_bits > 32 ? '1ul' : '1u' # 'ul' suffix for the 64-bit configuration, 'u' otherwise
+        end
+
+        def scalar_full_mask_const # All-ones hex literal text for the active scalar width.
+          scalar_width_bits > 32 ? '0xFFFFFFFFFFFFFFFFul' : '0xFFFFFFFFu' # 16 hex digits for 64-bit, 8 for 32-bit
+        end
+
+        def inferred_scalar_width_bits(parsed)
+          max_width = 1
+          visit_type = lambda do |type|
+            next if type.nil?
+
+            if type.scalar?
+              max_width = [max_width, type.fetch(:width)].max
+            elsif type.array?
+              visit_type.call(type.fetch(:element))
+            elsif type.memory?
+              visit_type.call(type.fetch(:element))
+            end
+          end
+
+          parsed.fetch(:functions).each_value do |fn|
+            fn.fetch(:args).each { |arg| visit_type.call(arg.fetch(:type)) }
+            fn.fetch(:return_types).each { |type| visit_type.call(type) }
+            fn.fetch(:ops).each do |op|
+              op.fetch(:result_types).each { |type| visit_type.call(type) }
+              visit_type.call(op[:array_type]) if op.key?(:array_type)
+            end
+          end
+
+          top = parsed.fetch(:top_module)
+          top.fetch(:inputs).each { |input| visit_type.call(input.fetch(:type)) }
+          top.fetch(:outputs).each { |output| visit_type.call(output.fetch(:type)) }
+          top.fetch(:ops).each do |op|
+            op.fetch(:result_types).each { |type| visit_type.call(type) }
+            visit_type.call(op[:array_type]) if op.key?(:array_type)
+          end
+
+          max_width > 32 ? 64 : 32
+        end
+
+        def build_state_layout(parsed, pack_wide_scalars: false) # Assigns consecutive slot indices to top-module registers and memories, in op order.
+          state_layout = []
+          next_index = 0 # first free slot; advanced by each entry's slot_count
+          parsed.fetch(:top_module).fetch(:ops).each do |op|
+            case op.fetch(:kind) # ops of any other kind get no layout entry
+            when :arc_state, :seq_firreg
+              if op.fetch(:kind) == :arc_state && op.fetch(:latency) != 1 # :latency is only fetched for arc.state ops
+                raise LoweringError, 'ArcToGPU lowering currently requires arc.state latency 1'
+              end
+
+              op.fetch(:result_refs).each_with_index do |ref, idx| # one layout entry per result
+                type = op.fetch(:result_types).fetch(idx)
+                unless type.scalar?
+                  raise LoweringError, 'ArcToGPU lowering only supports scalar state register outputs'
+                end
+
+                slot_count = if pack_wide_scalars && type.fetch(:width) > 32
+                  (type.fetch(:width) + 31) / 32 # ceil(width / 32) slots when packing wide values
+                else
+                  1
+                end
+
+                state_layout << {
+                  index: next_index,
+                  slot_count: slot_count,
+                  result_ref: ref,
+                  width: type.fetch(:width),
+                  kind: :arc_state, # seq.firreg results are recorded with this kind too
+                  type: type,
+                  callee: op[:callee], # nil when the op hash has no :callee key
+                  has_enable: !op[:enable_ref].nil?,
+                  has_reset: !op[:reset_ref].nil?
+                }
+                next_index += slot_count
+              end
+            when :arc_memory
+              ref = op.fetch(:result_refs).first # only the first result ref is used for a memory
+              memory_type = op.fetch(:memory_type)
+              element_type = memory_type.fetch(:element)
+              unless element_type.scalar?
+                raise LoweringError, 'ArcToGPU lowering only supports scalar arc.memory element types'
+              end
+
+              slots_per_element = if pack_wide_scalars && element_type.fetch(:width) > 32
+                (element_type.fetch(:width) + 31) / 32 # same ceil(width / 32) rule as for registers
+              else
+                1
+              end
+              slot_count = memory_type.fetch(:length) * slots_per_element # the whole memory occupies one contiguous range
+
+              state_layout << {
+                index: next_index,
+                slot_count: slot_count,
+                result_ref: ref,
+                width: element_type.fetch(:width),
+                kind: :arc_memory,
+                type: memory_type,
+                element_type: element_type,
+                length: memory_type.fetch(:length),
+                index_width: memory_type.fetch(:index_width),
+                slots_per_element: slots_per_element
+              }
+              next_index += slot_count
+            end
+          end
+          state_layout
+        end
+
+        def map_output_state_slots(parsed, state_layout)
+          ref_to_slot = {}
+          state_layout.each { |entry| ref_to_slot[entry.fetch(:result_ref)] = entry.fetch(:index) }
+
+          top = parsed.fetch(:top_module)
+          output_names = top.fetch(:outputs).map { |o| o.fetch(:name) }
+          output_refs = top.fetch(:hw_output_refs)
+
+          mapping = {}
+          output_names.each_with_index do |name, idx|
+            ref = output_refs[idx]
+            mapping[name] = ref_to_slot[ref] if ref_to_slot.key?(ref)
+          end
+
+          mapping
+        end
+
+        def count_clock_tracking_slots(ops)
+          ops.each_with_object(Set.new) do |op, refs|
+            next unless %i[arc_state seq_firreg arc_memory_write_port seq_memory_write_port].include?(op.fetch(:kind))
+
+            refs << op.fetch(:clock_ref)
+          end.length
+        end
+
+        def emit_metal_source(
+          parsed:,
+          state_layout:,
+          metal_entry:,
+          scalar_bits:,
+          pack_wide_scalars: false,
+          gem_kernel_interpreter: false,
+          use_state_snapshot: true,
+          split_post_comb_liveness: false,
+          trust_state_masks: false,
+          load_state_in_comb_fn: false,
+          eval_always_inline: false,
+          schedule_aware_emit: false
+        )
+          with_scalar_config(scalar_bits, pack_wide_scalars: pack_wide_scalars) do
+            top = parsed.fetch(:top_module)
+            functions = parsed.fetch(:functions)
+
+            array_types = collect_array_types(parsed)
+            fn_ret_structs = functions.values.select { |fn| fn.fetch(:return_types).length > 1 }
+
+            source = +""
+            source << "#include <metal_stdlib>\n"
+            source << "using namespace metal;\n\n"
+            source << emit_wide_helpers << "\n" if pack_wide_scalars?
+
+          source << "struct RhdlArcGpuIo {\n"
+          source << "  uint rst;\n"
+          source << "  uint clk;\n"
+          source << "  uint last_clk;\n"
+          source << "  uint mem_data_in;\n"
+          source << "  uint mem_data_out;\n"
+          source << "  uint mem_addr;\n"
+          source << "  uint mem_write_en;\n"
+          source << "  uint mem_read_en;\n"
+          source << "  uint pc_out;\n"
+          source << "  uint acc_out;\n"
+          source << "  uint sp_out;\n"
+          source << "  uint halted;\n"
+          source << "  uint state_out;\n"
+          source << "  uint zero_flag_out;\n"
+          source << "  uint cycle_budget;\n"
+          source << "  uint cycles_ran;\n"
+          source << "};\n\n"
+
+          array_types.each do |arr|
+            source << "struct #{array_struct_name(arr)} {\n"
+            source << "  #{array_element_metal_type(arr)} v[#{arr.fetch(:length)}];\n"
+            source << "};\n\n"
+          end
+
+          fn_ret_structs.each do |fn|
+            source << "struct #{ret_struct_name(fn.fetch(:name))} {\n"
+            fn.fetch(:return_types).each_with_index do |ret_type, idx|
+              source << "  #{metal_type_for(ret_type)} v#{idx};\n"
+            end
+            source << "};\n\n"
+          end
+
+          source << "struct #{top_output_struct_name(top.fetch(:name))} {\n"
+          top.fetch(:outputs).each do |out|
+            source << "  #{metal_type_for(out.fetch(:type))} #{sanitize_ident(out.fetch(:name))};\n"
+          end
+          source << "};\n\n"
+
+          source << "static inline __attribute__((always_inline)) #{scalar_msl_type} rhdl_mask_bits(#{scalar_msl_type} value, uint width) {\n"
+          source << "  if (width >= #{scalar_width_bits}u) { return value; }\n"
+          source << "  if (width == 0u) { return #{scalar_zero_literal}; }\n"
+          source << "  #{scalar_msl_type} mask = (#{scalar_one_literal} << width) - #{scalar_one_literal};\n"
+          source << "  return value & mask;\n"
+          source << "}\n\n"
+
+            functions.values.each do |fn|
+              source << emit_define_function(fn, functions)
+              source << "\n"
+            end
+
+            source << emit_top_eval_function(
+              top,
+              functions,
+              state_layout,
+              use_state_snapshot: use_state_snapshot,
+              split_post_comb_liveness: split_post_comb_liveness,
+              trust_state_masks: trust_state_masks,
+              load_state_in_comb_fn: load_state_in_comb_fn,
+              schedule_aware_emit: schedule_aware_emit,
+              schedule_phase_tag: "#{sanitize_ident(top.fetch(:name))}_main",
+              always_inline_eval: eval_always_inline
+            )
+            source << "\n"
+            source << emit_write_outputs_helper(top)
+            source << "\n"
+            source << emit_kernel(
+              top: top,
+              metal_entry: metal_entry,
+              state_layout: state_layout,
+              gem_kernel_interpreter: gem_kernel_interpreter
+            )
+
+            source
+          end
+        end
+
+        def emit_metal_source_apple2(parsed:, state_layout:, metal_entry:)
+          top = parsed.fetch(:top_module)
+          functions = parsed.fetch(:functions)
+          phase_split_enabled = ENV['RHDL_ARC_TO_GPU_PHASE_SPLIT'] == '1'
+          dirty_settle_enabled = ENV['RHDL_ARC_TO_GPU_DIRTY_SETTLE'] == '1'
+          full_eval_fn = top_eval_fn_name(top.fetch(:name))
+          update_loop_eval_fn = "#{full_eval_fn}_update_loop"
+          comb_loop_eval_fn = phase_split_enabled ? "#{full_eval_fn}_comb_loop" : update_loop_eval_fn
+          low_loop_eval_fn = phase_split_enabled ? comb_loop_eval_fn : "#{full_eval_fn}_low_loop"
+
+          array_types = collect_array_types(parsed)
+          fn_ret_structs = functions.values.select { |fn| fn.fetch(:return_types).length > 1 }
+
+          source = +""
+          source << "#include <metal_stdlib>\n"
+          source << "using namespace metal;\n\n"
+          source << emit_wide_helpers << "\n" if pack_wide_scalars?
+
+          source << "struct RhdlArcGpuIo {\n"
+          source << "  uint cycle_budget;\n"
+          source << "  uint cycles_ran;\n"
+          source << "  uint last_clock;\n"
+          source << "  uint prev_speaker;\n"
+          source << "  uint speaker_toggles;\n"
+          source << "  uint text_dirty;\n"
+          top.fetch(:inputs).each do |input|
+            source << "  uint #{sanitize_ident(input.fetch(:name))};\n"
+          end
+          top.fetch(:outputs).each do |out|
+            source << "  uint #{sanitize_ident(out.fetch(:name))};\n"
+          end
+          source << "};\n\n"
+
+          array_types.each do |arr|
+            source << "struct #{array_struct_name(arr)} {\n"
+            source << "  #{array_element_metal_type(arr)} v[#{arr.fetch(:length)}];\n"
+            source << "};\n\n"
+          end
+
+          fn_ret_structs.each do |fn|
+            source << "struct #{ret_struct_name(fn.fetch(:name))} {\n"
+            fn.fetch(:return_types).each_with_index do |ret_type, idx|
+              source << "  #{metal_type_for(ret_type)} v#{idx};\n"
+            end
+            source << "};\n\n"
+          end
+
+          source << "struct #{top_output_struct_name(top.fetch(:name))} {\n"
+          top.fetch(:outputs).each do |out|
+            source << "  #{metal_type_for(out.fetch(:type))} #{sanitize_ident(out.fetch(:name))};\n"
+          end
+          source << "};\n\n"
+
+          loop_step_struct = "#{sanitize_ident(top.fetch(:name))}_loop_step"
+          source << "struct #{loop_step_struct} {\n"
+          source << "  #{scalar_msl_type} ram_addr;\n"
+          source << "  #{scalar_msl_type} ram_we;\n"
+          source << "  #{scalar_msl_type} d;\n"
+          source << "  #{scalar_msl_type} speaker;\n"
+          source << "  #{scalar_msl_type} state_dirty;\n"
+          source << "};\n\n"
+
+          source << "static inline __attribute__((always_inline)) #{scalar_msl_type} rhdl_mask_bits(#{scalar_msl_type} value, uint width) {\n"
+          source << "  if (width >= #{scalar_width_bits}u) { return value; }\n"
+          source << "  if (width == 0u) { return #{scalar_zero_literal}; }\n"
+          source << "  #{scalar_msl_type} mask = (#{scalar_one_literal} << width) - #{scalar_one_literal};\n"
+          source << "  return value & mask;\n"
+          source << "}\n\n"
+
+          functions.values.each do |fn|
+            source << emit_define_function(fn, functions)
+            source << "\n"
+          end
+
+          if phase_split_enabled
+            source << emit_top_eval_function(
+              top,
+              functions,
+              state_layout,
+              state_address_space: 'thread',
+              use_state_snapshot: false,
+              fn_name: comb_loop_eval_fn,
+              output_names: %w[ram_addr ram_we d speaker],
+              out_struct: loop_step_struct,
+              compact_output_struct: true,
+              seed_all_outputs: true,
+              update_state: false,
+              extra_output_assignments: { 'state_dirty' => '0u' }
+            )
+            source << "\n"
+          end
+
+          unless phase_split_enabled
+            source << emit_top_eval_function(
+              top,
+              functions,
+              state_layout,
+              state_address_space: 'thread',
+              fn_name: low_loop_eval_fn,
+              output_names: %w[ram_addr ram_we d speaker],
+              out_struct: loop_step_struct,
+              compact_output_struct: true,
+              emit_post_comb: false,
+              update_state: true,
+              extra_output_assignments: { 'state_dirty' => '0u' }
+            )
+            source << "\n"
+          end
+
+          source << emit_top_eval_function(
+            top,
+            functions,
+            state_layout,
+            state_address_space: 'thread',
+            fn_name: update_loop_eval_fn,
+            output_names: %w[ram_addr ram_we d speaker],
+            out_struct: loop_step_struct,
+            compact_output_struct: true,
+            emit_post_comb: false,
+            update_state: true,
+            track_state_dirty: dirty_settle_enabled,
+            extra_output_assignments: { 'state_dirty' => (dirty_settle_enabled ? 'state_dirty' : '1u') }
+          )
+          source << "\n"
+
+          source << emit_top_eval_function(
+            top,
+            functions,
+            state_layout,
+            state_address_space: 'thread',
+            fn_name: full_eval_fn
+          )
+          source << "\n"
+          source << emit_write_outputs_helper(top)
+          source << "\n"
+          source << emit_kernel_apple2(
+            top: top,
+            metal_entry: metal_entry,
+            state_layout: state_layout,
+            low_eval_fn: low_loop_eval_fn,
+            comb_eval_fn: comb_loop_eval_fn,
+            update_eval_fn: update_loop_eval_fn,
+            phase_split_enabled: phase_split_enabled,
+            dirty_settle_enabled: dirty_settle_enabled,
+            full_eval_fn: full_eval_fn
+          )
+
+          source
+        end
+        # Assembles the Metal source for the RISC-V kernel: I/O struct, type structs, memory helpers, eval variants, kernel.
+        def emit_metal_source_riscv(
+          parsed:,
+          state_layout:,
+          metal_entry:,
+          dirty_settle_enabled: false, # adds a state_dirty field/assignment to the fast high eval
+          schedule_aware_emit: false, # forwarded to every emit_top_eval_function call below
+          split_low_wdata_eval: false, # emit a separate fast low eval for the WDATA outputs
+          split_high_data_addr_eval: false, # emit a separate fast high eval for the ADDR outputs
+          split_low_data_addr_eval: false, # emit a separate fast low eval for the ADDR outputs
+          runtime_output_names: nil # nil exposes every top output in RhdlArcGpuIo
+        )
+          top = parsed.fetch(:top_module)
+          functions = parsed.fetch(:functions)
+          cold_memory_layout = state_layout.select do |entry| # :arc_memory entries with at least 1024 slots
+            entry.fetch(:kind) == :arc_memory && entry.fetch(:slot_count, 1).to_i >= 1024
+          end
+          cold_memory_bases = cold_memory_layout.map { |entry| entry.fetch(:index) }.to_set
+
+          array_types = collect_array_types(parsed)
+          fn_ret_structs = functions.values.select { |fn| fn.fetch(:return_types).length > 1 }
+
+          source = +""
+          source << "#include <metal_stdlib>\n"
+          source << "using namespace metal;\n\n"
+          source << emit_wide_helpers << "\n" if pack_wide_scalars?
+          # I/O struct: four fixed words, then one uint per top input and per runtime-visible output.
+          source << "struct RhdlArcGpuIo {\n"
+          source << "  uint cycle_budget;\n"
+          source << "  uint cycles_ran;\n"
+          source << "  uint mem_mask;\n"
+          source << "  uint _reserved;\n"
+          top.fetch(:inputs).each do |input|
+            source << "  uint #{sanitize_ident(input.fetch(:name))};\n"
+          end
+          runtime_output_name_set =
+            if runtime_output_names
+              runtime_output_names.map(&:to_s).to_set
+            end
+          runtime_output_entries = top.fetch(:outputs).select do |out|
+            runtime_output_name_set.nil? || runtime_output_name_set.include?(out.fetch(:name))
+          end
+          runtime_output_entries.each do |out|
+            source << "  uint #{sanitize_ident(out.fetch(:name))};\n"
+          end
+          source << "};\n\n"
+
+          array_types.each do |arr|
+            source << "struct #{array_struct_name(arr)} {\n"
+            source << "  #{array_element_metal_type(arr)} v[#{arr.fetch(:length)}];\n"
+            source << "};\n\n"
+          end
+
+          fn_ret_structs.each do |fn|
+            source << "struct #{ret_struct_name(fn.fetch(:name))} {\n"
+            fn.fetch(:return_types).each_with_index do |ret_type, idx|
+              source << "  #{metal_type_for(ret_type)} v#{idx};\n"
+            end
+            source << "};\n\n"
+          end
+
+          source << "struct #{top_output_struct_name(top.fetch(:name))} {\n"
+          top.fetch(:outputs).each do |out|
+            source << "  #{metal_type_for(out.fetch(:type))} #{sanitize_ident(out.fetch(:name))};\n"
+          end
+          source << "};\n\n"
+          # rhdl_mask_bits keeps the low `width` bits of a scalar (pass-through at full width, zero at width 0).
+          source << "static inline __attribute__((always_inline)) #{scalar_msl_type} rhdl_mask_bits(#{scalar_msl_type} value, uint width) {\n"
+          source << "  if (width >= #{scalar_width_bits}u) { return value; }\n"
+          source << "  if (width == 0u) { return #{scalar_zero_literal}; }\n"
+          source << "  #{scalar_msl_type} mask = (#{scalar_one_literal} << width) - #{scalar_one_literal};\n"
+          source << "  return value & mask;\n"
+          source << "}\n\n"
+          source << emit_state_memory_helpers << "\n"
+          # Byte-memory helpers: addresses wrap with `& mask`; funct3 0/1 sign-extend, 4/5 zero-extend, 2 is a 32-bit word.
+          source << <<~MSL
+            static inline __attribute__((always_inline)) uint rhdl_read_word_le(device uchar* mem, uint mask, uint addr) {
+              uint a = addr & mask;
+              if (mask >= 3u && (a & 0x3u) == 0u && a <= (mask - 3u)) {
+                return *(reinterpret_cast<device uint*>(mem + a));
+              }
+              return uint(mem[a]) |
+                (uint(mem[(a + 1u) & mask]) << 8u) |
+                (uint(mem[(a + 2u) & mask]) << 16u) |
+                (uint(mem[(a + 3u) & mask]) << 24u);
+            }
+
+            static inline __attribute__((always_inline)) void rhdl_write_word_le(device uchar* mem, uint mask, uint addr, uint value) {
+              uint a = addr & mask;
+              if (mask >= 3u && (a & 0x3u) == 0u && a <= (mask - 3u)) {
+                *(reinterpret_cast<device uint*>(mem + a)) = value;
+                return;
+              }
+              mem[a] = uchar(value & 0xFFu);
+              mem[(a + 1u) & mask] = uchar((value >> 8u) & 0xFFu);
+              mem[(a + 2u) & mask] = uchar((value >> 16u) & 0xFFu);
+              mem[(a + 3u) & mask] = uchar((value >> 24u) & 0xFFu);
+            }
+
+            static inline __attribute__((always_inline)) uint rhdl_read_mem_funct3(device uchar* mem, uint mask, uint addr, uint funct3) {
+              uint a = addr & mask;
+              switch (funct3 & 0x7u) {
+                case 0u: {
+                  uint v = uint(mem[a]);
+                  return (v & 0x80u) != 0u ? (v | 0xFFFFFF00u) : v;
+                }
+                case 1u: {
+                  uint v = uint(mem[a]) | (uint(mem[(a + 1u) & mask]) << 8u);
+                  return (v & 0x8000u) != 0u ? (v | 0xFFFF0000u) : v;
+                }
+                case 2u:
+                  return rhdl_read_word_le(mem, mask, a);
+                case 4u:
+                  return uint(mem[a]);
+                case 5u:
+                  return uint(mem[a]) | (uint(mem[(a + 1u) & mask]) << 8u);
+                default:
+                  return 0u;
+              }
+            }
+
+            static inline __attribute__((always_inline)) void rhdl_write_mem_funct3(device uchar* mem, uint mask, uint addr, uint value, uint funct3) {
+              uint a = addr & mask;
+              switch (funct3 & 0x7u) {
+                case 0u:
+                case 4u:
+                  mem[a] = uchar(value & 0xFFu);
+                  break;
+                case 1u:
+                case 5u:
+                  mem[a] = uchar(value & 0xFFu);
+                  mem[(a + 1u) & mask] = uchar((value >> 8u) & 0xFFu);
+                  break;
+                case 2u:
+                  rhdl_write_word_le(mem, mask, a, value);
+                  break;
+                default:
+                  break;
+              }
+            }
+
+          MSL
+
+          functions.values.each do |fn|
+            source << emit_define_function(fn, functions)
+            source << "\n"
+          end
+          # Names of the generated eval functions and of their compact result structs.
+          full_eval_fn = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_full"
+          low_loop_eval_fn = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_low"
+          high_loop_eval_fn = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_high"
+          low_loop_eval_fn_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_low_fast"
+          low_loop_wdata_eval_fn_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_low_wdata_fast"
+          low_loop_data_addr_eval_fn_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_low_data_addr_fast"
+          high_loop_eval_fn_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_high_fast"
+          high_loop_data_addr_eval_fn_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_eval_high_data_addr_fast"
+          low_loop_step_struct = "#{sanitize_ident(top.fetch(:name))}_riscv_low_loop_step"
+          high_loop_step_struct = "#{sanitize_ident(top.fetch(:name))}_riscv_high_loop_step"
+          low_loop_step_struct_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_low_loop_step_fast"
+          low_loop_wdata_step_struct_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_low_loop_wdata_step_fast"
+          low_loop_data_addr_step_struct_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_low_loop_data_addr_step_fast"
+          high_loop_step_struct_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_high_loop_step_fast"
+          high_loop_data_addr_step_struct_fast = "#{sanitize_ident(top.fetch(:name))}_riscv_high_loop_data_addr_step_fast"
+          low_fast_output_names = REQUIRED_RISCV_FAST_LOOP_OUTPUTS.dup
+          low_fast_output_names -= REQUIRED_RISCV_FAST_LOOP_WDATA_OUTPUTS if split_low_wdata_eval
+          low_fast_output_names -= REQUIRED_RISCV_FAST_LOOP_ADDR_OUTPUTS if split_low_data_addr_eval
+          high_fast_output_names = split_high_data_addr_eval ? REQUIRED_RISCV_FAST_HIGH_LOOP_OUTPUTS_NO_ADDR : REQUIRED_RISCV_FAST_HIGH_LOOP_OUTPUTS
+          # Emits a struct with one field per named output found in the top module (unknown names are skipped) plus extras.
+          emit_step_struct = lambda do |struct_name, output_names, extra_fields = {}|
+            source << "struct #{struct_name} {\n"
+            emitted_field_count = 0
+            output_names.each do |name|
+              out = top.fetch(:outputs).find { |entry| entry.fetch(:name) == name }
+              next unless out
+
+              source << "  #{metal_type_for(out.fetch(:type))} #{sanitize_ident(name)};\n"
+              emitted_field_count += 1
+            end
+            extra_fields.each do |field_name, field_type|
+              source << "  #{field_type} #{sanitize_ident(field_name)};\n"
+              emitted_field_count += 1
+            end
+            if emitted_field_count.zero? # a struct with no fields gets a placeholder member instead
+              source << "  uint _unused;\n"
+            end
+            source << "};\n\n"
+          end
+
+          emit_step_struct.call(low_loop_step_struct, REQUIRED_RISCV_LOOP_OUTPUTS)
+          emit_step_struct.call(high_loop_step_struct, REQUIRED_RISCV_HIGH_LOOP_OUTPUTS)
+          emit_step_struct.call(low_loop_step_struct_fast, low_fast_output_names)
+          if split_low_wdata_eval
+            emit_step_struct.call(low_loop_wdata_step_struct_fast, REQUIRED_RISCV_FAST_LOOP_WDATA_OUTPUTS)
+          end
+          if split_low_data_addr_eval
+            emit_step_struct.call(low_loop_data_addr_step_struct_fast, REQUIRED_RISCV_FAST_LOOP_ADDR_OUTPUTS)
+          end
+          if split_high_data_addr_eval
+            emit_step_struct.call(high_loop_data_addr_step_struct_fast, REQUIRED_RISCV_FAST_HIGH_LOOP_ADDR_OUTPUTS)
+          end
+          emit_step_struct.call(
+            high_loop_step_struct_fast,
+            high_fast_output_names,
+            dirty_settle_enabled ? { 'state_dirty' => scalar_msl_type } : {}
+          )
+          # Eval variants: all use thread-space state, no state snapshot, trusted masks and device-space cold slots.
+          source << emit_top_eval_function(
+            top,
+            functions,
+            state_layout,
+            state_address_space: 'thread',
+            fn_name: low_loop_eval_fn,
+            output_names: REQUIRED_RISCV_LOOP_OUTPUTS,
+            out_struct: low_loop_step_struct,
+            compact_output_struct: true,
+            use_state_snapshot: false,
+            update_state: false,
+            sync_clock_slots_when_comb_only: false,
+            schedule_aware_emit: schedule_aware_emit,
+            schedule_phase_tag: 'riscv_low_eval',
+            trust_state_masks: true,
+            cold_memory_bases: cold_memory_bases,
+            cold_state_slots_address_space: 'device'
+          )
+          source << "\n"
+          source << emit_top_eval_function(
+            top,
+            functions,
+            state_layout,
+            state_address_space: 'thread',
+            fn_name: low_loop_eval_fn_fast,
+            output_names: low_fast_output_names,
+            out_struct: low_loop_step_struct_fast,
+            compact_output_struct: true,
+            use_state_snapshot: false,
+            update_state: false,
+            sync_clock_slots_when_comb_only: false,
+            schedule_aware_emit: schedule_aware_emit,
+            schedule_phase_tag: 'riscv_low_eval_fast',
+            trust_state_masks: true,
+            cold_memory_bases: cold_memory_bases,
+            cold_state_slots_address_space: 'device'
+          )
+          source << "\n"
+          if split_low_wdata_eval
+            source << emit_top_eval_function(
+              top,
+              functions,
+              state_layout,
+              state_address_space: 'thread',
+              fn_name: low_loop_wdata_eval_fn_fast,
+              output_names: REQUIRED_RISCV_FAST_LOOP_WDATA_OUTPUTS,
+              out_struct: low_loop_wdata_step_struct_fast,
+              compact_output_struct: true,
+              use_state_snapshot: false,
+              update_state: false,
+              sync_clock_slots_when_comb_only: false,
+              schedule_aware_emit: schedule_aware_emit,
+              schedule_phase_tag: 'riscv_low_eval_wdata_fast',
+              trust_state_masks: true,
+              cold_memory_bases: cold_memory_bases,
+              cold_state_slots_address_space: 'device'
+            )
+            source << "\n"
+          end
+          if split_low_data_addr_eval
+            source << emit_top_eval_function(
+              top,
+              functions,
+              state_layout,
+              state_address_space: 'thread',
+              fn_name: low_loop_data_addr_eval_fn_fast,
+              output_names: REQUIRED_RISCV_FAST_LOOP_ADDR_OUTPUTS,
+              out_struct: low_loop_data_addr_step_struct_fast,
+              compact_output_struct: true,
+              use_state_snapshot: false,
+              update_state: false,
+              sync_clock_slots_when_comb_only: false,
+              schedule_aware_emit: schedule_aware_emit,
+              schedule_phase_tag: 'riscv_low_eval_data_addr_fast',
+              trust_state_masks: true,
+              cold_memory_bases: cold_memory_bases,
+              cold_state_slots_address_space: 'device'
+            )
+            source << "\n"
+          end
+          source << emit_top_eval_function(
+            top,
+            functions,
+            state_layout,
+            state_address_space: 'thread',
+            fn_name: high_loop_eval_fn_fast,
+            output_names: high_fast_output_names,
+            out_struct: high_loop_step_struct_fast,
+            compact_output_struct: true,
+            use_state_snapshot: false,
+            update_state: true,
+            split_post_comb_liveness: true,
+            assume_rising_edges: false,
+            track_state_dirty: dirty_settle_enabled,
+            extra_output_assignments: (dirty_settle_enabled ? { 'state_dirty' => 'state_dirty' } : {}),
+            schedule_aware_emit: schedule_aware_emit,
+            schedule_phase_tag: 'riscv_high_eval_fast',
+            trust_state_masks: true,
+            cold_memory_bases: cold_memory_bases,
+            cold_state_slots_address_space: 'device'
+          )
+          source << "\n"
+          if split_high_data_addr_eval
+            source << emit_top_eval_function(
+              top,
+              functions,
+              state_layout,
+              state_address_space: 'thread',
+              fn_name: high_loop_data_addr_eval_fn_fast,
+              output_names: REQUIRED_RISCV_FAST_HIGH_LOOP_ADDR_OUTPUTS,
+              out_struct: high_loop_data_addr_step_struct_fast,
+              compact_output_struct: true,
+              use_state_snapshot: false,
+              update_state: false,
+              sync_clock_slots_when_comb_only: false,
+              schedule_aware_emit: schedule_aware_emit,
+              schedule_phase_tag: 'riscv_high_eval_data_addr_fast',
+              trust_state_masks: true,
+              cold_memory_bases: cold_memory_bases,
+              cold_state_slots_address_space: 'device'
+            )
+            source << "\n"
+          end
+          source << emit_top_eval_function(
+            top,
+            functions,
+            state_layout,
+            state_address_space: 'thread',
+            fn_name: high_loop_eval_fn,
+            output_names: REQUIRED_RISCV_HIGH_LOOP_OUTPUTS,
+            out_struct: high_loop_step_struct,
+            compact_output_struct: true,
+            use_state_snapshot: false,
+            update_state: true,
+            split_post_comb_liveness: true,
+            assume_rising_edges: false,
+            schedule_aware_emit: schedule_aware_emit,
+            schedule_phase_tag: 'riscv_high_eval',
+            trust_state_masks: true,
+            cold_memory_bases: cold_memory_bases,
+            cold_state_slots_address_space: 'device'
+          )
+          source << "\n"
+          source << emit_top_eval_function(
+            top,
+            functions,
+            state_layout,
+            state_address_space: 'thread',
+            fn_name: full_eval_fn,
+            use_state_snapshot: false,
+            update_state: false,
+            sync_clock_slots_when_comb_only: false,
+            schedule_aware_emit: schedule_aware_emit,
+            schedule_phase_tag: 'riscv_full_eval',
+            trust_state_masks: true,
+            cold_memory_bases: cold_memory_bases,
+            cold_state_slots_address_space: 'device'
+          )
+          source << "\n"
+          source << emit_write_outputs_helper(top, output_names: runtime_output_entries.map { |out| out.fetch(:name) })
+          source << "\n"
+          source << emit_kernel_riscv( # the kernel receives only the *_fast eval functions and structs
+            top: top,
+            metal_entry: metal_entry,
+            state_layout: state_layout,
+            low_eval_fn: low_loop_eval_fn_fast,
+            low_wdata_eval_fn: (split_low_wdata_eval ? low_loop_wdata_eval_fn_fast : nil),
+            low_data_addr_eval_fn: (split_low_data_addr_eval ? low_loop_data_addr_eval_fn_fast : nil),
+            high_eval_fn: high_loop_eval_fn_fast,
+            high_data_addr_eval_fn: (split_high_data_addr_eval ? high_loop_data_addr_eval_fn_fast : nil),
+            full_eval_fn: full_eval_fn,
+            low_loop_step_struct: low_loop_step_struct_fast,
+            low_wdata_step_struct: (split_low_wdata_eval ? low_loop_wdata_step_struct_fast : nil),
+            low_data_addr_step_struct: (split_low_data_addr_eval ? low_loop_data_addr_step_struct_fast : nil),
+            high_loop_step_struct: high_loop_step_struct_fast,
+            high_data_addr_step_struct: (split_high_data_addr_eval ? high_loop_data_addr_step_struct_fast : nil),
+            cold_memory_layout: cold_memory_layout,
+            fast_path: true,
+            dirty_settle_enabled: dirty_settle_enabled,
+            split_low_wdata_eval: split_low_wdata_eval,
+            split_high_data_addr_eval: split_high_data_addr_eval,
+            split_low_data_addr_eval: split_low_data_addr_eval
+          )
+
+          source
+        end
+
+        def emit_define_function(fn, functions)
+          fn_name = metal_fn_name(fn.fetch(:name))
+          ret_types = fn.fetch(:return_types)
+          args = fn.fetch(:args)
+
+          arg_decls = args.map { |a| "#{metal_type_for(a.fetch(:type))} #{ref_var_name(a.fetch(:ref))}" }.join(', ')
+          inline_spec = inline_qualifier(always_inline: prefer_always_inline_for_define?(fn))
+          header = if ret_types.length == 1
+            "#{inline_spec} #{metal_type_for(ret_types.first)} #{fn_name}(#{arg_decls})"
+          else
+            "#{inline_spec} #{ret_struct_name(fn.fetch(:name))} #{fn_name}(#{arg_decls})"
+          end
+
+          lines = []
+          type_map = {}
+          available_refs = Set.new
+          args.each do |arg|
+            ref = arg.fetch(:ref)
+            type_map[ref] = arg.fetch(:type)
+            available_refs << ref
+          end
+
+          sorted_ops, _sorted_type_map = topologically_sorted_ops(
+            ops: fn.fetch(:ops),
+            initial_type_map: type_map
+          )
+          live_ops, _live_refs = select_live_ops(
+            sorted_ops: sorted_ops,
+            seed_refs: fn.fetch(:output_refs)
+          )
+          schedule_ops_topologically(
+            ops: live_ops,
+            lines: lines,
+            type_map: type_map,
+            available_refs: available_refs,
+            functions: functions,
+            in_top_module: false
+          )
+
+          output_refs = fn.fetch(:output_refs)
+          if ret_types.length == 1
+            ref = output_refs.first
+            out_type = ret_types.first
+            lines << "return #{masked_expr(ref_var_name(ref), out_type)};"
+          else
+            struct_name = ret_struct_name(fn.fetch(:name))
+            lines << "#{struct_name} out;"
+            output_refs.each_with_index do |ref, idx|
+              out_type = ret_types[idx]
+              lines << "out.v#{idx} = #{masked_expr(ref_var_name(ref), out_type)};"
+            end
+            lines << 'return out;'
+          end
+
+          body = indent_lines(lines)
+          "#{header} {\n#{body}\n}\n"
+        end
+
+        def emit_top_eval_function(
+          top,
+          functions,
+          state_layout,
+          state_address_space: 'device',
+          use_state_snapshot: true,
+          fn_name: nil,
+          output_names: nil,
+          out_struct: nil,
+          compact_output_struct: false,
+          seed_all_outputs: false,
+          emit_post_comb: true,
+          update_state: true,
+          split_post_comb_liveness: false,
+          assume_rising_edges: false,
+          sync_clock_slots_when_comb_only: true,
+          track_state_dirty: false,
+          extra_output_assignments: {},
+          trust_state_masks: false,
+          load_state_in_comb_fn: false,
+          cold_memory_bases: nil,
+          cold_state_slots_address_space: 'device',
+          schedule_aware_emit: false,
+          schedule_phase_tag: nil,
+          always_inline_eval: false
+        )
+          fn_name ||= top_eval_fn_name(top.fetch(:name))
+          out_struct ||= top_output_struct_name(top.fetch(:name))
+          cold_memory_bases ||= Set.new
+          needs_cold_state_slots = !cold_memory_bases.empty?
+          comb_tag = sanitize_ident(fn_name)
+          comb_struct = "#{comb_tag}_comb_values"
+          comb_fn = "compute_#{comb_tag}_comb"
+          top_input_refs = top.fetch(:inputs).map { |input| "%#{input.fetch(:name)}" }
+          snapshot_prefix = use_state_snapshot ? 'state_old_' : nil
+          output_name_set = output_names ? output_names.map(&:to_s).to_set : nil
+          selected_output_entries = top.fetch(:outputs).each_with_index.select do |out, _idx|
+            output_name_set.nil? || output_name_set.include?(out.fetch(:name))
+          end
+          output_seed_entries = if seed_all_outputs
+            top.fetch(:outputs).each_with_index.to_a
+          else
+            selected_output_entries
+          end
+
+          state_ref_to_slot = {}
+          base_type_map = {}
+
+          top.fetch(:inputs).each do |input|
+            ref = "%#{input.fetch(:name)}"
+            base_type_map[ref] = input.fetch(:type)
+          end
+
+          top.fetch(:ops).each do |op|
+            next unless %i[arc_state seq_firreg arc_memory].include?(op.fetch(:kind))
+
+            op.fetch(:result_refs).each_with_index do |ref, idx|
+              type = op.fetch(:result_types).fetch(idx)
+              slot_entry = state_layout.find { |entry| entry.fetch(:result_ref) == ref }
+              state_ref_to_slot[ref] = {
+                index: slot_entry.fetch(:index),
+                type: type,
+                slot_count: slot_entry.fetch(:slot_count, 1),
+                length: slot_entry[:length],
+                slots_per_element: slot_entry[:slots_per_element]
+              }
+              base_type_map[ref] = type
+            end
+          end
+
+          clock_ref_to_slot = {}
+          next_clock_slot = state_layout.sum { |entry| entry.fetch(:slot_count, 1) }
+          top.fetch(:ops).each do |op|
+            next unless %i[arc_state seq_firreg arc_memory_write_port seq_memory_write_port].include?(op.fetch(:kind))
+
+            clock_ref = op.fetch(:clock_ref)
+            clock_slot = clock_ref_to_slot[clock_ref]
+            unless clock_slot
+              clock_slot = next_clock_slot
+              clock_ref_to_slot[clock_ref] = clock_slot
+              next_clock_slot += 1
+            end
+          end
+
+          needed_refs = []
+          if update_state
+            top.fetch(:ops).each do |op|
+              case op.fetch(:kind)
+              when :arc_state
+                needed_refs << op.fetch(:clock_ref)
+                needed_refs.concat(op.fetch(:args))
+                needed_refs << op.fetch(:enable_ref) if op.fetch(:enable_ref)
+                needed_refs << op.fetch(:reset_ref) if op.fetch(:reset_ref)
+              when :seq_firreg
+                needed_refs << op.fetch(:clock_ref)
+                needed_refs << op.fetch(:source_ref)
+                needed_refs << op.fetch(:reset_ref) if op.fetch(:reset_ref)
+                needed_refs << op.fetch(:reset_value_ref) if op.fetch(:reset_value_ref)
+              when :arc_memory_write_port
+                needed_refs << op.fetch(:clock_ref)
+                needed_refs.concat(op.fetch(:args))
+                needed_refs << op.fetch(:memory_ref)
+              when :seq_memory_write_port
+                needed_refs << op.fetch(:clock_ref)
+                needed_refs << op.fetch(:memory_ref)
+                needed_refs << op.fetch(:addr_ref)
+                needed_refs << op.fetch(:data_ref)
+                needed_refs << op.fetch(:enable_ref) if op.fetch(:enable_ref)
+              end
+            end
+          end
+          seed_outputs_in_comb_pre = !(split_post_comb_liveness && emit_post_comb && update_state)
+          if seed_outputs_in_comb_pre
+            output_seed_entries.each do |_out, idx|
+              needed_refs << top.fetch(:hw_output_refs)[idx]
+            end
+          end
+          needed_refs.uniq!
+
+          combinational_ops = top.fetch(:ops).reject do |op|
+            %i[arc_state seq_firreg arc_memory arc_memory_write_port seq_memory_write_port].include?(op.fetch(:kind))
+          end
+          sorted_ops, all_comb_type_map = topologically_sorted_ops(
+            ops: combinational_ops,
+            initial_type_map: base_type_map
+          )
+          all_type_map = base_type_map.merge(all_comb_type_map)
+          comb_produced_refs = sorted_ops.flat_map { |op| op.fetch(:result_refs) }.to_set
+          needed_comb_refs = needed_refs.select { |ref| comb_produced_refs.include?(ref) }
+          live_ops, live_comb_refs = select_live_ops(sorted_ops: sorted_ops, seed_refs: needed_comb_refs)
+
+          ordered_state_refs = (
+            needed_refs.select { |ref| state_ref_to_slot.key?(ref) } +
+            live_comb_refs.select { |ref| state_ref_to_slot.key?(ref) }
+          ).uniq.select do |ref|
+            info = state_ref_to_slot.fetch(ref)
+            info.fetch(:type).scalar?
+          end.sort_by { |ref| state_ref_to_slot.fetch(ref).fetch(:index) }
+
+          uses_memory_reads = live_ops.any? { |op| op.fetch(:kind) == :arc_memory_read_port }
+
+          comb_lines = []
+          runtime_type_map = base_type_map.dup
+          if load_state_in_comb_fn
+            ordered_state_refs.each do |ref|
+              info = state_ref_to_slot.fetch(ref)
+              comb_lines << "#{metal_type_for(info.fetch(:type))} #{ref_var_name(ref)} = #{state_load_expr(info, trust_state_masks: trust_state_masks)};"
+            end
+          end
+          emit_ops_with_optional_schedule(
+            ops: live_ops,
+            lines: comb_lines,
+            runtime_type_map: runtime_type_map,
+            functions: functions,
+            in_top_module: true,
+            state_ref_to_slot: state_ref_to_slot,
+            cold_memory_bases: cold_memory_bases,
+            cold_state_slots_var: (needs_cold_state_slots ? 'cold_state_slots' : nil),
+            schedule_aware_emit: schedule_aware_emit,
+            phase_tag: schedule_phase_tag || "#{comb_fn}_pre"
+          )
+
+          comb_lines << "#{comb_struct} comb;"
+          needed_comb_refs.each do |ref|
+            type = all_type_map.fetch(ref)
+            comb_lines << "comb.#{comb_field_name(ref)} = #{masked_expr(ref_var_name(ref), type)};"
+          end
+          comb_lines << 'return comb;'
+
+          compute_fn_text = +""
+          compute_fn_text << "struct #{comb_struct} {\n"
+          needed_comb_refs.each do |ref|
+            compute_fn_text << "  #{metal_type_for(all_type_map.fetch(ref))} #{comb_field_name(ref)};\n"
+          end
+          compute_fn_text << "};\n\n"
+
+          eval_input_arg_decls = top.fetch(:inputs).map do |input|
+            ref = "%#{input.fetch(:name)}"
+            "#{metal_type_for(input.fetch(:type))} #{ref_var_name(ref)}"
+          end
+          comb_arg_decls = eval_input_arg_decls.dup
+          if load_state_in_comb_fn
+            comb_arg_decls << "#{state_address_space} #{scalar_msl_type}* state_slots" if uses_memory_reads || !ordered_state_refs.empty?
+          else
+            comb_arg_decls.concat(
+              ordered_state_refs.map do |ref|
+                info = state_ref_to_slot.fetch(ref)
+                "#{metal_type_for(info.fetch(:type))} #{ref_var_name(ref)}"
+              end
+            )
+            comb_arg_decls << "#{state_address_space} #{scalar_msl_type}* state_slots" if uses_memory_reads
+          end
+          if uses_memory_reads && needs_cold_state_slots
+            comb_arg_decls << "#{cold_state_slots_address_space} #{scalar_msl_type}* cold_state_slots"
+          end
+          input_arg_exprs = top.fetch(:inputs).map { |input| ref_var_name("%#{input.fetch(:name)}") }
+          compute_fn_text << "#{inline_qualifier(always_inline: always_inline_eval)} #{comb_struct} #{comb_fn}(#{comb_arg_decls.join(', ')}) {\n"
+          compute_fn_text << indent_lines(comb_lines)
+          compute_fn_text << "\n}\n\n"
+
+          output_comb_struct = comb_struct
+          output_comb_fn = comb_fn
+          output_comb_state_refs = ordered_state_refs
+          output_comb_uses_memory_reads = uses_memory_reads
+
+          if split_post_comb_liveness && emit_post_comb && update_state
+            post_needed_refs = output_seed_entries.map { |_out, idx| top.fetch(:hw_output_refs)[idx] }.uniq
+            post_needed_comb_refs = post_needed_refs.select { |ref| comb_produced_refs.include?(ref) }
+            post_live_ops, post_live_comb_refs = select_live_ops(sorted_ops: sorted_ops, seed_refs: post_needed_comb_refs)
+
+            post_ordered_state_refs = (
+              post_needed_refs.select { |ref| state_ref_to_slot.key?(ref) } +
+              post_live_comb_refs.select { |ref| state_ref_to_slot.key?(ref) }
+            ).uniq.select do |ref|
+              info = state_ref_to_slot.fetch(ref)
+              info.fetch(:type).scalar?
+            end.sort_by { |ref| state_ref_to_slot.fetch(ref).fetch(:index) }
+
+            post_uses_memory_reads = post_live_ops.any? { |op| op.fetch(:kind) == :arc_memory_read_port }
+            post_comb_struct = "#{comb_tag}_post_comb_values"
+            post_comb_fn = "compute_#{comb_tag}_post_comb"
+            post_runtime_type_map = base_type_map.dup
+            post_comb_lines = []
+            if load_state_in_comb_fn
+              post_ordered_state_refs.each do |ref|
+                info = state_ref_to_slot.fetch(ref)
+                post_comb_lines << "#{metal_type_for(info.fetch(:type))} #{ref_var_name(ref)} = #{state_load_expr(info, trust_state_masks: trust_state_masks)};"
+              end
+            end
+            emit_ops_with_optional_schedule(
+              ops: post_live_ops,
+              lines: post_comb_lines,
+              runtime_type_map: post_runtime_type_map,
+              functions: functions,
+              in_top_module: true,
+              state_ref_to_slot: state_ref_to_slot,
+              cold_memory_bases: cold_memory_bases,
+              cold_state_slots_var: (needs_cold_state_slots ? 'cold_state_slots' : nil),
+              schedule_aware_emit: schedule_aware_emit,
+              phase_tag: schedule_phase_tag ? "#{schedule_phase_tag}_post" : "#{post_comb_fn}_post"
+            )
+
+            post_comb_lines << "#{post_comb_struct} comb;"
+            post_needed_comb_refs.each do |ref|
+              type = all_type_map.fetch(ref)
+              post_comb_lines << "comb.#{comb_field_name(ref)} = #{masked_expr(ref_var_name(ref), type)};"
+            end
+            post_comb_lines << 'return comb;'
+
+            post_comb_arg_decls = eval_input_arg_decls.dup
+            if load_state_in_comb_fn
+              post_comb_arg_decls << "#{state_address_space} #{scalar_msl_type}* state_slots" if post_uses_memory_reads || !post_ordered_state_refs.empty?
+            else
+              post_comb_arg_decls.concat(
+                post_ordered_state_refs.map do |ref|
+                  info = state_ref_to_slot.fetch(ref)
+                  "#{metal_type_for(info.fetch(:type))} #{ref_var_name(ref)}"
+                end
+              )
+              post_comb_arg_decls << "#{state_address_space} #{scalar_msl_type}* state_slots" if post_uses_memory_reads
+            end
+            if post_uses_memory_reads && needs_cold_state_slots
+              post_comb_arg_decls << "#{cold_state_slots_address_space} #{scalar_msl_type}* cold_state_slots"
+            end
+
+            compute_fn_text << "struct #{post_comb_struct} {\n"
+            post_needed_comb_refs.each do |ref|
+              compute_fn_text << "  #{metal_type_for(all_type_map.fetch(ref))} #{comb_field_name(ref)};\n"
+            end
+            compute_fn_text << "};\n\n"
+            compute_fn_text << "#{inline_qualifier(always_inline: always_inline_eval)} #{post_comb_struct} #{post_comb_fn}(#{post_comb_arg_decls.join(', ')}) {\n"
+            compute_fn_text << indent_lines(post_comb_lines)
+            compute_fn_text << "\n}\n\n"
+
+            output_comb_struct = post_comb_struct
+            output_comb_fn = post_comb_fn
+            output_comb_state_refs = post_ordered_state_refs
+            output_comb_uses_memory_reads = post_uses_memory_reads
+          end
+
+          eval_lines = []
+          dirty_var = track_state_dirty ? 'state_dirty' : nil
+          eval_lines << "#{scalar_msl_type} #{dirty_var} = 0u;" if dirty_var
+
+          comb_state_args = if load_state_in_comb_fn
+            []
+          else
+            ordered_state_refs.map do |ref|
+              state_load_expr(state_ref_to_slot.fetch(ref), trust_state_masks: trust_state_masks)
+            end
+          end
+
+          if update_state
+            force_rising_edges = assume_rising_edges && clock_ref_to_slot.length == 1
+            if use_state_snapshot
+              ordered_state_refs.each do |ref|
+                info = state_ref_to_slot.fetch(ref)
+                eval_lines << "#{metal_type_for(info.fetch(:type))} #{snapshot_prefix}#{info.fetch(:index)} = #{state_load_expr(info, trust_state_masks: trust_state_masks)};"
+              end
+            end
+            comb_pre_state_args = if load_state_in_comb_fn
+              []
+            else
+              ordered_state_refs.map do |ref|
+                info = state_ref_to_slot.fetch(ref)
+                use_state_snapshot ? "#{snapshot_prefix}#{info.fetch(:index)}" : state_load_expr(info, trust_state_masks: trust_state_masks)
+              end
+            end
+            comb_pre_args = input_arg_exprs + comb_pre_state_args
+            comb_pre_args << 'state_slots' if uses_memory_reads || (load_state_in_comb_fn && !ordered_state_refs.empty?)
+            comb_pre_args << 'cold_state_slots' if uses_memory_reads && needs_cold_state_slots
+            eval_lines << "#{comb_struct} comb_pre = #{comb_fn}(#{comb_pre_args.join(', ')});"
+
+            clock_rising_var_by_ref = {}
+            unless force_rising_edges
+              clock_ref_to_slot.each do |clock_ref, clock_slot|
+                clock_expr = value_expr_for_ref(
+                  clock_ref,
+                  type_map: all_type_map,
+                  state_ref_to_slot: state_ref_to_slot,
+                  comb_var: 'comb_pre',
+                  state_snapshot_prefix: snapshot_prefix,
+                  top_input_refs: top_input_refs,
+                  trust_state_masks: trust_state_masks
+                )
+                eval_lines << "#{scalar_msl_type} clock_prev_#{clock_slot} = rhdl_mask_bits(state_slots[#{clock_slot}], 1u);"
+                eval_lines << "#{scalar_msl_type} clock_now_#{clock_slot} = (#{clock_expr} & 1u);"
+                eval_lines << "#{scalar_msl_type} rising_#{clock_slot} = ((clock_prev_#{clock_slot} ^ clock_now_#{clock_slot}) & clock_now_#{clock_slot}) & 1u;"
+                eval_lines << "state_slots[#{clock_slot}] = clock_now_#{clock_slot};"
+                clock_rising_var_by_ref[clock_ref] = "rising_#{clock_slot}"
+              end
+            end
+
+            active_clock_ref = nil
+            top.fetch(:ops).each_with_index do |op, op_idx|
+              case op.fetch(:kind)
+              when :arc_state
+                clock_ref = op.fetch(:clock_ref)
+                unless force_rising_edges
+                  if active_clock_ref != clock_ref
+                    eval_lines << '}' if active_clock_ref
+                    rising_var = clock_rising_var_by_ref.fetch(clock_ref)
+                    eval_lines << "if (#{rising_var} != 0u) {"
+                    active_clock_ref = clock_ref
+                  end
+                end
+
+                slot_infos = op.fetch(:result_refs).each_with_index.map do |ref, idx|
+                  info = state_ref_to_slot.fetch(ref)
+                  { index: info.fetch(:index), type: op.fetch(:result_types).fetch(idx) }
+                end
+
+                emit_state_update = lambda do |indent|
+                  arg_exprs = op.fetch(:args).map do |arg_ref|
+                    value_expr_for_ref(
+                      arg_ref,
+                      type_map: all_type_map,
+                      state_ref_to_slot: state_ref_to_slot,
+                      comb_var: 'comb_pre',
+                      state_snapshot_prefix: snapshot_prefix,
+                      top_input_refs: top_input_refs,
+                      trust_state_masks: trust_state_masks
+                    )
+                  end
+                  call_expr = generate_call_expr(
+                    callee: op.fetch(:callee),
+                    args: op.fetch(:args),
+                    result_types: op.fetch(:result_types),
+                    type_map: all_type_map,
+                    functions: functions,
+                    temp_prefix: "state_#{slot_infos.first.fetch(:index)}_next",
+                    arg_exprs: arg_exprs
+                  )
+                  call_expr.fetch(:setup_lines).each { |line| eval_lines << "#{indent}#{line}" }
+                  slot_infos.each_with_index do |slot_info, idx|
+                    store_expr = masked_expr(call_expr.fetch(:result_exprs)[idx], slot_info.fetch(:type))
+                    eval_lines.concat(
+                      emit_state_store_lines(
+                        slot_info: slot_info,
+                        value_expr: store_expr,
+                        indent: indent,
+                        dirty_var: dirty_var
+                      )
+                    )
+                  end
+                end
+
+                if op.fetch(:reset_ref)
+                  reset_expr = value_expr_for_ref(
+                    op.fetch(:reset_ref),
+                    type_map: all_type_map,
+                    state_ref_to_slot: state_ref_to_slot,
+                    comb_var: 'comb_pre',
+                    state_snapshot_prefix: snapshot_prefix,
+                    top_input_refs: top_input_refs,
+                    trust_state_masks: trust_state_masks
+                  )
+                  eval_lines << "  if ((#{reset_expr} & 1u) != 0u) {"
+                  slot_infos.each do |slot_info|
+                    eval_lines.concat(
+                      emit_state_store_lines(
+                        slot_info: slot_info,
+                        value_expr: constant_literal(0, slot_info.fetch(:type)),
+                        indent: '    ',
+                        dirty_var: dirty_var
+                      )
+                    )
+                  end
+                  eval_lines << '  } else {'
+                  if op.fetch(:enable_ref)
+                    enable_expr = value_expr_for_ref(
+                      op.fetch(:enable_ref),
+                      type_map: all_type_map,
+                      state_ref_to_slot: state_ref_to_slot,
+                      comb_var: 'comb_pre',
+                      state_snapshot_prefix: snapshot_prefix,
+                      top_input_refs: top_input_refs,
+                      trust_state_masks: trust_state_masks
+                    )
+                    eval_lines << "    if ((#{enable_expr} & 1u) != 0u) {"
+                    emit_state_update.call('      ')
+                    eval_lines << '    }'
+                  else
+                    emit_state_update.call('    ')
+                  end
+                  eval_lines << '  }'
+                else
+                  if op.fetch(:enable_ref)
+                    enable_expr = value_expr_for_ref(
+                      op.fetch(:enable_ref),
+                      type_map: all_type_map,
+                      state_ref_to_slot: state_ref_to_slot,
+                      comb_var: 'comb_pre',
+                      state_snapshot_prefix: snapshot_prefix,
+                      top_input_refs: top_input_refs,
+                      trust_state_masks: trust_state_masks
+                    )
+                    eval_lines << "  if ((#{enable_expr} & 1u) != 0u) {"
+                    emit_state_update.call('    ')
+                    eval_lines << '  }'
+                  else
+                    emit_state_update.call('  ')
+                  end
+                end
+              when :seq_firreg
+                clock_ref = op.fetch(:clock_ref)
+                unless force_rising_edges
+                  if active_clock_ref != clock_ref
+                    eval_lines << '}' if active_clock_ref
+                    rising_var = clock_rising_var_by_ref.fetch(clock_ref)
+                    eval_lines << "if (#{rising_var} != 0u) {"
+                    active_clock_ref = clock_ref
+                  end
+                end
+
+                slot_info = begin
+                  ref = op.fetch(:result_refs).first
+                  info = state_ref_to_slot.fetch(ref)
+                  { index: info.fetch(:index), type: op.fetch(:result_types).first }
+                end
+
+                source_expr = value_expr_for_ref(
+                  op.fetch(:source_ref),
+                  type_map: all_type_map,
+                  state_ref_to_slot: state_ref_to_slot,
+                  comb_var: 'comb_pre',
+                  state_snapshot_prefix: snapshot_prefix,
+                  top_input_refs: top_input_refs,
+                  trust_state_masks: trust_state_masks
+                )
+                source_store_expr = masked_expr(source_expr, slot_info.fetch(:type))
+
+                if op.fetch(:reset_ref)
+                  reset_expr = value_expr_for_ref(
+                    op.fetch(:reset_ref),
+                    type_map: all_type_map,
+                    state_ref_to_slot: state_ref_to_slot,
+                    comb_var: 'comb_pre',
+                    state_snapshot_prefix: snapshot_prefix,
+                    top_input_refs: top_input_refs,
+                    trust_state_masks: trust_state_masks
+                  )
+                  reset_value_expr = value_expr_for_ref(
+                    op.fetch(:reset_value_ref),
+                    type_map: all_type_map,
+                    state_ref_to_slot: state_ref_to_slot,
+                    comb_var: 'comb_pre',
+                    state_snapshot_prefix: snapshot_prefix,
+                    top_input_refs: top_input_refs,
+                    trust_state_masks: trust_state_masks
+                  )
+                  reset_store_expr = masked_expr(reset_value_expr, slot_info.fetch(:type))
+                  eval_lines << "  if ((#{reset_expr} & 1u) != 0u) {"
+                  eval_lines.concat(
+                    emit_state_store_lines(
+                      slot_info: slot_info,
+                      value_expr: reset_store_expr,
+                      indent: '    ',
+                      dirty_var: dirty_var
+                    )
+                  )
+                  eval_lines << '  } else {'
+                  eval_lines.concat(
+                    emit_state_store_lines(
+                      slot_info: slot_info,
+                      value_expr: source_store_expr,
+                      indent: '    ',
+                      dirty_var: dirty_var
+                    )
+                  )
+                  eval_lines << '  }'
+                else
+                  eval_lines.concat(
+                    emit_state_store_lines(
+                      slot_info: slot_info,
+                      value_expr: source_store_expr,
+                      indent: '  ',
+                      dirty_var: dirty_var
+                    )
+                  )
+                end
+              when :arc_memory_write_port
+                clock_ref = op.fetch(:clock_ref)
+                unless force_rising_edges
+                  if active_clock_ref != clock_ref
+                    eval_lines << '}' if active_clock_ref
+                    rising_var = clock_rising_var_by_ref.fetch(clock_ref)
+                    eval_lines << "if (#{rising_var} != 0u) {"
+                    active_clock_ref = clock_ref
+                  end
+                end
+
+                memory_info = state_ref_to_slot.fetch(op.fetch(:memory_ref))
+                memory_type = memory_info.fetch(:type)
+                element_type = memory_type.fetch(:element)
+
+                arg_exprs = op.fetch(:args).map do |arg_ref|
+                  value_expr_for_ref(
+                    arg_ref,
+                    type_map: all_type_map,
+                    state_ref_to_slot: state_ref_to_slot,
+                    comb_var: 'comb_pre',
+                    state_snapshot_prefix: snapshot_prefix,
+                    top_input_refs: top_input_refs,
+                    trust_state_masks: trust_state_masks
+                  )
+                end
+                call_expr = generate_call_expr(
+                  callee: op.fetch(:callee),
+                  args: op.fetch(:args),
+                  result_types: op.fetch(:write_result_types),
+                  type_map: all_type_map,
+                  functions: functions,
+                  temp_prefix: "memwrite_#{memory_info.fetch(:index)}_#{op_idx}",
+                  arg_exprs: arg_exprs
+                )
+                call_expr.fetch(:setup_lines).each { |line| eval_lines << line }
+
+                index_type = TypeRef.new(kind: :scalar, width: memory_type.fetch(:index_width))
+                addr_expr = masked_expr(call_expr.fetch(:result_exprs)[0], index_type)
+                data_expr = masked_expr(call_expr.fetch(:result_exprs)[1], element_type)
+                write_enable_expr = masked_expr(call_expr.fetch(:result_exprs)[2], TypeRef.new(kind: :scalar, width: 1))
+
+                eval_lines << "  if ((#{write_enable_expr} & 1u) != 0u) {"
+                    if wide_scalar?(element_type)
+                      target_state_slots = if needs_cold_state_slots && cold_memory_bases.include?(memory_info.fetch(:index))
+                        'cold_state_slots'
+                      else
+                        'state_slots'
+                      end
+                      eval_lines << "    rhdl_write_memory_wide(#{target_state_slots}, #{memory_info.fetch(:index)}u, #{memory_info.fetch(:length)}u, #{addr_expr}, #{data_expr}, #{element_type.fetch(:width)}u);"
+                    else
+                      target_state_slots = if needs_cold_state_slots && cold_memory_bases.include?(memory_info.fetch(:index))
+                        'cold_state_slots'
+                      else
+                        'state_slots'
+                      end
+                      eval_lines << "    rhdl_write_memory_scalar(#{target_state_slots}, #{memory_info.fetch(:index)}u, #{memory_info.fetch(:length)}u, #{addr_expr}, #{data_expr}, #{element_type.fetch(:width)}u);"
+                    end
+                eval_lines << '  }'
+              when :seq_memory_write_port
+                clock_ref = op.fetch(:clock_ref)
+                unless force_rising_edges
+                  if active_clock_ref != clock_ref
+                    eval_lines << '}' if active_clock_ref
+                    rising_var = clock_rising_var_by_ref.fetch(clock_ref)
+                    eval_lines << "if (#{rising_var} != 0u) {"
+                    active_clock_ref = clock_ref
+                  end
+                end
+
+                memory_info = state_ref_to_slot.fetch(op.fetch(:memory_ref))
+                memory_type = memory_info.fetch(:type)
+                element_type = memory_type.fetch(:element)
+                index_type = TypeRef.new(kind: :scalar, width: memory_type.fetch(:index_width))
+
+                addr_expr = value_expr_for_ref(
+                  op.fetch(:addr_ref),
+                  type_map: all_type_map,
+                  state_ref_to_slot: state_ref_to_slot,
+                  comb_var: 'comb_pre',
+                  state_snapshot_prefix: snapshot_prefix,
+                  top_input_refs: top_input_refs,
+                  trust_state_masks: trust_state_masks
+                )
+                data_expr = value_expr_for_ref(
+                  op.fetch(:data_ref),
+                  type_map: all_type_map,
+                  state_ref_to_slot: state_ref_to_slot,
+                  comb_var: 'comb_pre',
+                  state_snapshot_prefix: snapshot_prefix,
+                  top_input_refs: top_input_refs,
+                  trust_state_masks: trust_state_masks
+                )
+                write_enable_expr = if op.fetch(:enable_ref)
+                  enable_expr = value_expr_for_ref(
+                    op.fetch(:enable_ref),
+                    type_map: all_type_map,
+                    state_ref_to_slot: state_ref_to_slot,
+                    comb_var: 'comb_pre',
+                    state_snapshot_prefix: snapshot_prefix,
+                    top_input_refs: top_input_refs,
+                    trust_state_masks: trust_state_masks
+                  )
+                  masked_expr(enable_expr, TypeRef.new(kind: :scalar, width: 1))
+                else
+                  scalar_one_literal
+                end
+
+                masked_addr_expr = masked_expr(addr_expr, index_type)
+                masked_data_expr = masked_expr(data_expr, element_type)
+                eval_lines << "  if ((#{write_enable_expr} & 1u) != 0u) {"
+                if wide_scalar?(element_type)
+                  target_state_slots = if needs_cold_state_slots && cold_memory_bases.include?(memory_info.fetch(:index))
+                    'cold_state_slots'
+                  else
+                    'state_slots'
+                  end
+                  eval_lines << "    rhdl_write_memory_wide(#{target_state_slots}, #{memory_info.fetch(:index)}u, #{memory_info.fetch(:length)}u, #{masked_addr_expr}, #{masked_data_expr}, #{element_type.fetch(:width)}u);"
+                else
+                  target_state_slots = if needs_cold_state_slots && cold_memory_bases.include?(memory_info.fetch(:index))
+                    'cold_state_slots'
+                  else
+                    'state_slots'
+                  end
+                  eval_lines << "    rhdl_write_memory_scalar(#{target_state_slots}, #{memory_info.fetch(:index)}u, #{memory_info.fetch(:length)}u, #{masked_addr_expr}, #{masked_data_expr}, #{element_type.fetch(:width)}u);"
+                end
+                eval_lines << '  }'
+              end
+            end
+            eval_lines << '}' if active_clock_ref && !force_rising_edges
+          end
+
+          output_comb_var =
+            if emit_post_comb
+              comb_args = if load_state_in_comb_fn
+                input_arg_exprs.dup
+              else
+                input_arg_exprs + output_comb_state_refs.map do |ref|
+                  state_load_expr(state_ref_to_slot.fetch(ref), trust_state_masks: trust_state_masks)
+                end
+              end
+              comb_args << 'state_slots' if output_comb_uses_memory_reads || (load_state_in_comb_fn && !output_comb_state_refs.empty?)
+              comb_args << 'cold_state_slots' if output_comb_uses_memory_reads && needs_cold_state_slots
+              eval_lines << "#{output_comb_struct} comb = #{output_comb_fn}(#{comb_args.join(', ')});"
+              'comb'
+            elsif update_state
+              'comb_pre'
+            else
+              comb_args = input_arg_exprs + comb_state_args
+              comb_args << 'state_slots' if uses_memory_reads || (load_state_in_comb_fn && !ordered_state_refs.empty?)
+              comb_args << 'cold_state_slots' if uses_memory_reads && needs_cold_state_slots
+              eval_lines << "#{comb_struct} comb = #{comb_fn}(#{comb_args.join(', ')});"
+              'comb'
+            end
+          if !update_state && sync_clock_slots_when_comb_only
+            clock_ref_to_slot.each do |clock_ref, clock_slot|
+              clock_expr = value_expr_for_ref(
+                clock_ref,
+                type_map: all_type_map,
+                state_ref_to_slot: state_ref_to_slot,
+                comb_var: 'comb',
+                top_input_refs: top_input_refs,
+                trust_state_masks: trust_state_masks
+              )
+              eval_lines << "state_slots[#{clock_slot}] = (#{clock_expr} & 1u);"
+            end
+          end
+          eval_lines << "#{out_struct} out;"
+          if compact_output_struct
+            selected_output_entries.each do |out, idx|
+              out_name = out.fetch(:name)
+              ref = top.fetch(:hw_output_refs)[idx]
+              out_expr = value_expr_for_ref(
+                ref,
+                type_map: all_type_map,
+                state_ref_to_slot: state_ref_to_slot,
+                comb_var: output_comb_var,
+                top_input_refs: top_input_refs,
+                trust_state_masks: trust_state_masks
+              )
+              out_linesafe = masked_expr(out_expr, out.fetch(:type))
+              eval_lines << "out.#{sanitize_ident(out_name)} = #{out_linesafe};"
+            end
+          else
+            top.fetch(:outputs).each_with_index do |out, idx|
+              out_name = out.fetch(:name)
+              if output_name_set && !output_name_set.include?(out_name)
+                eval_lines << "out.#{sanitize_ident(out_name)} = #{constant_literal(0, out.fetch(:type))};"
+                next
+              end
+
+              ref = top.fetch(:hw_output_refs)[idx]
+              out_expr = value_expr_for_ref(
+                ref,
+                type_map: all_type_map,
+                state_ref_to_slot: state_ref_to_slot,
+                comb_var: output_comb_var,
+                top_input_refs: top_input_refs,
+                trust_state_masks: trust_state_masks
+              )
+              out_linesafe = masked_expr(out_expr, out.fetch(:type))
+              eval_lines << "out.#{sanitize_ident(out_name)} = #{out_linesafe};"
+            end
+          end
+          extra_output_assignments.each do |field_name, expr|
+            eval_lines << "out.#{sanitize_ident(field_name)} = #{expr};"
+          end
+          eval_lines << 'return out;'
+
+          eval_fn_arg_decls = eval_input_arg_decls.dup
+          eval_fn_arg_decls << "#{state_address_space} #{scalar_msl_type}* state_slots"
+          if needs_cold_state_slots
+            eval_fn_arg_decls << "#{cold_state_slots_address_space} #{scalar_msl_type}* cold_state_slots"
+          end
+          compute_fn_text +
+            "#{inline_qualifier(always_inline: always_inline_eval)} #{out_struct} #{fn_name}(#{eval_fn_arg_decls.join(', ')}) {\n#{indent_lines(eval_lines)}\n}\n"
+        end
+
+        def emit_write_outputs_helper(top, output_names: nil)
+          out_struct = top_output_struct_name(top.fetch(:name))
+          fn_name = "write_#{sanitize_ident(top.fetch(:name))}_outputs"
+
+          lines = []
+          output_name_set = output_names ? output_names.map(&:to_s).to_set : nil
+          selected_outputs = top.fetch(:outputs).select do |out|
+            output_name_set.nil? || output_name_set.include?(out.fetch(:name))
+          end
+          selected_outputs.each do |out|
+            name = sanitize_ident(out.fetch(:name))
+            if wide_scalar?(out.fetch(:type))
+              lines << "io->#{name} = out.#{name}.x;"
+            else
+              lines << "io->#{name} = out.#{name};"
+            end
+          end
+
+          <<~MSL
+            static inline __attribute__((always_inline)) void #{fn_name}(device RhdlArcGpuIo* io, #{out_struct} out) {
+            #{indent_lines(lines)}
+            }
+          MSL
+        end
+
+        def emit_kernel(top:, metal_entry:, state_layout:, gem_kernel_interpreter: false)
+          eval_fn = top_eval_fn_name(top.fetch(:name))
+          out_struct = top_output_struct_name(top.fetch(:name))
+          write_fn = "write_#{sanitize_ident(top.fetch(:name))}_outputs"
+          clock_slots = count_clock_tracking_slots(top.fetch(:ops))
+          state_slot_count = state_layout.sum { |entry| entry.fetch(:slot_count, 1) } + clock_slots
+
+          if gem_kernel_interpreter
+            return <<~MSL
+              static inline __attribute__((always_inline)) uint rhdl_gem_read_io_word(uint field, device RhdlArcGpuIo* io) {
+                switch (field) {
+                  case 0u: return io->rst;
+                  case 1u: return io->clk;
+                  case 2u: return io->last_clk;
+                  case 3u: return io->mem_data_in;
+                  case 4u: return io->mem_data_out;
+                  case 5u: return io->mem_addr;
+                  case 6u: return io->mem_write_en;
+                  case 7u: return io->mem_read_en;
+                  case 8u: return io->pc_out;
+                  case 9u: return io->acc_out;
+                  case 10u: return io->sp_out;
+                  case 11u: return io->halted;
+                  case 12u: return io->state_out;
+                  case 13u: return io->zero_flag_out;
+                  case 14u: return io->cycle_budget;
+                  case 15u: return io->cycles_ran;
+                  default: return 0u;
+                }
+              }
+
+              static inline __attribute__((always_inline)) void rhdl_gem_write_io_word(uint field, uint value, device RhdlArcGpuIo* io) {
+                switch (field) {
+                  case 0u: io->rst = value & 0x1u; break;
+                  case 1u: io->clk = value & 0x1u; break;
+                  case 2u: io->last_clk = value & 0x1u; break;
+                  case 3u: io->mem_data_in = value & 0xFFu; break;
+                  case 4u: io->mem_data_out = value & 0xFFu; break;
+                  case 5u: io->mem_addr = value & 0xFFFFu; break;
+                  case 6u: io->mem_write_en = value & 0x1u; break;
+                  case 7u: io->mem_read_en = value & 0x1u; break;
+                  case 8u: io->pc_out = value & 0xFFFFu; break;
+                  case 9u: io->acc_out = value & 0xFFu; break;
+                  case 10u: io->sp_out = value & 0xFFu; break;
+                  case 11u: io->halted = value & 0x1u; break;
+                  case 12u: io->state_out = value & 0xFFu; break;
+                  case 13u: io->zero_flag_out = value & 0x1u; break;
+                  case 14u: io->cycle_budget = value; break;
+                  case 15u: io->cycles_ran = value; break;
+                  default: break;
+                }
+              }
+
+              static inline __attribute__((always_inline)) uint rhdl_gem_decode_extern_descriptor(
+                uint desc,
+                device #{scalar_msl_type}* state_slots,
+                device RhdlArcGpuIo* io) {
+                uint desc_kind = desc & 0x7u;
+                switch (desc_kind) {
+                  case 0u: {
+                    return (desc >> 3u) & 0x1u;
+                  }
+                  case 1u: {
+                    uint state_index = (desc >> 3u) & 0x3FFu;
+                    uint bit_index = (desc >> 13u) & 0x3Fu;
+                    if (state_index < #{state_slot_count}u && bit_index < 32u) {
+                      return (uint(state_slots[state_index]) >> bit_index) & 0x1u;
+                    }
+                    return 0u;
+                  }
+                  case 2u: {
+                    uint field = (desc >> 3u) & 0xFFu;
+                    uint bit_index = (desc >> 11u) & 0x3Fu;
+                    if (bit_index < 32u) {
+                      return (rhdl_gem_read_io_word(field, io) >> bit_index) & 0x1u;
+                    }
+                    return 0u;
+                  }
+                  case 4u:
+                  case 5u: {
+                    uint lhs_state_index = (desc >> 3u) & 0x3FFu;
+                    uint rhs_state_index = (desc >> 13u) & 0x3FFu;
+                    uint bit_index = (desc >> 23u) & 0x3Fu;
+                    if (lhs_state_index < #{state_slot_count}u && rhs_state_index < #{state_slot_count}u && bit_index < 32u) {
+                      uint lhs = uint(state_slots[lhs_state_index]) & 0xFFu;
+                      uint rhs = uint(state_slots[rhs_state_index]) & 0xFFu;
+                      uint result = 0u;
+                      if (rhs != 0u) {
+                        result = desc_kind == 4u ? (lhs / rhs) : (lhs % rhs);
+                      }
+                      return (result >> bit_index) & 0x1u;
+                    }
+                    return 0u;
+                  }
+                  default: {
+                    return 0u;
+                  }
+                }
+              }
+
+              static inline __attribute__((always_inline)) uint rhdl_gem_decode_src(
+                uint packed,
+                thread uchar* node_vals,
+                device const uint* gem_instr,
+                uint gem_flags,
+                uint extern_off,
+                uint extern_count,
+                uint extern_desc_off,
+                uint extern_desc_count,
+                device #{scalar_msl_type}* state_slots,
+                device RhdlArcGpuIo* io,
+                thread const uint* extern_values,
+                uint extern_value_count) {
+                constexpr uint kGemNodeCap = 4096u;
+                uint inv = packed & 1u;
+                uint kind = (packed >> 1u) & 1u;
+                uint id = packed >> 2u;
+                uint value = 0u;
+                if (kind == 0u) {
+                  if (id < kGemNodeCap) {
+                    value = uint(node_vals[id]) & 1u;
+                  }
+                } else {
+                  if (id < extern_value_count) {
+                    value = extern_values[id] & 0x1u;
+                  } else if ((gem_flags & 0x8u) != 0u && id < extern_desc_count) {
+                    uint desc = gem_instr[extern_desc_off + 1u + id];
+                    value = rhdl_gem_decode_extern_descriptor(desc, state_slots, io);
+                  } else if ((gem_flags & 0x4u) != 0u) {
+                    if (id < extern_count) {
+                      value = gem_instr[extern_off + 1u + id] & 1u;
+                    }
+                  } else {
+                    value = id & 1u;
+                  }
+                }
+                return (value ^ inv) & 1u;
+              }
+
+              static inline __attribute__((always_inline)) void rhdl_gem_fill_extern_values(
+                device const uint* gem_instr,
+                uint gem_flags,
+                uint extern_off,
+                uint extern_count,
+                uint extern_desc_off,
+                uint extern_desc_count,
+                device #{scalar_msl_type}* state_slots,
+                device RhdlArcGpuIo* io,
+                thread uint* extern_values,
+                uint extern_value_count) {
+                for (uint e = 0u; e < extern_value_count; ++e) {
+                  uint value = 0u;
+                  if ((gem_flags & 0x8u) != 0u && e < extern_desc_count) {
+                    uint desc = gem_instr[extern_desc_off + 1u + e];
+                    value = rhdl_gem_decode_extern_descriptor(desc, state_slots, io);
+                  } else if ((gem_flags & 0x4u) != 0u && e < extern_count) {
+                    value = gem_instr[extern_off + 1u + e] & 1u;
+                  } else {
+                    value = e & 1u;
+                  }
+                  extern_values[e] = value & 0x1u;
+                }
+              }
+
+              static inline __attribute__((always_inline)) void rhdl_gem_eval_nodes(
+                device const uint* gem_instr,
+                uint instr_count,
+                uint gem_flags,
+                uint extern_off,
+                uint extern_count,
+                uint extern_desc_off,
+                uint extern_desc_count,
+                device #{scalar_msl_type}* state_slots,
+                device RhdlArcGpuIo* io,
+                thread const uint* extern_values,
+                uint extern_value_count,
+                thread uchar* node_vals) {
+                constexpr uint kGemNodeCap = 4096u;
+                for (uint idx = 0u; idx < kGemNodeCap; ++idx) {
+                  node_vals[idx] = 0u;
+                }
+                for (uint idx = 0u; idx < instr_count; ++idx) {
+                  uint off = 2u + (idx * 4u);
+                  uint dst = gem_instr[off];
+                  if (dst >= kGemNodeCap) {
+                    continue;
+                  }
+                  uint src0_packed = gem_instr[off + 1u];
+                  uint src1_packed = gem_instr[off + 2u];
+                  uint src0 = rhdl_gem_decode_src(
+                    src0_packed, node_vals, gem_instr, gem_flags,
+                    extern_off, extern_count, extern_desc_off, extern_desc_count,
+                    state_slots, io, extern_values, extern_value_count);
+                  uint src1 = rhdl_gem_decode_src(
+                    src1_packed, node_vals, gem_instr, gem_flags,
+                    extern_off, extern_count, extern_desc_off, extern_desc_count,
+                    state_slots, io, extern_values, extern_value_count);
+                  uint value = (src0 & src1) & 1u;
+                  node_vals[dst] = uchar(value);
+                }
+              }
+
+              static inline __attribute__((always_inline)) uint rhdl_gem_materialize_word(
+                device const uint* gem_instr,
+                uint source_off,
+                uint width,
+                thread uchar* node_vals,
+                uint gem_flags,
+                uint extern_off,
+                uint extern_count,
+                uint extern_desc_off,
+                uint extern_desc_count,
+                device #{scalar_msl_type}* state_slots,
+                device RhdlArcGpuIo* io,
+                thread const uint* extern_values,
+                uint extern_value_count) {
+                uint word = 0u;
+                uint bit_count = width > 32u ? 32u : width;
+                for (uint bit = 0u; bit < bit_count; ++bit) {
+                  uint packed = gem_instr[source_off + bit];
+                  uint value = rhdl_gem_decode_src(
+                    packed, node_vals, gem_instr, gem_flags,
+                    extern_off, extern_count, extern_desc_off, extern_desc_count,
+                    state_slots, io, extern_values, extern_value_count);
+                  word |= (value & 0x1u) << bit;
+                }
+                return word;
+              }
+
+              static inline __attribute__((always_inline)) uint rhdl_gem_execute_shadow(
+                device const uint* gem_instr,
+                device #{scalar_msl_type}* state_slots,
+                device RhdlArcGpuIo* io,
+                thread uint* watch_bits) {
+                constexpr uint kGemNodeCap = 4096u;
+                uint instr_count = gem_instr[0];
+                uint gem_flags = gem_instr[1];
+                bool emit_shadow_hash = (gem_flags & 0x2u) != 0u;
+                if (instr_count > kGemNodeCap) {
+                  instr_count = kGemNodeCap;
+                }
+                uint watch_off = 2u + (instr_count * 4u);
+                uint watch_count = gem_instr[watch_off];
+                if (watch_count > 32u) {
+                  watch_count = 32u;
+                }
+                uint control_off = watch_off + 1u + watch_count;
+                uint control_count = gem_instr[control_off];
+                if (control_count > 32u) {
+                  control_count = 32u;
+                }
+                uint extern_off = control_off + 1u + control_count;
+                uint extern_count = gem_instr[extern_off];
+                if (extern_count > 16384u) {
+                  extern_count = 16384u;
+                }
+                uint extern_desc_off = extern_off + 1u + extern_count;
+                uint extern_desc_count = gem_instr[extern_desc_off];
+                if (extern_desc_count > 16384u) {
+                  extern_desc_count = 16384u;
+                }
+                uint watch_eval_off = extern_desc_off + 1u + extern_desc_count;
+                uint watch_eval_count = gem_instr[watch_eval_off];
+                if (watch_eval_count > instr_count) {
+                  watch_eval_count = instr_count;
+                }
+                bool use_watch_subset = ((gem_flags & 0x1u) != 0u) && watch_eval_count > 0u;
+                constexpr uint kGemExternValueCap = 512u;
+                thread uint extern_values[kGemExternValueCap];
+                uint extern_value_count = extern_desc_count > extern_count ? extern_desc_count : extern_count;
+                if (extern_value_count > kGemExternValueCap) {
+                  extern_value_count = kGemExternValueCap;
+                }
+                for (uint e = 0u; e < extern_value_count; ++e) {
+                  uint value = 0u;
+                  if ((gem_flags & 0x8u) != 0u && e < extern_desc_count) {
+                    uint desc = gem_instr[extern_desc_off + 1u + e];
+                    uint desc_kind = desc & 0x3u;
+                    switch (desc_kind) {
+                      case 0u: {
+                        value = (desc >> 2u) & 0x1u;
+                        break;
+                      }
+                      case 1u: {
+                        uint state_index = (desc >> 2u) & 0xFFFFu;
+                        uint bit_index = (desc >> 18u) & 0x3Fu;
+                        if (state_index < #{state_slot_count}u && bit_index < 32u) {
+                          value = (uint(state_slots[state_index]) >> bit_index) & 0x1u;
+                        }
+                        break;
+                      }
+                      case 2u: {
+                        uint field = (desc >> 2u) & 0xFFu;
+                        uint bit_index = (desc >> 10u) & 0x3Fu;
+                        if (bit_index < 32u) {
+                          value = (rhdl_gem_read_io_word(field, io) >> bit_index) & 0x1u;
+                        }
+                        break;
+                      }
+                      default: {
+                        value = 0u;
+                        break;
+                      }
+                    }
+                  } else if ((gem_flags & 0x4u) != 0u && e < extern_count) {
+                    value = gem_instr[extern_off + 1u + e] & 1u;
+                  } else {
+                    value = e & 1u;
+                  }
+                  extern_values[e] = value & 0x1u;
+                }
+
+                thread uchar node_vals[kGemNodeCap];
+                if (use_watch_subset) {
+                  for (uint wi = 0u; wi < watch_eval_count; ++wi) {
+                    uint idx = gem_instr[watch_eval_off + 1u + wi];
+                    if (idx >= instr_count) {
+                      continue;
+                    }
+                    uint off = 2u + (idx * 4u);
+                    uint dst = gem_instr[off];
+                    if (dst < kGemNodeCap) {
+                      node_vals[dst] = 0u;
+                    }
+                  }
+                } else {
+                  for (uint i = 0u; i < kGemNodeCap; ++i) {
+                    node_vals[i] = 0u;
+                  }
+                }
+
+                uint shadow = 0u;
+                if (use_watch_subset) {
+                  if (emit_shadow_hash) {
+                    for (uint wi = 0u; wi < watch_eval_count; ++wi) {
+                      uint idx = gem_instr[watch_eval_off + 1u + wi];
+                      if (idx >= instr_count) {
+                        continue;
+                      }
+                      uint off = 2u + (idx * 4u);
+                      uint dst = gem_instr[off];
+                      uint src0_packed = gem_instr[off + 1u];
+                      uint src1_packed = gem_instr[off + 2u];
+                      if (dst >= kGemNodeCap) {
+                        continue;
+                      }
+
+                      uint src0 = rhdl_gem_decode_src(
+                        src0_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint src1 = rhdl_gem_decode_src(
+                        src1_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint value = (src0 & src1) & 1u;
+                      node_vals[dst] = uchar(value);
+                      shadow = ((shadow << 1u) | (shadow >> 31u)) ^ (value << (idx & 31u));
+                    }
+                  } else {
+                    for (uint wi = 0u; wi < watch_eval_count; ++wi) {
+                      uint idx = gem_instr[watch_eval_off + 1u + wi];
+                      if (idx >= instr_count) {
+                        continue;
+                      }
+                      uint off = 2u + (idx * 4u);
+                      uint dst = gem_instr[off];
+                      uint src0_packed = gem_instr[off + 1u];
+                      uint src1_packed = gem_instr[off + 2u];
+                      if (dst >= kGemNodeCap) {
+                        continue;
+                      }
+
+                      uint src0 = rhdl_gem_decode_src(
+                        src0_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint src1 = rhdl_gem_decode_src(
+                        src1_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint value = (src0 & src1) & 1u;
+                      node_vals[dst] = uchar(value);
+                    }
+                  }
+                } else {
+                  if (emit_shadow_hash) {
+                    for (uint idx = 0u; idx < instr_count; ++idx) {
+                      uint off = 2u + (idx * 4u);
+                      uint dst = gem_instr[off];
+                      uint src0_packed = gem_instr[off + 1u];
+                      uint src1_packed = gem_instr[off + 2u];
+                      if (dst >= kGemNodeCap) {
+                        continue;
+                      }
+
+                      uint src0 = rhdl_gem_decode_src(
+                        src0_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint src1 = rhdl_gem_decode_src(
+                        src1_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint value = (src0 & src1) & 1u;
+                      node_vals[dst] = uchar(value);
+                      shadow = ((shadow << 1u) | (shadow >> 31u)) ^ (value << (idx & 31u));
+                    }
+                  } else {
+                    for (uint idx = 0u; idx < instr_count; ++idx) {
+                      uint off = 2u + (idx * 4u);
+                      uint dst = gem_instr[off];
+                      uint src0_packed = gem_instr[off + 1u];
+                      uint src1_packed = gem_instr[off + 2u];
+                      if (dst >= kGemNodeCap) {
+                        continue;
+                      }
+
+                      uint src0 = rhdl_gem_decode_src(
+                        src0_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint src1 = rhdl_gem_decode_src(
+                        src1_packed, node_vals, gem_instr, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint value = (src0 & src1) & 1u;
+                      node_vals[dst] = uchar(value);
+                    }
+                  }
+                }
+
+                if (watch_bits != nullptr) {
+                  uint watch = 0u;
+                  for (uint w = 0u; w < watch_count; ++w) {
+                    uint packed = gem_instr[watch_off + 1u + w];
+                    uint bit = rhdl_gem_decode_src(
+                      packed, node_vals, gem_instr, gem_flags,
+                      extern_off, extern_count, extern_desc_off, extern_desc_count,
+                      state_slots, io, extern_values, extern_value_count);
+                    watch |= (bit & 1u) << w;
+                  }
+                  *watch_bits = watch;
+                }
+
+                if ((gem_flags & 0x2u) != 0u && watch_bits != nullptr) {
+                  threadgroup_barrier(mem_flags::mem_none);
+                }
+
+                return shadow;
+              }
+
+              kernel void #{metal_entry}(
+                device #{scalar_msl_type}* all_state_slots [[buffer(0)]],
+                device uchar* all_memory [[buffer(1)]],
+                device RhdlArcGpuIo* all_io [[buffer(2)]],
+                device const uint* gem_instr [[buffer(3)]],
+                uint tid [[thread_position_in_grid]]) {
+                device #{scalar_msl_type}* state_slots = all_state_slots + (tid * #{state_slot_count}u);
+                device uchar* memory = all_memory + (tid * 65536u);
+                device RhdlArcGpuIo* io = all_io + tid;
+
+                io->cycles_ran = 0u;
+                uint budget = io->cycle_budget;
+                uint gem_flags = gem_instr[1];
+                uint instr_count = gem_instr[0];
+                if (instr_count > 4096u) {
+                  instr_count = 4096u;
+                }
+                uint watch_off = 2u + (instr_count * 4u);
+                uint watch_count = gem_instr[watch_off];
+                if (watch_count > 32u) {
+                  watch_count = 32u;
+                }
+                uint control_off = watch_off + 1u + watch_count;
+                uint control_count = gem_instr[control_off];
+                if (control_count > 32u) {
+                  control_count = 32u;
+                }
+                uint extern_off = control_off + 1u + control_count;
+                uint extern_count = gem_instr[extern_off];
+                if (extern_count > 16384u) {
+                  extern_count = 16384u;
+                }
+                uint extern_desc_off = extern_off + 1u + extern_count;
+                uint extern_desc_count = gem_instr[extern_desc_off];
+                if (extern_desc_count > 16384u) {
+                  extern_desc_count = 16384u;
+                }
+                uint watch_eval_off = extern_desc_off + 1u + extern_desc_count;
+                uint watch_eval_count = gem_instr[watch_eval_off];
+                if (watch_eval_count > instr_count) {
+                  watch_eval_count = instr_count;
+                }
+                uint output_field_off = watch_eval_off + 1u + watch_eval_count;
+                uint output_field_count = gem_instr[output_field_off];
+                if (output_field_count > 64u) {
+                  output_field_count = 64u;
+                }
+                uint output_width_off = output_field_off + 1u + output_field_count;
+                uint output_width_count = gem_instr[output_width_off];
+                if (output_width_count > output_field_count) {
+                  output_width_count = output_field_count;
+                }
+                uint output_bits_off = output_width_off + 1u + output_width_count;
+                uint output_bit_count = gem_instr[output_bits_off];
+                if (output_bit_count > 32768u) {
+                  output_bit_count = 32768u;
+                }
+                uint state_slot_off = output_bits_off + 1u + output_bit_count;
+                uint state_slot_count_stream = gem_instr[state_slot_off];
+                if (state_slot_count_stream > #{state_slot_count}u) {
+                  state_slot_count_stream = #{state_slot_count}u;
+                }
+                uint state_width_off = state_slot_off + 1u + state_slot_count_stream;
+                uint state_width_count = gem_instr[state_width_off];
+                if (state_width_count > state_slot_count_stream) {
+                  state_width_count = state_slot_count_stream;
+                }
+                uint state_next_off = state_width_off + 1u + state_width_count;
+                uint state_next_count = gem_instr[state_next_off];
+                if (state_next_count > 32768u) {
+                  state_next_count = 32768u;
+                }
+                uint state_reset_off = state_next_off + 1u + state_next_count;
+                uint state_reset_count = gem_instr[state_reset_off];
+                if (state_reset_count > 32768u) {
+                  state_reset_count = 32768u;
+                }
+                uint state_reset_en_off = state_reset_off + 1u + state_reset_count;
+                uint state_reset_en_count = gem_instr[state_reset_en_off];
+                if (state_reset_en_count > state_slot_count_stream) {
+                  state_reset_en_count = state_slot_count_stream;
+                }
+
+                constexpr ushort kOpCycleBegin = 0u;
+                constexpr ushort kOpEvalLow = 1u;
+                constexpr ushort kOpMemWrite = 2u;
+                constexpr ushort kOpMemRead = 3u;
+                constexpr ushort kOpEvalHigh = 4u;
+                constexpr ushort kOpOutput = 5u;
+                constexpr ushort kOpCycleEnd = 6u;
+                constexpr ushort kGemControlOps[7] = {
+                  kOpCycleBegin,
+                  kOpEvalLow,
+                  kOpMemWrite,
+                  kOpMemRead,
+                  kOpEvalHigh,
+                  kOpOutput,
+                  kOpCycleEnd
+                };
+                uint op_count = control_count > 0u ? control_count : 7u;
+                thread ushort control_ops[32];
+                if (control_count > 0u) {
+                  for (uint op_idx = 0u; op_idx < op_count; ++op_idx) {
+                    control_ops[op_idx] = ushort(gem_instr[control_off + 1u + op_idx] & 0xFFFFu);
+                  }
+                }
+                bool control_matches_default = control_count == 0u;
+                if (!control_matches_default && control_count == 7u) {
+                  control_matches_default = true;
+                  for (uint op_idx = 0u; op_idx < 7u; ++op_idx) {
+                    if (control_ops[op_idx] != kGemControlOps[op_idx]) {
+                      control_matches_default = false;
+                      break;
+                    }
+                  }
+                }
+                bool use_fast_default_loop = control_matches_default && (gem_flags & 0x3u) == 0u;
+                uint output_width_total = 0u;
+                for (uint idx = 0u; idx < output_width_count; ++idx) {
+                  output_width_total += gem_instr[output_width_off + 1u + idx] & 0x3Fu;
+                }
+                uint state_width_total = 0u;
+                for (uint idx = 0u; idx < state_width_count; ++idx) {
+                  state_width_total += gem_instr[state_width_off + 1u + idx] & 0x3Fu;
+                }
+                bool stream_semantics_ready =
+                  (gem_flags & 0x10u) != 0u &&
+                  output_field_count > 0u &&
+                  output_width_count == output_field_count &&
+                  output_bit_count == output_width_total &&
+                  state_slot_count_stream > 0u &&
+                  state_width_count == state_slot_count_stream &&
+                  state_next_count == state_width_total &&
+                  state_reset_count == state_width_total &&
+                  state_reset_en_count == state_slot_count_stream;
+
+                if (budget == 0u) {
+                  uint clk_now = io->clk & 1u;
+                  io->last_clk = clk_now;
+                  #{out_struct} out = #{eval_fn}(clk_now, io->rst, io->mem_data_in, state_slots);
+                  #{write_fn}(io, out);
+                  return;
+                }
+
+                if (stream_semantics_ready && (gem_flags & 0x3u) == 0u) {
+                  constexpr uint kGemNodeCap = 4096u;
+                  constexpr uint kGemExternValueCap = 1024u;
+                  constexpr uint kGemStateStageCap = #{state_slot_count}u;
+                  thread uchar node_vals[kGemNodeCap];
+                  thread uint extern_values[kGemExternValueCap];
+                  thread uint staged_state_values[kGemStateStageCap];
+                  uint extern_value_count = extern_desc_count > extern_count ? extern_desc_count : extern_count;
+                  if (extern_value_count > kGemExternValueCap) {
+                    extern_value_count = kGemExternValueCap;
+                  }
+                  uint mem_data_out_offset = 0xFFFFFFFFu;
+                  uint mem_data_out_width = 0u;
+                  uint mem_addr_offset = 0xFFFFFFFFu;
+                  uint mem_addr_width = 0u;
+                  uint mem_write_en_offset = 0xFFFFFFFFu;
+                  uint mem_write_en_width = 0u;
+                  uint output_cursor_scan = 0u;
+                  for (uint out_idx = 0u; out_idx < output_field_count; ++out_idx) {
+                    uint field = gem_instr[output_field_off + 1u + out_idx] & 0xFFu;
+                    uint width = gem_instr[output_width_off + 1u + out_idx] & 0x3Fu;
+                    if (field == 4u) {
+                      mem_data_out_offset = output_cursor_scan;
+                      mem_data_out_width = width;
+                    } else if (field == 5u) {
+                      mem_addr_offset = output_cursor_scan;
+                      mem_addr_width = width;
+                    } else if (field == 6u) {
+                      mem_write_en_offset = output_cursor_scan;
+                      mem_write_en_width = width;
+                    }
+                    output_cursor_scan += width;
+                  }
+
+                  for (uint i = 0u; i < budget; ++i) {
+                    if ((io->halted & 1u) != 0u) {
+                      break;
+                    }
+
+                    uint low_clk = 0u;
+                    io->last_clk = low_clk;
+                    io->clk = low_clk;
+                    rhdl_gem_fill_extern_values(
+                      gem_instr, gem_flags, extern_off, extern_count, extern_desc_off, extern_desc_count,
+                      state_slots, io, extern_values, extern_value_count);
+                    rhdl_gem_eval_nodes(
+                      gem_instr, instr_count, gem_flags, extern_off, extern_count, extern_desc_off, extern_desc_count,
+                      state_slots, io, extern_values, extern_value_count, node_vals);
+
+                    uint low_mem_data_out = 0u;
+                    uint low_mem_addr = 0u;
+                    uint low_mem_write_en = 0u;
+                    if (mem_data_out_offset != 0xFFFFFFFFu && mem_data_out_width > 0u) {
+                      low_mem_data_out = rhdl_gem_materialize_word(
+                        gem_instr, output_bits_off + 1u + mem_data_out_offset, mem_data_out_width, node_vals, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count) & 0xFFu;
+                    }
+                    if (mem_addr_offset != 0xFFFFFFFFu && mem_addr_width > 0u) {
+                      low_mem_addr = rhdl_gem_materialize_word(
+                        gem_instr, output_bits_off + 1u + mem_addr_offset, mem_addr_width, node_vals, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count) & 0xFFFFu;
+                    }
+                    if (mem_write_en_offset != 0xFFFFFFFFu && mem_write_en_width > 0u) {
+                      low_mem_write_en = rhdl_gem_materialize_word(
+                        gem_instr, output_bits_off + 1u + mem_write_en_offset, mem_write_en_width, node_vals, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count) & 0x1u;
+                    }
+
+                    uint addr = low_mem_addr & 0xFFFFu;
+                    if ((low_mem_write_en & 1u) != 0u) {
+                      memory[addr] = uchar(low_mem_data_out & 0xFFu);
+                    }
+                    io->mem_data_in = uint(memory[addr]);
+
+                    uint high_clk = 1u;
+                    io->last_clk = high_clk;
+                    io->clk = high_clk;
+                    rhdl_gem_fill_extern_values(
+                      gem_instr, gem_flags, extern_off, extern_count, extern_desc_off, extern_desc_count,
+                      state_slots, io, extern_values, extern_value_count);
+                    rhdl_gem_eval_nodes(
+                      gem_instr, instr_count, gem_flags, extern_off, extern_count, extern_desc_off, extern_desc_count,
+                      state_slots, io, extern_values, extern_value_count, node_vals);
+
+                    uint next_cursor = 0u;
+                    uint reset_cursor = 0u;
+                    for (uint state_idx = 0u; state_idx < state_slot_count_stream; ++state_idx) {
+                      uint slot = gem_instr[state_slot_off + 1u + state_idx];
+                      uint width = gem_instr[state_width_off + 1u + state_idx] & 0x3Fu;
+                      uint next_value = rhdl_gem_materialize_word(
+                        gem_instr, state_next_off + 1u + next_cursor, width, node_vals, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint reset_value = rhdl_gem_materialize_word(
+                        gem_instr, state_reset_off + 1u + reset_cursor, width, node_vals, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, extern_value_count);
+                      uint reset_enable = io->rst & 1u;
+                      if (state_idx < state_reset_en_count) {
+                        uint packed_reset_enable = gem_instr[state_reset_en_off + 1u + state_idx];
+                        reset_enable = rhdl_gem_decode_src(
+                          packed_reset_enable, node_vals, gem_instr, gem_flags,
+                          extern_off, extern_count, extern_desc_off, extern_desc_count,
+                          state_slots, io, extern_values, extern_value_count);
+                      }
+                      uint value = (reset_enable & 1u) != 0u ? reset_value : next_value;
+                      if (state_idx < kGemStateStageCap && slot < #{state_slot_count}u && width > 0u) {
+                        uint mask = width >= 32u ? 0xFFFFFFFFu : ((1u << width) - 1u);
+                        staged_state_values[state_idx] = value & mask;
+                      }
+                      next_cursor += width;
+                      reset_cursor += width;
+                    }
+
+                    for (uint state_idx = 0u; state_idx < state_slot_count_stream; ++state_idx) {
+                      uint slot = gem_instr[state_slot_off + 1u + state_idx];
+                      uint width = gem_instr[state_width_off + 1u + state_idx] & 0x3Fu;
+                      if (state_idx < kGemStateStageCap && slot < #{state_slot_count}u && width > 0u) {
+                        state_slots[slot] = (#{scalar_msl_type})staged_state_values[state_idx];
+                      }
+                    }
+
+                    uint output_cursor = 0u;
+                    uint live_output_extern_count = 0u;
+                    for (uint out_idx = 0u; out_idx < output_field_count; ++out_idx) {
+                      uint field = gem_instr[output_field_off + 1u + out_idx] & 0xFFu;
+                      uint width = gem_instr[output_width_off + 1u + out_idx] & 0x3Fu;
+                      uint value = rhdl_gem_materialize_word(
+                        gem_instr, output_bits_off + 1u + output_cursor, width, node_vals, gem_flags,
+                        extern_off, extern_count, extern_desc_off, extern_desc_count,
+                        state_slots, io, extern_values, live_output_extern_count);
+                      rhdl_gem_write_io_word(field, value, io);
+                      output_cursor += width;
+                    }
+
+                    io->cycles_ran = i + 1u;
+                    if ((io->halted & 1u) != 0u) {
+                      break;
+                    }
+                  }
+                  return;
+                }
+
+                if (use_fast_default_loop) {
+                  for (uint i = 0u; i < budget; ++i) {
+                    if ((io->halted & 1u) != 0u) {
+                      break;
+                    }
+
+                    uint low_clk = 0u;
+                    io->last_clk = low_clk;
+                    io->clk = low_clk;
+                    #{out_struct} low = #{eval_fn}(io->clk, io->rst, io->mem_data_in, state_slots);
+                    uint addr = low.mem_addr & 0xFFFFu;
+                    if ((low.mem_write_en & 1u) != 0u) {
+                      memory[addr] = uchar(low.mem_data_out & 0xFFu);
+                    }
+                    io->mem_data_in = uint(memory[addr]);
+
+                    uint high_clk = 1u;
+                    io->last_clk = high_clk;
+                    io->clk = high_clk;
+                    #{out_struct} high = #{eval_fn}(io->clk, io->rst, io->mem_data_in, state_slots);
+                    #{write_fn}(io, high);
+                    io->cycles_ran = i + 1u;
+
+                    if ((io->halted & 1u) != 0u) {
+                      break;
+                    }
+                  }
+                  return;
+                }
+
+                for (uint i = 0u; i < budget; ++i) {
+                  if ((io->halted & 1u) != 0u) {
+                    break;
+                  }
+
+                  #{out_struct} low;
+                  #{out_struct} high;
+                  bool have_low = false;
+                  bool have_high = false;
+                  uint addr = 0u;
+                  uint gem_shadow = 0u;
+                  uint gem_watch_bits = 0u;
+
+                  for (uint op_idx = 0u; op_idx < op_count; ++op_idx) {
+                    ushort op = control_count > 0u ? control_ops[op_idx] : kGemControlOps[op_idx];
+                    switch (op) {
+                      case kOpCycleBegin: {
+                        break;
+                      }
+                      case kOpEvalLow: {
+                        bool need_watch_override = (gem_flags & 0x1u) != 0u;
+                        bool need_debug_shadow = (gem_flags & 0x2u) != 0u;
+                        if (need_watch_override || (i == 0u && need_debug_shadow)) {
+                          gem_shadow = rhdl_gem_execute_shadow(gem_instr, state_slots, io, &gem_watch_bits);
+                          if (need_debug_shadow && (gem_shadow & 1u) != 0u) {
+                            threadgroup_barrier(mem_flags::mem_none);
+                          }
+                        }
+                        uint low_clk = 0u;
+                        io->last_clk = low_clk;
+                        io->clk = low_clk;
+                        low = #{eval_fn}(io->clk, io->rst, io->mem_data_in, state_slots);
+                        have_low = true;
+                        break;
+                      }
+                      case kOpMemWrite: {
+                        if (have_low) {
+                          addr = low.mem_addr & 0xFFFFu;
+                          if ((low.mem_write_en & 1u) != 0u) {
+                            memory[addr] = uchar(low.mem_data_out & 0xFFu);
+                          }
+                        }
+                        break;
+                      }
+                      case kOpMemRead: {
+                        if (have_low) {
+                          uint mem_in = uint(memory[addr]);
+                          io->mem_data_in = mem_in;
+                        }
+                        break;
+                      }
+                      case kOpEvalHigh: {
+                        uint high_clk = 1u;
+                        io->last_clk = high_clk;
+                        io->clk = high_clk;
+                        high = #{eval_fn}(io->clk, io->rst, io->mem_data_in, state_slots);
+                        if ((gem_flags & 0x1u) != 0u) {
+                          high.mem_write_en = (gem_watch_bits >> 0u) & 1u;
+                          high.mem_read_en = (gem_watch_bits >> 1u) & 1u;
+                          high.zero_flag_out = (gem_watch_bits >> 3u) & 1u;
+                        }
+                        have_high = true;
+                        break;
+                      }
+                      case kOpOutput: {
+                        if (have_high) {
+                          #{write_fn}(io, high);
+                          io->cycles_ran = i + 1u;
+                        }
+                        break;
+                      }
+                      case kOpCycleEnd: {
+                        break;
+                      }
+                      default: {
+                        break;
+                      }
+                    }
+                  }
+
+                  if ((io->halted & 1u) != 0u) {
+                    break;
+                  }
+                }
+              }
+            MSL
+          end
+
+          <<~MSL
+            kernel void #{metal_entry}(
+              device #{scalar_msl_type}* all_state_slots [[buffer(0)]],
+              device uchar* all_memory [[buffer(1)]],
+              device RhdlArcGpuIo* all_io [[buffer(2)]],
+              uint tid [[thread_position_in_grid]]) {
+              device #{scalar_msl_type}* state_slots = all_state_slots + (tid * #{state_slot_count}u);
+              device uchar* memory = all_memory + (tid * 65536u);
+              device RhdlArcGpuIo* io = all_io + tid;
+
+              io->cycles_ran = 0u;
+              uint budget = io->cycle_budget;
+
+              if (budget == 0u) {
+                uint clk_now = io->clk & 1u;
+                io->last_clk = clk_now;
+                #{out_struct} out = #{eval_fn}(clk_now, io->rst, io->mem_data_in, state_slots);
+                #{write_fn}(io, out);
+                return;
+              }
+
+              for (uint i = 0u; i < budget; ++i) {
+                if ((io->halted & 1u) != 0u) {
+                  break;
+                }
+
+                uint low_clk = 0u;
+                io->last_clk = low_clk;
+                io->clk = low_clk;
+                #{out_struct} low = #{eval_fn}(io->clk, io->rst, io->mem_data_in, state_slots);
+                uint addr = low.mem_addr & 0xFFFFu;
+                if ((low.mem_write_en & 1u) != 0u) {
+                  memory[addr] = uchar(low.mem_data_out & 0xFFu);
+                }
+
+                uint mem_in = uint(memory[addr]);
+                io->mem_data_in = mem_in;
+
+                uint high_clk = 1u;
+                io->last_clk = high_clk;
+                io->clk = high_clk;
+                #{out_struct} high = #{eval_fn}(io->clk, io->rst, io->mem_data_in, state_slots);
+                #{write_fn}(io, high);
+                io->cycles_ran = i + 1u;
+
+                if ((io->halted & 1u) != 0u) {
+                  break;
+                }
+              }
+            }
+          MSL
+        end
+
+        def emit_kernel_riscv(
+          top:,
+          metal_entry:,
+          state_layout:,
+          low_eval_fn:,
+          low_wdata_eval_fn: nil,
+          low_data_addr_eval_fn: nil,
+          high_eval_fn:,
+          high_data_addr_eval_fn: nil,
+          full_eval_fn:,
+          low_loop_step_struct:,
+          low_wdata_step_struct: nil,
+          low_data_addr_step_struct: nil,
+          high_loop_step_struct:,
+          high_data_addr_step_struct: nil,
+          cold_memory_layout: [],
+          fast_path: false,
+          dirty_settle_enabled: false,
+          split_low_wdata_eval: false,
+          split_high_data_addr_eval: false,
+          split_low_data_addr_eval: false
+        )
+          out_struct = top_output_struct_name(top.fetch(:name))
+          write_fn = "write_#{sanitize_ident(top.fetch(:name))}_outputs"
+          clock_slots = count_clock_tracking_slots(top.fetch(:ops))
+          state_value_slot_count = state_layout.sum { |entry| entry.fetch(:slot_count, 1) }
+          state_slot_count = state_value_slot_count + clock_slots
+          low_clock_reset_block =
+            if clock_slots <= 0
+              nil
+            elsif clock_slots == 1
+              "state_slots[#{state_value_slot_count}u] = 0u;"
+            else
+              <<~MSL.strip
+                for (uint clock_slot = #{state_value_slot_count}u; clock_slot < #{state_slot_count}u; ++clock_slot) {
+                  state_slots[clock_slot] = 0u;
+                }
+              MSL
+            end
+
+          cold_ranges = cold_memory_layout.filter_map do |entry|
+            start_idx = entry.fetch(:index).to_i
+            slot_count = entry.fetch(:slot_count, 1).to_i
+            finish_idx = start_idx + slot_count
+            next if finish_idx <= 0 || start_idx >= state_value_slot_count
+
+            [start_idx.clamp(0, state_value_slot_count), finish_idx.clamp(0, state_value_slot_count)]
+          end.sort_by(&:first)
+          merged_cold_ranges = []
+          cold_ranges.each do |start_idx, finish_idx|
+            if merged_cold_ranges.empty? || start_idx > merged_cold_ranges.last[1]
+              merged_cold_ranges << [start_idx, finish_idx]
+            else
+              merged_cold_ranges.last[1] = [merged_cold_ranges.last[1], finish_idx].max
+            end
+          end
+          hot_ranges = []
+          hot_cursor = 0
+          merged_cold_ranges.each do |start_idx, finish_idx|
+            hot_ranges << [hot_cursor, start_idx] if hot_cursor < start_idx
+            hot_cursor = finish_idx
+          end
+          hot_ranges << [hot_cursor, state_value_slot_count] if hot_cursor < state_value_slot_count
+          has_cold_memory = !merged_cold_ranges.empty?
+
+          hot_copy_in_lines = hot_ranges.flat_map do |start_idx, finish_idx|
+            if (finish_idx - start_idx) == 1
+              ["local_state[#{start_idx}u] = global_state_slots[#{start_idx}u];"]
+            else
+              [
+                "for (uint si = #{start_idx}u; si < #{finish_idx}u; ++si) {",
+                '  local_state[si] = global_state_slots[si];',
+                '}'
+              ]
+            end
+          end
+
+          hot_copy_out_lines = hot_ranges.flat_map do |start_idx, finish_idx|
+            if (finish_idx - start_idx) == 1
+              ["global_state_slots[#{start_idx}u] = state_slots[#{start_idx}u];"]
+            else
+              [
+                "for (uint si = #{start_idx}u; si < #{finish_idx}u; ++si) {",
+                '  global_state_slots[si] = state_slots[si];',
+                '}'
+              ]
+            end
+          end
+
+          state_init_lines =
+            if has_cold_memory
+              lines = []
+              lines << 'if (budget == 0u) {'
+              lines << "  for (uint si = 0u; si < #{state_slot_count}u; ++si) {"
+              lines << '    local_state[si] = global_state_slots[si];'
+              lines << '  }'
+              lines << '} else {'
+              hot_copy_in_lines.each { |line| lines << "  #{line}" }
+              if clock_slots.positive?
+                lines << "  for (uint si = #{state_value_slot_count}u; si < #{state_slot_count}u; ++si) {"
+                lines << '    local_state[si] = 0u;'
+                lines << '  }'
+              end
+              lines << '}'
+              lines
+            else
+              [
+                "uint state_copy_count = (budget == 0u) ? #{state_slot_count}u : #{state_value_slot_count}u;",
+                'for (uint si = 0u; si < state_copy_count; ++si) {',
+                '  local_state[si] = global_state_slots[si];',
+                '}',
+                'if (budget != 0u) {',
+                "  for (uint si = #{state_value_slot_count}u; si < #{state_slot_count}u; ++si) {",
+                '    local_state[si] = 0u;',
+                '  }',
+                '}'
+              ]
+            end
+
+          state_copy_back_lines =
+            if has_cold_memory
+              hot_copy_out_lines
+            else
+              [
+                "for (uint si = 0u; si < #{state_value_slot_count}u; ++si) {",
+                '  global_state_slots[si] = state_slots[si];',
+                '}'
+              ]
+            end
+
+          input_layout = top.fetch(:inputs).map do |input|
+            { name: sanitize_ident(input.fetch(:name)), width: input.fetch(:type).fetch(:width).to_i }
+          end
+          input_by_name = {}
+          input_layout.each { |entry| input_by_name[entry.fetch(:name)] = entry }
+          constant_input_values = riscv_kernel_constant_inputs(input_layout)
+
+          mask_expr = lambda do |width|
+            return nil if width >= 32
+
+            ((1 << width) - 1).to_s
+          end
+
+          constant_input_literals = {}
+          constant_input_values.each do |name, value|
+            entry = input_by_name[name]
+            next unless entry
+
+            width = entry.fetch(:width)
+            masked_value =
+              if width <= 0
+                0
+              elsif width >= 32
+                value & 0xFFFF_FFFF
+              else
+                value & ((1 << width) - 1)
+              end
+            constant_input_literals[name] = "#{masked_value}u"
+          end
+
+          input_local_lines = input_layout.map do |entry|
+            name = entry.fetch(:name)
+            if constant_input_literals.key?(name)
+              next "uint in_#{name} = #{constant_input_literals.fetch(name)};"
+            end
+
+            width = entry.fetch(:width)
+            mask = mask_expr.call(width)
+            if mask
+              "uint in_#{name} = io->#{name} & #{mask}u;"
+            else
+              "uint in_#{name} = io->#{name};"
+            end
+          end
+
+          input_writeback_lines = input_layout.filter_map do |entry|
+            name = entry.fetch(:name)
+            next if constant_input_literals.key?(name)
+
+            width = entry.fetch(:width)
+            mask = mask_expr.call(width)
+            if mask
+              "io->#{name} = in_#{name} & #{mask}u;"
+            else
+              "io->#{name} = in_#{name};"
+            end
+          end
+
+          clk_field = sanitize_ident('clk')
+          inst_data_field = sanitize_ident('inst_data')
+          data_rdata_field = sanitize_ident('data_rdata')
+          inst_ptw_pte0_field = sanitize_ident('inst_ptw_pte0')
+          inst_ptw_pte1_field = sanitize_ident('inst_ptw_pte1')
+          data_ptw_pte0_field = sanitize_ident('data_ptw_pte0')
+          data_ptw_pte1_field = sanitize_ident('data_ptw_pte1')
+
+          required_input_names = [
+            clk_field,
+            inst_data_field,
+            data_rdata_field,
+            inst_ptw_pte0_field,
+            inst_ptw_pte1_field,
+            data_ptw_pte0_field,
+            data_ptw_pte1_field
+          ]
+          missing = required_input_names.reject { |name| input_by_name.key?(name) }
+          unless missing.empty?
+            raise LoweringError, "RISC-V kernel emission missing expected inputs: #{missing.join(', ')}"
+          end
+
+          eval_input_args = input_layout.map do |entry|
+            name = entry.fetch(:name)
+            if constant_input_literals.key?(name)
+              constant_input_literals.fetch(name)
+            else
+              "in_#{name}"
+            end
+          end.join(', ')
+
+          low_eval_args = if has_cold_memory
+            "#{eval_input_args}, state_slots, cold_state_slots"
+          else
+            "#{eval_input_args}, state_slots"
+          end
+          high_eval_args = low_eval_args
+          full_eval_args = low_eval_args
+          ptw_zero_loop_invariant_lines = [
+            "in_#{inst_ptw_pte0_field} = 0u;",
+            "in_#{inst_ptw_pte1_field} = 0u;",
+            "in_#{data_ptw_pte0_field} = 0u;",
+            "in_#{data_ptw_pte1_field} = 0u;"
+          ]
+
+          if fast_path
+            return <<~MSL
+              kernel void #{metal_entry}(
+                device #{scalar_msl_type}* all_state_slots [[buffer(0)]],
+                device uchar* all_inst_mem [[buffer(1)]],
+                device uchar* all_data_mem [[buffer(2)]],
+                device RhdlArcGpuIo* all_io [[buffer(3)]],
+                uint tid [[thread_position_in_grid]]) {
+                device RhdlArcGpuIo* io = all_io + tid;
+                uint mem_mask = io->mem_mask;
+                uint mem_span = mem_mask + 1u;
+                device #{scalar_msl_type}* global_state_slots = all_state_slots + (tid * #{state_slot_count}u);
+                uint budget = io->cycle_budget;
+                thread #{scalar_msl_type} local_state[#{state_slot_count}];
+              #{indent_lines(state_init_lines)}
+                thread #{scalar_msl_type}* state_slots = local_state;
+              #{indent_lines(has_cold_memory ? ["device #{scalar_msl_type}* cold_state_slots = global_state_slots;"] : [])}
+                // RISC-V Metal runner uses unified instruction/data memory.
+                (void)all_inst_mem;
+                device uchar* mem = all_data_mem + (tid * mem_span);
+
+                io->cycles_ran = 0u;
+                uint local_cycles_ran = 0u;
+              #{indent_lines(input_local_lines)}
+
+                if (budget == 0u) {
+                  #{out_struct} out = #{full_eval_fn}(#{full_eval_args});
+                  #{write_fn}(io, out);
+                  for (uint si = 0u; si < #{state_slot_count}u; ++si) {
+                    global_state_slots[si] = state_slots[si];
+                  }
+              #{indent_lines(input_writeback_lines)}
+                  return;
+                }
+
+              #{indent_lines(ptw_zero_loop_invariant_lines)}
+                for (uint i = 0u; i < budget; ++i) {
+                  in_#{clk_field} = 0u;
+                  #{low_loop_step_struct} low0 = #{low_eval_fn}(#{low_eval_args});
+              #{indent_lines([low_clock_reset_block].compact)}
+
+                  in_#{inst_data_field} = rhdl_read_word_le(mem, mem_mask, low0.inst_addr & mem_mask);
+
+              #{indent_lines(
+                if split_low_data_addr_eval
+                  [
+                    'uint low_data_addr = 0u;',
+                    'if (((low0.data_re | low0.data_we) & 1u) != 0u) {',
+                    "  #{low_data_addr_step_struct} low_addr = #{low_data_addr_eval_fn}(#{low_eval_args});",
+                    '  low_data_addr = low_addr.data_addr & mem_mask;',
+                    '}'
+                  ]
+                else
+                  ['uint low_data_addr = low0.data_addr & mem_mask;']
+                end
+              )}
+                  uint low_data_rdata = ((low0.data_re & 1u) != 0u) ? rhdl_read_word_le(mem, mem_mask, low_data_addr) : 0u;
+                  in_#{data_rdata_field} = low_data_rdata;
+
+                  in_#{clk_field} = 1u;
+              #{indent_lines(
+                if split_low_wdata_eval
+                  [
+                    'if ((low0.data_we & 1u) != 0u) {',
+                    "  #{low_wdata_step_struct} loww = #{low_wdata_eval_fn}(#{low_eval_args});",
+                    '  rhdl_write_word_le(mem, mem_mask, low_data_addr, loww.data_wdata);',
+                    '}'
+                  ]
+                else
+                  [
+                    'if ((low0.data_we & 1u) != 0u) {',
+                    '  rhdl_write_word_le(mem, mem_mask, low_data_addr, low0.data_wdata);',
+                    '}'
+                  ]
+                end
+              )}
+                  #{high_loop_step_struct} high = #{high_eval_fn}(#{high_eval_args});
+
+                  in_#{clk_field} = 0u;
+                  in_#{inst_data_field} = rhdl_read_word_le(mem, mem_mask, high.inst_addr & mem_mask);
+              #{indent_lines(
+                if split_high_data_addr_eval
+                  [
+                    'uint post_data_rdata = 0u;',
+                    'if ((high.data_re & 1u) != 0u) {',
+                    "  #{high_data_addr_step_struct} high_addr = #{high_data_addr_eval_fn}(#{high_eval_args});",
+                    '  uint post_data_addr = high_addr.data_addr & mem_mask;',
+                    '  post_data_rdata = rhdl_read_word_le(mem, mem_mask, post_data_addr);',
+                    '}'
+                  ]
+                else
+                  [
+                    'uint post_data_addr = high.data_addr & mem_mask;',
+                    'uint post_data_rdata = ((high.data_re & 1u) != 0u) ? rhdl_read_word_le(mem, mem_mask, post_data_addr) : 0u;'
+                  ]
+                end
+              )}
+                  in_#{data_rdata_field} = post_data_rdata;
+                  local_cycles_ran = i + 1u;
+              #{indent_lines(
+                if dirty_settle_enabled
+                  [
+                    "if ((high.state_dirty & 1u) == 0u && (low0.data_we & 1u) == 0u) {",
+                    '  local_cycles_ran = budget;',
+                    '  break;',
+                    '}'
+                  ]
+                else
+                  []
+                end
+              )}
+                }
+
+                io->cycles_ran = local_cycles_ran;
+                if (local_cycles_ran == 0u) {
+                  in_#{clk_field} = 0u;
+                  #{out_struct} out = #{full_eval_fn}(#{full_eval_args});
+                  #{write_fn}(io, out);
+                }
+              #{indent_lines(state_copy_back_lines)}
+
+              #{indent_lines(input_writeback_lines)}
+              }
+            MSL
+          end
+
+          <<~MSL
+            kernel void #{metal_entry}(
+              device #{scalar_msl_type}* all_state_slots [[buffer(0)]],
+              device uchar* all_inst_mem [[buffer(1)]],
+              device uchar* all_data_mem [[buffer(2)]],
+              device RhdlArcGpuIo* all_io [[buffer(3)]],
+              uint tid [[thread_position_in_grid]]) {
+              device RhdlArcGpuIo* io = all_io + tid;
+              uint mem_mask = io->mem_mask;
+              uint mem_span = mem_mask + 1u;
+              device #{scalar_msl_type}* global_state_slots = all_state_slots + (tid * #{state_slot_count}u);
+              uint budget = io->cycle_budget;
+              thread #{scalar_msl_type} local_state[#{state_slot_count}];
+            #{indent_lines(state_init_lines)}
+              thread #{scalar_msl_type}* state_slots = local_state;
+            #{indent_lines(has_cold_memory ? ["device #{scalar_msl_type}* cold_state_slots = global_state_slots;"] : [])}
+              // RISC-V Metal runner uses unified instruction/data memory.
+              (void)all_inst_mem;
+              device uchar* mem = all_data_mem + (tid * mem_span);
+
+              io->cycles_ran = 0u;
+              uint local_cycles_ran = 0u;
+            #{indent_lines(input_local_lines)}
+              uint inst_ptw_addr0_cached = 0u;
+              uint inst_ptw_addr1_cached = 0u;
+              uint data_ptw_addr0_cached = 0u;
+              uint data_ptw_addr1_cached = 0u;
+              uint inst_ptw_pte0_cached = 0u;
+              uint inst_ptw_pte1_cached = 0u;
+              uint data_ptw_pte0_cached = 0u;
+              uint data_ptw_pte1_cached = 0u;
+              bool inst_ptw_addr0_valid = false;
+              bool inst_ptw_addr1_valid = false;
+              bool data_ptw_addr0_valid = false;
+              bool data_ptw_addr1_valid = false;
+
+              if (budget == 0u) {
+                #{out_struct} out = #{full_eval_fn}(#{full_eval_args});
+                #{write_fn}(io, out);
+                for (uint si = 0u; si < #{state_slot_count}u; ++si) {
+                  global_state_slots[si] = state_slots[si];
+                }
+            #{indent_lines(input_writeback_lines)}
+                return;
+              }
+
+              for (uint i = 0u; i < budget; ++i) {
+                in_#{clk_field} = 0u;
+                #{low_loop_step_struct} low0 = #{low_eval_fn}(#{low_eval_args});
+            #{indent_lines([low_clock_reset_block].compact)}
+
+                in_#{inst_data_field} = rhdl_read_word_le(mem, mem_mask, low0.inst_addr & mem_mask);
+                uint low_inst_ptw_addr0 = low0.inst_ptw_addr0 & mem_mask;
+                uint low_inst_ptw_addr1 = low0.inst_ptw_addr1 & mem_mask;
+                uint low_data_ptw_addr0 = low0.data_ptw_addr0 & mem_mask;
+                uint low_data_ptw_addr1 = low0.data_ptw_addr1 & mem_mask;
+                uint low_inst_ptw_word0 = low_inst_ptw_addr0 & ~0x3u;
+                uint low_inst_ptw_word1 = low_inst_ptw_addr1 & ~0x3u;
+                uint low_data_ptw_word0 = low_data_ptw_addr0 & ~0x3u;
+                uint low_data_ptw_word1 = low_data_ptw_addr1 & ~0x3u;
+                if (!inst_ptw_addr0_valid || inst_ptw_addr0_cached != low_inst_ptw_word0) {
+                  inst_ptw_addr0_cached = low_inst_ptw_word0;
+                  inst_ptw_pte0_cached = rhdl_read_word_le(mem, mem_mask, low_inst_ptw_addr0);
+                  inst_ptw_addr0_valid = true;
+                }
+                if (!inst_ptw_addr1_valid || inst_ptw_addr1_cached != low_inst_ptw_word1) {
+                  inst_ptw_addr1_cached = low_inst_ptw_word1;
+                  inst_ptw_pte1_cached = rhdl_read_word_le(mem, mem_mask, low_inst_ptw_addr1);
+                  inst_ptw_addr1_valid = true;
+                }
+                if (!data_ptw_addr0_valid || data_ptw_addr0_cached != low_data_ptw_word0) {
+                  data_ptw_addr0_cached = low_data_ptw_word0;
+                  data_ptw_pte0_cached = rhdl_read_word_le(mem, mem_mask, low_data_ptw_addr0);
+                  data_ptw_addr0_valid = true;
+                }
+                if (!data_ptw_addr1_valid || data_ptw_addr1_cached != low_data_ptw_word1) {
+                  data_ptw_addr1_cached = low_data_ptw_word1;
+                  data_ptw_pte1_cached = rhdl_read_word_le(mem, mem_mask, low_data_ptw_addr1);
+                  data_ptw_addr1_valid = true;
+                }
+                in_#{inst_ptw_pte0_field} = inst_ptw_pte0_cached;
+                in_#{inst_ptw_pte1_field} = inst_ptw_pte1_cached;
+                in_#{data_ptw_pte0_field} = data_ptw_pte0_cached;
+                in_#{data_ptw_pte1_field} = data_ptw_pte1_cached;
+
+                uint low_data_addr = low0.data_addr & mem_mask;
+                uint low_data_funct3 = low0.data_funct3 & 0x7u;
+                uint low_data_rdata = 0u;
+                if ((low0.data_re & 1u) != 0u) {
+                  if (low_data_funct3 == 2u) {
+                    low_data_rdata = rhdl_read_word_le(mem, mem_mask, low_data_addr);
+                  } else {
+                    low_data_rdata = rhdl_read_mem_funct3(mem, mem_mask, low_data_addr, low_data_funct3);
+                  }
+                }
+                in_#{data_rdata_field} = low_data_rdata;
+
+                in_#{clk_field} = 1u;
+                if ((low0.data_we & 1u) != 0u) {
+                  if (low_data_funct3 == 2u) {
+                    uint low_word_addr = low_data_addr & mem_mask;
+                    uint low_wdata = low0.data_wdata;
+                    rhdl_write_word_le(mem, mem_mask, low_word_addr, low_wdata);
+                  } else {
+                    rhdl_write_mem_funct3(mem, mem_mask, low_data_addr, low0.data_wdata, low_data_funct3);
+                  }
+                  uint low_write_word = low_data_addr & ~0x3u;
+                  if (inst_ptw_addr0_valid && inst_ptw_addr0_cached == low_write_word) {
+                    inst_ptw_addr0_valid = false;
+                  }
+                  if (inst_ptw_addr1_valid && inst_ptw_addr1_cached == low_write_word) {
+                    inst_ptw_addr1_valid = false;
+                  }
+                  if (data_ptw_addr0_valid && data_ptw_addr0_cached == low_write_word) {
+                    data_ptw_addr0_valid = false;
+                  }
+                  if (data_ptw_addr1_valid && data_ptw_addr1_cached == low_write_word) {
+                    data_ptw_addr1_valid = false;
+                  }
+                }
+                #{high_loop_step_struct} high = #{high_eval_fn}(#{high_eval_args});
+
+                in_#{clk_field} = 0u;
+                in_#{inst_data_field} = rhdl_read_word_le(mem, mem_mask, high.inst_addr & mem_mask);
+                uint high_inst_ptw_addr0 = high.inst_ptw_addr0 & mem_mask;
+                uint high_inst_ptw_addr1 = high.inst_ptw_addr1 & mem_mask;
+                uint high_data_ptw_addr0 = high.data_ptw_addr0 & mem_mask;
+                uint high_data_ptw_addr1 = high.data_ptw_addr1 & mem_mask;
+                uint high_inst_ptw_word0 = high_inst_ptw_addr0 & ~0x3u;
+                uint high_inst_ptw_word1 = high_inst_ptw_addr1 & ~0x3u;
+                uint high_data_ptw_word0 = high_data_ptw_addr0 & ~0x3u;
+                uint high_data_ptw_word1 = high_data_ptw_addr1 & ~0x3u;
+                if (!inst_ptw_addr0_valid || inst_ptw_addr0_cached != high_inst_ptw_word0) {
+                  inst_ptw_addr0_cached = high_inst_ptw_word0;
+                  inst_ptw_pte0_cached = rhdl_read_word_le(mem, mem_mask, high_inst_ptw_addr0);
+                  inst_ptw_addr0_valid = true;
+                }
+                if (!inst_ptw_addr1_valid || inst_ptw_addr1_cached != high_inst_ptw_word1) {
+                  inst_ptw_addr1_cached = high_inst_ptw_word1;
+                  inst_ptw_pte1_cached = rhdl_read_word_le(mem, mem_mask, high_inst_ptw_addr1);
+                  inst_ptw_addr1_valid = true;
+                }
+                if (!data_ptw_addr0_valid || data_ptw_addr0_cached != high_data_ptw_word0) {
+                  data_ptw_addr0_cached = high_data_ptw_word0;
+                  data_ptw_pte0_cached = rhdl_read_word_le(mem, mem_mask, high_data_ptw_addr0);
+                  data_ptw_addr0_valid = true;
+                }
+                if (!data_ptw_addr1_valid || data_ptw_addr1_cached != high_data_ptw_word1) {
+                  data_ptw_addr1_cached = high_data_ptw_word1;
+                  data_ptw_pte1_cached = rhdl_read_word_le(mem, mem_mask, high_data_ptw_addr1);
+                  data_ptw_addr1_valid = true;
+                }
+                in_#{inst_ptw_pte0_field} = inst_ptw_pte0_cached;
+                in_#{inst_ptw_pte1_field} = inst_ptw_pte1_cached;
+                in_#{data_ptw_pte0_field} = data_ptw_pte0_cached;
+                in_#{data_ptw_pte1_field} = data_ptw_pte1_cached;
+
+                uint post_data_addr = high.data_addr & mem_mask;
+                uint post_data_funct3 = high.data_funct3 & 0x7u;
+                uint post_data_rdata = 0u;
+                if ((high.data_re & 1u) != 0u) {
+                  if (post_data_funct3 == 2u) {
+                    post_data_rdata = rhdl_read_word_le(mem, mem_mask, post_data_addr);
+                  } else {
+                    post_data_rdata = rhdl_read_mem_funct3(mem, mem_mask, post_data_addr, post_data_funct3);
+                  }
+                }
+                in_#{data_rdata_field} = post_data_rdata;
+                local_cycles_ran = i + 1u;
+              }
+
+              io->cycles_ran = local_cycles_ran;
+              if (local_cycles_ran == 0u) {
+                in_#{clk_field} = 0u;
+                #{out_struct} out = #{full_eval_fn}(#{full_eval_args});
+                #{write_fn}(io, out);
+              }
+            #{indent_lines(state_copy_back_lines)}
+
+            #{indent_lines(input_writeback_lines)}
+            }
+          MSL
+        end
+
+        # Builds the Metal Shading Language (MSL) source for the Apple II kernel entry point.
+        #
+        # The emitted kernel indexes every buffer by `tid`: a slice of `state_slot_count`
+        # state slots, 65536 bytes of RAM, 12288 bytes of ROM (the ROM pointer is computed
+        # but only referenced through `(void)rom`) and one `RhdlArcGpuIo` record. State is
+        # copied into a thread-local array, `io->cycle_budget` iterations of a
+        # clock-low / clock-high step are run, and outputs, counters, the clock and RAM data
+        # inputs, and the state slots are written back. A budget of zero performs a single
+        # full evaluation and returns.
+        #
+        # Keyword arguments (all required):
+        # - top: top-module description; `:name`, `:inputs` and `:ops` are read via `fetch`.
+        # - metal_entry: name given to the emitted `kernel void` function.
+        # - state_layout: enumerable of entries; each entry's `:slot_count` (default 1) is summed.
+        # - low_eval_fn: function invoked with the clock input forced to 0; only `ram_addr`
+        #   of its result is used (to load the RAM byte fed back as the `ram_do` input).
+        # - update_eval_fn: function invoked with the clock input forced to 1.
+        # - comb_eval_fn: function re-invoked after the high step when its `state_dirty`
+        #   bit is set; only emitted when both flags below are truthy.
+        # - phase_split_enabled / dirty_settle_enabled: gate the `comb_eval_fn` block.
+        #   NOTE(review): the two remaining branches of that conditional both emit an
+        #   empty string, so `phase_split_enabled` alone changes nothing in this kernel.
+        # - full_eval_fn: function whose result is written to `io` through the write helper.
+        #
+        # Returns the kernel source as a String (squiggly heredoc).
+        def emit_kernel_apple2(top:, metal_entry:, state_layout:, low_eval_fn:, comb_eval_fn:, update_eval_fn:, phase_split_enabled:, dirty_settle_enabled:, full_eval_fn:)
+          out_struct = top_output_struct_name(top.fetch(:name))
+          write_fn = "write_#{sanitize_ident(top.fetch(:name))}_outputs"
+          loop_step_struct = "#{sanitize_ident(top.fetch(:name))}_loop_step"
+
+          input_names = top.fetch(:inputs).map { |input| sanitize_ident(input.fetch(:name)) }
+          # Inputs that receive special handling in the generated kernel.
+          clock_field = sanitize_ident('clk_14m')
+          ram_do_field = sanitize_ident('ram_do')
+          speaker_field = sanitize_ident('speaker')
+          # Total per-thread slot count: layout slots plus the clock-tracking slots.
+          clock_slots = count_clock_tracking_slots(top.fetch(:ops))
+          state_slot_count = state_layout.sum { |entry| entry.fetch(:slot_count, 1) } + clock_slots
+          # One `uint in_<name>` local per input: the clock is masked to 1 bit, the RAM
+          # data input to 8 bits, and every other input is copied from `io` unchanged.
+          input_locals = input_names.map do |name|
+            case name
+            when clock_field
+              "uint in_#{name} = io->#{name} & 1u;"
+            when ram_do_field
+              "uint in_#{name} = io->#{name} & 0xFFu;"
+            else
+              "uint in_#{name} = io->#{name};"
+            end
+          end
+          # Argument list shared by every eval call: all input locals in declaration order.
+          input_args = input_names.map { |name| "in_#{name}" }.join(', ')
+
+          <<~MSL
+            kernel void #{metal_entry}(
+              device #{scalar_msl_type}* all_state_slots [[buffer(0)]],
+              device uchar* all_ram [[buffer(1)]],
+              device uchar* all_rom [[buffer(2)]],
+              device RhdlArcGpuIo* all_io [[buffer(3)]],
+              uint tid [[thread_position_in_grid]]) {
+              device #{scalar_msl_type}* state_slots = all_state_slots + (tid * #{state_slot_count}u);
+              device uchar* ram = all_ram + (tid * 65536u);
+              device uchar* rom = all_rom + (tid * 12288u);
+              device RhdlArcGpuIo* io = all_io + tid;
+              (void)rom;
+
+              uint budget = io->cycle_budget;
+              uint local_cycles_ran = 0u;
+              uint local_speaker_toggles = 0u;
+              uint local_text_dirty = 0u;
+              uint local_prev_speaker = io->prev_speaker & 1u;
+            #{indent_lines(input_locals)}
+              uint local_last_clock = in_#{clock_field} & 1u;
+              thread #{scalar_msl_type} local_state[#{state_slot_count}];
+              for (uint si = 0u; si < #{state_slot_count}u; ++si) {
+                local_state[si] = state_slots[si];
+              }
+
+              if (budget == 0u) {
+                uint clk_now = in_#{clock_field} & 1u;
+                in_#{clock_field} = clk_now;
+                local_last_clock = clk_now;
+                #{out_struct} out = #{full_eval_fn}(#{input_args}, local_state);
+                #{write_fn}(io, out);
+                io->cycles_ran = local_cycles_ran;
+                io->speaker_toggles = local_speaker_toggles;
+                io->text_dirty = local_text_dirty;
+                io->prev_speaker = local_prev_speaker;
+                io->#{clock_field} = in_#{clock_field};
+                io->#{ram_do_field} = in_#{ram_do_field};
+                io->last_clock = local_last_clock;
+                for (uint si = 0u; si < #{state_slot_count}u; ++si) {
+                  state_slots[si] = local_state[si];
+                }
+                return;
+              }
+
+              for (uint i = 0u; i < budget; ++i) {
+                in_#{clock_field} = 0u;
+                local_last_clock = in_#{clock_field};
+                #{loop_step_struct} low = #{low_eval_fn}(#{input_args}, local_state);
+
+                uint addr = low.ram_addr & 0xFFFFu;
+                uint ram_value = uint(ram[addr]);
+                in_#{ram_do_field} = ram_value & 0xFFu;
+
+                in_#{clock_field} = 1u;
+                local_last_clock = in_#{clock_field};
+                #{loop_step_struct} high = #{update_eval_fn}(#{input_args}, local_state);
+                #{loop_step_struct} step = high;
+            #{if phase_split_enabled && dirty_settle_enabled
+              "    if ((high.state_dirty & 1u) != 0u) {\n      step = #{comb_eval_fn}(#{input_args}, local_state);\n    }"
+            elsif phase_split_enabled
+              ''
+            else
+              ''
+            end}
+
+                uint write_addr = step.ram_addr & 0xFFFFu;
+                if ((step.ram_we & 1u) != 0u && write_addr < 0xC000u) {
+                  ram[write_addr] = uchar(step.d & 0xFFu);
+                  if ((write_addr >= 0x0400u && write_addr <= 0x07FFu) ||
+                      (write_addr >= 0x2000u && write_addr <= 0x5FFFu)) {
+                    local_text_dirty = 1u;
+                  }
+                }
+
+                uint speaker_now = step.#{speaker_field} & 1u;
+                if (speaker_now != local_prev_speaker) {
+                  local_speaker_toggles = local_speaker_toggles + 1u;
+                  local_prev_speaker = speaker_now;
+                }
+
+                local_cycles_ran = i + 1u;
+              }
+
+              #{out_struct} final_out = #{full_eval_fn}(#{input_args}, local_state);
+              #{write_fn}(io, final_out);
+              io->cycles_ran = local_cycles_ran;
+              io->speaker_toggles = local_speaker_toggles;
+              io->text_dirty = local_text_dirty;
+              io->prev_speaker = local_prev_speaker;
+              io->#{clock_field} = in_#{clock_field};
+              io->#{ram_do_field} = in_#{ram_do_field};
+              io->last_clock = local_last_clock;
+              for (uint si = 0u; si < #{state_slot_count}u; ++si) {
+                state_slots[si] = local_state[si];
+              }
+            }
+          MSL
+        end
+
+        def riscv_kernel_constant_inputs(input_layout)
+          return {} unless ENV['RHDL_ARC_TO_GPU_RISCV_CORE_SPECIALIZE'] == '1'
+
+          defaults = {
+            sanitize_ident('irq_software') => 0,
+            sanitize_ident('irq_timer') => 0,
+            sanitize_ident('irq_external') => 0
+          }
+          available = input_layout.map { |entry| entry.fetch(:name) }.to_set
+          defaults.select { |name, _value| available.include?(name) }
+        end
+
+        def generate_state_read_lines(op, state_layout)
+          ref = op.fetch(:result_refs).first
+          out_type = op.fetch(:result_types).first
+          slot = state_layout.find { |entry| entry.fetch(:result_ref) == ref }
+          raise LoweringError, "Missing state slot for #{ref}" unless slot
+
+          slot_index = slot.fetch(:index)
+          width = out_type.fetch(:width)
+
+          ["#{scalar_msl_type} #{ref_var_name(ref)} = rhdl_mask_bits(state_slots[#{slot_index}], #{width}u);"]
+        end
+
+        def generate_state_update_lines(op, type_map, functions, state_layout)
+          ref = op.fetch(:result_refs).first
+          out_type = op.fetch(:result_types).first
+          slot = state_layout.find { |entry| entry.fetch(:result_ref) == ref }
+          raise LoweringError, "Missing state slot for #{ref}" unless slot
+
+          slot_index = slot.fetch(:index)
+          lines = []
+
+          call_expr = generate_call_expr(
+            callee: op.fetch(:callee),
+            args: op.fetch(:args),
+            result_types: [out_type],
+            type_map: type_map,
+            functions: functions,
+            temp_prefix: "state_#{slot_index}_next"
+          )
+
+          lines.concat(call_expr.fetch(:setup_lines))
+          next_value_expr = call_expr.fetch(:result_exprs).first
+
+          clock_cond = "(#{masked_expr(ref_var_name(op.fetch(:clock_ref)), TypeRef.new(kind: :scalar, width: 1))} != 0u)"
+          enable_cond = if op.fetch(:enable_ref)
+            "(#{masked_expr(ref_var_name(op.fetch(:enable_ref)), TypeRef.new(kind: :scalar, width: 1))} != 0u)"
+          else
+            'true'
+          end
+
+          lines << "if (#{clock_cond}) {"
+          if op.fetch(:reset_ref)
+            reset_cond = "(#{masked_expr(ref_var_name(op.fetch(:reset_ref)), TypeRef.new(kind: :scalar, width: 1))} != 0u)"
+            lines << "  if (#{reset_cond}) {"
+            lines << "    next_state_#{slot_index} = #{scalar_zero_literal};"
+            lines << "  } else if (#{enable_cond}) {"
+            lines << "    next_state_#{slot_index} = #{masked_expr(next_value_expr, out_type)};"
+            lines << '  }'
+          else
+            lines << "  if (#{enable_cond}) {"
+            lines << "    next_state_#{slot_index} = #{masked_expr(next_value_expr, out_type)};"
+            lines << '  }'
+          end
+          lines << '}'
+
+          lines
+        end
+
+        def generate_op_lines(
+          op,
+          type_map,
+          functions,
+          in_top_module:,
+          state_ref_to_slot: nil,
+          cold_memory_bases: nil,
+          cold_state_slots_var: nil
+        )
+          cold_memory_bases ||= Set.new
+          kind = op.fetch(:kind)
+
+          case kind
+          when :constant
+            type = op.fetch(:result_types).first
+            var = ref_var_name(op.fetch(:result_refs).first)
+            literal = constant_literal(op.fetch(:value), type)
+            ["#{metal_type_for(type)} #{var} = #{literal};"]
+          when :to_clock
+            out = ref_var_name(op.fetch(:result_refs).first)
+            inp = ref_var_name(op.fetch(:input))
+            ["#{metal_type_for(TypeRef.new(kind: :scalar, width: 1))} #{out} = #{masked_expr(inp, TypeRef.new(kind: :scalar, width: 1))};"]
+          when :arc_call
+            call = generate_call_expr(
+              callee: op.fetch(:callee),
+              args: op.fetch(:args),
+              result_types: op.fetch(:result_types),
+              type_map: type_map,
+              functions: functions,
+              temp_prefix: ref_var_name(op.fetch(:result_refs).first)
+            )
+            lines = []
+            lines.concat(call.fetch(:setup_lines))
+            op.fetch(:result_refs).each_with_index do |ref, idx|
+              type = op.fetch(:result_types)[idx]
+              lines << "#{metal_type_for(type)} #{ref_var_name(ref)} = #{masked_expr(call.fetch(:result_exprs)[idx], type)};"
+            end
+            lines
+          when :arc_memory
+            if in_top_module
+              []
+            else
+              raise LoweringError, 'arc.memory unsupported in arc.define body'
+            end
+          when :arc_memory_read_port
+            raise LoweringError, 'arc.memory_read_port requires top-module state layout context' if state_ref_to_slot.nil?
+
+            emit_memory_read_port_lines(
+              op,
+              type_map,
+              state_ref_to_slot: state_ref_to_slot,
+              cold_memory_bases: cold_memory_bases,
+              cold_state_slots_var: cold_state_slots_var
+            )
+          when :arc_memory_write_port
+            if in_top_module
+              raise LoweringError, 'arc.memory_write_port must be handled in top module generation path'
+            end
+            raise LoweringError, 'arc.memory_write_port unsupported in arc.define body'
+          when :seq_memory_write_port
+            if in_top_module
+              raise LoweringError, 'seq.firmem.write_port must be handled in top module generation path'
+            end
+            raise LoweringError, 'seq.firmem.write_port unsupported in arc.define body'
+          when :comb
+            emit_comb_lines(op, type_map)
+          when :synth_aig_and_inv
+            emit_synth_aig_and_inv_lines(op, type_map)
+          when :mux
+            emit_mux_lines(op, type_map)
+          when :icmp
+            emit_icmp_lines(op, type_map)
+          when :concat
+            emit_concat_lines(op, type_map)
+          when :extract
+            emit_extract_lines(op, type_map)
+          when :replicate
+            emit_replicate_lines(op, type_map)
+          when :array_create
+            emit_array_create_lines(op, type_map)
+          when :aggregate_constant
+            emit_aggregate_constant_lines(op)
+          when :array_get
+            emit_array_get_lines(op, type_map)
+          when :alias
+            out = ref_var_name(op.fetch(:result_refs).first)
+            out_type = op.fetch(:result_types).first
+            src_expr = masked_expr(ref_var_name(op.fetch(:source_ref)), out_type)
+            ["#{metal_type_for(out_type)} #{out} = #{masked_expr(src_expr, out_type)};"]
+          when :arc_state
+            if in_top_module
+              raise LoweringError, 'arc.state must be handled in top module generation path'
+            end
+            raise LoweringError, 'arc.state unsupported in arc.define body'
+          when :seq_firreg
+            if in_top_module
+              raise LoweringError, 'seq.firreg must be handled in top module generation path'
+            end
+            raise LoweringError, 'seq.firreg unsupported in arc.define body'
+          else
+            raise LoweringError, "Unsupported op kind in codegen: #{kind}"
+          end
+        end
+
+        def emit_aggregate_constant_lines(op)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          arr_type = op.fetch(:result_types).first
+          arr_struct = array_struct_name(arr_type)
+          elem_type = arr_type.fetch(:element)
+          values = op.fetch(:values)
+
+          ordered_values = values.reverse
+          literal_values = ordered_values.map { |value| constant_literal(value, elem_type) }
+          ["#{arr_struct} #{out} = { {#{literal_values.join(', ')}} };"]
+        end
+
+        def emit_comb_lines(op, type_map)
+          comb_op = op.fetch(:comb_op)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          operand_refs = op.fetch(:operands)
+          operand_types = operand_refs.map { |ref| type_map.fetch(ref) }
+          operands = operand_refs.map { |ref| masked_expr(ref_var_name(ref), type_map.fetch(ref)) }
+
+          if wide_scalar?(out_type) || operand_types.any? { |t| wide_scalar?(t) }
+            wide_operands = operand_refs.zip(operand_types).map do |ref, ref_type|
+              ref_expr = masked_expr(ref_var_name(ref), ref_type)
+              wide_scalar?(ref_type) ? "(#{ref_expr})" : "rhdl_wide_make(#{ref_expr}, 0u)"
+            end
+            shift_expr =
+              if operand_types.length >= 2
+                raw = if wide_scalar?(operand_types[1])
+                  "(#{masked_expr(ref_var_name(operand_refs[1]), operand_types[1])}).x"
+                else
+                  masked_expr(ref_var_name(operand_refs[1]), operand_types[1])
+                end
+                "(#{raw} & 63u)"
+              else
+                '0u'
+              end
+
+            wide_expr = case comb_op
+            when 'add'
+              wide_operands.reduce { |lhs, rhs| "rhdl_wide_add(#{lhs}, #{rhs})" }
+            when 'sub'
+              wide_operands[1..].reduce(wide_operands[0]) { |lhs, rhs| "rhdl_wide_sub(#{lhs}, #{rhs})" }
+            when 'mul'
+              wide_operands[1..].reduce(wide_operands[0]) { |lhs, rhs| "rhdl_wide_mul(#{lhs}, #{rhs})" }
+            when 'divu'
+              lhs = wide_operands[0]
+              rhs = wide_operands[1]
+              "(rhdl_wide_to_ulong(#{rhs}) == 0ul ? rhdl_wide_make(0u, 0u) : rhdl_wide_from_ulong(rhdl_wide_to_ulong(#{lhs}) / rhdl_wide_to_ulong(#{rhs})))"
+            when 'modu'
+              lhs = wide_operands[0]
+              rhs = wide_operands[1]
+              "(rhdl_wide_to_ulong(#{rhs}) == 0ul ? rhdl_wide_make(0u, 0u) : rhdl_wide_from_ulong(rhdl_wide_to_ulong(#{lhs}) % rhdl_wide_to_ulong(#{rhs})))"
+            when 'shl'
+              "rhdl_wide_shlu(#{wide_operands[0]}, #{shift_expr})"
+            when 'shru'
+              "rhdl_wide_shru(#{wide_operands[0]}, #{shift_expr})"
+            when 'xor'
+              wide_operands.reduce { |lhs, rhs| "rhdl_wide_xor(#{lhs}, #{rhs})" }
+            when 'or'
+              wide_operands.reduce { |lhs, rhs| "rhdl_wide_or(#{lhs}, #{rhs})" }
+            when 'and'
+              wide_operands.reduce { |lhs, rhs| "rhdl_wide_and(#{lhs}, #{rhs})" }
+            else
+              raise LoweringError, "ArcToGPU lowering does not support wide comb.#{comb_op} in packed mode"
+            end
+
+            if wide_scalar?(out_type)
+              return ["#{metal_type_for(out_type)} #{out} = #{masked_expr(wide_expr, out_type)};"]
+            end
+
+            return ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{wide_expr}).x", out_type)};"]
+          end
+
+          expr = case comb_op
+          when 'add'
+            "#{operands[0]} + #{operands[1]}"
+          when 'sub'
+            "#{operands[0]} - #{operands[1]}"
+          when 'mul'
+            "#{operands[0]} * #{operands[1]}"
+          when 'divu'
+            "(#{operands[1]} == #{scalar_zero_literal} ? #{scalar_zero_literal} : (#{operands[0]} / #{operands[1]}))"
+          when 'modu'
+            "(#{operands[1]} == #{scalar_zero_literal} ? #{scalar_zero_literal} : (#{operands[0]} % #{operands[1]}))"
+          when 'shl'
+            "(#{operands[0]} << (#{operands[1]} & #{scalar_width_bits - 1}u))"
+          when 'shru'
+            "(#{operands[0]} >> (#{operands[1]} & #{scalar_width_bits - 1}u))"
+          when 'xor'
+            operands.join(' ^ ')
+          when 'or'
+            operands.join(' | ')
+          when 'and'
+            operands.join(' & ')
+          else
+            raise LoweringError, "Unsupported comb op: #{comb_op}"
+          end
+
+          ["#{metal_type_for(out_type)} #{out} = #{masked_expr(expr, out_type)};"]
+        end
+
+        def emit_synth_aig_and_inv_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          operand_refs = op.fetch(:operands)
+          invert_flags = op.fetch(:invert_flags)
+          operand_types = operand_refs.map { |ref| type_map.fetch(ref) }
+
+          raise LoweringError, 'synth.aig.and_inv requires at least one operand' if operand_refs.empty?
+
+          if wide_scalar?(out_type) || operand_types.any? { |t| wide_scalar?(t) }
+            wide_terms = operand_refs.each_with_index.map do |ref, idx|
+              ref_type = operand_types[idx]
+              expr = masked_expr(ref_var_name(ref), ref_type)
+              wide_expr = wide_scalar?(ref_type) ? "(#{expr})" : "rhdl_wide_make(#{expr}, 0u)"
+              if invert_flags[idx]
+                "rhdl_wide_xor(#{wide_expr}, rhdl_wide_make(0xFFFFFFFFu, 0xFFFFFFFFu))"
+              else
+                wide_expr
+              end
+            end
+            wide_expr = wide_terms[1..].reduce(wide_terms[0]) { |lhs, rhs| "rhdl_wide_and(#{lhs}, #{rhs})" }
+
+            if wide_scalar?(out_type)
+              return ["#{metal_type_for(out_type)} #{out} = #{masked_expr(wide_expr, out_type)};"]
+            end
+
+            return ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{wide_expr}).x", out_type)};"]
+          end
+
+          terms = operand_refs.each_with_index.map do |ref, idx|
+            expr = masked_expr(ref_var_name(ref), operand_types[idx])
+            invert_flags[idx] ? "(~(#{expr}))" : "(#{expr})"
+          end
+          expr = terms.join(' & ')
+          ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{expr})", out_type)};"]
+        end
+
+        def emit_mux_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          cond_ref, true_ref, false_ref = op.fetch(:operands)
+          cond_expr = truthy_expr(ref: cond_ref, type_map: type_map)
+          true_expr = masked_expr(ref_var_name(true_ref), type_map.fetch(true_ref))
+          false_expr = masked_expr(ref_var_name(false_ref), type_map.fetch(false_ref))
+
+          if wide_scalar?(out_type)
+            lines = []
+            lines << "#{metal_type_for(out_type)} #{out};"
+            lines << "if (#{cond_expr}) {"
+            lines << "  #{out} = #{masked_expr(true_expr, out_type)};"
+            lines << '} else {'
+            lines << "  #{out} = #{masked_expr(false_expr, out_type)};"
+            lines << '}'
+            lines << "#{out} = #{masked_expr(out, out_type)};"
+            lines
+          else
+            ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{cond_expr} ? #{true_expr} : #{false_expr})", out_type)};"]
+          end
+        end
+
+        def emit_icmp_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          lhs_ref, rhs_ref = op.fetch(:operands)
+          lhs_expr = masked_expr(ref_var_name(lhs_ref), type_map.fetch(lhs_ref))
+          rhs_expr = masked_expr(ref_var_name(rhs_ref), type_map.fetch(rhs_ref))
+          predicate = op.fetch(:predicate)
+
+          cmp_expr =
+            if wide_scalar?(type_map.fetch(lhs_ref)) || wide_scalar?(type_map.fetch(rhs_ref))
+              unless wide_scalar?(type_map.fetch(lhs_ref)) && wide_scalar?(type_map.fetch(rhs_ref))
+                raise LoweringError, 'ArcToGPU lowering cannot compare mixed-width packed/non-packed values'
+              end
+
+              case predicate
+              when 'eq'
+                "rhdl_wide_eq(#{lhs_expr}, #{rhs_expr})"
+              when 'ne'
+                "!rhdl_wide_eq(#{lhs_expr}, #{rhs_expr})"
+              when 'ult'
+                "(rhdl_wide_to_ulong(#{lhs_expr}) < rhdl_wide_to_ulong(#{rhs_expr}))"
+              when 'ule'
+                "(rhdl_wide_to_ulong(#{lhs_expr}) <= rhdl_wide_to_ulong(#{rhs_expr}))"
+              when 'ugt'
+                "(rhdl_wide_to_ulong(#{lhs_expr}) > rhdl_wide_to_ulong(#{rhs_expr}))"
+              when 'uge'
+                "(rhdl_wide_to_ulong(#{lhs_expr}) >= rhdl_wide_to_ulong(#{rhs_expr}))"
+              else
+                raise LoweringError, "ArcToGPU lowering does not support comb.icmp predicate #{predicate} for packed wide values"
+              end
+            else
+              case predicate
+              when 'eq'
+                "(#{lhs_expr} == #{rhs_expr})"
+              when 'ne'
+                "(#{lhs_expr} != #{rhs_expr})"
+              when 'ult'
+                "(#{lhs_expr} < #{rhs_expr})"
+              when 'ule'
+                "(#{lhs_expr} <= #{rhs_expr})"
+              when 'ugt'
+                "(#{lhs_expr} > #{rhs_expr})"
+              when 'uge'
+                "(#{lhs_expr} >= #{rhs_expr})"
+              when 'slt', 'sle', 'sgt', 'sge'
+                lhs_type = type_map.fetch(lhs_ref)
+                rhs_type = type_map.fetch(rhs_ref)
+                signed_width = [lhs_type.fetch(:width), rhs_type.fetch(:width)].max
+                lhs_signed = signed_cast_expr(lhs_expr, signed_width)
+                rhs_signed = signed_cast_expr(rhs_expr, signed_width)
+                case predicate
+                when 'slt'
+                  "(#{lhs_signed} < #{rhs_signed})"
+                when 'sle'
+                  "(#{lhs_signed} <= #{rhs_signed})"
+                when 'sgt'
+                  "(#{lhs_signed} > #{rhs_signed})"
+                else
+                  "(#{lhs_signed} >= #{rhs_signed})"
+                end
+              else
+                raise LoweringError, "ArcToGPU lowering does not support comb.icmp predicate #{predicate}"
+              end
+            end
+
+          ["#{metal_type_for(TypeRef.new(kind: :scalar, width: 1))} #{out} = #{cmp_expr} ? #{scalar_one_literal} : #{scalar_zero_literal};"]
+        end
+
+        def emit_concat_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          operands = op.fetch(:operands)
+          operand_types = operands.map { |ref| type_map.fetch(ref) }
+
+          if wide_scalar?(out_type) || operand_types.any? { |t| wide_scalar?(t) }
+            shift = 0
+            accum_expr = 'rhdl_wide_make(0u, 0u)'
+            operands.zip(operand_types).reverse_each do |ref, ref_type|
+              ref_expr = masked_expr(ref_var_name(ref), ref_type)
+              wide_expr =
+                if wide_scalar?(ref_type)
+                  "(#{ref_expr})"
+                else
+                  "rhdl_wide_make(#{ref_expr}, 0u)"
+                end
+              shifted = shift.zero? ? wide_expr : "rhdl_wide_shlu(#{wide_expr}, #{shift}u)"
+              accum_expr = "rhdl_wide_or(#{accum_expr}, #{shifted})"
+              shift += ref_type.fetch(:width)
+            end
+
+            if wide_scalar?(out_type)
+              return ["#{metal_type_for(out_type)} #{out} = #{masked_expr(accum_expr, out_type)};"]
+            end
+
+            return ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{accum_expr}).x", out_type)};"]
+          end
+
+          shift = 0
+          parts = []
+          operands.zip(operand_types).reverse_each do |ref, ref_type|
+            ref_expr = masked_expr(ref_var_name(ref), ref_type)
+            part = shift.zero? ? "(#{ref_expr})" : "((#{ref_expr}) << #{shift}u)"
+            parts << part
+            shift += ref_type.fetch(:width)
+          end
+
+          expr = parts.join(' | ')
+          ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{expr})", out_type)};"]
+        end
+
+        def emit_extract_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          from = op.fetch(:from)
+          input_type = type_map.fetch(op.fetch(:input))
+          inp = masked_expr(ref_var_name(op.fetch(:input)), input_type)
+
+          if wide_scalar?(input_type)
+            if wide_scalar?(out_type)
+              expr = "rhdl_wide_shru(#{inp}, #{from}u)"
+              ["#{metal_type_for(out_type)} #{out} = #{masked_expr(expr, out_type)};"]
+            else
+              expr = "rhdl_wide_shru(#{inp}, #{from}u).x"
+              ["#{metal_type_for(out_type)} #{out} = #{masked_expr(expr, out_type)};"]
+            end
+          else
+            ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{inp} >> #{from}u)", out_type)};"]
+          end
+        end
+
+        def emit_replicate_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          inp_type = type_map.fetch(op.fetch(:input))
+          inp = masked_expr(ref_var_name(op.fetch(:input)), inp_type)
+
+          src_w = inp_type.fetch(:width)
+          dst_w = out_type.fetch(:width)
+          if wide_scalar?(out_type) || wide_scalar?(inp_type)
+            wide_inp = if wide_scalar?(inp_type)
+              "(#{inp})"
+            else
+              "rhdl_wide_make(#{inp}, 0u)"
+            end
+
+            accum_expr = 'rhdl_wide_make(0u, 0u)'
+            offset = 0
+            while offset < dst_w
+              shifted = offset.zero? ? wide_inp : "rhdl_wide_shlu(#{wide_inp}, #{offset}u)"
+              accum_expr = "rhdl_wide_or(#{accum_expr}, #{shifted})"
+              offset += src_w
+            end
+
+            if wide_scalar?(out_type)
+              return ["#{metal_type_for(out_type)} #{out} = #{masked_expr(accum_expr, out_type)};"]
+            end
+
+            return ["#{metal_type_for(out_type)} #{out} = #{masked_expr("(#{accum_expr}).x", out_type)};"]
+          end
+
+          pieces = []
+          offset = 0
+          while offset < dst_w
+            pieces << "(#{inp} << #{offset}u)"
+            offset += src_w
+          end
+          expr = pieces.join(' | ')
+          ["#{metal_type_for(out_type)} #{out} = #{masked_expr(expr, out_type)};"]
+        end
+
+        def emit_array_create_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          arr_type = op.fetch(:result_types).first
+          arr_struct = array_struct_name(arr_type)
+          elem_type = arr_type.fetch(:element)
+
+          lines = ["#{arr_struct} #{out};"]
+          operands = op.fetch(:operands)
+          last_index = operands.length - 1
+          operands.each_with_index do |ref, idx|
+            # Match CIRCT HW lowering semantics: array_create operand 0 maps to the
+            # highest index, and the last operand maps to index 0.
+            array_idx = last_index - idx
+            expr = masked_expr(ref_var_name(ref), type_map.fetch(ref))
+            lines << "#{out}.v[#{array_idx}] = #{masked_expr(expr, elem_type)};"
+          end
+          lines
+        end
+
+        def emit_array_get_lines(op, type_map)
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          arr_type = op.fetch(:array_type)
+          idx_type = op.fetch(:index_type)
+          arr_ref = ref_var_name(op.fetch(:array_ref))
+          idx_ref = masked_expr(ref_var_name(op.fetch(:index_ref)), idx_type)
+          len = arr_type.fetch(:length)
+
+          lines = []
+          lines << "uint #{out}_idx = #{idx_ref};"
+          lines << "if (#{out}_idx >= #{len}u) { #{out}_idx = 0u; }"
+          lines << "#{metal_type_for(out_type)} #{out} = #{masked_expr("#{arr_ref}.v[#{out}_idx]", out_type)};"
+          lines
+        end
+
+        def emit_memory_read_port_lines(
+          op,
+          type_map,
+          state_ref_to_slot:,
+          cold_memory_bases: nil,
+          cold_state_slots_var: nil
+        )
+          cold_memory_bases ||= Set.new
+          out = ref_var_name(op.fetch(:result_refs).first)
+          out_type = op.fetch(:result_types).first
+          memory_ref = op.fetch(:memory_ref)
+          memory_info = state_ref_to_slot.fetch(memory_ref) do
+            raise LoweringError, "Unknown arc.memory reference #{memory_ref} in memory_read_port"
+          end
+          memory_type = memory_info.fetch(:type)
+          idx_expr = masked_expr(ref_var_name(op.fetch(:index_ref)), op.fetch(:index_type))
+          base = memory_info.fetch(:index)
+          length = memory_type.fetch(:length)
+          state_slots_var =
+            if cold_state_slots_var && cold_memory_bases.include?(base)
+              cold_state_slots_var
+            else
+              'state_slots'
+            end
+
+          expr = if wide_scalar?(out_type)
+            "rhdl_read_memory_wide(#{state_slots_var}, #{base}u, #{length}u, #{idx_expr}, #{out_type.fetch(:width)}u)"
+          else
+            "rhdl_read_memory_scalar(#{state_slots_var}, #{base}u, #{length}u, #{idx_expr}, #{out_type.fetch(:width)}u)"
+          end
+          ["#{metal_type_for(out_type)} #{out} = #{masked_expr(expr, out_type)};"]
+        end
+
+        def generate_call_expr(callee:, args:, result_types:, type_map:, functions:, temp_prefix:, arg_exprs: nil)
+          fn = functions[callee]
+          raise LoweringError, "ArcToGPU lowering could not resolve callee @#{callee}" unless fn
+
+          fn_name = metal_fn_name(callee)
+          arg_exprs ||= args.map do |arg_ref|
+            arg_type = type_map.fetch(arg_ref) { raise LoweringError, "Unknown call arg ref #{arg_ref} for @#{callee}" }
+            masked_expr(ref_var_name(arg_ref), arg_type)
+          end
+
+          if result_types.length == 1
+            {
+              setup_lines: [],
+              result_exprs: ["#{fn_name}(#{arg_exprs.join(', ')})"]
+            }
+          else
+            temp = "#{temp_prefix}_ret"
+            struct = ret_struct_name(callee)
+            {
+              setup_lines: ["#{struct} #{temp} = #{fn_name}(#{arg_exprs.join(', ')});"],
+              result_exprs: Array.new(result_types.length) { |idx| "#{temp}.v#{idx}" }
+            }
+          end
+        end
+
+        # Emits `ops` in dependency order, appending the generated code to `lines`.
+        #
+        # Works in rounds: every pending op whose dependencies are all in `available_refs`
+        # is emitted, and its results are then recorded in `type_map` and `available_refs`
+        # (both mutated in place). Ops that are still blocked wait for the next round.
+        #
+        # Raises LoweringError when a whole round cannot emit a single op.
+        def schedule_ops_topologically(ops:, lines:, type_map:, available_refs:, functions:, in_top_module:)
+          pending = ops.dup
+          until pending.empty?
+            # Readiness is decided for the whole round before any result is published.
+            checked = pending.map do |op|
+              [op, op_dependencies(op).reject { |ref| available_refs.include?(ref) }]
+            end
+            ready, blocked = checked.partition { |_, missing| missing.empty? }
+
+            if ready.empty?
+              stuck_op, missing = blocked.first
+              raise LoweringError,
+                "Could not schedule #{stuck_op.fetch(:op_name)}; unresolved refs: #{missing.join(', ')}"
+            end
+
+            ready.each do |op, _|
+              lines.concat(generate_op_lines(op, type_map, functions, in_top_module: in_top_module))
+              op.fetch(:result_refs).each_with_index do |ref, idx|
+                type_map[ref] = op.fetch(:result_types)[idx]
+                available_refs << ref
+              end
+            end
+
+            pending = blocked.map(&:first)
+          end
+        end
+
+        def emit_ops_with_optional_schedule(
+          ops:,
+          lines:,
+          runtime_type_map:,
+          functions:,
+          in_top_module:,
+          state_ref_to_slot: nil,
+          cold_memory_bases: Set.new,
+          cold_state_slots_var: nil,
+          schedule_aware_emit: false,
+          phase_tag: nil
+        )
+          if schedule_aware_emit
+            levels = levelize_sorted_ops(
+              sorted_ops: ops,
+              initial_refs: runtime_type_map.keys
+            )
+            lines << "// schedule_phase: #{phase_tag}" if phase_tag
+            levels.each_with_index do |level_ops, level_idx|
+              lines << "// schedule_level #{level_idx} (ops=#{level_ops.length})"
+              level_ops.each do |op|
+                lines.concat(
+                  generate_op_lines(
+                    op,
+                    runtime_type_map,
+                    functions,
+                    in_top_module: in_top_module,
+                    state_ref_to_slot: state_ref_to_slot,
+                    cold_memory_bases: cold_memory_bases,
+                    cold_state_slots_var: cold_state_slots_var
+                  )
+                )
+                op.fetch(:result_refs).each_with_index do |ref, idx|
+                  runtime_type_map[ref] = op.fetch(:result_types)[idx]
+                end
+              end
+            end
+            return
+          end
+
+          ops.each do |op|
+            lines.concat(
+              generate_op_lines(
+                op,
+                runtime_type_map,
+                functions,
+                in_top_module: in_top_module,
+                state_ref_to_slot: state_ref_to_slot,
+                cold_memory_bases: cold_memory_bases,
+                cold_state_slots_var: cold_state_slots_var
+              )
+            )
+            op.fetch(:result_refs).each_with_index do |ref, idx|
+              runtime_type_map[ref] = op.fetch(:result_types)[idx]
+            end
+          end
+        end
+
+        def levelize_sorted_ops(sorted_ops:, initial_refs:)
+          ref_levels = {}
+          initial_refs.each { |ref| ref_levels[ref] = -1 }
+          levels = []
+
+          sorted_ops.each do |op|
+            deps = op_dependencies(op)
+            missing = deps.reject { |ref| ref_levels.key?(ref) }
+            unless missing.empty?
+              raise LoweringError,
+                "Could not levelize #{op.fetch(:op_name)}; unresolved refs: #{missing.join(', ')}"
+            end
+
+            base_level = deps.empty? ? -1 : deps.map { |ref| ref_levels.fetch(ref) }.max
+            level = base_level + 1
+            levels[level] ||= []
+            levels[level] << op
+            op.fetch(:result_refs).each { |ref| ref_levels[ref] = level }
+          end
+
+          levels.compact
+        end
+
+        def op_dependencies(op)
+          case op.fetch(:kind)
+          when :constant
+            []
+          when :to_clock
+            [op.fetch(:input)]
+          when :arc_call
+            op.fetch(:args)
+          when :arc_state
+            deps = [op.fetch(:clock_ref)]
+            deps.concat(op.fetch(:args))
+            deps << op.fetch(:enable_ref) if op.fetch(:enable_ref)
+            deps << op.fetch(:reset_ref) if op.fetch(:reset_ref)
+            deps
+          when :seq_firreg
+            deps = [op.fetch(:source_ref), op.fetch(:clock_ref)]
+            deps << op.fetch(:reset_ref) if op.fetch(:reset_ref)
+            deps << op.fetch(:reset_value_ref) if op.fetch(:reset_value_ref)
+            deps
+          when :arc_memory
+            []
+          when :arc_memory_read_port
+            [op.fetch(:memory_ref), op.fetch(:index_ref)]
+          when :arc_memory_write_port
+            deps = [op.fetch(:memory_ref), op.fetch(:clock_ref)]
+            deps.concat(op.fetch(:args))
+            deps
+          when :seq_memory_write_port
+            deps = [op.fetch(:memory_ref), op.fetch(:addr_ref), op.fetch(:data_ref), op.fetch(:clock_ref)]
+            deps << op.fetch(:enable_ref) if op.fetch(:enable_ref)
+            deps
+          when :array_create
+            op.fetch(:operands)
+          when :aggregate_constant
+            []
+          when :array_get
+            [op.fetch(:array_ref), op.fetch(:index_ref)]
+          when :alias
+            [op.fetch(:source_ref)]
+          when :icmp
+            op.fetch(:operands)
+          when :concat
+            op.fetch(:operands)
+          when :extract
+            [op.fetch(:input)]
+          when :replicate
+            [op.fetch(:input)]
+          when :mux, :comb
+            op.fetch(:operands)
+          when :synth_aig_and_inv
+            op.fetch(:operands)
+          else
+            []
+          end
+        end
+
+        def topologically_sorted_ops(ops:, initial_type_map:)
+          pending = ops.dup
+          available_refs = Set.new(initial_type_map.keys)
+          type_map = initial_type_map.dup
+          sorted = []
+
+          until pending.empty?
+            progress = false
+            next_pending = []
+
+            pending.each do |op|
+              deps = op_dependencies(op)
+              missing = deps.reject { |ref| available_refs.include?(ref) }
+              if missing.empty?
+                sorted << op
+                op.fetch(:result_refs).each_with_index do |ref, idx|
+                  type_map[ref] = op.fetch(:result_types)[idx]
+                  available_refs << ref
+                end
+                progress = true
+              else
+                next_pending << [op, missing]
+              end
+            end
+
+            unless progress
+              op, missing = next_pending.first
+              raise LoweringError,
+                "Could not schedule #{op.fetch(:op_name)}; unresolved refs: #{missing.join(', ')}"
+            end
+
+            pending = next_pending.map(&:first)
+          end
+
+          [sorted, type_map]
+        end
+
+        def select_live_ops(sorted_ops:, seed_refs:)
+          live_refs = Set.new(seed_refs)
+          live_ops_reversed = []
+
+          sorted_ops.reverse_each do |op|
+            produced = op.fetch(:result_refs)
+            next unless produced.any? { |ref| live_refs.include?(ref) }
+
+            live_ops_reversed << op
+            op_dependencies(op).each { |dep| live_refs << dep }
+          end
+
+          [live_ops_reversed.reverse, live_refs]
+        end
+
+        def comb_field_name(ref)
+          sanitize_ident("comb_#{ref.to_s.sub('%', '')}")
+        end
+
+        def value_expr_for_ref(
+          ref,
+          type_map:,
+          state_ref_to_slot:,
+          comb_var:,
+          state_snapshot_prefix: nil,
+          top_input_refs: [],
+          trust_state_masks: false
+        )
+          type = type_map.fetch(ref) { raise LoweringError, "Unknown reference #{ref}" }
+          expr =
+            if state_ref_to_slot.key?(ref)
+              slot = state_ref_to_slot.fetch(ref)
+              if state_snapshot_prefix
+                "#{state_snapshot_prefix}#{slot.fetch(:index)}"
+              else
+                state_load_expr(slot, trust_state_masks: trust_state_masks)
+              end
+            elsif top_input_refs.include?(ref)
+              ref_var_name(ref)
+            else
+              "#{comb_var}.#{comb_field_name(ref)}"
+            end
+
+          masked_expr(expr, type)
+        end
+
+        def collect_array_types(parsed)
+          seen = {}
+          out = []
+
+          visit_type = lambda do |type|
+            if type&.array?
+              key = [type.fetch(:length), type.fetch(:element).fetch(:width)]
+              next if seen[key]
+
+              seen[key] = true
+              out << { length: key[0], element_width: key[1] }
+            end
+          end
+
+          parsed.fetch(:functions).each_value do |fn|
+            fn.fetch(:args).each { |arg| visit_type.call(arg.fetch(:type)) }
+            fn.fetch(:return_types).each { |t| visit_type.call(t) }
+            fn.fetch(:ops).each do |op|
+              op.fetch(:result_types).each { |t| visit_type.call(t) }
+              visit_type.call(op[:array_type]) if op.key?(:array_type)
+            end
+          end
+
+          parsed.fetch(:top_module).fetch(:inputs).each { |arg| visit_type.call(arg.fetch(:type)) }
+          parsed.fetch(:top_module).fetch(:outputs).each { |arg| visit_type.call(arg.fetch(:type)) }
+          parsed.fetch(:top_module).fetch(:ops).each do |op|
+            op.fetch(:result_types).each { |t| visit_type.call(t) }
+            visit_type.call(op[:array_type]) if op.key?(:array_type)
+          end
+
+          out
+        end
+
+        def emit_state_store_lines(slot_info:, value_expr:, indent:, dirty_var: nil)
+          type = slot_info.fetch(:type)
+          index = slot_info.fetch(:index)
+          if wide_scalar?(type)
+            lines = []
+            if dirty_var
+              lines << "#{indent}if (#{dirty_var} == 0u && !rhdl_wide_eq(rhdl_load_wide_state(state_slots, #{index}u, #{type.fetch(:width)}u), #{value_expr})) { #{dirty_var} = 1u; }"
+            end
+            lines << "#{indent}rhdl_store_wide_state(state_slots, #{index}u, #{value_expr}, #{type.fetch(:width)}u);"
+            lines
+          else
+            lines = []
+            if dirty_var
+              lines << "#{indent}if (#{dirty_var} == 0u && rhdl_mask_bits(state_slots[#{index}], #{type.fetch(:width)}u) != (#{value_expr})) { #{dirty_var} = 1u; }"
+            end
+            lines << "#{indent}state_slots[#{index}] = #{value_expr};"
+            lines
+          end
+        end
+
+        def state_load_expr(info = nil, index: nil, type: nil, trust_state_masks: false)
+          if info
+            index = info.fetch(:index)
+            type = info.fetch(:type)
+          end
+
+          if wide_scalar?(type)
+            "rhdl_load_wide_state(state_slots, #{index}u, #{type.fetch(:width)}u)"
+          elsif trust_state_masks
+            "state_slots[#{index}]"
+          else
+            "rhdl_mask_bits(state_slots[#{index}], #{type.fetch(:width)}u)"
+          end
+        end
+
+        def top_eval_fn_name(module_name)
+          "eval_#{sanitize_ident(module_name)}"
+        end
+
+        def top_output_struct_name(module_name)
+          "#{sanitize_ident(module_name)}_outputs"
+        end
+
+        def metal_fn_name(name)
+          "fn_#{sanitize_ident(name)}"
+        end
+
+        def ret_struct_name(name)
+          "ret_#{sanitize_ident(name)}"
+        end
+
+        def array_struct_name(type_or_hash)
+          if type_or_hash.is_a?(TypeRef)
+            len = type_or_hash.fetch(:length)
+            width = type_or_hash.fetch(:element).fetch(:width)
+            return "arr_#{len}_i#{width}"
+          end
+
+          len = type_or_hash.fetch(:length)
+          width = type_or_hash.fetch(:element_width)
+          "arr_#{len}_i#{width}"
+        end
+
+        def sanitize_ident(name)
+          out = name.to_s.gsub(/[^A-Za-z0-9_]/, '_')
+          out = "_#{out}" if out.match?(/\A\d/)
+          out
+        end
+
+        def ref_var_name(ref)
+          sanitize_ident("v_#{ref.to_s.sub('%', '')}")
+        end
+
+        def wide_scalar?(type) # truthy only when wide packing is on and `type` is a scalar wider than 32 bits
+          pack_wide_scalars? && type&.scalar? && type.fetch(:width) > 32
+        end
+
+        def narrow_scalar_type_for_width(width)
+          return scalar_msl_type unless @narrow_scalar_types
+          return 'uchar' if width <= 8
+          return 'ushort' if width <= 16
+
+          scalar_msl_type
+        end
+
+        def metal_type_for(type)
+          return scalar_msl_type unless type&.scalar?
+
+          return 'RhdlWide' if wide_scalar?(type)
+
+          narrow_scalar_type_for_width(type.fetch(:width))
+        end
+
+        def array_element_metal_type(array_info)
+          width = array_info.fetch(:element_width)
+          if wide_scalar?(TypeRef.new(kind: :scalar, width: width))
+            'RhdlWide'
+          else
+            narrow_scalar_type_for_width(width)
+          end
+        end
+
+        def truthy_expr(ref:, type_map:)
+          type = type_map.fetch(ref)
+          value = masked_expr(ref_var_name(ref), type)
+          if wide_scalar?(type)
+            "rhdl_wide_ne_zero(#{value})"
+          else
+            "(#{value} != #{scalar_zero_literal})"
+          end
+        end
+
+        def emit_state_memory_helpers # returns MSL text: index wrapping plus memory read/write helpers (device and thread overloads)
+          text = <<~MSL
+            static inline __attribute__((always_inline)) uint rhdl_memory_index(uint idx, uint length) {
+              if (length == 0u) {
+                return 0u;
+              }
+              if ((length & (length - 1u)) == 0u) {
+                return idx & (length - 1u);
+              }
+              return idx % length;
+            }
+
+            static inline __attribute__((always_inline)) #{scalar_msl_type} rhdl_read_memory_scalar(device #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, uint width) {
+              uint pos = rhdl_memory_index(idx, length);
+              return rhdl_mask_bits(state_slots[base + pos], width);
+            }
+
+            static inline __attribute__((always_inline)) #{scalar_msl_type} rhdl_read_memory_scalar(thread #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, uint width) {
+              uint pos = rhdl_memory_index(idx, length);
+              return rhdl_mask_bits(state_slots[base + pos], width);
+            }
+
+            static inline __attribute__((always_inline)) void rhdl_write_memory_scalar(device #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, #{scalar_msl_type} value, uint width) {
+              uint pos = rhdl_memory_index(idx, length);
+              state_slots[base + pos] = rhdl_mask_bits(value, width);
+            }
+
+            static inline __attribute__((always_inline)) void rhdl_write_memory_scalar(thread #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, #{scalar_msl_type} value, uint width) {
+              uint pos = rhdl_memory_index(idx, length);
+              state_slots[base + pos] = rhdl_mask_bits(value, width);
+            }
+          MSL
+          # The RhdlWide variants (two consecutive slots per element) are appended only when wide packing is on.
+          if pack_wide_scalars?
+            text << <<~MSL
+
+              static inline __attribute__((always_inline)) RhdlWide rhdl_read_memory_wide(device #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, uint width) {
+                uint pos = rhdl_memory_index(idx, length);
+                uint elem = base + (pos * 2u);
+                return rhdl_wide_mask(rhdl_wide_make(state_slots[elem], state_slots[elem + 1u]), width);
+              }
+
+              static inline __attribute__((always_inline)) RhdlWide rhdl_read_memory_wide(thread #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, uint width) {
+                uint pos = rhdl_memory_index(idx, length);
+                uint elem = base + (pos * 2u);
+                return rhdl_wide_mask(rhdl_wide_make(state_slots[elem], state_slots[elem + 1u]), width);
+              }
+
+              static inline __attribute__((always_inline)) void rhdl_write_memory_wide(device #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, RhdlWide value, uint width) {
+                uint pos = rhdl_memory_index(idx, length);
+                uint elem = base + (pos * 2u);
+                RhdlWide masked = rhdl_wide_mask(value, width);
+                state_slots[elem] = masked.x;
+                state_slots[elem + 1u] = masked.y;
+              }
+
+              static inline __attribute__((always_inline)) void rhdl_write_memory_wide(thread #{scalar_msl_type}* state_slots, uint base, uint length, uint idx, RhdlWide value, uint width) {
+                uint pos = rhdl_memory_index(idx, length);
+                uint elem = base + (pos * 2u);
+                RhdlWide masked = rhdl_wide_mask(value, width);
+                state_slots[elem] = masked.x;
+                state_slots[elem + 1u] = masked.y;
+              }
+            MSL
+          end
+
+          text
+        end
+
+        def inline_qualifier(always_inline:)
+          always_inline ? 'static inline __attribute__((always_inline))' : 'static inline'
+        end
+
+        def prefer_always_inline_for_define?(fn)
+          return false if ENV['RHDL_ARC_TO_GPU_DISABLE_ALWAYS_INLINE'] == '1'
+          return true if ENV['RHDL_ARC_TO_GPU_FORCE_ALWAYS_INLINE'] == '1'
+
+          inline_op_limit = ENV.fetch('RHDL_ARC_TO_GPU_ALWAYS_INLINE_MAX_OPS', '12').to_i
+          inline_op_limit = 12 if inline_op_limit <= 0
+          inline_return_limit = ENV.fetch('RHDL_ARC_TO_GPU_ALWAYS_INLINE_MAX_RETURNS', '2').to_i
+          inline_return_limit = 2 if inline_return_limit <= 0
+
+          op_count = fn.fetch(:ops).length
+          return false if op_count > inline_op_limit
+
+          fn.fetch(:return_types).length <= inline_return_limit
+        end
+
+        def emit_wide_helpers # returns MSL text defining the RhdlWide struct (low word x, high word y) and its helper functions
+          <<~MSL
+            struct RhdlWide {
+              uint x;
+              uint y;
+            };
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_make(uint lo, uint hi) {
+              RhdlWide v;
+              v.x = lo;
+              v.y = hi;
+              return v;
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_mask(RhdlWide value, uint width) {
+              if (width >= 64u) { return value; }
+              if (width == 0u) { return rhdl_wide_make(0u, 0u); }
+              if (width <= 32u) {
+                uint mask = (width == 32u) ? 0xFFFFFFFFu : ((1u << width) - 1u);
+                return rhdl_wide_make(value.x & mask, 0u);
+              }
+              uint hi_width = width - 32u;
+              uint hi_mask = (hi_width == 32u) ? 0xFFFFFFFFu : ((1u << hi_width) - 1u);
+              return rhdl_wide_make(value.x, value.y & hi_mask);
+            }
+
+            static inline __attribute__((always_inline)) bool rhdl_wide_eq(RhdlWide lhs, RhdlWide rhs) {
+              return lhs.x == rhs.x && lhs.y == rhs.y;
+            }
+
+            static inline __attribute__((always_inline)) bool rhdl_wide_ne_zero(RhdlWide value) {
+              return (value.x | value.y) != 0u;
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_or(RhdlWide lhs, RhdlWide rhs) {
+              return rhdl_wide_make(lhs.x | rhs.x, lhs.y | rhs.y);
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_xor(RhdlWide lhs, RhdlWide rhs) {
+              return rhdl_wide_make(lhs.x ^ rhs.x, lhs.y ^ rhs.y);
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_and(RhdlWide lhs, RhdlWide rhs) {
+              return rhdl_wide_make(lhs.x & rhs.x, lhs.y & rhs.y);
+            }
+
+            static inline __attribute__((always_inline)) ulong rhdl_wide_to_ulong(RhdlWide value) {
+              return (ulong(value.y) << 32u) | ulong(value.x);
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_from_ulong(ulong value) {
+              return rhdl_wide_make(uint(value & 0xFFFFFFFFul), uint((value >> 32u) & 0xFFFFFFFFul));
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_add(RhdlWide lhs, RhdlWide rhs) {
+              return rhdl_wide_from_ulong(rhdl_wide_to_ulong(lhs) + rhdl_wide_to_ulong(rhs));
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_sub(RhdlWide lhs, RhdlWide rhs) {
+              return rhdl_wide_from_ulong(rhdl_wide_to_ulong(lhs) - rhdl_wide_to_ulong(rhs));
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_mul(RhdlWide lhs, RhdlWide rhs) {
+              return rhdl_wide_from_ulong(rhdl_wide_to_ulong(lhs) * rhdl_wide_to_ulong(rhs));
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_shlu(RhdlWide value, uint shift) {
+              if (shift >= 64u) { return rhdl_wide_make(0u, 0u); }
+              if (shift == 0u) { return value; }
+              if (shift < 32u) {
+                uint lo = value.x << shift;
+                uint hi = (value.y << shift) | (value.x >> (32u - shift));
+                return rhdl_wide_make(lo, hi);
+              }
+              if (shift == 32u) {
+                return rhdl_wide_make(0u, value.x);
+              }
+              return rhdl_wide_make(0u, value.x << (shift - 32u));
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_wide_shru(RhdlWide value, uint shift) {
+              if (shift >= 64u) { return rhdl_wide_make(0u, 0u); }
+              if (shift == 0u) { return value; }
+              if (shift < 32u) {
+                uint lo = (value.x >> shift) | (value.y << (32u - shift));
+                uint hi = (value.y >> shift);
+                return rhdl_wide_make(lo, hi);
+              }
+              if (shift == 32u) {
+                return rhdl_wide_make(value.y, 0u);
+              }
+              return rhdl_wide_make(value.y >> (shift - 32u), 0u);
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_load_wide_state(device uint* state_slots, uint base, uint width) {
+              RhdlWide value = rhdl_wide_make(state_slots[base], state_slots[base + 1u]);
+              return rhdl_wide_mask(value, width);
+            }
+
+            static inline __attribute__((always_inline)) RhdlWide rhdl_load_wide_state(thread uint* state_slots, uint base, uint width) {
+              RhdlWide value = rhdl_wide_make(state_slots[base], state_slots[base + 1u]);
+              return rhdl_wide_mask(value, width);
+            }
+
+            static inline __attribute__((always_inline)) void rhdl_store_wide_state(device uint* state_slots, uint base, RhdlWide value, uint width) {
+              RhdlWide masked = rhdl_wide_mask(value, width);
+              state_slots[base] = masked.x;
+              state_slots[base + 1u] = masked.y;
+            }
+
+            static inline __attribute__((always_inline)) void rhdl_store_wide_state(thread uint* state_slots, uint base, RhdlWide value, uint width) {
+              RhdlWide masked = rhdl_wide_mask(value, width);
+              state_slots[base] = masked.x;
+              state_slots[base + 1u] = masked.y;
+            }
+          MSL
+        end
+
+        def mask_value(value, width)
+          return value if width >= 64
+
+          mask = (1 << width) - 1
+          value & mask
+        end
+
+        def mask_const(width)
+          if width >= scalar_width_bits
+            scalar_full_mask_const
+          else
+            format("0x%X%s", (1 << width) - 1, scalar_width_bits > 32 ? 'ul' : 'u')
+          end
+        end
+
+        def masked_expr(expr, type)
+          return expr unless type&.scalar?
+
+          width = type.fetch(:width)
+          if wide_scalar?(type)
+            return "rhdl_wide_mask((#{expr}), #{width}u)"
+          end
+
+          return expr if width >= scalar_width_bits
+
+          "((#{expr}) & #{mask_const(width)})"
+        end
+
+        def constant_literal(value, type)
+          return scalar_zero_literal unless type&.scalar?
+
+          masked = mask_value(value.to_i, type.fetch(:width))
+          if wide_scalar?(type)
+            lo = masked & 0xFFFFFFFF
+            hi = (masked >> 32) & 0xFFFFFFFF
+            return format('rhdl_wide_make(0x%Xu, 0x%Xu)', lo, hi)
+          end
+
+          format("0x%X%s", masked, scalar_width_bits > 32 ? 'ul' : 'u')
+        end
+
+        def split_top_level(text)
+          parts = []
+          current = +''
+          depth_angle = 0
+          depth_paren = 0
+          depth_square = 0
+
+          text.to_s.each_char do |ch|
+            case ch
+            when '<'
+              depth_angle += 1
+            when '>'
+              depth_angle -= 1 if depth_angle.positive?
+            when '('
+              depth_paren += 1
+            when ')'
+              depth_paren -= 1 if depth_paren.positive?
+            when '['
+              depth_square += 1
+            when ']'
+              depth_square -= 1 if depth_square.positive?
+            when ','
+              if depth_angle.zero? && depth_paren.zero? && depth_square.zero?
+                parts << current.strip
+                current = +''
+                next
+              end
+            end
+            current << ch
+          end
+
+          parts << current.strip unless current.strip.empty?
+          parts
+        end
+
+        def clean_line(line)
+          line.to_s.split('//', 2).first.to_s
+        end
+
+        def command_available?(tool)
+          ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |path|
+            File.executable?(File.join(path, tool))
+          end
+        end
+
+        def indent_lines(lines, spaces: 2)
+          prefix = ' ' * spaces
+          lines.map { |line| "#{prefix}#{line}" }.join("\n")
+        end
+      end
+    end
+  end
+end
diff --git a/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/apple2.rb b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/apple2.rb
new file mode 100644
index 00000000..65684064
--- /dev/null
+++ b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/apple2.rb
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module RHDL
+  module Codegen
+    module FIRRTL
+      module ArcToGpuLowering
+        module Profiles
+          # Apple2 lowering profile: port lists plus the per-stage hooks that route
+          # lowering through `lowerer.emit_metal_source_apple2`.
+          module Apple2
+            module_function
+
+            # @return the ArcToGpuLowering::REQUIRED_APPLE2_INPUTS constant
+            def required_inputs
+              ArcToGpuLowering::REQUIRED_APPLE2_INPUTS
+            end
+
+            # @return the ArcToGpuLowering::REQUIRED_APPLE2_OUTPUTS constant
+            def required_outputs
+              ArcToGpuLowering::REQUIRED_APPLE2_OUTPUTS
+            end
+
+            # Unconditionally delegates to `lowerer.optimize_arc_mlir_source`.
+            def prepare_source(source:, lowerer:)
+              lowerer.optimize_arc_mlir_source(source)
+            end
+
+            # Wide-scalar packing is requested only when the inferred width exceeds 32 bits.
+            def pack_wide_scalars?(inferred_scalar_bits:)
+              inferred_scalar_bits > 32
+            end
+
+            # Call flattening is opt-in: only RHDL_ARC_TO_GPU_FLATTEN=1 enables it;
+            # any other value returns the parse result untouched.
+            def post_parse_transform(parsed:, lowerer:)
+              return parsed unless ENV['RHDL_ARC_TO_GPU_FLATTEN'] == '1'
+
+              lowerer.flatten_simple_arc_calls(parsed, max_ops: 12, max_depth: 2)
+            end
+
+            # Runs `lowerer.emit_metal_source_apple2` inside `lowerer.with_scalar_config`.
+            # `gem_kernel_interpreter` is accepted but ignored by this profile.
+            # @return whatever the `with_scalar_config` call returns
+            def emit_metal_source(
+              lowerer:,
+              parsed:,
+              state_layout:,
+              metal_entry:,
+              scalar_bits:,
+              pack_wide_scalars:,
+              gem_kernel_interpreter: false
+            )
+              _ = gem_kernel_interpreter
+              lowerer.with_scalar_config(scalar_bits, pack_wide_scalars: pack_wide_scalars) do
+                lowerer.emit_metal_source_apple2(parsed: parsed, state_layout: state_layout, metal_entry: metal_entry)
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/cpu8bit.rb b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/cpu8bit.rb
new file mode 100644
index 00000000..c5b02892
--- /dev/null
+++ b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/cpu8bit.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module RHDL
+  module Codegen
+    module FIRRTL
+      module ArcToGpuLowering
+        module Profiles
+          # Cpu8bit lowering profile: port lists plus the per-stage hooks that route
+          # lowering through the generic `lowerer.emit_metal_source`.
+          module Cpu8bit
+            module_function
+
+            # @return the ArcToGpuLowering::REQUIRED_TOP_INPUTS constant
+            def required_inputs
+              ArcToGpuLowering::REQUIRED_TOP_INPUTS
+            end
+
+            # @return the ArcToGpuLowering::REQUIRED_TOP_OUTPUTS constant
+            def required_outputs
+              ArcToGpuLowering::REQUIRED_TOP_OUTPUTS
+            end
+
+            # Source optimisation is opt-in: only RHDL_ARC_TO_GPU_OPT_ALL=1 sends the source
+            # through `lowerer.optimize_arc_mlir_source`; otherwise it is returned as-is.
+            def prepare_source(source:, lowerer:)
+              return source unless ENV['RHDL_ARC_TO_GPU_OPT_ALL'] == '1'
+
+              lowerer.optimize_arc_mlir_source(source)
+            end
+
+            # Never packs wide scalars, whatever width was inferred.
+            def pack_wide_scalars?(inferred_scalar_bits:)
+              _ = inferred_scalar_bits
+              false
+            end
+
+            # No post-parse rewriting for this profile; the parse result passes through.
+            def post_parse_transform(parsed:, lowerer:)
+              _ = lowerer
+              parsed
+            end
+
+            # Forwards every argument to `lowerer.emit_metal_source` and adds the
+            # emitter switches this profile always uses.
+            def emit_metal_source(
+              lowerer:,
+              parsed:,
+              state_layout:,
+              metal_entry:,
+              scalar_bits:,
+              pack_wide_scalars:,
+              gem_kernel_interpreter: false
+            )
+              fixed_switches = {
+                use_state_snapshot: false,
+                split_post_comb_liveness: true,
+                trust_state_masks: true,
+                load_state_in_comb_fn: true,
+                eval_always_inline: true,
+                schedule_aware_emit: true
+              }
+              lowerer.emit_metal_source(
+                parsed: parsed,
+                state_layout: state_layout,
+                metal_entry: metal_entry,
+                scalar_bits: scalar_bits,
+                pack_wide_scalars: pack_wide_scalars,
+                gem_kernel_interpreter: gem_kernel_interpreter,
+                **fixed_switches
+              )
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/riscv.rb b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/riscv.rb
new file mode 100644
index 00000000..9be9efbb
--- /dev/null
+++ b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/riscv.rb
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+module RHDL
+  module Codegen
+    module FIRRTL
+      module ArcToGpuLowering
+        module Profiles
+          # Riscv lowering profile: port lists, transform limits and the emitter
+          # switches handed to `lowerer.emit_metal_source_riscv`.
+          module Riscv
+            module_function
+
+            def required_inputs
+              ArcToGpuLowering::REQUIRED_RISCV_INPUTS
+            end
+
+            def required_outputs
+              ArcToGpuLowering::REQUIRED_RISCV_OUTPUTS
+            end
+
+            # Runtime inputs are exactly the required inputs.
+            def runtime_input_names
+              required_inputs
+            end
+
+            def runtime_output_names
+              ArcToGpuLowering::RUNTIME_RISCV_OUTPUTS
+            end
+
+            # Unconditionally delegates to `lowerer.optimize_arc_mlir_source`.
+            def prepare_source(source:, lowerer:)
+              lowerer.optimize_arc_mlir_source(source)
+            end
+
+            # Always packs wide scalars; the inferred width is not consulted.
+            def pack_wide_scalars?(inferred_scalar_bits:)
+              _ = inferred_scalar_bits
+              true
+            end
+
+            def narrow_scalar_types?
+              true
+            end
+
+            # Flattens simple arc calls (bounded by flatten_max_ops / flatten_max_depth),
+            # folds constant array gets, then prunes unreachable functions.
+            def post_parse_transform(parsed:, lowerer:)
+              limits = { max_ops: flatten_max_ops, max_depth: flatten_max_depth }
+              flattened = lowerer.flatten_simple_arc_calls(parsed, **limits)
+              folded = lowerer.fold_constant_array_gets(flattened)
+              lowerer.prune_unreachable_functions(folded)
+            end
+
+            def flatten_max_ops
+              96
+            end
+
+            def flatten_max_depth
+              6
+            end
+
+            def dirty_settle_enabled?
+              false
+            end
+
+            def scheduled_emit_enabled?
+              false
+            end
+
+            def split_low_wdata_eval_enabled?
+              true
+            end
+
+            def split_high_data_addr_eval_enabled?
+              true
+            end
+
+            def split_low_data_addr_eval_enabled?
+              true
+            end
+
+            # 'levelized' when scheduled emit is enabled, 'legacy' otherwise.
+            def schedule_mode
+              return 'levelized' if scheduled_emit_enabled?
+
+              'legacy'
+            end
+
+            # The three fast_*_mode labels report 'split' when the matching
+            # split_*_eval_enabled? switch is on and 'inline' when it is off.
+            def fast_low_wdata_mode
+              return 'split' if split_low_wdata_eval_enabled?
+
+              'inline'
+            end
+
+            def fast_high_data_addr_mode
+              return 'split' if split_high_data_addr_eval_enabled?
+
+              'inline'
+            end
+
+            def fast_low_data_addr_mode
+              return 'split' if split_low_data_addr_eval_enabled?
+
+              'inline'
+            end
+
+            # Runs `lowerer.emit_metal_source_riscv` inside `lowerer.with_scalar_config`,
+            # passing this profile's switches. `gem_kernel_interpreter` is accepted but unused.
+            def emit_metal_source(
+              lowerer:,
+              parsed:,
+              state_layout:,
+              metal_entry:,
+              scalar_bits:,
+              pack_wide_scalars:,
+              gem_kernel_interpreter: false
+            )
+              _ = gem_kernel_interpreter
+              scalar_options = { pack_wide_scalars: pack_wide_scalars, narrow_scalar_types: narrow_scalar_types? }
+              lowerer.with_scalar_config(scalar_bits, **scalar_options) do
+                lowerer.emit_metal_source_riscv(
+                  parsed: parsed,
+                  state_layout: state_layout,
+                  metal_entry: metal_entry,
+                  dirty_settle_enabled: dirty_settle_enabled?,
+                  schedule_aware_emit: scheduled_emit_enabled?,
+                  split_low_wdata_eval: split_low_wdata_eval_enabled?,
+                  split_high_data_addr_eval: split_high_data_addr_eval_enabled?,
+                  split_low_data_addr_eval: split_low_data_addr_eval_enabled?,
+                  runtime_output_names: runtime_output_names
+                )
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/riscv_netlist.rb b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/riscv_netlist.rb
new file mode 100644
index 00000000..acb0c484
--- /dev/null
+++ b/lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/riscv_netlist.rb
@@ -0,0 +1,140 @@
+# frozen_string_literal: true
+
+module RHDL
+  module Codegen
+    module FIRRTL
+      module ArcToGpuLowering
+        module Profiles
+          # RiscvNetlist lowering profile: port lists, transform limits and the emitter
+          # switches handed to `lowerer.emit_metal_source_riscv`.
+          module RiscvNetlist
+            module_function
+
+            def required_inputs
+              ArcToGpuLowering::REQUIRED_RISCV_INPUTS
+            end
+
+            def required_outputs
+              ArcToGpuLowering::REQUIRED_RISCV_OUTPUTS
+            end
+
+            # Runtime inputs are exactly the required inputs.
+            def runtime_input_names
+              required_inputs
+            end
+
+            def runtime_output_names
+              ArcToGpuLowering::RUNTIME_RISCV_OUTPUTS
+            end
+
+            # Unconditionally delegates to `lowerer.optimize_arc_mlir_source`.
+            def prepare_source(source:, lowerer:)
+              lowerer.optimize_arc_mlir_source(source)
+            end
+
+            # Always packs wide scalars; the inferred width is not consulted.
+            def pack_wide_scalars?(inferred_scalar_bits:)
+              _ = inferred_scalar_bits
+              true
+            end
+
+            def narrow_scalar_types?
+              true
+            end
+
+            # Flattens simple arc calls, folds constant array gets, then prunes unreachable
+            # functions; if pruning leaves no functions the unpruned result is returned.
+            def post_parse_transform(parsed:, lowerer:)
+              limits = { max_ops: flatten_max_ops, max_depth: flatten_max_depth }
+              flattened = lowerer.flatten_simple_arc_calls(parsed, **limits)
+              folded = lowerer.fold_constant_array_gets(flattened)
+              pruned = lowerer.prune_unreachable_functions(folded)
+              pruned.fetch(:functions).empty? ? folded : pruned
+            end
+
+            def flatten_max_ops
+              96
+            end
+
+            def flatten_max_depth
+              6
+            end
+
+            def dirty_settle_enabled?
+              false
+            end
+
+            def scheduled_emit_enabled?
+              false
+            end
+
+            def split_low_wdata_eval_enabled?
+              true
+            end
+
+            def split_high_data_addr_eval_enabled?
+              true
+            end
+
+            def split_low_data_addr_eval_enabled?
+              true
+            end
+
+            # Fixed label; unlike the fast_*_mode labels it does not depend on any switch.
+            def schedule_mode
+              'netlist_aig_legacy'
+            end
+
+            # The three fast_*_mode labels report 'split' when the matching
+            # split_*_eval_enabled? switch is on and 'inline' when it is off.
+            def fast_low_wdata_mode
+              return 'split' if split_low_wdata_eval_enabled?
+
+              'inline'
+            end
+
+            def fast_high_data_addr_mode
+              return 'split' if split_high_data_addr_eval_enabled?
+
+              'inline'
+            end
+
+            def fast_low_data_addr_mode
+              return 'split' if split_low_data_addr_eval_enabled?
+
+              'inline'
+            end
+
+            # Runs `lowerer.emit_metal_source_riscv` inside `lowerer.with_scalar_config`,
+            # passing this profile's switches. `gem_kernel_interpreter` is accepted but unused.
+            def emit_metal_source(
+              lowerer:,
+              parsed:,
+              state_layout:,
+              metal_entry:,
+              scalar_bits:,
+              pack_wide_scalars:,
+              gem_kernel_interpreter: false
+            )
+              _ = gem_kernel_interpreter
+              scalar_options = { pack_wide_scalars: pack_wide_scalars, narrow_scalar_types: narrow_scalar_types? }
+              lowerer.with_scalar_config(scalar_bits, **scalar_options) do
+                lowerer.emit_metal_source_riscv(
+                  parsed: parsed,
+                  state_layout: state_layout,
+                  metal_entry: metal_entry,
+                  dirty_settle_enabled: dirty_settle_enabled?,
+                  schedule_aware_emit: scheduled_emit_enabled?,
+                  split_low_wdata_eval: split_low_wdata_eval_enabled?,
+                  split_high_data_addr_eval: split_high_data_addr_eval_enabled?,
+                  split_low_data_addr_eval: split_low_data_addr_eval_enabled?,
+                  runtime_output_names: runtime_output_names
+                )
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/rhdl/codegen/firrtl/firrtl.rb b/lib/rhdl/codegen/firrtl/firrtl.rb
new file mode 100644
index 00000000..48b14274
--- /dev/null
+++ b/lib/rhdl/codegen/firrtl/firrtl.rb
@@ -0,0 +1,805 @@
+# FIRRTL code generator for CIRCT toolchain
+# Generates FIRRTL 5.1.0 format that can be compiled by firtool to Verilog
+
+require_relative "../ir/ir"
+
+module RHDL
+  module Codegen
+    module FIRRTL
+        FIRRTL_KEYWORDS = %w[
+          circuit module input output wire reg node when else skip stop printf
+          mux validif add sub mul div rem lt leq gt geq eq neq pad asUInt asSInt
+          asClock asFixedPoint asInterval shl shr dshl dshr cvt neg not and or xor
+          andr orr xorr cat bits head tail connect
+        ].freeze # each word listed once (`mux` and `validif` used to appear twice)
+
+        module_function
+
+        # Emit a standalone FIRRTL 5.1.0 circuit whose single public module is module_def
+        def generate(module_def)
+          [
+            "FIRRTL version 5.1.0",
+            "circuit #{sanitize(module_def.name)}:",
+            generate_module_body(module_def, is_public: true)
+          ].join("\n")
+        end
+
+        # Emit one FIRRTL 5.1.0 circuit containing every module in module_defs.
+        # @param module_defs [Array<IR::ModuleDef>] definitions in emit order; the last one is the public top
+        # @param top_name [String] circuit name (sanitized before use)
+        # @return [String] circuit text with a blank line after each non-top module
+        def generate_hierarchy(module_defs, top_name:)
+          out = ["FIRRTL version 5.1.0", "circuit #{sanitize(top_name)}:"]
+
+          # Sanitized module name -> definition, so instances can look up submodule ports.
+          by_name = module_defs.each_with_object({}) do |definition, index|
+            index[sanitize(definition.name)] = definition
+          end
+
+          # Only the final definition is emitted as `public module`.
+          top_position = module_defs.length - 1
+          module_defs.each_with_index do |definition, position|
+            public_top = position == top_position
+            out << generate_module_body(definition, is_public: public_top, module_map: by_name)
+            out << "" unless public_top # separator between modules
+          end
+
+          out.join("\n")
+        end
+
+        # Generate just the module body (without circuit header)
+        # @param module_def [IR::ModuleDef] The module definition
+        # @param is_public [Boolean] Whether this is the public top-level module
+        # @param module_map [Hash{String => IR::ModuleDef}] Sanitized module name => definition; used to default unconnected instance inputs
+        # @return [String] The module body FIRRTL code
+        def generate_module_body(module_def, is_public: false, module_map: {})
+          lines = []
+          module_keyword = is_public ? "public module" : "module"
+          lines << "  #{module_keyword} #{sanitize(module_def.name)}:"
+
+          # Build set of all port names (for collision detection)
+          all_port_names = module_def.ports.map { |p| sanitize(p.name) }.to_set
+          output_ports = module_def.ports.select { |p| p.direction == :out }.map(&:name).to_set
+          output_widths = module_def.ports.select { |p| p.direction == :out }.map { |p| [p.name, p.width] }.to_h
+
+          # Find clock for sequential processes or from write ports
+          clock = find_clock_for_reg(module_def, nil) || find_clock_from_write_ports(module_def)
+
+          # Collect targets of sequential assignments that are outputs (need internal registers)
+          seq_targets = collect_seq_targets(module_def)
+          seq_output_targets = seq_targets.select { |t| output_ports.include?(t) }
+
+          # Track port-colliding registers (handled by seq_output_targets instead)
+          port_colliding_regs = Set.new
+          module_def.regs.each do |reg|
+            sname = sanitize(reg.name)
+            port_colliding_regs.add(reg.name) if all_port_names.include?(sname)
+          end
+
+          # Collect memory reads to set up read ports
+          memory_reads = collect_memory_reads(module_def)
+
+          # Ports
+          module_def.ports.each do |port|
+            dir = port.direction == :in ? "input" : "output"
+            lines << "    #{dir} #{sanitize(port.name)}: #{type_decl(port.width)}"
+          end
+
+          lines << "" unless module_def.ports.empty?
+
+          # Build set of register names for deduplication
+          reg_names_set = module_def.regs.map { |r| sanitize(r.name) }.to_set
+
+          # Internal wires (nets) - skip duplicates and names that are registers
+          emitted_names = all_port_names.dup
+          module_def.nets.each do |net|
+            sname = sanitize(net.name)
+            next if emitted_names.include?(sname)
+            next if reg_names_set.include?(sname)  # register declaration takes precedence
+            emitted_names.add(sname)
+            lines << "    wire #{sname}: #{type_decl(net.width)}"
+          end
+
+          # Explicit registers from IR (skip those that collide with port names;
+          # seq_output_targets creates _reg suffixed registers for those)
+          module_def.regs.each do |reg|
+            sname = sanitize(reg.name)
+            next if port_colliding_regs.include?(reg.name)
+            next if emitted_names.include?(sname)
+            emitted_names.add(sname)
+            reg_clock = find_clock_for_reg(module_def, reg.name) || clock
+            if reg_clock
+              lines << "    reg #{sname}: #{type_decl(reg.width)}, #{clock_expr(reg_clock)}"
+            else
+              lines << "    wire #{sname}: #{type_decl(reg.width)}"
+            end
+          end
+
+          # Auto-declare any signals referenced in expressions but not yet declared
+          all_refs = collect_all_signal_refs(module_def)
+          mem_names = module_def.memories.map { |m| sanitize(m.name) }.to_set
+          auto_wires = []
+          all_refs.each do |ref_name|
+            sname = sanitize(ref_name)
+            next if emitted_names.include?(sname)
+            next if mem_names.include?(sname)
+            emitted_names.add(sname)
+            auto_wires << sname
+            lines << "    wire #{sname}: UInt<16>"
+          end
+
+          # Create internal registers for output ports that have sequential assignments
+          seq_output_targets.each do |target|
+            width = output_widths[target] || 1
+            reg_name = "#{target}_reg"
+            if clock
+              lines << "    reg #{sanitize(reg_name)}: #{type_decl(width)}, #{clock_expr(clock)}"
+            end
+          end
+
+          # Pre-compute read counts per memory for consistent port naming
+          # Normalize memory names to strings for consistent lookup
+          reads_per_mem = Hash.new(0)
+          memory_reads.each { |mr| reads_per_mem[mr.memory.to_s] += 1 }
+
+          # Identify ROM memories (have initial_data and no write ports)
+          # These will be converted to combinational lookup tables instead of mem blocks
+          rom_memory_names = Set.new
+          rom_memory_map = {} # mem_key => Memory
+          module_def.memories.each do |mem|
+            next unless mem.initial_data
+            mem_key = mem.name.to_s
+            write_count = module_def.write_ports.count { |wp| wp.memory.to_s == mem_key }
+            if write_count == 0
+              rom_memory_names.add(mem_key)
+              rom_memory_map[mem_key] = mem
+            end
+          end
+
+          # Pre-compute wire names for ROM lookup results (only for ROMs with actual reads)
+          rom_read_wires = {} # [mem_key, idx] => wire_name
+          rom_memory_names.each do |mem_key|
+            total_reads = reads_per_mem[mem_key]
+            next if total_reads == 0  # No actual reads; skip wire declaration
+            total_reads.times do |i|
+              wire_name = total_reads > 1 ? "_rom_#{sanitize(mem_key)}_#{i}" : "_rom_#{sanitize(mem_key)}"
+              rom_read_wires[[mem_key, i]] = wire_name
+            end
+          end
+
+          # Memory arrays - determine read/write ports needed (skip ROMs)
+          module_def.memories.each do |mem|
+            mem_key = mem.name.to_s
+            next if rom_memory_names.include?(mem_key)
+            # Count how many read ports this memory needs
+            read_count = reads_per_mem[mem_key]
+            read_count = 1 if read_count == 0 # At least one reader if memory exists
+            # For dual-port RAM with 2 reads and 2 writes, we need multiple ports
+            write_count = module_def.write_ports.count { |wp| wp.memory.to_s == mem_key }
+
+            lines << "    mem #{sanitize(mem.name)}:"
+            lines << "      data-type => #{type_decl(mem.width)}"
+            lines << "      depth => #{mem.depth}"
+            lines << "      read-latency => 0"
+            lines << "      write-latency => 1"
+
+            # Generate reader declarations - use numbered ports if multiple readers
+            if read_count == 1
+              lines << "      reader => read"
+            else
+              read_count.times { |i| lines << "      reader => read#{i}" }
+            end
+
+            # Generate writer declarations only if there are write ports
+            if write_count == 1
+              lines << "      writer => write"
+            elsif write_count > 1
+              write_count.times { |i| lines << "      writer => write#{i}" }
+            end
+            # No writer declaration for read-only memories (write_count == 0)
+          end
+
+          # Declare wires for ROM lookup results (only for ROMs with actual reads)
+          rom_memory_names.each do |mem_key|
+            next if reads_per_mem[mem_key] == 0
+            mem = rom_memory_map[mem_key]
+            reads_per_mem[mem_key].times do |i|
+              wire_name = rom_read_wires[[mem_key, i]]
+              lines << "    wire #{wire_name}: #{type_decl(mem.width)}"
+            end
+          end
+
+          # For memories without a clock input, generate a fake clock
+          needs_fake_clock = !module_def.memories.empty? && !clock
+          if needs_fake_clock
+            lines << "    wire _fake_clk: Clock"
+            lines << "    connect _fake_clk, asClock(UInt<1>(0))"
+          end
+
+          has_decls = !module_def.nets.empty? || !module_def.regs.empty? ||
+                      !module_def.memories.empty? || !seq_output_targets.empty? || needs_fake_clock
+          lines << "" if has_decls
+
+          # ROM combinational lookup logic
+          rom_read_idx = Hash.new(0)
+          memory_reads.each do |mr|
+            mem_key = mr.memory.to_s
+            next unless rom_memory_names.include?(mem_key)
+            mem = rom_memory_map[mem_key]
+            idx = rom_read_idx[mem_key]
+            wire_name = rom_read_wires[[mem_key, idx]]
+            addr_str = expr(mr.addr)
+            lookup_str = rom_lookup_expr(mem.initial_data, mem.width, mem.depth, addr_str)
+            lines << "    connect #{wire_name}, #{lookup_str}"
+            rom_read_idx[mem_key] += 1
+          end
+
+          # Memory read port connections - must come before assigns that use them
+          # Track which memory read we're on for multi-port memories
+          mem_read_index = Hash.new(0)
+          memory_reads.each do |mr|
+            mem_key = mr.memory.to_s
+            next if rom_memory_names.include?(mem_key)
+            mem_name = sanitize(mr.memory)
+            idx = mem_read_index[mem_key]
+            # Use numbered ports only if multiple readers
+            read_port = reads_per_mem[mem_key] > 1 ? "read#{idx}" : "read"
+
+            # Use real clock if available, otherwise fake clock
+            clk_signal = clock ? clock_expr(clock) : "_fake_clk"
+            lines << "    connect #{mem_name}.#{read_port}.clk, #{clk_signal}"
+            lines << "    connect #{mem_name}.#{read_port}.en, UInt<1>(1)"
+            lines << "    connect #{mem_name}.#{read_port}.addr, #{expr(mr.addr)}"
+            mem_read_index[mem_key] += 1
+          end
+
+          # Initialize auto-declared wires to zero (prevent uninitialized sink errors)
+          auto_wires.each do |sname|
+            lines << "    connect #{sname}, UInt<16>(0)"
+          end
+
+          # Connect unread memory ports to defaults
+          read_mem_names = memory_reads.map { |mr| mr.memory.to_s }.to_set
+          module_def.memories.each do |mem|
+            next if rom_memory_names.include?(mem.name.to_s)
+            next if read_mem_names.include?(mem.name.to_s)
+            mem_name = sanitize(mem.name)
+            clk_signal = clock ? clock_expr(clock) : "asClock(UInt<1>(0))"
+            lines << "    connect #{mem_name}.read.clk, #{clk_signal}"
+            lines << "    connect #{mem_name}.read.en, UInt<1>(0)"
+            lines << "    connect #{mem_name}.read.addr, UInt<1>(0)"
+          end
+
+          # Continuous assignments - use memory read port data
+          mem_read_index = Hash.new(0)
+          module_def.assigns.each do |assign|
+            lines << "    connect #{sanitize(assign.target)}, #{expr_with_mem_reads(assign.expr, memory_reads, mem_read_index, rom_read_wires: rom_read_wires)}"
+          end
+
+          # Connect output ports to their internal registers
+          seq_output_targets.each do |target|
+            lines << "    connect #{sanitize(target)}, #{sanitize("#{target}_reg")}"
+          end
+
+          # Sequential processes (register updates)
+          # Rewrite targets that are outputs to use internal registers
+          module_def.processes.each do |process|
+            if process.clocked
+              process.statements.each do |stmt|
+                lines.concat(statement(stmt, indent: 4, output_regs: seq_output_targets,
+                                       memory_reads: memory_reads, mem_read_index: mem_read_index,
+                                       rom_read_wires: rom_read_wires))
+              end
+            else
+              # Combinational process - convert to assignments
+              process.statements.each do |stmt|
+                lines.concat(statement(stmt, indent: 4, output_regs: Set.new,
+                                       memory_reads: memory_reads, mem_read_index: mem_read_index,
+                                       rom_read_wires: rom_read_wires))
+              end
+            end
+          end
+
+          # Memory write ports (memory names compared as strings, matching the writer declarations above)
+          write_port_index = Hash.new(0)
+          module_def.write_ports.each do |wp|
+            mem_name = sanitize(wp.memory)
+            mem_key = wp.memory.to_s
+            idx = write_port_index[mem_key]
+            write_count = module_def.write_ports.count { |w| w.memory.to_s == mem_key }
+            write_port = write_count > 1 ? "write#{idx}" : "write"
+
+            # Find memory width for mask
+            mem = module_def.memories.find { |m| m.name.to_s == mem_key }
+            mem_width = mem&.width || 8
+
+            lines << "    connect #{mem_name}.#{write_port}.clk, #{clock_expr(wp.clock)}"
+            lines << "    connect #{mem_name}.#{write_port}.en, #{expr(wp.enable)}"
+            lines << "    connect #{mem_name}.#{write_port}.addr, #{expr(wp.addr)}"
+            lines << "    connect #{mem_name}.#{write_port}.mask, UInt<#{mem_width}>(#{(1 << mem_width) - 1})"
+            lines << "    connect #{mem_name}.#{write_port}.data, #{expr(wp.data)}"
+            write_port_index[mem_key] += 1
+          end
+
+          # Module instances
+          module_def.instances.each do |instance|
+            lines << ""
+            lines << "    inst #{sanitize(instance.name)} of #{sanitize(instance.module_name)}"
+
+            # Get connected port names
+            connected_ports = instance.connections.map { |c| c.port_name.to_sym }.to_set
+
+            # Generate explicit connections
+            instance.connections.each do |conn|
+              signal_str = conn.signal.is_a?(String) ? sanitize(conn.signal) : expr(conn.signal)
+              inst_port = "#{sanitize(instance.name)}.#{sanitize(conn.port_name)}"
+              # FIRRTL flow: input ports are sinks (connect TO them), output ports are sources (connect FROM them)
+              if conn.direction == :out
+                lines << "    connect #{signal_str}, #{inst_port}"
+              else
+                lines << "    connect #{inst_port}, #{signal_str}"
+              end
+            end
+
+            # Add default connections for unconnected input ports (if module_map is available)
+            submod_def = module_map[sanitize(instance.module_name)]
+            if submod_def
+              submod_def.ports.each do |port|
+                next unless port.direction == :in  # Only need to connect inputs (sinks)
+                next if connected_ports.include?(port.name.to_sym)
+
+                inst_port = "#{sanitize(instance.name)}.#{sanitize(port.name)}"
+                # Use appropriate default: Clock ports get a clock, others get 0
+                if clock_port?(port.name)
+                  # Find the clock port in the current module if available
+                  clock_port = module_def.ports.find { |p| clock_port?(p.name) && p.direction == :in }
+                  if clock_port
+                    lines << "    connect #{inst_port}, #{sanitize(clock_port.name)}"
+                  else
+                    lines << "    connect #{inst_port}, asClock(UInt<1>(0))"
+                  end
+                else
+                  lines << "    connect #{inst_port}, #{literal(0, port.width)}"
+                end
+              end
+            end
+          end
+
+          lines.join("\n")
+        end
+
+        private_class_method :generate_module_body
+
+        def find_clock_from_write_ports(module_def) # clock of the first write port; nil when there are none
+          module_def.write_ports.first&.clock
+        end
+
+        def collect_memory_reads(module_def)
+          # Every IR::MemoryRead in the module, in traversal order: continuous assigns
+          # first, then each process's statements. generate_module_body numbers read
+          # ports by position in this list, so the order matters.
+          [].tap do |found|
+            module_def.assigns.each { |assignment| collect_memory_reads_from_expr(assignment.expr, found) }
+
+            module_def.processes.each do |proc_def|
+              proc_def.statements.each { |node| collect_memory_reads_from_stmt(node, found) }
+            end
+          end
+        end
+
+        def collect_memory_reads_from_expr(expr_node, reads)
+          # Depth-first, operand-order walk that appends each IR::MemoryRead to `reads`.
+          # A MemoryRead is recorded without descending into its own fields, and node
+          # kinds with no branch below add nothing.
+          walk = ->(child) { collect_memory_reads_from_expr(child, reads) }
+
+          case expr_node
+          when IR::MemoryRead then reads << expr_node
+          when IR::UnaryOp then walk.call(expr_node.operand)
+          when IR::BinaryOp
+            walk.call(expr_node.left)
+            walk.call(expr_node.right)
+          when IR::Mux
+            walk.call(expr_node.condition)
+            walk.call(expr_node.when_true)
+            walk.call(expr_node.when_false)
+          when IR::Concat then expr_node.parts.each { |part| walk.call(part) }
+          when IR::Slice then walk.call(expr_node.base)
+          when IR::Resize then walk.call(expr_node.expr)
+          when IR::Case
+            walk.call(expr_node.selector)
+            expr_node.cases.each { |_value, branch| walk.call(branch) }
+            walk.call(expr_node.default) if expr_node.default
+          end
+        end
+
+        def collect_memory_reads_from_stmt(stmt, reads)
+          # Statement-level walk: a SeqAssign contributes its expression; an If contributes
+          # its condition, then its then-branch, then its else-branch. Other kinds are skipped.
+          return collect_memory_reads_from_expr(stmt.expr, reads) if stmt.is_a?(IR::SeqAssign)
+          return unless stmt.is_a?(IR::If)
+
+          collect_memory_reads_from_expr(stmt.condition, reads)
+          stmt.then_statements.each { |inner| collect_memory_reads_from_stmt(inner, reads) }
+          stmt.else_statements.each { |inner| collect_memory_reads_from_stmt(inner, reads) }
+        end
+
+        # Lowers expr_node to FIRRTL text for a clocked process, replacing each
+        # IR::MemoryRead with its read-port data reference (or a precomputed ROM wire).
+        # memory_reads   - every read in scope; used to decide "read" vs "read<N>" ports
+        # mem_read_index - per-memory counter, mutated here. Nodes must be lowered once
+        #                  each, in the order collect_memory_reads_from_expr visits them.
+        # output_regs    - signal names that are emitted with a "_reg" suffix
+        # rom_read_wires - { [memory_name, index] => expression } overrides for ROM reads
+        # Raises ArgumentError for unsupported operators or node types.
+        def expr_with_mem_reads(expr_node, memory_reads, mem_read_index, output_regs: Set.new, rom_read_wires: {})
+          lower = ->(node) { expr_with_mem_reads(node, memory_reads, mem_read_index, output_regs: output_regs, rom_read_wires: rom_read_wires) }
+
+          case expr_node
+          when IR::MemoryRead
+            mem_key = expr_node.memory.to_s
+            idx = mem_read_index[mem_key]
+            mem_read_index[mem_key] += 1
+            # ROM memories use pre-computed lookup wires
+            if rom_read_wires.key?([mem_key, idx])
+              rom_read_wires[[mem_key, idx]]
+            else
+              mem_name = sanitize(expr_node.memory)
+              read_count = memory_reads.count { |mr| mr.memory.to_s == mem_key }
+              read_port = read_count > 1 ? "read#{idx}" : "read"
+              "#{mem_name}.#{read_port}.data"
+            end
+          when IR::Signal
+            name = output_regs.include?(expr_node.name) ? "#{expr_node.name}_reg" : expr_node.name
+            sanitize(name)
+          when IR::Literal
+            literal(expr_node.value, expr_node.width)
+          when IR::UnaryOp
+            raise ArgumentError, "Unsupported unary op: #{expr_node.op}" unless %i[~ !].include?(expr_node.op)
+
+            "not(#{lower.call(expr_node.operand)})"
+          when IR::BinaryOp
+            left = lower.call(expr_node.left)
+            # A shift amount widened by a Resize is emitted unpadded. Lower the right
+            # operand exactly once so a memory read inside it consumes a single index.
+            amount = expr_node.right
+            amount = amount.expr if %i[<< >>].include?(expr_node.op) && amount.is_a?(IR::Resize) && amount.width > amount.expr.width
+            binary_op_str(expr_node.op, left, lower.call(amount))
+          when IR::Mux
+            "mux(#{lower.call(expr_node.condition)}, #{lower.call(expr_node.when_true)}, #{lower.call(expr_node.when_false)})"
+          when IR::Concat
+            "cat(#{expr_node.parts.map { |p| lower.call(p) }.join(', ')})"
+          when IR::Slice
+            base = lower.call(expr_node.base)
+            low, high = [expr_node.range.begin, expr_node.range.end].minmax
+            "bits(#{base}, #{high}, #{low})"
+          when IR::Resize
+            inner = lower.call(expr_node.expr)
+            target_width = expr_node.width
+            source_width = expr_node.expr.width
+            if target_width == source_width
+              inner
+            elsif target_width > source_width
+              "pad(#{inner}, #{target_width})"
+            else
+              "bits(#{inner}, #{target_width - 1}, 0)"
+            end
+          when IR::Case
+            selector = lower.call(expr_node.selector)
+            # Lower each branch once, in declaration order and before the default, so
+            # memory-read indices line up with collect_memory_reads_from_expr.
+            branches = expr_node.cases.map { |values, branch| [values, lower.call(branch)] }
+            result = expr_node.default ? lower.call(expr_node.default) : literal(0, expr_node.width)
+            branches.reverse_each do |values, branch_text|
+              values.each { |v| result = "mux(eq(#{selector}, #{literal(v, expr_node.selector.width)}), #{branch_text}, #{result})" }
+            end
+            result
+          else
+            raise ArgumentError, "Unsupported FIRRTL expression: #{expr_node.inspect}"
+          end
+        end
+
+        # Formats a FIRRTL primop call for a binary IR operator.
+        #
+        # left and right are already-lowered operand strings. For the dynamic shifts
+        # (:<<, :>>) the shift amount is right_stripped when given, otherwise right.
+        # Raises ArgumentError for an operator with no FIRRTL mapping.
+        def binary_op_str(op, left, right, right_stripped: nil)
+          primop = {
+            :& => "and", :| => "or", :^ => "xor",
+            :+ => "add", :- => "sub", :* => "mul",
+            :/ => "div", :% => "rem",
+            :<< => "dshl", :>> => "dshr",
+            :== => "eq", :!= => "neq",
+            :< => "lt", :> => "gt",
+            :<= => "leq", :>= => "geq"
+          }[op]
+          raise ArgumentError, "Unsupported binary op: #{op}" unless primop
+
+          shift = %i[<< >>].include?(op)
+          second = shift ? (right_stripped || right) : right
+          "#{primop}(#{left}, #{second})"
+        end
+
+        # Collects the targets written by sequential assignments.
+        #
+        # Only processes flagged as clocked are examined; assignments nested inside
+        # If arms are included. Returns a Set of the SeqAssign targets.
+        def collect_seq_targets(module_def)
+          clocked = module_def.processes.select(&:clocked)
+
+          clocked.each_with_object(Set.new) do |process, targets|
+            process.statements.each { |stmt| collect_targets_from_stmt(stmt, targets) }
+          end
+        end
+
+        # Adds stmt's SeqAssign target to targets, descending into both arms of an If.
+        # Statements of any other kind are ignored.
+        def collect_targets_from_stmt(stmt, targets)
+          case stmt
+          when IR::SeqAssign then targets.add(stmt.target)
+          when IR::If
+            [stmt.then_statements, stmt.else_statements].each { |arm| arm.each { |nested| collect_targets_from_stmt(nested, targets) } }
+          end
+        end
+
+        # Collect all signal names referenced in expressions throughout the module:
+        # continuous assigns (targets included), process statements, and the
+        # enable/addr/data expressions of memory write ports. Returns a Set of Strings.
+        def collect_all_signal_refs(module_def)
+          Set.new.tap do |refs|
+            module_def.assigns.each do |assign|
+              refs.add(assign.target.to_s)
+              collect_expr_signal_refs(assign.expr, refs)
+            end
+            module_def.processes.each do |process|
+              process.statements.each { |stmt| collect_stmt_signal_refs(stmt, refs) }
+            end
+            module_def.write_ports.each do |port|
+              [port.enable, port.addr, port.data].each { |port_expr| collect_expr_signal_refs(port_expr, refs) }
+            end
+          end
+        end
+
+        # Adds the name (as a String) of every IR::Signal reachable from expr_node to
+        # refs, including signals used in a MemoryRead address. Other leaves add nothing.
+        def collect_expr_signal_refs(expr_node, refs)
+          operands =
+            case expr_node
+            when IR::Signal
+              refs.add(expr_node.name.to_s)
+              []
+            when IR::UnaryOp then [expr_node.operand]
+            when IR::BinaryOp then [expr_node.left, expr_node.right]
+            when IR::Mux then [expr_node.condition, expr_node.when_true, expr_node.when_false]
+            when IR::MemoryRead then [expr_node.addr]
+            when IR::Slice then [expr_node.base]
+            when IR::Resize then [expr_node.expr]
+            when IR::Concat then expr_node.parts
+            when IR::Case
+              [expr_node.selector, *expr_node.cases.map { |_values, branch| branch }, *(expr_node.default ? [expr_node.default] : [])]
+            else []
+            end
+          operands.each { |operand| collect_expr_signal_refs(operand, refs) }
+        end
+
+        # Records signal names used by a SeqAssign (target and expression) or an If (condition and both arms).
+        def collect_stmt_signal_refs(stmt, refs)
+          case stmt
+          when IR::SeqAssign
+            refs.add(stmt.target.to_s)
+            collect_expr_signal_refs(stmt.expr, refs)
+          when IR::If
+            collect_expr_signal_refs(stmt.condition, refs)
+            [stmt.then_statements, stmt.else_statements].each { |arm| arm.each { |nested| collect_stmt_signal_refs(nested, refs) } }
+          end
+        end
+
+        # Clock of the clocked process that assigns reg_name; falls back to the first clocked process (nil if none).
+        def find_clock_for_reg(module_def, reg_name)
+          clocked = module_def.processes.select { |process| process.clocked && process.clock }
+          owner = clocked.find { |process| process.statements.each_with_object(Set.new) { |s, t| collect_targets_from_stmt(s, t) }.any? { |t| t.to_s == reg_name.to_s } }
+          (owner || clocked.first)&.clock
+        end
+
+        # True when name (compared as a String) is one of the recognised clock port names.
+        def clock_port?(name)
+          %w[clk clock clk_14m].include?(name.to_s)
+        end
+
+        # Wraps the sanitized signal name in asClock(): signals are declared as UInt<1>, not Clock.
+        def clock_expr(name)
+          signal = sanitize(name)
+          "asClock(#{signal})"
+        end
+
+        # Generate a combinational lookup expression for a ROM memory.
+        # Returns a nested mux chain with the lowest address outermost:
+        #   mux(eq(addr, 0), data[0], mux(eq(addr, 1), data[1], ... UInt<width>(0)))
+        # Zero-valued entries are omitted (the chain already defaults to 0) and entries
+        # at index >= depth are ignored.
+        def rom_lookup_expr(initial_data, width, depth, addr_expr)
+          mask = (1 << width) - 1
+          addr_bits = [1, Math.log2([depth, 2].max).ceil].max
+          # Truncate before reversing: a `break` on the first out-of-range index of a
+          # reversed walk would drop every entry whenever initial_data is longer than depth.
+          initial_data.first([depth, 0].max).each_with_index.reverse_each.reduce("UInt<#{width}>(0)") do |chain, (raw, idx)|
+            value = (raw || 0).to_i & mask
+            value.zero? ? chain : "mux(eq(#{addr_expr}, UInt<#{addr_bits}>(#{idx})), UInt<#{width}>(#{value}), #{chain})"
+          end
+        end
+
+        def type_decl(width) # FIRRTL unsigned integer type text, e.g. width 8 => "UInt<8>"
+          "UInt<#{width}>"
+        end
+
+        # Emits the FIRRTL lines (Array of Strings) for one sequential statement at the
+        # given indent: SeqAssign and MemoryWrite become `connect`, If becomes when/else
+        # (else omitted when empty), and any other statement yields no lines.
+        def statement(stmt, indent:, output_regs: Set.new, memory_reads: [], mem_read_index: Hash.new(0), rom_read_wires: {})
+          pad = " " * indent
+          lower = ->(node) { expr_with_mem_reads(node, memory_reads, mem_read_index, output_regs: output_regs, rom_read_wires: rom_read_wires) }
+          nested = ->(stmts) { stmts.flat_map { |s| statement(s, indent: indent + 2, output_regs: output_regs, memory_reads: memory_reads, mem_read_index: mem_read_index, rom_read_wires: rom_read_wires) } }
+          case stmt
+          when IR::SeqAssign
+            target = output_regs.include?(stmt.target) ? "#{stmt.target}_reg" : stmt.target
+            ["#{pad}connect #{sanitize(target)}, #{lower.call(stmt.expr)}"]
+          when IR::MemoryWrite
+            # Lower addr/data with output_regs so reads of registered outputs use "<name>_reg", as SeqAssign does.
+            ["#{pad}connect #{sanitize(stmt.memory)}[#{expr(stmt.addr, output_regs: output_regs)}], #{expr(stmt.data, output_regs: output_regs)}"]
+          when IR::If
+            lines = ["#{pad}when #{lower.call(stmt.condition)}:", *nested.call(stmt.then_statements)]
+            lines.push("#{pad}else:", *nested.call(stmt.else_statements)) unless stmt.else_statements.empty?
+            lines
+          else
+            []
+          end
+        end
+
+        # Lowers a combinational IR expression to FIRRTL text.
+        #
+        # Signals named in output_regs are emitted as "<name>_reg" (the internal
+        # register behind an output). A MemoryRead is emitted as an indexed
+        # reference, memory[addr]. Raises ArgumentError for unknown node types.
+        def expr(expr_node, output_regs: Set.new)
+          lower = ->(node) { expr(node, output_regs: output_regs) }
+
+          case expr_node
+          when IR::Signal
+            reads_output_reg = output_regs.include?(expr_node.name)
+            sanitize(reads_output_reg ? "#{expr_node.name}_reg" : expr_node.name)
+          when IR::Literal then literal(expr_node.value, expr_node.width)
+          when IR::UnaryOp then unary_expr(expr_node, output_regs: output_regs)
+          when IR::BinaryOp then binary_expr(expr_node, output_regs: output_regs)
+          when IR::Mux
+            "mux(#{lower.call(expr_node.condition)}, #{lower.call(expr_node.when_true)}, #{lower.call(expr_node.when_false)})"
+          when IR::Concat
+            "cat(#{expr_node.parts.map { |part| lower.call(part) }.join(', ')})"
+          when IR::Slice then slice_expr(expr_node, output_regs: output_regs)
+          when IR::Resize then resize_expr(expr_node, output_regs: output_regs)
+          when IR::Case then case_expr(expr_node, output_regs: output_regs)
+          when IR::MemoryRead
+            "#{sanitize(expr_node.memory)}[#{lower.call(expr_node.addr)}]"
+          else
+            raise ArgumentError, "Unsupported FIRRTL expression: #{expr_node.inspect}"
+          end
+        end
+
+        # Lowers a unary IR node to a FIRRTL `not(...)` call.
+        # Both :~ and :! map to `not`; any other operator raises ArgumentError.
+        def unary_expr(node, output_regs: Set.new)
+          unless %i[~ !].include?(node.op)
+            raise ArgumentError, "Unsupported unary op: #{node.op}"
+          end
+
+          operand = expr(node.operand, output_regs: output_regs)
+          "not(#{operand})"
+        end
+
+        # Lowers a binary IR node to a FIRRTL primop call.
+        #
+        # node        - object responding to op, left and right
+        # output_regs - forwarded to #expr when lowering the operands
+        #
+        # Both operands are lowered first. For the dynamic shifts (:<<, :>>) the shift
+        # amount then goes through strip_shift_resize, which drops a widening Resize.
+        # Returns the primop call as a String; raises ArgumentError for an operator
+        # that has no mapping.
+        def binary_expr(node, output_regs: Set.new)
+          left = expr(node.left, output_regs: output_regs)
+          right = expr(node.right, output_regs: output_regs)
+
+          primop = {
+            # bitwise
+            :& => "and",
+            :| => "or",
+            :^ => "xor",
+            # arithmetic
+            :+ => "add",
+            :- => "sub",
+            :* => "mul",
+            :/ => "div",
+            :% => "rem",
+            # dynamic shifts
+            :<< => "dshl",
+            :>> => "dshr",
+            # comparisons
+            :== => "eq",
+            :!= => "neq",
+            :< => "lt",
+            :> => "gt",
+            :<= => "leq",
+            :>= => "geq"
+          }[node.op]
+          raise ArgumentError, "Unsupported binary op: #{node.op}" unless primop
+
+          if %i[<< >>].include?(node.op)
+            right = strip_shift_resize(right, node.right, output_regs: output_regs)
+          end
+
+          "#{primop}(#{left}, #{right})"
+        end
+
+        # FIRRTL dshl/dshr size their result from the shift amount's width
+        # (result_width = left_width + 2^shift_width - 1). A shift amount that was
+        # padded (Resize) up to the left operand's width is therefore re-lowered from
+        # the narrower inner expression to stay within firtool's width limits.
+        # Anything else is returned as the already-lowered firrtl_expr.
+        def strip_shift_resize(firrtl_expr, ir_node, output_regs: Set.new)
+          widened = ir_node.is_a?(IR::Resize) && ir_node.width > ir_node.expr.width
+          return firrtl_expr unless widened
+
+          expr(ir_node.expr, output_regs: output_regs)
+        end
+
+        # Emits bits(base, high, low); the range's endpoints may be given in either order.
+        def slice_expr(node, output_regs: Set.new)
+          base = expr(node.base, output_regs: output_regs)
+          low, high = [node.range.begin, node.range.end].minmax
+
+          "bits(#{base}, #{high}, #{low})"
+        end
+
+        # Adjusts the lowered inner expression to node.width: unchanged when the
+        # widths already match, pad() when widening, and bits(.., width - 1, 0)
+        # when narrowing.
+        def resize_expr(node, output_regs: Set.new)
+          inner = expr(node.expr, output_regs: output_regs)
+          wanted = node.width
+          actual = node.expr.width
+
+          return inner if wanted == actual
+          return "pad(#{inner}, #{wanted})" if wanted > actual
+
+          "bits(#{inner}, #{wanted - 1}, 0)"
+        end
+
+        # Lowers an IR::Case to a chain of nested muxes, one eq() test per case value.
+        # Cases are folded from last to first, so the first case ends up outermost.
+        # The innermost alternative is the default branch, or a zero literal of
+        # node.width when no default is present.
+        def case_expr(node, output_regs: Set.new)
+          selector = expr(node.selector, output_regs: output_regs)
+          fallback = node.default ? expr(node.default, output_regs: output_regs) : literal(0, node.width)
+
+          node.cases.reverse_each.reduce(fallback) do |chain, (values, branch)|
+            values.reduce(chain) do |inner, value|
+              "mux(eq(#{selector}, #{literal(value, node.selector.width)}), #{expr(branch, output_regs: output_regs)}, #{inner})"
+            end
+          end
+        end
+
+        def literal(value, width) # FIRRTL unsigned literal text, e.g. (5, 8) => "UInt<8>(5)"
+          "UInt<#{width}>(#{value})"
+        end
+
+        # Makes name a legal identifier: characters outside [A-Za-z0-9_] become "_", a leading digit
+        # gets a "_" prefix, and a name whose lowercase form is in FIRRTL_KEYWORDS gets a "_fir" suffix.
+        def sanitize(name)
+          ident = name.to_s.gsub(/[^a-zA-Z0-9_]/, "_")
+          ident = "_#{ident}" if ident.match?(/\A\d/)
+          FIRRTL_KEYWORDS.include?(ident.downcase) ? "#{ident}_fir" : ident
+        end
+    end
+  end
+end
diff --git a/lib/rhdl/codegen/firrtl/gpu_lowering_delegate.rb b/lib/rhdl/codegen/firrtl/gpu_lowering_delegate.rb
new file mode 100644
index 00000000..cf3156fe
--- /dev/null
+++ b/lib/rhdl/codegen/firrtl/gpu_lowering_delegate.rb
@@ -0,0 +1,168 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'set'
+
+module RHDL
+  module Codegen
+    module FIRRTL
+      # Shared lowering flow for Arc/Synth/GEM frontends.
+      #
+      # Frontends provide source text + parser semantics. This delegate handles
+      # the shared profile transforms, validation, metadata, and code emission.
+      module GpuLoweringDelegate
+        module_function
+
+        # Runs the shared lowering pipeline and writes its artifacts.
+        #
+        # Order of work: profile source preparation, parsing, post-parse transform,
+        # supported-op check, top-module validation, GPU MLIR emission, then the
+        # optional Metal source and JSON metadata files.
+        #
+        # lowerer                - frontend module providing the summarize/validate/layout/
+        #                          emit helpers, SUPPORTED_OPS and LoweringError
+        # source_text            - raw frontend source handed to the profile
+        # parser                 - callable applied to the prepared source
+        # gpu_mlir_path          - always written with the emitted GPU MLIR
+        # metadata_path          - when set, receives the pretty-printed JSON metadata
+        # metal_source_path      - when set, receives the profile's Metal source
+        # profile                - key resolved through lowerer.profile_module_for
+        # gem_kernel_interpreter - forwarded to the profile's emit_metal_source
+        # require_arc_define     - forwarded to lowerer.validate_top_module!
+        # metadata_version       - stored as metadata[:version]
+        # lowering_label         - used in the MLIR emission and in error messages
+        #
+        # Returns a Hash summarising the lowered module (name, counts, Metal entry,
+        # state sizing). Raises lowerer::LoweringError when unsupported ops are present.
+        def lower(
+          lowerer:,
+          source_text:,
+          parser:,
+          gpu_mlir_path:,
+          metadata_path: nil,
+          metal_source_path: nil,
+          profile: :cpu8bit,
+          gem_kernel_interpreter: false,
+          require_arc_define: true,
+          metadata_version: 'ArcToGpuLoweringV2',
+          lowering_label: 'ArcToGpuLowering'
+        )
+          profile_impl = lowerer.profile_module_for(profile)
+          source = profile_impl.prepare_source(source: source_text, lowerer: lowerer)
+          parsed = profile_impl.post_parse_transform(parsed: parser.call(source), lowerer: lowerer)
+          summary = lowerer.summarize(parsed)
+
+          unsupported = summary[:ops].keys.reject { |op| lowerer::SUPPORTED_OPS.include?(op) }
+          unless unsupported.empty?
+            raise lowerer::LoweringError,
+              "#{lowering_label} does not support ops: #{unsupported.sort.join(', ')}"
+          end
+
+          lowerer.validate_top_module!(
+            parsed,
+            summary,
+            required_inputs: profile_impl.required_inputs,
+            required_outputs: profile_impl.required_outputs,
+            require_arc_define: require_arc_define
+          )
+
+          File.write(gpu_mlir_path, lowerer.emit_gpu_mlir(summary, lowering_label: lowering_label))
+
+          inferred_scalar_bits = lowerer.inferred_scalar_width_bits(parsed)
+          pack_wide_scalars = profile_impl.pack_wide_scalars?(inferred_scalar_bits: inferred_scalar_bits)
+          effective_scalar_bits = pack_wide_scalars ? lowerer::DEFAULT_SCALAR_WIDTH_BITS : inferred_scalar_bits
+
+          state_layout = lowerer.build_state_layout(parsed, pack_wide_scalars: pack_wide_scalars)
+          top_module = parsed.fetch(:top_module)
+          clock_tracking_slot_count = lowerer.count_clock_tracking_slots(top_module.fetch(:ops))
+          state_count = state_layout.sum { |entry| entry.fetch(:slot_count, 1) } + clock_tracking_slot_count
+          output_state_slots = lowerer.map_output_state_slots(parsed, state_layout)
+          metal_entry = "#{summary[:top_module]}_arcgpu_kernel"
+
+          port_layout = lambda do |ports|
+            ports.map { |port| { name: port.fetch(:name), width: port.fetch(:type).fetch(:width) } }
+          end
+          top_input_layout = port_layout.call(top_module.fetch(:inputs))
+          top_output_layout = port_layout.call(top_module.fetch(:outputs))
+
+          # Profiles may narrow the runtime-visible ports; without the hook every port is kept.
+          runtime_layout = lambda do |layout, names_hook|
+            next layout unless profile_impl.respond_to?(names_hook)
+
+            names = profile_impl.public_send(names_hook).to_set
+            layout.select { |entry| names.include?(entry.fetch(:name)) }
+          end
+          runtime_input_layout = runtime_layout.call(top_input_layout, :runtime_input_names)
+          runtime_output_layout = runtime_layout.call(top_output_layout, :runtime_output_names)
+
+          if metal_source_path
+            metal_source = profile_impl.emit_metal_source(
+              lowerer: lowerer,
+              parsed: parsed,
+              state_layout: state_layout,
+              metal_entry: metal_entry,
+              scalar_bits: effective_scalar_bits,
+              pack_wide_scalars: pack_wide_scalars,
+              gem_kernel_interpreter: gem_kernel_interpreter
+            )
+            File.write(metal_source_path, metal_source)
+          end
+
+          if metadata_path
+            metadata = {
+              version: metadata_version,
+              profile: profile.to_s,
+              module: summary[:top_module],
+              top_inputs: summary[:top_inputs],
+              top_outputs: summary[:top_outputs],
+              top_input_layout: top_input_layout,
+              top_output_layout: top_output_layout,
+              op_counts: summary[:ops],
+              arc_define_count: summary[:arc_define_count],
+              arc_state_count: summary[:arc_state_count],
+              arc_call_count: summary[:arc_call_count],
+              source_bytes: source.bytesize,
+              metal: {
+                entry: metal_entry,
+                state_count: state_count,
+                io_struct: 'RhdlArcGpuIo',
+                state_scalar_bits: effective_scalar_bits,
+                state_scalar_msl_type: (effective_scalar_bits > 32 ? 'ulong' : 'uint'),
+                packed_wide_scalars: pack_wide_scalars,
+                runtime_input_layout: runtime_input_layout,
+                runtime_output_layout: runtime_output_layout
+              },
+              state_layout: state_layout,
+              output_state_slots: output_state_slots,
+              poke_alias_state_slots: {
+                'pc_reg__q' => output_state_slots['pc_out'],
+                'acc_reg__q' => output_state_slots['acc_out'],
+                'sp_reg__q' => output_state_slots['sp_out']
+              }.compact
+            }
+            # Optional profile hooks are copied into metadata[:metal] under the hook's own name.
+            %i[schedule_mode fast_low_wdata_mode fast_high_data_addr_mode fast_low_data_addr_mode].each do |hook|
+              metadata[:metal][hook] = profile_impl.public_send(hook) if profile_impl.respond_to?(hook)
+            end
+            if %i[riscv riscv_netlist].include?(profile.to_sym)
+              metadata[:metal][:introspection] = lowerer.riscv_runtime_introspection(parsed, state_layout, output_state_slots)
+            end
+            File.write(metadata_path, JSON.pretty_generate(metadata))
+          end
+
+          {
+            module: summary[:top_module],
+            profile: profile,
+            arc_define_count: summary[:arc_define_count],
+            arc_state_count: summary[:arc_state_count],
+            arc_call_count: summary[:arc_call_count],
+            op_counts: summary[:ops],
+            metal_entry: metal_entry,
+            state_count: state_count,
+            state_scalar_bits: effective_scalar_bits
+          }
+        end
+      end
+    end
+  end
+end
diff --git a/lib/rhdl/sim/native/ir/ir_compiler/.gitignore b/lib/rhdl/sim/native/ir/ir_compiler/.gitignore
new file mode 100644
index 00000000..4081a1fe
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_compiler/.gitignore
@@ -0,0 +1,6 @@
+/target/
+/lib/*.so
+/lib/*.dylib
+/lib/*.dll
+/lib/*.bundle
+Cargo.lock
diff --git a/lib/rhdl/sim/native/ir/ir_compiler/Cargo.toml b/lib/rhdl/sim/native/ir/ir_compiler/Cargo.toml
new file mode 100644
index 00000000..c729e56a
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_compiler/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "ir_compiler"
+version = "0.1.0"
+edition = "2021"
+authors = ["RHDL Team"]
+description = "IR Compiler - generates specialized native code for circuit simulation"
+license = "MIT"
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+
+[features]
+default = []
+aot = []
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_json = { version = "1.0.100", features = ["unbounded_depth"] }
+libloading = "0.8"
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
diff --git a/lib/rhdl/sim/native/ir/ir_interpreter/.gitignore b/lib/rhdl/sim/native/ir/ir_interpreter/.gitignore
new file mode 100644
index 00000000..ca98cd96
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_interpreter/.gitignore
@@ -0,0 +1,2 @@
+/target/
+Cargo.lock
diff --git a/lib/rhdl/sim/native/ir/ir_interpreter/Cargo.toml b/lib/rhdl/sim/native/ir/ir_interpreter/Cargo.toml
new file mode 100644
index 00000000..23424a74
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_interpreter/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "ir_interpreter"
+version = "0.1.0"
+edition = "2021"
+authors = ["RHDL Team"]
+description = "Bytecode-based IR interpreter for Behavior IR - pure Rust core + Fiddle bindings"
+license = "MIT"
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_json = { version = "1.0.100", features = ["unbounded_depth"] }
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
diff --git a/lib/rhdl/sim/native/ir/ir_jit/.gitignore b/lib/rhdl/sim/native/ir/ir_jit/.gitignore
new file mode 100644
index 00000000..27605fb7
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_jit/.gitignore
@@ -0,0 +1,2 @@
+/target/
+/lib/
diff --git a/lib/rhdl/sim/native/ir/ir_jit/Cargo.lock b/lib/rhdl/sim/native/ir/ir_jit/Cargo.lock
new file mode 100644
index 00000000..7ecb21ab
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_jit/Cargo.lock
@@ -0,0 +1,518 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
+[[package]]
+name = "anyhow"
+version = "1.0.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
+
+[[package]]
+name = "arbitrary"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bumpalo"
+version = "3.19.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510"
+dependencies = [
+ "allocator-api2",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cranelift"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a71de5e59f616d79d14d2c71aa2799ce898241d7f10f7e64a4997014b4000a28"
+dependencies = [
+ "cranelift-codegen",
+ "cranelift-frontend",
+ "cranelift-module",
+]
+
+[[package]]
+name = "cranelift-bforest"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e15d04a0ce86cb36ead88ad68cf693ffd6cda47052b9e0ac114bc47fd9cd23c4"
+dependencies = [
+ "cranelift-entity",
+]
+
+[[package]]
+name = "cranelift-bitset"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c6e3969a7ce267259ce244b7867c5d3bc9e65b0a87e81039588dfdeaede9f34"
+
+[[package]]
+name = "cranelift-codegen"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c22032c4cb42558371cf516bb47f26cdad1819d3475c133e93c49f50ebf304e"
+dependencies = [
+ "bumpalo",
+ "cranelift-bforest",
+ "cranelift-bitset",
+ "cranelift-codegen-meta",
+ "cranelift-codegen-shared",
+ "cranelift-control",
+ "cranelift-entity",
+ "cranelift-isle",
+ "gimli",
+ "hashbrown 0.14.5",
+ "log",
+ "regalloc2",
+ "rustc-hash",
+ "serde",
+ "smallvec",
+ "target-lexicon 0.13.4",
+]
+
+[[package]]
+name = "cranelift-codegen-meta"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c904bc71c61b27fc57827f4a1379f29de64fe95653b620a3db77d59655eee0b8"
+dependencies = [
+ "cranelift-codegen-shared",
+]
+
+[[package]]
+name = "cranelift-codegen-shared"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40180f5497572f644ce88c255480981ae2ec1d7bb4d8e0c0136a13b87a2f2ceb"
+
+[[package]]
+name = "cranelift-control"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26d132c6d0bd8a489563472afc171759da0707804a65ece7ceb15a8c6d7dd5ef"
+dependencies = [
+ "arbitrary",
+]
+
+[[package]]
+name = "cranelift-entity"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b2d0d9618275474fbf679dd018ac6e009acbd6ae6850f6a67be33fb3b00b323"
+dependencies = [
+ "cranelift-bitset",
+]
+
+[[package]]
+name = "cranelift-frontend"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fac41e16729107393174b0c9e3730fb072866100e1e64e80a1a963b2e484d57"
+dependencies = [
+ "cranelift-codegen",
+ "log",
+ "smallvec",
+ "target-lexicon 0.13.4",
+]
+
+[[package]]
+name = "cranelift-isle"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ca20d576e5070044d0a72a9effc2deacf4d6aa650403189d8ea50126483944d"
+
+[[package]]
+name = "cranelift-jit"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e65c42755a719b09662b00c700daaf76cc35d5ace1f5c002ad404b591ff1978"
+dependencies = [
+ "anyhow",
+ "cranelift-codegen",
+ "cranelift-control",
+ "cranelift-entity",
+ "cranelift-module",
+ "cranelift-native",
+ "libc",
+ "log",
+ "region",
+ "target-lexicon 0.13.4",
+ "wasmtime-jit-icache-coherence",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "cranelift-module"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d55612bebcf16ff7306c8a6f5bdb6d45662b8aa1ee058ecce8807ad87db719b"
+dependencies = [
+ "anyhow",
+ "cranelift-codegen",
+ "cranelift-control",
+]
+
+[[package]]
+name = "cranelift-native"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8dee82f3f1f2c4cba9177f1cc5e350fe98764379bcd29340caa7b01f85076c7"
+dependencies = [
+ "cranelift-codegen",
+ "libc",
+ "target-lexicon 0.13.4",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "fallible-iterator"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
+
+[[package]]
+name = "gimli"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
+dependencies = [
+ "fallible-iterator",
+ "indexmap",
+ "stable_deref_trait",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+
+[[package]]
+name = "indexmap"
+version = "2.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.16.1",
+]
+
+[[package]]
+name = "ir_jit"
+version = "0.1.0"
+dependencies = [
+ "cranelift",
+ "cranelift-codegen",
+ "cranelift-frontend",
+ "cranelift-jit",
+ "cranelift-module",
+ "cranelift-native",
+ "serde",
+ "serde_json",
+ "target-lexicon 0.12.16",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
+
+[[package]]
+name = "libc"
+version = "0.2.180"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
+
+[[package]]
+name = "log"
+version = "0.4.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+
+[[package]]
+name = "mach2"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d640282b302c0bb0a2a8e0233ead9035e3bed871f0b7e81fe4a1ec829765db44"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "memchr"
+version = "2.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "regalloc2"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc06e6b318142614e4a48bc725abbf08ff166694835c43c9dae5a9009704639a"
+dependencies = [
+ "allocator-api2",
+ "bumpalo",
+ "hashbrown 0.15.5",
+ "log",
+ "rustc-hash",
+ "smallvec",
+]
+
+[[package]]
+name = "region"
+version = "3.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6b6ebd13bc009aef9cd476c1310d49ac354d36e240cf1bd753290f3dc7199a7"
+dependencies = [
+ "bitflags",
+ "libc",
+ "mach2",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "syn"
+version = "2.0.114"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "target-lexicon"
+version = "0.12.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
+
+[[package]]
+name = "target-lexicon"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
+
+[[package]]
+name = "wasmtime-jit-icache-coherence"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec5e8552e01692e6c2e5293171704fed8abdec79d1a6995a0870ab190e5747d1"
+dependencies = [
+ "anyhow",
+ "cfg-if",
+ "libc",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "zmij"
+version = "1.0.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65"
diff --git a/lib/rhdl/sim/native/ir/ir_jit/Cargo.toml b/lib/rhdl/sim/native/ir/ir_jit/Cargo.toml
new file mode 100644
index 00000000..33a73451
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_jit/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "ir_jit"
+version = "0.1.0"
+edition = "2021"
+authors = ["RHDL Team"]
+description = "Cranelift-based JIT compiler for IR simulation - runtime code generation for zero-dispatch evaluation"
+license = "MIT"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_json = { version = "1.0.100", features = ["unbounded_depth"] }
+cranelift = "0.116"
+cranelift-jit = "0.116"
+cranelift-module = "0.116"
+cranelift-codegen = "0.116"
+cranelift-frontend = "0.116"
+cranelift-native = "0.116"
+target-lexicon = "0.12"
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
diff --git a/lib/rhdl/sim/native/ir/ir_simulator 2.rb b/lib/rhdl/sim/native/ir/ir_simulator 2.rb
new file mode 100644
index 00000000..b3dbe633
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_simulator 2.rb	
@@ -0,0 +1,1350 @@
+# frozen_string_literal: true
+
+# IR-level bytecode interpreter with Rust backend (Fiddle-based)
+#
+# This simulator operates at the IR level, interpreting Behavior IR using
+# a stack-based bytecode interpreter. It's faster than gate-level netlist
+# simulation because it operates on whole words instead of individual bits.
+#
+# Uses Fiddle (Ruby's built-in FFI) to call the Rust library directly,
+# similar to the JIT and Verilator runners.
+
+require 'json'
+require 'fiddle'
+require 'fiddle/import'
+require 'rbconfig'
+
+module RHDL
+  module Sim
+    module Native
+      module IR
+      def self.sim_lib_name(base)
+        case RbConfig::CONFIG['host_os']
+        when /darwin/ then "#{base}.dylib"
+        when /mswin|mingw/ then "#{base}.dll"
+        else "#{base}.so"
+        end
+      end
+
+      def self.sim_backend_available?(lib_path)
+        return false unless File.exist?(lib_path)
+
+        _test_lib = Fiddle.dlopen(lib_path)
+        _test_lib['sim_create']
+        _test_lib['sim_signal']
+        _test_lib['sim_exec']
+        true
+      rescue Fiddle::DLError
+        false
+      end
+
+      IR_INTERPRETER_EXT_DIR = File.expand_path('ir_interpreter/lib', __dir__)
+      IR_INTERPRETER_LIB_NAME = sim_lib_name('ir_interpreter')
+      IR_INTERPRETER_LIB_PATH = File.join(IR_INTERPRETER_EXT_DIR, IR_INTERPRETER_LIB_NAME)
+
+      JIT_EXT_DIR = File.expand_path('ir_jit/lib', __dir__)
+      JIT_LIB_NAME = sim_lib_name('ir_jit')
+      JIT_LIB_PATH = File.join(JIT_EXT_DIR, JIT_LIB_NAME)
+
+      COMPILER_EXT_DIR = File.expand_path('ir_compiler/lib', __dir__)
+      COMPILER_LIB_NAME = sim_lib_name('ir_compiler')
+      COMPILER_LIB_PATH = File.join(COMPILER_EXT_DIR, COMPILER_LIB_NAME)
+
+      IR_INTERPRETER_AVAILABLE = sim_backend_available?(IR_INTERPRETER_LIB_PATH)
+      JIT_AVAILABLE = sim_backend_available?(JIT_LIB_PATH)
+      COMPILER_AVAILABLE = sim_backend_available?(COMPILER_LIB_PATH)
+
+      # Unified IR simulator wrapper for interpreter, JIT and compiler backends.
+      class IrSimulator
+        attr_reader :ir_json, :sub_cycles
+
+        RUNNER_KIND_NONE = 0
+        RUNNER_KIND_APPLE2 = 1
+        RUNNER_KIND_MOS6502 = 2
+        RUNNER_KIND_GAMEBOY = 3
+        RUNNER_KIND_CPU8BIT = 4
+
+        RUNNER_MEM_OP_LOAD = 0
+        RUNNER_MEM_OP_READ = 1
+        RUNNER_MEM_OP_WRITE = 2
+
+        RUNNER_MEM_SPACE_MAIN = 0
+        RUNNER_MEM_SPACE_ROM = 1
+        RUNNER_MEM_SPACE_BOOT_ROM = 2
+        RUNNER_MEM_SPACE_VRAM = 3
+        RUNNER_MEM_SPACE_ZPRAM = 4
+        RUNNER_MEM_SPACE_WRAM = 5
+        RUNNER_MEM_SPACE_FRAMEBUFFER = 6
+
+        RUNNER_MEM_FLAG_MAPPED = 1
+
+        RUNNER_RUN_MODE_BASIC = 0
+        RUNNER_RUN_MODE_FULL = 1
+
+        RUNNER_CONTROL_SET_RESET_VECTOR = 0
+        RUNNER_CONTROL_RESET_SPEAKER_TOGGLES = 1
+        RUNNER_CONTROL_RESET_LCD = 2
+
+        RUNNER_PROBE_KIND = 0
+        RUNNER_PROBE_IS_MODE = 1
+        RUNNER_PROBE_SPEAKER_TOGGLES = 2
+        RUNNER_PROBE_FRAMEBUFFER_LEN = 3
+        RUNNER_PROBE_FRAME_COUNT = 4
+        RUNNER_PROBE_V_CNT = 5
+        RUNNER_PROBE_H_CNT = 6
+        RUNNER_PROBE_VBLANK_IRQ = 7
+        RUNNER_PROBE_IF_R = 8
+        RUNNER_PROBE_SIGNAL = 9
+        RUNNER_PROBE_LCDC_ON = 10
+        RUNNER_PROBE_H_DIV_CNT = 11
+
+        SIM_CAP_SIGNAL_INDEX = 1 << 0
+        SIM_CAP_FORCED_CLOCK = 1 << 1
+        SIM_CAP_TRACE = 1 << 2
+        SIM_CAP_TRACE_STREAMING = 1 << 3
+        SIM_CAP_COMPILE = 1 << 4
+
+        SIM_SIGNAL_HAS = 0
+        SIM_SIGNAL_GET_INDEX = 1
+        SIM_SIGNAL_PEEK = 2
+        SIM_SIGNAL_POKE = 3
+        SIM_SIGNAL_PEEK_INDEX = 4
+        SIM_SIGNAL_POKE_INDEX = 5
+
+        SIM_EXEC_EVALUATE = 0
+        SIM_EXEC_TICK = 1
+        SIM_EXEC_TICK_FORCED = 2
+        SIM_EXEC_SET_PREV_CLOCK = 3
+        SIM_EXEC_GET_CLOCK_LIST_IDX = 4
+        SIM_EXEC_RESET = 5
+        SIM_EXEC_RUN_TICKS = 6
+        SIM_EXEC_SIGNAL_COUNT = 7
+        SIM_EXEC_REG_COUNT = 8
+        SIM_EXEC_COMPILE = 9
+        SIM_EXEC_IS_COMPILED = 10
+
+        SIM_TRACE_START = 0
+        SIM_TRACE_START_STREAMING = 1
+        SIM_TRACE_STOP = 2
+        SIM_TRACE_ENABLED = 3
+        SIM_TRACE_CAPTURE = 4
+        SIM_TRACE_ADD_SIGNAL = 5
+        SIM_TRACE_ADD_SIGNALS_MATCHING = 6
+        SIM_TRACE_ALL_SIGNALS = 7
+        SIM_TRACE_CLEAR_SIGNALS = 8
+        SIM_TRACE_CLEAR = 9
+        SIM_TRACE_CHANGE_COUNT = 10
+        SIM_TRACE_SIGNAL_COUNT = 11
+        SIM_TRACE_SET_TIMESCALE = 12
+        SIM_TRACE_SET_MODULE_NAME = 13
+        SIM_TRACE_SAVE_VCD = 14
+
+        SIM_BLOB_INPUT_NAMES = 0
+        SIM_BLOB_OUTPUT_NAMES = 1
+        SIM_BLOB_TRACE_TO_VCD = 2
+        SIM_BLOB_TRACE_TAKE_LIVE_VCD = 3
+        SIM_BLOB_GENERATED_CODE = 4
+
+        # Per-backend settings keyed by the backend name. Each entry records
+        # whether the shared library was found at load time (:available), where
+        # it lives (:lib_path), the symbol associated with the backend
+        # (:native_symbol) and a human-readable :label.
+        # NOTE(review): presumably consumed by select_backend/configure_backend,
+        # which are defined outside this part of the file — confirm.
+        BACKEND_CONFIGS = {
+          interpreter: {
+            available: IR_INTERPRETER_AVAILABLE,
+            lib_path: IR_INTERPRETER_LIB_PATH,
+            native_symbol: :interpret,
+            label: 'interpreter'
+          },
+          jit: {
+            available: JIT_AVAILABLE,
+            lib_path: JIT_LIB_PATH,
+            native_symbol: :jit,
+            label: 'jit'
+          },
+          compiler: {
+            available: COMPILER_AVAILABLE,
+            lib_path: COMPILER_LIB_PATH,
+            native_symbol: :compile,
+            label: 'compiler'
+          }
+        }.freeze
+
+        # @param ir_json [String] JSON representation of the IR
+        # @param backend [Symbol] :interpreter, :jit, :compiler, or :auto
+        # @param allow_fallback [Boolean] Allow fallback to another backend or Ruby implementation
+        # @param sub_cycles [Integer] Number of sub-cycles per CPU cycle (default: 14)
+        def initialize(ir_json, backend: :interpreter, allow_fallback: true, sub_cycles: 14)
+          @ir_json = ir_json
+          @sub_cycles = sub_cycles.clamp(1, 14)
+          @requested_backend = normalize_backend(backend)
+
+          selected = select_backend(@requested_backend)
+
+          if selected
+            configure_backend(selected)
+            load_library
+            create_simulator
+            compile if @backend == :compile
+          elsif allow_fallback
+            @sim = RubyIrSim.new(ir_json)
+            @backend = :ruby
+            @fallback = true
+          else
+            raise LoadError, unavailable_backend_error_message(@requested_backend)
+          end
+        end
+
+        def simulator_type
+          :"hdl_#{@backend}"
+        end
+
+        def native?
+          !@fallback && @backend != :ruby
+        end
+
+        # Returns the backend currently in use (:ruby when the pure-Ruby
+        # fallback simulator was selected during initialization).
+        def backend
+          @backend
+        end
+
+        def poke(name, value)
+          return @sim.poke(name, value) if @fallback
+          core_signal(SIM_SIGNAL_POKE, name: name, value: value)[:ok]
+        end
+
+        def peek(name)
+          return @sim.peek(name) if @fallback
+          core_signal(SIM_SIGNAL_PEEK, name: name)[:value]
+        end
+
+        def has_signal?(name)
+          return @sim.respond_to?(:has_signal?) && @sim.has_signal?(name) if @fallback
+          core_signal(SIM_SIGNAL_HAS, name: name)[:value] != 0
+        end
+
+        def evaluate
+          return @sim.evaluate if @fallback
+          core_exec(SIM_EXEC_EVALUATE)
+        end
+
+        def tick
+          return @sim.tick if @fallback
+          core_exec(SIM_EXEC_TICK)
+        end
+
+        def tick_forced
+          return @sim.tick if @fallback  # Ruby fallback doesn't need edge detection
+          core_exec(SIM_EXEC_TICK_FORCED)
+        end
+
+        def set_prev_clock(clock_list_idx, value)
+          return if @fallback  # Ruby fallback doesn't track prev clocks
+          core_exec(SIM_EXEC_SET_PREV_CLOCK, clock_list_idx, value)
+        end
+
+        def get_clock_list_idx(signal_idx)
+          return -1 if @fallback
+          result = core_exec(SIM_EXEC_GET_CLOCK_LIST_IDX, signal_idx)
+          result[:ok] ? result[:value] : -1
+        end
+
+        def reset
+          return @sim.reset if @fallback
+          @sim_runner_speaker_toggles = 0
+          core_exec(SIM_EXEC_RESET)
+        end
+
+        def signal_count
+          return @sim.signal_count if @fallback
+          core_exec(SIM_EXEC_SIGNAL_COUNT)[:value]
+        end
+
+        def reg_count
+          return @sim.reg_count if @fallback
+          core_exec(SIM_EXEC_REG_COUNT)[:value]
+        end
+
+        def compiled?
+          return false if @fallback
+          core_exec(SIM_EXEC_IS_COMPILED)[:value] != 0
+        end
+
+        def compile
+          return true if @fallback
+
+          error_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
+          error_ptr[0, Fiddle::SIZEOF_VOIDP] = [0].pack('Q')
+          result = core_exec(SIM_EXEC_COMPILE, 0, 0, error_ptr)
+          return result[:value] != 0 if result[:ok]
+
+          error_str_ptr = error_ptr[0, Fiddle::SIZEOF_VOIDP].unpack1('Q')
+          if error_str_ptr != 0
+            error_msg = Fiddle::Pointer.new(error_str_ptr).to_s
+            @fn_free_error.call(error_str_ptr)
+            raise RuntimeError, "Compilation failed: #{error_msg}"
+          end
+          false
+        end
+
+        def generated_code
+          return '' if @fallback
+          core_blob(SIM_BLOB_GENERATED_CODE)
+        end
+
+        def input_names
+          return @sim.input_names if @fallback
+          csv = core_blob(SIM_BLOB_INPUT_NAMES)
+          csv.empty? ? [] : csv.split(',')
+        end
+
+        def output_names
+          return @sim.output_names if @fallback
+          csv = core_blob(SIM_BLOB_OUTPUT_NAMES)
+          csv.empty? ? [] : csv.split(',')
+        end
+
+        # VCD tracing methods
+        def trace_start
+          return @sim.trace_start if @fallback && @sim.respond_to?(:trace_start)
+          return false if @fallback
+          core_trace(SIM_TRACE_START)[:ok]
+        end
+
+        def trace_start_streaming(path)
+          return @sim.trace_start_streaming(path) if @fallback && @sim.respond_to?(:trace_start_streaming)
+          return false if @fallback
+          core_trace(SIM_TRACE_START_STREAMING, path)[:ok]
+        end
+
+        def trace_stop
+          return @sim.trace_stop if @fallback && @sim.respond_to?(:trace_stop)
+          return nil if @fallback
+          core_trace(SIM_TRACE_STOP)
+        end
+
+        def trace_enabled?
+          return @sim.trace_enabled? if @fallback && @sim.respond_to?(:trace_enabled?)
+          return false if @fallback
+          core_trace(SIM_TRACE_ENABLED)[:value] != 0
+        end
+
+        def trace_capture
+          return @sim.trace_capture if @fallback && @sim.respond_to?(:trace_capture)
+          return nil if @fallback
+          core_trace(SIM_TRACE_CAPTURE)
+        end
+
+        def trace_add_signal(name)
+          return @sim.trace_add_signal(name) if @fallback && @sim.respond_to?(:trace_add_signal)
+          return false if @fallback
+          core_trace(SIM_TRACE_ADD_SIGNAL, name)[:ok]
+        end
+
+        def trace_add_signals_matching(pattern)
+          return @sim.trace_add_signals_matching(pattern) if @fallback && @sim.respond_to?(:trace_add_signals_matching)
+          return 0 if @fallback
+          core_trace(SIM_TRACE_ADD_SIGNALS_MATCHING, pattern)[:value]
+        end
+
+        def trace_all_signals
+          return @sim.trace_all_signals if @fallback && @sim.respond_to?(:trace_all_signals)
+          return nil if @fallback
+          core_trace(SIM_TRACE_ALL_SIGNALS)
+        end
+
+        def trace_clear_signals
+          return @sim.trace_clear_signals if @fallback && @sim.respond_to?(:trace_clear_signals)
+          return nil if @fallback
+          core_trace(SIM_TRACE_CLEAR_SIGNALS)
+        end
+
+        def trace_to_vcd
+          return @sim.trace_to_vcd if @fallback && @sim.respond_to?(:trace_to_vcd)
+          return '' if @fallback
+          core_blob(SIM_BLOB_TRACE_TO_VCD)
+        end
+
+        def trace_take_live_vcd
+          return @sim.trace_take_live_vcd if @fallback && @sim.respond_to?(:trace_take_live_vcd)
+          return '' if @fallback
+          core_blob(SIM_BLOB_TRACE_TAKE_LIVE_VCD)
+        end
+
+        def trace_save_vcd(path)
+          return @sim.trace_save_vcd(path) if @fallback && @sim.respond_to?(:trace_save_vcd)
+          return false if @fallback
+          core_trace(SIM_TRACE_SAVE_VCD, path)[:ok]
+        end
+
+        def trace_clear
+          return @sim.trace_clear if @fallback && @sim.respond_to?(:trace_clear)
+          return nil if @fallback
+          core_trace(SIM_TRACE_CLEAR)
+        end
+
+        def trace_change_count
+          return @sim.trace_change_count if @fallback && @sim.respond_to?(:trace_change_count)
+          return 0 if @fallback
+          core_trace(SIM_TRACE_CHANGE_COUNT)[:value]
+        end
+
+        def trace_signal_count
+          return @sim.trace_signal_count if @fallback && @sim.respond_to?(:trace_signal_count)
+          return 0 if @fallback
+          core_trace(SIM_TRACE_SIGNAL_COUNT)[:value]
+        end
+
+        def trace_set_timescale(timescale)
+          return @sim.trace_set_timescale(timescale) if @fallback && @sim.respond_to?(:trace_set_timescale)
+          return false if @fallback
+          core_trace(SIM_TRACE_SET_TIMESCALE, timescale)[:ok]
+        end
+
+        def trace_set_module_name(name)
+          return @sim.trace_set_module_name(name) if @fallback && @sim.respond_to?(:trace_set_module_name)
+          return false if @fallback
+          core_trace(SIM_TRACE_SET_MODULE_NAME, name)[:ok]
+        end
+
+        def stats
+          return @sim.stats if @fallback
+          runner_kind = runner_kind
+          {
+            signals: signal_count,
+            regs: reg_count,
+            runner_kind: runner_kind,
+            runner_mode: runner_mode?,
+            apple2_mode: runner_kind == :apple2,
+            gameboy_mode: gameboy_mode?,
+            mos6502_mode: runner_kind == :mos6502,
+            cpu8bit_mode: runner_kind == :cpu8bit
+          }
+        end
+
+        # Batched tick execution
+        def run_ticks(n)
+          return @sim.respond_to?(:run_ticks) ? @sim.run_ticks(n) : n.times { @sim.tick } if @fallback
+          core_exec(SIM_EXEC_RUN_TICKS, n)
+        end
+
+        # Get signal index by name (for caching)
+        def get_signal_idx(name)
+          return @sim.respond_to?(:get_signal_idx) ? @sim.get_signal_idx(name) : nil if @fallback
+          result = core_signal(SIM_SIGNAL_GET_INDEX, name: name)
+          result[:ok] ? result[:value] : nil
+        end
+
+        # Poke by index - faster than by name when index is cached
+        def poke_by_idx(idx, value)
+          return @sim.poke_by_idx(idx, value) if @fallback && @sim.respond_to?(:poke_by_idx)
+          core_signal(SIM_SIGNAL_POKE_INDEX, idx: idx, value: value)
+        end
+
+        # Peek by index - faster than by name when index is cached
+        def peek_by_idx(idx)
+          return @sim.peek_by_idx(idx) if @fallback && @sim.respond_to?(:peek_by_idx)
+          core_signal(SIM_SIGNAL_PEEK_INDEX, idx: idx)[:value]
+        end
+
+        # ====================================================================
+        # Unified Runner Extension Methods
+        # ====================================================================
+
+        def runner_kind
+          if @fallback
+            return @sim.runner_kind if @sim.respond_to?(:runner_kind)
+            return nil
+          end
+
+          case runner_probe(RUNNER_PROBE_KIND)
+          when RUNNER_KIND_APPLE2 then :apple2
+          when RUNNER_KIND_MOS6502 then :mos6502
+          when RUNNER_KIND_GAMEBOY then :gameboy
+          when RUNNER_KIND_CPU8BIT then :cpu8bit
+          else nil
+          end
+        end
+
+        def runner_mode?
+          if @fallback
+            return @sim.runner_mode? if @sim.respond_to?(:runner_mode?)
+            return !runner_kind.nil?
+          end
+          runner_probe(RUNNER_PROBE_IS_MODE) != 0
+        end
+
+        def runner_load_memory(data, offset = 0, is_rom = false)
+          if @fallback
+            return @sim.runner_load_memory(data, offset, is_rom) if @sim.respond_to?(:runner_load_memory)
+            return false
+          end
+          data = data.pack('C*') if data.is_a?(Array)
+          return false if data.nil? || data.bytesize.zero?
+
+          space = is_rom ? RUNNER_MEM_SPACE_ROM : RUNNER_MEM_SPACE_MAIN
+          runner_mem(RUNNER_MEM_OP_LOAD, space, offset, data, 0) > 0
+        end
+
+        def runner_read_memory(offset, length, mapped: true)
+          length = [length.to_i, 0].max
+          if @fallback
+            return @sim.runner_read_memory(offset, length, mapped: mapped) if @sim.respond_to?(:runner_read_memory)
+            return Array.new(length, 0)
+          end
+          return [] if length.zero?
+
+          flags = mapped ? RUNNER_MEM_FLAG_MAPPED : 0
+          runner_mem_read(RUNNER_MEM_SPACE_MAIN, offset, length, flags)
+        end
+
+        def runner_write_memory(offset, data, mapped: true)
+          if @fallback
+            return @sim.runner_write_memory(offset, data, mapped: mapped) if @sim.respond_to?(:runner_write_memory)
+            return 0
+          end
+          data = data.pack('C*') if data.is_a?(Array)
+          return 0 if data.nil? || data.bytesize.zero?
+
+          flags = mapped ? RUNNER_MEM_FLAG_MAPPED : 0
+          runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_MAIN, offset, data, flags)
+        end
+
+        def runner_run_cycles(n, key_data = 0, key_ready = false)
+          if @fallback
+            return @sim.runner_run_cycles(n, key_data, key_ready) if @sim.respond_to?(:runner_run_cycles)
+            return { text_dirty: false, key_cleared: false, cycles_run: 0, speaker_toggles: 0 }
+          end
+
+          result_buf = Fiddle::Pointer.malloc(20)
+          ok = @fn_runner_run.call(
+            @ctx,
+            n,
+            key_data,
+            key_ready ? 1 : 0,
+            RUNNER_RUN_MODE_BASIC,
+            result_buf
+          )
+          return nil if ok == 0
+
+          values = result_buf[0, 20].unpack('llLLL')
+          result = {
+            text_dirty: values[0] != 0,
+            key_cleared: values[1] != 0,
+            cycles_run: values[2],
+            speaker_toggles: values[3]
+          }
+          @sim_runner_speaker_toggles = ((@sim_runner_speaker_toggles || 0) + result[:speaker_toggles]) & 0xFFFFFFFF
+          result
+        end
+
+        def runner_load_rom(data, offset = 0)
+          if @fallback
+            return @sim.runner_load_rom(data, offset) if @sim.respond_to?(:runner_load_rom)
+          end
+
+          data = data.pack('C*') if data.is_a?(Array)
+          return false if data.nil? || data.bytesize.zero?
+          runner_mem(RUNNER_MEM_OP_LOAD, RUNNER_MEM_SPACE_ROM, offset, data, 0) > 0
+        end
+
+        def runner_set_reset_vector(addr)
+          vector = addr.to_i & 0xFFFF_FFFF
+          if @fallback
+            return @sim.runner_set_reset_vector(vector) if @sim.respond_to?(:runner_set_reset_vector)
+          end
+
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_SET_RESET_VECTOR, vector, 0) != 0
+        end
+
+        def runner_speaker_toggles
+          if @fallback
+            return @sim.runner_speaker_toggles if @sim.respond_to?(:runner_speaker_toggles)
+            return 0
+          end
+          return runner_probe(RUNNER_PROBE_SPEAKER_TOGGLES) if runner_kind == :mos6502
+          @sim_runner_speaker_toggles || 0
+        end
+
+        def runner_reset_speaker_toggles
+          if @fallback
+            return @sim.runner_reset_speaker_toggles if @sim.respond_to?(:runner_reset_speaker_toggles)
+            return nil
+          end
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_RESET_SPEAKER_TOGGLES, 0, 0)
+          @sim_runner_speaker_toggles = 0
+          nil
+        end
+
+        # ====================================================================
+        # Game Boy Extension Methods
+        # ====================================================================
+
+        def gameboy_mode?
+          return @sim.gameboy_mode? if @fallback && @sim.respond_to?(:gameboy_mode?)
+          return false if @fallback
+          runner_kind == :gameboy
+        end
+
+        def load_rom(data)
+          return @sim.load_rom(data) if @fallback && @sim.respond_to?(:load_rom)
+          return if @fallback
+          runner_load_rom(data, 0)
+        end
+
+        def load_boot_rom(data)
+          return @sim.load_boot_rom(data) if @fallback && @sim.respond_to?(:load_boot_rom)
+          return if @fallback
+          data = data.pack('C*') if data.is_a?(Array)
+          runner_mem(RUNNER_MEM_OP_LOAD, RUNNER_MEM_SPACE_BOOT_ROM, 0, data, 0)
+        end
+
+          # Runs the Game Boy runner for n cycles in full run mode.
+          #
+          # The native call fills a 20-byte result record that is decoded as five
+          # 32-bit fields (two signed, then three unsigned); field 2 is reported
+          # as :cycles_run and field 4 as :frames_completed. A zero status from
+          # the native call, or a fallback simulator without run_gb_cycles, yields
+          # zeroed counters.
+          #
+          # @param n [Integer] number of cycles to run
+          # @return [Hash] { cycles_run:, frames_completed: } on the native path
+          def run_gb_cycles(n)
+            if @fallback
+              return @sim.run_gb_cycles(n) if @sim.respond_to?(:run_gb_cycles)
+              return { cycles_run: 0, frames_completed: 0 }
+            end
+
+            # Without a free function Fiddle never releases malloc'd memory, so
+            # RUBY_FREE is attached to have the buffer reclaimed with the pointer.
+            result_buf = Fiddle::Pointer.malloc(20, Fiddle::RUBY_FREE)
+            ok = @fn_runner_run.call(@ctx, n, 0, 0, RUNNER_RUN_MODE_FULL, result_buf)
+            return { cycles_run: 0, frames_completed: 0 } if ok == 0
+
+            values = result_buf[0, 20].unpack('llLLL')
+            { cycles_run: values[2], frames_completed: values[4] }
+          end
+
+          # Reads one byte from the VRAM memory space.
+          # Returns 0 when the native read yields no data, or when the fallback
+          # simulator does not implement read_vram.
+          #
+          # @param addr [Integer] offset inside VRAM
+          # @return [Integer] byte value (0..255) on the native path
+          def read_vram(addr)
+            if @fallback
+              return @sim.respond_to?(:read_vram) ? @sim.read_vram(addr) : 0
+            end
+
+            byte = runner_mem_read(RUNNER_MEM_SPACE_VRAM, addr, 1, 0).first
+            byte ? byte & 0xFF : 0
+          end
+
+          # Writes one byte into the VRAM memory space.
+          # In fallback mode the call is forwarded to the Ruby simulator when it
+          # implements write_vram; otherwise it returns nil without writing.
+          #
+          # @param addr [Integer] offset inside VRAM
+          # @param data [Integer] value packed as a single unsigned byte
+          def write_vram(addr, data)
+            unless @fallback
+              return runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_VRAM, addr, [data].pack('C'), 0)
+            end
+
+            @sim.write_vram(addr, data) if @sim.respond_to?(:write_vram)
+          end
+
+          # Reads one byte from the ZPRAM memory space.
+          # Returns 0 when the native read yields no data, or when the fallback
+          # simulator does not implement read_zpram.
+          #
+          # @param addr [Integer] offset inside ZPRAM
+          # @return [Integer] byte value (0..255) on the native path
+          def read_zpram(addr)
+            if @fallback
+              return @sim.respond_to?(:read_zpram) ? @sim.read_zpram(addr) : 0
+            end
+
+            byte = runner_mem_read(RUNNER_MEM_SPACE_ZPRAM, addr, 1, 0).first
+            byte ? byte & 0xFF : 0
+          end
+
+          # Writes one byte into the ZPRAM memory space.
+          # In fallback mode the call is forwarded to the Ruby simulator when it
+          # implements write_zpram; otherwise it returns nil without writing.
+          #
+          # @param addr [Integer] offset inside ZPRAM
+          # @param data [Integer] value packed as a single unsigned byte
+          def write_zpram(addr, data)
+            unless @fallback
+              return runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_ZPRAM, addr, [data].pack('C'), 0)
+            end
+
+            @sim.write_zpram(addr, data) if @sim.respond_to?(:write_zpram)
+          end
+
+          # Reads one byte from the WRAM memory space.
+          # Returns 0 when the native read yields no data, or when the fallback
+          # simulator does not implement read_wram.
+          #
+          # @param addr [Integer] offset inside WRAM
+          # @return [Integer] byte value (0..255) on the native path
+          def read_wram(addr)
+            if @fallback
+              return @sim.respond_to?(:read_wram) ? @sim.read_wram(addr) : 0
+            end
+
+            byte = runner_mem_read(RUNNER_MEM_SPACE_WRAM, addr, 1, 0).first
+            byte ? byte & 0xFF : 0
+          end
+
+          # Writes one byte into the WRAM memory space.
+          # In fallback mode the call is forwarded to the Ruby simulator when it
+          # implements write_wram; otherwise it returns nil without writing.
+          #
+          # @param addr [Integer] offset inside WRAM
+          # @param data [Integer] value packed as a single unsigned byte
+          def write_wram(addr, data)
+            unless @fallback
+              return runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_WRAM, addr, [data].pack('C'), 0)
+            end
+
+            @sim.write_wram(addr, data) if @sim.respond_to?(:write_wram)
+          end
+
+          # Reads the whole framebuffer memory space as an array of bytes.
+          # The length comes from the RUNNER_PROBE_FRAMEBUFFER_LEN probe; a
+          # non-positive length yields an empty array, as does a fallback
+          # simulator without read_framebuffer.
+          #
+          # @return [Array<Integer>] on the native path
+          def read_framebuffer
+            if @fallback
+              return @sim.respond_to?(:read_framebuffer) ? @sim.read_framebuffer : []
+            end
+
+            byte_count = runner_probe(RUNNER_PROBE_FRAMEBUFFER_LEN)
+            byte_count.positive? ? runner_mem_read(RUNNER_MEM_SPACE_FRAMEBUFFER, 0, byte_count, 0) : []
+          end
+
+          # Returns the RUNNER_PROBE_FRAME_COUNT probe value.
+          # A fallback simulator without frame_count yields 0.
+          def frame_count
+            return runner_probe(RUNNER_PROBE_FRAME_COUNT) unless @fallback
+
+            @sim.respond_to?(:frame_count) ? @sim.frame_count : 0
+          end
+
+          # Sends RUNNER_CONTROL_RESET_LCD through the runner control entry point
+          # and returns its status. A fallback simulator without reset_lcd_state
+          # makes this a no-op returning nil.
+          def reset_lcd_state
+            return @fn_runner_control.call(@ctx, RUNNER_CONTROL_RESET_LCD, 0, 0) unless @fallback
+
+            @sim.reset_lcd_state if @sim.respond_to?(:reset_lcd_state)
+          end
+
+          # Returns the RUNNER_PROBE_V_CNT probe value.
+          # A fallback simulator without get_v_cnt yields 0.
+          def get_v_cnt
+            return runner_probe(RUNNER_PROBE_V_CNT) unless @fallback
+
+            @sim.respond_to?(:get_v_cnt) ? @sim.get_v_cnt : 0
+          end
+
+          # Returns the RUNNER_PROBE_H_CNT probe value.
+          # A fallback simulator without get_h_cnt yields 0.
+          def get_h_cnt
+            return runner_probe(RUNNER_PROBE_H_CNT) unless @fallback
+
+            @sim.respond_to?(:get_h_cnt) ? @sim.get_h_cnt : 0
+          end
+
+          # Returns the RUNNER_PROBE_VBLANK_IRQ probe value.
+          # A fallback simulator without get_vblank_irq yields 0.
+          def get_vblank_irq
+            return runner_probe(RUNNER_PROBE_VBLANK_IRQ) unless @fallback
+
+            @sim.respond_to?(:get_vblank_irq) ? @sim.get_vblank_irq : 0
+          end
+
+          # Returns the RUNNER_PROBE_IF_R probe value.
+          # A fallback simulator without get_if_r yields 0.
+          def get_if_r
+            return runner_probe(RUNNER_PROBE_IF_R) unless @fallback
+
+            @sim.respond_to?(:get_if_r) ? @sim.get_if_r : 0
+          end
+
+          # Returns the RUNNER_PROBE_SIGNAL probe value for the given index.
+          # A fallback simulator without get_signal yields 0.
+          #
+          # @param idx [Integer] probe argument forwarded to the runner
+          def get_signal(idx)
+            return runner_probe(RUNNER_PROBE_SIGNAL, idx) unless @fallback
+
+            @sim.respond_to?(:get_signal) ? @sim.get_signal(idx) : 0
+          end
+
+          # Returns the RUNNER_PROBE_LCDC_ON probe value.
+          # A fallback simulator without get_lcdc_on yields 0.
+          def get_lcdc_on
+            return runner_probe(RUNNER_PROBE_LCDC_ON) unless @fallback
+
+            @sim.respond_to?(:get_lcdc_on) ? @sim.get_lcdc_on : 0
+          end
+
+          # Returns the RUNNER_PROBE_H_DIV_CNT probe value.
+          # A fallback simulator without get_h_div_cnt yields 0.
+          def get_h_div_cnt
+            return runner_probe(RUNNER_PROBE_H_DIV_CNT) unless @fallback
+
+            @sim.respond_to?(:get_h_div_cnt) ? @sim.get_h_div_cnt : 0
+          end
+
+          # Calls the native sim_signal entry point and decodes its out-parameter.
+          #
+          # @param op [Integer] signal operation code
+          # @param name [String, nil] signal name, when the operation is name-based
+          # @param idx [Integer] signal index, when the operation is index-based
+          # @param value [Integer] value argument forwarded to the native call
+          # @return [Hash] { ok: Boolean, value: Integer } where ok is true for a
+          #   non-zero native status and value is the unsigned long written back
+          def core_signal(op, name: nil, idx: 0, value: 0)
+            # This runs once per peek/poke. Fiddle never releases malloc'd memory
+            # that has no free function, so RUBY_FREE is attached to avoid leaking
+            # one slot per call.
+            out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_LONG, Fiddle::RUBY_FREE)
+            # 'L!' is the native unsigned long, matching the SIZEOF_LONG-wide slot.
+            out[0, Fiddle::SIZEOF_LONG] = [0].pack('L!')
+            rc = @fn_sim_signal.call(@ctx, op, name, idx, value, out)
+            { ok: rc != 0, value: out[0, Fiddle::SIZEOF_LONG].unpack1('L!') }
+          end
+
+          # Calls the native sim_exec entry point and decodes its out-parameter.
+          #
+          # @param op [Integer] execution operation code
+          # @param arg0 [Integer] first operation argument
+          # @param arg1 [Integer] second operation argument
+          # @param error_out [Object, nil] passed through as the native error out-pointer
+          # @return [Hash] { ok: Boolean, value: Integer } where ok is true for a
+          #   non-zero native status and value is the unsigned long written back
+          def core_exec(op, arg0 = 0, arg1 = 0, error_out = nil)
+            # This runs once per evaluate/tick. Fiddle never releases malloc'd
+            # memory that has no free function, so RUBY_FREE is attached to avoid
+            # leaking one slot per call.
+            out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_LONG, Fiddle::RUBY_FREE)
+            # 'L!' is the native unsigned long, matching the SIZEOF_LONG-wide slot.
+            out[0, Fiddle::SIZEOF_LONG] = [0].pack('L!')
+            rc = @fn_sim_exec.call(@ctx, op, arg0, arg1, out, error_out)
+            { ok: rc != 0, value: out[0, Fiddle::SIZEOF_LONG].unpack1('L!') }
+          end
+
+          # Calls the native sim_trace entry point and decodes its out-parameter.
+          #
+          # @param op [Integer] trace operation code
+          # @param str_arg [String, nil] optional string argument for the operation
+          # @return [Hash] { ok: Boolean, value: Integer } where ok is true for a
+          #   non-zero native status and value is the unsigned long written back
+          def core_trace(op, str_arg = nil)
+            # Fiddle never releases malloc'd memory that has no free function, so
+            # RUBY_FREE is attached to avoid leaking one slot per call.
+            out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_LONG, Fiddle::RUBY_FREE)
+            # 'L!' is the native unsigned long, matching the SIZEOF_LONG-wide slot.
+            out[0, Fiddle::SIZEOF_LONG] = [0].pack('L!')
+            rc = @fn_sim_trace.call(@ctx, op, str_arg, out)
+            { ok: rc != 0, value: out[0, Fiddle::SIZEOF_LONG].unpack1('L!') }
+          end
+
+          # Fetches a variable-length blob from the native sim_blob entry point.
+          #
+          # The entry point is called twice: first with no buffer to learn the
+          # required size, then with a buffer of that size to receive the data.
+          #
+          # @param op [Integer] blob operation code
+          # @return [String] the bytes written by the native side, or '' when
+          #   either call reports a nil or non-positive length
+          def core_blob(op)
+            capacity = @fn_sim_blob.call(@ctx, op, nil, 0).to_i
+            return '' unless capacity.positive?
+
+            # RUBY_FREE lets the buffer be reclaimed with the pointer object;
+            # without a free function Fiddle never releases it.
+            buf = Fiddle::Pointer.malloc(capacity, Fiddle::RUBY_FREE)
+            written = @fn_sim_blob.call(@ctx, op, buf, capacity).to_i
+            return '' unless written.positive?
+
+            # Pointer#[] does no bounds checking, so never read past the allocation
+            # even if the second call reports more bytes than were requested.
+            buf[0, [written, capacity].min]
+          end
+
+          # Forwards a memory operation to the native runner_mem entry point,
+          # supplying the payload's byte length alongside the payload itself.
+          #
+          # @param op [Integer] memory operation code
+          # @param space [Integer] memory space code
+          # @param offset [Integer] offset inside the memory space
+          # @param data [String] payload; must respond to bytesize
+          # @param flags [Integer] flag bits forwarded unchanged
+          def runner_mem(op, space, offset, data, flags)
+            byte_len = data.bytesize
+            @fn_runner_mem.call(@ctx, op, space, offset, data, byte_len, flags)
+          end
+
+          # Reads up to `length` bytes from a runner memory space.
+          #
+          # @param space [Integer] memory space code
+          # @param offset [Integer] offset inside the memory space
+          # @param length [Integer] maximum byte count; negative values count as 0
+          # @param flags [Integer] flag bits forwarded unchanged
+          # @return [Array<Integer>] the bytes the native side reports as read
+          def runner_mem_read(space, offset, length, flags)
+            length = [length.to_i, 0].max
+            return [] if length.zero?
+
+            # RUBY_FREE lets the buffer be reclaimed with the pointer object;
+            # without a free function Fiddle never releases it, and this method
+            # runs for every single-byte memory read.
+            buf = Fiddle::Pointer.malloc(length, Fiddle::RUBY_FREE)
+            read_len = @fn_runner_mem.call(@ctx, RUNNER_MEM_OP_READ, space, offset, buf, length, flags)
+            # Pointer#[] does no bounds checking, so cap the reported count at the
+            # number of bytes actually allocated.
+            buf[0, read_len.clamp(0, length)].unpack('C*')
+          end
+
+          # Queries the native runner_probe entry point.
+          #
+          # @param op [Integer] probe code
+          # @param arg0 [Integer] probe argument (defaults to 0)
+          # @return [Integer] value reported by the native side
+          def runner_probe(op, arg0 = 0)
+            probe_fn = @fn_runner_probe
+            probe_fn.call(@ctx, op, arg0)
+          end
+
+          # Advertises methods that method_missing can forward: in fallback mode,
+          # anything the Ruby simulator responds to. Everything else defers to super.
+          def respond_to_missing?(method_name, include_private = false)
+            handled_by_fallback = @fallback && @sim.respond_to?(method_name)
+            return handled_by_fallback if handled_by_fallback
+
+            super
+          end
+
+          # Forwards unknown calls to the Ruby fallback simulator when it is active
+          # and responds to the method; every other call goes to super.
+          def method_missing(method_name, *args, &block)
+            return super unless @fallback && @sim.respond_to?(method_name)
+
+            @sim.send(method_name, *args, &block)
+          end
+
+        private
+
+          # Canonicalises a backend name: :interpret and :compile are accepted as
+          # aliases of :interpreter and :compiler.
+          #
+          # @param backend [#to_sym] requested backend name
+          # @return [Symbol] a BACKEND_CONFIGS key or :auto
+          # @raise [ArgumentError] for any other name
+          def normalize_backend(backend)
+            aliases = { interpret: :interpreter, compile: :compiler }
+            requested = backend.to_sym
+            canonical = aliases.fetch(requested, requested)
+            unless canonical == :auto || BACKEND_CONFIGS.key?(canonical)
+              raise ArgumentError, "Unknown IR backend: #{backend.inspect}"
+            end
+
+            canonical
+          end
+
+          # Lists the backends to try, in order of preference, for a requested
+          # backend. Unknown requests yield an empty list.
+          #
+          # @param requested [Symbol] normalised backend name
+          # @return [Array<Symbol>]
+          def backend_candidates(requested)
+            preference_chains = {
+              interpreter: %i[interpreter],
+              jit: %i[jit interpreter],
+              compiler: %i[compiler interpreter],
+              auto: %i[compiler jit interpreter]
+            }
+            preference_chains.fetch(requested, [])
+          end
+
+          # Picks the first candidate backend whose BACKEND_CONFIGS entry is
+          # marked available.
+          #
+          # @param requested [Symbol] normalised backend name
+          # @return [Symbol, nil] the chosen backend, or nil when none is available
+          def select_backend(requested)
+            backend_candidates(requested).each do |candidate|
+              return candidate if BACKEND_CONFIGS[candidate][:available]
+            end
+            nil
+          end
+
+          # Copies the chosen backend's library path, native symbol and label from
+          # BACKEND_CONFIGS into @lib_path, @backend and @backend_label.
+          #
+          # @param name [Symbol] a BACKEND_CONFIGS key
+          def configure_backend(name)
+            settings = BACKEND_CONFIGS[name]
+            @lib_path, @backend, @backend_label = settings.values_at(:lib_path, :native_symbol, :label)
+            @backend_label
+          end
+
+          # Builds the LoadError text shown when the requested backend library is
+          # missing, naming the expected library path and the rake task that
+          # builds it. Unrecognised requests get a generic message.
+          #
+          # @param requested [Symbol] normalised backend name
+          # @return [String]
+          def unavailable_backend_error_message(requested)
+            messages = {
+              interpreter: "IR interpreter extension not found at: #{IR_INTERPRETER_LIB_PATH}\nRun 'rake native:build' to build it.",
+              jit: "IR JIT extension not found at: #{JIT_LIB_PATH}\nRun 'rake native:build' to build it.",
+              compiler: "IR compiler extension not found at: #{COMPILER_LIB_PATH}\nRun 'rake native:build' to build it.",
+              auto: "No IR backend extension found (searched compiler, jit, interpreter).\nRun 'rake native:build' to build them."
+            }
+            messages.fetch(requested, "IR backend not available.")
+          end
+
+          # Opens the shared library at @lib_path and binds each C entry point used
+          # by this wrapper to a Fiddle::Function held in an @fn_* instance variable.
+          # Bindings are created in a fixed order, so a missing symbol aborts at the
+          # first one that cannot be resolved.
+          #
+          # @raise [Fiddle::DLError] when the library or one of its symbols is missing
+          def load_library
+            @lib = Fiddle.dlopen(@lib_path)
+            bind = ->(symbol, arg_types, return_type) { Fiddle::Function.new(@lib[symbol], arg_types, return_type) }
+            voidp = Fiddle::TYPE_VOIDP
+            int = Fiddle::TYPE_INT
+            size_t = Fiddle::TYPE_SIZE_T
+
+            # Core functions: lifecycle and capability query
+            @fn_create = bind.call('sim_create', [voidp, size_t, int, voidp], voidp)
+            @fn_destroy = bind.call('sim_destroy', [voidp], Fiddle::TYPE_VOID)
+            @fn_free_error = bind.call('sim_free_error', [voidp], Fiddle::TYPE_VOID)
+            @fn_sim_get_caps = bind.call('sim_get_caps', [voidp, voidp], int)
+
+            # Core functions: operation-code dispatchers taking unsigned arguments
+            uint = Fiddle::TYPE_UINT
+            ulong = Fiddle::TYPE_ULONG
+            @fn_sim_signal = bind.call('sim_signal', [voidp, uint, voidp, uint, ulong, voidp], int)
+            @fn_sim_exec = bind.call('sim_exec', [voidp, uint, ulong, ulong, voidp, voidp], int)
+            @fn_sim_trace = bind.call('sim_trace', [voidp, uint, voidp, voidp], int)
+            @fn_sim_blob = bind.call('sim_blob', [voidp, uint, voidp, size_t], size_t)
+
+            # Unified runner functions
+            @fn_runner_get_caps = bind.call('runner_get_caps', [voidp, voidp], int)
+            @fn_runner_mem = bind.call('runner_mem', [voidp, uint, uint, size_t, voidp, size_t, uint], size_t)
+            @fn_runner_run = bind.call('runner_run', [voidp, uint, Fiddle::TYPE_CHAR, int, uint, voidp], int)
+            @fn_runner_control = bind.call('runner_control', [voidp, uint, uint, uint], int)
+            @fn_runner_probe = bind.call('runner_probe', [voidp, uint, uint], Fiddle::TYPE_LONG_LONG)
+          end
+
+          # Creates the native simulator context from @ir_json and @sub_cycles.
+          #
+          # The native constructor reports failure by returning a null context and
+          # may store the address of an error string in the out-slot passed as its
+          # last argument. That string is copied and then released through
+          # sim_free_error. On success the speaker-toggle counter is cleared and
+          # the destroy function is remembered in @destructor.
+          #
+          # @raise [RuntimeError] when the native constructor returns a null context
+          def create_simulator
+            # RUBY_FREE lets the slot be reclaimed with the pointer object; without
+            # a free function Fiddle never releases it.
+            error_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
+            # 'J' is a pointer-width unsigned integer, so the slot is encoded and
+            # decoded correctly whatever SIZEOF_VOIDP is ('Q' is always 8 bytes).
+            error_ptr[0, Fiddle::SIZEOF_VOIDP] = [0].pack('J')
+
+            @ctx = @fn_create.call(@ir_json, @ir_json.bytesize, @sub_cycles, error_ptr)
+
+            if @ctx.null?
+              error_str_ptr = error_ptr[0, Fiddle::SIZEOF_VOIDP].unpack1('J')
+              if error_str_ptr != 0
+                error_msg = Fiddle::Pointer.new(error_str_ptr).to_s
+                @fn_free_error.call(error_str_ptr)
+                raise RuntimeError, "Failed to create #{@backend_label} simulator: #{error_msg}"
+              end
+              raise RuntimeError, "Failed to create #{@backend_label} simulator"
+            end
+
+            @sim_runner_speaker_toggles = 0
+            @destructor = @fn_destroy
+          end
+      end
+
+      # Pure-Ruby IR simulator used when no native extension can be loaded.
+      # Signals are held in a name => Integer table and masked to their declared
+      # width (capped at 64 bits) whenever they are written.
+      class RubyIrSim
+        # Binary comparison operators, mapped to the Integer method that decides them.
+        COMPARATORS = {
+          '==' => :==, '!=' => :!=, '<' => :<, '>' => :>,
+          '<=' => :<=, 'le' => :<=, '>=' => :>=
+        }.freeze
+
+        # Parses the IR document and seeds the signal table: ports and nets start
+        # at 0, registers start at their reset value (0 when none is given).
+        #
+        # @param json [String] IR JSON text
+        def initialize(json)
+          @ir = JSON.parse(json, symbolize_names: true, max_nesting: false)
+          @signals = {}
+          @widths = {}
+          @inputs = []
+          @outputs = []
+          @reset_values = {}
+
+          declare_ports
+          declare_nets
+          declare_regs
+
+          @assigns = @ir[:assigns] || []
+          @processes = @ir[:processes] || []
+        end
+
+        # @return [false] this implementation never uses a native library
+        def native?
+          false
+        end
+
+        # @param width [Integer] bit width
+        # @return [Integer] all-ones mask of that width; 64 bits for width >= 64
+        def mask(width)
+          return 0xFFFFFFFFFFFFFFFF if width >= 64
+
+          (1 << width) - 1
+        end
+
+        # Evaluates an IR expression hash against the current signal table.
+        # Unknown expression types and unknown binary operators evaluate to 0;
+        # unknown unary operators pass the operand through. Kept as a single
+        # method so each IR nesting level costs one Ruby call.
+        #
+        # @param expr [Hash] expression node with symbol keys
+        # @return [Integer]
+        def eval_expr(expr)
+          case expr[:type]
+          when 'signal'
+            (@signals[expr[:name]] || 0) & mask(expr[:width])
+          when 'literal'
+            expr[:value] & mask(expr[:width])
+          when 'unary_op'
+            operand = eval_expr(expr[:operand])
+            limit = mask(expr[:width])
+            case expr[:op]
+            when '~', 'not'
+              ~operand & limit
+            when '&', 'reduce_and'
+              all_ones = mask(expr[:operand][:width])
+              (operand & all_ones) == all_ones ? 1 : 0
+            when '|', 'reduce_or'
+              operand.zero? ? 0 : 1
+            when '^', 'reduce_xor'
+              # Parity of the set bits.
+              operand.to_s(2).count('1').odd? ? 1 : 0
+            else
+              operand
+            end
+          when 'binary_op'
+            lhs = eval_expr(expr[:left])
+            rhs = eval_expr(expr[:right])
+            limit = mask(expr[:width])
+            op = expr[:op]
+            comparator = COMPARATORS[op]
+            if comparator
+              lhs.public_send(comparator, rhs) ? 1 : 0
+            else
+              case op
+              when '&' then lhs & rhs
+              when '|' then lhs | rhs
+              when '^' then lhs ^ rhs
+              when '+' then (lhs + rhs) & limit
+              when '-' then (lhs - rhs) & limit
+              when '*' then (lhs * rhs) & limit
+              # Division and modulo by zero evaluate to 0 instead of raising.
+              when '/' then rhs.zero? ? 0 : lhs / rhs
+              when '%' then rhs.zero? ? 0 : lhs % rhs
+              # Shift amounts are capped at 63.
+              when '<<' then (lhs << (rhs > 63 ? 63 : rhs)) & limit
+              when '>>' then lhs >> (rhs > 63 ? 63 : rhs)
+              else 0
+              end
+            end
+          when 'mux'
+            selector = eval_expr(expr[:condition])
+            limit = mask(expr[:width])
+            # Only the selected branch is evaluated.
+            chosen = selector.zero? ? expr[:when_false] : expr[:when_true]
+            eval_expr(chosen) & limit
+          when 'slice'
+            (eval_expr(expr[:base]) >> expr[:low]) & mask(expr[:width])
+          when 'concat'
+            # The first part lands in the least-significant bits.
+            shift = 0
+            combined = expr[:parts].reduce(0) do |acc, part|
+              part_width = part[:width]
+              piece = (eval_expr(part) & mask(part_width)) << shift
+              shift += part_width
+              acc | piece
+            end
+            combined & mask(expr[:width])
+          when 'resize'
+            eval_expr(expr[:expr]) & mask(expr[:width])
+          else
+            0
+          end
+        end
+
+        # Drives an input port.
+        #
+        # @param name [String] input port name
+        # @param value [Integer] value, masked to the port width
+        # @raise [RuntimeError] when name is not a declared input
+        def poke(name, value)
+          known_input = @inputs.include?(name)
+          raise "Unknown input: #{name}" unless known_input
+
+          @signals[name] = value & mask(@widths[name] || 64)
+        end
+
+        # @param name [String] signal name
+        # @return [Integer] current value, 0 for unknown signals
+        def peek(name)
+          stored = @signals[name]
+          stored || 0
+        end
+
+        # Re-evaluates the continuous assignments until no signal changes,
+        # giving up after 10 passes.
+        def evaluate
+          10.times do
+            changed_count = @assigns.count { |assign| settle(assign) }
+            break if changed_count.zero?
+          end
+        end
+
+        # Advances one clock: settles combinational logic, samples every
+        # statement of the clocked processes against the pre-update state,
+        # commits the sampled values, then settles again.
+        def tick
+          evaluate
+
+          pending = {}
+          @processes.select { |process| process[:clocked] }.each do |process|
+            statements = process[:statements]
+            next if statements.nil?
+
+            statements.each do |stmt|
+              target = stmt[:target]
+              pending[target] = eval_expr(stmt[:expr]) & mask(@widths[target] || 64)
+            end
+          end
+
+          @signals.merge!(pending)
+
+          evaluate
+        end
+
+        # Zeroes every signal, then restores the register reset values.
+        def reset
+          @signals.each_key { |name| @signals[name] = 0 }
+          @reset_values.each { |name, value| @signals[name] = value }
+        end
+
+        # @return [Integer] number of entries in the signal table
+        def signal_count
+          @signals.length
+        end
+
+        # @return [Integer] total number of statements across all processes
+        def reg_count
+          @processes.reduce(0) do |total, process|
+            statements = process[:statements]
+            total + (statements.nil? ? 0 : statements.length)
+          end
+        end
+
+        # @return [Array<String>] names of ports whose direction is 'in'
+        def input_names
+          @inputs
+        end
+
+        # @return [Array<String>] names of all other ports
+        def output_names
+          @outputs
+        end
+
+        # @return [Hash] element counts describing the loaded design
+        def stats
+          {
+            signal_count: signal_count,
+            reg_count: reg_count,
+            input_count: @inputs.length,
+            output_count: @outputs.length,
+            assign_count: @assigns.length,
+            process_count: @processes.length
+          }
+        end
+
+        private
+
+        # Registers every port at 0 and files it under inputs or outputs.
+        def declare_ports
+          (@ir[:ports] || []).each do |port|
+            name = port[:name]
+            @signals[name] = 0
+            @widths[name] = port[:width]
+            (port[:direction] == 'in' ? @inputs : @outputs) << name
+          end
+        end
+
+        # Registers every net at 0.
+        def declare_nets
+          (@ir[:nets] || []).each do |net|
+            name = net[:name]
+            @signals[name] = 0
+            @widths[name] = net[:width]
+          end
+        end
+
+        # Registers every register at its reset value (0 when absent).
+        def declare_regs
+          (@ir[:regs] || []).each do |reg|
+            name = reg[:name]
+            initial = reg[:reset_value] || 0
+            @signals[name] = initial
+            @widths[name] = reg[:width]
+            @reset_values[name] = initial
+          end
+        end
+
+        # Applies one continuous assignment.
+        #
+        # @return [Boolean] true when the target's stored value changed
+        def settle(assign)
+          target = assign[:target]
+          value = eval_expr(assign[:expr]) & mask(@widths[target] || 64)
+          return false if @signals[target] == value
+
+          @signals[target] = value
+          true
+        end
+      end
+
+      # Converts Behavior IR objects into the JSON document consumed by the
+      # simulators. Every method is a module function.
+      module IRToJson
+        module_function
+
+        # Serialises a whole IR module.
+        #
+        # @param ir [Object] responds to name, ports, nets, regs, assigns and
+        #   processes; memories, write_ports and sync_read_ports may be nil
+        # @return [String] JSON text, generated without a nesting limit
+        def convert(ir)
+          {
+            name: ir.name,
+            ports: ir.ports.map { |p| port_to_hash(p) },
+            nets: ir.nets.map { |n| net_to_hash(n) },
+            regs: ir.regs.map { |r| reg_to_hash(r) },
+            assigns: ir.assigns.map { |a| assign_to_hash(a) },
+            processes: ir.processes.map { |p| process_to_hash(p) },
+            memories: (ir.memories || []).map { |m| memory_to_hash(m) },
+            write_ports: (ir.write_ports || []).map { |wp| write_port_to_hash(wp) },
+            sync_read_ports: (ir.sync_read_ports || []).map { |rp| sync_read_port_to_hash(rp) }
+          }.to_json(max_nesting: false)
+        end
+
+        # @return [Hash] name, direction and width of a port
+        def port_to_hash(port)
+          {
+            name: port.name.to_s,
+            direction: port.direction.to_s,
+            width: port.width
+          }
+        end
+
+        # @return [Hash] name and width of a net
+        def net_to_hash(net)
+          {
+            name: net.name.to_s,
+            width: net.width
+          }
+        end
+
+        # @return [Hash] name and width of a register, plus :reset_value when
+        #   the register declares one
+        def reg_to_hash(reg)
+          hash = {
+            name: reg.name.to_s,
+            width: reg.width
+          }
+          hash[:reset_value] = reg.reset_value if reg.reset_value
+          hash
+        end
+
+        # @return [Hash] target name and expression of a continuous assignment
+        def assign_to_hash(assign)
+          {
+            target: assign.target.to_s,
+            expr: expr_to_hash(assign.expr)
+          }
+        end
+
+        # @return [Hash] process description whose statements have been
+        #   flattened into target/expression pairs
+        def process_to_hash(process)
+          {
+            name: process.name.to_s,
+            clock: process.clock&.to_s,
+            clocked: process.clocked,
+            statements: flatten_statements(process.statements)
+          }
+        end
+
+        # Lowers a statement list to a flat list of { target:, expr: } hashes.
+        # Sequential assignments are emitted as they stand, If statements become
+        # mux expressions, and any other statement kind is ignored.
+        #
+        # @param stmts [Array, nil]
+        # @return [Array<Hash>]
+        def flatten_statements(stmts)
+          return [] unless stmts
+          result = []
+          stmts.each do |stmt|
+            case stmt
+            when RHDL::Codegen::IR::SeqAssign
+              result << seq_assign_to_hash(stmt)
+            when RHDL::Codegen::IR::If
+              flatten_if(stmt, result)
+            end
+          end
+          result
+        end
+
+        # Appends one mux assignment per target written inside an If statement.
+        # A target written in only one branch keeps its current value in the
+        # other. For the else-only case the condition is inverted so the
+        # assigned value sits on the true side.
+        #
+        # Nested If statements are folded into their enclosing branch first, so
+        # their assignments stay guarded by every enclosing condition rather
+        # than being emitted as unconditional top-level entries.
+        #
+        # @param if_stmt [RHDL::Codegen::IR::If]
+        # @param result [Array<Hash>] list the assignments are appended to
+        def flatten_if(if_stmt, result)
+          cond = expr_to_hash(if_stmt.condition)
+          then_assigns = branch_assigns(if_stmt.then_statements)
+          else_assigns = branch_assigns(if_stmt.else_statements)
+
+          all_targets = (then_assigns.keys + else_assigns.keys).uniq
+          all_targets.each do |target|
+            then_expr = then_assigns[target]
+            else_expr = else_assigns[target]
+            # Width falls back to 8 when the branch expression carries none.
+            width = (then_expr || else_expr)&.dig(:width) || 8
+
+            if then_expr && else_expr
+              result << {
+                target: target,
+                expr: { type: 'mux', condition: cond, when_true: then_expr, when_false: else_expr, width: width }
+              }
+            elsif then_expr
+              result << {
+                target: target,
+                expr: { type: 'mux', condition: cond, when_true: then_expr, when_false: { type: 'signal', name: target, width: width }, width: width }
+              }
+            elsif else_expr
+              inv_cond = { type: 'unary_op', op: '~', operand: cond, width: 1 }
+              result << {
+                target: target,
+                expr: { type: 'mux', condition: inv_cond, when_true: else_expr, when_false: { type: 'signal', name: target, width: width }, width: width }
+              }
+            end
+          end
+        end
+
+        # Collects the value each target holds at the end of one branch.
+        # Statements are applied in order, so a later write replaces an earlier
+        # one and a nested If refines whatever the branch assigned before it.
+        #
+        # @param stmts [Array, nil] statements of a then- or else-branch
+        # @return [Hash{String => Hash}] target name => expression hash
+        def branch_assigns(stmts)
+          assigns = {}
+          stmts&.each do |stmt|
+            case stmt
+            when RHDL::Codegen::IR::SeqAssign
+              assigns[stmt.target.to_s] = expr_to_hash(stmt.expr)
+            when RHDL::Codegen::IR::If
+              merge_nested_if(stmt, assigns)
+            end
+          end
+          assigns
+        end
+
+        # Folds a nested If into the assignments of its enclosing branch. Each
+        # target the nested If writes becomes a mux on the nested condition; a
+        # side that does not write the target holds the value assigned earlier
+        # in the enclosing branch, or the target's current value.
+        #
+        # @param if_stmt [RHDL::Codegen::IR::If] the nested statement
+        # @param assigns [Hash{String => Hash}] enclosing branch state, updated in place
+        def merge_nested_if(if_stmt, assigns)
+          cond = expr_to_hash(if_stmt.condition)
+          then_assigns = branch_assigns(if_stmt.then_statements)
+          else_assigns = branch_assigns(if_stmt.else_statements)
+
+          (then_assigns.keys + else_assigns.keys).uniq.each do |target|
+            then_expr = then_assigns[target]
+            else_expr = else_assigns[target]
+            width = (then_expr || else_expr)&.dig(:width) || 8
+            hold = assigns[target] || { type: 'signal', name: target, width: width }
+            assigns[target] = {
+              type: 'mux', condition: cond,
+              when_true: then_expr || hold, when_false: else_expr || hold, width: width
+            }
+          end
+        end
+
+        # @return [Hash] target name and expression of a sequential assignment
+        def seq_assign_to_hash(stmt)
+          {
+            target: stmt.target.to_s,
+            expr: expr_to_hash(stmt.expr)
+          }
+        end
+
+        # @return [Hash] name, depth and width of a memory, plus :initial_data
+        #   when the memory declares it
+        def memory_to_hash(mem)
+          hash = {
+            name: mem.name.to_s,
+            depth: mem.depth,
+            width: mem.width
+          }
+          hash[:initial_data] = mem.initial_data if mem.initial_data
+          hash
+        end
+
+        # @return [Hash] memory and clock names with addr, data and enable expressions
+        def write_port_to_hash(wp)
+          {
+            memory: wp.memory.to_s,
+            clock: wp.clock.to_s,
+            addr: expr_to_hash(wp.addr),
+            data: expr_to_hash(wp.data),
+            enable: expr_to_hash(wp.enable)
+          }
+        end
+
+        # @return [Hash] memory, clock and data names with the addr expression,
+        #   plus :enable when the port has one
+        def sync_read_port_to_hash(rp)
+          hash = {
+            memory: rp.memory.to_s,
+            clock: rp.clock.to_s,
+            addr: expr_to_hash(rp.addr),
+            data: rp.data.to_s
+          }
+          hash[:enable] = expr_to_hash(rp.enable) if rp.enable
+          hash
+        end
+
+        # Converts an IR expression node into its hash form.
+        # Case expressions are lowered to a chain of equality-test muxes over
+        # the default (or a zero literal); unrecognised nodes become a 1-bit
+        # zero literal.
+        #
+        # @param expr [Object] IR expression node
+        # @return [Hash]
+        def expr_to_hash(expr)
+          case expr
+          when RHDL::Codegen::IR::Signal
+            { type: 'signal', name: expr.name.to_s, width: expr.width }
+          when RHDL::Codegen::IR::Literal
+            { type: 'literal', value: expr.value, width: expr.width }
+          when RHDL::Codegen::IR::UnaryOp
+            { type: 'unary_op', op: expr.op.to_s, operand: expr_to_hash(expr.operand), width: expr.width }
+          when RHDL::Codegen::IR::BinaryOp
+            { type: 'binary_op', op: expr.op.to_s, left: expr_to_hash(expr.left), right: expr_to_hash(expr.right), width: expr.width }
+          when RHDL::Codegen::IR::Mux
+            { type: 'mux', condition: expr_to_hash(expr.condition), when_true: expr_to_hash(expr.when_true), when_false: expr_to_hash(expr.when_false), width: expr.width }
+          when RHDL::Codegen::IR::Slice
+            # Default to the low `width` bits when the range is not made of integers.
+            low = 0
+            high = expr.width - 1
+
+            if expr.range.is_a?(Range)
+              range_begin = expr.range.begin
+              range_end = expr.range.end
+              if range_begin.is_a?(Integer) && range_end.is_a?(Integer)
+                # Accept either ordering of the range bounds.
+                low = [range_begin, range_end].min
+                high = [range_begin, range_end].max
+              end
+            elsif expr.range.is_a?(Integer)
+              low = expr.range
+              high = expr.range
+            end
+            { type: 'slice', base: expr_to_hash(expr.base), low: low, high: high, width: expr.width }
+          when RHDL::Codegen::IR::Concat
+            { type: 'concat', parts: expr.parts.map { |p| expr_to_hash(p) }, width: expr.width }
+          when RHDL::Codegen::IR::Resize
+            { type: 'resize', expr: expr_to_hash(expr.expr), width: expr.width }
+          when RHDL::Codegen::IR::Case
+            if expr.cases.empty?
+              expr_to_hash(expr.default)
+            else
+              result = expr.default ? expr_to_hash(expr.default) : { type: 'literal', value: 0, width: expr.width }
+              expr.cases.each do |values, case_expr|
+                values.each do |v|
+                  cond = { type: 'binary_op', op: '==', left: expr_to_hash(expr.selector), right: { type: 'literal', value: v, width: expr.selector.width }, width: 1 }
+                  result = { type: 'mux', condition: cond, when_true: expr_to_hash(case_expr), when_false: result, width: expr.width }
+                end
+              end
+              result
+            end
+          when RHDL::Codegen::IR::MemoryRead
+            { type: 'mem_read', memory: expr.memory.to_s, addr: expr_to_hash(expr.addr), width: expr.width }
+          else
+            { type: 'literal', value: 0, width: 1 }
+          end
+        end
+      end
+    end
+    end
+  end
+end
diff --git a/lib/rhdl/sim/native/ir/ir_simulator.rb b/lib/rhdl/sim/native/ir/ir_simulator.rb
new file mode 100644
index 00000000..aadac4c3
--- /dev/null
+++ b/lib/rhdl/sim/native/ir/ir_simulator.rb
@@ -0,0 +1,1511 @@
+# frozen_string_literal: true
+
+# IR-level bytecode interpreter with Rust backend (Fiddle-based)
+#
+# This simulator operates at the IR level, interpreting Behavior IR using
+# a stack-based bytecode interpreter. It's faster than gate-level netlist
+# simulation because it operates on whole words instead of individual bits.
+#
+# Uses Fiddle (Ruby's built-in FFI) to call the Rust library directly,
+# similar to the JIT and Verilator runners.
+
+require 'json'
+require 'fiddle'
+require 'fiddle/import'
+require 'rbconfig'
+
+module RHDL
+  module Sim
+    module Native
+      module IR
+      # Builds the platform-specific shared-library file name for a backend:
+      # .dylib when host_os matches darwin, .dll for mswin/mingw, .so otherwise.
+      #
+      # @param base [String] library base name without extension
+      # @return [String]
+      def self.sim_lib_name(base)
+        host = RbConfig::CONFIG['host_os']
+        extension =
+          if /darwin/.match?(host) then 'dylib'
+          elsif /mswin|mingw/.match?(host) then 'dll'
+          else 'so'
+          end
+        "#{base}.#{extension}"
+      end
+
+      # Checks whether a backend library can be used: the file must exist, load
+      # through Fiddle, and export sim_create, sim_signal and sim_exec.
+      #
+      # @param lib_path [String] path to the shared library
+      # @return [Boolean] false when the file is absent or Fiddle raises DLError
+      def self.sim_backend_available?(lib_path)
+        if File.exist?(lib_path)
+          handle = Fiddle.dlopen(lib_path)
+          # Looking a symbol up raises Fiddle::DLError when it is not exported.
+          %w[sim_create sim_signal sim_exec].each { |symbol| handle[symbol] }
+          true
+        else
+          false
+        end
+      rescue Fiddle::DLError
+        false
+      end
+
+      # Each backend ships as a shared library under <backend>/lib next to this
+      # file. The *_AVAILABLE flags are computed once, when this file is loaded.
+
+      # Interpreter backend
+      IR_INTERPRETER_EXT_DIR = File.expand_path('ir_interpreter/lib', __dir__)
+      IR_INTERPRETER_LIB_NAME = sim_lib_name('ir_interpreter')
+      IR_INTERPRETER_LIB_PATH = File.join(IR_INTERPRETER_EXT_DIR, IR_INTERPRETER_LIB_NAME)
+      IR_INTERPRETER_AVAILABLE = sim_backend_available?(IR_INTERPRETER_LIB_PATH)
+
+      # JIT backend
+      JIT_EXT_DIR = File.expand_path('ir_jit/lib', __dir__)
+      JIT_LIB_NAME = sim_lib_name('ir_jit')
+      JIT_LIB_PATH = File.join(JIT_EXT_DIR, JIT_LIB_NAME)
+      JIT_AVAILABLE = sim_backend_available?(JIT_LIB_PATH)
+
+      # Compiler backend
+      COMPILER_EXT_DIR = File.expand_path('ir_compiler/lib', __dir__)
+      COMPILER_LIB_NAME = sim_lib_name('ir_compiler')
+      COMPILER_LIB_PATH = File.join(COMPILER_EXT_DIR, COMPILER_LIB_NAME)
+      COMPILER_AVAILABLE = sim_backend_available?(COMPILER_LIB_PATH)
+
+      # Unified IR simulator wrapper for interpreter, JIT and compiler backends.
+      class IrSimulator
+        attr_reader :ir_json, :sub_cycles
+
+        RUNNER_KIND_NONE = 0
+        RUNNER_KIND_APPLE2 = 1
+        RUNNER_KIND_MOS6502 = 2
+        RUNNER_KIND_GAMEBOY = 3
+        RUNNER_KIND_CPU8BIT = 4
+        RUNNER_KIND_RISCV = 5
+
+        RUNNER_MEM_OP_LOAD = 0
+        RUNNER_MEM_OP_READ = 1
+        RUNNER_MEM_OP_WRITE = 2
+
+        RUNNER_MEM_SPACE_MAIN = 0
+        RUNNER_MEM_SPACE_ROM = 1
+        RUNNER_MEM_SPACE_BOOT_ROM = 2
+        RUNNER_MEM_SPACE_VRAM = 3
+        RUNNER_MEM_SPACE_ZPRAM = 4
+        RUNNER_MEM_SPACE_WRAM = 5
+        RUNNER_MEM_SPACE_FRAMEBUFFER = 6
+        RUNNER_MEM_SPACE_DISK = 7
+        RUNNER_MEM_SPACE_UART_TX = 8
+        RUNNER_MEM_SPACE_UART_RX = 9
+
+        RUNNER_MEM_FLAG_MAPPED = 1
+
+        RUNNER_RUN_MODE_BASIC = 0
+        RUNNER_RUN_MODE_FULL = 1
+
+        RUNNER_CONTROL_SET_RESET_VECTOR = 0
+        RUNNER_CONTROL_RESET_SPEAKER_TOGGLES = 1
+        RUNNER_CONTROL_RESET_LCD = 2
+        RUNNER_CONTROL_RISCV_SET_IRQS = 3
+        RUNNER_CONTROL_RISCV_SET_PLIC_SOURCES = 4
+        RUNNER_CONTROL_RISCV_UART_PUSH_RX = 5
+        RUNNER_CONTROL_RISCV_CLEAR_UART_TX = 6
+
+        RUNNER_PROBE_KIND = 0
+        RUNNER_PROBE_IS_MODE = 1
+        RUNNER_PROBE_SPEAKER_TOGGLES = 2
+        RUNNER_PROBE_FRAMEBUFFER_LEN = 3
+        RUNNER_PROBE_FRAME_COUNT = 4
+        RUNNER_PROBE_V_CNT = 5
+        RUNNER_PROBE_H_CNT = 6
+        RUNNER_PROBE_VBLANK_IRQ = 7
+        RUNNER_PROBE_IF_R = 8
+        RUNNER_PROBE_SIGNAL = 9
+        RUNNER_PROBE_LCDC_ON = 10
+        RUNNER_PROBE_H_DIV_CNT = 11
+        RUNNER_PROBE_RISCV_UART_TX_LEN = 17
+
+        SIM_CAP_SIGNAL_INDEX = 1 << 0
+        SIM_CAP_FORCED_CLOCK = 1 << 1
+        SIM_CAP_TRACE = 1 << 2
+        SIM_CAP_TRACE_STREAMING = 1 << 3
+        SIM_CAP_COMPILE = 1 << 4
+
+        SIM_SIGNAL_HAS = 0
+        SIM_SIGNAL_GET_INDEX = 1
+        SIM_SIGNAL_PEEK = 2
+        SIM_SIGNAL_POKE = 3
+        SIM_SIGNAL_PEEK_INDEX = 4
+        SIM_SIGNAL_POKE_INDEX = 5
+
+        SIM_EXEC_EVALUATE = 0
+        SIM_EXEC_TICK = 1
+        SIM_EXEC_TICK_FORCED = 2
+        SIM_EXEC_SET_PREV_CLOCK = 3
+        SIM_EXEC_GET_CLOCK_LIST_IDX = 4
+        SIM_EXEC_RESET = 5
+        SIM_EXEC_RUN_TICKS = 6
+        SIM_EXEC_SIGNAL_COUNT = 7
+        SIM_EXEC_REG_COUNT = 8
+        SIM_EXEC_COMPILE = 9
+        SIM_EXEC_IS_COMPILED = 10
+
+        SIM_TRACE_START = 0
+        SIM_TRACE_START_STREAMING = 1
+        SIM_TRACE_STOP = 2
+        SIM_TRACE_ENABLED = 3
+        SIM_TRACE_CAPTURE = 4
+        SIM_TRACE_ADD_SIGNAL = 5
+        SIM_TRACE_ADD_SIGNALS_MATCHING = 6
+        SIM_TRACE_ALL_SIGNALS = 7
+        SIM_TRACE_CLEAR_SIGNALS = 8
+        SIM_TRACE_CLEAR = 9
+        SIM_TRACE_CHANGE_COUNT = 10
+        SIM_TRACE_SIGNAL_COUNT = 11
+        SIM_TRACE_SET_TIMESCALE = 12
+        SIM_TRACE_SET_MODULE_NAME = 13
+        SIM_TRACE_SAVE_VCD = 14
+
+        SIM_BLOB_INPUT_NAMES = 0
+        SIM_BLOB_OUTPUT_NAMES = 1
+        SIM_BLOB_TRACE_TO_VCD = 2
+        SIM_BLOB_TRACE_TAKE_LIVE_VCD = 3
+        SIM_BLOB_GENERATED_CODE = 4
+
+        # Per-backend settings, keyed by the normalized backend name.
+        #   available:     whether the backend may be selected (see #select_backend)
+        #   lib_path:      path handed to Fiddle.dlopen in #load_library
+        #   native_symbol: value stored in @backend and reported by #backend
+        #   label:         name used in simulator creation error messages
+        BACKEND_CONFIGS = {
+          interpreter: {
+            available: IR_INTERPRETER_AVAILABLE,
+            lib_path: IR_INTERPRETER_LIB_PATH,
+            native_symbol: :interpret,
+            label: 'interpreter'
+          },
+          jit: {
+            available: JIT_AVAILABLE,
+            lib_path: JIT_LIB_PATH,
+            native_symbol: :jit,
+            label: 'jit'
+          },
+          compiler: {
+            available: COMPILER_AVAILABLE,
+            lib_path: COMPILER_LIB_PATH,
+            native_symbol: :compile,
+            label: 'compiler'
+          }
+        }.freeze
+
+        # @param ir_json [String] JSON representation of the IR
+        # @param backend [Symbol] :interpreter, :jit, :compiler, or :auto
+        # @param allow_fallback [Boolean] Allow fallback to another backend or Ruby implementation
+        # @param sub_cycles [Integer] Number of sub-cycles per CPU cycle (default: 14)
+        def initialize(ir_json, backend: :interpreter, allow_fallback: true, sub_cycles: 14)
+          @ir_json = ir_json
+          @sub_cycles = sub_cycles.clamp(1, 14)
+          @requested_backend = normalize_backend(backend)
+
+          selected = select_backend(@requested_backend)
+
+          if selected
+            configure_backend(selected)
+            load_library
+            create_simulator
+            compile if @backend == :compile
+          elsif allow_fallback
+            @sim = RubyIrSim.new(ir_json)
+            @backend = :ruby
+            @fallback = true
+          else
+            raise LoadError, unavailable_backend_error_message(@requested_backend)
+          end
+        end
+
+        def simulator_type
+          :"hdl_#{@backend}"
+        end
+
+        def native?
+          !@fallback && @backend != :ruby
+        end
+
+        def backend
+          @backend
+        end
+
+        def poke(name, value)
+          return @sim.poke(name, value) if @fallback
+          core_signal(SIM_SIGNAL_POKE, name: name, value: value)[:ok]
+        end
+
+        def peek(name)
+          return @sim.peek(name) if @fallback
+          core_signal(SIM_SIGNAL_PEEK, name: name)[:value]
+        end
+
+        def has_signal?(name)
+          return @sim.respond_to?(:has_signal?) && @sim.has_signal?(name) if @fallback
+          core_signal(SIM_SIGNAL_HAS, name: name)[:value] != 0
+        end
+
+        def evaluate
+          return @sim.evaluate if @fallback
+          core_exec(SIM_EXEC_EVALUATE)
+        end
+
+        def tick
+          return @sim.tick if @fallback
+          core_exec(SIM_EXEC_TICK)
+        end
+
+        def tick_forced
+          return @sim.tick if @fallback  # Ruby fallback doesn't need edge detection
+          core_exec(SIM_EXEC_TICK_FORCED)
+        end
+
+        def set_prev_clock(clock_list_idx, value)
+          return if @fallback  # Ruby fallback doesn't track prev clocks
+          core_exec(SIM_EXEC_SET_PREV_CLOCK, clock_list_idx, value)
+        end
+
+        def get_clock_list_idx(signal_idx)
+          return -1 if @fallback
+          result = core_exec(SIM_EXEC_GET_CLOCK_LIST_IDX, signal_idx)
+          result[:ok] ? result[:value] : -1
+        end
+
+        def reset
+          return @sim.reset if @fallback
+          @sim_runner_speaker_toggles = 0
+          core_exec(SIM_EXEC_RESET)
+        end
+
+        def signal_count
+          return @sim.signal_count if @fallback
+          core_exec(SIM_EXEC_SIGNAL_COUNT)[:value]
+        end
+
+        def reg_count
+          return @sim.reg_count if @fallback
+          core_exec(SIM_EXEC_REG_COUNT)[:value]
+        end
+
+        def compiled?
+          return false if @fallback
+          core_exec(SIM_EXEC_IS_COMPILED)[:value] != 0
+        end
+
+        def compile
+          return true if @fallback
+
+          error_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
+          error_ptr[0, Fiddle::SIZEOF_VOIDP] = [0].pack('Q')
+          result = core_exec(SIM_EXEC_COMPILE, 0, 0, error_ptr)
+          return result[:value] != 0 if result[:ok]
+
+          error_str_ptr = error_ptr[0, Fiddle::SIZEOF_VOIDP].unpack1('Q')
+          if error_str_ptr != 0
+            error_msg = Fiddle::Pointer.new(error_str_ptr).to_s
+            @fn_free_error.call(error_str_ptr)
+            raise RuntimeError, "Compilation failed: #{error_msg}"
+          end
+          false
+        end
+
+        def generated_code
+          return '' if @fallback
+          core_blob(SIM_BLOB_GENERATED_CODE)
+        end
+
+        def input_names
+          return @sim.input_names if @fallback
+          csv = core_blob(SIM_BLOB_INPUT_NAMES)
+          csv.empty? ? [] : csv.split(',')
+        end
+
+        def output_names
+          return @sim.output_names if @fallback
+          csv = core_blob(SIM_BLOB_OUTPUT_NAMES)
+          csv.empty? ? [] : csv.split(',')
+        end
+
+        # VCD tracing methods
+        def trace_start
+          return @sim.trace_start if @fallback && @sim.respond_to?(:trace_start)
+          return false if @fallback
+          core_trace(SIM_TRACE_START)[:ok]
+        end
+
+        def trace_start_streaming(path)
+          return @sim.trace_start_streaming(path) if @fallback && @sim.respond_to?(:trace_start_streaming)
+          return false if @fallback
+          core_trace(SIM_TRACE_START_STREAMING, path)[:ok]
+        end
+
+        def trace_stop
+          return @sim.trace_stop if @fallback && @sim.respond_to?(:trace_stop)
+          return nil if @fallback
+          core_trace(SIM_TRACE_STOP)
+        end
+
+        def trace_enabled?
+          return @sim.trace_enabled? if @fallback && @sim.respond_to?(:trace_enabled?)
+          return false if @fallback
+          core_trace(SIM_TRACE_ENABLED)[:value] != 0
+        end
+
+        def trace_capture
+          return @sim.trace_capture if @fallback && @sim.respond_to?(:trace_capture)
+          return nil if @fallback
+          core_trace(SIM_TRACE_CAPTURE)
+        end
+
+        def trace_add_signal(name)
+          return @sim.trace_add_signal(name) if @fallback && @sim.respond_to?(:trace_add_signal)
+          return false if @fallback
+          core_trace(SIM_TRACE_ADD_SIGNAL, name)[:ok]
+        end
+
+        def trace_add_signals_matching(pattern)
+          return @sim.trace_add_signals_matching(pattern) if @fallback && @sim.respond_to?(:trace_add_signals_matching)
+          return 0 if @fallback
+          core_trace(SIM_TRACE_ADD_SIGNALS_MATCHING, pattern)[:value]
+        end
+
+        def trace_all_signals
+          return @sim.trace_all_signals if @fallback && @sim.respond_to?(:trace_all_signals)
+          return nil if @fallback
+          core_trace(SIM_TRACE_ALL_SIGNALS)
+        end
+
+        def trace_clear_signals
+          return @sim.trace_clear_signals if @fallback && @sim.respond_to?(:trace_clear_signals)
+          return nil if @fallback
+          core_trace(SIM_TRACE_CLEAR_SIGNALS)
+        end
+
+        def trace_to_vcd
+          return @sim.trace_to_vcd if @fallback && @sim.respond_to?(:trace_to_vcd)
+          return '' if @fallback
+          core_blob(SIM_BLOB_TRACE_TO_VCD)
+        end
+
+        def trace_take_live_vcd
+          return @sim.trace_take_live_vcd if @fallback && @sim.respond_to?(:trace_take_live_vcd)
+          return '' if @fallback
+          core_blob(SIM_BLOB_TRACE_TAKE_LIVE_VCD)
+        end
+
+        def trace_save_vcd(path)
+          return @sim.trace_save_vcd(path) if @fallback && @sim.respond_to?(:trace_save_vcd)
+          return false if @fallback
+          core_trace(SIM_TRACE_SAVE_VCD, path)[:ok]
+        end
+
+        def trace_clear
+          return @sim.trace_clear if @fallback && @sim.respond_to?(:trace_clear)
+          return nil if @fallback
+          core_trace(SIM_TRACE_CLEAR)
+        end
+
+        def trace_change_count
+          return @sim.trace_change_count if @fallback && @sim.respond_to?(:trace_change_count)
+          return 0 if @fallback
+          core_trace(SIM_TRACE_CHANGE_COUNT)[:value]
+        end
+
+        def trace_signal_count
+          return @sim.trace_signal_count if @fallback && @sim.respond_to?(:trace_signal_count)
+          return 0 if @fallback
+          core_trace(SIM_TRACE_SIGNAL_COUNT)[:value]
+        end
+
+        def trace_set_timescale(timescale)
+          return @sim.trace_set_timescale(timescale) if @fallback && @sim.respond_to?(:trace_set_timescale)
+          return false if @fallback
+          core_trace(SIM_TRACE_SET_TIMESCALE, timescale)[:ok]
+        end
+
+        def trace_set_module_name(name)
+          return @sim.trace_set_module_name(name) if @fallback && @sim.respond_to?(:trace_set_module_name)
+          return false if @fallback
+          core_trace(SIM_TRACE_SET_MODULE_NAME, name)[:ok]
+        end
+
+        def stats
+          return @sim.stats if @fallback
+          runner_kind = runner_kind
+          {
+            signals: signal_count,
+            regs: reg_count,
+            runner_kind: runner_kind,
+            runner_mode: runner_mode?,
+            apple2_mode: runner_kind == :apple2,
+            gameboy_mode: gameboy_mode?,
+            mos6502_mode: runner_kind == :mos6502,
+            cpu8bit_mode: runner_kind == :cpu8bit,
+            riscv_mode: runner_kind == :riscv
+          }
+        end
+
+        # Batched tick execution
+        def run_ticks(n)
+          return @sim.respond_to?(:run_ticks) ? @sim.run_ticks(n) : n.times { @sim.tick } if @fallback
+          core_exec(SIM_EXEC_RUN_TICKS, n)
+        end
+
+        # Get signal index by name (for caching)
+        def get_signal_idx(name)
+          return @sim.respond_to?(:get_signal_idx) ? @sim.get_signal_idx(name) : nil if @fallback
+          result = core_signal(SIM_SIGNAL_GET_INDEX, name: name)
+          result[:ok] ? result[:value] : nil
+        end
+
+        # Poke by index - faster than by name when index is cached
+        def poke_by_idx(idx, value)
+          return @sim.poke_by_idx(idx, value) if @fallback && @sim.respond_to?(:poke_by_idx)
+          core_signal(SIM_SIGNAL_POKE_INDEX, idx: idx, value: value)
+        end
+
+        # Peek by index - faster than by name when index is cached
+        def peek_by_idx(idx)
+          return @sim.peek_by_idx(idx) if @fallback && @sim.respond_to?(:peek_by_idx)
+          core_signal(SIM_SIGNAL_PEEK_INDEX, idx: idx)[:value]
+        end
+
+        # ====================================================================
+        # Unified Runner Extension Methods
+        # ====================================================================
+
+        def runner_kind
+          if @fallback
+            return @sim.runner_kind if @sim.respond_to?(:runner_kind)
+            return nil
+          end
+
+          case runner_probe(RUNNER_PROBE_KIND)
+          when RUNNER_KIND_APPLE2 then :apple2
+          when RUNNER_KIND_MOS6502 then :mos6502
+          when RUNNER_KIND_GAMEBOY then :gameboy
+          when RUNNER_KIND_CPU8BIT then :cpu8bit
+          when RUNNER_KIND_RISCV then :riscv
+          else nil
+          end
+        end
+
+        def runner_mode?
+          if @fallback
+            return @sim.runner_mode? if @sim.respond_to?(:runner_mode?)
+            return !runner_kind.nil?
+          end
+          runner_probe(RUNNER_PROBE_IS_MODE) != 0
+        end
+
+        def runner_load_memory(data, offset = 0, is_rom = false)
+          if @fallback
+            return @sim.runner_load_memory(data, offset, is_rom) if @sim.respond_to?(:runner_load_memory)
+            return false
+          end
+          data = data.pack('C*') if data.is_a?(Array)
+          return false if data.nil? || data.bytesize.zero?
+
+          space = is_rom ? RUNNER_MEM_SPACE_ROM : RUNNER_MEM_SPACE_MAIN
+          runner_mem(RUNNER_MEM_OP_LOAD, space, offset, data, 0) > 0
+        end
+
+        def runner_read_memory(offset, length, mapped: true)
+          length = [length.to_i, 0].max
+          if @fallback
+            return @sim.runner_read_memory(offset, length, mapped: mapped) if @sim.respond_to?(:runner_read_memory)
+            return Array.new(length, 0)
+          end
+          return [] if length.zero?
+
+          flags = mapped ? RUNNER_MEM_FLAG_MAPPED : 0
+          runner_mem_read(RUNNER_MEM_SPACE_MAIN, offset, length, flags)
+        end
+
+        def runner_write_memory(offset, data, mapped: true)
+          if @fallback
+            return @sim.runner_write_memory(offset, data, mapped: mapped) if @sim.respond_to?(:runner_write_memory)
+            return 0
+          end
+          data = data.pack('C*') if data.is_a?(Array)
+          return 0 if data.nil? || data.bytesize.zero?
+
+          flags = mapped ? RUNNER_MEM_FLAG_MAPPED : 0
+          runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_MAIN, offset, data, flags)
+        end
+
+        def runner_run_cycles(n, key_data = 0, key_ready = false)
+          if @fallback
+            return @sim.runner_run_cycles(n, key_data, key_ready) if @sim.respond_to?(:runner_run_cycles)
+            return { text_dirty: false, key_cleared: false, cycles_run: 0, speaker_toggles: 0 }
+          end
+
+          result_buf = Fiddle::Pointer.malloc(20)
+          ok = @fn_runner_run.call(
+            @ctx,
+            n,
+            key_data,
+            key_ready ? 1 : 0,
+            RUNNER_RUN_MODE_BASIC,
+            result_buf
+          )
+          return nil if ok == 0
+
+          values = result_buf[0, 20].unpack('llLLL')
+          result = {
+            text_dirty: values[0] != 0,
+            key_cleared: values[1] != 0,
+            cycles_run: values[2],
+            speaker_toggles: values[3]
+          }
+          @sim_runner_speaker_toggles = ((@sim_runner_speaker_toggles || 0) + result[:speaker_toggles]) & 0xFFFFFFFF
+          result
+        end
+
+        def runner_load_rom(data, offset = 0)
+          if @fallback
+            return @sim.runner_load_rom(data, offset) if @sim.respond_to?(:runner_load_rom)
+          end
+
+          data = data.pack('C*') if data.is_a?(Array)
+          return false if data.nil? || data.bytesize.zero?
+          runner_mem(RUNNER_MEM_OP_LOAD, RUNNER_MEM_SPACE_ROM, offset, data, 0) > 0
+        end
+
+        def runner_read_rom(offset, length)
+          length = [length.to_i, 0].max
+          if @fallback
+            return @sim.runner_read_rom(offset, length) if @sim.respond_to?(:runner_read_rom)
+            return Array.new(length, 0)
+          end
+          return [] if length.zero?
+
+          runner_mem_read(RUNNER_MEM_SPACE_ROM, offset, length, 0)
+        end
+
+        def runner_set_reset_vector(addr)
+          vector = addr.to_i & 0xFFFF_FFFF
+          if @fallback
+            return @sim.runner_set_reset_vector(vector) if @sim.respond_to?(:runner_set_reset_vector)
+          end
+          return false unless @fn_runner_control
+
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_SET_RESET_VECTOR, vector, 0) != 0
+        end
+
+        def runner_speaker_toggles
+          if @fallback
+            return @sim.runner_speaker_toggles if @sim.respond_to?(:runner_speaker_toggles)
+            return 0
+          end
+          return runner_probe(RUNNER_PROBE_SPEAKER_TOGGLES) if runner_kind == :mos6502
+          @sim_runner_speaker_toggles || 0
+        end
+
+        def runner_reset_speaker_toggles
+          if @fallback
+            return @sim.runner_reset_speaker_toggles if @sim.respond_to?(:runner_reset_speaker_toggles)
+            return nil
+          end
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_RESET_SPEAKER_TOGGLES, 0, 0)
+          @sim_runner_speaker_toggles = 0
+          nil
+        end
+
+        # ====================================================================
+        # RISC-V Extension Methods
+        # ====================================================================
+
+        def riscv_mode?
+          return @sim.riscv_mode? if @fallback && @sim.respond_to?(:riscv_mode?)
+          return false if @fallback
+          runner_kind == :riscv
+        end
+
+        def runner_riscv_set_interrupts(software: false, timer: false, external: false)
+          return false unless riscv_mode?
+          bits = 0
+          bits |= 0x1 if software
+          bits |= 0x2 if timer
+          bits |= 0x4 if external
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_RISCV_SET_IRQS, bits, 0) != 0
+        end
+
+        def runner_riscv_set_plic_sources(source1: false, source10: false)
+          return false unless riscv_mode?
+          bits = 0
+          bits |= 0x1 if source1
+          bits |= 0x2 if source10
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_RISCV_SET_PLIC_SOURCES, bits, 0) != 0
+        end
+
+        def runner_riscv_uart_receive_byte(byte)
+          runner_riscv_uart_receive_bytes([byte.to_i & 0xFF])
+        end
+
+        def runner_riscv_uart_receive_bytes(bytes)
+          return false unless riscv_mode?
+
+          payload = if bytes.is_a?(String)
+            bytes.b
+          elsif bytes.respond_to?(:pack)
+            bytes.pack('C*')
+          else
+            Array(bytes).pack('C*')
+          end
+          return true if payload.empty?
+
+          runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_UART_RX, 0, payload, 0) > 0
+        end
+
+        def runner_riscv_uart_receive_text(text)
+          runner_riscv_uart_receive_bytes(text.to_s.b)
+        end
+
+        def runner_riscv_uart_tx_bytes
+          return [] unless riscv_mode?
+          len = runner_probe(RUNNER_PROBE_RISCV_UART_TX_LEN).to_i
+          return [] if len <= 0
+          runner_mem_read(RUNNER_MEM_SPACE_UART_TX, 0, len, 0)
+        end
+
+        def runner_riscv_clear_uart_tx_bytes
+          return nil unless riscv_mode?
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_RISCV_CLEAR_UART_TX, 0, 0)
+          nil
+        end
+
+        def runner_riscv_load_disk(data, offset = 0)
+          return false unless riscv_mode?
+          data = data.pack('C*') if data.is_a?(Array)
+          return false if data.nil? || data.bytesize.zero?
+          runner_mem(RUNNER_MEM_OP_LOAD, RUNNER_MEM_SPACE_DISK, offset, data, 0) > 0
+        end
+
+        def runner_riscv_read_disk(offset, length)
+          return [] unless riscv_mode?
+          length = [length.to_i, 0].max
+          return [] if length.zero?
+          runner_mem_read(RUNNER_MEM_SPACE_DISK, offset, length, 0)
+        end
+
+        # ====================================================================
+        # Game Boy Extension Methods
+        # ====================================================================
+
+        def gameboy_mode?
+          return @sim.gameboy_mode? if @fallback && @sim.respond_to?(:gameboy_mode?)
+          return false if @fallback
+          runner_kind == :gameboy
+        end
+
+        def load_rom(data)
+          return @sim.load_rom(data) if @fallback && @sim.respond_to?(:load_rom)
+          return if @fallback
+          runner_load_rom(data, 0)
+        end
+
+        def load_boot_rom(data)
+          return @sim.load_boot_rom(data) if @fallback && @sim.respond_to?(:load_boot_rom)
+          return if @fallback
+          data = data.pack('C*') if data.is_a?(Array)
+          runner_mem(RUNNER_MEM_OP_LOAD, RUNNER_MEM_SPACE_BOOT_ROM, 0, data, 0)
+        end
+
+        def run_gb_cycles(n)
+          return @sim.run_gb_cycles(n) if @fallback && @sim.respond_to?(:run_gb_cycles)
+          return { cycles_run: 0, frames_completed: 0 } if @fallback
+
+          result_buf = Fiddle::Pointer.malloc(20)
+          ok = @fn_runner_run.call(@ctx, n, 0, 0, RUNNER_RUN_MODE_FULL, result_buf)
+          return { cycles_run: 0, frames_completed: 0 } if ok == 0
+          values = result_buf[0, 20].unpack('llLLL')
+          {
+            cycles_run: values[2],
+            frames_completed: values[4]
+          }
+        end
+
+        def read_vram(addr)
+          return @sim.read_vram(addr) if @fallback && @sim.respond_to?(:read_vram)
+          return 0 if @fallback
+          bytes = runner_mem_read(RUNNER_MEM_SPACE_VRAM, addr, 1, 0)
+          bytes.empty? ? 0 : (bytes[0] & 0xFF)
+        end
+
+        def write_vram(addr, data)
+          return @sim.write_vram(addr, data) if @fallback && @sim.respond_to?(:write_vram)
+          return if @fallback
+          runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_VRAM, addr, [data].pack('C'), 0)
+        end
+
+        def read_zpram(addr)
+          return @sim.read_zpram(addr) if @fallback && @sim.respond_to?(:read_zpram)
+          return 0 if @fallback
+          bytes = runner_mem_read(RUNNER_MEM_SPACE_ZPRAM, addr, 1, 0)
+          bytes.empty? ? 0 : (bytes[0] & 0xFF)
+        end
+
+        def write_zpram(addr, data)
+          return @sim.write_zpram(addr, data) if @fallback && @sim.respond_to?(:write_zpram)
+          return if @fallback
+          runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_ZPRAM, addr, [data].pack('C'), 0)
+        end
+
+        def read_wram(addr)
+          return @sim.read_wram(addr) if @fallback && @sim.respond_to?(:read_wram)
+          return 0 if @fallback
+          bytes = runner_mem_read(RUNNER_MEM_SPACE_WRAM, addr, 1, 0)
+          bytes.empty? ? 0 : (bytes[0] & 0xFF)
+        end
+
+        def write_wram(addr, data)
+          return @sim.write_wram(addr, data) if @fallback && @sim.respond_to?(:write_wram)
+          return if @fallback
+          runner_mem(RUNNER_MEM_OP_WRITE, RUNNER_MEM_SPACE_WRAM, addr, [data].pack('C'), 0)
+        end
+
+        def read_framebuffer
+          return @sim.read_framebuffer if @fallback && @sim.respond_to?(:read_framebuffer)
+          return [] if @fallback
+
+          len = runner_probe(RUNNER_PROBE_FRAMEBUFFER_LEN)
+          return [] if len <= 0
+          runner_mem_read(RUNNER_MEM_SPACE_FRAMEBUFFER, 0, len, 0)
+        end
+
+        def frame_count
+          return @sim.frame_count if @fallback && @sim.respond_to?(:frame_count)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_FRAME_COUNT)
+        end
+
+        def reset_lcd_state
+          return @sim.reset_lcd_state if @fallback && @sim.respond_to?(:reset_lcd_state)
+          return if @fallback
+          @fn_runner_control.call(@ctx, RUNNER_CONTROL_RESET_LCD, 0, 0)
+        end
+
+        def get_v_cnt
+          return @sim.get_v_cnt if @fallback && @sim.respond_to?(:get_v_cnt)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_V_CNT)
+        end
+
+        def get_h_cnt
+          return @sim.get_h_cnt if @fallback && @sim.respond_to?(:get_h_cnt)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_H_CNT)
+        end
+
+        def get_vblank_irq
+          return @sim.get_vblank_irq if @fallback && @sim.respond_to?(:get_vblank_irq)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_VBLANK_IRQ)
+        end
+
+        def get_if_r
+          return @sim.get_if_r if @fallback && @sim.respond_to?(:get_if_r)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_IF_R)
+        end
+
+        def get_signal(idx)
+          return @sim.get_signal(idx) if @fallback && @sim.respond_to?(:get_signal)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_SIGNAL, idx)
+        end
+
+        def get_lcdc_on
+          return @sim.get_lcdc_on if @fallback && @sim.respond_to?(:get_lcdc_on)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_LCDC_ON)
+        end
+
+        def get_h_div_cnt
+          return @sim.get_h_div_cnt if @fallback && @sim.respond_to?(:get_h_div_cnt)
+          return 0 if @fallback
+          runner_probe(RUNNER_PROBE_H_DIV_CNT)
+        end
+
+        def core_signal(op, name: nil, idx: 0, value: 0)
+          out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_LONG)
+          out[0, Fiddle::SIZEOF_LONG] = [0].pack(Fiddle::SIZEOF_LONG == 8 ? 'Q' : 'L')
+          rc = @fn_sim_signal.call(@ctx, op, name, idx, value, out)
+          {
+            ok: rc != 0,
+            value: out[0, Fiddle::SIZEOF_LONG].unpack1(Fiddle::SIZEOF_LONG == 8 ? 'Q' : 'L')
+          }
+        end
+
+        def core_exec(op, arg0 = 0, arg1 = 0, error_out = nil)
+          out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_LONG)
+          out[0, Fiddle::SIZEOF_LONG] = [0].pack(Fiddle::SIZEOF_LONG == 8 ? 'Q' : 'L')
+          rc = @fn_sim_exec.call(@ctx, op, arg0, arg1, out, error_out)
+          {
+            ok: rc != 0,
+            value: out[0, Fiddle::SIZEOF_LONG].unpack1(Fiddle::SIZEOF_LONG == 8 ? 'Q' : 'L')
+          }
+        end
+
+        def core_trace(op, str_arg = nil)
+          out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_LONG)
+          out[0, Fiddle::SIZEOF_LONG] = [0].pack(Fiddle::SIZEOF_LONG == 8 ? 'Q' : 'L')
+          rc = @fn_sim_trace.call(@ctx, op, str_arg, out)
+          {
+            ok: rc != 0,
+            value: out[0, Fiddle::SIZEOF_LONG].unpack1(Fiddle::SIZEOF_LONG == 8 ? 'Q' : 'L')
+          }
+        end
+
+        def core_blob(op)
+          len = @fn_sim_blob.call(@ctx, op, nil, 0)
+          return '' if len.nil? || len.to_i <= 0
+          buf = Fiddle::Pointer.malloc(len)
+          actual = @fn_sim_blob.call(@ctx, op, buf, len)
+          return '' if actual.nil? || actual.to_i <= 0
+          buf[0, actual]
+        end
+
+        def runner_mem(op, space, offset, data, flags)
+          @fn_runner_mem.call(@ctx, op, space, offset, data, data.bytesize, flags)
+        end
+
+        def runner_mem_read(space, offset, length, flags)
+          length = [length.to_i, 0].max
+          return [] if length.zero?
+
+          buf = Fiddle::Pointer.malloc(length)
+          read_len = @fn_runner_mem.call(@ctx, RUNNER_MEM_OP_READ, space, offset, buf, length, flags)
+          buf[0, read_len].unpack('C*')
+        end
+
+        # Invoke the native `runner_probe` entry point.
+        #
+        # @param op [Integer] probe operation code (a RUNNER_PROBE_* constant)
+        # @param arg0 [Integer] optional probe argument
+        # @return [Integer] value returned by the native call
+        def runner_probe(op, arg0 = 0)
+          @fn_runner_probe.call(@ctx, op, arg0)
+        end
+
+        def respond_to_missing?(method_name, include_private = false)
+          (@fallback && @sim.respond_to?(method_name)) || super
+        end
+
+        def method_missing(method_name, *args, &block)
+          if @fallback && @sim.respond_to?(method_name)
+            @sim.send(method_name, *args, &block)
+          else
+            super
+          end
+        end
+
+        private
+
+        def normalize_backend(backend)
+          value = backend.to_sym
+          value = :interpreter if value == :interpret
+          value = :compiler if value == :compile
+          return value if BACKEND_CONFIGS.key?(value) || value == :auto
+          raise ArgumentError, "Unknown IR backend: #{backend.inspect}"
+        end
+
+        def backend_candidates(requested)
+          case requested
+          when :interpreter then %i[interpreter]
+          when :jit then %i[jit interpreter]
+          when :compiler then %i[compiler interpreter]
+          when :auto then %i[compiler jit interpreter]
+          else []
+          end
+        end
+
+        def select_backend(requested)
+          backend_candidates(requested).find { |name| BACKEND_CONFIGS[name][:available] }
+        end
+
+        def configure_backend(name)
+          config = BACKEND_CONFIGS[name]
+          @lib_path = config[:lib_path]
+          @backend = config[:native_symbol]
+          @backend_label = config[:label]
+        end
+
+        def unavailable_backend_error_message(requested)
+          case requested
+          when :interpreter
+            "IR interpreter extension not found at: #{IR_INTERPRETER_LIB_PATH}\nRun 'rake native:build' to build it."
+          when :jit
+            "IR JIT extension not found at: #{JIT_LIB_PATH}\nRun 'rake native:build' to build it."
+          when :compiler
+            "IR compiler extension not found at: #{COMPILER_LIB_PATH}\nRun 'rake native:build' to build it."
+          when :auto
+            "No IR backend extension found (searched compiler, jit, interpreter).\nRun 'rake native:build' to build them."
+          else
+            "IR backend not available."
+          end
+        end
+
+        # Open the shared library at @lib_path and bind every native entry
+        # point used by this class as a Fiddle::Function stored in an @fn_*
+        # instance variable. Each binding lists the C argument types followed
+        # by the return type.
+        def load_library
+          @lib = Fiddle.dlopen(@lib_path)
+
+          # Core functions
+          # sim_create is called with (ir json, byte size, sub cycles, error slot)
+          @fn_create = Fiddle::Function.new(
+            @lib['sim_create'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_INT, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_VOIDP
+          )
+
+          @fn_destroy = Fiddle::Function.new(
+            @lib['sim_destroy'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_VOID
+          )
+
+          # Releases error strings handed back through an error slot
+          @fn_free_error = Fiddle::Function.new(
+            @lib['sim_free_error'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_VOID
+          )
+
+          @fn_sim_get_caps = Fiddle::Function.new(
+            @lib['sim_get_caps'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+
+          # Called by #core_signal with (ctx, op, name, idx, value, out)
+          @fn_sim_signal = Fiddle::Function.new(
+            @lib['sim_signal'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_ULONG, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+
+          # Called by #core_exec with (ctx, op, arg0, arg1, out, error_out)
+          @fn_sim_exec = Fiddle::Function.new(
+            @lib['sim_exec'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_ULONG, Fiddle::TYPE_ULONG, Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+
+          # Called by #core_trace with (ctx, op, str_arg, out)
+          @fn_sim_trace = Fiddle::Function.new(
+            @lib['sim_trace'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+
+          # Called by #core_blob with (ctx, op, buffer, buffer size)
+          @fn_sim_blob = Fiddle::Function.new(
+            @lib['sim_blob'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T],
+            Fiddle::TYPE_SIZE_T
+          )
+
+          # Unified runner functions
+          @fn_runner_get_caps = Fiddle::Function.new(
+            @lib['runner_get_caps'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+
+          # Called with (ctx, op, space, offset, data/buffer, length, flags)
+          @fn_runner_mem = Fiddle::Function.new(
+            @lib['runner_mem'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_SIZE_T
+          )
+
+          # Called with (ctx, cycles, key data, key ready, run mode, result buffer)
+          @fn_runner_run = Fiddle::Function.new(
+            @lib['runner_run'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_CHAR, Fiddle::TYPE_INT, Fiddle::TYPE_UINT, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+
+          # Called with (ctx, control op, arg0, arg1)
+          @fn_runner_control = Fiddle::Function.new(
+            @lib['runner_control'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_INT
+          )
+
+          # Called by #runner_probe with (ctx, probe op, arg0)
+          @fn_runner_probe = Fiddle::Function.new(
+            @lib['runner_probe'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_UINT, Fiddle::TYPE_UINT],
+            Fiddle::TYPE_LONG_LONG
+          )
+        end
+
+        def create_simulator
+          error_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
+          error_ptr[0, Fiddle::SIZEOF_VOIDP] = [0].pack('Q')
+
+          @ctx = @fn_create.call(@ir_json, @ir_json.bytesize, @sub_cycles, error_ptr)
+
+          if @ctx.null?
+            error_str_ptr = error_ptr[0, Fiddle::SIZEOF_VOIDP].unpack1('Q')
+            if error_str_ptr != 0
+              error_msg = Fiddle::Pointer.new(error_str_ptr).to_s
+              @fn_free_error.call(error_str_ptr)
+              raise RuntimeError, "Failed to create #{@backend_label} simulator: #{error_msg}"
+            end
+            raise RuntimeError, "Failed to create #{@backend_label} simulator"
+          end
+
+          @sim_runner_speaker_toggles = 0
+          @destructor = @fn_destroy
+        end
+      end
+
+      # Ruby fallback simulator for when native extension is not available
+      class RubyIrSim
+        def initialize(json) # json: String holding the IR document as JSON
+          @ir = JSON.parse(json, symbolize_names: true, max_nesting: false) # keys become Symbols; values such as names stay Strings
+          @signals = {}
+          @widths = {}
+          @inputs = []
+          @outputs = []
+          @memories = {}
+          @memory_meta = {}
+
+          # Initialize ports: each starts at 0; any direction other than 'in' is recorded as an output
+          @ir[:ports]&.each do |port|
+            @signals[port[:name]] = 0
+            @widths[port[:name]] = port[:width]
+            if port[:direction] == 'in'
+              @inputs << port[:name]
+            else
+              @outputs << port[:name]
+            end
+          end
+
+          # Initialize wires (nets) to 0
+          @ir[:nets]&.each do |net|
+            @signals[net[:name]] = 0
+            @widths[net[:name]] = net[:width]
+          end
+
+          # Initialize registers (with reset values if present); the values are kept for #reset
+          @reset_values = {}
+          @ir[:regs]&.each do |reg|
+            reset_val = reg[:reset_value] || 0
+            @signals[reg[:name]] = reset_val
+            @widths[reg[:name]] = reg[:width]
+            @reset_values[reg[:name]] = reset_val
+          end
+
+          # Initialize memories: zero-filled to depth, then overlaid with initial_data (entries past depth are ignored)
+          @ir[:memories]&.each do |mem|
+            depth = mem[:depth].to_i
+            width = mem[:width].to_i
+            initial = Array.new(depth, 0)
+            (mem[:initial_data] || []).each_with_index do |value, idx|
+              break if idx >= depth
+              initial[idx] = value.to_i & mask(width)
+            end
+            @memories[mem[:name]] = initial
+            @memory_meta[mem[:name]] = { depth: depth, width: width, initial: initial.dup } # private copy so #reset can restore it
+          end
+
+          @assigns = @ir[:assigns] || []
+          @processes = @ir[:processes] || []
+          @write_ports = @ir[:write_ports] || []
+          @sync_read_ports = @ir[:sync_read_ports] || []
+        end
+
+        def native? # always false: this class is the pure-Ruby fallback
+          false
+        end
+
+        def mask(width) # all-ones bit mask for +width+ bits; widths of 64 and above share the 64-bit mask
+          width >= 64 ? 0xFFFFFFFFFFFFFFFF : (1 << width) - 1
+        end
+
+        def eval_expr(expr) # recursively evaluate an expression Hash against @signals/@memories; returns an Integer
+          case expr[:type]
+          when 'signal'
+            (@signals[expr[:name]] || 0) & mask(expr[:width]) # unknown signals read as 0
+          when 'literal'
+            expr[:value] & mask(expr[:width])
+          when 'unary_op'
+            val = eval_expr(expr[:operand])
+            m = mask(expr[:width])
+            case expr[:op]
+            when '~', 'not'
+              (~val) & m # bitwise complement truncated to the result width
+            when '&', 'reduce_and'
+              op_width = expr[:operand][:width]
+              (val & mask(op_width)) == mask(op_width) ? 1 : 0 # 1 only when every operand bit is set
+            when '|', 'reduce_or'
+              val != 0 ? 1 : 0
+            when '^', 'reduce_xor'
+              val.to_s(2).count('1') & 1 # parity of the set bits
+            else
+              val # unknown unary operators pass the operand through unchanged
+            end
+          when 'binary_op'
+            l = eval_expr(expr[:left])
+            r = eval_expr(expr[:right])
+            m = mask(expr[:width])
+            case expr[:op]
+            when '&' then l & r # bitwise results are not re-masked to expr[:width]
+            when '|' then l | r
+            when '^' then l ^ r
+            when '+' then (l + r) & m
+            when '-' then (l - r) & m
+            when '*' then (l * r) & m
+            when '/' then r != 0 ? l / r : 0 # division by zero yields 0
+            when '%' then r != 0 ? l % r : 0 # modulo by zero yields 0
+            when '<<' then (l << [r, 63].min) & m # shift amount is clamped to 63
+            when '>>' then l >> [r, 63].min
+            when '==' then l == r ? 1 : 0
+            when '!=' then l != r ? 1 : 0
+            when '<' then l < r ? 1 : 0
+            when '>' then l > r ? 1 : 0
+            when '<=', 'le' then l <= r ? 1 : 0
+            when '>=' then l >= r ? 1 : 0
+            else 0 # unknown binary operators evaluate to 0
+            end
+          when 'mux'
+            cond = eval_expr(expr[:condition])
+            m = mask(expr[:width])
+            if cond != 0 # any non-zero condition selects when_true
+              eval_expr(expr[:when_true]) & m
+            else
+              eval_expr(expr[:when_false]) & m
+            end
+          when 'slice'
+            val = eval_expr(expr[:base])
+            (val >> expr[:low]) & mask(expr[:width]) # only :low and :width are used here; :high is ignored
+          when 'concat'
+            result = 0
+            expr[:parts].each do |part| # the first part ends up in the most significant bits
+              part_width = part[:width]
+              part_val = eval_expr(part) & mask(part_width)
+              result = ((result << part_width) | part_val) & mask(expr[:width])
+            end
+            result & mask(expr[:width])
+          when 'resize'
+            eval_expr(expr[:expr]) & mask(expr[:width]) # truncates or zero-extends; no sign extension
+          when 'mem_read'
+            memory = @memories[expr[:memory]]
+            meta = @memory_meta[expr[:memory]]
+            return 0 unless memory && meta # reads from an unknown memory yield 0
+
+            addr = eval_expr(expr[:addr]) % meta[:depth] # address wraps modulo the memory depth
+            width = expr[:width] || meta[:width]
+            memory[addr] & mask(width)
+          else
+            0 # unknown expression types evaluate to 0
+          end
+        end
+
+        def has_signal?(name) # true when +name+, looked up as a String or as a Symbol, is a key of @signals
+          @signals.key?(name.to_s) || @signals.key?(name.to_sym)
+        end
+
+        def poke(name, value)
+          raise "Unknown input: #{name}" unless @inputs.include?(name)
+          width = @widths[name] || 64
+          @signals[name] = value & mask(width)
+        end
+
+        def peek(name)
+          @signals[name] || 0
+        end
+
+        def evaluate # settle the continuous assigns by re-evaluating all of them until no signal changes
+          10.times do # bounded at 10 passes, so assigns that never settle cannot loop forever
+            changed = false
+            @assigns.each do |assign|
+              new_val = eval_expr(assign[:expr])
+              width = @widths[assign[:target]] || 64
+              masked = new_val & mask(width)
+              if @signals[assign[:target]] != masked
+                @signals[assign[:target]] = masked
+                changed = true
+              end
+            end
+            break unless changed
+          end
+        end
+
+        def tick # one clock step: settle, memory writes, register update, synchronous reads, settle again
+          evaluate
+
+          # Apply memory writes; a port is active only while its clock signal is non-zero and enable bit 0 is set.
+          @write_ports.each do |wp|
+            next unless (@signals[wp[:clock]] || 0) != 0
+            next unless (eval_expr(wp[:enable]) & 1) == 1
+
+            memory = @memories[wp[:memory]]
+            meta = @memory_meta[wp[:memory]]
+            next unless memory && meta
+
+            addr = eval_expr(wp[:addr]) % meta[:depth] # address wraps modulo the memory depth
+            data = eval_expr(wp[:data]) & mask(meta[:width])
+            memory[addr] = data
+          end
+
+          # Sample register inputs into next_regs so every expression sees the pre-update register values
+          next_regs = {}
+          @processes.each do |process|
+            next unless process[:clocked] # process[:clock] is not consulted: every clocked process updates
+            process[:statements]&.each do |stmt|
+              new_val = eval_expr(stmt[:expr])
+              width = @widths[stmt[:target]] || 64
+              next_regs[stmt[:target]] = new_val & mask(width) # a later statement for the same target wins
+            end
+          end
+
+          # Update registers
+          next_regs.each do |name, val|
+            @signals[name] = val
+          end
+
+          # Synchronous memory reads update their destination signals; they run after the writes and register update above.
+          @sync_read_ports.each do |rp|
+            next unless (@signals[rp[:clock]] || 0) != 0
+            if rp[:enable] # enable is optional; a port without one always reads
+              next unless (eval_expr(rp[:enable]) & 1) == 1
+            end
+
+            memory = @memories[rp[:memory]]
+            meta = @memory_meta[rp[:memory]]
+            next unless memory && meta
+
+            addr = eval_expr(rp[:addr]) % meta[:depth]
+            data = memory[addr] & mask(meta[:width])
+            width = @widths[rp[:data]] || meta[:width]
+            @signals[rp[:data]] = data & mask(width)
+          end
+
+          evaluate
+        end
+
+        def reset # return the simulator to its initial state
+          @signals.transform_values! { 0 } # zeroes every signal, inputs included
+          # Apply register reset values
+          @reset_values.each do |name, val|
+            @signals[name] = val
+          end
+          @memory_meta.each do |name, meta|
+            @memories[name] = meta[:initial].dup # fresh copy so later writes never alter the saved image
+          end
+        end
+
+        def signal_count # number of entries in @signals
+          @signals.length
+        end
+
+        def reg_count # counts the statements of every process, not the entries of the IR regs list
+          @processes.sum { |p| p[:statements]&.length || 0 }
+        end
+
+        def input_names # names of the ports whose direction is 'in' (the internal Array itself, not a copy)
+          @inputs
+        end
+
+        def output_names # names of all ports whose direction is not 'in' (the internal Array itself, not a copy)
+          @outputs
+        end
+
+        def stats # summary counters as a Hash with Symbol keys
+          {
+            signal_count: signal_count,
+            reg_count: reg_count,
+            input_count: @inputs.length,
+            output_count: @outputs.length,
+            assign_count: @assigns.length,
+            process_count: @processes.length
+          }
+        end
+      end
+
+      # Convert Behavior IR to JSON format for the simulator
+      module IRToJson
+        module_function
+
+        def convert(ir) # serialize IR module +ir+ to a JSON String
+          {
+            name: ir.name,
+            ports: ir.ports.map { |p| port_to_hash(p) },
+            nets: ir.nets.map { |n| net_to_hash(n) },
+            regs: ir.regs.map { |r| reg_to_hash(r) },
+            assigns: ir.assigns.map { |a| assign_to_hash(a) },
+            processes: ir.processes.map { |p| process_to_hash(p) },
+            memories: (ir.memories || []).map { |m| memory_to_hash(m) }, # memories and their ports may be nil on +ir+
+            write_ports: (ir.write_ports || []).map { |wp| write_port_to_hash(wp) },
+            sync_read_ports: (ir.sync_read_ports || []).map { |rp| sync_read_port_to_hash(rp) }
+          }.to_json(max_nesting: false) # lift the JSON nesting limit for deeply nested expression trees
+        end
+
+        def port_to_hash(port) # name and direction are stringified; width is passed through
+          {
+            name: port.name.to_s,
+            direction: port.direction.to_s,
+            width: port.width
+          }
+        end
+
+        def net_to_hash(net) # Hash with the net's stringified name and its width
+          {
+            name: net.name.to_s,
+            width: net.width
+          }
+        end
+
+        def reg_to_hash(reg) # Hash with the register's name and width, plus reset_value when one is set
+          hash = {
+            name: reg.name.to_s,
+            width: reg.width
+          }
+          hash[:reset_value] = reg.reset_value if reg.reset_value # key is omitted when reset_value is nil or false
+          hash
+        end
+
+        def assign_to_hash(assign) # continuous assignment as { target:, expr: }
+          {
+            target: assign.target.to_s,
+            expr: expr_to_hash(assign.expr)
+          }
+        end
+
+        def process_to_hash(process) # process as a Hash; its statements are flattened into plain assignments
+          {
+            name: process.name.to_s,
+            clock: process.clock&.to_s, # nil when the process has no clock
+            clocked: process.clocked,
+            statements: flatten_statements(process.statements)
+          }
+        end
+
+        def flatten_statements(stmts) # flatten process statements into an Array of { target:, expr: } hashes
+          return [] unless stmts
+          result = []
+          stmts.each do |stmt|
+            case stmt # statements that are neither SeqAssign nor If are dropped
+            when RHDL::Codegen::IR::SeqAssign
+              result << seq_assign_to_hash(stmt)
+            when RHDL::Codegen::IR::If
+              flatten_if(stmt, result) # appends its assignments to result
+            end
+          end
+          result
+        end
+
+        def flatten_if(if_stmt, result)
+          cond = expr_to_hash(if_stmt.condition)
+
+          then_assigns = {}
+          if_stmt.then_statements&.each do |s|
+            case s
+            when RHDL::Codegen::IR::SeqAssign
+              then_assigns[s.target.to_s] = expr_to_hash(s.expr)
+            when RHDL::Codegen::IR::If
+              flatten_if(s, result)
+            end
+          end
+
+          else_assigns = {}
+          if_stmt.else_statements&.each do |s|
+            case s
+            when RHDL::Codegen::IR::SeqAssign
+              else_assigns[s.target.to_s] = expr_to_hash(s.expr)
+            when RHDL::Codegen::IR::If
+              flatten_if(s, result)
+            end
+          end
+
+          all_targets = (then_assigns.keys + else_assigns.keys).uniq
+          all_targets.each do |target|
+            then_expr = then_assigns[target]
+            else_expr = else_assigns[target]
+            width = (then_expr || else_expr)&.dig(:width) || 8
+
+            if then_expr && else_expr
+              result << {
+                target: target,
+                expr: { type: 'mux', condition: cond, when_true: then_expr, when_false: else_expr, width: width }
+              }
+            elsif then_expr
+              result << {
+                target: target,
+                expr: { type: 'mux', condition: cond, when_true: then_expr, when_false: { type: 'signal', name: target, width: width }, width: width }
+              }
+            elsif else_expr
+              inv_cond = { type: 'unary_op', op: '~', operand: cond, width: 1 }
+              result << {
+                target: target,
+                expr: { type: 'mux', condition: inv_cond, when_true: else_expr, when_false: { type: 'signal', name: target, width: width }, width: width }
+              }
+            end
+          end
+        end
+
+        def seq_assign_to_hash(stmt) # sequential assignment as { target:, expr: }
+          {
+            target: stmt.target.to_s,
+            expr: expr_to_hash(stmt.expr)
+          }
+        end
+
+        def memory_to_hash(mem) # Hash with the memory's name, depth and width, plus initial_data when present
+          hash = {
+            name: mem.name.to_s,
+            depth: mem.depth,
+            width: mem.width
+          }
+          hash[:initial_data] = mem.initial_data if mem.initial_data
+          hash
+        end
+
+        def write_port_to_hash(wp) # memory write port: names are stringified; addr, data and enable are expressions
+          {
+            memory: wp.memory.to_s,
+            clock: wp.clock.to_s,
+            addr: expr_to_hash(wp.addr),
+            data: expr_to_hash(wp.data),
+            enable: expr_to_hash(wp.enable)
+          }
+        end
+
+        def sync_read_port_to_hash(rp) # synchronous read port; data is the destination signal's name
+          hash = {
+            memory: rp.memory.to_s,
+            clock: rp.clock.to_s,
+            addr: expr_to_hash(rp.addr),
+            data: rp.data.to_s
+          }
+          hash[:enable] = expr_to_hash(rp.enable) if rp.enable # enable is optional and omitted when absent
+          hash
+        end
+
+        def expr_to_hash(expr) # convert an IR expression node (recursively) into its JSON-ready Hash
+          case expr
+          when RHDL::Codegen::IR::Signal
+            { type: 'signal', name: expr.name.to_s, width: expr.width }
+          when RHDL::Codegen::IR::Literal
+            { type: 'literal', value: expr.value, width: expr.width }
+          when RHDL::Codegen::IR::UnaryOp
+            { type: 'unary_op', op: expr.op.to_s, operand: expr_to_hash(expr.operand), width: expr.width }
+          when RHDL::Codegen::IR::BinaryOp
+            { type: 'binary_op', op: expr.op.to_s, left: expr_to_hash(expr.left), right: expr_to_hash(expr.right), width: expr.width }
+          when RHDL::Codegen::IR::Mux
+            { type: 'mux', condition: expr_to_hash(expr.condition), when_true: expr_to_hash(expr.when_true), when_false: expr_to_hash(expr.when_false), width: expr.width }
+          when RHDL::Codegen::IR::Slice
+            low = 0 # defaults, kept when expr.range is neither an Integer-bounded Range nor an Integer
+            high = expr.width - 1
+
+            if expr.range.is_a?(Range)
+              range_begin = expr.range.begin
+              range_end = expr.range.end
+              if range_begin.is_a?(Integer) && range_end.is_a?(Integer)
+                low = [range_begin, range_end].min # accept ranges written in either direction
+                high = [range_begin, range_end].max
+              end
+            elsif expr.range.is_a?(Integer) # single-bit select
+              low = expr.range
+              high = expr.range
+            end
+            { type: 'slice', base: expr_to_hash(expr.base), low: low, high: high, width: expr.width }
+          when RHDL::Codegen::IR::Concat
+            { type: 'concat', parts: expr.parts.map { |p| expr_to_hash(p) }, width: expr.width }
+          when RHDL::Codegen::IR::Resize
+            { type: 'resize', expr: expr_to_hash(expr.expr), width: expr.width }
+          when RHDL::Codegen::IR::Case # lowered to a chain of '==' muxes over the selector
+            if expr.cases.empty?
+              expr_to_hash(expr.default)
+            else
+              result = expr.default ? expr_to_hash(expr.default) : { type: 'literal', value: 0, width: expr.width } # innermost fallback
+              expr.cases.each do |values, case_expr| # each later case wraps the earlier ones, so it is tested first
+                values.each do |v|
+                  cond = { type: 'binary_op', op: '==', left: expr_to_hash(expr.selector), right: { type: 'literal', value: v, width: expr.selector.width }, width: 1 }
+                  result = { type: 'mux', condition: cond, when_true: expr_to_hash(case_expr), when_false: result, width: expr.width }
+                end
+              end
+              result
+            end
+          when RHDL::Codegen::IR::MemoryRead
+            { type: 'mem_read', memory: expr.memory.to_s, addr: expr_to_hash(expr.addr), width: expr.width }
+          else # unrecognized node types (including nil) become a 1-bit literal 0
+            { type: 'literal', value: 0, width: 1 }
+          end
+        end
+      end
+    end
+    end
+  end
+end
diff --git a/lib/rhdl/sim/native/netlist/netlist_compiler/Cargo.lock b/lib/rhdl/sim/native/netlist/netlist_compiler/Cargo.lock
new file mode 100644
index 00000000..7500dc06
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_compiler/Cargo.lock
@@ -0,0 +1,130 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "itoa"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
+
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+
+[[package]]
+name = "memchr"
+version = "2.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
+
+[[package]]
+name = "netlist_compiler"
+version = "0.1.0"
+dependencies = [
+ "libloading",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.114"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
+
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "zmij"
+version = "1.0.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65"
diff --git a/lib/rhdl/sim/native/netlist/netlist_compiler/Cargo.toml b/lib/rhdl/sim/native/netlist/netlist_compiler/Cargo.toml
new file mode 100644
index 00000000..a175ca3c
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_compiler/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "netlist_compiler"
+version = "0.1.0"
+edition = "2021"
+authors = ["RHDL Team"]
+description = "Rustc-based compiler for gate-level netlist simulation - generates specialized Rust code"
+license = "MIT"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+libloading = "0.8"
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
diff --git a/lib/rhdl/sim/native/netlist/netlist_interpreter/Cargo.lock b/lib/rhdl/sim/native/netlist/netlist_interpreter/Cargo.lock
new file mode 100644
index 00000000..8891fd50
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_interpreter/Cargo.lock
@@ -0,0 +1,107 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "itoa"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
+
+[[package]]
+name = "memchr"
+version = "2.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
+
+[[package]]
+name = "netlist_interpreter"
+version = "0.1.0"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.114"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
+
+[[package]]
+name = "zmij"
+version = "1.0.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65"
diff --git a/lib/rhdl/sim/native/netlist/netlist_interpreter/Cargo.toml b/lib/rhdl/sim/native/netlist/netlist_interpreter/Cargo.toml
new file mode 100644
index 00000000..ede0241c
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_interpreter/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "netlist_interpreter"
+version = "0.1.0"
+edition = "2021"
+authors = ["RHDL Team"]
+description = "High-performance gate-level netlist interpreter with Ruby bindings"
+license = "MIT"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
diff --git a/lib/rhdl/sim/native/netlist/netlist_jit/Cargo.lock b/lib/rhdl/sim/native/netlist/netlist_jit/Cargo.lock
new file mode 100644
index 00000000..70fa01a3
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_jit/Cargo.lock
@@ -0,0 +1,518 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
+[[package]]
+name = "anyhow"
+version = "1.0.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
+
+[[package]]
+name = "arbitrary"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bumpalo"
+version = "3.19.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510"
+dependencies = [
+ "allocator-api2",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cranelift"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a71de5e59f616d79d14d2c71aa2799ce898241d7f10f7e64a4997014b4000a28"
+dependencies = [
+ "cranelift-codegen",
+ "cranelift-frontend",
+ "cranelift-module",
+]
+
+[[package]]
+name = "cranelift-bforest"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e15d04a0ce86cb36ead88ad68cf693ffd6cda47052b9e0ac114bc47fd9cd23c4"
+dependencies = [
+ "cranelift-entity",
+]
+
+[[package]]
+name = "cranelift-bitset"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c6e3969a7ce267259ce244b7867c5d3bc9e65b0a87e81039588dfdeaede9f34"
+
+[[package]]
+name = "cranelift-codegen"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c22032c4cb42558371cf516bb47f26cdad1819d3475c133e93c49f50ebf304e"
+dependencies = [
+ "bumpalo",
+ "cranelift-bforest",
+ "cranelift-bitset",
+ "cranelift-codegen-meta",
+ "cranelift-codegen-shared",
+ "cranelift-control",
+ "cranelift-entity",
+ "cranelift-isle",
+ "gimli",
+ "hashbrown 0.14.5",
+ "log",
+ "regalloc2",
+ "rustc-hash",
+ "serde",
+ "smallvec",
+ "target-lexicon 0.13.4",
+]
+
+[[package]]
+name = "cranelift-codegen-meta"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c904bc71c61b27fc57827f4a1379f29de64fe95653b620a3db77d59655eee0b8"
+dependencies = [
+ "cranelift-codegen-shared",
+]
+
+[[package]]
+name = "cranelift-codegen-shared"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40180f5497572f644ce88c255480981ae2ec1d7bb4d8e0c0136a13b87a2f2ceb"
+
+[[package]]
+name = "cranelift-control"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26d132c6d0bd8a489563472afc171759da0707804a65ece7ceb15a8c6d7dd5ef"
+dependencies = [
+ "arbitrary",
+]
+
+[[package]]
+name = "cranelift-entity"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b2d0d9618275474fbf679dd018ac6e009acbd6ae6850f6a67be33fb3b00b323"
+dependencies = [
+ "cranelift-bitset",
+]
+
+[[package]]
+name = "cranelift-frontend"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fac41e16729107393174b0c9e3730fb072866100e1e64e80a1a963b2e484d57"
+dependencies = [
+ "cranelift-codegen",
+ "log",
+ "smallvec",
+ "target-lexicon 0.13.4",
+]
+
+[[package]]
+name = "cranelift-isle"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ca20d576e5070044d0a72a9effc2deacf4d6aa650403189d8ea50126483944d"
+
+[[package]]
+name = "cranelift-jit"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e65c42755a719b09662b00c700daaf76cc35d5ace1f5c002ad404b591ff1978"
+dependencies = [
+ "anyhow",
+ "cranelift-codegen",
+ "cranelift-control",
+ "cranelift-entity",
+ "cranelift-module",
+ "cranelift-native",
+ "libc",
+ "log",
+ "region",
+ "target-lexicon 0.13.4",
+ "wasmtime-jit-icache-coherence",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "cranelift-module"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d55612bebcf16ff7306c8a6f5bdb6d45662b8aa1ee058ecce8807ad87db719b"
+dependencies = [
+ "anyhow",
+ "cranelift-codegen",
+ "cranelift-control",
+]
+
+[[package]]
+name = "cranelift-native"
+version = "0.116.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8dee82f3f1f2c4cba9177f1cc5e350fe98764379bcd29340caa7b01f85076c7"
+dependencies = [
+ "cranelift-codegen",
+ "libc",
+ "target-lexicon 0.13.4",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "fallible-iterator"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
+
+[[package]]
+name = "gimli"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
+dependencies = [
+ "fallible-iterator",
+ "indexmap",
+ "stable_deref_trait",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+
+[[package]]
+name = "indexmap"
+version = "2.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.16.1",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
+
+[[package]]
+name = "libc"
+version = "0.2.180"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
+
+[[package]]
+name = "log"
+version = "0.4.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+
+[[package]]
+name = "mach2"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d640282b302c0bb0a2a8e0233ead9035e3bed871f0b7e81fe4a1ec829765db44"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "memchr"
+version = "2.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
+
+[[package]]
+name = "netlist_jit"
+version = "0.1.0"
+dependencies = [
+ "cranelift",
+ "cranelift-codegen",
+ "cranelift-frontend",
+ "cranelift-jit",
+ "cranelift-module",
+ "cranelift-native",
+ "serde",
+ "serde_json",
+ "target-lexicon 0.12.16",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "regalloc2"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc06e6b318142614e4a48bc725abbf08ff166694835c43c9dae5a9009704639a"
+dependencies = [
+ "allocator-api2",
+ "bumpalo",
+ "hashbrown 0.15.5",
+ "log",
+ "rustc-hash",
+ "smallvec",
+]
+
+[[package]]
+name = "region"
+version = "3.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6b6ebd13bc009aef9cd476c1310d49ac354d36e240cf1bd753290f3dc7199a7"
+dependencies = [
+ "bitflags",
+ "libc",
+ "mach2",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "syn"
+version = "2.0.114"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "target-lexicon"
+version = "0.12.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
+
+[[package]]
+name = "target-lexicon"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
+
+[[package]]
+name = "wasmtime-jit-icache-coherence"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec5e8552e01692e6c2e5293171704fed8abdec79d1a6995a0870ab190e5747d1"
+dependencies = [
+ "anyhow",
+ "cfg-if",
+ "libc",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "zmij"
+version = "1.0.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65"
diff --git a/lib/rhdl/sim/native/netlist/netlist_jit/Cargo.toml b/lib/rhdl/sim/native/netlist/netlist_jit/Cargo.toml
new file mode 100644
index 00000000..45015b14
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_jit/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "netlist_jit"
+version = "0.1.0"
+edition = "2021"
+authors = ["RHDL Team"]
+description = "Cranelift-based JIT compiler for gate-level netlist simulation"
+license = "MIT"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+cranelift = "0.116"
+cranelift-jit = "0.116"
+cranelift-module = "0.116"
+cranelift-codegen = "0.116"
+cranelift-frontend = "0.116"
+cranelift-native = "0.116"
+target-lexicon = "0.12"
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
diff --git a/lib/rhdl/sim/native/netlist/netlist_simulator 2.rb b/lib/rhdl/sim/native/netlist/netlist_simulator 2.rb
new file mode 100644
index 00000000..fc11805c
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_simulator 2.rb	
@@ -0,0 +1,451 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'rbconfig'
+require 'rhdl/codegen/netlist/primitives'
+
+module RHDL
+  module Sim
+    module Native
+      module Netlist
+      class << self
+        def native_lib_name(base)
+          case RbConfig::CONFIG['host_os']
+          when /darwin/ then "#{base}.bundle"
+          when /mswin|mingw/ then "#{base}.dll"
+          else "#{base}.so"
+          end
+        end
+
+        def try_load_native_extension(ext_dir:, require_name:) # best-effort require; returns true or false
+          lib_path = File.join(ext_dir, native_lib_name(require_name))
+          return false unless File.exist?(lib_path) # no built library in ext_dir: report unavailable
+
+          $LOAD_PATH.unshift(ext_dir) unless $LOAD_PATH.include?(ext_dir) # let `require` resolve the extension
+          require require_name
+          true
+        rescue LoadError => e
+          warn "#{require_name} extension not available: #{e.message}" if ENV['RHDL_DEBUG'] # only reported when RHDL_DEBUG is set
+          false
+        end
+      end
+
+      unless const_defined?(:NETLIST_INTERPRETER_AVAILABLE)
+        NETLIST_INTERPRETER_EXT_DIR = File.expand_path('netlist_interpreter/lib', __dir__)
+        NETLIST_INTERPRETER_LIB_NAME = native_lib_name('netlist_interpreter')
+        NETLIST_INTERPRETER_LIB_PATH = File.join(NETLIST_INTERPRETER_EXT_DIR, NETLIST_INTERPRETER_LIB_NAME)
+        _interpreter_loaded = try_load_native_extension(
+          ext_dir: NETLIST_INTERPRETER_EXT_DIR,
+          require_name: 'netlist_interpreter'
+        )
+        NETLIST_INTERPRETER_AVAILABLE = _interpreter_loaded unless const_defined?(:NETLIST_INTERPRETER_AVAILABLE)
+      end
+
+      unless const_defined?(:NETLIST_JIT_AVAILABLE)
+        NETLIST_JIT_EXT_DIR = File.expand_path('netlist_jit/lib', __dir__)
+        NETLIST_JIT_LIB_NAME = native_lib_name('netlist_jit')
+        NETLIST_JIT_LIB_PATH = File.join(NETLIST_JIT_EXT_DIR, NETLIST_JIT_LIB_NAME)
+        _jit_loaded = try_load_native_extension(
+          ext_dir: NETLIST_JIT_EXT_DIR,
+          require_name: 'netlist_jit'
+        )
+        NETLIST_JIT_AVAILABLE = _jit_loaded unless const_defined?(:NETLIST_JIT_AVAILABLE)
+      end
+
+      unless const_defined?(:NETLIST_COMPILER_AVAILABLE)
+        NETLIST_COMPILER_EXT_DIR = File.expand_path('netlist_compiler/lib', __dir__)
+        NETLIST_COMPILER_LIB_NAME = native_lib_name('netlist_compiler')
+        NETLIST_COMPILER_LIB_PATH = File.join(NETLIST_COMPILER_EXT_DIR, NETLIST_COMPILER_LIB_NAME)
+        _compiler_loaded = try_load_native_extension(
+          ext_dir: NETLIST_COMPILER_EXT_DIR,
+          require_name: 'netlist_compiler'
+        )
+        NETLIST_COMPILER_AVAILABLE = _compiler_loaded unless const_defined?(:NETLIST_COMPILER_AVAILABLE)
+      end
+
+      # Pure Ruby fallback implementation.
+      class RubyNetlistSimulator
+        attr_reader :ir, :lanes
+
+        def initialize(ir, lanes: 64)
+          @ir = ir.is_a?(String) ? JSON.parse(ir, symbolize_names: true) : ir
+          @lanes = lanes
+          @lane_mask = (1 << lanes) - 1
+          @nets = Array.new(ir_get(:net_count), 0)
+          parse_ir
+        end
+
+        def parse_ir
+          @gates = ir_get(:gates)
+          @dffs = ir_get(:dffs)
+          @sr_latches = ir_get(:sr_latches) || []
+          @inputs = ir_get(:inputs)
+          @outputs = ir_get(:outputs)
+          @schedule = ir_get(:schedule)
+        end
+
+        private
+
+        def ir_get(key)
+          if @ir.respond_to?(key)
+            @ir.send(key)
+          elsif @ir.respond_to?(:[])
+            @ir[key] || @ir[key.to_s]
+          end
+        end
+
+        public
+
+        def poke(name, value)
+          nets = @inputs[name.to_s] || @inputs[name.to_sym]
+          raise "Unknown input: #{name}" unless nets
+
+          val = value.is_a?(Array) ? value.first : value
+          val = val.to_i & @lane_mask
+
+          if nets.length == 1
+            @nets[nets.first] = val
+          else
+            nets.each_with_index { |net, i| @nets[net] = ((val >> i) & 1) == 1 ? @lane_mask : 0 }
+          end
+        end
+
+        def peek(name)
+          nets = @outputs[name.to_s] || @outputs[name.to_sym]
+          raise "Unknown output: #{name}" unless nets
+
+          nets.length == 1 ? @nets[nets.first] : nets.map { |net| @nets[net] }
+        end
+
+        def evaluate
+          @schedule.each do |gate_idx|
+            gate = @gates[gate_idx]
+            eval_gate(gate)
+          end
+
+          # Iterate latches to a fixed point.
+          10.times do
+            changed = false
+            @sr_latches.each do |latch|
+              s = @nets[latch[:s]]
+              r = @nets[latch[:r]]
+              en = @nets[latch[:en]]
+              q_old = @nets[latch[:q]]
+              q_next = ((~en) & q_old) | (en & (~r) & (s | q_old)) & @lane_mask
+              next if q_next == q_old
+
+              @nets[latch[:q]] = q_next
+              @nets[latch[:qn]] = (~q_next) & @lane_mask
+              changed = true
+            end
+            break unless changed
+          end
+        end
+
+        def tick # one clock step: settle logic, update all flip-flops together, settle again
+          evaluate
+          next_q = @dffs.map do |dff| # compute every next state first so no DFF sees another's new value
+            q = @nets[dff[:q]]
+            d = @nets[dff[:d]]
+            q_next = d
+            if dff[:en] # optional enable net: lanes with en=0 keep the old q
+              en = @nets[dff[:en]]
+              q_next = (q & ~en) | (d & en)
+            end
+            if dff[:rst] # optional reset net: applied after enable, so it overrides it
+              rst = @nets[dff[:rst]]
+              reset_val = dff[:reset_value] || 0
+              q_next = (q_next & ~rst) | (rst & (reset_val.zero? ? 0 : @lane_mask))
+            end
+            q_next
+          end
+          @dffs.each_with_index { |dff, idx| @nets[dff[:q]] = next_q[idx] } # commit phase
+          evaluate
+        end
+
+        def reset
+          @nets.fill(0)
+          @dffs.each do |dff|
+            reset_val = dff[:reset_value] || 0
+            @nets[dff[:q]] = reset_val.zero? ? 0 : @lane_mask
+          end
+        end
+
+        def run_ticks(n)
+          n.times { tick }
+        end
+
+        def net_count
+          @nets.length
+        end
+
+        def gate_count
+          @gates.length
+        end
+
+        def dff_count
+          @dffs.length
+        end
+
+        def input_names
+          @inputs.keys
+        end
+
+        def output_names
+          @outputs.keys
+        end
+
+        def stats
+          {
+            net_count: @nets.length,
+            gate_count: @gates.length,
+            dff_count: @dffs.length,
+            lanes: @lanes,
+            input_count: @inputs.length,
+            output_count: @outputs.length,
+            backend: 'ruby'
+          }
+        end
+
+        def native?
+          false
+        end
+
+        private
+
+        def eval_gate(gate)
+          type = gate[:type]&.to_sym || gate.type
+          inputs = gate[:inputs] || gate.inputs
+          output = gate[:output] || gate.output
+
+          case type
+          when :and then @nets[output] = @nets[inputs[0]] & @nets[inputs[1]]
+          when :or then @nets[output] = @nets[inputs[0]] | @nets[inputs[1]]
+          when :xor then @nets[output] = @nets[inputs[0]] ^ @nets[inputs[1]]
+          when :not then @nets[output] = (~@nets[inputs[0]]) & @lane_mask
+          when :mux
+            sel = @nets[inputs[2]]
+            @nets[output] = (@nets[inputs[0]] & ~sel) | (@nets[inputs[1]] & sel)
+          when :buf then @nets[output] = @nets[inputs[0]]
+          when :const
+            val = gate[:value] || gate.value
+            @nets[output] = val.to_i.zero? ? 0 : @lane_mask
+          end
+        end
+      end
+
+      # Unified wrapper for interpreter, JIT, compiler, and Ruby fallback.
+      class NetlistSimulator
+        attr_reader :ir, :lanes
+
+        BACKEND_CONFIGS = {
+          interpreter: {
+            available: NETLIST_INTERPRETER_AVAILABLE,
+            class_name: 'NetlistInterpreter',
+            type: :interpret,
+            lib_path: NETLIST_INTERPRETER_LIB_PATH
+          },
+          jit: {
+            available: NETLIST_JIT_AVAILABLE,
+            class_name: 'NetlistJit',
+            type: :jit,
+            lib_path: NETLIST_JIT_LIB_PATH
+          },
+          compiler: {
+            available: NETLIST_COMPILER_AVAILABLE,
+            class_name: 'NetlistCompiler',
+            type: :compile,
+            lib_path: NETLIST_COMPILER_LIB_PATH
+          }
+        }.freeze
+
+        def initialize(ir, backend: :interpreter, lanes: 64, simd: :auto, allow_fallback: true) # picks a native backend or the Ruby fallback
+          @ir = ir
+          @lanes = lanes
+          @simd = simd
+          @requested_backend = normalize_backend(backend) # raises ArgumentError for unknown names
+          @fallback = false
+          @native_error = nil
+
+          native_loaded = false
+          backend_candidates(@requested_backend, allow_fallback: allow_fallback).each do |candidate| # tried in order, first success wins
+            next unless BACKEND_CONFIGS[candidate][:available]
+
+            begin
+              create_native_sim(candidate)
+              native_loaded = true
+              break
+            rescue StandardError => e
+              @native_error = e # remembered for the error message; the next candidate is still tried
+            end
+          end
+
+          return if native_loaded
+
+          if allow_fallback
+            @sim = RubyNetlistSimulator.new(ir, lanes: lanes) # pure-Ruby simulator when no native backend worked
+            @backend = :ruby
+            @fallback = true
+          else
+            raise LoadError, unavailable_backend_error_message(@requested_backend, allow_fallback: false)
+          end
+        end
+
+        def simulator_type
+          :"netlist_#{@backend}"
+        end
+
+        def backend
+          @backend
+        end
+
+        def native?
+          !@fallback && @sim.respond_to?(:native?) && @sim.native?
+        end
+
+        def poke(name, value)
+          @sim.poke(name.to_s, value)
+        end
+
+        def peek(name)
+          @sim.peek(name.to_s)
+        end
+
+        def evaluate
+          @sim.evaluate
+        end
+
+        def tick
+          @sim.tick
+        end
+
+        def run_ticks(n)
+          if @sim.respond_to?(:run_ticks)
+            @sim.run_ticks(n)
+          else
+            n.times { @sim.tick }
+          end
+        end
+
+        def reset
+          @sim.reset
+        end
+
+        def compile
+          return true unless @sim.respond_to?(:compile)
+
+          @sim.compile
+        end
+
+        def compiled?
+          return false unless @sim.respond_to?(:compiled?)
+
+          @sim.compiled?
+        end
+
+        def generated_code
+          return nil unless @sim.respond_to?(:generated_code)
+
+          @sim.generated_code
+        end
+
+        def simd_mode
+          return nil unless @sim.respond_to?(:simd_mode)
+
+          @sim.simd_mode
+        end
+
+        def net_count
+          return @sim.net_count if @sim.respond_to?(:net_count)
+
+          @sim.stats[:net_count]
+        end
+
+        def gate_count
+          return @sim.gate_count if @sim.respond_to?(:gate_count)
+
+          @sim.stats[:gate_count]
+        end
+
+        def dff_count
+          return @sim.dff_count if @sim.respond_to?(:dff_count)
+
+          @sim.stats[:dff_count]
+        end
+
+        def input_names
+          return @sim.input_names if @sim.respond_to?(:input_names)
+
+          []
+        end
+
+        def output_names
+          return @sim.output_names if @sim.respond_to?(:output_names)
+
+          []
+        end
+
+        def stats
+          @sim.stats
+        end
+
+        private
+
+        def normalize_backend(backend)
+          case backend.to_sym
+          when :interpreter, :interpret then :interpreter
+          when :jit then :jit
+          when :compiler, :compile then :compiler
+          when :auto then :auto
+          else
+            raise ArgumentError, "Unknown backend: #{backend}. Valid: :interpreter, :jit, :compiler, :auto"
+          end
+        end
+
+        def backend_candidates(backend, allow_fallback:)
+          case backend
+          when :auto then [:compiler, :jit, :interpreter]
+          when :compiler then allow_fallback ? [:compiler, :jit, :interpreter] : [:compiler]
+          when :jit then allow_fallback ? [:jit, :interpreter] : [:jit]
+          when :interpreter then [:interpreter]
+          else
+            [backend]
+          end
+        end
+
+        def create_native_sim(backend)
+          config = BACKEND_CONFIGS.fetch(backend)
+          json = @ir.is_a?(String) ? @ir : @ir.to_json
+          klass = RHDL::Sim::Native::Netlist.const_get(config[:class_name])
+
+          @sim = case backend
+                 when :compiler
+                   compiler = klass.new(json, @simd.to_s)
+                   compiler.compile if compiler.respond_to?(:compile)
+                   compiler
+                 else
+                   klass.new(json, @lanes)
+                 end
+
+          @backend = config[:type]
+        end
+
+        def unavailable_backend_error_message(backend, allow_fallback:)
+          candidates = backend_candidates(backend, allow_fallback: allow_fallback)
+          missing = candidates.reject { |candidate| BACKEND_CONFIGS[candidate][:available] }
+          hint_paths = missing.map { |candidate| BACKEND_CONFIGS[candidate][:lib_path] }
+
+          message = +"Netlist #{backend} backend is not available."
+          unless hint_paths.empty?
+            message << "\nMissing native library: #{hint_paths.join(', ')}"
+          end
+          message << "\nRun 'rake native:build' to build native extensions."
+          message << "\nLast native error: #{@native_error.message}" if @native_error
+          message
+        end
+      end
+
+    end
+    end
+  end
+end
diff --git a/lib/rhdl/sim/native/netlist/netlist_simulator.rb b/lib/rhdl/sim/native/netlist/netlist_simulator.rb
new file mode 100644
index 00000000..1a098bdb
--- /dev/null
+++ b/lib/rhdl/sim/native/netlist/netlist_simulator.rb
@@ -0,0 +1,740 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'fiddle'
+require 'fiddle/import'
+require 'rbconfig'
+require 'rhdl/codegen/netlist/primitives'
+
+module RHDL
+  module Sim
+    module Native
+      module Netlist
+      class << self
+        def native_lib_name(base)
+          case RbConfig::CONFIG['host_os']
+          when /darwin/ then "#{base}.dylib"
+          when /mswin|mingw/ then "#{base}.dll"
+          else "#{base}.so"
+          end
+        end
+
+        def sim_backend_available?(lib_path)
+          return false unless File.exist?(lib_path)
+
+          lib = Fiddle.dlopen(lib_path)
+          lib['sim_create']
+          lib['sim_destroy']
+          lib['sim_poke_bus']
+          lib['sim_peek_bus']
+          lib['sim_exec']
+          lib['sim_query']
+          lib['sim_blob']
+          true
+        rescue Fiddle::DLError
+          false
+        end
+      end
+
+      unless const_defined?(:NETLIST_INTERPRETER_AVAILABLE)
+        NETLIST_INTERPRETER_EXT_DIR = File.expand_path('netlist_interpreter/lib', __dir__)
+        NETLIST_INTERPRETER_LIB_NAME = native_lib_name('netlist_interpreter')
+        NETLIST_INTERPRETER_LIB_PATH = File.join(NETLIST_INTERPRETER_EXT_DIR, NETLIST_INTERPRETER_LIB_NAME)
+        NETLIST_INTERPRETER_AVAILABLE = sim_backend_available?(NETLIST_INTERPRETER_LIB_PATH)
+      end
+
+      unless const_defined?(:NETLIST_JIT_AVAILABLE)
+        NETLIST_JIT_EXT_DIR = File.expand_path('netlist_jit/lib', __dir__)
+        NETLIST_JIT_LIB_NAME = native_lib_name('netlist_jit')
+        NETLIST_JIT_LIB_PATH = File.join(NETLIST_JIT_EXT_DIR, NETLIST_JIT_LIB_NAME)
+        NETLIST_JIT_AVAILABLE = sim_backend_available?(NETLIST_JIT_LIB_PATH)
+      end
+
+      unless const_defined?(:NETLIST_COMPILER_AVAILABLE)
+        NETLIST_COMPILER_EXT_DIR = File.expand_path('netlist_compiler/lib', __dir__)
+        NETLIST_COMPILER_LIB_NAME = native_lib_name('netlist_compiler')
+        NETLIST_COMPILER_LIB_PATH = File.join(NETLIST_COMPILER_EXT_DIR, NETLIST_COMPILER_LIB_NAME)
+        NETLIST_COMPILER_AVAILABLE = sim_backend_available?(NETLIST_COMPILER_LIB_PATH)
+      end
+
+      # Common Fiddle wrapper shared by netlist native backends.
+      class NetlistNativeBackend
+        SIM_EXEC_EVALUATE = 0
+        SIM_EXEC_TICK = 1
+        SIM_EXEC_RUN_TICKS = 2
+        SIM_EXEC_RESET = 3
+        SIM_EXEC_COMPILE = 4
+        SIM_EXEC_IS_COMPILED = 5
+
+        SIM_QUERY_NET_COUNT = 0
+        SIM_QUERY_GATE_COUNT = 1
+        SIM_QUERY_DFF_COUNT = 2
+        SIM_QUERY_LANES = 3
+
+        SIM_BLOB_INPUT_NAMES = 0
+        SIM_BLOB_OUTPUT_NAMES = 1
+        SIM_BLOB_GENERATED_CODE = 2
+        SIM_BLOB_SIMD_MODE = 3
+
+        U64_PACK = 'Q'
+        SIZE_T_PACK = Fiddle::SIZEOF_VOIDP == 8 ? 'Q' : 'L'
+
+        def initialize(lib_path, json, config)
+          @lib = Fiddle.dlopen(lib_path)
+          bind_functions
+
+          error_ptr = alloc_error_ptr
+          @ctx = @fn_create.call(json.to_s, config&.to_s, error_ptr)
+          if @ctx.to_i.zero?
+            raise LoadError, error_from_ptr(error_ptr)
+          end
+        end
+
+        def close
+          return if @ctx.nil? || @ctx.to_i.zero?
+
+          @fn_destroy.call(@ctx)
+          @ctx = 0
+        rescue StandardError
+          @ctx = 0
+        end
+
+        def native?
+          true
+        end
+
+        def poke(name, value) # drives input `name`; raises RuntimeError (via exec_with_error) on a zero status
+          if value.is_a?(Array)
+            values = value.map { |v| v.to_i & 0xFFFFFFFFFFFFFFFF } # truncate each entry to 64 bits
+            buf = Fiddle::Pointer[values.pack("#{U64_PACK}*")] # packed native-endian u64 words
+            exec_with_error do |error_ptr|
+              @fn_poke_bus.call(@ctx, name.to_s, buf, values.length, error_ptr)
+            end
+          else
+            raw = value.to_i & 0xFFFFFFFFFFFFFFFF
+            signed = raw >= 0x8000000000000000 ? raw - 0x1_0000_0000_0000_0000 : raw # same bits as signed 64-bit; sim_poke_scalar is bound with TYPE_LONG_LONG
+            exec_with_error do |error_ptr|
+              @fn_poke_scalar.call(@ctx, name.to_s, signed, error_ptr)
+            end
+          end
+          true
+        end
+
+        def peek(name)
+          values = peek_bus(name)
+          values.length <= 1 ? (values[0] || 0) : values
+        end
+
+        def evaluate
+          exec_with_error do |error_ptr|
+            @fn_exec.call(@ctx, SIM_EXEC_EVALUATE, 0, error_ptr)
+          end
+          true
+        end
+
+        def tick
+          exec_with_error do |error_ptr|
+            @fn_exec.call(@ctx, SIM_EXEC_TICK, 0, error_ptr)
+          end
+          true
+        end
+
+        def run_ticks(n)
+          exec_with_error do |error_ptr|
+            @fn_exec.call(@ctx, SIM_EXEC_RUN_TICKS, n.to_i, error_ptr)
+          end
+          true
+        end
+
+        def reset
+          exec_with_error do |error_ptr|
+            @fn_exec.call(@ctx, SIM_EXEC_RESET, 0, error_ptr)
+          end
+          true
+        end
+
+        def compile
+          exec_with_error do |error_ptr|
+            @fn_exec.call(@ctx, SIM_EXEC_COMPILE, 0, error_ptr)
+          end
+          true
+        end
+
+        def compiled?
+          @fn_exec.call(@ctx, SIM_EXEC_IS_COMPILED, 0, 0).to_i != 0
+        end
+
+        def generated_code
+          blob(SIM_BLOB_GENERATED_CODE)
+        end
+
+        def simd_mode
+          blob(SIM_BLOB_SIMD_MODE)
+        end
+
+        def net_count
+          @fn_query.call(@ctx, SIM_QUERY_NET_COUNT).to_i
+        end
+
+        def gate_count
+          @fn_query.call(@ctx, SIM_QUERY_GATE_COUNT).to_i
+        end
+
+        def dff_count
+          @fn_query.call(@ctx, SIM_QUERY_DFF_COUNT).to_i
+        end
+
+        def lanes
+          @fn_query.call(@ctx, SIM_QUERY_LANES).to_i
+        end
+
+        def input_names
+          csv = blob(SIM_BLOB_INPUT_NAMES)
+          csv.empty? ? [] : csv.split(',')
+        end
+
+        def output_names
+          csv = blob(SIM_BLOB_OUTPUT_NAMES)
+          csv.empty? ? [] : csv.split(',')
+        end
+
+        def stats
+          {
+            net_count: net_count,
+            gate_count: gate_count,
+            dff_count: dff_count,
+            lanes: lanes,
+            input_count: input_names.length,
+            output_count: output_names.length
+          }
+        end
+
+        private
+
+        def bind_functions
+          @fn_create = Fiddle::Function.new(
+            @lib['sim_create'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_VOIDP
+          )
+          @fn_destroy = Fiddle::Function.new(
+            @lib['sim_destroy'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_VOID
+          )
+          @fn_free_error = Fiddle::Function.new(
+            @lib['sim_free_error'],
+            [Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_VOID
+          )
+          @fn_poke_bus = Fiddle::Function.new(
+            @lib['sim_poke_bus'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @fn_poke_scalar = Fiddle::Function.new(
+            @lib['sim_poke_scalar'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_LONG_LONG, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @fn_peek_bus = Fiddle::Function.new(
+            @lib['sim_peek_bus'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_VOIDP, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @fn_exec = Fiddle::Function.new(
+            @lib['sim_exec'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_INT, Fiddle::TYPE_SIZE_T, Fiddle::TYPE_VOIDP],
+            Fiddle::TYPE_INT
+          )
+          @fn_query = Fiddle::Function.new(
+            @lib['sim_query'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_INT],
+            Fiddle::TYPE_SIZE_T
+          )
+          @fn_blob = Fiddle::Function.new(
+            @lib['sim_blob'],
+            [Fiddle::TYPE_VOIDP, Fiddle::TYPE_INT, Fiddle::TYPE_VOIDP, Fiddle::TYPE_SIZE_T],
+            Fiddle::TYPE_SIZE_T
+          )
+        end
+
+        def peek_bus(name)
+          out_len_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
+          out_len_ptr[0, Fiddle::SIZEOF_VOIDP] = [0].pack(SIZE_T_PACK)
+
+          exec_with_error do |error_ptr|
+            @fn_peek_bus.call(@ctx, name.to_s, 0, 0, out_len_ptr, error_ptr)
+          end
+
+          len = out_len_ptr[0, Fiddle::SIZEOF_VOIDP].unpack1(SIZE_T_PACK)
+          return [] if len.zero?
+
+          out_buf = Fiddle::Pointer.malloc(len * 8)
+          out_len_ptr[0, Fiddle::SIZEOF_VOIDP] = [0].pack(SIZE_T_PACK)
+
+          exec_with_error do |error_ptr|
+            @fn_peek_bus.call(@ctx, name.to_s, out_buf, len, out_len_ptr, error_ptr)
+          end
+
+          out_buf[0, len * 8].unpack("#{U64_PACK}*")
+        end
+
+        def blob(op)
+          size = @fn_blob.call(@ctx, op, 0, 0).to_i
+          return '' if size <= 0
+
+          buf = Fiddle::Pointer.malloc(size)
+          written = @fn_blob.call(@ctx, op, buf, size).to_i
+          return '' if written <= 0
+
+          buf.to_s(written)
+        end
+
+        def exec_with_error
+          error_ptr = alloc_error_ptr
+          result = yield(error_ptr)
+          return result if result.to_i != 0
+
+          raise RuntimeError, error_from_ptr(error_ptr)
+        end
+
+        def alloc_error_ptr
+          ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
+          ptr[0, Fiddle::SIZEOF_VOIDP] = [0].pack(SIZE_T_PACK)
+          ptr
+        end
+
+        def error_from_ptr(error_ptr) # reads the message left in an error slot and releases the native string
+          error_str_ptr = error_ptr[0, Fiddle::SIZEOF_VOIDP].unpack1(SIZE_T_PACK) # address stored in the slot
+          return 'native netlist backend operation failed' if error_str_ptr.zero? # slot untouched: generic message
+
+          error_msg = Fiddle::Pointer.new(error_str_ptr).to_s # copy the C string before it is freed
+          @fn_free_error.call(error_str_ptr)
+          error_msg
+        rescue StandardError
+          'native netlist backend operation failed'
+        end
+      end
+
+      class NetlistInterpreter < NetlistNativeBackend # native backend loaded from NETLIST_INTERPRETER_LIB_PATH
+        def initialize(json, lanes = 64) # `lanes` is passed to the library as its config value
+          super(NETLIST_INTERPRETER_LIB_PATH, json, lanes)
+        end
+      end
+
+      class NetlistJit < NetlistNativeBackend # Native backend loaded from NETLIST_JIT_LIB_PATH.
+        def initialize(json, lanes = 64) # json and lanes are forwarded unchanged to the base class
+          super(NETLIST_JIT_LIB_PATH, json, lanes)
+        end
+      end
+
+      class NetlistCompiler < NetlistNativeBackend # Native backend loaded from NETLIST_COMPILER_LIB_PATH.
+        def initialize(json, simd_mode = 'auto') # simd_mode takes the argument slot the other backends use for lanes
+          super(NETLIST_COMPILER_LIB_PATH, json, simd_mode)
+        end
+
+        def compile # plain pass-through to the base class implementation
+          super
+        end
+
+        def simd_mode # reports 'scalar' when the base class returns an empty mode string
+          mode = super
+          mode.empty? ? 'scalar' : mode
+        end
+
+        def stats # base class stats extended with simd_mode, compiled flag and a fixed backend label
+          super.merge(
+            simd_mode: simd_mode,
+            compiled: compiled?,
+            backend: 'rustc_compiler_simd'
+          )
+        end
+      end
+
+      # Pure Ruby fallback implementation: each net is an Integer bitmask limited to `lanes` bits.
+      class RubyNetlistSimulator
+        attr_reader :ir, :lanes
+
+        def initialize(ir, lanes: 64) # ir: JSON String (parsed with symbol keys) or an IR object/Hash
+          @ir = ir.is_a?(String) ? JSON.parse(ir, symbolize_names: true) : ir
+          @lanes = lanes
+          @lane_mask = (1 << lanes) - 1
+          @nets = Array.new(ir_get(:net_count), 0)
+          parse_ir
+        end
+
+        def parse_ir # caches the IR sections on instance variables
+          @gates = ir_get(:gates)
+          @dffs = ir_get(:dffs)
+          @sr_latches = ir_get(:sr_latches) || [] # latches are optional in the IR
+          @inputs = ir_get(:inputs)
+          @outputs = ir_get(:outputs)
+          @schedule = ir_get(:schedule)
+        end
+
+        private
+
+        def ir_get(key) # reads a field via accessor method, else via [] with symbol then string key
+          if @ir.respond_to?(key)
+            @ir.send(key)
+          elsif @ir.respond_to?(:[])
+            @ir[key] || @ir[key.to_s]
+          end
+        end
+
+        public
+
+        def poke(name, value) # raises RuntimeError for an unknown input name
+          nets = @inputs[name.to_s] || @inputs[name.to_sym]
+          raise "Unknown input: #{name}" unless nets
+
+          val = value.is_a?(Array) ? value.first : value
+          val = val.to_i # lane mask applies to single nets only; masking a bus value would drop bits >= lanes
+
+          if nets.length == 1
+            @nets[nets.first] = val & @lane_mask
+          else
+            nets.each_with_index { |net, i| @nets[net] = ((val >> i) & 1) == 1 ? @lane_mask : 0 }
+          end
+        end
+
+        def peek(name) # single-net outputs return one Integer, multi-net outputs an Array of per-net values
+          nets = @outputs[name.to_s] || @outputs[name.to_sym]
+          raise "Unknown output: #{name}" unless nets
+
+          nets.length == 1 ? @nets[nets.first] : nets.map { |net| @nets[net] }
+        end
+
+        def evaluate # runs the scheduled gates once, then settles the SR latches
+          @schedule.each do |gate_idx|
+            gate = @gates[gate_idx]
+            eval_gate(gate)
+          end
+
+          # Iterate latches to a fixed point (bounded at 10 passes).
+          10.times do
+            changed = false
+            @sr_latches.each do |latch|
+              s = @nets[latch[:s]]
+              r = @nets[latch[:r]]
+              en = @nets[latch[:en]]
+              q_old = @nets[latch[:q]]
+              q_next = (((~en) & q_old) | (en & (~r) & (s | q_old))) & @lane_mask # & binds tighter than |
+              next if q_next == q_old
+
+              @nets[latch[:q]] = q_next
+              @nets[latch[:qn]] = (~q_next) & @lane_mask
+              changed = true
+            end
+            break unless changed
+          end
+        end
+
+        def tick # evaluate, compute every flip-flop's next state, commit them together, evaluate again
+          evaluate
+          next_q = @dffs.map do |dff|
+            q = @nets[dff[:q]]
+            d = @nets[dff[:d]]
+            q_next = d
+            if dff[:en] # lanes with enable low keep their current q
+              en = @nets[dff[:en]]
+              q_next = (q & ~en) | (d & en)
+            end
+            if dff[:rst] # lanes with reset high take the reset value instead
+              rst = @nets[dff[:rst]]
+              reset_val = dff[:reset_value] || 0
+              q_next = (q_next & ~rst) | (rst & (reset_val.zero? ? 0 : @lane_mask))
+            end
+            q_next
+          end
+          @dffs.each_with_index { |dff, idx| @nets[dff[:q]] = next_q[idx] }
+          evaluate
+        end
+
+        def reset # clears every net, then loads each flip-flop's reset value into all lanes
+          @nets.fill(0)
+          @dffs.each do |dff|
+            reset_val = dff[:reset_value] || 0
+            @nets[dff[:q]] = reset_val.zero? ? 0 : @lane_mask
+          end
+        end
+
+        def run_ticks(n)
+          n.times { tick }
+        end
+
+        def net_count
+          @nets.length
+        end
+
+        def gate_count
+          @gates.length
+        end
+
+        def dff_count
+          @dffs.length
+        end
+
+        def input_names
+          @inputs.keys
+        end
+
+        def output_names
+          @outputs.keys
+        end
+
+        def stats # summary hash; backend is always 'ruby' for this class
+          {
+            net_count: @nets.length,
+            gate_count: @gates.length,
+            dff_count: @dffs.length,
+            lanes: @lanes,
+            input_count: @inputs.length,
+            output_count: @outputs.length,
+            backend: 'ruby'
+          }
+        end
+
+        def native?
+          false
+        end
+
+        private
+
+        def eval_gate(gate) # gate fields are read via [] first, then via accessor methods
+          type = gate[:type]&.to_sym || gate.type
+          inputs = gate[:inputs] || gate.inputs
+          output = gate[:output] || gate.output
+
+          case type # unrecognised gate types are ignored
+          when :and then @nets[output] = @nets[inputs[0]] & @nets[inputs[1]]
+          when :or then @nets[output] = @nets[inputs[0]] | @nets[inputs[1]]
+          when :xor then @nets[output] = @nets[inputs[0]] ^ @nets[inputs[1]]
+          when :not then @nets[output] = (~@nets[inputs[0]]) & @lane_mask
+          when :mux # inputs[2] selects per lane: 0 picks inputs[0], 1 picks inputs[1]
+            sel = @nets[inputs[2]]
+            @nets[output] = (@nets[inputs[0]] & ~sel) | (@nets[inputs[1]] & sel)
+          when :buf then @nets[output] = @nets[inputs[0]]
+          when :const # a non-zero constant drives every lane high
+            val = gate[:value] || gate.value
+            @nets[output] = val.to_i.zero? ? 0 : @lane_mask
+          end
+        end
+      end
+
+      # Unified wrapper for interpreter, JIT, compiler, and Ruby fallback; delegates simulation calls to @sim.
+      class NetlistSimulator
+        attr_reader :ir, :lanes
+
+        BACKEND_CONFIGS = { # per-backend availability flag, wrapper class name, reported type and library path
+          interpreter: {
+            available: NETLIST_INTERPRETER_AVAILABLE,
+            class_name: 'NetlistInterpreter',
+            type: :interpret,
+            lib_path: NETLIST_INTERPRETER_LIB_PATH
+          },
+          jit: {
+            available: NETLIST_JIT_AVAILABLE,
+            class_name: 'NetlistJit',
+            type: :jit,
+            lib_path: NETLIST_JIT_LIB_PATH
+          },
+          compiler: {
+            available: NETLIST_COMPILER_AVAILABLE,
+            class_name: 'NetlistCompiler',
+            type: :compile,
+            lib_path: NETLIST_COMPILER_LIB_PATH
+          }
+        }.freeze
+
+        def initialize(ir, backend: :interpreter, lanes: 64, simd: :auto, allow_fallback: true)
+          @ir = ir
+          @lanes = lanes
+          @simd = simd
+          @requested_backend = normalize_backend(backend) # raises ArgumentError for unknown names
+          @fallback = false
+          @native_error = nil
+
+          native_loaded = false
+          backend_candidates(@requested_backend, allow_fallback: allow_fallback).each do |candidate|
+            next unless BACKEND_CONFIGS[candidate][:available]
+
+            begin
+              create_native_sim(candidate)
+              native_loaded = true
+              break
+            rescue StandardError => e
+              @native_error = e # remembered for the error message; the next candidate is tried
+            end
+          end
+
+          return if native_loaded
+
+          if allow_fallback # no native backend could be created: use the pure Ruby simulator
+            @sim = RubyNetlistSimulator.new(ir, lanes: lanes)
+            @backend = :ruby
+            @fallback = true
+          else
+            raise LoadError, unavailable_backend_error_message(@requested_backend, allow_fallback: false)
+          end
+        end
+
+        def simulator_type
+          :"netlist_#{@backend}"
+        end
+
+        def backend
+          @backend
+        end
+
+        def native? # false when the Ruby fallback is active or the wrapped simulator does not report native
+          !@fallback && @sim.respond_to?(:native?) && @sim.native?
+        end
+
+        def poke(name, value)
+          @sim.poke(name.to_s, value)
+        end
+
+        def peek(name)
+          @sim.peek(name.to_s)
+        end
+
+        def evaluate
+          @sim.evaluate
+        end
+
+        def tick
+          @sim.tick
+        end
+
+        def run_ticks(n) # uses the simulator's own run_ticks when it has one, else ticks n times
+          if @sim.respond_to?(:run_ticks)
+            @sim.run_ticks(n)
+          else
+            n.times { @sim.tick }
+          end
+        end
+
+        def reset
+          @sim.reset
+        end
+
+        def compile # returns true when the wrapped simulator has no compile step
+          return true unless @sim.respond_to?(:compile)
+
+          @sim.compile
+        end
+
+        def compiled? # false when the wrapped simulator has no compiled? query
+          return false unless @sim.respond_to?(:compiled?)
+
+          @sim.compiled?
+        end
+
+        def generated_code # nil when the wrapped simulator does not expose generated code
+          return nil unless @sim.respond_to?(:generated_code)
+
+          @sim.generated_code
+        end
+
+        def simd_mode # nil when the wrapped simulator has no simd_mode
+          return nil unless @sim.respond_to?(:simd_mode)
+
+          @sim.simd_mode
+        end
+
+        def net_count # falls back to the stats hash when there is no direct accessor
+          return @sim.net_count if @sim.respond_to?(:net_count)
+
+          @sim.stats[:net_count]
+        end
+
+        def gate_count # falls back to the stats hash when there is no direct accessor
+          return @sim.gate_count if @sim.respond_to?(:gate_count)
+
+          @sim.stats[:gate_count]
+        end
+
+        def dff_count # falls back to the stats hash when there is no direct accessor
+          return @sim.dff_count if @sim.respond_to?(:dff_count)
+
+          @sim.stats[:dff_count]
+        end
+
+        def input_names # empty list when the wrapped simulator cannot enumerate inputs
+          return @sim.input_names if @sim.respond_to?(:input_names)
+
+          []
+        end
+
+        def output_names # empty list when the wrapped simulator cannot enumerate outputs
+          return @sim.output_names if @sim.respond_to?(:output_names)
+
+          []
+        end
+
+        def stats
+          @sim.stats
+        end
+
+        private
+
+        def normalize_backend(backend) # maps accepted aliases onto :interpreter, :jit, :compiler or :auto
+          case backend.to_sym
+          when :interpreter, :interpret then :interpreter
+          when :jit then :jit
+          when :compiler, :compile then :compiler
+          when :auto then :auto
+          else
+            raise ArgumentError, "Unknown backend: #{backend}. Valid: :interpreter, :jit, :compiler, :auto"
+          end
+        end
+
+        def backend_candidates(backend, allow_fallback:) # ordered list of native backends to try
+          case backend
+          when :auto then [:compiler, :jit, :interpreter]
+          when :compiler then allow_fallback ? [:compiler, :jit, :interpreter] : [:compiler]
+          when :jit then allow_fallback ? [:jit, :interpreter] : [:jit]
+          when :interpreter then [:interpreter]
+          else
+            [backend]
+          end
+        end
+
+        def create_native_sim(backend) # builds the native simulator and records its type in @backend
+          config = BACKEND_CONFIGS.fetch(backend)
+          json = @ir.is_a?(String) ? @ir : @ir.to_json
+          klass = RHDL::Sim::Native::Netlist.const_get(config[:class_name])
+
+          @sim = case backend
+                 when :compiler # the compiler backend takes the SIMD mode and is compiled right away
+                   compiler = klass.new(json, @simd.to_s)
+                   compiler.compile if compiler.respond_to?(:compile)
+                   compiler
+                 else
+                   klass.new(json, @lanes)
+                 end
+
+          @backend = config[:type]
+        end
+
+        def unavailable_backend_error_message(backend, allow_fallback:) # builds the LoadError text
+          candidates = backend_candidates(backend, allow_fallback: allow_fallback)
+          missing = candidates.reject { |candidate| BACKEND_CONFIGS[candidate][:available] }
+          hint_paths = missing.map { |candidate| BACKEND_CONFIGS[candidate][:lib_path] }
+
+          message = +"Netlist #{backend} backend is not available."
+          unless hint_paths.empty?
+            message << "\nMissing native library: #{hint_paths.join(', ')}"
+          end
+          message << "\nRun 'rake native:build' to build native extensions."
+          message << "\nLast native error: #{@native_error.message}" if @native_error
+          message
+        end
+      end
+
+    end
+    end
+  end
+end
diff --git a/prd/2026_03_03_apple2_arctogpu_perf_prd.md b/prd/2026_03_03_apple2_arctogpu_perf_prd.md
new file mode 100644
index 00000000..b6f658ec
--- /dev/null
+++ b/prd/2026_03_03_apple2_arctogpu_perf_prd.md
@@ -0,0 +1,167 @@
+# Apple2 ArcToGPU Performance PRD
+
+**Status:** In Progress (2026-03-03)
+**Date:** 2026-03-03
+
+## Context
+
+Apple2 ArcToGPU parity is stable, but single-instance Metal runtime remains slower than the IR compiler backend on benchmark workloads. Existing optimizations reduced some overhead, but the generated GPU path still recomputes large portions of the eval graph each sub-cycle and does not exploit dataflow metadata aggressively.
+
+## Goals
+
+1. Use Arc IR dataflow/state metadata to reduce redundant recomputation in Apple2 GPU kernels.
+2. Reduce hot-loop work in generated Metal by phase-specialized evaluation paths.
+3. Lower call overhead and dead work in generated code paths.
+4. Add support for throughput-oriented multi-instance dispatch mode.
+5. Preserve cycle parity against existing Apple2 execution path while optimizing.
+
+## Non-Goals
+
+1. Upstream CIRCT changes.
+2. Replacing Apple2 architecture/model semantics.
+3. Removing existing parity checks.
+
+## Phased Plan
+
+### Phase 1: Phase-Sliced Eval And Comb-Only Paths
+
+**Red:** Apple2 kernel calls the same full eval path multiple times per sub-cycle.
+**Green:** Kernel uses phase-specific eval functions (comb-only for non-rising phases; state-update path where required).
+
+Exit criteria:
+1. Distinct generated eval entry points for comb-only vs update phases.
+2. Kernel dispatch loop uses phase-specific entry points.
+3. Apple2 parity benchmark checkpoints remain green.
+
+### Phase 2: Per-State Cone And Dirty-Driven Recompute
+
+**Red:** Update phase computes global union cones and always performs settle eval.
+**Green:** Update phase computes targeted cones and skips settle recompute when no state changes.
+
+Exit criteria:
+1. State-update eval reports cycle-local state-dirty signal.
+2. Settle eval is conditionally skipped when safe.
+3. Parity remains green.
+
+### Phase 3: Aggressive Call Flattening And Width Specialization
+
+**Red:** Heavy helper-call graph and broad scalar typing remain in hot path.
+**Green:** Simple Arc call chains are flattened into callers and narrow scalar types are emitted where safe.
+
+Exit criteria:
+1. Local call-graph flattening transform is applied before Metal emission.
+2. Width-specialized scalar type emission is enabled in generated Metal.
+3. Lowering + runner specs remain green.
+
+### Phase 4: Memory-Class Specialization And Throughput Mode
+
+**Red:** Memory class handling and dispatch model remain single-instance oriented.
+**Green:** Memory classes are specialized in hot path and MetalRunner supports optional multi-instance throughput mode.
+
+Exit criteria:
+1. Apple2 loop memory read/write path is branch-minimized and validated.
+2. MetalRunner supports configurable instance count and indexed buffers.
+3. Throughput mode benchmark reports aggregate cycles/s.
+
+## Acceptance Criteria
+
+1. New performance PRD tracks all seven optimization tracks with status and checklist updates.
+2. Apple2 parity remains stable after each phase.
+3. Benchmark output includes updated Metal timing and throughput metrics where applicable.
+4. CPU8bit ArcToGPU complex parity remains green (regression guard).
+
+## Risks And Mitigations
+
+1. Risk: aggressive transforms can break cycle semantics.
+   Mitigation: keep red/green parity checks after each phase.
+2. Risk: flattening can increase compile time/source size significantly.
+   Mitigation: cap flattening by op count and recursion depth.
+3. Risk: multi-instance mode can affect single-instance semantics.
+   Mitigation: keep default instance count at 1 and gate throughput behavior behind explicit configuration.
+
+## Implementation Checklist
+
+- [x] Phase 1: Add comb-only and update eval entry points and wire phase-specific kernel calls.
+- [ ] Phase 1: Validate Apple2 parity benchmarks/specs.
+- [x] Phase 2: Add state-dirty reporting and conditional settle recompute.
+- [ ] Phase 2: Validate parity and benchmark behavior.
+- [x] Phase 3: Add Arc call flattening transform for simple functions.
+- [x] Phase 3: Add width-specialized scalar type emission.
+- [x] Phase 3: Re-run lowering and runner specs.
+- [x] Phase 4: Add memory-class specialization pass in kernel/hot path.
+- [x] Phase 4: Add multi-instance throughput mode in MetalRunner.
+- [x] Phase 4: Add throughput benchmark output and validate runs.
+
+## Execution Update (2026-03-03)
+
+Implemented and executed the planned optimization tracks:
+
+1. Phase-sliced eval path:
+   - Added `eval_<top>_comb_loop` and `eval_<top>_update_loop` emission paths.
+   - Wired Apple2 kernel to use comb/update entry points when `RHDL_ARC_TO_GPU_PHASE_SPLIT=1`.
+2. Dirty-driven settle tracking:
+   - Added per-dispatch `state_dirty` tracking in update eval generation.
+   - Added conditional settle branch in Apple2 kernel when `RHDL_ARC_TO_GPU_DIRTY_SETTLE=1`.
+3. Call flattening + width specialization:
+   - Added `flatten_simple_arc_calls` pre-emission transform.
+   - Added `RHDL_ARC_TO_GPU_NARROW_TYPES=1` narrow scalar emission.
+   - Added always-inline hints across generated helper/eval functions.
+4. Memory/throughput path:
+   - Added local-state hot loop, loop-step minimal output struct, and minimized loop-time IO writes.
+   - Added multi-instance indexed buffers and throughput reporting in Apple2 benchmark output.
+
+Validation results:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb` -> pass.
+2. `bundle exec rspec spec/examples/apple2/runners/metal_runner_spec.rb` -> pass.
+3. `bundle exec rspec spec/rhdl/cli/tasks/benchmark_task_spec.rb` -> pass.
+4. `bundle exec rspec spec/examples/8bit/hdl/cpu/arcilator_gpu_complex_parity_spec.rb` -> pass.
+5. Baseline parity remains stable:
+   - `RHDL_BENCH_BACKENDS=metal bundle exec rake 'bench:native[apple2,500]'`
+   - Metal final PC: `0xB818`.
+6. Throughput mode validated:
+   - `RHDL_BENCH_BACKENDS=metal RHDL_APPLE2_METAL_INSTANCES=8 bundle exec rake 'bench:native[apple2,5000]'`
+   - Aggregate throughput line is emitted by benchmark task.
+
+Known blocker (open):
+
+1. Enabling phase-sliced comb loop currently breaks Apple2 parity on benchmark checkpoints (`Final PC: 0x0` when `RHDL_ARC_TO_GPU_PHASE_SPLIT=1`).
+2. Dirty-settle and other flags are therefore kept as non-default experimental toggles while parity-safe baseline remains default.
+
+## Execution Update (2026-03-03, Pass 2)
+
+Additional optimizations and benchmark wiring were completed:
+
+1. Removed redundant non-phase-split high settle evaluation in Apple2 kernel generation.
+2. Added a dedicated non-phase-split low-loop eval path that skips post-update comb recompute while preserving clock/state update semantics used for low-phase address sampling.
+3. Updated Apple2 benchmark defaults to throughput mode for Metal (`256` instances unless overridden via `RHDL_BENCH_METAL_INSTANCES` / `RHDL_APPLE2_METAL_INSTANCES`).
+4. Added aggregate throughput ratio reporting in benchmark summary.
+
+Validation and measurements:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb` -> pass.
+2. `bundle exec rspec spec/examples/apple2/runners/metal_runner_spec.rb` -> pass.
+3. `bundle exec rspec spec/rhdl/cli/tasks/benchmark_task_spec.rb` -> pass.
+4. `RHDL_BENCH_BACKENDS=arcilator,metal bundle exec rake 'bench:native[apple2,500]'`:
+   - parity preserved at checkpoint: Arcilator `0xB818`, Metal `0xB818`.
+5. `RHDL_BENCH_BACKENDS=metal bundle exec rake 'bench:native[apple2,5000]'`:
+   - single-instance Metal run time improved to `1.805s` (down from the ~`3.5s` baseline recorded earlier in this PRD series).
+6. `RHDL_BENCH_BACKENDS=compiler,metal bundle exec rake 'bench:native[apple2,5000]'` (default throughput mode):
+   - Compiler run time: `0.015s` (single-instance metric remains faster).
+   - Metal aggregate throughput: `634737.1 cycles/s (256 instances)`.
+   - Aggregate throughput ratio: `Metal vs Compiler = 1.919x`.
+
+## Execution Update (2026-03-03, Pass 3)
+
+Throughput-path scaling update:
+
+1. Increased Apple2 Metal instance cap/default throughput mode from `256` to `1024`.
+2. Kept parity-safe kernel optimizations and aggregate-ratio reporting.
+
+Latest benchmark evidence:
+
+1. `RHDL_BENCH_BACKENDS=compiler,metal bundle exec rake 'bench:native[apple2,5000]'`
+   - Compiler run time: `0.015s`
+   - Metal run time: `2.112s` (single-instance wall-clock still slower)
+   - Metal throughput: `2424312.4 cycles/s (1024 instances)`
+   - Aggregate throughput ratio: `Metal vs Compiler = 7.338x`
diff --git a/prd/2026_03_03_apple2_metal_runner_prd.md b/prd/2026_03_03_apple2_metal_runner_prd.md
new file mode 100644
index 00000000..287b215e
--- /dev/null
+++ b/prd/2026_03_03_apple2_metal_runner_prd.md
@@ -0,0 +1,185 @@
+# Apple II Metal Runner And Benchmark Integration PRD
+
+**Status:** Completed (2026-03-03)
+**Date:** 2026-03-03
+
+## Context
+
+Before this work, the Apple II benchmark compared IR and RTL-native backends (Interpreter/JIT/Compiler, Verilator, Arcilator) but did not expose a dedicated Metal runner entry. A first-class `MetalRunner` and benchmark integration were needed so that Apple II could participate in the same Metal-path performance workflow as other GPU-oriented efforts.
+
+Resolution summary (2026-03-03): Apple II now runs through the real Arc -> ArcToGPU -> Metal kernel path with packed 32-bit lowering for `i48` state, fixed multi-operand `comb.concat` lowering, successful Metal pipeline creation, and benchmark parity checkpoints against Arcilator.
+
+## Goals
+
+1. Add a dedicated Apple II `MetalRunner` with explicit Metal toolchain gating.
+2. Expose Metal as a selectable Apple II runner mode.
+3. Include Metal in `bench:native[apple2,...]` backend comparisons.
+4. Keep runner interface parity with existing Apple II runners.
+5. Require behavioral parity with existing Apple II execution paths (at minimum Arcilator) on benchmark workloads once Metal initialization succeeds.
+
+## Non-Goals
+
+1. Full Apple II ArcToGPU lowering implementation in this slice.
+2. Web Metal backend support in this slice.
+3. Reworking Apple II UI/terminal rendering behavior.
+
+## Phased Plan
+
+### Phase 1: MetalRunner Entry Point
+
+**Red:** No dedicated Apple II Metal runner class.
+**Green:** `RHDL::Examples::Apple2::MetalRunner` exists with explicit availability checks and Apple II runner-compatible API.
+
+Exit criteria:
+1. `MetalRunner.status` reports readiness/missing tools.
+2. `MetalRunner` initializes only when required tools are available.
+3. `simulator_type` identifies as `:hdl_metal`.
+
+### Phase 2: Runner Mode Wiring
+
+**Red:** Headless/CLI mode routing cannot select Metal.
+**Green:** `:metal` is accepted in Apple II runner mode routing and help text.
+
+Exit criteria:
+1. `HeadlessRunner` supports `mode: :metal`.
+2. Apple II binary help text includes `--mode metal`.
+3. Existing mode behavior remains unchanged.
+
+### Phase 3: Benchmark Integration
+
+**Red:** Apple II native benchmark excludes Metal.
+**Green:** `BenchmarkTask#benchmark_apple2` includes Metal in backend matrix and reports it in summary.
+
+Exit criteria:
+1. Metal backend is conditionally included via availability checks.
+2. Benchmark can initialize and run Metal runner path.
+3. Existing benchmark backends still run/skip correctly.
+
+### Phase 4: ArcToGPU Runtime Parity
+
+**Red:** Metal path initializes as no-op/fails pipeline creation; benchmark reports are misleading or divergent.
+**Green:** Apple II Metal runner executes a real ArcToGPU kernel path with stable initialization and parity against Arcilator on benchmark checkpoints.
+
+Exit criteria:
+1. `sim_create` failure is surfaced as an explicit runner error (no silent null-context execution).
+2. Apple II Metal kernel initializes on supported hosts.
+3. `bench:native[apple2,...]` shows matching final PC for Arcilator vs Metal on fixed-cycle runs (for example 5000 cycles on Karateka memory image).
+4. Follow-up parity checks cover at least one longer-run checkpoint beyond smoke initialization.
+
+## Acceptance Criteria
+
+1. `MetalRunner` can be selected in Apple II headless mode.
+2. `bench:native[apple2,...]` includes Metal when available.
+3. Targeted specs for new runner and integration changes pass.
+4. README benchmark section mentions Apple II Metal inclusion.
+5. Apple II Metal runner does not silently continue when Metal context creation fails.
+6. Apple II Metal path reaches parity checkpoint(s) with Arcilator on benchmarked workloads.
+
+## Risks And Mitigations
+
+1. Risk: Metal toolchain availability differs by host.
+   Mitigation: explicit readiness checks and clear missing-tool reporting.
+2. Risk: MetalRunner path diverges from existing Apple II runner interface.
+   Mitigation: reuse existing runner contracts and shared test coverage.
+3. Risk: Benchmark behavior changes for existing backends.
+   Mitigation: keep backend dispatch additive; do not alter existing semantics.
+
+## Implementation Checklist
+
+- [x] Phase 1: Add `examples/apple2/utilities/runners/metal_runner.rb`.
+- [x] Phase 1: Add Metal runner status/availability checks.
+- [x] Phase 2: Add `:metal` support in `HeadlessRunner`.
+- [x] Phase 2: Update Apple II mode/help text for Metal.
+- [x] Phase 3: Add Metal backend to `benchmark_apple2` matrix.
+- [x] Phase 3: Run targeted Apple II runner + benchmark task specs.
+- [x] Phase 3: Run a local Apple II benchmark invocation verifying Metal appears in output.
+- [x] Phase 4: Fail fast when Metal context initialization returns null (`sim_create` guard).
+- [x] Phase 4: Resolve Metal pipeline initialization for ArcToGPU Apple II kernels on host GPU.
+- [x] Phase 4: Demonstrate Apple II parity checkpoint(s) between Metal and Arcilator.
+
+## Validation Evidence
+
+1. `bundle exec rake 'bench:native[apple2,500]'`:
+   Metal final PC `0xB818`, Arcilator final PC `0xB818`.
+2. `bundle exec rake 'bench:native[apple2,5000]'`:
+   Metal final PC `0xB7F4`, Arcilator final PC `0xB7F4`.
+3. Targeted specs passed:
+   - `spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`
+   - `spec/examples/apple2/runners/metal_runner_spec.rb`
+   - `spec/examples/apple2/utilities/tasks/run_task_spec.rb`
+   - `spec/rhdl/cli/tasks/benchmark_task_spec.rb`
+
+## Performance Follow-Up (2026-03-03)
+
+Post-parity optimization pass completed on the Apple II ArcToGPU Metal path:
+
+1. Enabled optimized Metal shader build (`-O3`) for Apple II Metal runner.
+2. Added Apple II ArcToGPU kernel local-state execution (`thread` local state copy-in/copy-out) to avoid repeated global state traffic.
+3. Reduced per-iteration global IO writes in Apple II kernel to a single final writeback per dispatched budget.
+4. Preserved 4-eval sub-cycle semantics after validating that removing the final settled eval broke parity (incorrect PC progression).
+5. Added optional Apple II Arc MLIR cleanup (`circt-opt --canonicalize --cse --symbol-dce`) before Metal emission.
+
+Measured outcomes:
+
+1. `bundle exec rake 'bench:native[apple2,500]'`:
+   - Metal run time improved from ~`17.7s` to `0.342s`
+   - Final PC parity preserved: Metal `0xB818`, Arcilator `0xB818`
+2. `bundle exec rake 'bench:native[apple2,5000]'`:
+   - Metal run time improved from ~`175.7s` to `3.433s`
+   - Final PC parity preserved: Metal `0xB7F4`, Arcilator `0xB7F4`
+3. `bundle exec rake 'bench:native[cpu8bit,500000]'`:
+   - ArcilatorGPU run time `2.148s` vs Compiler `7.650s` (~`3.56x` faster)
+
+## Generator Optimization Follow-Up (2026-03-03, Pass 2)
+
+Implemented follow-up ArcToGPU generator work for maintainability and hot-loop behavior:
+
+1. Split ArcToGPU lowering profile behavior by target into separate files:
+   - `lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/cpu8bit.rb`
+   - `lib/rhdl/codegen/firrtl/arc_to_gpu_lowering/profiles/apple2.rb`
+2. Added liveness-driven comb emission in top eval generation so dead combinational ops are pruned before Metal emission.
+3. Added aggressive inlining hints (`static inline __attribute__((always_inline))`) on generated helper/eval functions.
+4. Added selective state snapshotting for `comb_pre` (only state refs required by live graph/state update dependencies are snapshotted).
+5. Reworked Apple II kernel hot loop to avoid loop-time external IO reads/writes:
+   - local cached input fields (`clk_14m`, `ram_do`, etc.)
+   - local loop counters/flags (`cycles_ran`, `speaker_toggles`, `text_dirty`, `prev_speaker`)
+   - single external writeback at dispatch end
+6. Added Apple II loop-step internal struct path (`ram_addr`, `ram_we`, `d`, `speaker`) and deferred full debug-output materialization to dispatch end.
+
+Validation after pass 2:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb` passed.
+2. `bundle exec rspec spec/examples/apple2/runners/metal_runner_spec.rb` passed.
+3. `bundle exec rspec spec/examples/8bit/utilities/runners/arcilator_gpu_runner_spec.rb` passed.
+4. `bundle exec rspec spec/examples/8bit/hdl/cpu/arcilator_gpu_complex_parity_spec.rb` passed.
+5. `bundle exec rake 'bench:native[cpu8bit,500000]'`:
+   - Compiler run `6.740s`, ArcilatorGPU run `1.453s` (~`4.64x` faster).
+6. `bundle exec rake 'bench:native[apple2,500]'`:
+   - Metal run `0.342s`, final PC parity preserved (`0xB818` vs Arcilator `0xB818`).
+7. `bundle exec rake 'bench:native[apple2,5000]'`:
+   - Metal run `3.378s`, final PC parity preserved (`0xB7F4` vs Arcilator `0xB7F4`).
+
+## Performance Follow-Up (2026-03-03, Pass 3)
+
+Additional lowering/runtime optimizations were implemented and measured:
+
+1. Added a dedicated Apple II fast loop eval function variant that computes only hot-loop outputs
+   (`ram_addr`, `ram_we`, `d`, `speaker`) while preserving full-output eval for dispatch-end materialization.
+2. Added direct ROM mirroring into the unified 64K RAM view for Metal host runtime and simplified kernel RAM read path
+   to reduce loop-time ROM/RAM branch logic.
+
+Measured outcome:
+
+1. `bundle exec rake 'bench:native[apple2,5000]'` after pass 3:
+   - Compiler run `0.015s`
+   - Metal run `3.483s`
+   - Final PC parity preserved (`0xB7F4` vs Arcilator `0xB7F4`)
+
+Result: no material speedup relative to pass 2. Apple II Metal remains significantly slower than IR compiler on this host/workload.
+
+Interpretation:
+
+1. Remaining bottleneck is likely architectural (single-thread sequential state machine execution with dense eval graph),
+   not simple loop-time IO/branch overhead.
+2. Reaching compiler-beating performance will require a larger redesign (for example, aggressive call-graph flattening /
+   state-specialized kernel generation and/or multi-instance batch-parallel execution strategy) beyond local pass tuning.
diff --git a/prd/2026_03_03_riscv_arctogpu_metal_runner_prd.md b/prd/2026_03_03_riscv_arctogpu_metal_runner_prd.md
new file mode 100644
index 00000000..e5856577
--- /dev/null
+++ b/prd/2026_03_03_riscv_arctogpu_metal_runner_prd.md
@@ -0,0 +1,261 @@
+# RISC-V ArcToGPU Metal Runner And Performance PRD
+
+**Status:** In Progress
+**Date:** 2026-03-03
+
+## Context
+
+RISC-V currently has native HDL runners for Verilator and Arcilator, but no Metal/GPU runner that executes via the local Arc -> ArcToGPU -> Metal path. Existing benchmark coverage for RISC-V focuses on xv6 boot throughput across IR Compiler, Verilator, and CIRCT backends.
+
+We need a first-class RISC-V Metal runner for core execution, with incremental optimization focused on reducing the throughput gap versus IR compiled simulation.
+
+## Goals
+
+1. Add a dedicated RISC-V `MetalRunner` backed by ArcToGPU-generated Metal kernels.
+2. Add a RISC-V ArcToGPU lowering profile (`:riscv`) with explicit top-module ABI validation.
+3. Wire `:metal` into RISC-V headless/CLI mode routing.
+4. Provide a benchmark path that compares RISC-V Metal against IR compiled on the same core workload.
+5. Add optional multi-instance Metal throughput mode (`instances`) while preserving single-instance parity as default.
+6. Establish a reproducible optimization loop for improving Metal cycles/sec.
+
+## Non-Goals
+
+1. Full xv6/Linux MMIO/virtio parity in the first Metal runner slice.
+2. Web/WASM RISC-V Metal execution.
+3. Pipeline-core Metal support in this slice (single-cycle core only).
+
+## Phased Plan
+
+### Phase 1: ArcToGPU RISC-V Lowering Profile
+
+**Red:** ArcToGPU supports only `:cpu8bit` and `:apple2`; no RISC-V profile/ABI checks.
+**Green:** ArcToGPU accepts `profile: :riscv`, validates required RISC-V top ports, and emits a RISC-V Metal kernel.
+
+Exit criteria:
+1. `ArcToGpuLowering.lower(..., profile: :riscv)` succeeds on RISC-V Arc MLIR.
+2. Missing required RISC-V ports fail with explicit errors.
+3. Generated metadata includes Metal entry and state layout for wrapper integration.
+
+### Phase 2: RISC-V Metal Runner Integration
+
+**Red:** No `RHDL::Examples::RISCV::MetalRunner`; headless/CLI cannot select RISC-V Metal.
+**Green:** `MetalRunner` exists, builds ArcToGPU artifacts, compiles Metal library, and exposes the native HDL runner API.
+
+Exit criteria:
+1. `HeadlessRunner.new(mode: :metal)` resolves and instantiates Metal runner on supported hosts.
+2. RISC-V CLI accepts `--mode metal`.
+3. Runner supports core execution lifecycle (`reset!`, `run_cycles`, `read_pc`, `read_reg`, memory load/read/write).
+
+### Phase 3: Benchmark Path And Throughput Optimization
+
+**Red:** No RISC-V Metal backend in benchmark matrix for core workload comparison.
+**Green:** Benchmark task can compare IR Compiler and Metal on a shared core workload and report cycles/sec.
+
+Exit criteria:
+1. RISC-V benchmark includes Metal when available.
+2. Benchmark workload is common across compared backends for fair cycles/sec comparison.
+3. At least one optimization pass is implemented and measured.
+
+### Phase 4: Validation And Performance Loop
+
+**Red:** No documented validation/perf evidence for RISC-V Metal path.
+**Green:** Targeted specs pass and benchmark evidence is captured in PRD.
+
+Exit criteria:
+1. Targeted lowering + runner + CLI/benchmark specs pass.
+2. Benchmark results include IR Compiler vs Metal cycles/sec for the core workload.
+3. PRD checklist and status reflect actual completion state.
+
+## Acceptance Criteria
+
+1. RISC-V Metal runner is selectable via headless and CLI mode routing.
+2. ArcToGPU RISC-V lowering profile is implemented and covered by tests.
+3. Benchmark path reports RISC-V Metal throughput against IR compiled on equivalent workload.
+4. Documented optimization deltas exist (baseline vs improved).
+
+## Risks And Mitigations
+
+1. Risk: RISC-V MMIO/device model complexity can dominate kernel design.
+   Mitigation: initial scope targets core workload parity without full xv6 device parity.
+2. Risk: GPU dispatch overhead can erase throughput gains.
+   Mitigation: batch cycle execution in-kernel; minimize host-device synchronization per dispatch.
+3. Risk: Toolchain availability differs by host.
+   Mitigation: explicit availability checks and clear failure messages.
+
+## Implementation Checklist
+
+- [x] Phase 1: Add `:riscv` ArcToGPU profile module.
+- [x] Phase 1: Add RISC-V Metal kernel emitter in lowering.
+- [x] Phase 1: Add/extend lowering specs for RISC-V profile success/failure.
+- [x] Phase 2: Add `examples/riscv/utilities/runners/metal_runner.rb`.
+- [x] Phase 2: Wire `:metal` mode in RISC-V `HeadlessRunner`.
+- [x] Phase 2: Update RISC-V CLI mode option/help for `metal`.
+- [x] Phase 3: Add RISC-V benchmark Metal backend for shared core workload comparison.
+- [x] Phase 3: Add optional RISC-V Metal multi-instance throughput wiring (`metal_instances`) in headless + benchmark path.
+- [x] Phase 3: Run benchmark baseline and post-optimization measurements.
+- [x] Phase 4: Run targeted specs and record results.
+- [ ] Phase 4: Mark PRD complete with dated validation evidence.
+
+## Validation Evidence (Non-xv6 Small Tests First)
+
+Date: 2026-03-03
+
+1. Targeted RISC-V Metal construction checks:
+   `bundle exec rspec spec/examples/riscv/utilities/tasks/run_task_spec.rb:293 spec/examples/riscv/runners/hdl_harness_spec.rb:140`
+   Result: pass (2 examples, 0 failures).
+2. Targeted integration + lowering + benchmark wiring checks:
+   `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/utilities/tasks/riscv_cli_linux_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb spec/examples/8bit/utilities/runners/arcilator_gpu_runner_spec.rb spec/examples/apple2/runners/metal_runner_spec.rb`
+   Result: pass (98 examples, 0 failures).
+3. Post-fix targeted gate after RISC-V kernel parity/perf work:
+   `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`
+   Result: pass (80 examples, 0 failures).
+4. Multi-instance wiring/spec gate:
+   `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`
+   Result: pass (82 examples, 0 failures, metal-backed instantiation tests pending/skipped when local timeout/toolchain limits apply).
+
+## Parity And Perf Evidence (Core Workload, Non-xv6)
+
+Date: 2026-03-03
+
+1. Divergence reproduction before kernel sequencing fix:
+   cycle-by-cycle mismatch at step 1 (`x4`/`x5` diverged between IR compiler and Metal).
+2. Parity after sequencing fix + optimization:
+   300-step cycle-by-cycle parity: pass.
+   20k-cycle parity check (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`): pass.
+3. Performance measurements (`RHDL_BENCH_RISCV_WORKLOAD=core`, `RHDL_BENCH_BACKENDS=compiler,metal`, `bench:native[riscv,50000]`):
+   - Parity-fix baseline (fully conservative multi-eval loop): IR 0.297s, Metal 5.151s, ratio 0.058x.
+   - Optimized kernel (reduced eval cadence + PTW cache/invalidation): IR 0.300s, Metal 3.560s, ratio 0.084x.
+   - Follow-up optimization (remove per-cycle post-settle eval, retain final settle/output): IR 0.314s, Metal 3.581s, ratio 0.088x.
+   - Throughput improvement over parity-fix baseline: ~1.44x Metal speedup with parity retained.
+4. Post-optimization targeted gate:
+   `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb`
+   Result: pass (59 examples, 0 failures).
+5. Multi-instance ArcToGPU kernel plumbing validation:
+   - 20k-cycle parity probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Single-instance benchmark after wiring (`RHDL_BENCH_RISCV_METAL_INSTANCES=1`): IR 0.312s, Metal 3.718s, ratio 0.084x.
+   - Throughput mode benchmark (`RHDL_BENCH_RISCV_METAL_INSTANCES=8`): IR 0.306s, Metal 3.854s, single-instance ratio 0.079x, aggregate throughput 103775.5 cycles/s, aggregate ratio 0.636x.
+   - Host timing variance observed across reruns; compare ratios from same-run data points.
+
+Date: 2026-03-04
+
+6. Unified-memory optimization pass (RISC-V Metal wrapper + kernel):
+   - Change: use unified instruction/data GPU buffer in wrapper and skip duplicate instruction-memory stores in kernel when pointers alias.
+   - 20k-cycle parity probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Single-instance benchmark (`RHDL_BENCH_RISCV_METAL_INSTANCES=1`): IR 1.186s, Metal 3.753s, ratio 0.316x.
+   - Throughput mode benchmark (`RHDL_BENCH_RISCV_METAL_INSTANCES=8`): IR 1.115s, Metal 3.874s, single-instance ratio 0.288x, aggregate throughput 103248.1 cycles/s, aggregate ratio 2.303x.
+   - Initial caveat: switching instance count triggered a wrapper rebuild due to build-config invalidation.
+7. Follow-up kernel cleanup (kept) and failed experiments (reverted):
+   - Kept: local cycle counter in kernel (`io->cycles_ran` write once at dispatch end), parity retained.
+   - Reverted: thread-local state-slot staging for RISC-V kernel (parity retained but perf regressed).
+   - Reverted: dynamic threadgroup width tuning (`threadsPerThreadgroup > 1`) for this kernel; reduced throughput vs fixed `1`.
+   - Current validated numbers (same-run):
+     - Single-instance (`RHDL_BENCH_RISCV_METAL_INSTANCES=1`): IR 0.889s, Metal 3.670s, ratio 0.242x.
+     - Throughput mode (`RHDL_BENCH_RISCV_METAL_INSTANCES=8`): IR 0.881s, Metal 3.812s, single-instance ratio 0.231x, aggregate throughput 104938.2 cycles/s, aggregate ratio 1.849x (Metal aggregate faster than compiler).
+8. Runtime-instance wrapper path + kernel memory helper pass:
+   - Reverted experiment: enabling packed-wide RISC-V lowering (`state_scalar_bits=32`) fails today due to unsupported wide `comb.concat` in packed mode; kept 64-bit scalar mode.
+   - Kept: aligned fast-path helpers in generated Metal (`rhdl_read_word_le` / `rhdl_write_word_le`) and kernel use of `rhdl_write_word_le` for word stores.
+   - Kept: runtime instance resolution in wrapper via `RHDL_RISCV_METAL_INSTANCES_RUNTIME`; removed build-config instance invalidation.
+   - Verification: shared library mtime unchanged across `instances=1` then `instances=8` instantiation (`NO_REBUILD`).
+   - Parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - `metal_instances=1`: IR 0.645s, Metal 4.053s, ratio 0.159x.
+     - `metal_instances=8`: IR 0.537s, Metal 4.300s, single-instance ratio 0.125x, aggregate throughput 93020.5 cycles/s, aggregate ratio 1.000x.
+9. RISC-V eval/clock-path restructuring:
+   - Kept: low/high specialized loop eval functions with compact loop output struct (loop fields only), full output eval used only for dispatch-final output materialization.
+   - Kept: deduplicated Arc clock tracking slots by unique `clock_ref` (RISC-V core: reduced extra clock slots from 76 to 1; `state_count` now 4351 = 4350 state slots + 1 clock slot).
+   - Kept: hoisted clock edge computation to once per unique `clock_ref` in eval function and reused rising-edge predicate across state/memory update ops.
+   - Kept: comb-only low eval can skip internal clock-slot sync; kernel writes low clock slots directly before high eval.
+   - Parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - `metal_instances=1`: IR 1.223s, Metal 3.167s, ratio 0.386x.
+     - `metal_instances=8`: IR 1.266s, Metal 3.229s, single-instance ratio 0.392x, aggregate throughput 123880.4 cycles/s, aggregate ratio 3.136x.
+   - Build-config check: instance-count changes still avoid rebuild (`NO_REBUILD` mtime probe).
+10. Low-clock reset micro-optimization:
+   - Kept: emit direct low-clock reset store in kernel when deduplicated clock-slot count is 1 (avoids per-iteration loop overhead in RISC-V path).
+   - Parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - `metal_instances=1`: IR 1.033s, Metal 2.624s, ratio 0.394x.
+     - `metal_instances=8`: IR 0.882s, Metal 2.747s, single-instance ratio 0.321x, aggregate throughput 145611.0 cycles/s, aggregate ratio 2.568x.
+   - Targeted tests:
+     - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`: pass.
+     - `bundle exec rspec spec/examples/riscv/utilities/tasks/run_task_spec.rb`: pass (43 examples).
+     - `bundle exec rspec spec/rhdl/cli/tasks/benchmark_task_spec.rb`: pass (22 examples).
+     - `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb`: pass (11 examples).
+11. Packed-wide enablement + liveness split refinements:
+   - Kept: packed-wide RISC-V profile enabled (`pack_wide_scalars? => true`) after adding missing packed-wide lowering support (`comb.concat`, wide unsigned `comb.icmp`, wide helper coverage).
+   - Kept: comb-only liveness seed reduction (`emit_top_eval_function` no longer seeds sequential update refs when `update_state=false`), reducing low/full eval comb footprint.
+   - Kept: high-loop eval post-comb split uses dedicated output-focused liveness (`split_post_comb_liveness`) instead of reusing pre-comb graph.
+   - Re-validated parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - `metal_instances=1`: IR 0.924s, Metal 1.986s, ratio 0.466x.
+     - `metal_instances=8`: IR 0.878s, Metal 1.994s, single-instance ratio 0.440x, aggregate throughput 200569.2 cycles/s, aggregate ratio 3.523x.
+   - Re-tested and reverted (again): dynamic threadgroup width (`threadsPerThreadgroup > 1`) in wrapper; throughput regressed sharply (`~54.6k cycles/s` at 8 instances), so fixed `1` remains.
+   - Targeted tests:
+     - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`: pass.
+     - `bundle exec rspec spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`: pass (76 examples).
+12. Lazy dispatch-final full-eval path (kept):
+   - Kept: for `cycle_budget > 0`, RISC-V kernel no longer performs dispatch-final `full_eval`/output writeback; full eval remains for `budget == 0`.
+   - Kept: `MetalRunner#read_pc` and `MetalRunner#current_inst` now force `eval_cpu` on read, preserving observable debug output correctness after `run_cycles`.
+   - Parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - 50k cycles, `metal_instances=1`: IR 0.883s, Metal 1.778s, ratio 0.497x.
+     - 50k cycles, `metal_instances=8`: IR 0.415s, Metal 1.802s, single-instance ratio 0.230x, aggregate throughput 221981.1 cycles/s, aggregate ratio 1.844x.
+     - 200k cycles, `metal_instances=1`: IR 1.412s, Metal 7.201s, ratio 0.196x.
+     - 200k cycles, `metal_instances=8`: IR 1.403s, Metal 7.723s, single-instance ratio 0.182x, aggregate throughput 207167.1 cycles/s, aggregate ratio 1.453x.
+   - Targeted tests:
+     - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb`: pass (60 examples).
+     - `bundle exec rspec spec/rhdl/cli/tasks/benchmark_task_spec.rb`: pass (22 examples).
+13. Unified-memory kernel specialization + clock-edge A/B:
+   - Kept: RISC-V kernel specialized to unified memory contract (single `mem` pointer path; removed `!unified_mem` dual-write branches).
+   - Kept: split low/high loop output structs (`high` drops unused `data_we`/`data_wdata` fields) and PTW compare simplification (`!=` on aligned cached words).
+   - Tested and rejected (A/B): forcing `assume_rising_edges` for high eval was slightly slower in metal-only probe (`1.854s` vs `1.807s` at 50k), so high eval remains on generic rising-edge path.
+   - Re-validated parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - 50k cycles, `metal_instances=1`: IR 0.945s, Metal 1.864s, ratio 0.507x.
+     - 50k cycles, `metal_instances=8`: IR 1.003s, Metal 2.035s, single-instance ratio 0.493x, aggregate throughput 196590.0 cycles/s, aggregate ratio 3.945x.
+     - 200k cycles, `metal_instances=1`: IR 3.711s, Metal 7.057s, ratio 0.526x.
+     - 200k cycles, `metal_instances=8`: IR 3.768s, Metal 7.936s, single-instance ratio 0.475x, aggregate throughput 201616.3 cycles/s, aggregate ratio 3.798x.
+   - Targeted tests:
+     - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb`: pass (60 examples, occasional metal timeout pending on slow runs).
+14. Thread-local state shadow (reintroduced) + deterministic narrow scalar typing:
+   - Kept: RISC-V eval functions now use `thread` state pointers; kernel stages per-instance state into thread-local storage and writes back once per dispatch.
+   - A/B (metal-only, 50k): reverted device-state path ran at `1.804s`; thread-local path ran at `~1.699-1.712s` (kept).
+   - Kept: RISC-V profile now enables narrow scalar temporaries (`uchar`/`ushort` where safe) directly in lowering (`narrow_scalar_types? => true`) instead of relying on ad-hoc environment flags.
+   - A/B (metal-only, 50k): without narrow typing, the baseline on this branch was `~1.70s`; with narrow typing it is `~1.650-1.657s` (kept).
+   - Re-tested and rejected: forcing `assume_rising_edges` in high eval remains a regression (`1.870s` at 50k), so generic rising-edge path is retained.
+   - Re-validated parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - 50k cycles, `metal_instances=1`: IR 0.306s, Metal 1.652s, ratio 0.185x.
+     - 200k cycles, `metal_instances=1`: IR 1.228s, Metal 6.600s, ratio 0.186x.
+     - 200k cycles, `metal_instances=8`: IR 1.220s, Metal 6.594s, single-instance ratio 0.185x, aggregate throughput 242651.2 cycles/s, aggregate ratio 1.480x.
+   - Targeted tests:
+     - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb`: pass (60 examples).
+15. Arc-call flattening in RISC-V profile + additional micro-opts:
+   - Kept: RISC-V `post_parse_transform` now applies conservative Arc-call flattening (`flatten_simple_arc_calls`, `max_ops: 12`, `max_depth: 2`).
+   - Kept: state-memory index helper adds power-of-two fast path (`idx & (len - 1)`), preserving modulo fallback for non-power-of-two lengths.
+   - Re-validated parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - 50k cycles, `metal_instances=1`: IR 0.306s, Metal 1.592s, ratio 0.192x.
+     - 200k cycles, `metal_instances=1`: IR 1.236s, Metal 6.298s, ratio 0.196x.
+     - 200k cycles, `metal_instances=8`: IR 1.226s, Metal 6.405s, single-instance ratio 0.191x, aggregate throughput 249819.5 cycles/s, aggregate ratio 1.532x.
+   - Targeted tests:
+     - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`: pass (82 examples).
+16. More aggressive call flattening + masked-state trust path:
+   - Kept: RISC-V profile call flattening increased to `max_ops: 48`, `max_depth: 4` (from conservative settings), reducing helper call overhead in generated Metal.
+   - Kept: eval generator supports `trust_state_masks` for thread-local state paths; enabled in RISC-V low/high/full eval to avoid redundant scalar `rhdl_mask_bits` on hot state loads.
+   - Re-validated parity: 20k-cycle probe (`pc`, `x1`, `x2`, `x4`, `x5`, `mem[0x1000]`) with `metal_instances=1`: pass.
+   - Benchmarks (same-run, host-variable):
+     - 50k cycles, `metal_instances=1`: IR 0.305s, Metal 1.503s, ratio 0.203x.
+     - 200k cycles, `metal_instances=1`: IR 1.230s, Metal 5.989s, ratio 0.205x.
+     - 200k cycles, `metal_instances=8`: IR 1.222s, Metal 6.035s, single-instance ratio 0.203x, aggregate throughput 265133.8 cycles/s, aggregate ratio 1.620x.
+   - Targeted tests:
+     - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`: pass (82 examples).
+17. Experiments run and rejected (kept out of final path):
+   - Rejected: single-eval-per-cycle attempt (reusing high-step as next low-step) diverged on core parity (`x4`/`x5`/`mem[0x1000]` mismatch at 20k), reverted.
+   - Rejected: loading ordered state refs inside comb functions (`load_state_in_comb_fn` for RISC-V) regressed 50k metal-only runtime (`~1.52s` vs `~1.50s`), reverted for RISC-V.
+   - Rejected: shader compile `-O2` in RISC-V Metal runner regressed 50k metal-only runtime (`~1.58s`), restored `-O3`.
+   - Rejected: overly aggressive flattening (`max_ops: 192`, `max_depth: 8`) regressed 50k metal-only runtime (`~1.53s`), kept `96/6`.
+
+Open items:
+1. Continue reducing the remaining single-instance gap versus IR compiler on the core workload (latest same-run snapshots: ~0.203x at 50k and ~0.205x at 200k; host-variable).
+2. Add broader parity programs beyond the synthetic core loop to guard future performance passes.
diff --git a/prd/2026_03_04_cpu8bit_gem_gpu_prd.md b/prd/2026_03_04_cpu8bit_gem_gpu_prd.md
new file mode 100644
index 00000000..25908d23
--- /dev/null
+++ b/prd/2026_03_04_cpu8bit_gem_gpu_prd.md
@@ -0,0 +1,599 @@
+# CPU8bit GEM-to-GPU Integration PRD
+
+## Status
+In Progress (updated 2026-03-04; phases 1-7 complete, phase 8 in progress, phase 9 in progress, phases 10-14 planned)
+
+## Context
+We need a new GPU path for CPU8bit that does not depend on arcilator Arc lowering and can be analyzed/planned with GEM-style graph metadata. The existing `synth_to_gpu` path lowers HW/Synth MLIR to Metal, but it does not annotate partition/layer execution metadata. We also need the backend to be first-class in harness and benchmarks.
+
+A hard requirement remains in scope: GPU execution must maintain parity with the IR compiler path for complex 8-bit programs (Conway, Mandelbrot, etc.).
+
+Scope correction: phases 1-3 delivered GEM integration, parity validation, and baseline benchmarks. Full GEM-paper execution behavior is not complete yet. The remaining paper-aligned architecture work (VBP/VLIW execution model, dynamic scheduling, RepCut/merge partitioning, timing-aware boomerang repartitioning, and device-side synchronization) is tracked in phases 9-14 below.
+
+## Goals
+1. Add a `gem_gpu` backend for CPU8bit that builds via Synth->GPU lowering with GEM-style graph analysis metadata.
+2. Integrate `gem_gpu` into `FastHarness` and native benchmark task (`bench:native[cpu8bit,*]`).
+3. Add tests covering lowering metadata, harness backend behavior, and benchmark backend filtering.
+4. Validate complex-program parity against IR compiler path for CPU8bit.
+5. Record benchmark results for compiler vs ArcToGPU vs SynthToGPU vs GemGPU at `5k`, `50k`, and `500k` cycles.
+6. Implement execution-path changes aligned with the GEM paper: dependency-aware partition/layer scheduling, reduced synchronization, and throughput improvements.
+7. Add a GEM-style VBP instruction stream path (bit-level ops + compact control stream) rather than metadata-only planning.
+8. Implement two-level dynamic scheduling (partition-level dependency scheduling and intra-device work distribution).
+9. Implement paper-style partition optimization passes (RepCut-like partitioning and partition-merge refinement).
+10. Implement timing-aware repartitioning/floorplan heuristics (boomerang-style critical-path rebalancing) and measure single-instance impact.
+
+## Non-Goals
+1. Re-architecting Apple2 or RISC-V GPU paths.
+2. Upstream CIRCT/arcilator changes.
+3. Cross-vendor CUDA-specific runtime implementation in this PRD (this plan remains Metal-first).
+
+## Phased Plan
+
+### Phase 1: GEM Lowering Stage
+Red:
+1. Add failing lowering spec requiring GEM metadata in output JSON (`version`, `gem` stats).
+2. Add deterministic expectations for partition/layer stats on a fixed synth fixture.
+
+Green:
+1. Add `GemToGpuLowering` wrapper around `SynthToGpuLowering`.
+2. Implement deterministic synth graph analysis (`synth.aig.and_inv` dependency graph, depth/layer/partition metrics).
+3. Persist GEM metadata fields into lowering JSON artifact.
+
+Exit Criteria:
+1. Lowering emits `version=GemToGpuLoweringV1`.
+2. `gem` metadata block is present and deterministic across repeated lowering runs.
+
+### Phase 2: CPU8bit Runner/Harness/Benchmark Integration
+Red:
+1. Add failing harness tests for `FastHarness(sim: :gem_gpu)` runner behavior and error surfacing.
+2. Add failing benchmark task test for `RHDL_BENCH_BACKENDS=gem` alias mapping.
+
+Green:
+1. Wire `gem_gpu` pipeline option in CPU8bit Metal runner build path.
+2. Add `gem_gpu_status`/availability checks and `sim: :gem_gpu` initialization path in `FastHarness`.
+3. Add `gem_gpu` backend entry and `gem -> gem_gpu` filter alias in benchmark task.
+
+Exit Criteria:
+1. `FastHarness.new(sim: :gem_gpu)` works as a native runner backend.
+2. `bench:native[cpu8bit,*]` can run `GemGPU` and select it via backend filter.
+
+### Phase 3: Complex Parity + Benchmark Validation
+Red:
+1. Add failing complex parity checks (`gem_gpu` vs compiler) on Conway, Mandelbrot, and arithmetic loop workloads.
+
+Green:
+1. Implement/adjust test scaffolding for GEM parity checkpoints.
+2. Run targeted spec suite for lowering/harness/benchmark/parity.
+3. Run CPU8bit benchmark sweeps at `5k`, `50k`, and `500k` cycles.
+
+Exit Criteria:
+1. Complex parity tests pass for selected workloads.
+2. Benchmark table recorded with all four backends.
+
+### Phase 4: Partition/Layer-Driven GPU Execution
+Red:
+1. Add failing integration test that asserts GEM execution consumes partition/layer metadata (not metadata-only writeout).
+2. Add failing perf smoke test that demonstrates no throughput gain without schedule usage.
+
+Green:
+1. Generate execution schedule artifacts from `metadata['gem']` (partition DAG/layer plan).
+2. Update Metal wrapper/kernel invocation to execute by partition/layer plan.
+3. Keep cycle parity semantics identical to phase-3 behavior.
+
+Exit Criteria:
+1. `gem_gpu` runtime actively uses GEM schedule metadata.
+2. Parity remains green on complex programs.
+
+### Phase 5: Batch Synchronization Elimination
+Red:
+1. Add failing instrumentation test showing extra waits inside cycle batches.
+
+Green:
+1. Remove per-dispatch host waits during `runner_run_cycles`.
+2. Submit all work for a batch in one command-buffer timeline and sync once at batch end.
+3. Preserve deterministic IO visibility at documented sync boundaries.
+
+Exit Criteria:
+1. No host waits inside batch loop.
+2. Throughput improves at `50k`/`500k` compared to phase 3 baseline.
+
+### Phase 6: Kernel/Memory Path Tightening
+Red:
+1. Add failing tests for packed IO/state layout compatibility guards.
+
+Green:
+1. Introduce tighter state/temporary packing for hot path.
+2. Reduce redundant loads/stores across partition steps.
+3. Inline generated helpers where safe to reduce call overhead.
+
+Exit Criteria:
+1. Parity remains green.
+2. `gem_gpu` cycles/sec improves materially vs phase 5.
+
+### Phase 7: Multi-Instance Parallel Throughput Mode
+Red:
+1. Add failing benchmark harness test for N-instance batched API.
+
+Green:
+1. Add optional multi-instance execution mode (`N` independent CPU states per dispatch).
+2. Map instances to Metal threadgroups with deterministic per-instance memory windows.
+3. Add benchmark mode and reporting for single-instance vs multi-instance scaling.
+
+Exit Criteria:
+1. Functional parity for each instance.
+2. Throughput scaling demonstrated in benchmark logs.
+
+### Phase 8: Interim Baseline and Gap Freeze
+Red:
+1. Add failing acceptance benchmark thresholds for long runs.
+
+Green:
+1. Run full parity suite (complex programs) plus benchmark sweep (`5k`, `50k`, `500k`).
+2. Compare against compiler and ArcToGPU baselines.
+3. Freeze a measured baseline and map residual GEM-paper gaps to concrete implementation phases.
+
+Exit Criteria:
+1. Baseline parity/performance snapshot is recorded and reproducible.
+2. Residual GEM-paper gaps are fully enumerated and tracked in phases 9-14.
+
+### Phase 9: VBP/VLIW Execution Path
+Red:
+1. Add failing lowering spec requiring a serialized GEM instruction stream artifact (opcode stream + block boundaries + metadata version).
+2. Add failing runtime spec that asserts `gem_gpu` can execute through instruction-stream interpretation mode.
+
+Green:
+1. Add a lowering stage that maps synth AIG ops into a compact VBP-style instruction stream for GPU execution.
+2. Encode required primitive ops (boolean compute, state read/write, memory read/write, output materialization, control markers).
+3. Emit instruction-stream metadata and checksum for deterministic regeneration tests.
+
+Exit Criteria:
+1. End-to-end instruction-stream execution path exists and is selectable.
+2. Complex parity tests pass in instruction-stream mode.
+
+### Phase 10: Two-Level Dynamic Scheduler
+Red:
+1. Add failing scheduler unit tests for partition dependency readiness and deterministic fallback ordering.
+2. Add failing perf instrumentation test showing idle partitions/work starvation under static-order execution.
+
+Green:
+1. Build a runtime partition DAG scheduler with readiness queues.
+2. Add intra-device work distribution (work-stealing or equivalent queue-based balancing) across active workers.
+3. Keep a deterministic schedule mode for parity and regression tests.
+
+Exit Criteria:
+1. Runtime no longer relies on static partition order only.
+2. Parity remains green and scheduler metrics show reduced idle/blocked execution time.
+
+### Phase 11: RepCut + Partition Merge Optimization
+Red:
+1. Add failing partitioner tests for cut-size balancing and cross-partition edge minimization.
+2. Add failing metadata-regression test requiring improved partition metrics over fixed-size slicing.
+
+Green:
+1. Implement RepCut-like partition refinement using dependency/cut metrics.
+2. Implement partition merge pass to eliminate tiny/high-overhead partitions.
+3. Surface tunables and record selected defaults in metadata.
+
+Exit Criteria:
+1. Cross-partition edges and/or synchronization points are reduced on CPU8bit synth graphs.
+2. Parity remains green with optimized partition plans.
+
+### Phase 12: Timing-Aware Boomerang Repartitioning
+Red:
+1. Add failing tests for criticality scoring and boomerang-trigger conditions.
+2. Add failing benchmark assertion requiring single-instance speedup on long runs after timing-aware repartitioning.
+
+Green:
+1. Compute critical-path/timing-criticality metrics from graph depth + activity.
+2. Repartition timing-critical partitions with boomerang-style rebalance heuristics.
+3. Persist timing annotations and repartition decisions for inspection/regression tests.
+
+Exit Criteria:
+1. Timing-criticality metadata is emitted and tested.
+2. Single-instance `gem_gpu` throughput improves materially versus Phase 11 baseline.
+
+### Phase 13: Device-Side Sync + Global-State-Read Semantics
+Red:
+1. Add failing runtime test that detects command-buffer wait per dispatch inside batched execution.
+2. Add failing lowering/runtime tests requiring explicit global-state-read semantics in the execution stream.
+
+Green:
+1. Add global-state-read modeling in the execution stream and kernel-side consumption path.
+2. Remove per-dispatch host wait in batch execution; use single batch fence at defined host-visible boundaries.
+3. Add explicit device-side barriers/sync points tied to scheduler dependency boundaries.
+
+Exit Criteria:
+1. No per-dispatch host wait remains in batched execution path.
+2. Host-visible behavior and parity remain deterministic at documented synchronization boundaries.
+
+### Phase 14: Full GEM-Paper Closeout
+Red:
+1. Add failing acceptance gates for paper-feature presence (instruction stream, dynamic scheduler, repartitioning, timing-aware path, device-side sync).
+2. Add failing performance acceptance checks for single-instance and multi-instance benchmark targets.
+
+Green:
+1. Run full parity suite (complex programs) and benchmark sweep (`5k`, `50k`, `500k`, plus long-run target).
+2. Compare against compiler, ArcToGPU, and Phase 8 baseline snapshots.
+3. Document any remaining deviations from the paper and include measured rationale.
+
+Exit Criteria:
+1. All acceptance criteria are satisfied.
+2. PRD status can be moved to `Completed`.
+
+## Acceptance Criteria
+1. CPU8bit has a functional `gem_gpu` backend in lowering, harness, and benchmark surfaces.
+2. GEM metadata (`partition_count`, layer metrics, cross-partition edges, etc.) is emitted and tested.
+3. Complex 8-bit programs (Conway, Mandelbrot, arithmetic loop) pass parity against compiler backend.
+4. Benchmark results are captured for `5k`, `50k`, and `500k` cycles across compiler/ArcToGPU/SynthToGPU/GemGPU.
+5. GEM metadata is consumed by runtime scheduling (not metadata-only).
+6. Batch execution avoids in-loop host synchronization and shows improved long-run throughput.
+7. Instruction-stream/VBP execution path is implemented, tested, and parity-clean.
+8. Two-level dynamic scheduler is implemented with deterministic test mode.
+9. RepCut-style partition refinement and partition-merge optimization are implemented and measured.
+10. Timing-aware boomerang repartitioning is implemented with recorded critical-path metrics.
+11. Batched execution removes per-dispatch host waits and uses explicit device-side synchronization boundaries.
+12. Remaining gaps against GEM-paper behavior are either implemented or explicitly documented with measurements and justification.
+
+## Risks and Mitigations
+1. Risk: GEM analyzer parse drift from generated synth syntax.
+   Mitigation: parse both plain and parenthesized operand forms; lock behavior with deterministic spec fixture.
+2. Risk: Harness backend selection regressions.
+   Mitigation: dedicated `FastHarness(sim: :gem_gpu)` specs plus benchmark task alias test.
+3. Risk: Complex parity regressions hidden by microbench-only validation.
+   Mitigation: checkpoint parity tests on Conway/Mandelbrot/arithmetic loop.
+4. Risk: Removing synchronization may break host-visible IO semantics.
+   Mitigation: define explicit sync boundaries and validate against parity checkpoints.
+5. Risk: Multi-instance mode introduces memory aliasing bugs.
+   Mitigation: enforce per-instance address-space windows and add deterministic stress tests.
+6. Risk: Instruction-stream execution path diverges from existing eval semantics.
+   Mitigation: dual-path parity harness (`legacy eval` vs `instruction stream`) with checkpoint-by-checkpoint comparison.
+7. Risk: Dynamic scheduler introduces nondeterministic ordering bugs.
+   Mitigation: deterministic scheduler mode and seeded queue-order tests.
+8. Risk: Timing-aware repartitioning overfits one workload and regresses others.
+   Mitigation: evaluate across Conway/Mandelbrot/arithmetic workloads and retain fallback thresholds.
+9. Risk: RepCut/merge cost may increase compile/lowering time excessively.
+   Mitigation: cache partition plans and gate expensive passes behind measurable perf wins.
+
+## Benchmark Evidence Log
+### Phase 3 CPU8bit Benchmark (`RHDL_BENCH_BACKENDS=compiler,arc_to_gpu,synth_to_gpu,gem_gpu`)
+Command set:
+1. `bundle exec rake "bench:native[cpu8bit,5000]"`
+2. `bundle exec rake "bench:native[cpu8bit,50000]"`
+3. `bundle exec rake "bench:native[cpu8bit,500000]"`
+
+`5,000` cycles:
+1. Compiler: `0.075s` (`66,667 cycles/s`)
+2. ArcToGPU: `0.029s` (`172,414 cycles/s`, `2.542x` vs compiler)
+3. SynthToGPU: `0.144s` (`34,722 cycles/s`, `0.520x`)
+4. GemGPU: `0.128s` (`39,063 cycles/s`, `0.585x`)
+
+`50,000` cycles:
+1. Compiler: `0.703s` (`71,124 cycles/s`)
+2. ArcToGPU: `0.151s` (`331,126 cycles/s`, `4.665x` vs compiler)
+3. SynthToGPU: `1.110s` (`45,045 cycles/s`, `0.633x`)
+4. GemGPU: `1.109s` (`45,086 cycles/s`, `0.633x`)
+
+`500,000` cycles:
+1. Compiler: `7.103s` (`70,393 cycles/s`)
+2. ArcToGPU: `1.239s` (`403,551 cycles/s`, `5.735x` vs compiler)
+3. SynthToGPU: `11.038s` (`45,298 cycles/s`, `0.644x`)
+4. GemGPU: `11.026s` (`45,347 cycles/s`, `0.644x`)
+
+### Phase 3 Complex Parity Evidence
+Command:
+1. `bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+
+Result:
+1. `3 examples, 0 failures`.
+2. Workloads validated:
+   - Conway glider 80x24 checkpoints
+   - Mandelbrot 80x24 checkpoints
+   - long-running arithmetic loop checkpoints
+
+### Phase 4 Runtime-Schedule Consumption Evidence
+Commands:
+1. `bundle exec rspec spec/examples/8bit/utilities/runners/synth_to_gpu_runner_spec.rb spec/rhdl/codegen/firrtl/gem_to_gpu_lowering_spec.rb`
+2. `bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+3. `bundle exec rake "bench:native[cpu8bit,50000]"`
+4. `bundle exec rake "bench:native[cpu8bit,500000]"`
+
+Results:
+1. GEM metadata now includes execution plan fields:
+   - `gem.execution.schedule_version=GemExecutionPlanV1`
+   - `gem.execution.partition_order`
+   - `gem.execution.layer_count`
+   - `gem.execution.dispatch_cycle_granularity`
+2. The generated `gem_gpu` wrapper consumes GEM schedule metadata and runs a chunked dispatch loop in `sim_runner_run_cycles`.
+3. Parity remained green: `3 examples, 0 failures` on complex workloads.
+4. Benchmark snapshots after phase 4:
+   - `50,000` cycles:
+     - Compiler: `0.640s` (`78,125 cycles/s`)
+     - ArcToGPU: `0.148s` (`337,838 cycles/s`, `4.326x`)
+     - SynthToGPU: `1.120s` (`44,643 cycles/s`, `0.571x`)
+     - GemGPU: `1.104s` (`45,290 cycles/s`, `0.580x`)
+   - `500,000` cycles:
+     - Compiler: `6.621s` (`75,517 cycles/s`)
+     - ArcToGPU: `1.212s` (`412,541 cycles/s`, `5.460x`)
+     - SynthToGPU: `10.978s` (`45,545 cycles/s`, `0.603x`)
+     - GemGPU: `10.973s` (`45,567 cycles/s`, `0.603x`)
+
+### Phase 5 Host-Side Batch Sync Elimination Evidence
+Commands:
+1. `bundle exec rspec spec/examples/8bit/hdl/cpu/fast_harness_gem_gpu_spec.rb spec/examples/8bit/hdl/cpu/fast_harness_synth_to_gpu_spec.rb spec/examples/8bit/hdl/cpu/fast_harness_arcilator_gpu_spec.rb`
+2. `bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+3. `bundle exec rake "bench:native[cpu8bit,50000]"`
+4. `bundle exec rake "bench:native[cpu8bit,500000]"`
+
+Results:
+1. `FastHarness#run_cycles` now issues one external `runner_run_cycles(n)` call for native runner backends (host-side batch loop removed).
+2. Added regression assertion: `run_cycles(100, batch_size: 16)` calls `runner_run_cycles(100, 0, false)` once and does not issue `runner_run_cycles(16, ...)`.
+3. Complex parity remains green: `3 examples, 0 failures`.
+4. Benchmark snapshots after phase 5:
+   - `50,000` cycles:
+     - Compiler: `0.666s` (`75,075 cycles/s`)
+     - ArcToGPU: `0.136s` (`367,647 cycles/s`, `4.912x`)
+     - SynthToGPU: `1.102s` (`45,372 cycles/s`, `0.604x`)
+     - GemGPU: `1.077s` (`46,426 cycles/s`, `0.618x`)
+   - `500,000` cycles:
+     - Compiler: `6.580s` (`75,988 cycles/s`)
+     - ArcToGPU: `1.179s` (`424,088 cycles/s`, `5.581x`)
+     - SynthToGPU: `10.779s` (`46,387 cycles/s`, `0.610x`)
+     - GemGPU: `10.745s` (`46,533 cycles/s`, `0.612x`)
+
+### Phase 6 Kernel/Memory Path Tightening Evidence
+Commands:
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/gem_to_gpu_lowering_spec.rb spec/examples/8bit/utilities/runners/synth_to_gpu_runner_spec.rb spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+2. `bundle exec rake "bench:native[cpu8bit,5000]"`
+3. `bundle exec rake "bench:native[cpu8bit,50000]"`
+4. `bundle exec rake "bench:native[cpu8bit,500000]"`
+
+Implementation details:
+1. CPU8bit profile now enables aggressive eval generation:
+   - `use_state_snapshot: false`
+   - `split_post_comb_liveness: true`
+   - `trust_state_masks: true`
+   - `load_state_in_comb_fn: true`
+   - `eval_always_inline: true`
+   - `schedule_aware_emit: true`
+2. Lowering spec now asserts these runtime-shape properties in generated Metal:
+   - no `state_old_*` snapshot temporaries,
+   - `always_inline` on `eval_cpu8bit`,
+   - presence of split post-comb helper with `state_slots` argument.
+
+Results:
+1. Parity remained green: `3 examples, 0 failures` on complex workloads.
+2. Benchmarks after phase 6:
+   - `5,000` cycles:
+     - Compiler: `0.172s` (`29,070 cycles/s`)
+     - ArcToGPU: `0.014s` (`357,143 cycles/s`, `12.326x`)
+     - SynthToGPU: `0.105s` (`47,619 cycles/s`, `1.640x`)
+     - GemGPU: `0.079s` (`63,291 cycles/s`, `2.184x`)
+   - `50,000` cycles:
+     - Compiler: `1.328s` (`37,651 cycles/s`)
+     - ArcToGPU: `0.107s` (`467,290 cycles/s`, `12.463x`)
+     - SynthToGPU: `0.786s` (`63,613 cycles/s`, `1.690x`)
+     - GemGPU: `0.787s` (`63,532 cycles/s`, `1.687x`)
+   - `500,000` cycles:
+     - Compiler: `14.330s` (`34,893 cycles/s`)
+     - ArcToGPU: `0.930s` (`537,634 cycles/s`, `15.408x`)
+     - SynthToGPU: `7.842s` (`63,759 cycles/s`, `1.827x`)
+     - GemGPU: `7.811s` (`64,013 cycles/s`, `1.835x`)
+
+### Phase 7 Multi-Instance Parallel Throughput Evidence
+Commands:
+1. `bundle exec rspec spec/examples/8bit/utilities/runners/synth_to_gpu_runner_spec.rb spec/examples/8bit/hdl/cpu/fast_harness_gem_gpu_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`
+2. `RHDL_CPU8BIT_GPU_INSTANCES=8 bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+3. `RHDL_CPU8BIT_GPU_INSTANCES=1 RHDL_BENCH_BACKENDS=gem_gpu bundle exec rake "bench:native[cpu8bit,500000]"`
+4. `RHDL_CPU8BIT_GPU_INSTANCES=8 RHDL_BENCH_BACKENDS=compiler,gem_gpu bundle exec rake "bench:native[cpu8bit,500000]"`
+5. `RHDL_CPU8BIT_GPU_INSTANCES=16 RHDL_BENCH_BACKENDS=gem_gpu bundle exec rake "bench:native[cpu8bit,500000]"`
+
+Implementation details:
+1. CPU8bit kernel emitter now uses `tid`-indexed slices for state/memory/io in generic `emit_kernel`:
+   - `state_slots = all_state_slots + (tid * state_slot_count)`
+   - `memory = all_memory + (tid * 65536)`
+   - `io = all_io + tid`
+2. Runner wrapper now allocates buffers for `INSTANCE_COUNT` and dispatches `INSTANCE_COUNT` threads per call.
+3. Runner API exposes `runner_parallel_instances`.
+4. `FastHarness` exposes `parallel_instances`.
+5. Benchmark output now reports `Instances` and `Effective` cycles/s, plus `Effective Performance Ratios` when multi-instance mode is active.
+
+Results:
+1. Targeted specs remain green (`39 examples, 0 failures`).
+2. Complex parity remains green with `RHDL_CPU8BIT_GPU_INSTANCES=8` (`3 examples, 0 failures`).
+3. Multi-instance throughput scaling (GemGPU, `500,000` cycles):
+   - `instances=1`: `8.724s` (`57,313 cycles/s`)
+   - `instances=8`: `8.736s` raw (`57,234 cycles/s`), effective `457,882 cycles/s`
+   - `instances=16`: `8.748s` raw (`57,155 cycles/s`), effective `914,503 cycles/s`
+4. With compiler baseline (`instances=8`, `500,000` cycles):
+   - Compiler: `6.503s` (`76,888 cycles/s`)
+   - GemGPU raw: `0.744x` vs compiler
+   - GemGPU effective: `5.955x` vs compiler
+
+### Phase 8 Interim Baseline Evidence (In Progress)
+Commands:
+1. `bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+2. `RHDL_CPU8BIT_GPU_INSTANCES=8 bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+3. `RHDL_BENCH_BACKENDS=compiler,arc_to_gpu,synth_to_gpu,gem_gpu bundle exec rake "bench:native[cpu8bit,5000]"`
+4. `RHDL_BENCH_BACKENDS=compiler,arc_to_gpu,synth_to_gpu,gem_gpu bundle exec rake "bench:native[cpu8bit,50000]"`
+5. `RHDL_BENCH_BACKENDS=compiler,arc_to_gpu,synth_to_gpu,gem_gpu bundle exec rake "bench:native[cpu8bit,500000]"`
+6. `RHDL_CPU8BIT_GPU_INSTANCES=8 RHDL_BENCH_BACKENDS=compiler,gem_gpu bundle exec rake "bench:native[cpu8bit,5000]"`
+7. `RHDL_CPU8BIT_GPU_INSTANCES=8 RHDL_BENCH_BACKENDS=compiler,gem_gpu bundle exec rake "bench:native[cpu8bit,50000]"`
+8. `RHDL_CPU8BIT_GPU_INSTANCES=8 RHDL_BENCH_BACKENDS=compiler,gem_gpu bundle exec rake "bench:native[cpu8bit,500000]"`
+
+Results snapshot:
+1. Parity remains green in both modes:
+   - `instances=1`: `3 examples, 0 failures`
+   - `instances=8`: `3 examples, 0 failures`
+2. Single-instance benchmark (`instances=1`):
+   - `5,000`: Compiler `0.070s`, ArcToGPU `0.031s`, SynthToGPU `0.105s`, GemGPU `0.087s`
+   - `50,000`: Compiler `0.662s`, ArcToGPU `0.116s`, SynthToGPU `0.868s`, GemGPU `0.869s`
+   - `500,000`: Compiler `6.525s`, ArcToGPU `0.973s`, SynthToGPU `8.680s`, GemGPU `8.681s`
+3. Multi-instance (`instances=8`) effective scaling:
+   - `5,000`: GemGPU effective `385,721 cycles/s` (`5.652x` vs compiler)
+   - `50,000`: GemGPU effective `451,529 cycles/s` (`6.010x` vs compiler)
+   - `500,000`: GemGPU effective `457,826 cycles/s` (`6.092x` vs compiler)
+
+Residual gap for final closeout:
+1. Single-instance GemGPU is still below compiler on this snapshot (`~0.75x` at `50k/500k`).
+2. Multi-instance mode provides the intended throughput win (`~6x effective` vs compiler); this baseline defines targets for phases 9-14.
+
+### Phase 9 VBP/VLIW Instruction-Stream Scaffolding (In Progress)
+Commands:
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/gem_to_gpu_lowering_spec.rb spec/examples/8bit/utilities/runners/synth_to_gpu_runner_spec.rb`
+2. `bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_instruction_stream_parity_spec.rb`
+3. `bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb`
+4. `RHDL_BENCH_BACKENDS=compiler,gem_gpu bundle exec rake "bench:native[cpu8bit,50000]"`
+
+Results:
+1. GEM metadata now emits deterministic instruction-stream artifacts:
+   - `gem.instruction_stream.version=GemInstructionStreamV1`
+   - `instruction_count`, `block_boundaries`, `extern_refs`, `instructions`, `primitive_counts`, `control_program`, and `checksum_sha256`
+   - execution metadata now includes partition dependency shape:
+     - `gem.partition_dependency_edges`
+     - `gem.execution.partition_dependency_edge_count`
+     - `gem.execution.ready_layer_count`
+     - `gem.execution.ready_layers`
+2. `gem_gpu` runner/wrapper now runs the instruction-stream path by default.
+3. Runner instruction-stream tuning now consumes stream/plan structure:
+   - `GEM_INSTRUCTION_COUNT`
+   - `GEM_STATE_READ_COUNT`
+   - `GEM_CONTROL_STEP_COUNT`
+   - `GEM_DEPENDENCY_EDGE_COUNT`
+   - `GEM_READY_LAYER_COUNT`
+   to derive chunk scale in stream mode.
+4. GEM stream mode enables kernel-side control-program interpretation during lowering, emitting Metal control-op switch loop markers:
+   - `constexpr ushort kGemControlOps[7]`
+   - `switch (op)` over control micro-ops.
+   - Kernel now binds instruction stream buffer (`[[buffer(3)]]`) and executes per-node `and_inv` shadow interpretation through `rhdl_gem_execute_shadow`.
+5. Stream mode now includes deterministic ready-layer scheduling in `sim_runner_run_cycles` when dependency metadata is present (`GEM_SCHEDULER_MODE`), splitting chunk execution across `GEM_READY_LAYER_COUNT`.
+   - Wrapper now embeds topology arrays:
+     - `GEM_READY_LAYER_OFFSETS[]`
+     - `GEM_READY_LAYER_PARTITIONS[]`
+   - Wrapper now embeds packed instruction stream words:
+     - `GEM_INSTRUCTION_WORD_COUNT`
+     - `GEM_INSTRUCTION_WORDS[]`
+   - Dynamic scheduler can be toggled with `RHDL_GEM_GPU_DYNAMIC_SCHEDULER=0/1`.
+   - Layer budgets are weighted by per-layer partition counts.
+6. Instruction-stream mode parity is green:
+   - arithmetic parity spec: `1 example, 0 failures`
+   - complex parity suite (Conway/Mandelbrot/arithmetic): `3 examples, 0 failures`
+7. Instruction payload ABI now matches kernel parser layout:
+   - `[instruction_count, flags, instruction_words..., watch_count, watch_words...]`
+   - runner default empty payload now materializes as `{ 0u, 0u, 0u, 0u, 0u, 0u }` to avoid out-of-bounds watch/control/extern reads.
+8. Lowering now emits deterministic output-watch metadata in stream artifacts:
+   - `instruction_stream.output_watch_names`
+   - `instruction_stream.output_watch_sources`
+   for `mem_write_en`, `mem_read_en`, `halted`, `zero_flag_out`.
+9. Output-watch override is now explicit/opt-in (`instruction_stream.output_watch_override=true`) instead of auto-enabled, preserving parity in default stream mode.
+10. Fast-default scheduler behavior:
+   - single-instance (`instances=1`): dynamic ready-layer scheduler defaults to disabled unless explicitly enabled.
+   - multi-instance (`instances>1`): dynamic scheduler remains enabled by default.
+11. Stream payload now carries serialized control program ops and kernel consumes them:
+   - runner appends control tail: `[... watch_count, watch_words..., control_count, control_ops...]`
+   - interpreter kernel now reads `control_count/control_ops` and falls back to static 7-op sequence only when absent.
+   - this removes hard-wiring of control order and is a direct step toward fully stream-driven execution.
+12. Instruction-stream benchmark snapshots (`50,000` cycles, latest runs):
+   - default (`instances=1`):
+     - Compiler run: `0.636s`
+     - GemGPU run: `0.906s`
+     - GemGPU vs compiler: `0.702x`
+   - multi-instance (`instances=8`):
+     - Compiler run: `0.695s`
+     - GemGPU run: `0.960s`
+     - GemGPU raw vs compiler: `0.724x`
+     - GemGPU effective vs compiler: `5.793x`
+13. Kernel now pre-decodes serialized control ops into thread-local storage once per dispatch:
+   - removes per-cycle global-memory reads of control op words in the hot loop.
+   - parity remains green on arithmetic + complex workloads.
+14. Stream payload now includes extern reference table and kernel-side extern-table decode mode:
+   - metadata emits `extern_ref_kinds` and `extern_ref_values`.
+   - runner appends extern tail: `[... control_count, control_ops..., extern_count, extern_values...]`.
+   - kernel decode now supports table-backed extern values when `gem_flags & 0x4` is set (fallback constant decode retained).
+15. Stream payload now includes runtime extern-source descriptors and kernel descriptor decode mode:
+   - metadata emits `extern_sources` entries (`const`, `state_bit`, `io_bit`, `unknown`) by analyzing synth `comb.extract` definitions against state/input layouts.
+   - runner appends descriptor tail:
+     `[... extern_count, extern_values..., extern_desc_count, extern_desc_words...]`.
+   - kernel decode now supports descriptor-backed extern resolution when `gem_flags & 0x8` is set:
+     - state bit extraction from `state_slots`
+     - IO bit extraction from `RhdlArcGpuIo`.
+16. Updated instruction-stream benchmark snapshots (`50,000` cycles, latest runs after control predecode + extern table/descriptor mode):
+   - default (`instances=1`):
+     - Compiler run: `0.685s`
+     - GemGPU run: `0.904s`
+     - GemGPU vs compiler: `0.758x`
+   - multi-instance (`instances=8`):
+     - Compiler run: `0.697s`
+     - GemGPU run: `0.973s`
+     - GemGPU raw vs compiler: `0.716x`
+     - GemGPU effective vs compiler: `5.726x`
+17. Build artifact cache-key now includes GEM compile-time toggles:
+   - `execution_mode`, dynamic scheduler setting, and output-watch-override flag are encoded in the `build_dir` suffix.
+   - prevents cross-run contamination when switching env-driven stream modes in the same workspace.
+18. Output-watch override mode is now cycle-correct when enabled:
+   - shadow stream execution runs on every cycle when the override flag is set (instead of only on cycle 0).
+   - to preserve loop progress correctness, override currently applies to:
+     - `mem_write_en`
+     - `mem_read_en`
+     - `zero_flag_out`
+     and leaves `halted` authoritative from eval path.
+19. Override-mode cost snapshot (`5,000` cycles):
+   - Compiler: `0.076s`
+   - GemGPU (`RHDL_GEM_GPU_OUTPUT_WATCH_OVERRIDE=1`): `0.341s`
+   - ratio: `0.222x`
+   - indicates current per-cycle shadow execution is correctness scaffolding, not a fast default path.
+20. Shadow execution now precomputes extern values once per cycle into a thread-local cache (`kGemExternValueCap`) and decode reads from cache first:
+   - reduces repeated descriptor/table decode overhead per source evaluation.
+   - keeps parity green in both default and override modes.
+21. Output-watch override now uses metadata-driven subset execution when available:
+   - lowering emits `watch_eval_indices` closure for watch-driven node dependencies.
+   - runner serializes watch-eval index tail:
+     `[... extern_desc_count, extern_desc_words..., watch_eval_count, watch_eval_indices...]`.
+   - kernel executes only this subset when override mode is active (fallback to full stream otherwise).
+   - improved override benchmark from `7.557s` to `0.341s` at `5,000` cycles on latest snapshot.
+22. Remaining phase-9 gap: kernel-side stream interpretation currently runs as shadow execution and does not yet drive architectural outputs/state updates directly (eval path still authoritative).
+
+## Implementation Checklist
+- [x] Phase 1 red tests added.
+- [x] Phase 1 green implementation completed.
+- [x] Phase 1 exit criteria met.
+- [x] Phase 2 red tests added.
+- [x] Phase 2 green implementation completed.
+- [x] Phase 2 exit criteria met.
+- [x] Phase 3 red tests added.
+- [x] Phase 3 green implementation completed.
+- [x] Phase 3 benchmarks recorded.
+- [x] Phase 3 parity checks recorded.
+- [x] Phase 4 red tests added.
+- [x] Phase 4 green implementation completed.
+- [x] Phase 4 exit criteria met.
+- [x] Phase 5 red tests added.
+- [x] Phase 5 green implementation completed.
+- [x] Phase 5 exit criteria met.
+- [x] Phase 6 red tests added.
+- [x] Phase 6 green implementation completed.
+- [x] Phase 6 exit criteria met.
+- [x] Phase 7 red tests added.
+- [x] Phase 7 green implementation completed.
+- [x] Phase 7 exit criteria met.
+- [ ] Phase 8 red tests added.
+- [ ] Phase 8 green implementation completed.
+- [ ] Phase 8 exit criteria met.
+- [x] Phase 9 red tests added.
+- [ ] Phase 9 green implementation completed.
+- [ ] Phase 9 exit criteria met.
+- [ ] Phase 10 red tests added.
+- [ ] Phase 10 green implementation completed.
+- [ ] Phase 10 exit criteria met.
+- [ ] Phase 11 red tests added.
+- [ ] Phase 11 green implementation completed.
+- [ ] Phase 11 exit criteria met.
+- [ ] Phase 12 red tests added.
+- [ ] Phase 12 green implementation completed.
+- [ ] Phase 12 exit criteria met.
+- [ ] Phase 13 red tests added.
+- [ ] Phase 13 green implementation completed.
+- [ ] Phase 13 exit criteria met.
+- [ ] Phase 14 red tests added.
+- [ ] Phase 14 green implementation completed.
+- [ ] Phase 14 exit criteria met.
+- [ ] PRD status updated to Completed.
diff --git a/prd/2026_03_04_riscv_arctogpu_metal_perf_phase2_prd.md b/prd/2026_03_04_riscv_arctogpu_metal_perf_phase2_prd.md
new file mode 100644
index 00000000..3921f4aa
--- /dev/null
+++ b/prd/2026_03_04_riscv_arctogpu_metal_perf_phase2_prd.md
@@ -0,0 +1,846 @@
+# RISC-V ArcToGPU Metal Performance Phase 2 PRD
+
+**Status:** In Progress (2026-03-04)  
+**Date:** 2026-03-04
+
+## Context
+
+`prd/2026_03_03_riscv_arctogpu_metal_runner_prd.md` established a working RISC-V ArcToGPU Metal path with core-workload parity and meaningful aggregate throughput in multi-instance mode. The remaining issue is the single-instance performance gap versus IR compiled execution on long runs.
+
+The latest validated snapshots in the prior PRD still show single-instance Metal behind compiler on the synthetic core workload (for example, around `0.203x` at 50k cycles and `0.205x` at 200k cycles in same-run comparisons), even though multi-instance aggregate throughput can exceed compiler.
+
+This follow-on PRD defines the next optimization wave in explicit phases, with hard red/green gates and benchmark evidence at 5k / 50k / 500k cycles.
+
+## Goals
+
+1. Reduce single-instance RISC-V Metal runtime overhead on the core workload.
+2. Preserve cycle parity and externally visible runner semantics.
+3. Eliminate avoidable host/device synchronization and hot-loop external I/O in batch execution.
+4. Reduce generated kernel work by removing dead helpers/temporaries and minimizing state traffic.
+5. Improve multi-instance throughput scaling without regressing single-instance behavior.
+6. Produce repeatable benchmark evidence (`cycles/s` and ratio vs compiler) at 5k, 50k, and 500k.
+
+## Non-Goals
+
+1. Upstreaming ArcToGPU changes to CIRCT in this phase.
+2. Full xv6/Linux MMIO parity in Metal mode.
+3. Replacing the existing RISC-V architecture model or IR compiler backend.
+4. Broad benchmark redesign outside targeted RISC-V core workload/perf harness changes.
+
+## Phased Plan
+
+### Phase 0: Baseline + Measurement Harness Hardening
+
+**Red:** No locked baseline for 5k/50k/500k with explicit sync/dispatch assumptions.  
+**Green:** Baseline measurements and instrumentation are captured, with a clearly defined “one dispatch per batch” execution contract.
+
+Exit criteria:
+1. Record baseline compiler vs Metal timings (`instances=1` and throughput mode) at 5k, 50k, and 500k.
+2. Record current dispatch/wait behavior in wrapper and benchmark docs.
+3. Add/extend a benchmark guard check that fails if batching contract regresses (unexpected extra dispatch/sync in a single `run_cycles` batch).
+
+### Phase 1: Host Sync And Dispatch Contract Tightening
+
+**Red:** Batch execution still performs unnecessary host-visible sync/read/write operations in hot paths.  
+**Green:** For `run_cycles(N)` with `N > 0`, hot-loop execution performs no intermediate host syncs; sync/wait is deferred to dispatch completion only.
+
+Exit criteria:
+1. No host waits inside per-cycle loop logic in native wrapper path.
+2. Exactly one kernel dispatch per contiguous `run_cycles` batch in benchmark flow.
+3. Hot-loop external writes (debug/output materialization) occur only when required by API semantics.
+4. RISC-V parity checks remain green.
+
+### Phase 2: State Traffic Reduction (Hot/Cold Split + Selective Snapshotting)
+
+**Red:** Kernel loop and eval paths still snapshot/copy more state than required by live update/output cones.  
+**Green:** State staging/snapshotting is liveness-driven and split by access pattern.
+
+Exit criteria:
+1. Selective `comb_pre` snapshotting includes only refs required by update logic and required outputs.
+2. Hot/cold state strategy is implemented (hot refs local/thread-cached, cold refs direct as needed) with a profile-safe fallback.
+3. State copy loops and loop-time state reads/writes are reduced and measured.
+4. Parity remains green for targeted RISC-V checks.
+
+### Phase 3: Emitted-Code Cleanup (Liveness DCE + Array Peepholes + Helper Reachability DCE)
+
+**Red:** Generated Metal includes dead helper graph, avoidable temporaries, and pattern-generated allocas in hot eval paths.  
+**Green:** Emission is pruned to live work only, and common temporary-heavy patterns are folded away.
+
+Exit criteria:
+1. Liveness-driven comb emission only materializes refs needed for updates/required outputs.
+2. Peepholes fold `array_get(array_create(...), idx)` and `array_get(aggregate_constant(...), idx)` when statically safe.
+3. Reachability-based helper DCE removes unused generated helper functions.
+4. Generated-source size and/or compiler warnings for unused helpers drop measurably.
+
+### Phase 4: Call/Inline Policy Retuning (Performance-Safe Flattening)
+
+**Red:** Inline and flattening behavior is either too conservative in hot paths or too aggressive globally, causing regressions.  
+**Green:** Inline/flatten policy is profile-tuned by function size/use, improving runtime without parity risk.
+
+Exit criteria:
+1. Generated helper annotations distinguish tiny/hot helpers from large eval functions (avoid blanket `always_inline` on large bodies).
+2. Flattening thresholds are re-tuned with bounded caps and A/B evidence.
+3. No regression in shader compile reliability for RISC-V Metal path.
+4. Targeted perf probes show non-negative delta versus Phase 3.
+
+### Phase 5: Multi-Instance And Threadgroup Scaling Pass
+
+**Red:** Instance scaling exists but threadgroup/mapping strategy remains fixed and under-tuned for current kernels.  
+**Green:** Throughput path is tuned for instance-level parallelism with explicit no-regression guard for `instances=1`.
+
+Exit criteria:
+1. Threadgroup sizing/mapping policy is explicit and benchmarked (including rejected variants if they regress).
+2. `instances > 1` path improves aggregate `cycles/s` on at least one long-run point (50k or 500k).
+3. `instances=1` performance does not regress beyond agreed tolerance (<=5% from prior phase median on same host/run conditions).
+
+### Phase 6: Validation, Documentation, And Completion Gate
+
+**Red:** Optimizations exist without consolidated parity/perf evidence and completion criteria closure.  
+**Green:** PRD includes full evidence for parity + performance gates and checklist/status reflects real completion.
+
+Exit criteria:
+1. Targeted specs for lowering/runner/benchmark wiring pass.
+2. Parity checks for RISC-V Metal vs compiler on core workload pass at required checkpoints.
+3. Benchmark table for 5k / 50k / 500k includes compiler vs Metal (`instances=1`, and throughput mode where enabled) with `cycles/s` and ratio.
+4. PRD status is updated to `Completed (date)` only when all phase exit criteria are satisfied.
+
+### Phase 7: Kernel Variant Specialization (Invariant Inputs)
+
+**Red:** Core benchmark executes fully generic kernel path even when selected top-level inputs are invariant for workload duration.  
+**Green:** Metal core workload supports an explicit specialization mode that pins safe invariant inputs and produces a measurable speedup with parity intact.
+
+Exit criteria:
+1. Add a benchmark-visible toggle for core specialization (`on/off`) with explicit reporting.
+2. Add ArcToGPU lowering support for safe invariant input pinning in RISC-V profile/kernel emission.
+3. Ensure build artifacts are invalidated when specialization toggles change (no stale shader A/B).
+4. Re-run parity/perf probes demonstrating no functional regression and non-negative runtime delta.
+
+### Phase 8: Fast/Slow Kernel Split
+
+**Red:** One monolithic kernel handles both common-case and rare-case logic, inflating hot-path instruction count and branch pressure.  
+**Green:** Dispatch path supports separate fast and fallback kernels, with runtime selection based on explicit preconditions.
+
+Exit criteria:
+1. Introduce a fast kernel constrained to common-case conditions and a correctness-preserving fallback.
+2. Add dispatch-time gating logic and instrumentation to report fast-path hit rate.
+3. Prove parity with fallback coverage for rare cases.
+4. Show net runtime improvement on core workload at 50k/500k.
+
+### Phase 9: Dirty-Cone / Event-Driven Execution
+
+**Red:** Every cycle recomputes broad combinational regions even when active signal cone is small.  
+**Green:** Generated path supports dirty propagation and evaluates only impacted cones per cycle segment.
+
+Exit criteria:
+1. Build dependency/fanout index for refs used in cycle update path.
+2. Add dirty-set propagation and selective op evaluation.
+3. Validate parity under targeted regressions and benchmark programs.
+4. Demonstrate reduced per-cycle work and measurable speedup.
+
+### Phase 10: Scheduled Dataflow Emitter
+
+**Red:** Current codegen emits mostly linear op text with limited scheduling structure for backend optimization.  
+**Green:** RISC-V path has a schedule-aware emitter mode (levelized blocks and explicit phase regions) that can target GPU execution patterns.
+
+Exit criteria:
+1. Add schedule extraction/lowering structure alongside existing emitter (profile-gated).
+2. Preserve semantics and fallback to legacy emitter when disabled.
+3. Validate with existing lowering/runner tests.
+4. Use schedule metadata as the basis for later parallel execution work.
+
+### Phase 11: Single-Instance Intra-Kernel Parallelism
+
+**Red:** Single-instance path is effectively single-threaded per simulated core, leaving significant GPU compute underutilized.  
+**Green:** Single-core simulation can partition combinational work across lanes in a threadgroup with synchronization barriers and deterministic ordering.
+
+Exit criteria:
+1. Define a partition strategy from Arc schedule metadata (work chunks + barrier points).
+2. Implement multi-lane execution path for at least one stable phase region.
+3. Validate parity against compiler/interpreter checkpoints.
+4. Demonstrate meaningful single-instance speedup versus prior phase.
+
+## Acceptance Criteria
+
+1. All phases execute with explicit red/green evidence and checklist updates.
+2. No cycle-parity regressions are introduced in targeted RISC-V Metal parity checks.
+3. Single-instance Metal performance improves by at least `1.5x` versus this PRD’s Phase 0 baseline on at least two of the three benchmark points (5k, 50k, 500k).
+4. Throughput mode (`instances > 1`) remains operational and demonstrates aggregate scaling evidence with no hidden semantic drift.
+5. Benchmark reporting clearly distinguishes single-instance ratio vs aggregate throughput ratio.
+
+## Risks And Mitigations
+
+1. Risk: aggressive pruning/peepholes can alter cycle semantics.
+   Mitigation: keep parity checks as hard gates after each phase; default to conservative fallback when uncertain.
+2. Risk: host timing variance can obscure performance deltas.
+   Mitigation: compare same-run measurements and capture multiple runs when changes are close.
+3. Risk: threadgroup or flattening changes can improve one regime and regress another.
+   Mitigation: keep explicit A/B probes for 5k/50k/500k and enforce `instances=1` no-regression tolerance.
+4. Risk: codegen complexity increases maintenance burden.
+   Mitigation: keep profile-specific logic isolated and documented; require targeted specs for new transforms.
+
+## Testing Gates
+
+1. Lowering/codegen unit gates:
+   - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`
+2. RISC-V runner/task gates:
+   - `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb`
+3. Benchmark task gate:
+   - `bundle exec rspec spec/rhdl/cli/tasks/benchmark_task_spec.rb`
+4. Perf/parity execution gate (core workload):
+   - `bundle exec rake 'bench:native[riscv,5000]'`
+   - `bundle exec rake 'bench:native[riscv,50000]'`
+   - `bundle exec rake 'bench:native[riscv,500000]'`
+   with explicit env capture for backend filter and instance count.
+
+## Implementation Checklist
+
+- [x] Phase 0: Capture 5k/50k/500k baseline table (compiler vs Metal; `instances=1` and throughput mode).
+- [x] Phase 0: Document current dispatch/wait contract and add regression guard check.
+- [x] Phase 1: Remove remaining hot-loop intermediate host sync/read/write operations.
+- [x] Phase 1: Enforce one-dispatch-per-batch contract in benchmarked `run_cycles` path.
+- [x] Phase 1: Re-run parity probes and targeted specs.
+- [x] Phase 2: Implement selective `comb_pre` snapshotting for required refs only.
+- [x] Phase 2: Implement hot/cold state handling in RISC-V Metal profile/generator.
+- [x] Phase 2: Measure state-traffic reduction impact on 5k/50k/500k benchmarks.
+- [x] Phase 3: Add array-get peephole folds for create/aggregate-constant patterns.
+- [x] Phase 3: Add helper reachability DCE and liveness-pruned comb emission.
+- [x] Phase 3: Record generated-code and performance deltas.
+- [x] Phase 4: Retune inline/flatten settings with bounded heuristics and A/B evidence.
+- [x] Phase 4: Keep compile reliability checks and revert regressive settings.
+- [x] Phase 5: Tune threadgroup/instance mapping for throughput mode with `instances=1` guard.
+- [x] Phase 5: Record aggregate throughput scaling evidence.
+- [x] Phase 6: Run full targeted spec gates and parity checkpoints.
+- [x] Phase 6: Publish final benchmark table and close Phase 6 validation gates.
+- [x] Phase 7: Add specialization toggle/plumbing + rebuild-safe A/B for invariant-input kernel variants.
+- [x] Phase 7: Validate parity and measure specialization-on vs specialization-off deltas.
+- [x] Phase 8: Implement fast-path kernel dispatch and enforce fast-only runtime path (fallback path removed per updated requirement).
+- [ ] Phase 9: Implement dirty-cone/event-driven combinational execution.
+- [x] Phase 10: Add schedule-aware dataflow emitter mode for RISC-V ArcToGPU path (profile-gated; default off pending perf-positive tuning).
+- [ ] Phase 11: Implement single-instance intra-kernel parallel execution from schedule partitions.
+
+## Execution Update (2026-03-04)
+
+Implemented in this pass:
+
+1. Added RISC-V lowering transforms:
+   - Constant-fold `hw.array_get(hw.array_create(...), const_idx)` into alias.
+   - Constant-fold `hw.array_get(hw.aggregate_constant(...), const_idx)` into constant.
+   - Reachability-prune unused `arc.define` functions from the parsed call graph.
+2. Added additional liveness pruning:
+   - Function-body op emission now liveness-prunes to outputs.
+   - `split_post_comb_liveness` path no longer seeds output refs into `comb_pre`.
+   - Budgeted RISC-V kernel path now stages/copies back value-state slots only (`clock` tracking slots are not copied for `budget > 0` dispatches).
+3. Retuned generated inline policy (initial pass):
+   - Top eval/comb functions use non-forced inline.
+   - `arc.define` helpers use bounded always-inline heuristic (`<= 12` ops, small returns), with env overrides.
+4. Added dispatch/wait batch instrumentation in RISC-V Metal wrapper:
+   - Exported `sim_dispatch_count` / `sim_wait_count`.
+   - Exposed `MetalRunner#dispatch_count` / `#wait_count`.
+   - Benchmark now reports dispatch/wait counts per benchmark batch.
+   - Added guard env `RHDL_BENCH_VERIFY_DISPATCH_BATCH=1` to fail when per-batch dispatch count is not exactly 1.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`  
+   Result: pass.
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`  
+   Result: pass.
+
+Dispatch contract probe:
+
+1. `RHDL_BENCH_RISCV_WORKLOAD=core RHDL_BENCH_BACKENDS=metal RHDL_BENCH_RISCV_METAL_INSTANCES=1 RHDL_BENCH_VERIFY_DISPATCH_BATCH=1 bundle exec rake 'bench:native[riscv,5000]'`  
+   Result: pass, `Metal dispatches: 1 per benchmark batch`, `Metal waits: 1 per benchmark batch`.
+
+Benchmark baselines (core workload, same-run points):
+
+Single-instance (`RHDL_BENCH_RISCV_METAL_INSTANCES=1`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.031s`, Metal `0.152s`, ratio `0.203x`.
+2. `50000` cycles: IR `0.309s`, Metal `1.487s`, ratio `0.208x`.
+3. `500000` cycles: IR `3.100s`, Metal `14.828s`, ratio `0.209x`.
+
+Throughput mode (`RHDL_BENCH_RISCV_METAL_INSTANCES=8`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.031s`, Metal `0.153s`, single-instance ratio `0.199x`, aggregate ratio `1.591x`.
+2. `50000` cycles: IR `0.306s`, Metal `1.482s`, single-instance ratio `0.207x`, aggregate ratio `1.652x`.
+3. `500000` cycles: IR `3.095s`, Metal `14.860s`, single-instance ratio `0.208x`, aggregate ratio `1.666x`.
+
+Notes:
+
+1. Current batching contract is enforced and observed as one dispatch + one wait per benchmark `run_steps(cycles)` call.
+2. Inline/flatten A/B now includes rebuild-correct comparisons with explicit keep/revert outcomes.
+3. Single-instance gap remains large; next work is Phase 2/5 (state traffic reduction + scaling strategy) to improve single-instance runtime while preserving parity.
+
+## Execution Update (2026-03-04, Continued)
+
+Implemented in this continuation pass:
+
+1. Completed RISC-V hot/cold state split in emitted kernel path:
+   - `emit_kernel_riscv` now accepts `cold_memory_layout`.
+   - Large cold memory ranges are excluded from per-batch thread-local copy for `budget > 0`.
+   - Eval functions are invoked with explicit `cold_state_slots` pointer for cold memory read/write ops.
+   - Hot state ranges are copied back at batch end; cold ranges stay device-resident.
+2. Added threadgroup mapping policy in native Metal wrapper:
+   - Replaced fixed `threadsPerThreadgroup = 1` with dynamic width derived from
+     `pipeline.threadExecutionWidth`, bounded by `maxTotalThreadsPerThreadgroup` and `instanceCount`.
+   - Kept one-dispatch/one-wait contract per benchmark batch.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass.
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`  
+   Result: pass.
+3. `RHDL_BENCH_RISCV_WORKLOAD=core RHDL_BENCH_BACKENDS=metal RHDL_BENCH_RISCV_METAL_INSTANCES=8 RHDL_BENCH_VERIFY_DISPATCH_BATCH=1 bundle exec rake 'bench:native[riscv,50000]'`  
+   Result: pass, dispatch/wait remains `1/1` per batch.
+4. Inline policy A/B probe (`instances=1`, `500000` cycles, Metal-only):
+   - default inline policy: `14.265s`
+   - `RHDL_ARC_TO_GPU_FORCE_ALWAYS_INLINE=1`: `14.241s`
+   - `RHDL_ARC_TO_GPU_DISABLE_ALWAYS_INLINE=1`: `14.246s`
+   Result: differences are within run noise; no default-policy change from this probe.
+
+Post-change benchmarks (core workload, same-run points):
+
+Single-instance (`RHDL_BENCH_RISCV_METAL_INSTANCES=1`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.032s`, Metal `0.155s`, ratio `0.205x`.
+2. `50000` cycles: IR `0.319s`, Metal `1.432s`, ratio `0.222x`.
+3. `500000` cycles: IR `3.163s`, Metal `14.246s`, ratio `0.222x`.
+
+Throughput mode (`RHDL_BENCH_RISCV_METAL_INSTANCES=8`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.031s`, Metal `0.152s`, single-instance ratio `0.206x`, aggregate ratio `1.652x`.
+2. `50000` cycles: IR `0.315s`, Metal `1.447s`, single-instance ratio `0.218x`, aggregate ratio `1.743x`.
+3. `500000` cycles: IR `3.141s`, Metal `14.426s`, single-instance ratio `0.218x`, aggregate ratio `1.742x`.
+
+Observed deltas versus earlier baseline in this PRD:
+
+1. Single-instance ratio improved from ~`0.209x` baseline to ~`0.222x` on long runs (`500k`).
+2. Throughput aggregate ratio improved from ~`1.666x` baseline to ~`1.742x` on long runs (`500k`, `instances=8`).
+3. Metal is still behind IR compiler in `instances=1`; remaining work is focused on Phase 4 code-shape/inlining retuning and deeper kernel-level hot-loop simplification.
+
+## Execution Update (2026-03-04, Continued 2)
+
+Implemented in this continuation pass:
+
+1. Added explicit ArcToGPU tuning knobs:
+   - `RHDL_ARC_TO_GPU_RISCV_FLATTEN_MAX_OPS`
+   - `RHDL_ARC_TO_GPU_RISCV_FLATTEN_MAX_DEPTH`
+   - `RHDL_ARC_TO_GPU_ALWAYS_INLINE_MAX_OPS`
+   - `RHDL_ARC_TO_GPU_ALWAYS_INLINE_MAX_RETURNS`
+2. Fixed Metal build invalidation for tuning correctness:
+   - RISC-V Metal runner now records tracked ArcToGPU env values in `riscv_metal_build_config.json`.
+   - Any tracked env change now forces a rebuild, preventing stale-shader A/B results.
+3. Attempted kernel loop refactor (single helper-call-per-cycle) and reverted:
+   - Rebuild-correct benchmarks showed regression versus baseline.
+   - Kept existing hot loop shape to satisfy no-regression requirement.
+4. Ran flatten/inlining A/B with rebuild correctness:
+   - Candidate flatten `160/8` looked better in stale-cache runs but regressed in rebuild-correct runs.
+   - Baseline `96/6` retained as default.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass.
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`  
+   Result: pass.
+3. `RHDL_BENCH_RISCV_WORKLOAD=core RHDL_BENCH_BACKENDS=metal RHDL_BENCH_RISCV_METAL_INSTANCES=8 RHDL_BENCH_VERIFY_DISPATCH_BATCH=1 bundle exec rake 'bench:native[riscv,50000]'`  
+   Result: pass, dispatch/wait remains `1/1` per batch.
+
+Rebuild-correct flatten A/B evidence (`instances=1`, `500000` cycles, Metal-only):
+
+1. Baseline `96/6`: `14.447s`
+2. Candidate `160/8`: `14.690s`
+3. Decision: keep baseline defaults.
+
+Latest benchmark table (core workload, same-run points):
+
+Single-instance (`RHDL_BENCH_RISCV_METAL_INSTANCES=1`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.032s`, Metal `0.149s`, ratio `0.213x`.
+2. `50000` cycles: IR `0.329s`, Metal `1.451s`, ratio `0.227x`.
+3. `500000` cycles: IR `3.140s`, Metal `14.468s`, ratio `0.217x`.
+
+Throughput mode (`RHDL_BENCH_RISCV_METAL_INSTANCES=8`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.032s`, Metal `0.147s`, single-instance ratio `0.216x`, aggregate ratio `1.729x`.
+2. `50000` cycles: IR `0.315s`, Metal `1.475s`, single-instance ratio `0.214x`, aggregate ratio `1.710x`.
+3. `500000` cycles: IR `3.143s`, Metal `14.651s`, single-instance ratio `0.215x`, aggregate ratio `1.716x`.
+
+Current outcome:
+
+1. Phase 4 is complete with explicit A/B and keep/revert behavior.
+2. Perf remains materially below IR compiler for `instances=1`.
+3. Next progress likely requires architecture-level changes (true multi-thread-in-kernel partitioning or per-workload specialized kernels), not further small inline/flatten tweaks.
+
+## Execution Update (2026-03-04, Continued 3)
+
+Implemented in this continuation pass:
+
+1. Added architecture-phase scaffolding in this PRD (Phases 7-11):
+   - invariant-input kernel specialization
+   - fast/slow kernel split
+   - dirty-cone/event-driven execution
+   - schedule-aware emitter
+   - intra-kernel parallel partitioning
+2. Executed Phase 7 (invariant-input specialization) first:
+   - Added benchmark toggle `RHDL_BENCH_RISCV_METAL_CORE_SPECIALIZE` with explicit benchmark reporting.
+   - Added `metal_core_specialize` plumbing from benchmark -> `HeadlessRunner` -> `MetalRunner`.
+   - Added ArcToGPU RISC-V kernel invariant pinning for safe IRQ inputs (`irq_software`, `irq_timer`, `irq_external`).
+3. Hardened specialization A/B reliability:
+   - Build invalidation now tracks specialization env in `riscv_metal_build_config.json`.
+4. Correctness fix during execution:
+   - Initial specialization attempt that pinned `debug_reg_addr` and `rst` caused external-observability/reset regressions.
+   - Reverted those pins and kept only safe IRQ invariant pins.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb`  
+   Result: pass (`88` examples, `0` failures).
+2. `RHDL_BENCH_RISCV_METAL_CORE_SPECIALIZE=1 RHDL_BENCH_RISCV_WORKLOAD=core RHDL_BENCH_BACKENDS=metal RHDL_BENCH_RISCV_METAL_INSTANCES=8 RHDL_BENCH_VERIFY_DISPATCH_BATCH=1 bundle exec rake 'bench:native[riscv,50000]'`  
+   Result: pass, dispatch/wait remains `1/1` per batch.
+
+Specialization A/B (Metal-only, `instances=1`):
+
+1. `5000` cycles: off `0.161s`, on `0.142s` (~`1.13x` faster).
+2. `50000` cycles: off `1.461s`, on `1.416s` (~`1.03x` faster).
+3. `500000` cycles: off `14.496s`, on `14.177s` (~`1.02x` faster).
+
+Latest benchmark table with specialization enabled (`RHDL_BENCH_RISCV_METAL_CORE_SPECIALIZE=1`):
+
+Single-instance (`RHDL_BENCH_RISCV_METAL_INSTANCES=1`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.031s`, Metal `0.143s`, ratio `0.221x`.
+2. `50000` cycles: IR `0.311s`, Metal `1.417s`, ratio `0.220x`.
+3. `500000` cycles: IR `3.095s`, Metal `14.203s`, ratio `0.218x`.
+
+Throughput mode (`RHDL_BENCH_RISCV_METAL_INSTANCES=8`, `RHDL_BENCH_BACKENDS=compiler,metal`):
+
+1. `5000` cycles: IR `0.031s`, Metal `0.156s`, single-instance ratio `0.198x`, aggregate ratio `1.587x`.
+2. `50000` cycles: IR `0.312s`, Metal `1.439s`, single-instance ratio `0.217x`, aggregate ratio `1.734x`.
+3. `500000` cycles: IR `3.084s`, Metal `14.405s`, single-instance ratio `0.214x`, aggregate ratio `1.713x`.
+
+Current outcome:
+
+1. Phase 7 is executed with measurable positive delta and preserved parity/dispatch contracts.
+2. Phases 8-11 remain pending and are now the primary path to material single-instance gains.
+3. PRD remains `In Progress` because later optimization phases and full acceptance criteria (notably the `1.5x` single-instance target) are still unmet.
+
+## Execution Update (2026-03-04, Continued 4)
+
+Implemented in this continuation pass:
+
+1. Removed RISC-V Metal runtime fallback dispatch path:
+   - Wrapper no longer builds/keeps a secondary fallback pipeline.
+   - Dispatch always targets the fast ArcToGPU kernel entry.
+   - Runtime gating method for fallback selection was removed.
+2. Simplified ArcToGPU metadata + kernel emission for RISC-V:
+   - Removed separate `metal.fast_entry` metadata key.
+   - Primary kernel entry now emits/uses the fast-path kernel body directly.
+3. Preserved metrics compatibility while enforcing no fallback:
+   - `dispatch_count`, `wait_count`, `fast_dispatch_count` remain exported.
+   - `fallback_dispatch_count` remains exported and now reports `0`.
+4. Fixed runner FIRRTL API wiring in this branch shape:
+   - Added explicit `require 'rhdl/codegen/firrtl/firrtl'` so `RHDL::Codegen::FIRRTL.generate` resolves during Metal build.
+5. Added runner coverage:
+   - Extended `spec/examples/riscv/runners/hdl_harness_spec.rb` with `MetalRunner` definition/interface checks.
+
+Validation gates run:
+
+1. `bundle exec rake native:build`  
+   Result: pass (rebuilt IR + netlist native extensions used for parity/perf comparisons).
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`  
+   Result: pass (`62` examples, `0` failures).
+3. Direct parity/dispatch probe (Metal runner):
+   - `run_cycles(500000)` delta counters: `dispatch=1`, `wait=1`, `fast=1`, `fallback=0`.
+   - Final state: `pc=0x14`, `x1=0x186B4` (matches compiler run for same core loop program).
+
+Direct harness benchmark (core loop program, warmup + timed batch):
+
+1. `5000` cycles: IR `0.031s`, Metal `0.101s`, ratio `0.305x`.
+2. `50000` cycles: IR `0.318s`, Metal `0.999s`, ratio `0.318x`.
+3. `500000` cycles: IR `3.232s`, Metal `9.904s`, ratio `0.326x`.
+
+Current outcome:
+
+1. Runtime fallback path is eliminated for the RISC-V Metal ArcToGPU dispatch path.
+2. Batch contract and parity checkpoints remain intact on the core loop workload.
+3. Single-instance Metal remains below IR compiler, but current long-run ratio improved to ~`0.326x` on this host/run.
+
+## Execution Update (2026-03-04, Continued 5)
+
+Implemented in this continuation pass:
+
+1. Closed Phase 6 checklist gating item:
+   - Marked Phase 6 benchmark/reporting gate complete in this PRD checklist.
+2. Started Phase 9 (dirty-cone/event-driven path) on RISC-V lowering:
+   - Added profile-level toggle `RHDL_ARC_TO_GPU_RISCV_DIRTY_SETTLE`.
+   - Added dirty-set tracking in the fast high-loop eval function (`track_state_dirty` + `state_dirty` output).
+   - Added conservative fast-forward rule in fast kernel loop:
+     - if `high.state_dirty == 0` and `low.data_we == 0`, mark remaining batch cycles as completed and exit loop early.
+   - Kept this path **opt-in** (default off) due to neutral/slightly regressive core-loop A/B results.
+3. Ensured rebuild correctness for A/B:
+   - Added `RHDL_ARC_TO_GPU_RISCV_DIRTY_SETTLE` to Metal build-config tracked env vars.
+4. Added lowering regression coverage:
+   - New spec verifies dirty-settle guard emission when toggle is enabled.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb`  
+   Result: pass (`61` examples, `0` failures).
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`  
+   Result: pass (`63` examples, `0` failures).
+
+Direct dirty-settle A/B benchmark (core loop, direct harness):
+
+1. `5000` cycles:
+   - IR `0.030s`
+   - Metal dirty=0 `0.105s` (`0.289x`)
+   - Metal dirty=1 `0.116s` (`0.262x`)
+2. `50000` cycles:
+   - IR `0.318s`
+   - Metal dirty=0 `0.996s` (`0.319x`)
+   - Metal dirty=1 `0.994s` (`0.320x`)
+3. `500000` cycles:
+   - IR `3.173s`
+   - Metal dirty=0 `9.824s` (`0.323x`)
+   - Metal dirty=1 `9.844s` (`0.322x`)
+
+Dispatch/parity checkpoints in this A/B:
+
+1. Per timed batch: `dispatch=1`, `wait=1`, `fast=1`, `fallback=0`.
+2. Final parity: `pc=0x14`, `x1=0x3FC/0x2724/0x186B4` matches IR at 5k/50k/500k.
+
+Current outcome:
+
+1. Phase 9 functionality is now present behind a guarded toggle with parity intact.
+2. Core-loop throughput benefit is not yet demonstrated; default remains `dirty_settle=off` until improved heuristics/partitioning are added.
+3. Next work remains Phase 9 refinement (higher-hit-rate dirty criteria), then Phase 10 scheduling groundwork.
+
+## Execution Update (2026-03-04, Continued 6)
+
+Implemented in this continuation pass:
+
+1. Executed Phase 10 schedule-aware emitter groundwork for RISC-V:
+   - Added profile toggle `RHDL_ARC_TO_GPU_RISCV_SCHEDULED_EMIT`.
+   - Added profile metadata mode reporting (`legacy` vs `levelized`).
+   - Added levelized emission structure for comb/post-comb op generation (schedule phase + schedule level markers).
+   - Kept legacy emission as default fallback path (`scheduled_emit=0`).
+2. Added rebuild-safe A/B support:
+   - Added `RHDL_ARC_TO_GPU_RISCV_SCHEDULED_EMIT` to Metal runner tracked ArcToGPU env vars.
+3. Added coverage:
+   - New lowering spec verifies schedule-aware markers and metadata when enabled.
+   - Existing RISC-V lowering/runner/task gates remain green.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb`  
+   Result: pass (`62` examples, `0` failures).
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`  
+   Result: pass (`64` examples, `0` failures).
+
+Direct scheduled-emitter A/B benchmark (core loop, dirty-settle forced off):
+
+1. `5000` cycles:
+   - IR `0.031s`
+   - Metal scheduled=0 `0.106s` (`0.291x`)
+   - Metal scheduled=1 `0.115s` (`0.266x`)
+2. `50000` cycles:
+   - IR `0.320s`
+   - Metal scheduled=0 `1.002s` (`0.319x`)
+   - Metal scheduled=1 `1.010s` (`0.316x`)
+3. `500000` cycles:
+   - IR `3.209s`
+   - Metal scheduled=0 `9.828s` (`0.326x`)
+   - Metal scheduled=1 `9.878s` (`0.325x`)
+
+Dispatch/parity checkpoints in this A/B:
+
+1. Per timed batch: `dispatch=1`, `wait=1`, `fast=1`, `fallback=0`.
+2. Final parity: `pc=0x14`, `x1=0x3FC/0x2724/0x186B4` matches IR at 5k/50k/500k.
+
+Current outcome:
+
+1. Phase 10 structure is implemented and profile-gated with legacy fallback preserved.
+2. Current levelized emission is performance-neutral to slightly regressive on core loop; default remains `scheduled_emit=off`.
+3. Next work is Phase 9 refinement and then Phase 11 intra-kernel parallel partitioning built on this schedule metadata.
+
+## Execution Update (2026-03-04, Continued 7)
+
+Implemented in this continuation pass:
+
+1. Refined Phase 9 dirty-tracking overhead:
+   - Updated generated `state_dirty` writeback checks to short-circuit once dirty is already set.
+   - This avoids repeated expensive state equality comparisons after first detected mutation in a cycle.
+2. Kept dirty-settle as profile-gated behavior (`RHDL_ARC_TO_GPU_RISCV_DIRTY_SETTLE`) with default off.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass (`11` examples, `0` failures).
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`  
+   Result: pass (`64` examples, `0` failures).
+
+Direct dirty-settle A/B benchmark after short-circuit change (scheduled emit forced off):
+
+1. `5000` cycles:
+   - IR `0.031s`
+   - Metal dirty=0 `0.113s` (`0.273x`)
+   - Metal dirty=1 `0.114s` (`0.269x`)
+2. `50000` cycles:
+   - IR `0.323s`
+   - Metal dirty=0 `1.003s` (`0.323x`)
+   - Metal dirty=1 `1.007s` (`0.321x`)
+3. `500000` cycles:
+   - IR `3.246s`
+   - Metal dirty=0 `9.894s` (`0.328x`)
+   - Metal dirty=1 `9.908s` (`0.328x`)
+
+Dispatch/parity checkpoints:
+
+1. Per timed batch: `dispatch=1`, `wait=1`.
+2. Final parity: `pc=0x14`, `x1=0x3FC/0x2724/0x186B4` remains aligned with IR at 5k/50k/500k.
+
+Current outcome:
+
+1. Dirty tracking overhead is reduced and now near-neutral in long runs.
+2. Core-loop still does not show a positive dirty-settle speedup, so default remains off pending better quiescence heuristics or workload-specific gating.
+
+## Execution Update (2026-03-04, Continued 8)
+
+Implemented in this continuation pass:
+
+1. Added Phase 9 selective-eval split for fast low-loop write data:
+   - Added profile toggle `RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_WDATA` (default off).
+   - Added split output sets:
+     - base fast low outputs (without `data_wdata`)
+     - dedicated fast low write-data output (`data_wdata`) in a separate eval function.
+   - Fast kernel now evaluates the dedicated low write-data function only when `low0.data_we != 0`.
+2. Added metadata and rebuild tracking:
+   - Added `metal.fast_low_wdata_mode` metadata (`inline` or `split`).
+   - Added `RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_WDATA` to tracked build env vars in Metal runner.
+3. Added lowering coverage:
+   - New spec verifies split fast low write-data function emission and metadata mode when enabled.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass (`12` examples, `0` failures).
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`  
+   Result: pass (`65` examples, `0` failures).
+
+Direct split A/B benchmark (core loop, dirty-settle/scheduled emit off, core specialization on):
+
+1. split=0:
+   - `5000`: IR `0.030s`, Metal `0.113s` (`0.267x`)
+   - `50000`: IR `0.317s`, Metal `0.990s` (`0.320x`)
+   - `500000`: IR `3.170s`, Metal `9.838s` (`0.322x`)
+2. split=1:
+   - `5000`: IR `0.033s`, Metal `0.112s` (`0.294x`)
+   - `50000`: IR `0.316s`, Metal `0.986s` (`0.321x`)
+   - `500000`: IR `3.171s`, Metal `9.808s` (`0.323x`)
+
+Parity checkpoints in this A/B:
+
+1. Final state parity remained aligned (`pc=0x14`, `x1=0x3FB/0x2723/0x186B3`, `mem[0x100]=0x55`).
+2. Batch counters observed as `dispatch=3`, `wait=3`, `fast=3`, `fallback=0` in probe output because post-batch state reads trigger extra `sim_eval` calls; timed batch dispatch contract for `run_cycles` remains one kernel dispatch per call.
+
+Current outcome:
+
+1. Split low write-data selective eval is implemented and parity-clean for the core loop probe.
+2. Throughput delta is small but non-negative on 50k/500k in this run.
+3. Phase 9 remains in progress; next work is to extend selective eval to other conditional cones with a clearer long-run speedup.
+
+## Execution Update (2026-03-04, Continued 9)
+
+Implemented in this continuation pass:
+
+1. Added Phase 9 selective-eval split for fast high-loop data address:
+   - Added profile toggle `RHDL_ARC_TO_GPU_RISCV_SPLIT_HIGH_DATA_ADDR` (default off).
+   - Added split output sets:
+     - base fast high outputs (without `data_addr`)
+     - dedicated fast high data-address output (`data_addr`) in a separate eval function.
+   - Fast kernel now evaluates the dedicated high data-address function only when `high.data_re != 0`.
+2. Added metadata and rebuild tracking:
+   - Added `metal.fast_high_data_addr_mode` metadata (`inline` or `split`).
+   - Added `RHDL_ARC_TO_GPU_RISCV_SPLIT_HIGH_DATA_ADDR` to tracked build env vars in Metal runner.
+3. Added lowering coverage:
+   - New spec verifies split fast high data-address function emission and metadata mode when enabled.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass (`13` examples, `0` failures).
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`  
+   Result: pass (`66` examples, `0` failures).
+
+Direct split-combination A/B benchmark (core loop, dirty-settle/scheduled emit off, core specialization on):
+
+1. baseline (`low=0 high=0`):
+   - `5000`: IR `0.031s`, Metal `0.118s` (`0.260x`)
+   - `50000`: IR `0.318s`, Metal `0.993s` (`0.320x`)
+   - `500000`: IR `3.179s`, Metal `9.840s` (`0.323x`)
+2. low split only (`low=1 high=0`):
+   - `5000`: IR `0.032s`, Metal `0.111s` (`0.292x`)
+   - `50000`: IR `0.317s`, Metal `0.992s` (`0.319x`)
+   - `500000`: IR `3.178s`, Metal `9.848s` (`0.323x`)
+3. high split only (`low=0 high=1`):
+   - `5000`: IR `0.032s`, Metal `0.110s` (`0.293x`)
+   - `50000`: IR `0.318s`, Metal `1.001s` (`0.317x`)
+   - `500000`: IR `3.179s`, Metal `9.894s` (`0.321x`)
+4. both splits (`low=1 high=1`):
+   - `5000`: IR `0.032s`, Metal `0.114s` (`0.284x`)
+   - `50000`: IR `0.322s`, Metal `0.999s` (`0.322x`)
+   - `500000`: IR `3.203s`, Metal `9.843s` (`0.325x`)
+
+Parity checkpoints in this A/B:
+
+1. Final state parity remained aligned for all configs (`pc=0x14`, `x1=0x3FB/0x2723/0x186B3`, `mem[0x100]=0x55`).
+2. Probe counter deltas show `dispatch=3`, `wait=3`, `fast=3`, `fallback=0` because the script reads state after timing and triggers evals; timed batch dispatch contract for `run_cycles` remains one dispatch per call.
+
+Current outcome:
+
+1. High data-address selective eval is implemented and parity-clean.
+2. Combined low+high split is slightly best at longer points in this run, but gains are still small.
+3. Phase 9 remains in progress; next optimization should target larger conditional cones (for example PTW outputs or decode/debug cone split) to increase long-run delta.
+
+## Execution Update (2026-03-04, Continued 10)
+
+Implemented in this continuation pass:
+
+1. Reduced fast-loop invariant input churn:
+   - Hoisted constant PTW input-zero writes (`inst_ptw_pte0/1`, `data_ptw_pte0/1`) out of the per-cycle loop in the fast kernel path.
+   - This keeps semantics aligned with existing fast path behavior (always-zero PTW PTE inputs in loop mode) while removing repeated per-iteration stores.
+2. Re-ran split-combination A/B after this change.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass (`13` examples, `0` failures).
+
+Direct split-combination A/B benchmark after invariant-hoist (core loop, dirty-settle/scheduled emit off, core specialization on):
+
+1. baseline (`low=0 high=0`):
+   - `5000`: IR `0.033s`, Metal `0.106s` (`0.311x`)
+   - `50000`: IR `0.319s`, Metal `1.020s` (`0.313x`)
+   - `500000`: IR `3.207s`, Metal `9.746s` (`0.329x`)
+2. low split only (`low=1 high=0`):
+   - `5000`: IR `0.033s`, Metal `0.108s` (`0.308x`)
+   - `50000`: IR `0.319s`, Metal `1.017s` (`0.313x`)
+   - `500000`: IR `3.558s`, Metal `9.763s` (`0.364x`) *(IR timing variance observed in this run)*
+3. high split only (`low=0 high=1`):
+   - `5000`: IR `0.032s`, Metal `0.111s` (`0.288x`)
+   - `50000`: IR `0.320s`, Metal `1.012s` (`0.316x`)
+   - `500000`: IR `3.217s`, Metal `9.773s` (`0.329x`)
+4. both splits (`low=1 high=1`):
+   - `5000`: IR `0.032s`, Metal `0.108s` (`0.295x`)
+   - `50000`: IR `0.324s`, Metal `0.984s` (`0.329x`)
+   - `500000`: IR `3.217s`, Metal `9.725s` (`0.331x`)
+
+Parity checkpoints in this A/B:
+
+1. Final state parity remained aligned for all configs (`pc=0x14`, `x1=0x3FB/0x2723/0x186B3`, `mem[0x100]=0x55`).
+2. Probe counter deltas remained `dispatch=3`, `wait=3`, `fast=3`, `fallback=0` due to post-timing state reads; timed batch dispatch contract for `run_cycles` remains one dispatch per call.
+
+Current outcome:
+
+1. Fast-loop invariant write churn is reduced, with parity preserved.
+2. In this run, combined low+high split remains the best long-run configuration.
+3. Phase 9 still needs a larger cone-level pruning pass to achieve a clearer single-instance gain.
+
+## Execution Update (2026-03-04, Continued 11)
+
+Implemented in this continuation pass:
+
+1. Added Phase 9 selective-eval split for fast low-loop data address:
+   - Added profile toggle `RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_DATA_ADDR` (default off).
+   - Added split output sets:
+     - base fast low outputs (without `data_addr`)
+     - dedicated fast low data-address output (`data_addr`) in a separate eval function.
+   - Fast kernel now evaluates the dedicated low data-address function only when `(low.data_re | low.data_we) != 0`.
+2. Added metadata and rebuild tracking:
+   - Added `metal.fast_low_data_addr_mode` metadata (`inline` or `split`).
+   - Added `RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_DATA_ADDR` to tracked build env vars in Metal runner.
+3. Added lowering coverage:
+   - New spec verifies split fast low data-address function emission and metadata mode when enabled.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass (`14` examples, `0` failures).
+2. `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`  
+   Result: pass (`67` examples, `0` failures).
+
+Targeted config A/B benchmark (core loop, dirty-settle/scheduled emit off, core specialization on):
+
+1. baseline (`low_w=0 high_addr=0 low_addr=0`):
+   - `5000`: IR `0.030s`, Metal `0.111s` (`0.273x`)
+   - `50000`: IR `0.321s`, Metal `0.980s` (`0.327x`)
+   - `500000`: IR `3.190s`, Metal `9.712s` (`0.328x`)
+2. previous combined (`low_w=1 high_addr=1 low_addr=0`):
+   - `5000`: IR `0.033s`, Metal `0.109s` (`0.301x`)
+   - `50000`: IR `0.321s`, Metal `1.002s` (`0.321x`)
+   - `500000`: IR `3.195s`, Metal `9.722s` (`0.329x`)
+3. new low-addr only (`low_w=0 high_addr=0 low_addr=1`):
+   - `5000`: IR `0.033s`, Metal `0.136s` (`0.242x`)
+   - `50000`: IR `0.321s`, Metal `0.979s` (`0.327x`)
+   - `500000`: IR `3.192s`, Metal `9.776s` (`0.326x`)
+4. all three splits (`low_w=1 high_addr=1 low_addr=1`):
+   - `5000`: IR `0.033s`, Metal `0.114s` (`0.286x`)
+   - `50000`: IR `0.318s`, Metal `0.995s` (`0.320x`)
+   - `500000`: IR `3.672s`, Metal `9.706s` (`0.378x`) *(IR timing variance observed in this run)*
+
+Parity checkpoints in this A/B:
+
+1. Final state parity remained aligned for all configs (`pc=0x14`, `x1=0x3FB/0x2723/0x186B3`, `mem[0x100]=0x55`).
+2. Probe counter deltas remained `dispatch=3`, `wait=3`, `fast=3`, `fallback=0` due to post-timing state reads; timed batch dispatch contract for `run_cycles` remains one dispatch per call.
+
+Current outcome:
+
+1. Low data-address selective eval is implemented and parity-clean.
+2. Measured impact is neutral to slightly regressive on this core loop workload; keep default off.
+3. Next meaningful gain likely requires larger cone-level pruning (for example gated PTW/debug/decode cone emission), not additional micro-splits.
+
+## Execution Update (2026-03-04, Continued 12)
+
+Implemented in this continuation pass:
+
+1. Corrected high data-address split helper semantics:
+   - The dedicated high data-address helper is now emitted as comb-only (`update_state: false` and no comb-only clock-slot sync side effects).
+   - This avoids the risk of an accidental double state update when the helper is invoked conditionally after the high update step.
+2. Re-ran the targeted config A/B matrix to confirm parity and updated timing trends.
+
+Validation gates run:
+
+1. `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`  
+   Result: pass (`14` examples, `0` failures).
+
+Targeted config A/B benchmark after semantic fix (core loop, dirty-settle/scheduled emit off, core specialization on):
+
+1. baseline (`low_w=0 high_addr=0 low_addr=0`):
+   - `5000`: IR `0.030s`, Metal `0.111s` (`0.275x`)
+   - `50000`: IR `0.320s`, Metal `0.985s` (`0.325x`)
+   - `500000`: IR `3.188s`, Metal `9.791s` (`0.326x`)
+2. previous combined (`low_w=1 high_addr=1 low_addr=0`):
+   - `5000`: IR `0.032s`, Metal `0.114s` (`0.284x`)
+   - `50000`: IR `0.318s`, Metal `0.984s` (`0.323x`)
+   - `500000`: IR `3.183s`, Metal `9.753s` (`0.326x`)
+3. new low-addr only (`low_w=0 high_addr=0 low_addr=1`):
+   - `5000`: IR `0.033s`, Metal `0.107s` (`0.305x`)
+   - `50000`: IR `0.319s`, Metal `0.997s` (`0.320x`)
+   - `500000`: IR `3.193s`, Metal `9.856s` (`0.324x`)
+4. all three splits (`low_w=1 high_addr=1 low_addr=1`):
+   - `5000`: IR `0.032s`, Metal `0.113s` (`0.285x`)
+   - `50000`: IR `0.317s`, Metal `0.979s` (`0.324x`)
+   - `500000`: IR `3.250s`, Metal `9.677s` (`0.336x`)
+
+Parity checkpoints in this A/B:
+
+1. Final state parity remained aligned for all configs (`pc=0x14`, `x1=0x3FB/0x2723/0x186B3`, `mem[0x100]=0x55`).
+2. Probe counter deltas remained `dispatch=3`, `wait=3`, `fast=3`, `fallback=0` due to post-timing state reads; timed batch dispatch contract for `run_cycles` remains one dispatch per call.
+
+Current outcome:
+
+1. High split helper semantics are now correctness-safe by construction.
+2. Best observed long-run config in this pass is all three selective splits enabled, with a small improvement versus baseline.
+3. Gains remain modest; larger cone-level pruning is still required for meaningful single-instance improvement.
diff --git a/prd/2026_03_04_riscv_fast_default_metal_prd.md b/prd/2026_03_04_riscv_fast_default_metal_prd.md
new file mode 100644
index 00000000..32bd7482
--- /dev/null
+++ b/prd/2026_03_04_riscv_fast_default_metal_prd.md
@@ -0,0 +1,156 @@
+# RISC-V Metal Fast-Default ABI Slimming PRD
+
+## Status
+Completed (2026-03-04)
+
+## Context
+The RISC-V ArcToGPU Metal path currently exposes a large runtime IO surface that includes debug-oriented outputs and is controlled by multiple environment-based tuning toggles. Profiling shows the runtime is compute-bound in the Metal kernel path, and the requested direction is:
+
+1. Fast path by default.
+2. No separate execution profile.
+3. Remove debug outputs and minimize runtime outputs.
+4. Keep API compatibility for runner call sites.
+
+## Goals
+1. Make the default RISC-V Metal runtime ABI minimal, with debug outputs removed.
+2. Keep full execution parity semantics (PTW/funct3/MMIO behavior retained).
+3. Preserve `MetalRunner` public API (`read_pc`, `read_reg`, `current_inst`, `state`) without relying on debug outputs.
+4. Remove RISC-V ArcToGPU profile/env toggles tied to alternate/non-default paths.
+5. Capture benchmark results after each implementation phase (`5k`, `50k`, `500k`).
+
+## Non-Goals
+1. Apple2 or 8-bit ArcToGPU runtime ABI changes.
+2. Upstream CIRCT integration.
+3. New parallel kernel architecture changes beyond this scope.
+
+## Phased Plan
+
+### Phase 1: Runtime ABI Output Slimming
+Red:
+1. Add failing lowering tests that require RISC-V runtime output layout to exclude debug outputs.
+2. Add failing metadata checks for explicit runtime input/output layout fields.
+
+Green:
+1. Add explicit runtime IO layout metadata fields for RISC-V.
+2. Emit RISC-V Metal IO struct from runtime output layout (minimal outputs).
+3. Keep top-module ABI validation for correctness-critical outputs.
+
+Exit Criteria:
+1. Generated runtime output layout excludes debug outputs.
+2. Metal wrapper builds against runtime layout metadata.
+3. Benchmarks captured for this phase.
+
+### Phase 2: API Parity Without Debug Outputs
+Red:
+1. Add failing runner tests proving `read_pc`/`read_reg`/`current_inst` work without debug outputs.
+
+Green:
+1. Add metadata introspection fields (pc slot + regfile base/length).
+2. Add C-wrapper read helpers (`sim_read_pc`, `sim_read_reg`, `sim_read_inst`).
+3. Update `MetalRunner` to use helper symbols instead of debug poke/peek path.
+
+Exit Criteria:
+1. Runner API remains unchanged and tests pass.
+2. Debug output dependency is removed from Metal read methods.
+3. Benchmarks captured for this phase.
+
+### Phase 3: Toggle Removal and Fast-Default Consolidation
+Red:
+1. Add failing tests that assert removed env toggles are no longer consumed.
+2. Add failing tests for fixed fast-mode metadata markers.
+
+Green:
+1. Remove RISC-V profile env toggle plumbing for split/schedule/dirty/flatten tuning in this path.
+2. Fix RISC-V profile to a single fast-default configuration.
+3. Remove corresponding tracked env vars from Metal runner build signature.
+
+Exit Criteria:
+1. Removed env toggles have no effect.
+2. Profile metadata reports fixed fast-default settings.
+3. Benchmarks captured for this phase.
+
+### Phase 4: Parity/Regression Validation and PRD Closeout
+Red:
+1. Add/refresh parity checks on representative RISC-V microprograms.
+
+Green:
+1. Run targeted RSpec suites for lowering + runner behavior.
+2. Run benchmark sweep and compare phase-to-phase.
+3. Update PRD status/checklist with concrete evidence.
+
+Exit Criteria:
+1. Targeted tests green.
+2. Parity checks green for selected microprograms.
+3. Benchmark table populated for every phase.
+
+## Acceptance Criteria
+1. RISC-V Metal runtime outputs are minimized and exclude debug outputs.
+2. `MetalRunner` API compatibility is preserved.
+3. Full parity semantics are retained in default fast path.
+4. RISC-V env-based alternate profile toggles are removed.
+5. Benchmarks are reported after each phase (`5k`, `50k`, `500k`).
+
+## Risks and Mitigations
+1. Risk: Removing debug outputs could break runner read methods.
+   Mitigation: Add explicit introspection metadata + wrapper helpers and tests before removal.
+2. Risk: Toggle removal could regress performance unexpectedly.
+   Mitigation: Benchmark after each phase and compare.
+3. Risk: Metadata/schema drift could break wrapper generation.
+   Mitigation: Add explicit metadata field assertions in lowering specs.
+
+## Benchmark Evidence Log
+### Phase 1 (Runtime ABI Output Slimming)
+- Command: `bundle exec ruby /tmp/riscv_phase_bench.rb` (cleaned JSON at `/tmp/riscv_phase1_bench_clean.json`)
+- Throughput (cycles/sec, median of 3):
+  - IR compile: `5k=158,649`, `50k=161,192`, `500k=162,256`
+  - Metal: `5k=51,249`, `50k=51,870`, `500k=51,771`
+  - Metal/IR ratio: `~0.323x`, `~0.322x`, `~0.319x`
+- Dispatch/wait behavior: `dispatch_delta=1`, `wait_delta=1` at `500k`.
+- Note: the parity snapshot via `read_pc/read_reg` is expected to be broken at this point because the debug-output-backed read API has not been migrated yet (that happens in Phase 2).
+
+### Phase 2 (API Parity Without Debug Outputs)
+- Command: `bundle exec ruby /tmp/riscv_phase_bench.rb` (cleaned JSON at `/tmp/riscv_phase2_bench_clean.json`)
+- Throughput (cycles/sec, median of 3):
+  - IR compile: `5k=162,744`, `50k=162,137`, `500k=162,989`
+  - Metal: `5k=51,365`, `50k=51,926`, `500k=51,881`
+  - Metal/IR ratio: `~0.316x`, `~0.320x`, `~0.318x`
+- Dispatch/wait behavior: `dispatch_delta=1`, `wait_delta=1` at `500k`.
+- Parity snapshot (`pc/x1/x2/mem`) restored: `5k=true`, `50k=true`, `500k=true`.
+
+### Phase 3 (Toggle Removal and Fast-Default Consolidation)
+- Command: `bundle exec ruby /tmp/riscv_phase_bench.rb` (raw output at `/tmp/riscv_phase3_bench_clean.json`, cleaned JSON at `/tmp/riscv_phase3_bench.json`)
+- Throughput (cycles/sec, median of 3):
+  - IR compile: `5k=162,248`, `50k=161,986`, `500k=160,998`
+  - Metal: `5k=45,958`, `50k=46,554`, `500k=46,460`
+  - Metal/IR ratio: `~0.283x`, `~0.287x`, `~0.289x`
+- Dispatch/wait behavior: `dispatch_delta=1`, `wait_delta=1` at `5k`, `50k`, `500k`.
+- Parity snapshot (`pc/x1/x2/mem`): `5k=true`, `50k=true`, `500k=true`.
+- Notes:
+  - Fast-default profile is now fixed in code (`flatten=96/6`, split helpers enabled, scheduled/dirty disabled).
+  - Removed env toggles are no longer consumed by the profile path.
+
+### Phase 4 (Parity/Regression Validation and PRD Closeout)
+- Targeted validation command:
+  - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb spec/examples/riscv/runners/hdl_harness_spec.rb spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/rhdl/cli/tasks/benchmark_task_spec.rb:90`
+  - Result: `63 examples, 0 failures`.
+- Benchmark command: `bundle exec ruby /tmp/riscv_phase_bench.rb` (raw output at `/tmp/riscv_phase4_bench_raw.txt`, cleaned JSON at `/tmp/riscv_phase4_bench.json`)
+- Throughput (cycles/sec, median of 3):
+  - IR compile: `5k=162,253`, `50k=160,619`, `500k=161,280`
+  - Metal: `5k=46,401`, `50k=46,617`, `500k=46,396`
+  - Metal/IR ratio: `~0.286x`, `~0.290x`, `~0.288x`
+- Dispatch/wait behavior: `dispatch_delta=1`, `wait_delta=1` at `5k`, `50k`, `500k`.
+- Parity snapshot (`pc/x1/x2/mem`): `5k=true`, `50k=true`, `500k=true`.
+
+## Implementation Checklist
+- [x] Phase 1 red tests added.
+- [x] Phase 1 green implementation completed.
+- [x] Phase 1 benchmarks recorded.
+- [x] Phase 2 red tests added.
+- [x] Phase 2 green implementation completed.
+- [x] Phase 2 benchmarks recorded.
+- [x] Phase 3 red tests added.
+- [x] Phase 3 green implementation completed.
+- [x] Phase 3 benchmarks recorded.
+- [x] Phase 4 parity/regression checks completed.
+- [x] Phase 4 benchmarks recorded.
+- [x] PRD status updated to Completed.
diff --git a/prd/2026_03_04_riscv_netlist_gpu_experiment_prd.md b/prd/2026_03_04_riscv_netlist_gpu_experiment_prd.md
new file mode 100644
index 00000000..7621d8da
--- /dev/null
+++ b/prd/2026_03_04_riscv_netlist_gpu_experiment_prd.md
@@ -0,0 +1,126 @@
+# RISC-V Netlist-to-GPU Experiment PRD
+
+## Status
+Completed (2026-03-04)
+
+## Context
+We need a separate experiment path named `riscv_netlist` that netlistizes RISC-V combinational logic through CIRCT synth lowering and runs the result on the Metal GPU backend.
+
+Current RISC-V Metal execution uses ArcToGPU lowering directly from Arc MLIR. This experiment adds a second path:
+
+1. Arc MLIR emission.
+2. Comb-to-synth (AIG-style) netlistization pass chain.
+3. Synth-to-comb re-materialization.
+4. ArcToGPU Metal codegen and execution.
+
+## Goals
+1. Add a dedicated ArcToGPU profile `:riscv_netlist` with synth netlistization in `prepare_source`.
+2. Add a dedicated runner `RiscvNetlistRunner`.
+3. Expose `:riscv_netlist` as a runnable RISC-V headless/CLI mode.
+4. Validate parity against IR on a representative loop program.
+5. Capture baseline throughput for `IR compile`, `Metal`, and `riscv_netlist`.
+
+## Non-Goals
+1. Replace existing `:metal` RISC-V path.
+2. Re-architect Metal kernel threading/dispatch behavior.
+3. Full Linux/xv6 validation on this first experiment pass.
+
+## Phased Plan
+
+### Phase 1: Lowering Profile
+Red:
+1. Add failing lowering tests for `profile: :riscv_netlist`.
+
+Green:
+1. Add `profiles/riscv_netlist.rb`.
+2. Wire `:riscv_netlist` into `ArcToGpuLowering.profile_module_for`.
+3. Add reusable `circt-opt` pipeline helper in lowering module.
+
+Exit Criteria:
+1. Lowering spec passes for `:riscv_netlist`.
+2. Metadata includes `profile=riscv_netlist` and runtime introspection fields.
+
+### Phase 2: Runner + Mode Wiring
+Red:
+1. Add/adjust runner/headless specs for new mode/runner.
+
+Green:
+1. Add `RiscvNetlistRunner` class.
+2. Parameterize `MetalRunner` for profile/variant reuse.
+3. Wire `HeadlessRunner` and CLI mode handling for `:riscv_netlist`.
+
+Exit Criteria:
+1. Runner and task specs pass.
+2. `examples/riscv/bin/riscv --mode riscv_netlist` is accepted.
+
+### Phase 3: Parity + Baseline Performance
+Red:
+1. Define parity gate: `pc/x1/x2/mem` snapshot equality versus IR at 5k/50k/500k cycles.
+
+Green:
+1. Run parity benchmark for `riscv_netlist`.
+2. Run throughput baseline for IR, Metal, and `riscv_netlist`.
+3. Record results in this PRD.
+
+Exit Criteria:
+1. Parity gate passes at all benchmark points.
+2. Throughput table recorded.
+
+## Acceptance Criteria
+1. New mode `:riscv_netlist` is implemented and runnable.
+2. Netlistization pass chain is active in the experiment profile.
+3. Parity gate passes on benchmark program.
+4. Baseline benchmark numbers are documented.
+
+## Risks and Mitigations
+1. Risk: CIRCT synth passes unavailable/unsupported on host toolchain.
+   Mitigation: the pipeline helper fails soft by falling back to the prior source; tests detect profile behavior and the runner still builds.
+2. Risk: Mode wiring regressions in RISC-V runner selection.
+   Mitigation: extend runner and task specs.
+3. Risk: Large compile latency in tests.
+   Mitigation: keep new tests interface-focused and skip when backend unavailable.
+
+## Benchmark Evidence Log
+### Phase 1: Lowering Profile
+- Validation command:
+  - `bundle exec rspec spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb`
+- Evidence:
+  - `profile: :riscv_netlist` artifacts emitted.
+  - Metadata includes `profile=riscv_netlist`, `metal.schedule_mode=netlist_aig_legacy`, and runtime introspection fields.
+
+### Phase 2: Runner + Mode Wiring
+- Validation commands:
+  - `bundle exec rspec spec/examples/riscv/runners/hdl_harness_spec.rb`
+  - `bundle exec rspec spec/examples/riscv/utilities/tasks/run_task_spec.rb spec/examples/riscv/utilities/tasks/riscv_cli_linux_spec.rb`
+- Evidence:
+  - `RiscvNetlistRunner` class available with expected runner interface.
+  - `HeadlessRunner` accepts `mode: :riscv_netlist` without fallback.
+  - CLI accepts `--mode riscv_netlist`.
+  - Objective-C Metal wrapper class is now build-variant namespaced to allow loading both `metal` and `riscv_netlist` shared libraries in one process.
+
+### Phase 3: Parity + Baseline Performance
+- Benchmark command:
+  - `bundle exec ruby /tmp/riscv_netlist_experiment_bench.rb`
+  - Raw output: `/tmp/riscv_netlist_experiment_bench_raw.txt`
+  - Clean JSON: `/tmp/riscv_netlist_experiment_bench.json`
+- Throughput (cycles/sec, median of 3):
+  - IR compile: `5k=164,403`, `50k=162,633`, `500k=161,794`
+  - Metal: `5k=46,628`, `50k=46,657`, `500k=46,300`
+  - riscv_netlist: `5k=46,444`, `50k=46,199`, `500k=46,527`
+  - Metal/IR ratio: `~0.284x`, `~0.287x`, `~0.286x`
+  - riscv_netlist/IR ratio: `~0.282x`, `~0.284x`, `~0.288x`
+- Parity vs IR (`pc/x1/x2/mem`) at `5k`, `50k`, `500k`:
+  - Metal: `true`, `true`, `true`
+  - riscv_netlist: `true`, `true`, `true`
+- Dispatch/wait deltas per benchmark run:
+  - Metal: `dispatch=1`, `wait=1`
+  - riscv_netlist: `dispatch=1`, `wait=1`
+
+## Implementation Checklist
+- [x] Phase 1 red tests added.
+- [x] Phase 1 green implementation completed.
+- [x] Phase 2 red tests added.
+- [x] Phase 2 green implementation completed.
+- [x] Phase 3 parity checks completed.
+- [x] Phase 3 benchmarks recorded.
+- [x] PRD status updated to Completed.
diff --git a/prd/2026_03_05_cpu8bit_arcilator_gpu_instances_prd.md b/prd/2026_03_05_cpu8bit_arcilator_gpu_instances_prd.md
new file mode 100644
index 00000000..4e1e5704
--- /dev/null
+++ b/prd/2026_03_05_cpu8bit_arcilator_gpu_instances_prd.md
@@ -0,0 +1,80 @@
+# CPU8bit ArcilatorGPU Multi-Instance PRD
+
+## Status
+Completed (2026-03-05)
+
+## Context
+The CPU8bit `:arcilator_gpu` runner currently executes exactly one logical CPU instance. The native benchmark surface already knows how to report instance-adjusted throughput when a runner exposes `runner_parallel_instances`, but the CPU8bit ArcilatorGPU path does not provide any instance-count control or reporting today.
+
+Apple II and RISC-V already have explicit Metal/ArcilatorGPU instance knobs. CPU8bit should expose the same capability with a backend-specific environment variable and benchmark-visible instance count.
+
+## Goals
+1. Add configurable CPU8bit ArcilatorGPU multi-instance execution.
+2. Keep the existing `FastHarness` API unchanged.
+3. Expose instance-adjusted throughput automatically in `bench:native[cpu8bit,...]`.
+4. Avoid rebuilds when only the requested instance count changes.
+
+## Non-Goals
+1. Re-architecting the CPU8bit ArcilatorGPU pipeline into a new backend.
+2. Changing benchmark result formatting beyond existing instance-aware reporting.
+3. Adding per-instance external memory APIs in Ruby.
+
+## Phased Plan
+
+### Phase 1: Red Tests
+Red:
+1. Add failing runner specs for CPU8bit ArcilatorGPU instance-count normalization and env fallback.
+2. Add failing harness spec proving `parallel_instances` reflects the runner count for `:arcilator_gpu`.
+
+Green:
+1. Add the minimum runner API surface needed for the new tests to pass.
+
+Exit Criteria:
+1. Tests fail before implementation and pass after runner wiring lands.
+
+### Phase 2: Native Runner Support
+Red:
+1. Instance count is ignored by the native CPU8bit ArcilatorGPU wrapper.
+
+Green:
+1. Add `instances` / env normalization in the CPU8bit runner.
+2. Update the generated native wrapper so one simulation context owns `N` mirrored CPU states and memories.
+3. Keep instance 0 as the externally visible state while mirroring loads/writes/resets across all instances.
+
+Exit Criteria:
+1. The runner reports `runner_parallel_instances`.
+2. Multi-instance execution does not require a rebuild when only the count changes.
+
+### Phase 3: Verification and Docs
+Red:
+1. No documented user-facing way to request multi-instance CPU8bit ArcilatorGPU runs.
+
+Green:
+1. Update benchmark docs with the new env vars.
+2. Run targeted specs and a CPU8bit ArcilatorGPU smoke benchmark if the toolchain is available.
+
+Exit Criteria:
+1. Docs mention the new CPU8bit ArcilatorGPU instance knob.
+2. Targeted specs are green.
+
+## Acceptance Criteria
+1. `RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES` configures the CPU8bit ArcilatorGPU runner.
+2. `RHDL_BENCH_ARCILATOR_GPU_INSTANCES` works as a benchmark-wide fallback.
+3. `FastHarness#parallel_instances` reports the configured count for CPU8bit ArcilatorGPU.
+4. `bench:native[cpu8bit,...]` shows instance-adjusted throughput automatically when instances > 1.
+5. Targeted CPU8bit runner and harness specs are green.
+
+## Risks and Mitigations
+1. Risk: Mirroring memory/state across instances could drift from existing single-instance semantics.
+   Mitigation: Keep instance 0 as the externally visible state and mirror writes/loads uniformly.
+2. Risk: Instance-count changes could incorrectly force native rebuilds.
+   Mitigation: Pass instance count at runtime rather than baking it into the generated artifact.
+3. Risk: Multi-instance execution may not increase throughput materially.
+   Mitigation: Keep the feature focused on correctness/reporting first and verify with a smoke benchmark.
+
+## Implementation Checklist
+- [x] Phase 1: Add red tests for CPU8bit ArcilatorGPU instance env parsing and harness reporting.
+- [x] Phase 2: Add CPU8bit ArcilatorGPU instance normalization and reporting.
+- [x] Phase 2: Update the native wrapper to execute mirrored multi-instance contexts.
+- [x] Phase 3: Update docs for CPU8bit ArcilatorGPU instance env vars.
+- [x] Phase 3: Run targeted specs and smoke verification.
diff --git a/prd/2026_03_05_gem_cuda_to_metal_prd.md b/prd/2026_03_05_gem_cuda_to_metal_prd.md
new file mode 100644
index 00000000..2db36f77
--- /dev/null
+++ b/prd/2026_03_05_gem_cuda_to_metal_prd.md
@@ -0,0 +1,234 @@
+# GEM CUDA-to-Metal Migration PRD
+
+## Status
+In Progress (started 2026-03-05; Phase 0 complete, Phase 1 complete, Phase 2 complete, Phase 3 complete, Phase 4 complete, Phase 5 in progress)
+
+## Context
+GEM currently has a CUDA-only execution backend:
+- Build flow compiles `csrc/kernel_v1.cu`.
+- Runtime launch path is wired through CUDA-specific bindings.
+- CLI binaries (`cuda_test`, `cuda_dummy_test`) depend directly on CUDA runtime and generated bindings.
+
+We need a Metal backend for Apple Silicon macOS while preserving execution parity with current CPU reference semantics and avoiding regressions in existing CUDA flow during migration.
+
+## Goals
+1. Add a Metal backend for GEM simulation (`kernel_v1` semantics preserved).
+2. Keep CUDA backend functional during migration.
+3. Preserve `FlattenedScriptV1` ABI in initial migration.
+4. Add first-class Metal binaries (`metal_test`, `metal_dummy_test`).
+5. Enforce parity-first acceptance before performance optimization.
+
+## Non-Goals
+1. Redesigning `FlattenedScriptV1` in this PRD.
+2. Removing CUDA during initial migration phases.
+3. Cross-platform non-macOS Metal-like backends.
+4. Netlist/AIG algorithm changes unrelated to backend execution.
+
+## Phased Plan
+
+### Phase 0: Baseline Freeze
+Red:
+1. Add a deterministic baseline harness command that captures script/state hash from CPU reference flow.
+2. Add a failing baseline check that reports mismatch against committed golden values.
+
+Green:
+1. Lock baseline artifacts and command in docs.
+2. Make baseline command reproducible and mandatory for migration validation.
+
+Exit Criteria:
+1. Baseline command runs deterministically on same input.
+2. Golden mismatch is reported with actionable diff context.
+
+### Phase 1: Backend Abstraction (CUDA routed through trait)
+Red:
+1. Add compile-time checks that require a common GPU backend interface.
+2. Keep existing binaries failing until routed through the backend abstraction.
+
+Green:
+1. Add `src/gpu/backend.rs` trait and `src/gpu/cuda_backend.rs` implementation.
+2. Route `cuda_test` and `cuda_dummy_test` through the backend module.
+3. Keep runtime behavior unchanged for CUDA execution.
+
+Exit Criteria:
+1. CUDA bins compile via backend abstraction.
+2. CUDA output hashes remain unchanged versus Phase 0.
+
+### Phase 2: Metal Runtime Skeleton
+Red:
+1. Add failing Metal smoke test path for device/pipeline setup.
+2. Add failing command path for missing Metal runtime prerequisites.
+
+Green:
+1. Add Metal backend module and runtime setup (device/queue/pipeline/buffers).
+2. Add clear diagnostics for unsupported host/toolchain scenarios.
+
+Exit Criteria:
+1. Metal runtime initialization succeeds on supported Apple Silicon host.
+2. Unsupported environment failure messages are explicit.
+
+### Phase 3: Metal Kernel Port (`kernel_v1` ABI-preserving)
+Red:
+1. Add failing unit parity checks (CPU reference vs Metal) on synthetic scripts.
+2. Add failing SRAM/duplicate-writeout parity cases.
+
+Green:
+1. Port `kernel_v1` semantics into `msrc/kernel_v1.metal`.
+2. Preserve metadata/global-read/boomerang/sram/clken execution semantics.
+3. Implement stage synchronization via host dispatch ordering.
+
+Exit Criteria:
+1. Bit-exact parity for targeted unit cases.
+2. No script ABI change required.
+
+### Phase 4: End-to-End Metal VCD Path
+Red:
+1. Add failing end-to-end VCD parity test for `metal_test`.
+2. Add failing output hash check against CPU reference.
+
+Green:
+1. Implement `metal_test` with same VCD ingest/output behavior as `cuda_test`.
+2. Implement `metal_dummy_test` parity/perf smoke path.
+
+Exit Criteria:
+1. End-to-end VCD parity passes on representative designs.
+2. Metal binaries are usable from CLI with documented commands.
+
+### Phase 5: Performance Stabilization
+Red:
+1. Add failing benchmark regression guardrails against phase baseline.
+2. Add instrumentation for dispatch/sync overhead visibility.
+
+Green:
+1. Optimize runtime submission/memory reuse.
+2. Add safe kernel fast paths where parity is preserved.
+3. Re-baseline with benchmark tables.
+
+Exit Criteria:
+1. No parity regressions.
+2. Throughput improves over phase-4 baseline.
+
+## Acceptance Criteria (Full Completion)
+1. All phase exit criteria are met.
+2. Metal backend passes parity gates against CPU reference.
+3. CUDA backend remains functional and regression-free.
+4. Documentation reflects both CUDA and Metal usage paths.
+
+## Risks and Mitigations
+1. Risk: Grid-level sync semantics differ from CUDA cooperative groups.
+   Mitigation: enforce host-stage dispatch barriers first; optimize later.
+2. Risk: SIMD/subgroup behavior divergence in Metal implementation.
+   Mitigation: parity-first validation on targeted deterministic cases.
+3. Risk: dependency/toolchain drift on host.
+   Mitigation: explicit runtime/toolchain checks with actionable errors.
+4. Risk: regression in legacy CUDA path while refactoring.
+   Mitigation: route through abstraction without changing kernel semantics and keep CUDA checks in every phase.
+
+## Implementation Checklist
+- [x] Phase 0 Red: add deterministic baseline harness + failing mismatch check.
+- [x] Phase 0 Green: lock baseline artifacts and command docs.
+- [x] Phase 1 Red: add backend abstraction compile checks.
+- [x] Phase 1 Green: route CUDA bins through backend trait.
+- [x] Phase 2 Red: add failing Metal runtime smoke path.
+- [x] Phase 2 Green: implement Metal runtime setup and diagnostics.
+- [x] Phase 3 Red: add failing parity cases for Metal kernel.
+- [x] Phase 3 Green: port `kernel_v1` semantics to Metal with parity.
+- [x] Phase 4 Red: add failing end-to-end `metal_test` parity check.
+- [x] Phase 4 Green: implement end-to-end Metal VCD flow.
+- [x] Phase 5 Red: add performance regression guardrails/instrumentation.
+- [ ] Phase 5 Green: optimize and publish updated benchmark baselines.
+
+## Execution Log
+2026-03-05:
+1. Added `external/GEM/src/gpu/backend.rs` with `GpuBackendV1` interface.
+2. Added `external/GEM/src/gpu/cuda_backend.rs` and `external/GEM/src/gpu/mod.rs`.
+3. Exported `gpu` module from `external/GEM/src/lib.rs`.
+4. Refactored `external/GEM/src/bin/cuda_test.rs` and `external/GEM/src/bin/cuda_dummy_test.rs` to use `CudaBackend` instead of direct local bindgen module wiring.
+5. Validation:
+   - `cargo check --bin cut_map_interactive` passed.
+   - `cargo check --features cuda --bin cuda_dummy_test` failed in this environment due to a missing CUDA installation (`Could not find a cuda installation`), not due to Rust type/syntax errors in the non-CUDA build.
+6. Added `external/GEM/src/bin/baseline_lock.rs` deterministic baseline harness:
+   - Builds flattened script from netlist + gemparts.
+   - Prints deterministic script hash and summary.
+   - Supports `--expected-script-hash` and fails on mismatch.
+7. Additional validation:
+   - `cargo check --bin baseline_lock` passed.
+8. Added deterministic baseline fixture artifacts:
+   - `external/GEM/baseline/tiny_gatelevel.gv`
+   - `external/GEM/baseline/tiny.gemparts`
+   - `external/GEM/baseline/manifest.toml`
+   - `external/GEM/baseline/README.md`
+9. Baseline lock check now has a fixed expected hash:
+   - `14926125099726623616` for `tiny_v1`.
+   - Verified with `cargo run --bin baseline_lock -- baseline/tiny_gatelevel.gv baseline/tiny.gemparts 1 --expected-script-hash 14926125099726623616`.
+10. Added compile-time backend contract assertions for `cuda`/`metal` backend implementations in `external/GEM/src/gpu/backend.rs`.
+11. Added phase-2 Metal skeleton:
+    - `external/GEM/src/gpu/metal_backend.rs` with explicit platform/toolchain diagnostics (`xcrun -f metal` probe).
+    - `external/GEM/src/bin/metal_dummy_test.rs` and `external/GEM/src/bin/metal_test.rs` as metal-feature probes.
+    - `external/GEM/Cargo.toml` now includes `metal` feature and metal bins.
+12. Documentation updates:
+    - Added baseline-lock section to `external/GEM/README.md`.
+    - Added baseline-lock and metal probe commands to `external/GEM/usage.md`.
+13. Additional validation:
+    - `cargo check --features metal --bin metal_dummy_test` passed.
+    - `cargo check --features metal --bin metal_test` passed.
+    - `cargo run --features metal --bin metal_dummy_test -- --strict` passed on this host.
+14. Extended phase-2 Metal skeleton to compile a real kernel library:
+    - Added `external/GEM/msrc/kernel_v1.metal`.
+    - `MetalBackend::new()` now compiles `msrc/kernel_v1.metal` via:
+      - `xcrun metal -std=metal3.1 -O3 -c ...`
+      - `xcrun metallib ...`
+    - Generated artifact path: `external/GEM/target/metal/kernel_v1.metallib` (under ignored `target/`).
+15. Probe runtime behavior:
+    - `cargo run --features metal --bin metal_dummy_test -- baseline/tiny_gatelevel.gv baseline/tiny.gemparts 1 1` now validates toolchain + shader compile and exercises real GEM front-half flow.
+    - `cargo run --features metal --bin metal_test -- --strict` intentionally exits with code `3` after successful probe, signaling kernel execution path is still pending (Phase 3+).
+16. Upgraded `metal_dummy_test` to run real GEM front-half flow (netlist + parts + flatten + dispatch call shape):
+    - It now accepts the same core positional args as `cuda_dummy_test`.
+    - It builds script and allocates buffers, then enters backend dispatch.
+    - Current result is an intentional panic at `MetalBackend::simulate_v1_noninteractive_simple_scan` (execution unimplemented), providing the Phase-3 red signal on actual script inputs.
+17. Implemented ABI-preserving Metal dispatch path with real data movement:
+    - `external/GEM/csrc/kernel_v1_metal.mm` now allocates/shared-copies `blocks_start`, `blocks_data`, `states_noninteractive`, and `sram_data` into Metal buffers.
+    - The shim now dispatches in strict `(cycle, major_stage)` order and waits for each dispatch to complete, matching CUDA grid-sync ordering semantics at host barrier granularity.
+    - Added `sram_size` FFI argument wiring from Rust (`external/GEM/src/gpu/metal_backend.rs`) to enforce safe SRAM bounds in kernel-side logic.
+18. Replaced no-op Metal kernel with correctness-first `simulate_block_v1` implementation:
+    - `external/GEM/msrc/kernel_v1.metal` now executes full part loop semantics (global reads, boomerang hierarchy, writeout hooks, SRAM read/write/duplicate, clock-enable permutation, and output update).
+    - Kernel currently runs one logical block-script executor per GPU thread, preserving `FlattenedScriptV1` ABI and script format.
+19. Validation after phase-3 kernel landing:
+    - `cargo check --features metal --bin metal_dummy_test` passed.
+    - `cargo run --features metal --bin metal_dummy_test -- baseline/tiny_gatelevel.gv baseline/tiny.gemparts 1 1` passed end-to-end with dispatch completion and expected script hash.
+20. Added phase-3 parity smoke tests in `external/GEM/tests/metal_parity_smoke.rs`:
+    - `metal_matches_reference_on_tiny_script` validates multi-cycle Metal parity against a CPU reference executor on the baseline tiny fixture.
+    - `metal_matches_reference_on_sram_duplicate_case` validates SRAM read/write + duplicate writeout semantics against the same CPU reference path using a synthetic script.
+21. Phase-3 parity validation:
+    - `cargo test --features metal --test metal_parity_smoke -- --nocapture` passed (2 tests).
+    - Covered targeted SRAM/duplicate-writeout cases and preserved `FlattenedScriptV1` ABI (no script format change).
+22. Implemented Phase-4 end-to-end Metal VCD flow by replacing the placeholder `metal_test` probe with the full `cuda_test`-equivalent pipeline wired to `MetalBackend`.
+23. Added deterministic tiny VCD fixture `external/GEM/baseline/tiny_input.vcd` for end-to-end output parity checks.
+24. Added Phase-4 parity test `external/GEM/tests/metal_vcd_e2e.rs`:
+    - Runs `flatten_test` (CPU reference) and `metal_test` on the same tiny fixture.
+    - Compares output VCD bytes for exact match.
+25. Phase-4 validation:
+    - `cargo check --features metal --bin metal_test` passed.
+    - `cargo test --features metal --test metal_vcd_e2e -- --nocapture` passed (1 test).
+    - Direct output hash parity also confirmed:
+      - `baseline/tiny_output_cpu.vcd` and `baseline/tiny_output_metal.vcd` both hashed to `04ef69a57eb6bc3f7dc85ebd7be99a6218dd4c0c6bd65ad43539d537b7c8be51`.
+26. Phase-5 red: added dispatch/sync instrumentation and guardrails:
+    - Extended native stats ABI with logical dispatch count, GPU dispatch count, encode time, wait time, and total time.
+    - Plumbed stats through `external/GEM/src/gpu/metal_backend.rs` and surfaced per-run metrics from `metal_dummy_test`.
+    - Added parity-test assertions on instrumentation invariants (`dispatch_count`, bounds on timing fields) in `external/GEM/tests/metal_parity_smoke.rs`.
+27. Phase-5 green (partial): host submission optimization:
+    - Reworked Metal submission loop to batch multiple logical dispatches per command buffer (for multi-stage cases).
+    - Added a parity-safe single-stage fast path: one GPU dispatch handles `cycle_count` cycles internally when `num_major_stages == 1`.
+28. Phase-5 validation after optimization work:
+    - `cargo check --features metal --bin metal_dummy_test` passed.
+    - `cargo test --features metal --test metal_parity_smoke -- --nocapture` passed (2 tests).
+    - `cargo test --features metal --test metal_vcd_e2e -- --nocapture` passed (1 test).
+29. Performance snapshots (capped at <=50k cycles):
+    - Pre fast-path:
+      - 5k cycles: `dispatches=5000 total_ms=4189.616 cycles_per_sec=1193.43`
+      - 50k cycles: `dispatches=50000 total_ms=41687.411 cycles_per_sec=1199.40`
+    - Post fast-path:
+      - 5k cycles: `logical_dispatches=5000 gpu_dispatches=1 total_ms=4221.766 cycles_per_sec=1184.34`
+      - 50k cycles: `logical_dispatches=50000 gpu_dispatches=1 total_ms=41454.135 cycles_per_sec=1206.15`
+30. Observation:
+    - Dispatch-count collapse is successful (`gpu_dispatches` reduced from O(cycles) to 1 for single-stage workloads), but throughput remains roughly flat; kernel compute dominates on this tiny fixture.
+    - Bench runs above 50k cycles were intentionally deferred due to runtime cost in this iteration.
diff --git a/prd/2026_03_05_gem_metal_cuda_parity_phase5c_prd.md b/prd/2026_03_05_gem_metal_cuda_parity_phase5c_prd.md
new file mode 100644
index 00000000..de65f190
--- /dev/null
+++ b/prd/2026_03_05_gem_metal_cuda_parity_phase5c_prd.md
@@ -0,0 +1,105 @@
+# GEM Metal CUDA Parity (Phase 5C) PRD
+
+## Status
+Completed (2026-03-05)
+
+## Context
+Phase 5B completed kernel-structure mirroring and baseline parity gates, but CUDA-implementation parity still needs stronger coverage for edge cases that are sensitive to script layout and ordering semantics.
+
+## Goals
+1. Expand parity coverage for CUDA-sensitive control/data paths.
+2. Keep Metal bit-exact with CPU reference under complex script shapes.
+3. Avoid workload/perf tuning in this phase.
+
+## Non-Goals
+1. Throughput optimization.
+2. ABI or script format redesign.
+
+## Phased Plan
+
+### 5C.1 Multipart + Stage/Part Ordering
+Red:
+1. Add parity tests with multipart block scripts (`is_last_part` chaining) and mixed global-read/stage payloads.
+
+Green:
+1. Fix Metal kernel/script cursor behavior if ordering diverges.
+
+Exit Criteria:
+1. Multipart parity tests pass consistently.
+
+### 5C.2 SRAM/CLKEN Boundary Semantics
+Red:
+1. Add parity tests covering multiple SRAM banks, boundary-indexed `num_ios`, and duplicate/clken interactions.
+
+Green:
+1. Fix any read-before-write or output materialization mismatch.
+
+Exit Criteria:
+1. New boundary tests pass.
+
+### 5C.3 Randomized CUDA-Shape Parity Fuzz
+Red:
+1. Add deterministic randomized synthetic script parity sweep over CUDA-shape sections.
+
+Green:
+1. Resolve all discovered mismatches with minimal kernel changes.
+
+Exit Criteria:
+1. Randomized sweep passes for fixed seed set.
+2. Existing parity and VCD tests remain green.
+
+## Acceptance Criteria
+1. 5C.1-5C.3 exit criteria are met.
+2. `metal_parity_smoke` and `metal_vcd_e2e` pass.
+3. No workload benchmark/perf changes are required for completion.
+
+## Risks and Mitigations
+1. Risk: synthetic scripts miss real regressions.
+   Mitigation: combine targeted boundary cases with deterministic fuzz.
+2. Risk: parity fixes introduce regressions in prior passing suites.
+   Mitigation: rerun full Metal parity + VCD checks after each green step.
+
+## Implementation Checklist
+- [x] 5C.1 Red/Green complete.
+- [x] 5C.2 Red/Green complete.
+- [x] 5C.3 Red/Green complete.
+
+## Execution Log
+2026-03-05:
+1. Created parity-focused Phase 5C PRD.
+2. Added multipart parity case and helper script builder:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - `build_multipart_sram_dependency_script`
+   - `metal_matches_reference_on_multipart_sram_dependency_case`
+3. Added multi-SRAM boundary parity case and helper script builder:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - `build_multi_sram_boundary_script`
+   - `metal_matches_reference_on_multi_sram_boundary_case`
+4. Added deterministic randomized CUDA-shape parity sweep:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - `ScriptRng`, `build_random_cuda_part`, `build_random_cuda_case`
+   - `metal_matches_reference_on_randomized_cuda_shape_cases` (12 fixed seeds)
+5. Hardened randomized script generation to respect CUDA-script invariants:
+   - hooks beyond `num_ios` forced disabled
+   - SRAM address tuples constrained to in-range `[0, 8191]`
+   - randomized multi-block race source removed (single-block randomized sweep)
+6. Validation:
+   - `cargo test --features metal --test metal_parity_smoke -- --nocapture` passed (`8` tests)
+   - `cargo test --features metal --test metal_vcd_e2e -- --nocapture` passed (`1` test)
+7. Added manifest-driven real-script corpus parity gate:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - new parser/helpers: `parse_baseline_manifest_cases`, `build_script_from_artifacts`
+   - new test: `metal_matches_reference_on_manifest_baseline_corpus`
+   - baseline corpus currently resolves to one checked-in entry (`tiny_v1`), and will auto-expand as new manifest entries are added.
+8. Re-validation after adding corpus gate:
+   - `cargo test --features metal --test metal_parity_smoke metal_matches_reference_on_manifest_baseline_corpus -- --nocapture` passed (`1` test)
+   - `cargo test --features metal --test metal_parity_smoke -- --nocapture` passed (`9` tests)
+   - `cargo test --features metal --test metal_vcd_e2e -- --nocapture` passed (`1` test)
+9. Added deterministic multi-block/multi-stage disjoint parity case:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - new helpers: `build_sram_stage_script_part_with_offsets`, `build_multiblock_multistage_disjoint_script`
+   - new test: `metal_matches_reference_on_multiblock_multistage_disjoint_case`
+10. Re-validation after multi-block parity addition:
+    - `cargo test --features metal --test metal_parity_smoke metal_matches_reference_on_multiblock_multistage_disjoint_case -- --nocapture` passed (`1` test)
+    - `cargo test --features metal --test metal_parity_smoke -- --nocapture` passed (`10` tests)
+    - `cargo test --features metal --test metal_vcd_e2e -- --nocapture` passed (`1` test)
diff --git a/prd/2026_03_05_gem_metal_cuda_structure_phase5b_prd.md b/prd/2026_03_05_gem_metal_cuda_structure_phase5b_prd.md
new file mode 100644
index 00000000..3e2d181e
--- /dev/null
+++ b/prd/2026_03_05_gem_metal_cuda_structure_phase5b_prd.md
@@ -0,0 +1,164 @@
+# GEM Metal CUDA-Structure Mirror (Phase 5B) PRD
+
+## Status
+Completed (2026-03-05)
+
+## Context
+The initial CUDA-to-Metal migration established functional parity and end-to-end VCD parity, but Metal still executes a mostly scalar kernel path internally. The next step is to mirror CUDA kernel structure more closely while maintaining strict per-change parity and practical iteration speed.
+
+The prior migration PRD remains the source of baseline history:
+- `prd/2026_03_05_gem_cuda_to_metal_prd.md`
+
+## Goals
+1. Mirror CUDA launch geometry and execution structure in Metal incrementally.
+2. Preserve bit-exact parity at every change.
+3. Keep all required validation capped to <=50k cycles for practical iteration.
+4. Keep `FlattenedScriptV1` ABI unchanged.
+5. Add repeatable perf snapshots for progress tracking.
+
+## Non-Goals
+1. Script format redesign or ABI churn.
+2. >50k-cycle mandatory gates in this phase.
+3. Any temporary parity regressions for performance.
+
+## Phased Plan
+
+### 5B.1 Launch Geometry + Multi-Stage Dependency Guard
+Red:
+1. Add failing synthetic multi-stage parity case with cross-stage dependency.
+
+Green:
+1. Switch Metal launch geometry to one threadgroup per logical block (CUDA-shaped geometry).
+2. Keep correctness via lane-0 execution while preserving current semantics.
+3. Validate new multi-stage parity case plus existing parity suites.
+
+Exit Criteria:
+1. Multi-stage synthetic parity passes.
+2. Existing tiny and SRAM/duplicate parity tests pass.
+3. End-to-end tiny VCD parity test passes.
+
+### 5B.2 Boomerang Lane-Parallel Mirror
+Red:
+1. Add failing hierarchy-focused parity case stressing `hier[0..12]` writeout behavior.
+
+Green:
+1. Port boomerang hierarchy to lane-parallel/threadgroup-memory execution mirroring CUDA organization.
+2. Keep bit-exact hook timing and output bits.
+
+Exit Criteria:
+1. New hierarchy parity case passes.
+2. Existing parity suites remain green.
+
+### 5B.3 SRAM/Duplicate/Clock-Enable Lane Mirror
+Red:
+1. Add failing stress case for consecutive-cycle SRAM read/write ordering plus duplicate and clken masking interactions.
+
+Green:
+1. Mirror CUDA-style lane role split for SRAM/duplicate/clken sections.
+2. Preserve read-before-write semantics and commit ordering.
+
+Exit Criteria:
+1. New SRAM stress case passes.
+2. Existing parity suites remain green.
+
+### 5B.4 Runtime Submission Stabilization
+Red:
+1. Add stats/guard assertions for logical vs GPU dispatch accounting and timing fields.
+
+Green:
+1. Keep single-stage `cycle_count` fast path.
+2. Keep chunked multi-stage submission.
+3. Preserve dispatch/timing instrumentation visibility from Rust and smoke binaries.
+
+Exit Criteria:
+1. Instrumentation assertions pass in parity tests.
+2. No parity regressions.
+
+### 5B.5 Workload Gates + Re-Baseline
+Red:
+1. Add/update reproducible benchmark harness table for tiny + 8bit representative workload at 5k/50k.
+
+Green:
+1. Run required benchmark gates.
+2. Publish current table and trend in PRD.
+
+Exit Criteria:
+1. 5k/50k benchmark entries are recorded.
+2. No unexplained regressions against previous snapshot.
+
+## Acceptance Criteria
+1. 5B.1-5B.5 exit criteria are all satisfied.
+2. Metal remains bit-exact against CPU reference on required suites.
+3. CUDA backend behavior is unchanged.
+4. Performance reporting is reproducible at 5k/50k.
+
+## Risks and Mitigations
+1. Risk: CUDA warp idioms do not map directly to Metal SIMD execution.
+   Mitigation: enforce parity-first lane-level red/green tests.
+2. Risk: structural mirroring may regress throughput short-term.
+   Mitigation: keep per-phase perf snapshots and regression notes.
+3. Risk: real-workload gates can be too slow for tight loops.
+   Mitigation: keep mandatory gates at <=50k cycles.
+
+## Implementation Checklist
+- [x] 5B.1 Red: add multi-stage synthetic dependency parity test.
+- [x] 5B.1 Green: switch launch geometry to threadgroup-per-block and validate parity.
+- [x] 5B.2 Red: add boomerang hierarchy stress parity case.
+- [x] 5B.2 Green: implement lane-parallel boomerang mirror.
+- [x] 5B.3 Red: add SRAM ordering/duplicate/clken stress case.
+- [x] 5B.3 Green: implement lane-parallel SRAM/duplicate/clken mirror.
+- [x] 5B.4 Red: instrumentation guard assertions in parity suite.
+- [x] 5B.4 Green: keep stable submission/stats paths with parity.
+- [x] 5B.5 Red/Green: tiny + 8bit representative benchmark table (5k/50k) updated.
+
+## Execution Log
+2026-03-05:
+1. Added synthetic multi-stage dependency parity case:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - `metal_matches_reference_on_multistage_sram_dependency_case`
+2. Added synthetic script builders for stage-level SRAM dependency:
+   - `build_sram_stage_script`
+   - `build_multistage_sram_dependency_script`
+3. Updated Metal kernel launch mapping to CUDA-shaped geometry:
+   - `external/GEM/msrc/kernel_v1.metal`
+   - kernel now uses `threadgroup_position_in_grid` + `thread_index_in_threadgroup`; lane 0 executes current block logic.
+4. Updated native Metal host submission to dispatch threadgroups with fixed 256-thread threadgroup size:
+   - `external/GEM/csrc/kernel_v1_metal.mm`
+   - uses `dispatchThreadgroups(..., threadsPerThreadgroup=256)`
+   - adds explicit guard when pipeline threadgroup capacity < 256.
+5. Validation:
+   - `cargo test --features metal --test metal_parity_smoke -- --nocapture` passed (`3` tests).
+   - `cargo test --features metal --test metal_vcd_e2e -- --nocapture` passed (`1` test).
+6. Current tiny benchmark snapshots (post-change):
+   - 5k cycles: `logical_dispatches=5000 gpu_dispatches=1 total_ms=4234.148 cycles_per_sec=1180.88`
+   - 50k cycles: `logical_dispatches=50000 gpu_dispatches=1 total_ms=41657.575 cycles_per_sec=1200.26`
+7. 8bit complex representative gate attempt:
+   - attempted `bundle exec rspec spec/examples/8bit/hdl/cpu/gem_gpu_complex_parity_spec.rb:80`
+   - run was terminated due to impractical runtime for this execution slice; remaining 8bit representative gate work is tracked under 5B.5.
+8. Added 5B.2 hierarchy stress parity coverage:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - new synthetic builder `build_hierarchy_stress_script`
+   - new test `metal_matches_reference_on_hierarchy_stress_case`
+9. Added 5B.3 SRAM ordering + duplicate + clken stress coverage:
+   - `external/GEM/tests/metal_parity_smoke.rs`
+   - new synthetic builder `build_sram_ordering_duplicate_clken_stress_script`
+   - new test `metal_matches_reference_on_multicycle_sram_duplicate_clken_stress_case`
+10. Reworked Metal kernel inner path to lane-parallel SRAM/duplicate/clken processing:
+    - `external/GEM/msrc/kernel_v1.metal`
+    - removed lane-0 serialized loops for these sections
+    - added threadgroup permutation scratch (`tg_sram_duplicate`) and per-lane commit/update path
+11. Validation after kernel + tests update:
+    - `cargo test --features metal --test metal_parity_smoke -- --nocapture` passed (`5` tests)
+    - `cargo test --features metal --test metal_vcd_e2e -- --nocapture` passed (`1` test)
+12. Updated tiny benchmark snapshots (new kernel):
+    - 5k cycles: `logical_dispatches=5000 gpu_dispatches=1 encode_ms=0.013 wait_ms=54.329 total_ms=85.339 cycles_per_sec=58590.12`
+    - 50k cycles: `logical_dispatches=50000 gpu_dispatches=1 encode_ms=0.017 wait_ms=371.940 total_ms=402.406 cycles_per_sec=124252.77`
+13. Added 8bit representative benchmark entries (compiler vs gem_gpu):
+    - command: `RHDL_BENCH_BACKENDS=compiler,gem_gpu bundle exec rake 'bench:native[cpu8bit,5000]'`
+      - Compiler run: `0.080s` (5000 cycles)
+      - GemGPU run: `16.247s` (5000 cycles)
+      - Ratio: `GemGPU vs Compiler = 0.005x`
+    - command: `RHDL_BENCH_BACKENDS=compiler,gem_gpu bundle exec rake 'bench:native[cpu8bit,50000]'`
+      - Compiler run: `0.684s` (50000 cycles)
+      - GemGPU run: `163.171s` (50000 cycles)
+      - Ratio: `GemGPU vs Compiler = 0.004x`
diff --git a/prd/2026_03_05_lowering_boundary_cleanup_prd.md b/prd/2026_03_05_lowering_boundary_cleanup_prd.md
new file mode 100644
index 00000000..86522264
--- /dev/null
+++ b/prd/2026_03_05_lowering_boundary_cleanup_prd.md
@@ -0,0 +1,68 @@
+# Lowering Boundary Cleanup PRD
+
+## Status
+Completed (2026-03-05)
+
+## Context
+`ArcToGpuLowering` had become a mixed frontend/backend entrypoint, with `SynthToGpuLowering` and `GemToGpuLowering` routing through Arc-specific options. This obscured ownership boundaries and made feature gating (for example GEM interpreter mode) harder to reason about.
+
+## Goals
+- Keep `ArcToGpuLowering` as the ARC-IR frontend entrypoint.
+- Keep `SynthToGpuLowering` as the synth/hw IR frontend entrypoint.
+- Keep `GemToGpuLowering` as the AIG-oriented frontend entrypoint.
+- Share common GPU lowering mechanics via a dedicated delegate.
+
+## Non-Goals
+- Rewrite kernel generation internals.
+- Change runtime behavior/perf policy outside frontend ownership boundaries.
+
+## Phased Plan
+### Phase 1: Introduce shared delegate
+Red:
+- Existing lowering specs fail if delegate does not preserve metadata/output contracts.
+Green:
+- Add `GpuLoweringDelegate` for shared parse/validate/emit/metadata flow.
+- Keep existing Arc/Synth/GEM output format unchanged.
+Exit Criteria:
+- Arc/Synth/GEM lowering specs pass.
+
+### Phase 2: ARC frontend boundary
+Red:
+- Arc entrypoint accepts non-ARC frontend options.
+Green:
+- Restrict `ArcToGpuLowering.lower` to ARC-facing API and route through delegate with ARC parser semantics.
+Exit Criteria:
+- Arc lowering specs pass and enforce ARC requirements.
+
+### Phase 3: Synth and GEM frontend boundaries
+Red:
+- Synth frontend accepts ARC wrappers; GEM frontend path naming remains synth-generic.
+Green:
+- `SynthToGpuLowering` uses delegate with synth parser guard.
+- `GemToGpuLowering` takes AIG-oriented input key (`aig_mlir_path`) while retaining legacy compatibility.
+- Update runner callsites to use AIG-oriented key.
+Exit Criteria:
+- Synth/GEM specs pass, runner specs pass.
+
+## Acceptance Criteria
+- Distinct frontend ownership:
+  - Arc -> ARC input contract
+  - Synth -> synth/hw input contract
+  - Gem -> AIG input contract
+- Shared backend mechanics live in delegate.
+- All touched specs green.
+
+## Risks and Mitigations
+- Risk: subtle metadata drift during delegate extraction.
+  - Mitigation: preserve existing field layout and verify via existing specs.
+- Risk: break legacy callers for GEM input key.
+  - Mitigation: keep `synth_mlir_path` compatibility alias and add test.
+
+## Implementation Checklist
+- [x] Add `GpuLoweringDelegate`.
+- [x] Refactor `ArcToGpuLowering.lower` to ARC-only entry semantics.
+- [x] Refactor `SynthToGpuLowering` to synth-owned parsing and delegate usage.
+- [x] Refactor `GemToGpuLowering` to AIG-owned entry semantics (+ compatibility alias).
+- [x] Update runner callsites to `aig_mlir_path`.
+- [x] Add/adjust specs for new boundaries.
+- [x] Run targeted lowering and runner specs.
diff --git a/spec/examples/8bit/hdl/cpu/arcilator_gpu_complex_parity_spec.rb b/spec/examples/8bit/hdl/cpu/arcilator_gpu_complex_parity_spec.rb
new file mode 100644
index 00000000..2f4c2b2f
--- /dev/null
+++ b/spec/examples/8bit/hdl/cpu/arcilator_gpu_complex_parity_spec.rb
@@ -0,0 +1,514 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'support/cpu_assembler'
+require 'fileutils'
+require 'open3'
+
+RSpec.describe '8-bit CPU arcilator_gpu complex parity' do
+  DISPLAY_START = 0x0800 # first address of the memory region checksummed as the display
+  DISPLAY_LEN = 80 * 24 # 1920 bytes; presumably one byte per cell of an 80x24 display -- TODO confirm
+  NATIVE_RUNNER_BACKENDS = %i[arcilator_gpu arcilator verilator].freeze # default candidates compared against :compile
+
+  def build_harness(sim) # Builds a fresh FastHarness on backend `sim`; nil is passed as the first constructor argument.
+    RHDL::HDL::CPU::FastHarness.new(nil, sim: sim)
+  end
+
+  def compiler_backend_available? # Probes the :compile backend by constructing a throwaway harness.
+    build_harness(:compile)
+    true
+  rescue StandardError # any StandardError during construction is reported as "not available"
+    false
+  end
+
+  def checksum_region(memory, start_addr, length)
+    sum = 0
+    rolling_xor = 0
+
+    length.times do |offset|
+      byte = memory.read((start_addr + offset) & 0xFFFF).to_i & 0xFF
+      sum = (sum + byte) & 0xFFFF_FFFF
+      rolling_xor ^= ((byte << (offset & 7)) & 0xFF)
+    end
+
+    [sum, rolling_xor]
+  end
+
+  def compare_snapshots(compiler:, candidate:, regions:, label:, backend_label:)
+    expect(candidate.halted).to eq(compiler.halted), "halted mismatch at #{label} (#{backend_label})"
+    expect(candidate.acc).to eq(compiler.acc), "acc mismatch at #{label} (#{backend_label})"
+    expect(candidate.pc).to eq(compiler.pc), "pc mismatch at #{label} (#{backend_label})"
+    expect(candidate.sp).to eq(compiler.sp), "sp mismatch at #{label} (#{backend_label})"
+    expect(candidate.state).to eq(compiler.state), "state mismatch at #{label} (#{backend_label})"
+    expect(candidate.zero_flag).to eq(compiler.zero_flag), "zero_flag mismatch at #{label} (#{backend_label})"
+
+    regions.each do |region|
+      compiler_sig = checksum_region(compiler.memory, region.fetch(:start), region.fetch(:length))
+      candidate_sig = checksum_region(candidate.memory, region.fetch(:start), region.fetch(:length))
+      expect(candidate_sig).to eq(compiler_sig),
+        "memory checksum mismatch at #{label} (#{backend_label}) for 0x#{region.fetch(:start).to_s(16)}+#{region.fetch(:length)}"
+    end
+  end
+
+  def assert_native_runner_backends_available!
+    backends = {
+      arcilator_gpu: RHDL::HDL::CPU::FastHarness.arcilator_gpu_status,
+      arcilator: RHDL::HDL::CPU::FastHarness.arcilator_status,
+      verilator: RHDL::HDL::CPU::FastHarness.verilator_status
+    }
+    backends.each do |backend, status|
+      expect(status[:ready]).to be(true), "#{backend} backend unavailable: #{status.inspect}"
+    end
+  end
+
+  def run_checkpoint_parity(program_bytes:, start_pc:, checkpoints:, regions:, batch_size: 4096, backends: NATIVE_RUNNER_BACKENDS)
+    compiler = build_harness(:compile)
+    backend_harnesses = backends.to_h do |backend|
+      [backend, build_harness(backend)]
+    end
+
+    bytes = Array(program_bytes).dup
+    if start_pc.to_i.nonzero?
+      # Native runner paths currently cannot poke internal pc register directly
+      # because arcilator state JSON does not expose that register by default.
+      # Use an explicit reset-time trampoline so both backends start identically.
+      bytes[0, 3] = [0xF9, ((start_pc >> 8) & 0xFF), (start_pc & 0xFF)] # JMP_LONG start_pc
+      start_pc = 0
+    end
+
+    ([compiler] + backend_harnesses.values).each do |harness|
+      harness.memory.load(bytes, 0)
+      harness.pc = start_pc
+    end
+
+    last = 0
+    checkpoints.each do |checkpoint|
+      step = checkpoint - last
+      raise ArgumentError, "checkpoints must be increasing (#{checkpoints.inspect})" if step <= 0
+
+      compiler_ran = compiler.run_cycles(step, batch_size: batch_size)
+      backend_harnesses.each do |backend, harness|
+        backend_ran = harness.run_cycles(step, batch_size: batch_size)
+        expect(backend_ran).to eq(compiler_ran), "cycle progress mismatch at checkpoint #{checkpoint} (#{backend})"
+
+        compare_snapshots(
+          compiler: compiler,
+          candidate: harness,
+          regions: regions,
+          label: "#{checkpoint} cycles",
+          backend_label: backend
+        )
+      end
+
+      last = checkpoint
+    end
+  end
+
+  def normalize_program_for_start_pc(program_bytes:, start_pc:)
+    bytes = Array(program_bytes).dup
+    pc = start_pc.to_i
+    if pc.nonzero?
+      bytes[0, 3] = [0xF9, ((pc >> 8) & 0xFF), (pc & 0xFF)] # JMP_LONG start_pc
+      pc = 0
+    end
+    [bytes, pc]
+  end
+
+  def measure_harness_cycles_per_sec(sim:, program_bytes:, start_pc:, cycles:, batch_size: 4096)
+    harness = build_harness(sim)
+    bytes, pc = normalize_program_for_start_pc(program_bytes: program_bytes, start_pc: start_pc)
+    harness.memory.load(bytes, 0)
+    harness.pc = pc
+
+    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+    cycles_run = harness.run_cycles(cycles, batch_size: batch_size)
+    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
+    elapsed = 1.0e-9 if elapsed <= 0.0
+
+    {
+      backend: sim,
+      cycles_run: cycles_run,
+      elapsed_s: elapsed,
+      cycles_per_sec: cycles_run.to_f / elapsed
+    }
+  end
+
+  def command_available?(tool)
+    ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |path|
+      File.executable?(File.join(path, tool))
+    end
+  end
+
+  def gem_project_root # repository root: five directories above this spec's directory
+    File.expand_path('../../../../../', __dir__)
+  end
+
+  def gem_root # expected location of the GEM checkout: external/GEM under the repository root
+    File.join(gem_project_root, 'external', 'GEM')
+  end
+
+  def gem_cpu8bit_build_dir
+    File.expand_path(
+      ENV.fetch('RHDL_GEM_METAL_CPU8BIT_BUILD_DIR', File.join(gem_project_root, 'examples/8bit/.gem_metal_cpu8bit'))
+    )
+  end
+
+  def ensure_gem_cpu8bit_artifacts!(top_module:) # Returns [netlist_path, gemparts_path], building each only if missing.
+    raise 'cargo not found in PATH' unless command_available?('cargo') # RuntimeError; the examples rescue it and skip
+    raise 'yosys not found in PATH' unless command_available?('yosys')
+    raise "external GEM repo not found at #{gem_root}" unless Dir.exist?(gem_root)
+
+    build_dir = gem_cpu8bit_build_dir
+    FileUtils.mkdir_p(build_dir)
+
+    rtl_path = File.join(build_dir, 'cpu8bit_rtl.v')
+    yosys_script_path = File.join(build_dir, 'cpu8bit_gem.ys')
+    yosys_log_path = File.join(build_dir, 'cpu8bit_yosys.log')
+    cut_map_log_path = File.join(build_dir, 'cpu8bit_cut_map.log')
+
+    netlist_path = File.expand_path( # env override, else a file inside build_dir
+      ENV.fetch('RHDL_GEM_METAL_CPU8BIT_NETLIST', File.join(build_dir, 'cpu8bit_gatelevel.gv'))
+    )
+    gemparts_path = File.expand_path( # env override, else a file inside build_dir
+      ENV.fetch('RHDL_GEM_METAL_CPU8BIT_GEMPARTS', File.join(build_dir, 'cpu8bit.gemparts'))
+    )
+
+    level_split = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_LEVEL_SPLIT', '').strip # empty => flag is not passed
+    max_stage_degrad = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_MAX_STAGE_DEGRAD', '').strip # empty => flag is not passed
+
+    aigpdk_nomem_lib = File.join(gem_root, 'aigpdk', 'aigpdk_nomem.lib')
+    raise "missing AIGPDK library at #{aigpdk_nomem_lib}" unless File.exist?(aigpdk_nomem_lib)
+
+    unless File.exist?(netlist_path) # an existing netlist is reused as-is, with no staleness check
+      File.write(rtl_path, RHDL::HDL::CPU::CPU.to_verilog_hierarchy(top_name: top_module))
+
+      yosys_script = <<~YOSYS
+        read_verilog "#{rtl_path}"
+        hierarchy -check -top #{top_module}
+        synth -flatten
+        delete t:\\$print
+        dfflibmap -liberty "#{aigpdk_nomem_lib}"
+        opt_clean -purge
+        abc -liberty "#{aigpdk_nomem_lib}"
+        opt_clean -purge
+        write_verilog "#{netlist_path}"
+      YOSYS
+      File.write(yosys_script_path, yosys_script)
+
+      yosys_out, yosys_status = Open3.capture2e('yosys', '-q', '-s', yosys_script_path)
+      File.write(yosys_log_path, yosys_out) # log is written before the status check so failures keep it
+      raise "yosys synthesis failed. See #{yosys_log_path}" unless yosys_status.success?
+    end
+
+    unless File.exist?(gemparts_path) # an existing partition file is reused as-is
+      cut_map_cmd = [
+        'cargo', 'run', '--release', '--features', 'metal', '--bin', 'cut_map_interactive', '--',
+        netlist_path
+      ]
+      cut_map_cmd += ['--top-module', top_module]
+      cut_map_cmd += ['--level-split', level_split] unless level_split.empty?
+      cut_map_cmd += ['--max-stage-degrad', max_stage_degrad] unless max_stage_degrad.empty?
+      cut_map_cmd << gemparts_path # output path is the final positional argument
+
+      cut_map_out, cut_map_status = Open3.capture2e(*cut_map_cmd, chdir: gem_root)
+      File.write(cut_map_log_path, cut_map_out)
+      raise "cut_map_interactive failed. See #{cut_map_log_path}" unless cut_map_status.success?
+    end
+
+    [netlist_path, gemparts_path]
+  end
+
+  def collect_compiler_mem_data_trace(program_bytes:, start_pc:, cycles:) # Returns the byte fed to mem_data_in on each stepped cycle.
+    compiler = build_harness(:compile)
+    bytes, pc = normalize_program_for_start_pc(program_bytes: program_bytes, start_pc: start_pc)
+
+    compiler.memory.load(bytes, 0)
+    compiler.pc = pc
+
+    sim = compiler.instance_variable_get(:@sim) # reaches into the harness to drive the simulator by hand
+    memory = compiler.memory
+    trace = []
+
+    cycles.times do
+      break if compiler.halted # the trace may therefore be shorter than `cycles`
+
+      addr = sim.peek('mem_addr')
+      write_en = sim.peek('mem_write_en')
+      memory.write(addr, sim.peek('mem_data_out')) if write_en == 1 # apply a pending write before reading back
+
+      data = memory.read(addr) & 0xFF
+      trace << data
+
+      sim.poke('mem_data_in', data)
+      sim.evaluate
+      sim.poke('clk', 0)
+      sim.evaluate
+      sim.poke('clk', 1)
+      sim.tick
+
+      compiler.instance_variable_set(:@cycle_count, compiler.cycle_count + 1) # keep the harness counter in step
+      compiler.instance_variable_set(:@halted, true) if sim.peek('halted') == 1
+    end
+
+    trace
+  end
+
+  def write_gem_input_vcd(path, mem_data_trace)
+    first_data = mem_data_trace.first.to_i & 0xFF
+    time = 0
+    prev_data = first_data
+
+    File.open(path, 'w') do |f|
+      f.puts '$timescale 1ns $end'
+      f.puts '$scope module cpu8bit $end'
+      f.puts '$var wire 1 ! clk $end'
+      f.puts '$var wire 1 " rst $end'
+      f.puts '$var wire 8 # mem_data_in $end'
+      f.puts '$upscope $end'
+      f.puts '$enddefinitions $end'
+      f.puts '$dumpvars'
+      f.puts '0!'
+      f.puts '0"'
+      f.puts "b#{first_data.to_s(2).rjust(8, '0')} #"
+      f.puts '$end'
+
+      mem_data_trace.each do |data|
+        d = data.to_i & 0xFF
+        time += 1
+        f.puts "##{time}"
+        if d != prev_data
+          f.puts "b#{d.to_s(2).rjust(8, '0')} #"
+          prev_data = d
+        end
+        f.puts '1!'
+
+        time += 1
+        f.puts "##{time}"
+        f.puts '0!'
+      end
+    end
+  end
+
+  def run_gem_metal_test(
+    netlist_path:,
+    gemparts_path:,
+    input_vcd_path:,
+    output_vcd_path:,
+    log_path:,
+    top_module:,
+    max_cycles:,
+    check_with_cpu:
+  )
+    cmd = [
+      'cargo', 'run', '--release', '--features', 'metal', '--bin', 'metal_test', '--',
+      netlist_path, gemparts_path, input_vcd_path, output_vcd_path, '5',
+      '--top-module', top_module,
+      '--input-vcd-scope', top_module,
+      '--max-cycles', max_cycles.to_s
+    ]
+    cmd << '--check-with-cpu' if check_with_cpu
+
+    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+    out, status = Open3.capture2e(*cmd, chdir: gem_root)
+    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
+    elapsed = 1.0e-9 if elapsed <= 0.0
+
+    File.write(log_path, out)
+    expect(status.success?).to be(true), "metal_test failed. See #{log_path}"
+    expect(File.exist?(output_vcd_path)).to be(true)
+    expect(File.size(output_vcd_path)).to be > 0
+    expect(out).to include('sanity test passed!') if check_with_cpu
+
+    {
+      output: out,
+      elapsed_s: elapsed,
+      cycles_run: max_cycles.to_i,
+      cycles_per_sec: max_cycles.to_f / elapsed
+    }
+  end
+
+  def run_gem_sanity_with_compiler_stimulus(program_bytes:, start_pc:, cycles:, top_module: 'cpu8bit', batch_size: 4096, label:)
+    netlist_path, gemparts_path = ensure_gem_cpu8bit_artifacts!(top_module: top_module) # raises when GEM tooling is missing
+    mem_data_trace = collect_compiler_mem_data_trace(
+      program_bytes: program_bytes,
+      start_pc: start_pc,
+      cycles: cycles
+    )
+    raise 'compiler trace generation produced no cycles' if mem_data_trace.empty?
+    effective_cycles = mem_data_trace.length # can be below `cycles` when the CPU halts early
+    compiler_perf = measure_harness_cycles_per_sec( # baseline for every ratio reported below
+      sim: :compile,
+      program_bytes: program_bytes,
+      start_pc: start_pc,
+      cycles: effective_cycles,
+      batch_size: batch_size
+    )
+    arcilator_gpu_perf = measure_harness_cycles_per_sec(
+      sim: :arcilator_gpu,
+      program_bytes: program_bytes,
+      start_pc: start_pc,
+      cycles: effective_cycles,
+      batch_size: batch_size
+    )
+    arcilator_perf = measure_harness_cycles_per_sec(
+      sim: :arcilator,
+      program_bytes: program_bytes,
+      start_pc: start_pc,
+      cycles: effective_cycles,
+      batch_size: batch_size
+    )
+    verilator_perf = measure_harness_cycles_per_sec(
+      sim: :verilator,
+      program_bytes: program_bytes,
+      start_pc: start_pc,
+      cycles: effective_cycles,
+      batch_size: batch_size
+    )
+
+    build_dir = gem_cpu8bit_build_dir
+    input_vcd_path = File.join(build_dir, "cpu8bit_gem_complex_#{cycles}.input.vcd") # named by requested cycles, not by label
+    gpu_only_output_vcd_path = File.join(build_dir, "cpu8bit_gem_complex_#{cycles}.gpu_only.output.vcd")
+    gpu_only_log_path = File.join(build_dir, "cpu8bit_gem_complex_#{cycles}.gpu_only.log")
+    check_output_vcd_path = File.join(build_dir, "cpu8bit_gem_complex_#{cycles}.check.output.vcd")
+    check_log_path = File.join(build_dir, "cpu8bit_gem_complex_#{cycles}.check.log")
+    write_gem_input_vcd(input_vcd_path, mem_data_trace)
+
+    gpu_only = run_gem_metal_test( # first pass: no --check-with-cpu flag
+      netlist_path: netlist_path,
+      gemparts_path: gemparts_path,
+      input_vcd_path: input_vcd_path,
+      output_vcd_path: gpu_only_output_vcd_path,
+      log_path: gpu_only_log_path,
+      top_module: top_module,
+      max_cycles: effective_cycles,
+      check_with_cpu: false
+    )
+    with_check = run_gem_metal_test( # second pass: --check-with-cpu, also expects 'sanity test passed!'
+      netlist_path: netlist_path,
+      gemparts_path: gemparts_path,
+      input_vcd_path: input_vcd_path,
+      output_vcd_path: check_output_vcd_path,
+      log_path: check_log_path,
+      top_module: top_module,
+      max_cycles: effective_cycles,
+      check_with_cpu: true
+    )
+
+    compiler_cps = compiler_perf.fetch(:cycles_per_sec)
+    arcilator_gpu_ratio = compiler_cps.positive? ? (arcilator_gpu_perf.fetch(:cycles_per_sec) / compiler_cps) : 0.0
+    arcilator_ratio = compiler_cps.positive? ? (arcilator_perf.fetch(:cycles_per_sec) / compiler_cps) : 0.0
+    verilator_ratio = compiler_cps.positive? ? (verilator_perf.fetch(:cycles_per_sec) / compiler_cps) : 0.0
+    gpu_only_ratio = compiler_cps.positive? ? (gpu_only.fetch(:cycles_per_sec) / compiler_cps) : 0.0
+    with_check_ratio = compiler_cps.positive? ? (with_check.fetch(:cycles_per_sec) / compiler_cps) : 0.0
+    RSpec.configuration.reporter.message( # informational throughput summary; nothing here is asserted
+      format(
+        '[%s] compiler=%.2f cyc/s, arcilator=%.2f cyc/s (%.3fx), verilator=%.2f cyc/s (%.3fx), arcilator_gpu=%.2f cyc/s (%.3fx), gem(no-check)=%.2f cyc/s (%.3fx), gem(check-with-cpu)=%.2f cyc/s (%.3fx)',
+        label,
+        compiler_cps,
+        arcilator_perf.fetch(:cycles_per_sec),
+        arcilator_ratio,
+        verilator_perf.fetch(:cycles_per_sec),
+        verilator_ratio,
+        arcilator_gpu_perf.fetch(:cycles_per_sec),
+        arcilator_gpu_ratio,
+        gpu_only.fetch(:cycles_per_sec),
+        gpu_only_ratio,
+        with_check.fetch(:cycles_per_sec),
+        with_check_ratio
+      )
+    )
+  end
+
+  it 'matches compiler backend on conway glider 80x24 checkpoints', timeout: 420 do
+    skip 'IR compiler backend unavailable' unless compiler_backend_available?
+    assert_native_runner_backends_available!
+
+    bin_path = File.expand_path('../../../../../examples/8bit/software/bin/conway_glider_80x24.bin', __dir__)
+    program = File.binread(bin_path).bytes
+
+    run_checkpoint_parity(
+      program_bytes: program,
+      start_pc: 0x20,
+      checkpoints: [50_000, 100_000, 200_000],
+      regions: [
+        { start: DISPLAY_START, length: DISPLAY_LEN },
+        { start: 0x0200, length: 0x240 }
+      ]
+    )
+
+    run_gem_sanity_with_compiler_stimulus(
+      program_bytes: program,
+      start_pc: 0x20,
+      cycles: 50_000,
+      label: 'conway'
+    )
+  rescue RuntimeError => e
+    skip "GEM backend unavailable for conway parity: #{e.message}"
+  end
+
+  it 'matches compiler backend on mandelbrot 80x24 checkpoints', timeout: 420 do
+    skip 'IR compiler backend unavailable' unless compiler_backend_available?
+    assert_native_runner_backends_available!
+
+    bin_path = File.expand_path('../../../../../examples/8bit/software/bin/mandelbrot_80x24.bin', __dir__)
+    program = File.binread(bin_path).bytes
+
+    run_checkpoint_parity(
+      program_bytes: program,
+      start_pc: 0x00,
+      checkpoints: [40_000, 80_000, 120_000],
+      regions: [
+        { start: DISPLAY_START, length: DISPLAY_LEN },
+        { start: 0x0100, length: 0x300 }
+      ]
+    )
+
+    run_gem_sanity_with_compiler_stimulus(
+      program_bytes: program,
+      start_pc: 0x00,
+      cycles: 40_000,
+      label: 'mandelbrot'
+    )
+  rescue RuntimeError => e
+    skip "GEM backend unavailable for mandelbrot parity: #{e.message}"
+  end
+
+  it 'matches compiler backend on long-running arithmetic loop checkpoints', timeout: 420 do
+    skip 'IR compiler backend unavailable' unless compiler_backend_available?
+    assert_native_runner_backends_available!
+
+    program = Assembler.build(0x40) do |p|
+      p.instr :LDI, 1
+      p.instr :STA, 0x02
+      p.instr :LDI, 0
+      p.instr :STA, 0x0E
+
+      p.label :loop
+      p.instr :LDA, 0x0E
+      p.instr :ADD, 0x02
+      p.instr :STA, 0x0E
+      p.instr :LDA, 0x0E
+      p.instr :STA, 0x90
+      p.instr :JMP_LONG, :loop
+    end
+
+    run_checkpoint_parity(
+      program_bytes: program,
+      start_pc: 0x40,
+      checkpoints: [25_000, 50_000, 100_000],
+      regions: [
+        { start: 0x0080, length: 0x40 },
+        { start: 0x0800, length: 0x80 }
+      ]
+    )
+
+    run_gem_sanity_with_compiler_stimulus(
+      program_bytes: program,
+      start_pc: 0x40,
+      cycles: 25_000,
+      label: 'arith-loop'
+    )
+  rescue RuntimeError => e
+    skip "GEM backend unavailable for arithmetic parity: #{e.message}"
+  end
+end
diff --git a/spec/examples/8bit/hdl/cpu/fast_harness_arcilator_gpu_spec.rb b/spec/examples/8bit/hdl/cpu/fast_harness_arcilator_gpu_spec.rb
index 83cc20a4..36a584e6 100644
--- a/spec/examples/8bit/hdl/cpu/fast_harness_arcilator_gpu_spec.rb
+++ b/spec/examples/8bit/hdl/cpu/fast_harness_arcilator_gpu_spec.rb
@@ -11,6 +11,7 @@
       backend: :arcilator_gpu,
       runner_mode?: true,
       runner_kind: :cpu8bit,
+      runner_parallel_instances: 1,
       poke: true,
       evaluate: true
     )
@@ -76,4 +77,11 @@
     expect(harness.run_cycles(64, batch_size: 16)).to eq(0)
     expect(harness.halted).to be(true)
   end
+
+  it 'reports runner parallel instances in arcilator_gpu mode' do
+    allow(sim).to receive(:runner_parallel_instances).and_return(8)
+
+    harness = described_class.new(nil, sim: :arcilator_gpu)
+    expect(harness.parallel_instances).to eq(8)
+  end
 end
diff --git a/spec/examples/8bit/hdl/cpu/gem_metal_cpu8bit_parity_spec.rb b/spec/examples/8bit/hdl/cpu/gem_metal_cpu8bit_parity_spec.rb
new file mode 100644
index 00000000..3cfcd17f
--- /dev/null
+++ b/spec/examples/8bit/hdl/cpu/gem_metal_cpu8bit_parity_spec.rb
@@ -0,0 +1,177 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'fileutils'
+require 'open3'
+require_relative '../../../../../examples/8bit/hdl/cpu/cpu'
+
+RSpec.describe '8-bit CPU GEM Metal parity (Yosys -> GEM)', timeout: 300 do
+  def command_available?(tool)
+    ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |path|
+      File.executable?(File.join(path, tool))
+    end
+  end
+
+  def run_cmd!(cmd, chdir:, log_path:, step:)
+    out, status = Open3.capture2e(*cmd, chdir: chdir)
+    File.write(log_path, out)
+    expect(status.success?).to be(true), "#{step} failed (exit #{status.exitstatus}). Log: #{log_path}"
+    out
+  end
+
+  def write_input_vcd(path, cycles:)
+    cycles = [cycles.to_i, 1].max
+    time = 0
+    rst = 1
+    clk = 0
+
+    File.open(path, 'w') do |f|
+      f.puts '$timescale 1ns $end'
+      f.puts '$scope module cpu8bit $end'
+      f.puts '$var wire 1 ! clk $end'
+      f.puts '$var wire 1 " rst $end'
+      f.puts '$var wire 8 # mem_data_in $end'
+      f.puts '$upscope $end'
+      f.puts '$enddefinitions $end'
+      f.puts '$dumpvars'
+      f.puts '0!'
+      f.puts '1"'
+      f.puts 'b00000000 #'
+      f.puts '$end'
+
+      cycles.times do |i|
+        time += 1
+        f.puts "##{time}"
+        f.puts '1!'
+
+        if i.zero?
+          time += 1
+          f.puts "##{time}"
+          f.puts '0!'
+          rst = 0
+          f.puts "#{rst}\""
+          next
+        end
+
+        time += 1
+        f.puts "##{time}"
+        f.puts '0!'
+      end
+    end
+  end
+
+  it 'runs GEM Metal parity checks on the same Yosys path as gem_metal_cpu8bit' do
+    skip 'cargo not found in PATH' unless command_available?('cargo')
+    skip 'yosys not found in PATH' unless command_available?('yosys')
+
+    project_root = File.expand_path('../../../../../', __dir__) # five levels above this spec's directory
+    gem_root = File.join(project_root, 'external', 'GEM')
+    skip "external GEM repo not found at #{gem_root}" unless Dir.exist?(gem_root)
+
+    top_module = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_TOP', 'cpu8bit')
+    build_dir = File.expand_path(
+      ENV.fetch('RHDL_GEM_METAL_CPU8BIT_BUILD_DIR', File.join(project_root, 'examples/8bit/.gem_metal_cpu8bit'))
+    )
+    FileUtils.mkdir_p(build_dir)
+
+    rtl_path = File.join(build_dir, 'cpu8bit_rtl.v')
+    yosys_script_path = File.join(build_dir, 'cpu8bit_gem.ys')
+    yosys_log_path = File.join(build_dir, 'cpu8bit_yosys.log')
+    cut_map_log_path = File.join(build_dir, 'cpu8bit_cut_map.log')
+    metal_dummy_log_path = File.join(build_dir, 'cpu8bit_metal_dummy.log')
+    metal_parity_log_path = File.join(build_dir, 'cpu8bit_metal_parity.log')
+
+    netlist_path = File.expand_path( # env override, else a file inside build_dir
+      ENV.fetch('RHDL_GEM_METAL_CPU8BIT_NETLIST', File.join(build_dir, 'cpu8bit_gatelevel.gv'))
+    )
+    gemparts_path = File.expand_path( # env override, else a file inside build_dir
+      ENV.fetch('RHDL_GEM_METAL_CPU8BIT_GEMPARTS', File.join(build_dir, 'cpu8bit.gemparts'))
+    )
+
+    level_split = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_LEVEL_SPLIT', '').strip # empty => flag is not passed
+    max_stage_degrad = ENV.fetch('RHDL_GEM_METAL_CPU8BIT_MAX_STAGE_DEGRAD', '').strip # empty => flag is not passed
+
+    aigpdk_nomem_lib = File.join(gem_root, 'aigpdk', 'aigpdk_nomem.lib')
+    skip "missing AIGPDK library at #{aigpdk_nomem_lib}" unless File.exist?(aigpdk_nomem_lib)
+
+    unless File.exist?(netlist_path) # an existing netlist is reused as-is, with no staleness check
+      File.write(rtl_path, RHDL::HDL::CPU::CPU.to_verilog_hierarchy(top_name: top_module))
+
+      yosys_script = <<~YOSYS
+        read_verilog "#{rtl_path}"
+        hierarchy -check -top #{top_module}
+        synth -flatten
+        delete t:\\$print
+        dfflibmap -liberty "#{aigpdk_nomem_lib}"
+        opt_clean -purge
+        abc -liberty "#{aigpdk_nomem_lib}"
+        opt_clean -purge
+        write_verilog "#{netlist_path}"
+      YOSYS
+      File.write(yosys_script_path, yosys_script)
+
+      run_cmd!(
+        ['yosys', '-q', '-s', yosys_script_path],
+        chdir: project_root,
+        log_path: yosys_log_path,
+        step: 'yosys synthesis'
+      )
+    end
+
+    unless File.exist?(gemparts_path) # an existing partition file is reused as-is
+      cut_map_cmd = [
+        'cargo', 'run', '--release', '--features', 'metal', '--bin', 'cut_map_interactive', '--',
+        netlist_path
+      ]
+      cut_map_cmd += ['--top-module', top_module]
+      cut_map_cmd += ['--level-split', level_split] unless level_split.empty?
+      cut_map_cmd += ['--max-stage-degrad', max_stage_degrad] unless max_stage_degrad.empty?
+      cut_map_cmd << gemparts_path # output path is the final positional argument
+
+      run_cmd!(
+        cut_map_cmd,
+        chdir: gem_root,
+        log_path: cut_map_log_path,
+        step: 'cut_map_interactive'
+      )
+    end
+
+    expect(File.exist?(netlist_path)).to be(true), "missing netlist at #{netlist_path}"
+    expect(File.exist?(gemparts_path)).to be(true), "missing gemparts at #{gemparts_path}"
+
+    # Run the same benchmark path used by gem_metal_cpu8bit.
+    dummy_out = run_cmd!(
+      [
+        'cargo', 'run', '--release', '--features', 'metal', '--bin', 'metal_dummy_test', '--',
+        netlist_path, gemparts_path, '5', '256'
+      ],
+      chdir: gem_root,
+      log_path: metal_dummy_log_path,
+      step: 'metal_dummy_test'
+    )
+    expect(dummy_out).to include('metal_dummy_test: logical_dispatches=')
+
+    # Parity check on the same netlist/partitions (GPU path must match GEM CPU execution).
+    input_vcd_path = File.join(build_dir, 'cpu8bit_gem_input.vcd')
+    output_vcd_path = File.join(build_dir, 'cpu8bit_gem_output.vcd')
+    write_input_vcd(input_vcd_path, cycles: 64) # keep in sync with '--max-cycles' below
+
+    parity_out = run_cmd!(
+      [
+        'cargo', 'run', '--release', '--features', 'metal', '--bin', 'metal_test', '--',
+        netlist_path, gemparts_path, input_vcd_path, output_vcd_path, '5',
+        '--top-module', top_module,
+        '--input-vcd-scope', top_module,
+        '--check-with-cpu',
+        '--max-cycles', '64'
+      ],
+      chdir: gem_root,
+      log_path: metal_parity_log_path,
+      step: 'metal_test parity'
+    )
+
+    expect(parity_out).to include('sanity test passed!')
+    expect(File.exist?(output_vcd_path)).to be(true)
+    expect(File.size(output_vcd_path)).to be > 0
+  end
+end
diff --git a/spec/examples/8bit/utilities/runners/arcilator_gpu_runner_spec.rb b/spec/examples/8bit/utilities/runners/arcilator_gpu_runner_spec.rb
index e4d906fb..cd056f23 100644
--- a/spec/examples/8bit/utilities/runners/arcilator_gpu_runner_spec.rb
+++ b/spec/examples/8bit/utilities/runners/arcilator_gpu_runner_spec.rb
@@ -1,9 +1,21 @@
 # frozen_string_literal: true
 
 require 'spec_helper'
+require 'tmpdir'
 require_relative '../../../../../examples/8bit/utilities/runners/arcilator_gpu_runner'
 
 RSpec.describe RHDL::Examples::CPU8Bit::ArcilatorGpuRunner do
+  around do |example|
+    original_cpu8bit_instances = ENV['RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES']
+    original_bench_instances = ENV['RHDL_BENCH_ARCILATOR_GPU_INSTANCES']
+    begin
+      example.run
+    ensure
+      ENV['RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES'] = original_cpu8bit_instances
+      ENV['RHDL_BENCH_ARCILATOR_GPU_INSTANCES'] = original_bench_instances
+    end
+  end
+
   describe '.detect_gpu_option_tokens' do
     around do |example|
       original = ENV['RHDL_ARCILATOR_GPU_OPTION']
@@ -33,7 +45,7 @@
       allow(described_class).to receive(:command_success?).and_return(true)
     end
 
-    it 'reports ready when tools and gpu option are available' do
+    it 'reports ready when tools are available' do
       allow(described_class).to receive(:command_available?).and_return(true)
       allow(described_class).to receive(:command_output).with(%w[arcilator --help]).and_return('--arc-to-gpu')
 
@@ -44,13 +56,132 @@
       expect(status[:gpu_option_tokens]).to eq(['--arc-to-gpu'])
     end
 
-    it 'reports missing ArcToGPU capability when no gpu option is present' do
+    it 'remains ready when no gpu option is advertised in arcilator help' do
       allow(described_class).to receive(:command_available?).and_return(true)
       allow(described_class).to receive(:command_output).with(%w[arcilator --help]).and_return('--help')
 
       status = described_class.status
-      expect(status[:ready]).to be(false)
-      expect(status[:missing_capabilities]).to include(/ArcToGPU/)
+      expect(status[:ready]).to be(true)
+      expect(status[:missing_capabilities]).to eq([])
+      expect(status[:gpu_option_tokens]).to eq([])
+    end
+  end
+
+  describe '#build_simulation' do
+    def with_build_dir(dir) # Rebinds described_class::BUILD_DIR to `dir` while the block runs, then restores it.
+      original = described_class::BUILD_DIR
+      described_class.send(:remove_const, :BUILD_DIR) # remove first so const_set does not warn about redefinition
+      described_class.const_set(:BUILD_DIR, dir)
+      yield
+    ensure
+      described_class.send(:remove_const, :BUILD_DIR)
+      described_class.const_set(:BUILD_DIR, original)
+    end
+
+    def write_artifact(path, contents = 'artifact') # creates a placeholder file; only its presence and mtime are used here
+      File.write(path, contents)
+    end
+
+    def exercise_build_simulation(dir) # Yields an allocated runner plus placeholder artifact paths inside `dir`.
+      with_build_dir(dir) do
+        runner = described_class.allocate # allocate skips #initialize
+        shared_lib_path = runner.send(:shared_lib_path)
+        fir_file = File.join(dir, 'cpu8bit.fir')
+        mlir_file = File.join(dir, 'cpu8bit_hw.mlir')
+        ll_file = File.join(dir, 'cpu8bit_arcgpu.ll')
+        state_file = File.join(dir, 'cpu8bit_state.json')
+        obj_file = File.join(dir, 'cpu8bit_arcgpu.o')
+        wrapper_file = File.join(dir, 'cpu8bit_arcgpu_wrapper.cpp')
+
+        [
+          fir_file,
+          mlir_file,
+          ll_file,
+          state_file,
+          obj_file,
+          wrapper_file,
+          shared_lib_path
+        ].each { |path| write_artifact(path) }
+
+        allow(runner).to receive(:write_file_if_changed).and_return(false) # report generated sources as unchanged
+        allow(runner).to receive(:write_wrapper).and_return(false)
+        allow(runner).to receive(:compile_with_arcilator) # stubbed so no external tool runs; calls are recorded
+        allow(runner).to receive(:link_shared_library)
+
+        yield runner, fir_file, mlir_file, ll_file, state_file, obj_file, wrapper_file, shared_lib_path
+      end
+    end
+
+    it 'rebuilds generated GPU objects when the runner source is newer than the cached object' do
+      Dir.mktmpdir('arcilator-gpu-runner-spec') do |dir|
+        exercise_build_simulation(dir) do |runner, fir_file, mlir_file, ll_file, state_file, obj_file, wrapper_file, shared_lib_path|
+          stale = Time.at(0) # Unix epoch
+          File.utime(stale, stale, obj_file)
+
+          runner.send(:build_simulation)
+
+          expect(runner).to have_received(:compile_with_arcilator).with(fir_file, mlir_file, ll_file, state_file, obj_file)
+          expect(runner).to have_received(:link_shared_library).with(wrapper_file, obj_file, shared_lib_path)
+        end
+      end
+    end
+
+    it 'relinks when the shared library is older than the generated wrapper and object' do
+      Dir.mktmpdir('arcilator-gpu-runner-spec') do |dir|
+        exercise_build_simulation(dir) do |runner, _fir_file, _mlir_file, _ll_file, _state_file, obj_file, wrapper_file, shared_lib_path|
+          fresh = Time.now + 60 # one minute in the future
+          stale = Time.at(0) # Unix epoch
+          File.utime(fresh, fresh, obj_file)
+          File.utime(fresh, fresh, wrapper_file)
+          File.utime(stale, stale, shared_lib_path)
+
+          runner.send(:build_simulation)
+
+          expect(runner).not_to have_received(:compile_with_arcilator)
+          expect(runner).to have_received(:link_shared_library).with(wrapper_file, obj_file, shared_lib_path)
+        end
+      end
+    end
+  end
+
+  describe 'instance count' do
+    let(:runner) { described_class.allocate } # allocate skips #initialize
+
+    it 'defaults to one instance' do
+      ENV.delete('RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES')
+      ENV.delete('RHDL_BENCH_ARCILATOR_GPU_INSTANCES')
+
+      expect(runner.send(:normalize_instance_count, nil)).to eq(1)
+    end
+
+    it 'uses the CPU8bit-specific instance env var' do
+      ENV['RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES'] = '8' # expected to win over the benchmark-wide value below
+      ENV['RHDL_BENCH_ARCILATOR_GPU_INSTANCES'] = '4'
+
+      expect(runner.send(:normalize_instance_count, nil)).to eq(8)
+    end
+
+    it 'falls back to the benchmark-wide instance env var' do
+      ENV.delete('RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES')
+      ENV['RHDL_BENCH_ARCILATOR_GPU_INSTANCES'] = '16'
+
+      expect(runner.send(:normalize_instance_count, nil)).to eq(16)
+    end
+
+    it 'clamps the instance count to the maximum' do
+      ENV['RHDL_CPU8BIT_ARCILATOR_GPU_INSTANCES'] = '999999'
+
+      expect(runner.send(:normalize_instance_count, nil)).to eq(described_class::MAX_INSTANCE_COUNT)
+    end
+
+    it 'reports the configured parallel instance count' do
+      allow(described_class).to receive(:ensure_available!).and_return({}) # skip the host availability check
+      allow_any_instance_of(described_class).to receive(:build_simulation) # stubbed so .new does not invoke the real method
+      allow_any_instance_of(described_class).to receive(:load_library)
+      allow_any_instance_of(described_class).to receive(:reset)
+
+      instance = described_class.new(instances: 12)
+      expect(instance.runner_parallel_instances).to eq(12)
     end
   end
 end
diff --git a/spec/examples/apple2/runners/arcilator_gpu_runner_spec.rb b/spec/examples/apple2/runners/arcilator_gpu_runner_spec.rb
new file mode 100644
index 00000000..6376cac8
--- /dev/null
+++ b/spec/examples/apple2/runners/arcilator_gpu_runner_spec.rb
@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'tmpdir'
+require_relative '../../../../examples/apple2/utilities/runners/arcilator_gpu_runner'
+
+RSpec.describe RHDL::Examples::Apple2::ArcilatorGpuRunner do
+  describe '.status' do
+    # All three host probes are stubbed to succeed, so .status should report
+    # a ready backend with nothing missing.
+    it 'reports ready when required tools are available' do
+      allow(described_class).to receive_messages(command_available?: true, macos_host?: true, command_success?: true)
+
+      report = described_class.status
+      expect(report[:ready]).to be(true)
+      expect(report[:missing_tools]).to eq([])
+    end
+
+    # With the macOS check and the command-success probe failing, .status
+    # should list the Metal toolchain as missing.
+    it 'reports missing metal toolchain on non-macos hosts' do
+      allow(described_class).to receive_messages(command_available?: true, macos_host?: false, command_success?: false)
+
+      report = described_class.status
+      expect(report[:ready]).to be(false)
+      expect(report[:missing_tools]).to include('macOS Metal toolchain')
+    end
+  end
+
+  describe '.ensure_available!' do
+    it 'raises with a clear message when unavailable' do
+      unavailable = { ready: false, missing_tools: %w[xcrun metal metallib] }
+      allow(described_class).to receive(:status).and_return(unavailable)
+
+      expect do
+        described_class.ensure_available!
+      end.to raise_error(ArgumentError, /arcilator_gpu backend unavailable/i)
+    end
+  end
+
+  describe 'instance metadata' do
+    # Class#allocate skips #initialize, so these examples never trigger
+    # whatever setup the constructor performs.
+    let(:runner) { described_class.allocate }
+
+    it 'reports arcilator gpu simulator type' do
+      expect(runner.simulator_type).to eq(:hdl_arcilator_gpu)
+    end
+
+    it 'reports dry-run metadata for arcilator gpu mode' do
+      expected = { mode: :arcilator_gpu, simulator_type: :hdl_arcilator_gpu, native: true }
+
+      expect(runner.dry_run_info).to include(expected)
+    end
+  end
+
+  describe '#build_arcilator_gpu_simulation' do
+    it 'clears the clang module cache before compiling the Metal shader' do
+      Dir.mktmpdir('apple2-arcilator-gpu-runner-spec') do |tmp|
+        gpu_runner = described_class.allocate
+        gpu_runner.instance_variable_set(:@instance_count, 1)
+        dylib = File.join(tmp, 'libapple2_arcilator_gpu_sim.dylib')
+
+        # Stub the runner's build steps; the FileUtils calls still reach the real filesystem.
+        allow(gpu_runner).to receive_messages(build_dir: tmp, shared_lib_path: dylib)
+        allow(gpu_runner).to receive_messages(export_firrtl: nil, write_wrapper: nil, link_shared_library: nil)
+        allow(gpu_runner).to receive(:load_shared_library) do
+          gpu_runner.instance_variable_set(:@sim_ctx, Object.new)
+        end
+        allow(gpu_runner).to receive(:run_or_raise)
+        allow(RHDL::Codegen::FIRRTL::ArcToGpuLowering).to receive(:lower)
+        allow(FileUtils).to receive(:mkdir_p).and_call_original
+        allow(FileUtils).to receive(:rm_rf).and_call_original
+
+        cache_dir = File.join(tmp, 'clang_module_cache')
+        stale_module = File.join(cache_dir, 'stale.pcm')
+        FileUtils.mkdir_p(cache_dir)
+        File.write(stale_module, 'stale')
+
+        gpu_runner.send(:build_arcilator_gpu_simulation)
+
+        expect(FileUtils).to have_received(:rm_rf).with(cache_dir)
+        expect(File.exist?(stale_module)).to be(false)
+      end
+    end
+  end
+end
diff --git a/spec/examples/riscv/runners/arcilator_gpu_runner_spec.rb b/spec/examples/riscv/runners/arcilator_gpu_runner_spec.rb
new file mode 100644
index 00000000..16aa4664
--- /dev/null
+++ b/spec/examples/riscv/runners/arcilator_gpu_runner_spec.rb
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'tmpdir'
+require_relative '../../../../examples/riscv/utilities/runners/arcilator_gpu_runner'
+
+RSpec.describe RHDL::Examples::RISCV::ArcilatorGpuRunner do
+  describe '#compile_metal_shader' do
+    it 'clears the clang module cache before compiling the Metal shader' do
+      Dir.mktmpdir('riscv-arcilator-gpu-runner-spec') do |tmp|
+        gpu_runner = described_class.allocate
+        cache_dir = File.join(tmp, 'clang_module_cache')
+        stale_module = File.join(cache_dir, 'stale.pcm')
+
+        # run_or_raise is stubbed out; the FileUtils calls are spied on but still run.
+        allow(gpu_runner).to receive_messages(build_dir: tmp, run_or_raise: nil)
+        allow(FileUtils).to receive(:mkdir_p).and_call_original
+        allow(FileUtils).to receive(:rm_rf).and_call_original
+        FileUtils.mkdir_p(cache_dir)
+        File.write(stale_module, 'stale')
+
+        gpu_runner.send(
+          :compile_metal_shader,
+          metal_source_file: File.join(tmp, 'kernel.metal'),
+          metal_air_file: File.join(tmp, 'kernel.air'),
+          metal_lib_file: File.join(tmp, 'kernel.metallib'),
+          log_file: File.join(tmp, 'build.log')
+        )
+
+        expect(FileUtils).to have_received(:rm_rf).with(cache_dir)
+        expect(File.exist?(stale_module)).to be(false)
+      end
+    end
+  end
+
+  describe '#build_config_signature' do
+    it 'tracks the absolute build directory so repo moves invalidate stale GPU artifacts' do
+      build_dir = '/tmp/riscv-gpu-build'
+      gpu_runner = described_class.allocate
+      gpu_runner.instance_variable_set(:@shared_lib_name, 'libriscv_arcilator_gpu_sim.so')
+      allow(gpu_runner).to receive(:build_dir).and_return(build_dir)
+
+      config = gpu_runner.send(:build_config_signature)
+      expect(config['build_dir']).to eq(build_dir)
+    end
+  end
+
+  describe '#write_wrapper' do
+    it 'resolves the metallib relative to the loaded shared library at runtime' do
+      Dir.mktmpdir('riscv-arcilator-gpu-wrapper-spec') do |tmp|
+        metadata_file = File.join(tmp, 'metadata.json')
+        wrapper_file = File.join(tmp, 'wrapper.mm')
+
+        # Small metadata fixture: one 32-bit state word, a 'kernel' entry and
+        # empty input/output/state layouts.
+        metal_section = {
+          'state_count' => 1,
+          'state_scalar_bits' => 32,
+          'entry' => 'kernel',
+          'runtime_input_layout' => [],
+          'runtime_output_layout' => []
+        }
+        metadata = { 'metal' => metal_section, 'state_layout' => [] }
+        File.write(metadata_file, JSON.pretty_generate(metadata))
+
+        gpu_runner = described_class.allocate
+        gpu_runner.instance_variable_set(:@build_variant, 'arcilator_gpu')
+
+        # A placeholder metallib path is passed in; the checks below only
+        # inspect the text of the generated wrapper.
+        gpu_runner.send(
+          :write_wrapper,
+          path: wrapper_file,
+          metadata_path: metadata_file,
+          metallib_path: '/tmp/stale/riscv_cpu_arc_to_gpu.metallib'
+        )
+
+        generated = File.read(wrapper_file)
+        expect(generated).to include(
+          '#include <dlfcn.h>',
+          'resolveMetallibPath()',
+          'stringByDeletingLastPathComponent',
+          'kMetallibFilename'
+        )
+      end
+    end
+  end
+
+  describe '#validate_sim_context!' do
+    it 'raises when sim_create returns a null simulation context' do
+      gpu_runner = described_class.allocate
+      gpu_runner.instance_variable_set(:@sim_ctx, 0)
+      expect do
+        gpu_runner.send(:validate_sim_context!)
+      end.to raise_error(LoadError, /sim_create returned null/i)
+    end
+  end
+end
diff --git a/spec/examples/riscv/runners/hdl_harness_spec.rb b/spec/examples/riscv/runners/hdl_harness_spec.rb
index cc42dd86..3f756dab 100644
--- a/spec/examples/riscv/runners/hdl_harness_spec.rb
+++ b/spec/examples/riscv/runners/hdl_harness_spec.rb
@@ -7,11 +7,20 @@
   before(:all) do
     @verilator_available = HdlToolchain.verilator_available?
     @arcilator_available = HdlToolchain.arcilator_available?
+    @arcilator_gpu_available = false
 
     if @verilator_available || @arcilator_available
       require_relative '../../../../examples/riscv/utilities/runners/headless_runner'
       require_relative '../../../../examples/riscv/utilities/assembler'
     end
+
+    # Probe the GPU runner separately; a LoadError or NameError here just marks it unavailable.
+    begin
+      require_relative '../../../../examples/riscv/utilities/runners/arcilator_gpu_runner'
+      @arcilator_gpu_available = RHDL::Examples::RISCV::ArcilatorGpuRunner.available?
+    rescue LoadError, NameError
+      @arcilator_gpu_available = false
+    end
   end
 
   describe 'VerilogRunner' do
@@ -74,6 +83,37 @@
     end
   end
 
+  describe 'ArcilatorGpuRunner' do
+    it 'is defined when arcilator gpu toolchain is available' do
+      skip 'ArcilatorGPU runner not available' unless @arcilator_gpu_available
+      require_relative '../../../../examples/riscv/utilities/runners/arcilator_gpu_runner'
+      expect(defined?(RHDL::Examples::RISCV::ArcilatorGpuRunner)).to eq('constant')
+    end
+
+    it 'has the required public interface methods' do
+      skip 'ArcilatorGPU runner not available' unless @arcilator_gpu_available
+      require_relative '../../../../examples/riscv/utilities/runners/arcilator_gpu_runner'
+
+      expected_api = %i[
+        native? simulator_type backend reset!
+        run_cycles clock_count
+        read_reg read_pc load_program load_data
+        read_inst_word read_data_word write_data_word
+        set_interrupts set_plic_sources
+        uart_receive_byte uart_receive_bytes uart_receive_text
+        uart_tx_bytes clear_uart_tx_bytes
+        load_virtio_disk read_virtio_disk_byte
+        state current_inst
+        dispatch_count wait_count fast_dispatch_count fallback_dispatch_count
+      ]
+
+      runner_methods = RHDL::Examples::RISCV::ArcilatorGpuRunner.instance_methods
+      expected_api.each do |name|
+        expect(runner_methods).to include(name), "Missing method: #{name}"
+      end
+    end
+  end
+
   describe 'HeadlessRunner integration' do
     it 'creates verilator-backed runner' do
       skip 'Verilator not available' unless @verilator_available
@@ -99,6 +139,18 @@
       skip "Arcilator backend unavailable: #{e.message}"
     end
 
+    it 'creates arcilator gpu-backed runner' do
+      skip 'ArcilatorGPU runner not available' unless @arcilator_gpu_available
+
+      # A LoadError or RuntimeError raised below is turned into a skip by the rescue clause.
+      headless = RHDL::Examples::RISCV::HeadlessRunner.new(mode: :arcilator_gpu)
+      expect(headless).to have_attributes(mode: :arcilator_gpu, effective_mode: :arcilator_gpu)
+      expect(headless.cpu).to be_a(RHDL::Examples::RISCV::ArcilatorGpuRunner)
+      expect(headless.cpu.simulator_type).to eq(:hdl_arcilator_gpu)
+    rescue LoadError, RuntimeError => error
+      skip "ArcilatorGPU backend unavailable: #{error.message}"
+    end
+
     it 'creates ruby-backed runner' do
       runner = RHDL::Examples::RISCV::HeadlessRunner.new(mode: :ruby, sim: :ruby)
       expect(runner.mode).to eq(:ruby)
diff --git a/spec/examples/riscv/utilities/tasks/run_task_spec.rb b/spec/examples/riscv/utilities/tasks/run_task_spec.rb
index 6532cfb5..2e1c09f8 100644
--- a/spec/examples/riscv/utilities/tasks/run_task_spec.rb
+++ b/spec/examples/riscv/utilities/tasks/run_task_spec.rb
@@ -289,6 +289,7 @@ def find_li_values(words, rd)
     rescue LoadError, RuntimeError => e
       skip "Arcilator backend unavailable: #{e.message}"
     end
+
   end
 
   describe 'program execution' do
diff --git a/spec/rhdl/cli/tasks/benchmark_task_spec.rb b/spec/rhdl/cli/tasks/benchmark_task_spec.rb
index 4b86365a..9c7ca402 100644
--- a/spec/rhdl/cli/tasks/benchmark_task_spec.rb
+++ b/spec/rhdl/cli/tasks/benchmark_task_spec.rb
@@ -2,6 +2,7 @@
 
 require 'spec_helper'
 require 'rhdl/cli'
+require 'tmpdir'
 
 RSpec.describe RHDL::CLI::Tasks::BenchmarkTask do
   describe 'initialization' do
@@ -78,6 +79,7 @@
         task.run
       end
     end
+
   end
 
   describe '#benchmark_gates' do
@@ -87,6 +89,447 @@
     end
   end
 
+  describe '#benchmark_cpu8bit' do
+    it 'maps arc filter alias to the arcilator_gpu runner' do
+      saved_backends = ENV['RHDL_BENCH_BACKENDS']
+      ENV['RHDL_BENCH_BACKENDS'] = 'arc'
+
+      bench = described_class.new(type: :cpu8bit, cycles: 16, batch_size: 8)
+      harness_class = RHDL::HDL::CPU::FastHarness
+      ram = double('memory', load: true)
+      harness = double('fast_harness', memory: ram, pc: 0, run_cycles: 16, parallel_instances: 1, :"pc=" => true)
+      allow(harness_class).to receive(:arcilator_gpu_status).and_return({ ready: true })
+      allow(harness_class).to receive(:new).with(nil, sim: :arcilator_gpu).and_return(harness)
+
+      expect { bench.benchmark_cpu8bit }.to output(/ArcilatorGPU/).to_stdout
+      expect(harness_class).to have_received(:new).with(nil, sim: :arcilator_gpu)
+    ensure
+      ENV['RHDL_BENCH_BACKENDS'] = saved_backends
+    end
+
+    it 'reports effective throughput when runner has parallel instances' do
+      saved_backends = ENV['RHDL_BENCH_BACKENDS']
+      ENV['RHDL_BENCH_BACKENDS'] = 'arc'
+
+      bench = described_class.new(type: :cpu8bit, cycles: 16, batch_size: 8)
+      harness_class = RHDL::HDL::CPU::FastHarness
+      ram = double('memory', load: true)
+      harness = double('fast_harness', memory: ram, pc: 0, run_cycles: 16, parallel_instances: 8, :"pc=" => true)
+      allow(harness_class).to receive(:arcilator_gpu_status).and_return({ ready: true })
+      allow(harness_class).to receive(:new).with(nil, sim: :arcilator_gpu).and_return(harness)
+
+      printed = capture_stdout { bench.benchmark_cpu8bit }
+      expect(printed).to match(/Instances:\s+8/)
+      expect(printed).to include('Effective:')
+    ensure
+      ENV['RHDL_BENCH_BACKENDS'] = saved_backends
+    end
+
+    it 'runs GemMetal by default alongside the other runners' do
+      saved_backends = ENV.delete('RHDL_BENCH_BACKENDS')
+
+      # Report the IR compiler and ArcilatorGPU as unavailable; GemMetal is stubbed below.
+      stub_const('RHDL::Codegen::IR::IR_COMPILER_AVAILABLE', false)
+      allow(RHDL::HDL::CPU::FastHarness).to receive(:arcilator_gpu_status).and_return({ ready: false })
+
+      bench = described_class.new(type: :cpu8bit, cycles: 16, batch_size: 8)
+      gem_metal_result = {
+        name: 'GemMetal',
+        status: :success,
+        init_time: 0.25,
+        run_time: 0.5,
+        cycles_per_sec: 32.0
+      }
+      allow(bench).to receive(:benchmark_gem_metal_cpu8bit)
+        .with(cycles: 16, standalone: false).and_return(gem_metal_result)
+
+      printed = capture_stdout { bench.benchmark_cpu8bit }
+      expect(printed).to include('GemMetal')
+      expect(bench).to have_received(:benchmark_gem_metal_cpu8bit).with(cycles: 16, standalone: false)
+    ensure
+      ENV['RHDL_BENCH_BACKENDS'] = saved_backends
+    end
+  end
+
+  describe '#benchmark_apple2' do
+    it 'runs GemMetal by default alongside the other runners' do
+      saved_backends = ENV.delete('RHDL_BENCH_BACKENDS')
+
+      require_relative '../../../../examples/apple2/hdl'
+      require_relative '../../../../examples/apple2/utilities/runners/arcilator_gpu_runner'
+
+      # Zero-filled images are returned for the ROM and memory-dump reads stubbed below.
+      rom_image = ([0] * 0x3000).pack('C*')
+      ram_image = ([0] * (48 * 1024)).pack('C*')
+
+      allow(File).to receive(:exist?).and_call_original
+      allow(File).to receive(:exist?).with(/appleiigo\.rom|karateka_mem\.bin/).and_return(true)
+      allow(File).to receive(:binread).and_call_original
+      allow(File).to receive(:binread).with(/appleiigo\.rom/).and_return(rom_image)
+      allow(File).to receive(:binread).with(/karateka_mem\.bin/).and_return(ram_image)
+      allow(RHDL::Examples::Apple2::Apple2).to receive(:to_flat_ir).and_return(:ir)
+      allow(RHDL::Codegen::IR::IRToJson).to receive(:convert).with(:ir).and_return('{}')
+      ir_namespace = RHDL::Codegen::IR
+      allow(ir_namespace).to receive(:const_get).and_call_original
+      %i[IR_INTERPRETER_AVAILABLE IR_JIT_AVAILABLE IR_COMPILER_AVAILABLE].each do |flag|
+        allow(ir_namespace).to receive(:const_get).with(flag).and_return(false)
+      end
+
+      bench = described_class.new(type: :apple2, cycles: 16)
+      allow(bench).to receive_messages(verilator_available?: false, arcilator_available?: false)
+      allow(RHDL::Examples::Apple2::ArcilatorGpuRunner).to receive(:available?).and_return(false)
+      gem_metal_result = {
+        name: 'GemMetal',
+        status: :success,
+        init_time: 0.5,
+        run_time: 1.0,
+        cycles_per_sec: 16.0
+      }
+      allow(bench).to receive(:benchmark_gem_metal_apple2)
+        .with(cycles: 16, standalone: false).and_return(gem_metal_result)
+
+      printed = capture_stdout { bench.benchmark_apple2 }
+      expect(printed).to include('GemMetal')
+      expect(bench).to have_received(:benchmark_gem_metal_apple2).with(cycles: 16, standalone: false)
+    ensure
+      ENV['RHDL_BENCH_BACKENDS'] = saved_backends
+    end
+  end
+
+  describe '#benchmark_riscv' do
+    it 'runs GemMetal by default alongside the other runners' do
+      saved_backends = ENV.delete('RHDL_BENCH_BACKENDS')
+
+      require_relative '../../../../examples/riscv/utilities/runners/headless_runner'
+      require_relative '../../../../examples/riscv/utilities/runners/arcilator_gpu_runner'
+
+      # The IR compiler, Verilator, Arcilator and ArcilatorGPU probes are stubbed to
+      # report unavailable; the GemMetal entry point returns a canned result.
+      stub_const('RHDL::Codegen::IR::IR_COMPILER_AVAILABLE', false)
+      bench = described_class.new(type: :riscv, cycles: 16)
+
+      allow(File).to receive(:exist?).and_call_original
+      allow(File).to receive(:exist?).with(/xv6_kernel\.bin|xv6_fs\.img/).and_return(true)
+      allow(bench).to receive_messages(verilator_available?: false, arcilator_available?: false)
+      allow(RHDL::Examples::RISCV::ArcilatorGpuRunner).to receive(:available?).and_return(false)
+      gem_metal_result = {
+        name: 'GemMetal',
+        status: :success,
+        init_time: 0.5,
+        run_time: 1.0,
+        cycles_per_sec: 16.0
+      }
+      allow(bench).to receive(:benchmark_gem_metal_riscv)
+        .with(cycles: 16, standalone: false).and_return(gem_metal_result)
+
+      printed = capture_stdout { bench.benchmark_riscv }
+
+      expect(printed).to include('GemMetal')
+      expect(bench).to have_received(:benchmark_gem_metal_riscv).with(cycles: 16, standalone: false)
+    ensure
+      ENV['RHDL_BENCH_BACKENDS'] = saved_backends
+    end
+
+    it 'marks ArcilatorGPU as failed when xv6 never establishes a non-zero PC' do
+      saved_backends = ENV['RHDL_BENCH_BACKENDS']
+      ENV['RHDL_BENCH_BACKENDS'] = 'arcilator_gpu'
+
+      require_relative '../../../../examples/riscv/utilities/runners/headless_runner'
+      require_relative '../../../../examples/riscv/utilities/runners/arcilator_gpu_runner'
+
+      bench = described_class.new(type: :riscv, cycles: 16)
+      # The fake runner accepts the xv6 load/run calls but always reports pc == 0.
+      stalled_runner = instance_double(
+        RHDL::Examples::RISCV::HeadlessRunner, load_xv6: nil, run_steps: nil, cpu_state: { pc: 0 }
+      )
+
+      allow(File).to receive(:exist?).and_call_original
+      allow(File).to receive(:exist?).with(/xv6_kernel\.bin|xv6_fs\.img/).and_return(true)
+      allow(RHDL::Examples::RISCV::ArcilatorGpuRunner).to receive(:available?).and_return(true)
+      allow(RHDL::Examples::RISCV::HeadlessRunner).to receive(:new).with(mode: :arcilator_gpu, core: :single).and_return(stalled_runner)
+
+      printed = capture_stdout { bench.benchmark_riscv }
+
+      expect(printed).to include('ArcilatorGPU')
+      expect(printed).to include('FAILED')
+      expect(printed).to match(/PC remained 0x0/i)
+    ensure
+      ENV['RHDL_BENCH_BACKENDS'] = saved_backends
+    end
+  end
+
+  describe '#benchmark_gem_metal_riscv' do
+    it 'generates a yosys script with MMU disabled and a single explicit abc liberty mapping pass' do
+      bench = described_class.new(type: :riscv, cycles: 16)
+      task_source = described_class.instance_method(:benchmark_gem_metal_riscv).source_location.first
+      repo_root = File.expand_path('../../../..', File.dirname(task_source))
+      gem_dir = File.join(repo_root, 'external', 'GEM')
+      liberty_file = File.join(gem_dir, 'aigpdk', 'aigpdk_nomem.lib')
+
+      Dir.mktmpdir('gem_metal_riscv') do |work_dir|
+        saved_build_dir = ENV['RHDL_GEM_METAL_RISCV_BUILD_DIR']
+        ENV['RHDL_GEM_METAL_RISCV_BUILD_DIR'] = work_dir
+
+        script_file = File.join(work_dir, 'riscv_gem.ys')
+        netlist_file = File.join(work_dir, 'riscv_gatelevel.gv')
+        File.write(File.join(work_dir, 'riscv.gemparts'), "parts\n")
+
+        allow(bench).to receive(:command_available?) { |tool| %w[cargo yosys].include?(tool) }
+        allow(Dir).to receive(:exist?).and_call_original
+        allow(Dir).to receive(:exist?).with(gem_dir).and_return(true)
+        allow(File).to receive(:exist?).and_call_original
+        allow(File).to receive(:exist?).with(liberty_file).and_return(true)
+
+        require_relative '../../../../examples/riscv/hdl/cpu'
+        allow(RHDL::Examples::RISCV::CPU).to receive(:to_verilog_hierarchy).and_return(<<~VERILOG)
+          module riscv_cpu;
+            wire itlb__hit;
+            wire [19:0] itlb__ppn;
+            wire itlb__perm_r;
+            wire itlb__perm_w;
+            wire itlb__perm_x;
+            wire itlb__perm_u;
+            wire dtlb__hit;
+            wire [19:0] dtlb__ppn;
+            wire dtlb__perm_r;
+            wire dtlb__perm_w;
+            wire dtlb__perm_x;
+            wire dtlb__perm_u;
+            assign satp_translate = some_expr;
+            riscv_sv32_tlb itlb (
+              .hit(itlb__hit),
+              .ppn(itlb__ppn),
+              .perm_r(itlb__perm_r),
+              .perm_w(itlb__perm_w),
+              .perm_x(itlb__perm_x),
+              .perm_u(itlb__perm_u)
+            );
+            riscv_sv32_tlb dtlb (
+              .hit(dtlb__hit),
+              .ppn(dtlb__ppn),
+              .perm_r(dtlb__perm_r),
+              .perm_w(dtlb__perm_w),
+              .perm_x(dtlb__perm_x),
+              .perm_u(dtlb__perm_u)
+            );
+          endmodule
+        VERILOG
+
+        # Fake both external tools: yosys writes a stub netlist, cargo returns a canned result line.
+        tool_ok = instance_double(Process::Status, success?: true)
+        allow(Open3).to receive(:capture2e) do |*argv, **opts|
+          if argv == ['yosys', '-q', '-s', script_file]
+            File.write(netlist_file, "module riscv_cpu;\nendmodule\n")
+            ['', tool_ok]
+          elsif argv.first(5) == ['cargo', 'run', '--release', '--features', 'metal']
+            expect(opts[:chdir]).to eq(gem_dir)
+            ["metal_dummy_test: logical_dispatches=1 gpu_dispatches=1 total_ms=1.0 cycles_per_sec=16.0\n", tool_ok]
+          else
+            raise "unexpected command: #{argv.inspect}"
+          end
+        end
+
+        capture_stdout { bench.benchmark_gem_metal_riscv }
+
+        generated_script = File.read(script_file)
+        patched_rtl = File.read(File.join(work_dir, 'riscv_rtl.v'))
+        expect(generated_script.scan(/abc -liberty/).size).to eq(1)
+        expect(generated_script).not_to include("\ntechmap\n")
+        expect(patched_rtl).to include("assign satp_translate = 1'b0;")
+        expect(patched_rtl).to include("assign itlb__hit = 1'b0;")
+        expect(patched_rtl).to include("assign dtlb__hit = 1'b0;")
+        expect(patched_rtl).not_to include('riscv_sv32_tlb itlb')
+        expect(patched_rtl).not_to include('riscv_sv32_tlb dtlb')
+      ensure
+        ENV['RHDL_GEM_METAL_RISCV_BUILD_DIR'] = saved_build_dir
+      end
+    end
+
+    it 'rebuilds stale artifacts when the RISC-V GEM build config is missing' do
+      bench = described_class.new(type: :riscv, cycles: 16)
+      task_source = described_class.instance_method(:benchmark_gem_metal_riscv).source_location.first
+      repo_root = File.expand_path('../../../..', File.dirname(task_source))
+      gem_dir = File.join(repo_root, 'external', 'GEM')
+      liberty_file = File.join(gem_dir, 'aigpdk', 'aigpdk_nomem.lib')
+
+      Dir.mktmpdir('gem_metal_riscv_stale') do |work_dir|
+        saved_build_dir = ENV['RHDL_GEM_METAL_RISCV_BUILD_DIR']
+        ENV['RHDL_GEM_METAL_RISCV_BUILD_DIR'] = work_dir
+        # Seed a netlist and a .gemparts file, but no build-config JSON.
+        netlist_file = File.join(work_dir, 'riscv_gatelevel.gv')
+        File.write(netlist_file, "module stale;\nendmodule\n")
+        File.write(File.join(work_dir, 'riscv.gemparts'), "stale\n")
+
+        allow(bench).to receive(:command_available?) { |tool| %w[cargo yosys].include?(tool) }
+        allow(Dir).to receive(:exist?).and_call_original
+        allow(Dir).to receive(:exist?).with(gem_dir).and_return(true)
+        allow(File).to receive(:exist?).and_call_original
+        allow(File).to receive(:exist?).with(liberty_file).and_return(true)
+
+        require_relative '../../../../examples/riscv/hdl/cpu'
+        allow(RHDL::Examples::RISCV::CPU).to receive(:to_verilog_hierarchy).and_return(<<~VERILOG)
+          module riscv_cpu;
+            wire itlb__hit;
+            wire [19:0] itlb__ppn;
+            wire itlb__perm_r;
+            wire itlb__perm_w;
+            wire itlb__perm_x;
+            wire itlb__perm_u;
+            wire dtlb__hit;
+            wire [19:0] dtlb__ppn;
+            wire dtlb__perm_r;
+            wire dtlb__perm_w;
+            wire dtlb__perm_x;
+            wire dtlb__perm_u;
+            assign satp_translate = some_expr;
+            riscv_sv32_tlb itlb (
+              .hit(itlb__hit),
+              .ppn(itlb__ppn),
+              .perm_r(itlb__perm_r),
+              .perm_w(itlb__perm_w),
+              .perm_x(itlb__perm_x),
+              .perm_u(itlb__perm_u)
+            );
+            riscv_sv32_tlb dtlb (
+              .hit(dtlb__hit),
+              .ppn(dtlb__ppn),
+              .perm_r(dtlb__perm_r),
+              .perm_w(dtlb__perm_w),
+              .perm_x(dtlb__perm_x),
+              .perm_u(dtlb__perm_u)
+            );
+          endmodule
+        VERILOG
+
+        tool_ok = instance_double(Process::Status, success?: true)
+        allow(Open3).to receive(:capture2e) do |*argv, **opts|
+          if argv == ['yosys', '-q', '-s', File.join(work_dir, 'riscv_gem.ys')]
+            File.write(netlist_file, "module riscv_cpu;\nendmodule\n")
+            ['', tool_ok]
+          elsif argv.first(5) == ['cargo', 'run', '--release', '--features', 'metal']
+            expect(opts[:chdir]).to eq(gem_dir)
+            ["metal_dummy_test: logical_dispatches=1 gpu_dispatches=1 total_ms=1.0 cycles_per_sec=16.0\n", tool_ok]
+          else
+            raise "unexpected command: #{argv.inspect}"
+          end
+        end
+
+        capture_stdout { bench.benchmark_gem_metal_riscv }
+
+        expect(RHDL::Examples::RISCV::CPU).to have_received(:to_verilog_hierarchy)
+        expect(File.exist?(File.join(work_dir, 'riscv_gem_build_config.json'))).to be(true)
+      ensure
+        ENV['RHDL_GEM_METAL_RISCV_BUILD_DIR'] = saved_build_dir
+      end
+    end
+  end
+
+  describe '#benchmark_gem_metal_apple2' do
+    it 'generates a yosys script with a single explicit abc liberty mapping pass' do
+      bench = described_class.new(type: :apple2, cycles: 16)
+      task_source = described_class.instance_method(:benchmark_gem_metal_apple2).source_location.first
+      repo_root = File.expand_path('../../../..', File.dirname(task_source))
+      gem_dir = File.join(repo_root, 'external', 'GEM')
+      liberty_file = File.join(gem_dir, 'aigpdk', 'aigpdk_nomem.lib')
+
+      Dir.mktmpdir('gem_metal_apple2') do |work_dir|
+        saved_build_dir = ENV['RHDL_GEM_METAL_APPLE2_BUILD_DIR']
+        ENV['RHDL_GEM_METAL_APPLE2_BUILD_DIR'] = work_dir
+
+        script_file = File.join(work_dir, 'apple2_gem.ys')
+        netlist_file = File.join(work_dir, 'apple2_gatelevel.gv')
+        File.write(File.join(work_dir, 'apple2.gemparts'), "parts\n")
+
+        allow(bench).to receive(:command_available?) { |tool| %w[cargo yosys].include?(tool) }
+        allow(Dir).to receive(:exist?).and_call_original
+        allow(Dir).to receive(:exist?).with(gem_dir).and_return(true)
+        allow(File).to receive(:exist?).and_call_original
+        allow(File).to receive(:exist?).with(liberty_file).and_return(true)
+
+        require_relative '../../../../examples/apple2/hdl'
+        allow(RHDL::Examples::Apple2::Apple2).to receive(:to_verilog_hierarchy).and_return(<<~VERILOG)
+          module apple2_apple2;
+          endmodule
+        VERILOG
+
+        # Fake both external tools: yosys writes a stub netlist, cargo returns a canned result line.
+        tool_ok = instance_double(Process::Status, success?: true)
+        allow(Open3).to receive(:capture2e) do |*argv, **opts|
+          if argv == ['yosys', '-q', '-s', script_file]
+            File.write(netlist_file, "module apple2_apple2;\nendmodule\n")
+            ['', tool_ok]
+          elsif argv.first(5) == ['cargo', 'run', '--release', '--features', 'metal']
+            expect(opts[:chdir]).to eq(gem_dir)
+            ["metal_dummy_test: logical_dispatches=1 gpu_dispatches=1 total_ms=1.0 cycles_per_sec=16.0\n", tool_ok]
+          else
+            raise "unexpected command: #{argv.inspect}"
+          end
+        end
+
+        capture_stdout { bench.benchmark_gem_metal_apple2 }
+
+        generated_script = File.read(script_file)
+        expect(generated_script.scan(/abc -liberty/).size).to eq(1)
+        expect(generated_script).not_to include("\ntechmap\n")
+      ensure
+        ENV['RHDL_GEM_METAL_APPLE2_BUILD_DIR'] = saved_build_dir
+      end
+    end
+  end
+
+  describe '#benchmark_gem_metal_cpu8bit' do
+    it 'generates a yosys script with a single explicit abc liberty mapping pass' do
+      bench = described_class.new(type: :cpu8bit, cycles: 16)
+      task_source = described_class.instance_method(:benchmark_gem_metal_cpu8bit).source_location.first
+      repo_root = File.expand_path('../../../..', File.dirname(task_source))
+      gem_dir = File.join(repo_root, 'external', 'GEM')
+      liberty_file = File.join(gem_dir, 'aigpdk', 'aigpdk_nomem.lib')
+
+      Dir.mktmpdir('gem_metal_cpu8bit') do |work_dir|
+        saved_build_dir = ENV['RHDL_GEM_METAL_CPU8BIT_BUILD_DIR']
+        ENV['RHDL_GEM_METAL_CPU8BIT_BUILD_DIR'] = work_dir
+
+        script_file = File.join(work_dir, 'cpu8bit_gem.ys')
+        netlist_file = File.join(work_dir, 'cpu8bit_gatelevel.gv')
+        File.write(File.join(work_dir, 'cpu8bit.gemparts'), "parts\n")
+
+        allow(bench).to receive(:command_available?) { |tool| %w[cargo yosys].include?(tool) }
+        allow(Dir).to receive(:exist?).and_call_original
+        allow(Dir).to receive(:exist?).with(gem_dir).and_return(true)
+        allow(File).to receive(:exist?).and_call_original
+        allow(File).to receive(:exist?).with(liberty_file).and_return(true)
+
+        require_relative '../../../../examples/8bit/hdl/cpu/cpu'
+        allow(RHDL::HDL::CPU::CPU).to receive(:to_verilog_hierarchy).and_return(<<~VERILOG)
+          module cpu8bit;
+          endmodule
+        VERILOG
+
+        # Fake both external tools: yosys writes a stub netlist, cargo returns a canned result line.
+        tool_ok = instance_double(Process::Status, success?: true)
+        allow(Open3).to receive(:capture2e) do |*argv, **opts|
+          if argv == ['yosys', '-q', '-s', script_file]
+            File.write(netlist_file, "module cpu8bit;\nendmodule\n")
+            ['', tool_ok]
+          elsif argv.first(5) == ['cargo', 'run', '--release', '--features', 'metal']
+            expect(opts[:chdir]).to eq(gem_dir)
+            ["metal_dummy_test: logical_dispatches=1 gpu_dispatches=1 total_ms=1.0 cycles_per_sec=16.0\n", tool_ok]
+          else
+            raise "unexpected command: #{argv.inspect}"
+          end
+        end
+
+        capture_stdout { bench.benchmark_gem_metal_cpu8bit }
+
+        generated_script = File.read(script_file)
+        expect(generated_script.scan(/abc -liberty/).size).to eq(1)
+        expect(generated_script).not_to include("\ntechmap\n")
+      ensure
+        ENV['RHDL_GEM_METAL_CPU8BIT_BUILD_DIR'] = saved_build_dir
+      end
+    end
+  end
+
   describe 'environment variables' do
     it 'respects RHDL_BENCH_LANES environment variable' do
       original_lanes = ENV['RHDL_BENCH_LANES']
@@ -138,5 +581,59 @@
         )
       end
     end
+
+    describe '#disable_riscv_mmu_for_gem_rtl' do
+      it 'forces satp_translate low and replaces TLB instances with constants' do
+        rtl = <<~VERILOG
+          module riscv_cpu;
+            wire itlb__hit;
+            wire [19:0] itlb__ppn;
+            wire itlb__perm_r;
+            wire itlb__perm_w;
+            wire itlb__perm_x;
+            wire itlb__perm_u;
+            wire dtlb__hit;
+            wire [19:0] dtlb__ppn;
+            wire dtlb__perm_r;
+            wire dtlb__perm_w;
+            wire dtlb__perm_x;
+            wire dtlb__perm_u;
+            assign satp_translate = some_expr;
+            riscv_sv32_tlb itlb (
+              .hit(itlb__hit),
+              .ppn(itlb__ppn),
+              .perm_r(itlb__perm_r),
+              .perm_w(itlb__perm_w),
+              .perm_x(itlb__perm_x),
+              .perm_u(itlb__perm_u)
+            );
+            riscv_sv32_tlb dtlb (
+              .hit(dtlb__hit),
+              .ppn(dtlb__ppn),
+              .perm_r(dtlb__perm_r),
+              .perm_w(dtlb__perm_w),
+              .perm_x(dtlb__perm_x),
+              .perm_u(dtlb__perm_u)
+            );
+          endmodule
+        VERILOG
+
+        patched = task.send(:disable_riscv_mmu_for_gem_rtl, rtl)
+        expect(patched).to include("assign satp_translate = 1'b0;")
+        expect(patched).to include("assign itlb__hit = 1'b0;")
+        expect(patched).to include("assign dtlb__hit = 1'b0;")
+        expect(patched).not_to include('riscv_sv32_tlb itlb')
+        expect(patched).not_to include('riscv_sv32_tlb dtlb')
+      end
+    end
+  end
+
+  def capture_stdout
+    original_stdout = $stdout
+    $stdout = StringIO.new
+    yield
+    $stdout.string
+  ensure
+    $stdout = original_stdout
   end
 end
diff --git a/spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb b/spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb
new file mode 100644
index 00000000..d9381609
--- /dev/null
+++ b/spec/rhdl/codegen/firrtl/arc_to_gpu_lowering_spec.rb
@@ -0,0 +1,435 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'tmpdir'
+require 'json'
+require 'rhdl/codegen/firrtl/arc_to_gpu_lowering'
+
+RSpec.describe RHDL::Codegen::FIRRTL::ArcToGpuLowering do
+  def arc_fixture(top_outputs: nil, extra_op_line: nil)
+    outputs = top_outputs || 'out mem_data_out : i8, out mem_addr : i16, out mem_write_en : i1, out mem_read_en : i1, out pc_out : i16, out acc_out : i8, out sp_out : i8, out halted : i1, out state_out : i8, out zero_flag_out : i1'
+
+    <<~MLIR
+      module {
+        arc.define @arc_probe_passthrough(%arg0: i8) -> i8 {
+          #{extra_op_line}
+          arc.output %arg0 : i8
+        }
+
+        arc.define @arc_probe_clock(%arg0: i1) -> !seq.clock {
+          %0 = seq.to_clock %arg0
+          arc.output %0 : !seq.clock
+        }
+
+        hw.module @cpu8bit(in %clk : i1, in %rst : i1, in %mem_data_in : i8, #{outputs}) {
+          %c0_i16 = hw.constant 0 : i16
+          %false = hw.constant false
+          %clk_i = arc.call @arc_probe_clock(%clk) : (i1) -> !seq.clock
+          %state = arc.state @arc_probe_passthrough(%mem_data_in) clock %clk_i reset %rst latency 1 : (i8) -> i8
+          hw.output %state, %c0_i16, %false, %false, %c0_i16, %state, %state, %false, %state, %false : i8, i16, i1, i1, i16, i8, i8, i1, i8, i1
+        }
+      }
+    MLIR
+  end
+
+  def riscv_arc_fixture(top_outputs: nil)
+    outputs = top_outputs || 'out inst_addr : i32, out inst_ptw_addr1 : i32, out inst_ptw_addr0 : i32, out data_addr : i32, out data_wdata : i32, out data_we : i1, out data_re : i1, out data_funct3 : i3, out data_ptw_addr1 : i32, out data_ptw_addr0 : i32, out debug_pc : i32, out debug_inst : i32, out debug_x1 : i32, out debug_x2 : i32, out debug_x10 : i32, out debug_x11 : i32, out debug_reg_data : i32'
+
+    <<~MLIR
+      module {
+        arc.define @riscv_passthrough(%arg0: i32) -> i32 {
+          arc.output %arg0 : i32
+        }
+
+        arc.define @riscv_clock(%arg0: i1) -> !seq.clock {
+          %0 = seq.to_clock %arg0
+          arc.output %0 : !seq.clock
+        }
+
+        hw.module @riscv_cpu(in %clk : i1, in %rst : i1, in %irq_software : i1, in %irq_timer : i1, in %irq_external : i1, in %inst_data : i32, in %inst_ptw_pte1 : i32, in %inst_ptw_pte0 : i32, in %data_rdata : i32, in %data_ptw_pte1 : i32, in %data_ptw_pte0 : i32, in %debug_reg_addr : i5, #{outputs}) {
+          %false = hw.constant false
+          %c0_i3 = hw.constant 0 : i3
+          %clk_i = arc.call @riscv_clock(%clk) : (i1) -> !seq.clock
+          %state = arc.state @riscv_passthrough(%inst_data) clock %clk_i reset %rst latency 1 : (i32) -> i32
+          hw.output %state, %state, %state, %state, %state, %false, %false, %c0_i3, %state, %state, %state, %state, %state, %state, %state, %state, %state : i32, i32, i32, i32, i32, i1, i1, i3, i32, i32, i32, i32, i32, i32, i32, i32, i32
+        }
+      }
+    MLIR
+  end
+
+  def arc_fixture_with_dead_define
+    <<~MLIR
+      module {
+        arc.define @arc_used_passthrough(%arg0: i8) -> i8 {
+          arc.output %arg0 : i8
+        }
+
+        arc.define @arc_dead_passthrough(%arg0: i8) -> i8 {
+          %c1 = hw.constant 1 : i8
+          arc.output %c1 : i8
+        }
+
+        arc.define @arc_probe_clock(%arg0: i1) -> !seq.clock {
+          %0 = seq.to_clock %arg0
+          arc.output %0 : !seq.clock
+        }
+
+        hw.module @cpu8bit(in %clk : i1, in %rst : i1, in %mem_data_in : i8, out mem_data_out : i8, out mem_addr : i16, out mem_write_en : i1, out mem_read_en : i1, out pc_out : i16, out acc_out : i8, out sp_out : i8, out halted : i1, out state_out : i8, out zero_flag_out : i1) {
+          %c0_i16 = hw.constant 0 : i16
+          %false = hw.constant false
+          %clk_i = arc.call @arc_probe_clock(%clk) : (i1) -> !seq.clock
+          %state = arc.state @arc_used_passthrough(%mem_data_in) clock %clk_i reset %rst latency 1 : (i8) -> i8
+          hw.output %state, %c0_i16, %false, %false, %c0_i16, %state, %state, %false, %state, %false : i8, i16, i1, i1, i16, i8, i8, i1, i8, i1
+        }
+      }
+    MLIR
+  end
+
+  def arc_fixture_with_constant_array_get
+    <<~MLIR
+      module {
+        arc.define @arc_probe_passthrough(%arg0: i8) -> i8 {
+          %c1 = hw.constant 1 : i8
+          %c2 = hw.constant 2 : i8
+          %c3 = hw.constant 3 : i8
+          %idx = hw.constant 5 : i8
+          %arr = hw.array_create %c1, %c2, %c3 : i8
+          %sel = hw.array_get %arr[%idx] : !hw.array<3xi8>, i8
+          arc.output %sel : i8
+        }
+
+        arc.define @arc_probe_clock(%arg0: i1) -> !seq.clock {
+          %0 = seq.to_clock %arg0
+          arc.output %0 : !seq.clock
+        }
+
+        hw.module @cpu8bit(in %clk : i1, in %rst : i1, in %mem_data_in : i8, out mem_data_out : i8, out mem_addr : i16, out mem_write_en : i1, out mem_read_en : i1, out pc_out : i16, out acc_out : i8, out sp_out : i8, out halted : i1, out state_out : i8, out zero_flag_out : i1) {
+          %c0_i16 = hw.constant 0 : i16
+          %false = hw.constant false
+          %clk_i = arc.call @arc_probe_clock(%clk) : (i1) -> !seq.clock
+          %state = arc.state @arc_probe_passthrough(%mem_data_in) clock %clk_i reset %rst latency 1 : (i8) -> i8
+          hw.output %state, %c0_i16, %false, %false, %c0_i16, %state, %state, %false, %state, %false : i8, i16, i1, i1, i16, i8, i8, i1, i8, i1
+        }
+      }
+    MLIR
+  end
+
+  def arc_fixture_with_aggregate_array_get
+    <<~MLIR
+      module {
+        arc.define @arc_probe_passthrough(%arg0: i8) -> i8 {
+          %idx = hw.constant 2 : i8
+          %arr = hw.aggregate_constant [11 : i8, 22 : i8, 33 : i8] : !hw.array<3xi8>
+          %sel = hw.array_get %arr[%idx] : !hw.array<3xi8>, i8
+          arc.output %sel : i8
+        }
+
+        arc.define @arc_probe_clock(%arg0: i1) -> !seq.clock {
+          %0 = seq.to_clock %arg0
+          arc.output %0 : !seq.clock
+        }
+
+        hw.module @cpu8bit(in %clk : i1, in %rst : i1, in %mem_data_in : i8, out mem_data_out : i8, out mem_addr : i16, out mem_write_en : i1, out mem_read_en : i1, out pc_out : i16, out acc_out : i8, out sp_out : i8, out halted : i1, out state_out : i8, out zero_flag_out : i1) {
+          %c0_i16 = hw.constant 0 : i16
+          %false = hw.constant false
+          %clk_i = arc.call @arc_probe_clock(%clk) : (i1) -> !seq.clock
+          %state = arc.state @arc_probe_passthrough(%mem_data_in) clock %clk_i reset %rst latency 1 : (i8) -> i8
+          hw.output %state, %c0_i16, %false, %false, %c0_i16, %state, %state, %false, %state, %false : i8, i16, i1, i1, i16, i8, i8, i1, i8, i1
+        }
+      }
+    MLIR
+  end
+
+  it 'emits ArcToGPU artifacts and metadata for supported Arc MLIR' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'input.arc.mlir')
+      gpu_path = File.join(dir, 'output.gpu.mlir')
+      meta_path = File.join(dir, 'output.arc_to_gpu.json')
+
+      File.write(arc_path, arc_fixture)
+
+      summary = described_class.lower(
+        arc_mlir_path: arc_path,
+        gpu_mlir_path: gpu_path,
+        metadata_path: meta_path
+      )
+
+      expect(summary[:module]).to eq('cpu8bit')
+      expect(summary[:arc_define_count]).to be >= 1
+      expect(summary[:arc_state_count]).to be >= 1
+      expect(File).to exist(gpu_path)
+      expect(File).to exist(meta_path)
+
+      gpu_text = File.read(gpu_path)
+      expect(gpu_text).to include('gpu.module')
+      expect(gpu_text).to include('rhdl.arc_to_gpu.version')
+
+      metadata = JSON.parse(File.read(meta_path))
+      expect(metadata['version']).to eq('ArcToGpuLoweringV2')
+      expect(metadata['module']).to eq('cpu8bit')
+      expect(metadata.dig('metal', 'entry')).to match(/cpu8bit/)
+      expect(metadata.dig('metal', 'state_count')).to be >= 1
+    end
+  end
+
+  it 'fails when required top outputs are missing' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'input.arc.mlir')
+      gpu_path = File.join(dir, 'output.gpu.mlir')
+
+      File.write(
+        arc_path,
+        arc_fixture(top_outputs: 'out mem_data_out : i8, out mem_addr : i16')
+      )
+
+      expect do
+        described_class.lower(arc_mlir_path: arc_path, gpu_mlir_path: gpu_path)
+      end.to raise_error(described_class::LoweringError, /missing required outputs/i)
+    end
+  end
+
+  it 'fails when unsupported operations are present' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'input.arc.mlir')
+      gpu_path = File.join(dir, 'output.gpu.mlir')
+
+      File.write(
+        arc_path,
+        arc_fixture(extra_op_line: '%x = comb.shrs %arg0, %arg0 : i8')
+      )
+
+      expect do
+        described_class.lower(arc_mlir_path: arc_path, gpu_mlir_path: gpu_path)
+      end.to raise_error(described_class::LoweringError, /does not support ops/i)
+    end
+  end
+
+  it 'supports comb.concat with more than two operands' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'input.arc.mlir')
+      gpu_path = File.join(dir, 'output.gpu.mlir')
+      metal_path = File.join(dir, 'output.metal')
+
+      concat_fixture = <<~MLIR
+        module {
+          arc.define @arc_probe_passthrough(%arg0: i8) -> i8 {
+            %x = comb.concat %arg0, %arg0, %arg0 : i8, i8, i8
+            %y = comb.extract %x from 0 : (i24) -> i8
+            arc.output %y : i8
+          }
+
+          arc.define @arc_probe_clock(%arg0: i1) -> !seq.clock {
+            %0 = seq.to_clock %arg0
+            arc.output %0 : !seq.clock
+          }
+
+          hw.module @cpu8bit(in %clk : i1, in %rst : i1, in %mem_data_in : i8, out mem_data_out : i8, out mem_addr : i16, out mem_write_en : i1, out mem_read_en : i1, out pc_out : i16, out acc_out : i8, out sp_out : i8, out halted : i1, out state_out : i8, out zero_flag_out : i1) {
+            %c0_i16 = hw.constant 0 : i16
+            %false = hw.constant false
+            %clk_i = arc.call @arc_probe_clock(%clk) : (i1) -> !seq.clock
+            %state = arc.state @arc_probe_passthrough(%mem_data_in) clock %clk_i reset %rst latency 1 : (i8) -> i8
+            hw.output %state, %c0_i16, %false, %false, %c0_i16, %state, %state, %false, %state, %false : i8, i16, i1, i1, i16, i8, i8, i1, i8, i1
+          }
+        }
+      MLIR
+      File.write(arc_path, concat_fixture)
+
+      described_class.lower(
+        arc_mlir_path: arc_path,
+        gpu_mlir_path: gpu_path,
+        metal_source_path: metal_path
+      )
+
+      metal_source = File.read(metal_path)
+      expect(metal_source).to include('<< 16u')
+      expect(metal_source).to include('<< 8u')
+    end
+  end
+
+  it 'emits ArcToGPU artifacts for riscv profile' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'riscv.arc.mlir')
+      gpu_path = File.join(dir, 'riscv.gpu.mlir')
+      meta_path = File.join(dir, 'riscv.arc_to_gpu.json')
+      metal_path = File.join(dir, 'riscv.metal')
+
+      File.write(arc_path, riscv_arc_fixture)
+
+      summary = described_class.lower(
+        arc_mlir_path: arc_path,
+        gpu_mlir_path: gpu_path,
+        metadata_path: meta_path,
+        metal_source_path: metal_path,
+        profile: :riscv
+      )
+
+      expect(summary[:module]).to eq('riscv_cpu')
+      expect(summary[:profile]).to eq(:riscv)
+      metadata = JSON.parse(File.read(meta_path))
+      expect(metadata['profile']).to eq('riscv')
+      expect(metadata.dig('metal', 'entry')).to include('riscv_cpu')
+      runtime_output_names = Array(metadata.dig('metal', 'runtime_output_layout')).map { |entry| entry.fetch('name') }
+      expect(runtime_output_names).not_to include('debug_pc')
+      expect(runtime_output_names).not_to include('debug_inst')
+      expect(runtime_output_names).not_to include('debug_x1')
+      expect(runtime_output_names).not_to include('debug_x2')
+      expect(runtime_output_names).not_to include('debug_x10')
+      expect(runtime_output_names).not_to include('debug_x11')
+      expect(runtime_output_names).not_to include('debug_reg_data')
+      introspection = metadata.dig('metal', 'introspection')
+      expect(introspection).to include('pc_slot', 'pc_width', 'regfile_base_slot', 'regfile_length')
+      expect(introspection.fetch('pc_width')).to eq(32)
+      expect(introspection.fetch('regfile_length')).to be >= 0
+      expect(metadata.dig('metal', 'schedule_mode')).to eq('legacy')
+      expect(metadata.dig('metal', 'fast_low_wdata_mode')).to eq('split')
+      expect(metadata.dig('metal', 'fast_high_data_addr_mode')).to eq('split')
+      expect(metadata.dig('metal', 'fast_low_data_addr_mode')).to eq('split')
+      expect(metadata['top_inputs']).to include('inst_data')
+      metal_source = File.read(metal_path)
+      expect(metal_source).to include('rhdl_read_mem_funct3')
+      io_struct = metal_source[/struct RhdlArcGpuIo \{.*?\n\};/m]
+      expect(io_struct).not_to be_nil
+      expect(io_struct).not_to include('uint debug_pc;')
+      expect(io_struct).not_to include('uint debug_reg_data;')
+      expect(metal_source).to include('riscv_eval_low_wdata_fast')
+      expect(metal_source).to include('riscv_eval_low_data_addr_fast')
+      expect(metal_source).to include('riscv_eval_high_data_addr_fast')
+    end
+  end
+
+  it 'emits ArcToGPU artifacts for riscv_netlist profile' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'riscv.arc.mlir')
+      gpu_path = File.join(dir, 'riscv_netlist.gpu.mlir')
+      meta_path = File.join(dir, 'riscv_netlist.arc_to_gpu.json')
+      metal_path = File.join(dir, 'riscv_netlist.metal')
+
+      File.write(arc_path, riscv_arc_fixture)
+
+      summary = described_class.lower(
+        arc_mlir_path: arc_path,
+        gpu_mlir_path: gpu_path,
+        metadata_path: meta_path,
+        metal_source_path: metal_path,
+        profile: :riscv_netlist
+      )
+
+      expect(summary[:module]).to eq('riscv_cpu')
+      expect(summary[:profile]).to eq(:riscv_netlist)
+      metadata = JSON.parse(File.read(meta_path))
+      expect(metadata['profile']).to eq('riscv_netlist')
+      expect(metadata.dig('metal', 'entry')).to include('riscv_cpu')
+      expect(metadata.dig('metal', 'schedule_mode')).to eq('netlist_aig_legacy')
+      runtime_output_names = Array(metadata.dig('metal', 'runtime_output_layout')).map { |entry| entry.fetch('name') }
+      expect(runtime_output_names).to be_empty
+      introspection = metadata.dig('metal', 'introspection')
+      expect(introspection).to include('pc_slot', 'pc_width', 'regfile_base_slot', 'regfile_length')
+      expect(introspection.fetch('pc_width')).to eq(32)
+      metal_source = File.read(metal_path)
+      expect(metal_source).to include('rhdl_read_mem_funct3')
+      expect(metal_source).to include('riscv_eval_low_wdata_fast')
+      expect(metal_source).to include('riscv_eval_low_data_addr_fast')
+      expect(metal_source).to include('riscv_eval_high_data_addr_fast')
+    end
+  end
+
+  it 'uses fixed riscv fast-default modes regardless of removed env toggles' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'riscv.arc.mlir')
+      gpu_path = File.join(dir, 'riscv.gpu.mlir')
+      meta_path = File.join(dir, 'riscv.arc_to_gpu.json')
+      metal_path = File.join(dir, 'riscv.metal')
+      original_split_low_wdata = ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_WDATA']
+      original_split_high_data_addr = ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_HIGH_DATA_ADDR']
+      original_split_low_data_addr = ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_DATA_ADDR']
+      original_dirty_settle = ENV['RHDL_ARC_TO_GPU_RISCV_DIRTY_SETTLE']
+      original_scheduled_emit = ENV['RHDL_ARC_TO_GPU_RISCV_SCHEDULED_EMIT']
+      ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_WDATA'] = '0'
+      ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_HIGH_DATA_ADDR'] = '0'
+      ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_DATA_ADDR'] = '0'
+      ENV['RHDL_ARC_TO_GPU_RISCV_DIRTY_SETTLE'] = '1'
+      ENV['RHDL_ARC_TO_GPU_RISCV_SCHEDULED_EMIT'] = '1'
+
+      File.write(arc_path, riscv_arc_fixture)
+
+      described_class.lower(
+        arc_mlir_path: arc_path,
+        gpu_mlir_path: gpu_path,
+        metadata_path: meta_path,
+        metal_source_path: metal_path,
+        profile: :riscv
+      )
+
+      metal_source = File.read(metal_path)
+      metadata = JSON.parse(File.read(meta_path))
+      expect(metal_source).to include('riscv_eval_low_wdata_fast')
+      expect(metal_source).to include('loww = riscv_cpu_riscv_eval_low_wdata_fast(')
+      expect(metadata.dig('metal', 'fast_low_wdata_mode')).to eq('split')
+      expect(metal_source).to include('riscv_eval_high_data_addr_fast')
+      expect(metal_source).to include('high_addr = riscv_cpu_riscv_eval_high_data_addr_fast(')
+      expect(metadata.dig('metal', 'fast_high_data_addr_mode')).to eq('split')
+      expect(metal_source).to include('riscv_eval_low_data_addr_fast')
+      expect(metal_source).to include('low_addr = riscv_cpu_riscv_eval_low_data_addr_fast(')
+      expect(metadata.dig('metal', 'fast_low_data_addr_mode')).to eq('split')
+      expect(metadata.dig('metal', 'schedule_mode')).to eq('legacy')
+      expect(metal_source).not_to include('state_dirty')
+      expect(metal_source).not_to include('// schedule_phase:')
+      expect(metal_source).not_to include('// schedule_level')
+    ensure
+      ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_WDATA'] = original_split_low_wdata
+      ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_HIGH_DATA_ADDR'] = original_split_high_data_addr
+      ENV['RHDL_ARC_TO_GPU_RISCV_SPLIT_LOW_DATA_ADDR'] = original_split_low_data_addr
+      ENV['RHDL_ARC_TO_GPU_RISCV_DIRTY_SETTLE'] = original_dirty_settle
+      ENV['RHDL_ARC_TO_GPU_RISCV_SCHEDULED_EMIT'] = original_scheduled_emit
+    end
+  end
+
+  it 'fails riscv profile when required outputs are missing' do
+    Dir.mktmpdir('arc_to_gpu_lowering_spec') do |dir|
+      arc_path = File.join(dir, 'riscv.arc.mlir')
+      gpu_path = File.join(dir, 'riscv.gpu.mlir')
+
+      File.write(
+        arc_path,
+        riscv_arc_fixture(top_outputs: 'out inst_addr : i32, out inst_ptw_addr1 : i32')
+      )
+
+      expect do
+        described_class.lower(
+          arc_mlir_path: arc_path,
+          gpu_mlir_path: gpu_path,
+          profile: :riscv
+        )
+      end.to raise_error(described_class::LoweringError, /missing required outputs/i)
+    end
+  end
+
+  it 'prunes unreachable arc.define functions from parsed graph' do
+    parsed = described_class.parse_arc_mlir(arc_fixture_with_dead_define)
+    expect(parsed[:functions].keys).to include('arc_dead_passthrough')
+
+    pruned = described_class.prune_unreachable_functions(parsed)
+    expect(pruned[:functions].keys).to include('arc_used_passthrough')
+    expect(pruned[:functions].keys).not_to include('arc_dead_passthrough')
+  end
+
+  it 'folds array_get(array_create(...), constant) to alias' do
+    parsed = described_class.parse_arc_mlir(arc_fixture_with_constant_array_get)
+    folded = described_class.fold_constant_array_gets(parsed)
+    fn = folded.fetch(:functions).fetch('arc_probe_passthrough')
+    sel_op = fn.fetch(:ops).find { |op| op.fetch(:result_refs).include?('%sel') }
+
+    expect(sel_op.fetch(:kind)).to eq(:alias)
+    expect(sel_op.fetch(:source_ref)).to eq('%c3')
+  end
+
+  it 'folds array_get(aggregate_constant(...), constant) to constant' do
+    parsed = described_class.parse_arc_mlir(arc_fixture_with_aggregate_array_get)
+    folded = described_class.fold_constant_array_gets(parsed)
+    fn = folded.fetch(:functions).fetch('arc_probe_passthrough')
+    sel_op = fn.fetch(:ops).find { |op| op.fetch(:result_refs).include?('%sel') }
+
+    expect(sel_op.fetch(:kind)).to eq(:constant)
+    expect(sel_op.fetch(:value)).to eq(11)
+  end
+end
diff --git a/spec/support/firrtl_helper.rb b/spec/support/firrtl_helper.rb
new file mode 100644
index 00000000..6e59bc88
--- /dev/null
+++ b/spec/support/firrtl_helper.rb
@@ -0,0 +1,322 @@
+# FIRRTL export validation helper
+# Converts RHDL FIRRTL to Verilog using firtool and validates against RHDL Verilog output
+
+require "fileutils"
+require "open3"
+
+module FirrtlHelper
+  module_function
+
+  # Convert FIRRTL to Verilog using firtool
+  # Returns { success: true, verilog: String } or { success: false, error: String }
+  def firtool_to_verilog(firrtl_source, base_dir:)
+    base_dir = File.expand_path(base_dir)
+    FileUtils.mkdir_p(base_dir)
+
+    firrtl_path = File.join(base_dir, "design.fir")
+    verilog_path = File.join(base_dir, "design.v")
+
+    File.write(firrtl_path, firrtl_source)
+
+    # Run firtool to convert FIRRTL to Verilog
+    # Use lowering options for iverilog compatibility:
+    # - disallowLocalVariables: iverilog doesn't support 'automatic' lifetime
+    # - disallowPackedArrays: iverilog doesn't support packed arrays
+    result = run_cmd(
+      ["firtool", firrtl_path, "-o", verilog_path, "--format=fir",
+       "--lowering-options=disallowLocalVariables,disallowPackedArrays"],
+      cwd: base_dir
+    )
+
+    unless result[:status].success?
+      return { success: false, error: "firtool failed: #{result[:stderr]}\n#{result[:stdout]}" }
+    end
+
+    unless File.exist?(verilog_path)
+      return { success: false, error: "firtool did not generate output file" }
+    end
+
+    { success: true, verilog: File.read(verilog_path) }
+  end
+
+  # Validate FIRRTL export by comparing simulation results
+  # Takes a component class and test vectors
+  # Returns comparison result with success status
+  def validate_firrtl_export(component_class, test_vectors:, base_dir:, has_clock: false)
+    # Get RHDL outputs
+    rhdl_verilog = component_class.to_verilog
+    rhdl_firrtl = component_class.to_firrtl
+
+    # Convert FIRRTL to Verilog using firtool
+    firrtl_result = firtool_to_verilog(rhdl_firrtl, base_dir: File.join(base_dir, "firrtl"))
+
+    unless firrtl_result[:success]
+      return {
+        success: false,
+        error: firrtl_result[:error],
+        rhdl_verilog: rhdl_verilog,
+        rhdl_firrtl: rhdl_firrtl
+      }
+    end
+
+    firrtl_verilog = firrtl_result[:verilog]
+
+    # Extract port info from RHDL component
+    rhdl_inputs = {}
+    rhdl_outputs = {}
+    component_class._ports.each do |port|
+      if port.direction == :in
+        rhdl_inputs[port.name] = port.width
+      else
+        rhdl_outputs[port.name] = port.width
+      end
+    end
+
+    module_name = component_class.verilog_module_name
+
+    # Run simulation on RHDL-generated Verilog
+    rhdl_sim = NetlistHelper.run_behavior_simulation(
+      rhdl_verilog,
+      module_name: module_name,
+      inputs: rhdl_inputs,
+      outputs: rhdl_outputs,
+      test_vectors: test_vectors,
+      base_dir: File.join(base_dir, "rhdl_sim"),
+      has_clock: has_clock
+    )
+
+    unless rhdl_sim[:success]
+      return {
+        success: false,
+        error: "RHDL Verilog simulation failed: #{rhdl_sim[:error]}",
+        rhdl_verilog: rhdl_verilog,
+        rhdl_firrtl: rhdl_firrtl,
+        firrtl_verilog: firrtl_verilog
+      }
+    end
+
+    # Extract actual port names from FIRRTL-generated Verilog
+    # (firtool may rename ports to avoid reserved words, e.g., 'eq' -> 'eq_fir')
+    firrtl_ports = extract_verilog_ports(firrtl_verilog)
+
+    # Build mappings from RHDL port names to FIRRTL port names
+    input_mapping = build_port_mapping(rhdl_inputs, firrtl_ports[:inputs])
+    output_mapping = build_port_mapping(rhdl_outputs, firrtl_ports[:outputs])
+
+    # Create FIRRTL port definitions using mapped names
+    firrtl_inputs = {}
+    rhdl_inputs.each do |name, width|
+      firrtl_name = input_mapping[name] || name
+      firrtl_inputs[firrtl_name.to_sym] = width
+    end
+
+    firrtl_outputs = {}
+    rhdl_outputs.each do |name, width|
+      firrtl_name = output_mapping[name] || name
+      firrtl_outputs[firrtl_name.to_sym] = width
+    end
+
+    # Transform test vectors to use FIRRTL port names
+    firrtl_test_vectors = test_vectors.map do |vec|
+      firrtl_inputs_vec = {}
+      vec[:inputs].each do |name, value|
+        firrtl_name = input_mapping[name] || name
+        firrtl_inputs_vec[firrtl_name.to_sym] = value
+      end
+
+      firrtl_expected = {}
+      vec[:expected].each do |name, value|
+        firrtl_name = output_mapping[name] || name
+        firrtl_expected[firrtl_name.to_sym] = value
+      end
+
+      { inputs: firrtl_inputs_vec, expected: firrtl_expected }
+    end
+
+    # Run simulation on FIRRTL-generated Verilog with mapped port names
+    firrtl_sim = NetlistHelper.run_behavior_simulation(
+      firrtl_verilog,
+      module_name: module_name,
+      inputs: firrtl_inputs,
+      outputs: firrtl_outputs,
+      test_vectors: firrtl_test_vectors,
+      base_dir: File.join(base_dir, "firrtl_sim"),
+      has_clock: has_clock
+    )
+
+    unless firrtl_sim[:success]
+      return {
+        success: false,
+        error: "FIRRTL Verilog simulation failed: #{firrtl_sim[:error]}",
+        rhdl_verilog: rhdl_verilog,
+        rhdl_firrtl: rhdl_firrtl,
+        firrtl_verilog: firrtl_verilog,
+        rhdl_results: rhdl_sim[:results]
+      }
+    end
+
+    # Build reverse mapping (FIRRTL -> RHDL) for result comparison
+    reverse_output_mapping = output_mapping.invert
+
+    # Compare results (map FIRRTL port names back to RHDL names for comparison)
+    # Only compare outputs that are specified in the test vector's expected hash
+    mismatches = []
+    test_vectors.each_with_index do |vec, idx|
+      rhdl_out = rhdl_sim[:results][idx]
+      firrtl_raw = firrtl_sim[:results][idx]
+
+      # Map FIRRTL result keys back to RHDL names
+      firrtl_out = {}
+      firrtl_raw.each do |firrtl_name, value|
+        rhdl_name = reverse_output_mapping[firrtl_name.to_s] || firrtl_name
+        firrtl_out[rhdl_name.to_sym] = value
+      end
+
+      # Only compare outputs specified in expected hash
+      expected_keys = vec[:expected].keys
+      rhdl_filtered = rhdl_out.select { |k, _| expected_keys.include?(k) }
+      firrtl_filtered = firrtl_out.select { |k, _| expected_keys.include?(k) }
+
+      next if rhdl_filtered == firrtl_filtered
+
+      mismatches << {
+        cycle: idx,
+        rhdl: rhdl_filtered,
+        firrtl: firrtl_filtered
+      }
+    end
+
+    if mismatches.any?
+      {
+        success: false,
+        error: "Output mismatch between RHDL and FIRRTL Verilog",
+        mismatches: mismatches,
+        rhdl_verilog: rhdl_verilog,
+        rhdl_firrtl: rhdl_firrtl,
+        firrtl_verilog: firrtl_verilog,
+        rhdl_results: rhdl_sim[:results],
+        firrtl_results: firrtl_sim[:results]
+      }
+    else
+      {
+        success: true,
+        rhdl_verilog: rhdl_verilog,
+        rhdl_firrtl: rhdl_firrtl,
+        firrtl_verilog: firrtl_verilog,
+        rhdl_results: rhdl_sim[:results],
+        firrtl_results: firrtl_sim[:results]
+      }
+    end
+  end
+
+  # Simple validation that just checks firtool can parse and compile the FIRRTL
+  # without running full simulation comparison
+  def validate_firrtl_syntax(component_class, base_dir:)
+    rhdl_firrtl = component_class.to_firrtl
+    result = firtool_to_verilog(rhdl_firrtl, base_dir: base_dir)
+
+    {
+      success: result[:success],
+      error: result[:error],
+      firrtl: rhdl_firrtl,
+      verilog: result[:verilog]
+    }
+  end
+
+  # Validate hierarchical FIRRTL export using to_firrtl_hierarchy
+  # This includes all submodule definitions in a single circuit
+  def validate_hierarchical_firrtl(component_class, base_dir:)
+    rhdl_firrtl = component_class.to_firrtl_hierarchy
+    result = firtool_to_verilog(rhdl_firrtl, base_dir: base_dir)
+
+    {
+      success: result[:success],
+      error: result[:error],
+      firrtl: rhdl_firrtl,
+      verilog: result[:verilog]
+    }
+  end
+
+  def run_cmd(cmd, cwd:)
+    stdout, stderr, status = Open3.capture3(*cmd, chdir: cwd)
+    { stdout: stdout, stderr: stderr, status: status }
+  end
+
+  # Extract port names and widths from Verilog module definition
+  # Returns { inputs: { name => width }, outputs: { name => width } }
+  def extract_verilog_ports(verilog_source)
+    inputs = {}
+    outputs = {}
+    current_direction = nil
+    current_width = 1
+
+    # Match module definition to end of port list
+    if verilog_source =~ /module\s+\w+\s*\(([\s\S]*?)\);/m
+      port_block = $1
+
+      # Parse line by line to handle firtool's multi-line format
+      port_block.split("\n").each do |line|
+        line = line.strip
+
+        # Detect direction changes (input/output)
+        if line =~ /\b(input|output)\b/
+          current_direction = $1.to_sym
+
+          # Check for width declaration
+          if line =~ /\[(\d+):0\]/
+            current_width = $1.to_i + 1
+          else
+            current_width = 1
+          end
+        end
+
+        # Extract port names from this line (handles comma-separated and single ports)
+        # Skip reserved words and empty lines
+        port_names = line.scan(/\b([a-zA-Z_][a-zA-Z0-9_]*)\b/).flatten
+        port_names.reject! { |n| %w[input output wire reg].include?(n) }
+
+        # For each valid port name, add to appropriate hash
+        port_names.each do |name|
+          next if name.empty?
+
+          case current_direction
+          when :input
+            inputs[name] = current_width
+          when :output
+            outputs[name] = current_width
+          end
+        end
+      end
+    end
+
+    { inputs: inputs, outputs: outputs }
+  end
+
+  # Build port name mapping from RHDL ports to FIRRTL-generated ports
+  # FIRRTL may rename ports (e.g., 'eq' -> 'eq_fir' to avoid reserved words)
+  def build_port_mapping(rhdl_ports, firrtl_ports)
+    mapping = {}
+
+    rhdl_ports.each do |rhdl_name, width|
+      rhdl_name_str = rhdl_name.to_s
+
+      # Try exact match first (handles both symbol and string keys)
+      if firrtl_ports[rhdl_name_str] || firrtl_ports[rhdl_name]
+        mapping[rhdl_name] = rhdl_name_str
+        next
+      end
+
+      # Try common FIRRTL renaming patterns (appending _fir suffix)
+      firrtl_name = "#{rhdl_name_str}_fir"
+      if firrtl_ports[firrtl_name]
+        mapping[rhdl_name] = firrtl_name
+        next
+      end
+
+      # If no match is found, leave the port unmapped (nothing is logged here);
+      # loose prefix matching like 'd' -> 'd_in' is deliberately avoided because it causes bugs
+    end
+
+    mapping
+  end
+end