Merge branch 'hashx_perf' into 'main'

hashx: Performance improvements for program generation. See merge request tpo/core/arti!1524
2023-08-23 09:24:13 +00:00 · 2023-08-23 09:24:13 +00:00 · 696696857d
parent 8007c1bd08 26b5ae9a3c
commit 696696857d
10 changed files with 393 additions and 394 deletions
--- a/crates/hashx/src/compiler.rs
+++ b/crates/hashx/src/compiler.rs
@ -7,7 +7,7 @@
 //! `Executable` wraps a mmap buffer from the `dynasmrt` crate and the
 //! `Architecture` is implemented in a CPU-specific way.

-use crate::{program::InstructionArray, register::RegisterFile, CompilerError};
+use crate::{program::Instruction, register::RegisterFile, CompilerError};

 #[cfg(all(feature = "compiler", target_arch = "x86_64"))]
 mod x86_64;
@ -41,7 +41,7 @@ pub(crate) struct Executable {
    not(any(target_arch = "x86_64", target_arch = "aarch64"))
 ))]
 impl Architecture for Executable {
-    fn compile(_program: &InstructionArray) -> Result<Self, CompilerError> {
+    fn compile(_program: &[Instruction]) -> Result<Self, CompilerError> {
        Err(CompilerError::NotAvailable)
    }

@ -71,7 +71,7 @@ where
    Self: Sized,
 {
    /// Compile an array of instructions into an Executable
-    fn compile(program: &InstructionArray) -> Result<Self, CompilerError>;
+    fn compile(program: &[Instruction]) -> Result<Self, CompilerError>;

    /// Run the compiled code, with a RegisterFile for input and output
    fn invoke(&self, regs: &mut RegisterFile);
--- a/crates/hashx/src/compiler/aarch64.rs
+++ b/crates/hashx/src/compiler/aarch64.rs
@ -1,22 +1,31 @@
 //! Dynamically emitted HashX assembly code for aarch64 targets

 use crate::compiler::{util, Architecture, Executable};
-use crate::program::{self, Instruction, InstructionArray};
-use crate::register::{RegisterFile, RegisterId, RegisterSet};
+use crate::program::{self, Instruction};
+use crate::register::{RegisterFile, RegisterId};
 use crate::CompilerError;
 use dynasmrt::{aarch64, DynasmApi, DynasmLabelApi};
 use std::mem;

 impl Architecture for Executable {
-    fn compile(program: &InstructionArray) -> Result<Self, CompilerError> {
+    fn compile(program: &[Instruction]) -> Result<Self, CompilerError> {
        let mut asm = Assembler::new();
-        emit_load_input(&mut asm);
-        emit_init_locals(&mut asm);
-        for inst in program {
-            emit_instruction(&mut asm, inst);
+        {
+            emit_load_input(&mut asm);
+            emit_init_locals(&mut asm);
+            debug_assert_eq!(asm.len(), PROLOGUE_SIZE);
+        }
+        for inst in program {
+            let prev_len = asm.len();
+            emit_instruction(&mut asm, inst);
+            debug_assert!(asm.len() - prev_len <= INSTRUCTION_SIZE_LIMIT);
+        }
+        {
+            let prev_len = asm.len();
+            emit_store_output(&mut asm);
+            emit_return(&mut asm);
+            debug_assert_eq!(asm.len() - prev_len, EPILOGUE_SIZE);
        }
-        emit_store_output(&mut asm);
-        emit_return(&mut asm);
        asm.finalize()
    }

@ -36,8 +45,19 @@ impl Architecture for Executable {
    }
 }

-/// Architecture-specific capacity for the temporary output buffer
-const BUFFER_CAPACITY: usize = 0x200 + program::NUM_INSTRUCTIONS * 16;
+/// Architecture-specific fixed prologue size
+const PROLOGUE_SIZE: usize = 0x28;
+
+/// Architecture-specific fixed epilogue size
+const EPILOGUE_SIZE: usize = 0x24;
+
+/// Architecture-specific maximum size for one instruction
+const INSTRUCTION_SIZE_LIMIT: usize = 0x18;
+
+/// Capacity for the temporary output buffer, before code is copied into
+/// a long-lived allocation that can be made executable.
+const BUFFER_CAPACITY: usize =
+    PROLOGUE_SIZE + EPILOGUE_SIZE + program::NUM_INSTRUCTIONS * INSTRUCTION_SIZE_LIMIT;

 /// Architecture-specific specialization of the Assembler
 type Assembler = util::Assembler<aarch64::Aarch64Relocation, BUFFER_CAPACITY>;
@ -51,9 +71,12 @@ trait RegisterMapper {
 }

 impl RegisterMapper for RegisterId {
+    #[inline(always)]
    fn x(&self) -> u32 {
        1 + (self.as_usize() as u32)
    }
+
+    #[inline(always)]
    fn offset(&self) -> u32 {
        (self.as_usize() * mem::size_of::<u64>()) as u32
    }
@ -75,7 +98,8 @@ macro_rules! dynasm {
 }

 /// Emit code to initialize our local variables to default values.
-fn emit_init_locals(asm: &mut Assembler) {
+#[inline(always)]
+fn emit_init_locals<A: DynasmApi>(asm: &mut A) {
    dynasm!(asm
        ; mov mulh_result32, wzr
        ; mov branch_prohibit_flag, wzr
@ -84,30 +108,32 @@ fn emit_init_locals(asm: &mut Assembler) {

 /// Emit code to move all input values from the RegisterFile into their
 /// actual hardware registers.
-fn emit_load_input(asm: &mut Assembler) {
-    RegisterSet::all().filter(|reg| {
+#[inline(always)]
+fn emit_load_input<A: DynasmApi>(asm: &mut A) {
+    for reg in RegisterId::all() {
        dynasm!(asm; ldr X(reg.x()), [register_file_ptr, #(reg.offset())]);
-        true
-    });
+    }
 }

 /// Emit code to move all output values from machine registers back into
 /// their RegisterFile slots.
-fn emit_store_output(asm: &mut Assembler) {
-    RegisterSet::all().filter(|reg| {
+#[inline(always)]
+fn emit_store_output<A: DynasmApi>(asm: &mut A) {
+    for reg in RegisterId::all() {
        dynasm!(asm; str X(reg.x()), [register_file_ptr, #(reg.offset())]);
-        true
-    });
+    }
 }

 /// Emit a return instruction.
-fn emit_return(asm: &mut Assembler) {
+#[inline(always)]
+fn emit_return<A: DynasmApi>(asm: &mut A) {
    dynasm!(asm; ret);
 }

 /// Load a sign extended 32-bit constant into the const_temp_64
 /// register, using a movz/movn and movk pair.
-fn emit_i32_const_temp_64(asm: &mut Assembler, value: i32) {
+#[inline(always)]
+fn emit_i32_const_temp_64<A: DynasmApi>(asm: &mut A, value: i32) {
    let high = (value >> 16) as u32;
    let low = (value & 0xFFFF) as u32;
    if value < 0 {
@ -119,7 +145,8 @@ fn emit_i32_const_temp_64(asm: &mut Assembler, value: i32) {
 }

 /// Load a 32-bit constant into const_temp_32, without extending.
-fn emit_u32_const_temp_32(asm: &mut Assembler, value: u32) {
+#[inline(always)]
+fn emit_u32_const_temp_32<A: DynasmApi>(asm: &mut A, value: u32) {
    let high = value >> 16;
    let low = value & 0xFFFF;
    dynasm!(asm
@ -129,6 +156,7 @@ fn emit_u32_const_temp_32(asm: &mut Assembler, value: u32) {
 }

 /// Emit code for a single [`Instruction`] in the hash program.
+#[inline(always)]
 fn emit_instruction(asm: &mut Assembler, inst: &Instruction) {
    /// Common implementation for binary operations on registers
    macro_rules! reg_op {
--- a/crates/hashx/src/compiler/util.rs
+++ b/crates/hashx/src/compiler/util.rs
@ -33,14 +33,22 @@ pub(crate) struct Assembler<R: Relocation, const S: usize> {

 impl<R: Relocation, const S: usize> Assembler<R, S> {
    /// Return the entry point as an [`AssemblyOffset`].
+    #[inline(always)]
    pub(crate) fn entry() -> AssemblyOffset {
        AssemblyOffset(0)
    }

+    /// Size of the code stored so far, in bytes
+    #[inline(always)]
+    pub(crate) fn len(&self) -> usize {
+        self.buffer.len()
+    }
+
    /// Make a new assembler with a temporary buffer but no executable buffer.
+    #[inline(always)]
    pub(crate) fn new() -> Self {
        Self {
-            buffer: Default::default(),
+            buffer: ArrayVec::new(),
            target: None,
            phantom: PhantomData,
        }
@ -51,6 +59,7 @@ impl<R: Relocation, const S: usize> Assembler<R, S> {
    /// This may fail if we can't allocate some memory, fill it, and mark
    /// it as executable. For example, a Linux platform with policy to restrict
    /// `mprotect` will show runtime errors at this point.
+    #[inline(always)]
    pub(crate) fn finalize(self) -> Result<Executable, CompilerError> {
        // We never execute code from the buffer until it's complete, and we use
        // a freshly mmap'ed buffer for each program. Because of this, we don't
@ -83,11 +92,13 @@ impl std::fmt::Debug for Executable {
 impl<R: Relocation, const S: usize> DynasmLabelApi for Assembler<R, S> {
    type Relocation = R;

+    #[inline(always)]
    fn local_label(&mut self, name: &'static str) {
        debug_assert_eq!(name, "target");
        self.target = Some(self.offset());
    }

+    #[inline(always)]
    fn backward_relocation(
        &mut self,
        name: &'static str,
@ -154,12 +165,14 @@ impl<R: Relocation, const S: usize> DynasmLabelApi for Assembler<R, S> {
 }

 impl<R: Relocation, const S: usize> Extend<u8> for Assembler<R, S> {
+    #[inline(always)]
    fn extend<T: IntoIterator<Item = u8>>(&mut self, iter: T) {
        self.buffer.extend(iter);
    }
 }

 impl<'a, R: Relocation, const S: usize> Extend<&'a u8> for Assembler<R, S> {
+    #[inline(always)]
    fn extend<T: IntoIterator<Item = &'a u8>>(&mut self, iter: T) {
        for byte in iter {
            self.buffer.push(*byte);
@ -168,20 +181,17 @@ impl<'a, R: Relocation, const S: usize> Extend<&'a u8> for Assembler<R, S> {
 }

 impl<R: Relocation, const S: usize> DynasmApi for Assembler<R, S> {
+    #[inline(always)]
    fn offset(&self) -> AssemblyOffset {
        AssemblyOffset(self.buffer.len())
    }

+    #[inline(always)]
    fn push(&mut self, byte: u8) {
        self.buffer.push(byte);
    }

-    fn align(&mut self, alignment: usize, with: u8) {
-        let offset = self.buffer.len() % alignment;
-        if offset != 0 {
-            for _ in offset..alignment {
-                self.buffer.push(with);
-            }
-        }
+    fn align(&mut self, _alignment: usize, _with: u8) {
+        unreachable!();
    }
 }
--- a/crates/hashx/src/compiler/x86_64.rs
+++ b/crates/hashx/src/compiler/x86_64.rs
@ -1,24 +1,33 @@
 //! Dynamically emitted HashX assembly code for x86_64 targets

 use crate::compiler::{util, Architecture, Executable};
-use crate::program::{self, Instruction, InstructionArray};
-use crate::register::{RegisterFile, RegisterId, RegisterSet};
+use crate::program::{self, Instruction};
+use crate::register::{RegisterFile, RegisterId};
 use crate::CompilerError;
-use dynasmrt::{x64, x64::Rq, DynasmApi, DynasmLabelApi, Register};
+use dynasmrt::{x64, x64::Rq, DynasmApi, DynasmLabelApi};
 use std::mem;

 impl Architecture for Executable {
-    fn compile(program: &InstructionArray) -> Result<Self, CompilerError> {
+    fn compile(program: &[Instruction]) -> Result<Self, CompilerError> {
        let mut asm = Assembler::new();
-        emit_save_regs(&mut asm);
-        emit_load_input(&mut asm);
-        emit_init_locals(&mut asm);
-        for inst in program {
-            emit_instruction(&mut asm, inst);
+        {
+            emit_save_regs(&mut asm);
+            emit_load_input(&mut asm);
+            emit_init_locals(&mut asm);
+            debug_assert_eq!(asm.len(), PROLOGUE_SIZE);
+        }
+        for inst in program {
+            let prev_len = asm.len();
+            emit_instruction(&mut asm, inst);
+            debug_assert!(asm.len() - prev_len <= INSTRUCTION_SIZE_LIMIT);
+        }
+        {
+            let prev_len = asm.len();
+            emit_store_output(&mut asm);
+            emit_restore_regs(&mut asm);
+            emit_return(&mut asm);
+            debug_assert_eq!(asm.len() - prev_len, EPILOGUE_SIZE);
        }
-        emit_store_output(&mut asm);
-        emit_restore_regs(&mut asm);
-        emit_return(&mut asm);
        asm.finalize()
    }

@ -37,8 +46,19 @@ impl Architecture for Executable {
    }
 }

-/// Architecture-specific capacity for the temporary output buffer
-const BUFFER_CAPACITY: usize = 0x200 + program::NUM_INSTRUCTIONS * 16;
+/// Architecture-specific fixed prologue size
+const PROLOGUE_SIZE: usize = 0x68;
+
+/// Architecture-specific fixed epilogue size
+const EPILOGUE_SIZE: usize = 0x60;
+
+/// Architecture-specific maximum size for one instruction
+const INSTRUCTION_SIZE_LIMIT: usize = 0x11;
+
+/// Capacity for the temporary output buffer, before code is copied into
+/// a long-lived allocation that can be made executable.
+const BUFFER_CAPACITY: usize =
+    PROLOGUE_SIZE + EPILOGUE_SIZE + program::NUM_INSTRUCTIONS * INSTRUCTION_SIZE_LIMIT;

 /// Architecture-specific specialization of the Assembler
 type Assembler = util::Assembler<x64::X64Relocation, BUFFER_CAPACITY>;
@ -52,9 +72,12 @@ trait RegisterMapper {
 }

 impl RegisterMapper for RegisterId {
+    #[inline(always)]
    fn rq(&self) -> u8 {
        8 + (self.as_usize() as u8)
    }
+
+    #[inline(always)]
    fn offset(&self) -> i32 {
        (self.as_usize() * mem::size_of::<u64>()) as i32
    }
@ -77,7 +100,8 @@ macro_rules! dynasm {
 }

 /// Emit code to initialize our local variables to default values.
-fn emit_init_locals(asm: &mut Assembler) {
+#[inline(always)]
+fn emit_init_locals<A: DynasmApi>(asm: &mut A) {
    dynasm!(asm
    ; xor mulh_result64, mulh_result64
    ; xor branch_prohibit_flag, branch_prohibit_flag
@ -105,47 +129,51 @@ const fn stack_size() -> i32 {
 }

 /// Emit code to allocate stack space and store REGS_TO_SAVE.
-fn emit_save_regs(asm: &mut Assembler) {
+#[inline(always)]
+fn emit_save_regs<A: DynasmApi>(asm: &mut A) {
    dynasm!(asm; sub rsp, stack_size());
    for (i, reg) in REGS_TO_SAVE.as_ref().iter().enumerate() {
        let offset = (i * mem::size_of::<u64>()) as i32;
-        dynasm!(asm; mov [rsp + offset], Rq(reg.code()));
+        dynasm!(asm; mov [rsp + offset], Rq(*reg as u8));
    }
 }

 /// Emit code to restore REGS_TO_SAVE and deallocate stack space.
-fn emit_restore_regs(asm: &mut Assembler) {
+#[inline(always)]
+fn emit_restore_regs<A: DynasmApi>(asm: &mut A) {
    for (i, reg) in REGS_TO_SAVE.as_ref().iter().enumerate() {
        let offset = (i * mem::size_of::<u64>()) as i32;
-        dynasm!(asm; mov Rq(reg.code()), [rsp + offset]);
+        dynasm!(asm; mov Rq(*reg as u8), [rsp + offset]);
    }
    dynasm!(asm; add rsp, stack_size());
 }

 /// Emit code to move all input values from the RegisterFile into their
 /// actual hardware registers.
-fn emit_load_input(asm: &mut Assembler) {
-    RegisterSet::all().filter(|reg| {
+#[inline(always)]
+fn emit_load_input<A: DynasmApi>(asm: &mut A) {
+    for reg in RegisterId::all() {
        dynasm!(asm; mov Rq(reg.rq()), [register_file_ptr + reg.offset()]);
-        true
-    });
+    }
 }

 /// Emit code to move all output values from machine registers back into
 /// their RegisterFile slots.
-fn emit_store_output(asm: &mut Assembler) {
-    RegisterSet::all().filter(|reg| {
+#[inline(always)]
+fn emit_store_output<A: DynasmApi>(asm: &mut A) {
+    for reg in RegisterId::all() {
        dynasm!(asm; mov [register_file_ptr + reg.offset()], Rq(reg.rq()));
-        true
-    });
+    }
 }

 /// Emit a return instruction.
-fn emit_return(asm: &mut Assembler) {
+#[inline(always)]
+fn emit_return<A: DynasmApi>(asm: &mut A) {
    dynasm!(asm; ret);
 }

 /// Emit code for a single [`Instruction`] in the hash program.
+#[inline(always)]
 fn emit_instruction(asm: &mut Assembler, inst: &Instruction) {
    /// Common implementation for binary operations on registers
    macro_rules! reg_op {
--- a/crates/hashx/src/constraints.rs
+++ b/crates/hashx/src/constraints.rs
@ -9,7 +9,7 @@
 //! Generating correct HashX output depends on applying exactly the right
 //! constraints.

-use crate::program::{Instruction, InstructionArray, Opcode};
+use crate::program::{Instruction, Opcode};
 use crate::register::{RegisterId, RegisterSet, NUM_REGISTERS};
 use crate::scheduler::Scheduler;

@ -39,7 +39,7 @@ mod model {
        matches!(op, Opcode::Mul | Opcode::SMulH | Opcode::UMulH)
    }

-    /// Does an instruction prohibit using the same register for source and dest?
+    /// Does an instruction prohibit using the same register for src and dst?
    ///
    /// Meaningful only for ops that have both a source and destination register.
    #[inline(always)]
@ -85,31 +85,19 @@ mod model {
    #[inline(always)]
    pub(super) fn writer_pair_allowed(
        pass: Pass,
-        last_writer: Option<&RegisterWriter>,
-        this_writer: &RegisterWriter,
+        last_writer: RegisterWriter,
+        this_writer: RegisterWriter,
    ) -> bool {
        match (last_writer, this_writer) {
            // HashX disallows back-to-back 64-bit multiplies on the
            // same destination register in Pass::Original but permits
            // them on the retry if the source register isn't identical.
-            (
-                Some(RegisterWriter::RegSource(Opcode::Mul, _)),
-                RegisterWriter::RegSource(Opcode::Mul, _),
-            ) if matches!(pass, Pass::Original) => false,
-
-            // Add/Sub from the same source register can't be paired
-            // with each other. (They might cancel out)
-            (
-                Some(RegisterWriter::RegSource(Opcode::AddShift, last_src)),
-                RegisterWriter::RegSource(Opcode::Sub, this_src),
-            ) if this_src == last_src => false,
-            (
-                Some(RegisterWriter::RegSource(Opcode::Sub, last_src)),
-                RegisterWriter::RegSource(Opcode::AddShift, this_src),
-            ) if this_src == last_src => false,
+            (RegisterWriter::Mul(_), RegisterWriter::Mul(_)) if matches!(pass, Pass::Original) => {
+                false
+            }

            // Other pairings are allowed if the writer info differs at all.
-            (last_writer, this_writer) => last_writer != Some(this_writer),
+            (last_writer, this_writer) => last_writer != this_writer,
        }
    }

@ -132,29 +120,57 @@ mod model {
    /// This is conceptually similar to storing the last [`super::Instruction`]
    /// that wrote to a register, but HashX sometimes needs information for
    /// constraints which won't end up in the final `Instruction`.
-    #[derive(Debug, Clone, Eq, PartialEq)]
+    ///
+    /// We've chosen the encoding to minimize the code size in
+    /// writer_pair_allowed. Most pairwise comparisons can just be a register
+    /// equality test.
+    ///
+    /// The instructions here fall into three categories which use their own
+    /// format for encoding arguments:
+    ///
+    ///  - Wide Multiply, extra u32
+    ///
+    ///    UMulH and SMulH use an additional otherwise unused 32-bit value
+    ///    from the Rng when considering writer collisions.
+    ///
+    ///    As far as I can tell this is a bug in the original implementation
+    ///    but we can't change the behavior without breaking compatibility.
+    ///
+    ///    The collisions are rare enough not to be a worthwhile addition
+    ///    to ASIC-resistance. It seems like this was a vestigial feature
+    ///    left over from immediate value matching features which were removed
+    ///    during the development of HashX, but I can't be sure.
+    ///
+    ///  - Constant source
+    ///
+    ///    Only considers the opcode itself, not the specific immediate value.
+    ///
+    ///  - Register source
+    ///
+    ///    Considers the source register, collapses add/subtract into one op.
+    ///
+    #[derive(Debug, Default, Clone, Copy, Eq, PartialEq)]
    pub(crate) enum RegisterWriter {
-        /// Special format for wide multiply
-        ///
-        /// HashX includes an otherwise unused phantom immediate value which
-        /// can (very rarely) affect constraint selection if it collides.
-        ///
-        /// As far as I can tell this is a bug in the original implementation
-        /// but we can't change the behavior without breaking compatibility.
-        ///
-        /// The collisions are rare enough not to be a worthwhile addition
-        /// to ASIC-resistance. It seems like this was a vestigial feature
-        /// left over from immediate value matching features which were removed
-        /// during the development of HashX, but I can't be sure.
-        WideMul(Opcode, u32),
-
-        /// Writer for instructions with an immediate source
-        ///
-        /// The specific immediate value is not used.
-        ConstSource(Opcode),
-
-        /// Writer for instructions with register source, unique by source register
-        RegSource(Opcode, RegisterId),
+        /// Register not written yet
+        #[default]
+        None,
+        /// Register source writer for [`super::Instruction::Mul`]
+        Mul(RegisterId),
+        /// Wide multiply writer for [`super::Instruction::UMulH`]
+        UMulH(u32),
+        /// Wide multiply writer for [`super::Instruction::SMulH`]
+        SMulH(u32),
+        /// Register source writer for [`super::Instruction::AddShift`]
+        /// and [`super::Instruction::Sub`]
+        AddSub(RegisterId),
+        /// Constant source writer for [`super::Instruction::AddConst`]
+        AddConst,
+        /// Register source writer for [`super::Instruction::Xor`]
+        Xor(RegisterId),
+        /// Constant source writer for [`super::Instruction::XorConst`]
+        XorConst,
+        /// Constant source writer for [`super::Instruction::Rotate`]
+        Rotate,
    }
 }

@ -187,16 +203,13 @@ impl Validator {

    /// Commit a new instruction to the validator state.
    #[inline(always)]
-    pub(crate) fn commit_instruction(&mut self, inst: &Instruction, regw: Option<RegisterWriter>) {
+    pub(crate) fn commit_instruction(&mut self, inst: &Instruction, regw: RegisterWriter) {
        if model::is_multiply(inst.opcode()) {
            self.multiply_count += 1;
        }
        match inst.destination() {
-            None => assert!(regw.is_none()),
-            Some(dst) => self.writer_map.insert(
-                dst,
-                regw.expect("instructions with destination always have a RegisterWriter"),
-            ),
+            None => debug_assert_eq!(regw, RegisterWriter::None),
+            Some(dst) => self.writer_map.insert(dst, regw),
        }
    }

@ -208,7 +221,7 @@ impl Validator {
    pub(crate) fn check_whole_program(
        &self,
        scheduler: &Scheduler,
-        instructions: &InstructionArray,
+        instructions: &[Instruction],
    ) -> Result<(), ()> {
        if instructions.len() == model::REQUIRED_INSTRUCTIONS
            && scheduler.overall_latency().as_usize() == model::REQUIRED_OVERALL_RESULT_AT_CYCLE
@ -220,36 +233,67 @@ impl Validator {
        }
    }

-    /// Figure out the allowed set of destination registers for an op after its
-    /// source is known, using the current state of the validator.
+    /// Begin checking which destination registers are allowed for an op after
+    /// its source is known, using the current state of the validator.
+    ///
+    /// Returns a DstRegisterChecker which can be used to test each specific
+    /// destination RegisterId quickly.
    #[inline(always)]
    pub(crate) fn dst_registers_allowed(
        &self,
-        available: RegisterSet,
        op: Opcode,
        pass: Pass,
-        writer_info: &RegisterWriter,
+        writer_info: RegisterWriter,
        src: Option<RegisterId>,
-    ) -> RegisterSet {
-        available.filter(
-            #[inline(always)]
-            |dst| {
-                // One register specified by DISALLOW_REGISTER_FOR_ADDSHIFT can't
-                // be used as destination for AddShift.
-                if op == Opcode::AddShift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT {
-                    return false;
-                }
-
-                // A few instructions disallow choosing src and dst as the same
-                if model::disallow_src_is_dst(op) && src == Some(dst) {
-                    return false;
-                }
-
-                // Additional constraints are written on the pair of previous and
-                // current instructions with the same destination.
-                model::writer_pair_allowed(pass, self.writer_map.get(dst), writer_info)
+    ) -> DstRegisterChecker<'_> {
+        DstRegisterChecker {
+            pass,
+            writer_info,
+            writer_map: &self.writer_map,
+            op_is_add_shift: op == Opcode::AddShift,
+            disallow_equal: if model::disallow_src_is_dst(op) {
+                src
+            } else {
+                None
            },
-        )
+        }
+    }
+}
+
+/// State information returned by [`Validator::dst_registers_allowed`]
+#[derive(Debug, Clone)]
+pub(crate) struct DstRegisterChecker<'v> {
+    /// Is this the original or retry pass?
+    pass: Pass,
+    /// Reference to a table of [`RegisterWriter`] information for each register
+    writer_map: &'v RegisterWriterMap,
+    /// The new [`RegisterWriter`] under consideration
+    writer_info: RegisterWriter,
+    /// Was this [`Opcode::AddShift`]?
+    op_is_add_shift: bool,
+    /// Optionally disallow one matching register, used to implement [`model::disallow_src_is_dst`]
+    disallow_equal: Option<RegisterId>,
+}
+
+impl<'v> DstRegisterChecker<'v> {
+    /// Check a single destination register for usability, using context from
+    /// [`Validator::dst_registers_allowed`]
+    #[inline(always)]
+    pub(crate) fn check(&self, dst: RegisterId) -> bool {
+        // One register specified by DISALLOW_REGISTER_FOR_ADDSHIFT can't
+        // be used as destination for AddShift.
+        if self.op_is_add_shift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT {
+            return false;
+        }
+
+        // A few instructions disallow choosing src and dst as the same
+        if Some(dst) == self.disallow_equal {
+            return false;
+        }
+
+        // Additional constraints are written on the pair of previous and
+        // current instructions with the same destination.
+        model::writer_pair_allowed(self.pass, self.writer_map.get(dst), self.writer_info)
    }
 }

@ -259,7 +303,7 @@ impl Validator {
 pub(crate) fn src_registers_allowed(available: RegisterSet, op: Opcode) -> RegisterSet {
    // HashX defines a special case DISALLOW_REGISTER_FOR_ADDSHIFT for
    // destination registers, and it also includes a look-ahead
-    // condition here in source register allocation to prevent the dest
+    // condition here in source register allocation to prevent the dst
    // allocation from getting stuck as often. If we have only two
    // remaining registers for AddShift and one is the disallowed reg,
    // HashX defines that the random choice is short-circuited early
@ -269,7 +313,7 @@ pub(crate) fn src_registers_allowed(available: RegisterSet, op: Opcode) -> Regis
        && available.contains(model::DISALLOW_REGISTER_FOR_ADDSHIFT)
        && available.len() == 2
    {
-        available.filter(
+        RegisterSet::from_filter(
            #[inline(always)]
            |reg| reg == model::DISALLOW_REGISTER_FOR_ADDSHIFT,
        )
@ -297,9 +341,9 @@ pub(crate) fn opcode_pair_allowed(previous: Option<Opcode>, proposed: Opcode) ->
    }
 }

-/// Map each [`RegisterId`] to an [`Option<RegisterWriter>`]
+/// Map each [`RegisterId`] to a [`RegisterWriter`]
 #[derive(Default, Debug, Clone)]
-struct RegisterWriterMap([Option<RegisterWriter>; NUM_REGISTERS]);
+struct RegisterWriterMap([RegisterWriter; NUM_REGISTERS]);

 impl RegisterWriterMap {
    /// Make a new empty register writer map.
@ -313,12 +357,12 @@ impl RegisterWriterMap {
    /// Write or overwrite the last [`RegisterWriter`] associated with `reg`.
    #[inline(always)]
    fn insert(&mut self, reg: RegisterId, writer: RegisterWriter) {
-        self.0[reg.as_usize()] = Some(writer);
+        self.0[reg.as_usize()] = writer;
    }

    /// Return the most recent mapping for 'reg', if any.
    #[inline(always)]
-    fn get(&self, reg: RegisterId) -> Option<&RegisterWriter> {
-        self.0[reg.as_usize()].as_ref()
+    fn get(&self, reg: RegisterId) -> RegisterWriter {
+        self.0[reg.as_usize()]
    }
 }
--- a/crates/hashx/src/generator.rs
+++ b/crates/hashx/src/generator.rs
@ -1,7 +1,7 @@
 //! Pseudorandom generator for hash programs and parts thereof

 use crate::constraints::{self, Pass, RegisterWriter, Validator};
-use crate::program::{Instruction, InstructionArray, Opcode, Program};
+use crate::program::{Instruction, Opcode};
 use crate::rand::RngBuffer;
 use crate::register::{RegisterId, RegisterSet};
 use crate::scheduler::{InstructionPlan, Scheduler};
@ -84,17 +84,8 @@ mod model {
    pub(super) const BRANCH_MASK_BIT_WEIGHT: usize = 4;
 }

-/// Generate a hash program from an arbitrary [`RngCore`] implementer.
-///
-/// This can return [`Error::ProgramConstraints`] if the HashX post-generation
-/// program verification fails. During normal use this will happen once per
-/// several thousand random seeds, and the caller should skip to another seed.
-pub(crate) fn generate_program<T: RngCore>(rng: &mut T) -> Result<Program, Error> {
-    Generator::new(rng).generate_program()
-}
-
-/// Internal state for the program generator
-struct Generator<'r, R: RngCore> {
+/// Program generator
+pub(crate) struct Generator<'r, R: RngCore> {
    /// The program generator wraps a random number generator, via [`RngBuffer`].
    rng: RngBuffer<'r, R>,

@ -118,7 +109,7 @@ struct Generator<'r, R: RngCore> {
 impl<'r, R: RngCore> Generator<'r, R> {
    /// Create a fresh program generator from a random number generator state.
    #[inline(always)]
-    fn new(rng: &'r mut R) -> Self {
+    pub(crate) fn new(rng: &'r mut R) -> Self {
        Generator {
            rng: RngBuffer::new(rng),
            scheduler: Scheduler::new(),
@ -135,7 +126,7 @@ impl<'r, R: RngCore> Generator<'r, R> {
    /// The choice is perfectly uniform only if the register set is a power of
    /// two length. Uniformity is not critical here.
    #[inline(always)]
-    fn select_register(&mut self, reg_options: RegisterSet) -> Result<RegisterId, ()> {
+    fn select_register(&mut self, reg_options: &RegisterSet) -> Result<RegisterId, ()> {
        match reg_options.len() {
            0 => Err(()),
            1 => Ok(reg_options.index(0)),
@ -198,30 +189,28 @@ impl<'r, R: RngCore> Generator<'r, R> {

    /// Generate an entire program.
    ///
-    /// This generates instructions until the state can't be advanced any
-    /// further. Returns with [`Error::ProgramConstraints`] if the program
-    /// fails the `HashX` whole-program checks. These constraint failures occur
-    /// in normal use, on a small fraction of seed values.
+    /// Generates instructions into a provided empty [`Vec`] until it reaches
+    /// capacity or the generator state can't advance. Runs the whole-program
+    /// validator, returning [`Error::ProgramConstraints`] if the program fails
+    /// these checks. This happens in normal use on a small fraction of seeds.
    #[inline(always)]
-    fn generate_program(&mut self) -> Result<Program, Error> {
-        let mut array: InstructionArray = Default::default();
-        while array.len() < array.capacity() {
+    pub(crate) fn generate_program(&mut self, output: &mut Vec<Instruction>) -> Result<(), Error> {
+        assert!(output.is_empty());
+        while output.len() < output.capacity() {
            match self.generate_instruction() {
                Err(()) => break,
                Ok((inst, regw)) => {
                    let state_advance = self.commit_instruction_state(&inst, regw);
-                    array.push(inst);
+                    output.push(inst);
                    if let Err(()) = state_advance {
                        break;
                    }
                }
            }
        }
-        let result = self.validator.check_whole_program(&self.scheduler, &array);
-        match result {
-            Err(()) => Err(Error::ProgramConstraints),
-            Ok(()) => Ok(Program::new(array)),
-        }
+        self.validator
+            .check_whole_program(&self.scheduler, output)
+            .map_err(|()| Error::ProgramConstraints)
    }

    /// Generate the next instruction.
@ -235,7 +224,7 @@ impl<'r, R: RngCore> Generator<'r, R> {
    /// This only returns `Err(())` if we've hit a stopping condition for the
    /// program.
    #[inline(always)]
-    fn generate_instruction(&mut self) -> Result<(Instruction, Option<RegisterWriter>), ()> {
+    fn generate_instruction(&mut self) -> Result<(Instruction, RegisterWriter), ()> {
        loop {
            if let Ok(result) = self.instruction_gen_attempt(Pass::Original) {
                return Ok(result);
@ -268,18 +257,30 @@ impl<'r, R: RngCore> Generator<'r, R> {
    /// choosing the opcode-specific parts of the instruction. Each of these
    /// choices affects the Rng state, and may fail if conditions are not met.
    #[inline(always)]
-    fn instruction_gen_attempt(
-        &mut self,
-        pass: Pass,
-    ) -> Result<(Instruction, Option<RegisterWriter>), ()> {
+    fn instruction_gen_attempt(&mut self, pass: Pass) -> Result<(Instruction, RegisterWriter), ()> {
        let op = self.choose_opcode(pass);
        let plan = self.scheduler.instruction_plan(op)?;
        let (inst, regw) = self.choose_instruction_with_opcode_plan(op, pass, &plan)?;
-        assert_eq!(inst.opcode(), op);
+        debug_assert_eq!(inst.opcode(), op);
        self.scheduler.commit_instruction_plan(&plan, &inst);
        Ok((inst, regw))
    }

+    /// Choose only a source register, depending on the opcode and timing plan
+    #[inline(never)]
+    fn choose_src_reg(
+        &mut self,
+        op: Opcode,
+        timing_plan: &InstructionPlan,
+    ) -> Result<RegisterId, ()> {
+        let src_set = RegisterSet::from_filter(|src| {
+            self.scheduler
+                .register_available(src, timing_plan.cycle_issued())
+        });
+        let src_set = constraints::src_registers_allowed(src_set, op);
+        self.select_register(&src_set)
+    }
+
    /// Choose both a source and destination register using a normal
    /// [`RegisterWriter`] for two-operand instructions.
    #[inline(always)]
@ -287,18 +288,12 @@ impl<'r, R: RngCore> Generator<'r, R> {
        &mut self,
        op: Opcode,
        pass: Pass,
+        writer_info_fn: fn(RegisterId) -> RegisterWriter,
        timing_plan: &InstructionPlan,
    ) -> Result<(RegisterId, RegisterId, RegisterWriter), ()> {
-        let avail_set = self
-            .scheduler
-            .registers_available(timing_plan.cycle_issued());
-        let src_set = constraints::src_registers_allowed(avail_set, op);
-        let src = self.select_register(src_set)?;
-        let writer_info = RegisterWriter::RegSource(op, src);
-        let dst_set =
-            self.validator
-                .dst_registers_allowed(avail_set, op, pass, &writer_info, Some(src));
-        let dst = self.select_register(dst_set)?;
+        let src = self.choose_src_reg(op, timing_plan)?;
+        let writer_info = writer_info_fn(src);
+        let dst = self.choose_dst_reg(op, pass, writer_info, Some(src), timing_plan)?;
        Ok((src, dst, writer_info))
    }

@ -310,38 +305,34 @@ impl<'r, R: RngCore> Generator<'r, R> {
        &mut self,
        op: Opcode,
        pass: Pass,
-        writer_info: &RegisterWriter,
+        writer_info: RegisterWriter,
        timing_plan: &InstructionPlan,
    ) -> Result<(RegisterId, RegisterId), ()> {
-        let avail_set = self
-            .scheduler
-            .registers_available(timing_plan.cycle_issued());
-        let src_set = constraints::src_registers_allowed(avail_set, op);
-        let src = self.select_register(src_set)?;
-        let dst_set =
-            self.validator
-                .dst_registers_allowed(avail_set, op, pass, writer_info, Some(src));
-        let dst = self.select_register(dst_set)?;
+        let src = self.choose_src_reg(op, timing_plan)?;
+        let dst = self.choose_dst_reg(op, pass, writer_info, Some(src), timing_plan)?;
        Ok((src, dst))
    }

-    /// Choose a destination register only.
-    #[inline(always)]
+    /// Choose a destination register only, using source and writer info
+    /// as well as the current state of the validator.
+    #[inline(never)]
    fn choose_dst_reg(
        &mut self,
        op: Opcode,
        pass: Pass,
-        writer_info: &RegisterWriter,
+        writer_info: RegisterWriter,
+        src: Option<RegisterId>,
        timing_plan: &InstructionPlan,
    ) -> Result<RegisterId, ()> {
-        let avail_set = self
-            .scheduler
-            .registers_available(timing_plan.cycle_issued());
-        let dst_set = self
+        let validator = self
            .validator
-            .dst_registers_allowed(avail_set, op, pass, writer_info, None);
-        let dst = self.select_register(dst_set)?;
-        Ok(dst)
+            .dst_registers_allowed(op, pass, writer_info, src);
+        let dst_set = RegisterSet::from_filter(|dst| {
+            self.scheduler
+                .register_available(dst, timing_plan.cycle_issued())
+                && validator.check(dst)
+        });
+        self.select_register(&dst_set)
    }

    /// With an [`Opcode`] and an execution unit timing plan already in mind,
@ -355,78 +346,80 @@ impl<'r, R: RngCore> Generator<'r, R> {
        op: Opcode,
        pass: Pass,
        plan: &InstructionPlan,
-    ) -> Result<(Instruction, Option<RegisterWriter>), ()> {
+    ) -> Result<(Instruction, RegisterWriter), ()> {
        Ok(match op {
-            Opcode::Target => (Instruction::Target, None),
+            Opcode::Target => (Instruction::Target, RegisterWriter::None),

            Opcode::Branch => (
                Instruction::Branch {
                    mask: self.select_constant_weight_bit_mask(model::BRANCH_MASK_BIT_WEIGHT),
                },
-                None,
+                RegisterWriter::None,
            ),

            Opcode::UMulH => {
-                let regw = RegisterWriter::WideMul(op, self.rng.next_u32());
-                let (src, dst) =
-                    self.choose_src_dst_regs_with_writer_info(op, pass, &regw, plan)?;
-                (Instruction::UMulH { src, dst }, Some(regw))
+                let regw = RegisterWriter::UMulH(self.rng.next_u32());
+                let (src, dst) = self.choose_src_dst_regs_with_writer_info(op, pass, regw, plan)?;
+                (Instruction::UMulH { src, dst }, regw)
            }

            Opcode::SMulH => {
-                let regw = RegisterWriter::WideMul(op, self.rng.next_u32());
-                let (src, dst) =
-                    self.choose_src_dst_regs_with_writer_info(op, pass, &regw, plan)?;
-                (Instruction::SMulH { src, dst }, Some(regw))
+                let regw = RegisterWriter::SMulH(self.rng.next_u32());
+                let (src, dst) = self.choose_src_dst_regs_with_writer_info(op, pass, regw, plan)?;
+                (Instruction::SMulH { src, dst }, regw)
            }

            Opcode::Mul => {
-                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?;
-                (Instruction::Mul { src, dst }, Some(regw))
+                let regw = RegisterWriter::Mul;
+                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?;
+                (Instruction::Mul { src, dst }, regw)
            }

            Opcode::Sub => {
-                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?;
-                (Instruction::Sub { src, dst }, Some(regw))
+                let regw = RegisterWriter::AddSub;
+                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?;
+                (Instruction::Sub { src, dst }, regw)
            }

            Opcode::Xor => {
-                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?;
-                (Instruction::Xor { src, dst }, Some(regw))
+                let regw = RegisterWriter::Xor;
+                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?;
+                (Instruction::Xor { src, dst }, regw)
            }

            Opcode::AddShift => {
+                let regw = RegisterWriter::AddSub;
                let left_shift = (self.rng.next_u32() & 3) as u8;
-                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?;
+                let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?;
                (
                    Instruction::AddShift {
                        src,
                        dst,
                        left_shift,
                    },
-                    Some(regw),
+                    regw,
                )
            }

            Opcode::AddConst => {
-                let regw = RegisterWriter::ConstSource(op);
+                let regw = RegisterWriter::AddConst;
                let src = self.select_nonzero_u32(u32::MAX) as i32;
-                let dst = self.choose_dst_reg(op, pass, &regw, plan)?;
-                (Instruction::AddConst { src, dst }, Some(regw))
+                let dst = self.choose_dst_reg(op, pass, regw, None, plan)?;
+                (Instruction::AddConst { src, dst }, regw)
            }

            Opcode::XorConst => {
-                let regw = RegisterWriter::ConstSource(op);
+                let regw = RegisterWriter::XorConst;
                let src = self.select_nonzero_u32(u32::MAX) as i32;
-                let dst = self.choose_dst_reg(op, pass, &regw, plan)?;
-                (Instruction::XorConst { src, dst }, Some(regw))
+                let dst = self.choose_dst_reg(op, pass, regw, None, plan)?;
+                (Instruction::XorConst { src, dst }, regw)
            }

            Opcode::Rotate => {
-                let regw = RegisterWriter::ConstSource(op);
+                let regw = RegisterWriter::Rotate;
                let right_rotate: u8 = self.select_nonzero_u32(63) as u8;
-                let dst = self.choose_dst_reg(op, pass, &regw, plan)?;
-                (Instruction::Rotate { dst, right_rotate }, Some(regw))
+                let dst = self.choose_dst_reg(op, pass, regw, None, plan)?;
+                (Instruction::Rotate { dst, right_rotate }, regw)
            }
        })
    }
@ -440,7 +433,7 @@ impl<'r, R: RngCore> Generator<'r, R> {
    fn commit_instruction_state(
        &mut self,
        inst: &Instruction,
-        regw: Option<RegisterWriter>,
+        regw: RegisterWriter,
    ) -> Result<(), ()> {
        self.validator.commit_instruction(inst, regw);
        self.scheduler.advance_instruction_stream(inst.opcode())
--- a/crates/hashx/src/lib.rs
+++ b/crates/hashx/src/lib.rs
@ -51,7 +51,6 @@ mod scheduler;
 mod siphash;

 use crate::compiler::{Architecture, Executable};
-use crate::generator::generate_program;
 use crate::program::Program;
 use rand_core::RngCore;

@ -111,8 +110,8 @@ pub struct HashX {
 /// to store the program data.
 #[derive(Debug)]
 enum RuntimeProgram {
-    /// Select the interpreted runtime, and hold a boxed Program for it to run.
-    Interpret(Box<Program>),
+    /// Select the interpreted runtime, and hold a Program for it to run.
+    Interpret(Program),
    /// Select the compiled runtime, and hold an executable code page.
    Compiled(Executable),
 }
@ -203,7 +202,7 @@ impl HashXBuilder {
        rng: &mut R,
        register_key: SipState,
    ) -> Result<HashX, Error> {
-        let program = generate_program(rng)?;
+        let program = Program::generate(rng)?;
        self.build_from_program(program, register_key)
    }

@ -217,13 +216,13 @@ impl HashXBuilder {
        Ok(HashX {
            register_key,
            program: match self.runtime {
-                RuntimeOption::InterpretOnly => RuntimeProgram::Interpret(Box::new(program)),
+                RuntimeOption::InterpretOnly => RuntimeProgram::Interpret(program),
                RuntimeOption::CompileOnly => {
                    RuntimeProgram::Compiled(Architecture::compile((&program).into())?)
                }
                RuntimeOption::TryCompile => match Architecture::compile((&program).into()) {
                    Ok(exec) => RuntimeProgram::Compiled(exec),
-                    Err(_) => RuntimeProgram::Interpret(Box::new(program)),
+                    Err(_) => RuntimeProgram::Interpret(program),
                },
            },
        })
--- a/crates/hashx/src/program.rs
+++ b/crates/hashx/src/program.rs
@ -1,7 +1,9 @@
 //! Define the internal hash program representation used by HashX.

+use crate::generator::Generator;
 use crate::register::{RegisterFile, RegisterId};
-use arrayvec::ArrayVec;
+use crate::Error;
+use rand_core::RngCore;
 use std::fmt;
 use std::ops::BitXor;

@ -179,24 +181,14 @@ impl Instruction {
    }
 }

-/// Fixed-size array of instructions, either a complete program or a
-/// program under construction
-pub(crate) type InstructionArray = ArrayVec<Instruction, NUM_INSTRUCTIONS>;
-
-/// Generated `HashX` program, as a list of instructions.
+/// Generated `HashX` program, as a boxed slice of instructions
 #[derive(Clone, Default)]
-pub struct Program {
-    /// The InstructionArray that this Program wraps
-    ///
-    /// InstructionArray provides storage, and this type indicates that the
-    /// program should be a well-formed HashX function.
-    instructions: InstructionArray,
-}
+pub struct Program(Box<[Instruction]>);

 impl fmt::Debug for Program {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "Program {{")?;
-        for (addr, inst) in self.instructions.iter().enumerate() {
+        for (addr, inst) in self.0.iter().enumerate() {
            writeln!(f, " [{:3}]: {:?}", addr, inst)?;
        }
        write!(f, "}}")
@ -204,10 +196,16 @@ impl fmt::Debug for Program {
 }

 impl Program {
-    /// Construct a finished `Program` from a list of instructions.
-    #[inline(always)]
-    pub(crate) fn new(instructions: InstructionArray) -> Self {
-        Self { instructions }
+    /// Generate a new `Program` from an arbitrary [`RngCore`] implementer
+    ///
+    /// This can return [`Error::ProgramConstraints`] if the HashX
+    /// post-generation program verification fails. During normal use this
+    /// will happen once per several thousand random seeds, and the caller
+    /// should skip to another seed.
+    pub(crate) fn generate<T: RngCore>(rng: &mut T) -> Result<Self, Error> {
+        let mut instructions = Vec::with_capacity(NUM_INSTRUCTIONS);
+        Generator::new(rng).generate_program(&mut instructions)?;
+        Ok(Program(instructions.into_boxed_slice()))
    }

    /// Reference implementation for `Program` behavior
@ -254,9 +252,9 @@ impl Program {
            }};
        }

-        while program_counter < self.instructions.len() {
+        while program_counter < self.0.len() {
            let next_pc = program_counter + 1;
-            program_counter = match &self.instructions[program_counter] {
+            program_counter = match &self.0[program_counter] {
                Instruction::Target => {
                    branch_target = Some(program_counter);
                    next_pc
@ -305,9 +303,9 @@ impl Program {
    }
 }

-impl<'a> From<&'a Program> for &'a InstructionArray {
+impl<'a> From<&'a Program> for &'a [Instruction] {
    #[inline(always)]
    fn from(prog: &'a Program) -> Self {
-        &prog.instructions
+        &prog.0
    }
 }
--- a/crates/hashx/src/register.rs
+++ b/crates/hashx/src/register.rs
@ -1,6 +1,7 @@
 //! Define HashX's register file, and how it's created and digested.

 use crate::siphash::{siphash24_ctr, SipState};
+use arrayvec::ArrayVec;
 use std::fmt;

 /// Number of virtual registers in the HashX machine
@ -29,6 +30,12 @@ impl RegisterId {
    pub(crate) fn as_usize(&self) -> usize {
        self.0 as usize
    }
+
+    /// Create an iterator over every [`RegisterId`], in ascending order
+    #[inline(always)]
+    pub(crate) fn all() -> impl Iterator<Item = RegisterId> {
+        (0_u8..(NUM_REGISTERS as u8)).map(RegisterId)
+    }
 }

 /// Identify a set of RegisterIds
@ -36,15 +43,10 @@ impl RegisterId {
 /// This could be done compactly as a u8 bitfield for storage purposes, but
 /// in our program generator this is never stored long-term. Instead, we want
 /// something the optimizer can reason about as effectively as possible, and
-/// let's inline as much as possible in order to resolve special cases in
-/// the program generator at compile time.
-#[derive(Clone, Copy, Eq, PartialEq)]
-pub(crate) struct RegisterSet {
-    /// Number of registers in the set
-    len: usize,
-    /// Array indexed by register Id, indicating registers we've excluded
-    reg_not_in_set: [bool; 8],
-}
+/// we want to optimize for an index() implementation that doesn't branch.
+/// This uses a fixed-capacity array of registers in-set, always sorted.
+#[derive(Default, Clone, Eq, PartialEq)]
+pub(crate) struct RegisterSet(ArrayVec<RegisterId, NUM_REGISTERS>);

 impl fmt::Debug for RegisterSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@ -60,66 +62,29 @@ impl fmt::Debug for RegisterSet {
 }

 impl RegisterSet {
-    /// Construct the set of all registers.
-    ///
-    /// This is the main way to construct a new RegisterId, starting with
-    /// all available registers and filtering them repeatedly.
-    #[inline(always)]
-    pub(crate) fn all() -> Self {
-        Self {
-            len: NUM_REGISTERS,
-            reg_not_in_set: Default::default(),
-        }
-    }
-
    /// Number of registers still contained in this set
    #[inline(always)]
    pub(crate) fn len(&self) -> usize {
-        self.len
+        self.0.len()
    }

    /// Test if a register is contained in the set.
    #[inline(always)]
    pub(crate) fn contains(&self, id: RegisterId) -> bool {
-        !self.reg_not_in_set[id.0 as usize]
+        self.0.contains(&id)
    }

-    /// Filter this register set through a predicate.
-    ///
-    /// Invokes the predicate only for registers in this set, and returns the
-    /// set of registers for which it returned true.
+    /// Build a new RegisterSet from each register for which a predicate
+    /// function returns `true`.
    #[inline(always)]
-    pub(crate) fn filter<P: FnMut(RegisterId) -> bool>(&self, mut predicate: P) -> Self {
-        let mut result = Self {
-            len: 0,
-            reg_not_in_set: Default::default(),
-        };
-        self.filter_impl(0, &mut predicate, &mut result);
-        self.filter_impl(1, &mut predicate, &mut result);
-        self.filter_impl(2, &mut predicate, &mut result);
-        self.filter_impl(3, &mut predicate, &mut result);
-        self.filter_impl(4, &mut predicate, &mut result);
-        self.filter_impl(5, &mut predicate, &mut result);
-        self.filter_impl(6, &mut predicate, &mut result);
-        self.filter_impl(7, &mut predicate, &mut result);
-        result
-    }
-
-    /// Internal implementation to be unrolled by `filter`
-    #[inline(always)]
-    fn filter_impl<P: FnMut(RegisterId) -> bool>(
-        &self,
-        id: usize,
-        predicate: &mut P,
-        result: &mut Self,
-    ) {
-        if self.reg_not_in_set[id] {
-            result.reg_not_in_set[id] = true;
-        } else if predicate(RegisterId(id as u8)) {
-            result.len += 1;
-        } else {
-            result.reg_not_in_set[id] = true;
+    pub(crate) fn from_filter<P: FnMut(RegisterId) -> bool>(mut predicate: P) -> Self {
+        let mut result: Self = Default::default();
+        for r in RegisterId::all() {
+            if predicate(r) {
+                result.0.push(r);
+            }
        }
+        result
    }

    /// Return a particular register within this set, counting from R0 to R7.
@ -127,45 +92,8 @@ impl RegisterSet {
    /// The supplied index must be less than the [`Self::len()`] of this set.
    /// Panics if the index is out of range.
    #[inline(always)]
-    pub(crate) fn index(&self, mut index: usize) -> RegisterId {
-        if let Some(result) = self.index_impl(0, &mut index) {
-            return result;
-        }
-        if let Some(result) = self.index_impl(1, &mut index) {
-            return result;
-        }
-        if let Some(result) = self.index_impl(2, &mut index) {
-            return result;
-        }
-        if let Some(result) = self.index_impl(3, &mut index) {
-            return result;
-        }
-        if let Some(result) = self.index_impl(4, &mut index) {
-            return result;
-        }
-        if let Some(result) = self.index_impl(5, &mut index) {
-            return result;
-        }
-        if let Some(result) = self.index_impl(6, &mut index) {
-            return result;
-        }
-        if let Some(result) = self.index_impl(7, &mut index) {
-            return result;
-        }
-        unreachable!();
-    }
-
-    /// Internal implementation to be unrolled by `index`
-    #[inline(always)]
-    fn index_impl(&self, id: usize, index: &mut usize) -> Option<RegisterId> {
-        if self.reg_not_in_set[id] {
-            None
-        } else if *index == 0 {
-            Some(RegisterId(id as u8))
-        } else {
-            *index -= 1;
-            None
-        }
+    pub(crate) fn index(&self, index: usize) -> RegisterId {
+        self.0[index]
    }
 }

@ -224,29 +152,3 @@ impl RegisterFile {
        [x.v0 ^ y.v0, x.v1 ^ y.v1, x.v2 ^ y.v2, x.v3 ^ y.v3]
    }
 }
-
-#[cfg(test)]
-mod test {
-    use super::RegisterSet;
-
-    #[test]
-    fn register_set() {
-        let r = RegisterSet::all().filter(|_reg| true);
-        assert_eq!(r.len(), 8);
-        assert_eq!(r.index(7).as_usize(), 7);
-        assert_eq!(r.index(0).as_usize(), 0);
-        let r = r.filter(|reg| (reg.as_usize() & 1) != 0);
-        assert_eq!(r.len(), 4);
-        assert_eq!(r.index(0).as_usize(), 1);
-        assert_eq!(r.index(1).as_usize(), 3);
-        assert_eq!(r.index(2).as_usize(), 5);
-        assert_eq!(r.index(3).as_usize(), 7);
-        let r = r.filter(|reg| (reg.as_usize() & 2) != 0);
-        assert_eq!(r.index(0).as_usize(), 3);
-        assert_eq!(r.index(1).as_usize(), 7);
-        let r = r.filter(|_reg| true);
-        assert_eq!(r.len(), 2);
-        let r = r.filter(|_reg| false);
-        assert_eq!(r.len(), 0);
-    }
-}
--- a/crates/hashx/src/scheduler.rs
+++ b/crates/hashx/src/scheduler.rs
@ -5,7 +5,7 @@
 //! avoid stalls.

 use crate::program::{Instruction, Opcode};
-use crate::register::{RegisterId, RegisterSet, NUM_REGISTERS};
+use crate::register::{RegisterId, NUM_REGISTERS};

 /// Scheduling information for each opcode
 mod model {
@ -197,10 +197,10 @@ impl Scheduler {
        }
    }

-    /// Figure out which registers will be available at or before the indicated cycle.
+    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
-    pub(crate) fn registers_available(&self, cycle: Cycle) -> RegisterSet {
-        self.data.registers_available(cycle)
+    pub(crate) fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
+        self.data.register_available(reg, cycle)
    }

    /// Return the overall data latency.
@ -323,13 +323,10 @@ impl DataSchedule {
        self.latencies[dst.as_usize()] = cycle;
    }

-    /// Figure out which registers will be available at or before the indicated cycle
+    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
-    fn registers_available(&self, cycle: Cycle) -> RegisterSet {
-        RegisterSet::all().filter(
-            #[inline(always)]
-            |reg| self.latencies[reg.as_usize()] <= cycle,
-        )
+    fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
+        self.latencies[reg.as_usize()] <= cycle
    }

    /// Return the overall latency, the [`Cycle`] at which we expect
@ -447,7 +444,7 @@ struct MicroOpPlan {
 ///
 /// This is defined as either one or two micro-operations
 /// scheduled on the same cycle.
-#[derive(Debug, Clone, Eq, PartialEq)]
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
 pub(crate) struct InstructionPlan {
    /// The Cycle this whole instruction begins on
    cycle: Cycle,