hashx: register set optimizations, 20% faster generator

I was hoping most of the program generator would get inlined, so we can
resolve a lot of the edge cases at compile-time. This patch gets us
close to that, adding many inline attrs and rewriting RegisterSet with
explicit unrolling and storage types that are easier for the optimizer
to reason about.

From the disassembly of the program generator, it's now mostly one big
function with a jump table. From callgrind instruction profiles, there
are no longer obvious hotspots in register set scanning loops. It also
looks like we're often keeping per-register schedule information all
loaded into machine registers now.

Keeping the Rng entry points non-inlined for now seems to be slightly
better, by a percent or two.

There's some work left to do in compiled programs, and maybe room for
improvement in the Program representation too. That will be in a future
patch.

Benchmark shows about 20% improvement on my machine:

generate-interp         time:   [75.440 µs 75.551 µs 75.684 µs]
                        change: [-24.083% -23.775% -23.483%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 11 outliers among 100 measurements (11.00%)
  5 (5.00%) high mild
  6 (6.00%) high severe

generate-x86_64         time:   [96.068 µs 96.273 µs 96.540 µs]
                        change: [-18.699% -18.381% -18.013%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 10 outliers among 100 measurements (10.00%)
  4 (4.00%) high mild
  6 (6.00%) high severe

Signed-off-by: Micah Elizabeth Scott <beth@torproject.org>
This commit is contained in:
Micah Elizabeth Scott 2023-07-05 12:41:05 -07:00
parent 3e8b8d035a
commit fdfe3ce55f
6 changed files with 177 additions and 64 deletions

View File

@ -35,12 +35,14 @@ mod model {
pub(super) const REQUIRED_MULTIPLIES: usize = 192;
/// Determine which ops count as a multiply when testing REQUIRED_MULTIPLIES
#[inline(always)]
pub(super) fn is_multiply(op: Opcode) -> bool {
matches!(op, Opcode::Mul | Opcode::SMulH | Opcode::UMulH)
}
/// Does an instruction prohibit using the same register for source and dest?
/// Meaningful only for ops that have both a source and destination register.
#[inline(always)]
pub(super) fn disallow_src_is_dst(op: Opcode) -> bool {
matches!(
op,
@ -60,6 +62,7 @@ mod model {
pub(super) const DISALLOW_REGISTER_FOR_ADDSHIFT: RegisterId = register::R5;
/// Should a particular pair of opcodes be rejected early?
#[inline(always)]
pub(super) fn disallow_opcode_pair(previous: Opcode, proposed: Opcode) -> bool {
match proposed {
// Never rejected at this stage
@ -77,6 +80,7 @@ mod model {
/// Constraints for pairs of instructions that would be writing to the same
/// destination
#[inline(always)]
pub(super) fn writer_pair_allowed(
pass: Pass,
last_writer: Option<&RegisterWriter>,
@ -159,6 +163,7 @@ pub(crate) struct Validator {
impl Validator {
/// Construct a new empty Validator
#[inline(always)]
pub(crate) fn new() -> Self {
Self {
writer_map: RegisterWriterMap::new(),
@ -167,6 +172,7 @@ impl Validator {
}
/// Commit a new instruction to the validator state
#[inline(always)]
pub(crate) fn commit_instruction(&mut self, inst: &Instruction, regw: Option<RegisterWriter>) {
if model::is_multiply(inst.opcode()) {
self.multiply_count += 1;
@ -182,6 +188,7 @@ impl Validator {
/// Once the whole program is assembled, HashX still has a chance to reject
/// it if it fails certain criteria.
#[inline(always)]
pub(crate) fn check_whole_program(
&self,
scheduler: &Scheduler,
@ -199,6 +206,7 @@ impl Validator {
/// Figure out the allowed set of destination registers for an op after its
/// source is known, using the current state of the validator.
#[inline(always)]
pub(crate) fn dst_registers_allowed(
&self,
available: RegisterSet,
@ -207,27 +215,31 @@ impl Validator {
writer_info: &RegisterWriter,
src: Option<RegisterId>,
) -> RegisterSet {
available.filter(|dst| {
// One register specified by DISALLOW_REGISTER_FOR_ADDSHIFT can't
// be used as destination for AddShift.
if op == Opcode::AddShift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT {
return false;
}
available.filter(
#[inline(always)]
|dst| {
// One register specified by DISALLOW_REGISTER_FOR_ADDSHIFT can't
// be used as destination for AddShift.
if op == Opcode::AddShift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT {
return false;
}
// A few instructions disallow choosing src and dst as the same
if model::disallow_src_is_dst(op) && src == Some(dst) {
return false;
}
// A few instructions disallow choosing src and dst as the same
if model::disallow_src_is_dst(op) && src == Some(dst) {
return false;
}
// Additional constraints are written on the pair of previous and
// current instructions with the same destination.
model::writer_pair_allowed(pass, self.writer_map.get(dst), writer_info)
})
// Additional constraints are written on the pair of previous and
// current instructions with the same destination.
model::writer_pair_allowed(pass, self.writer_map.get(dst), writer_info)
},
)
}
}
/// Figure out the allowed register set for an operation, given what's available
/// in the schedule
#[inline(always)]
pub(crate) fn src_registers_allowed(available: RegisterSet, op: Opcode) -> RegisterSet {
// HashX defines a special case DISALLOW_REGISTER_FOR_ADDSHIFT for
// destination registers, and it also includes a look-ahead
@ -241,7 +253,10 @@ pub(crate) fn src_registers_allowed(available: RegisterSet, op: Opcode) -> Regis
&& available.contains(model::DISALLOW_REGISTER_FOR_ADDSHIFT)
&& available.len() == 2
{
available.filter(|reg| reg == model::DISALLOW_REGISTER_FOR_ADDSHIFT)
available.filter(
#[inline(always)]
|reg| reg == model::DISALLOW_REGISTER_FOR_ADDSHIFT,
)
} else {
available
}
@ -249,6 +264,7 @@ pub(crate) fn src_registers_allowed(available: RegisterSet, op: Opcode) -> Regis
/// Some pairs of adjacent [`Opcode`]s are rejected at the opcode selector level
/// without causing an entire instruction generation pass to fail.
#[inline(always)]
pub(crate) fn opcode_pair_allowed(previous: Option<Opcode>, proposed: Opcode) -> Result<(), ()> {
match previous {
None => Ok(()),
@ -273,6 +289,7 @@ struct RegisterWriterMap {
impl RegisterWriterMap {
/// A new empty register writer map. All registers are set to None
#[inline(always)]
fn new() -> Self {
Self {
regs: [None; NUM_REGISTERS],
@ -281,6 +298,7 @@ impl RegisterWriterMap {
}
/// Write or overwrite the last [`RegisterWriter`] associated with `reg`
#[inline(always)]
fn insert(&mut self, reg: RegisterId, writer: RegisterWriter) {
let previous = self.regs[reg.as_usize()];
match previous {
@ -300,6 +318,7 @@ impl RegisterWriterMap {
}
/// Return the most recent mapping for 'reg', if any
#[inline(always)]
fn get(&self, reg: RegisterId) -> Option<&RegisterWriter> {
self.regs[reg.as_usize()].map(|index| &self.writers[index as usize])
}

View File

@ -31,6 +31,7 @@ mod model {
/// Normal cycles are all replaced by `ImmediateSrc` if this is the retry
/// pass, so that retries won't need to attempt source register selection
/// in this case.
#[inline(always)]
pub(super) fn choose_opcode_selector(pass: Pass, sub_cycle: SubCycle) -> OpcodeSelector {
let n = sub_cycle.as_usize() % 36;
if n == 1 {
@ -107,6 +108,7 @@ struct Generator {
impl Generator {
/// Create a fresh program generator from the corresponding siphash state
#[inline(always)]
fn new(key: siphash::State) -> Self {
Generator {
rng: siphash::Rng::new(key),
@ -119,6 +121,7 @@ impl Generator {
/// Pick a pseudorandom register from a RegisterSet, or return `Err(())`
/// if the set is empty. Consumes one `u32` from the `Rng` only if the set
/// contains more than one item.
#[inline(always)]
fn select_register(&mut self, reg_options: RegisterSet) -> Result<RegisterId, ()> {
match reg_options.len() {
0 => Err(()),
@ -138,6 +141,7 @@ impl Generator {
}
/// Pick a pseudorandom operation from a list of at least two options
#[inline(always)]
fn select_op<'a, T, const SIZE: usize>(&mut self, options: &'a [T; SIZE]) -> &'a T {
&options[(self.rng.next_u8() as usize) % options.len()]
}
@ -145,6 +149,7 @@ impl Generator {
/// Generate a random u32 bit mask, with a constant number of bits set.
/// This uses an iterative algorithm that selects one bit at a time
/// using a u8 from the Rng for each, discarding duplicates.
#[inline(always)]
fn select_constant_weight_bit_mask(&mut self, num_ones: usize) -> u32 {
let mut result = 0_u32;
let mut count = 0;
@ -160,6 +165,7 @@ impl Generator {
/// Generate random nonzero values, by iteratively picking a random u32,
/// masking it, and discarding results that would be all zero.
#[inline(always)]
fn select_nonzero_u32(&mut self, mask: u32) -> u32 {
loop {
let result = self.rng.next_u32() & mask;
@ -173,6 +179,7 @@ impl Generator {
/// state can't be advanced any further. Returns with
/// [`Error::ProgramConstraints`] if the program fails the HashX
/// whole-program checks.
#[inline(always)]
fn generate_program(&mut self) -> Result<Program, Error> {
let mut array: InstructionArray = Default::default();
while array.len() < array.capacity() {
@ -203,6 +210,7 @@ impl Generator {
///
/// This only returns `Err(())` if we've hit a stopping condition for the
/// program.
#[inline(always)]
fn generate_instruction(&mut self) -> Result<(Instruction, Option<RegisterWriter>), ()> {
loop {
if let Ok(result) = self.instruction_gen_attempt(Pass::Original) {
@ -217,6 +225,7 @@ impl Generator {
/// Choose an opcode using the current [`OpcodeSelector`], subject to
/// stateful constraints on adjacent opcode choices.
#[inline(always)]
fn choose_opcode(&mut self, pass: Pass) -> Opcode {
let op = loop {
let sub_cycle = self.scheduler.instruction_stream_sub_cycle();
@ -233,6 +242,7 @@ impl Generator {
/// [`OpcodeSelector`], chooses an opcode, then finishes choosing the
/// opcode-specific parts of the instruction. Each of these choices affects
/// the [`siphash::Rng`] state, and may fail if conditions are not met.
#[inline(always)]
fn instruction_gen_attempt(
&mut self,
pass: Pass,
@ -247,6 +257,7 @@ impl Generator {
/// Choose both a source and destination register using a normal
/// [`RegisterWriter`] for two-operand instructions
#[inline(always)]
fn choose_src_dst_regs(
&mut self,
op: Opcode,
@ -269,6 +280,7 @@ impl Generator {
/// Choose both a source and destination register, with a custom
/// [`RegisterWriter`] constraint that doesn't depend on source
/// register choice.
#[inline(always)]
fn choose_src_dst_regs_with_writer_info(
&mut self,
op: Opcode,
@ -289,6 +301,7 @@ impl Generator {
}
/// Choose a destination register only
#[inline(always)]
fn choose_dst_reg(
&mut self,
op: Opcode,
@ -309,6 +322,7 @@ impl Generator {
/// With an [`Opcode`] and an execution unit timing plan already in mind,
/// generate the other pieces necessary to fully describe an
/// [`Instruction`]. This can fail if register selection fails.
#[inline(always)]
fn choose_instruction_with_opcode_plan(
&mut self,
op: Opcode,
@ -394,6 +408,7 @@ impl Generator {
/// that's for certain being written to the final program. Returns `Ok(())`
/// on success or `Err(())` if the new state is no longer valid for
/// program generation and we're done writing code.
#[inline(always)]
fn commit_instruction_state(
&mut self,
inst: &Instruction,
@ -425,6 +440,7 @@ enum OpcodeSelector {
impl OpcodeSelector {
/// Apply the selector, advancing the Rng state and returning an Opcode
#[inline(always)]
fn apply(&self, gen: &mut Generator) -> Opcode {
match self {
OpcodeSelector::Target => Opcode::Target,

View File

@ -143,6 +143,7 @@ pub(crate) enum Opcode {
impl Instruction {
/// Get this instruction's [`Opcode`]
#[inline(always)]
pub(crate) fn opcode(&self) -> Opcode {
match self {
Instruction::AddConst { .. } => Opcode::AddConst,
@ -160,6 +161,7 @@ impl Instruction {
}
/// Get this instruction's destination register if any
#[inline(always)]
pub(crate) fn destination(&self) -> Option<RegisterId> {
match self {
Instruction::AddConst { dst, .. } => Some(*dst),
@ -203,6 +205,7 @@ impl fmt::Debug for Program {
impl Program {
/// Construct a finished `Program` from a list of instructions
#[inline(always)]
pub(crate) fn new(instructions: InstructionArray) -> Self {
Self { instructions }
}
@ -303,6 +306,7 @@ impl Program {
}
impl<'a> From<&'a Program> for &'a InstructionArray {
#[inline(always)]
fn from(prog: &'a Program) -> Self {
&prog.instructions
}

View File

@ -27,26 +27,21 @@ impl RegisterId {
pub(crate) fn as_usize(&self) -> usize {
self.0 as usize
}
/// Convert a usize into a RegisterId. Panics if out of range.
///
/// This is only available within the module, so we can implement
/// RegisterSet. The public interface to RegisterId does not allow
/// creating new instances of specific registers.
fn from_usize(n: usize) -> Self {
assert!(n < NUM_REGISTERS);
Self(
n.try_into()
.expect("register ID type wide enough for register file"),
)
}
}
/// Identify a set of RegisterIds
///
/// This could be done compactly as a u8 bitfield for storage purposes, but
/// in our program generator this is never stored long-term. Instead, we want
/// something the optimizer can reason about as effectively as possible, and
/// let's inline as much as possible in order to resolve special cases in
/// the program generator at compile time.
#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct RegisterSet {
/// Bit field, in LSB-first order, tracking which registers are in the set
bits: u8,
/// Number of registers in the set
len: usize,
/// Array indexed by register Id, indicating registers we've excluded
reg_not_in_set: [bool; 8],
}
impl fmt::Debug for RegisterSet {
@ -67,61 +62,104 @@ impl RegisterSet {
///
/// This is the main way to construct a new RegisterId, starting with
/// all available registers and filtering them repeatedly.
#[inline(always)]
pub(crate) fn all() -> Self {
Self {
bits: ((1_usize << NUM_REGISTERS) - 1)
.try_into()
.expect("register set is wide enough to hold all registers"),
len: NUM_REGISTERS,
reg_not_in_set: Default::default(),
}
}
/// Number of registers still contained in this set
#[inline(always)]
pub(crate) fn len(&self) -> usize {
self.bits
.count_ones()
.try_into()
.expect("register set length always fits in usize")
self.len
}
/// Test if a register is contained in the set
#[inline(always)]
pub(crate) fn contains(&self, id: RegisterId) -> bool {
(self.bits & (1 << id.0)) != 0
!self.reg_not_in_set[id.0 as usize]
}
/// Filter this register set through a predicate. Invokes the predicate only
/// for registers in this set, and returns the set of registers for which it
/// returned true.
#[inline(always)]
pub(crate) fn filter<P: FnMut(RegisterId) -> bool>(&self, mut predicate: P) -> Self {
let mut shift = 0;
let mut result = *self;
loop {
if result.bits == 0 {
break;
}
shift += result.bits.wrapping_shr(shift as _).trailing_zeros() as usize;
if shift >= NUM_REGISTERS {
break;
}
if !predicate(RegisterId::from_usize(shift)) {
result.bits &= !(1 << shift);
}
shift += 1;
}
let mut result = Self {
len: 0,
reg_not_in_set: Default::default(),
};
self.filter_impl(0, &mut predicate, &mut result);
self.filter_impl(1, &mut predicate, &mut result);
self.filter_impl(2, &mut predicate, &mut result);
self.filter_impl(3, &mut predicate, &mut result);
self.filter_impl(4, &mut predicate, &mut result);
self.filter_impl(5, &mut predicate, &mut result);
self.filter_impl(6, &mut predicate, &mut result);
self.filter_impl(7, &mut predicate, &mut result);
result
}
/// Internal implementation to be unrolled by `filter`
#[inline(always)]
fn filter_impl<P: FnMut(RegisterId) -> bool>(
&self,
id: usize,
predicate: &mut P,
result: &mut Self,
) {
if self.reg_not_in_set[id] {
result.reg_not_in_set[id] = true;
} else if predicate(RegisterId(id as u8)) {
result.len += 1;
} else {
result.reg_not_in_set[id] = true;
}
}
/// Return a particular register within this set, counting from R0 to R7.
/// The supplied index must be less than the len() of this set.
pub(crate) fn index(&self, mut idx: usize) -> RegisterId {
let mut shift = 0;
loop {
shift += (self.bits >> shift).trailing_zeros() as usize;
assert!(shift < NUM_REGISTERS);
if idx == 0 {
return RegisterId::from_usize(shift);
}
idx -= 1;
shift += 1;
#[inline(always)]
pub(crate) fn index(&self, mut index: usize) -> RegisterId {
if let Some(result) = self.index_impl(0, &mut index) {
return result;
}
if let Some(result) = self.index_impl(1, &mut index) {
return result;
}
if let Some(result) = self.index_impl(2, &mut index) {
return result;
}
if let Some(result) = self.index_impl(3, &mut index) {
return result;
}
if let Some(result) = self.index_impl(4, &mut index) {
return result;
}
if let Some(result) = self.index_impl(5, &mut index) {
return result;
}
if let Some(result) = self.index_impl(6, &mut index) {
return result;
}
if let Some(result) = self.index_impl(7, &mut index) {
return result;
}
unreachable!();
}
/// Internal implementation to be unrolled by `index`
#[inline(always)]
fn index_impl(&self, id: usize, index: &mut usize) -> Option<RegisterId> {
if self.reg_not_in_set[id] {
None
} else if *index == 0 {
Some(RegisterId(id as u8))
} else {
*index -= 1;
None
}
}
}

View File

@ -50,6 +50,7 @@ mod model {
const MAX_LATENCY: usize = 4;
/// Latency for each operation, in cycles
#[inline(always)]
pub(super) fn instruction_latency_cycles(op: Opcode) -> usize {
match op {
Opcode::AddConst => 1,
@ -67,6 +68,7 @@ mod model {
}
/// Break an instruction down into one or two micro-operation port sets
#[inline(always)]
pub(super) fn micro_operations(op: Opcode) -> (ExecPorts, Option<ExecPorts>) {
match op {
Opcode::AddConst { .. } => (P015, None),
@ -85,6 +87,7 @@ mod model {
/// Each instruction advances the earliest possible issuing cycle by one
/// sub-cycle per micro-op.
#[inline(always)]
pub(super) fn instruction_sub_cycle_count(op: Opcode) -> usize {
match micro_operations(op) {
(_, None) => 1,
@ -120,23 +123,27 @@ pub(crate) struct Scheduler {
impl Scheduler {
/// Create a new empty execution schedule at cycle 0
#[inline(always)]
pub(crate) fn new() -> Self {
Default::default()
}
/// Stall for one cycle, used when register allocation fails.
/// Returns Ok if we had enough time, or Err if we ran out.
#[inline(always)]
pub(crate) fn stall(&mut self) -> Result<(), ()> {
self.advance(SubCycle::PER_CYCLE as usize)
}
/// Return the current instruction fetch/decode timestamp in sub-cycles
#[inline(always)]
pub(crate) fn instruction_stream_sub_cycle(&self) -> SubCycle {
self.sub_cycle
}
/// Advance forward in time by some number of sub-cycles. Stops just before
/// reaching the target cycle, where we stop scheduling new instructions.
#[inline(always)]
fn advance(&mut self, n: usize) -> Result<(), ()> {
let sub_cycle = self.sub_cycle.add_usize(n)?;
let cycle = sub_cycle.cycle();
@ -152,12 +159,14 @@ impl Scheduler {
/// Advance time forward by the modeled duration of the instruction fetch
/// and decode, in sub-cycles. Returns Ok if we still have enough time to
/// schedule more, or Err if the schedule would be full.
#[inline(always)]
pub(crate) fn advance_instruction_stream(&mut self, op: Opcode) -> Result<(), ()> {
self.advance(model::instruction_sub_cycle_count(op))
}
/// Calculate a timing plan describing the cycle and execution units
/// on which a particular opcode could run, at the earliest.
#[inline(always)]
pub(crate) fn instruction_plan(&self, op: Opcode) -> Result<InstructionPlan, ()> {
self.exec.instruction_plan(self.cycle, op)
}
@ -166,6 +175,7 @@ impl Scheduler {
/// concrete Instruction instance. Marks as busy each execution unit cycle
/// specified, and updates the register latency for the instruction's
/// destination if it has one.
#[inline(always)]
pub(crate) fn commit_instruction_plan(&mut self, plan: &InstructionPlan, inst: &Instruction) {
self.exec.mark_instruction_busy(plan);
if let Some(dst) = inst.destination() {
@ -175,12 +185,14 @@ impl Scheduler {
}
/// Figure out which registers will be available at or before the indicated cycle
#[inline(always)]
pub(crate) fn registers_available(&self, cycle: Cycle) -> RegisterSet {
self.data.registers_available(cycle)
}
/// Return the overall latency, the Cycle at which we expect every register
/// to reach its latest simulated state.
#[inline(always)]
pub(crate) fn overall_latency(&self) -> Cycle {
self.data.overall_latency()
}
@ -195,6 +207,7 @@ pub(crate) struct Cycle(u8);
impl Cycle {
/// HashX will stop generating code once the issue cycle reaches this target
#[inline(always)]
fn target() -> Self {
Cycle(
model::TARGET_CYCLES
@ -204,11 +217,13 @@ impl Cycle {
}
/// Cast this Cycle count to a usize losslessly
#[inline(always)]
pub(crate) fn as_usize(&self) -> usize {
self.0.into()
}
/// Add an integer number of cycles, returning Err(()) if we reach the end
#[inline(always)]
fn add_usize(&self, n: usize) -> Result<Self, ()> {
let result = self.as_usize() + n;
if result < model::SCHEDULE_SIZE {
@ -238,11 +253,13 @@ impl SubCycle {
const MAX: Self = SubCycle(model::SCHEDULE_SIZE as u16 * Self::PER_CYCLE - 1);
/// Cast this sub-cycle count into a usize losslessly
#[inline(always)]
pub(crate) fn as_usize(&self) -> usize {
self.0.into()
}
/// Convert this sub-cycle into a full Cycle timestamp
#[inline(always)]
fn cycle(&self) -> Cycle {
Cycle(
(self.0 / Self::PER_CYCLE)
@ -255,6 +272,7 @@ impl SubCycle {
///
/// Returns the new advanced [`SubCycle`], or `Err(())`
/// if we reach the end of the schedule.
#[inline(always)]
fn add_usize(&self, n: usize) -> Result<Self, ()> {
let result = self.as_usize() + n;
if result < Self::MAX.0.into() {
@ -285,17 +303,23 @@ struct DataSchedule {
impl DataSchedule {
/// Plan to finish a register write at the indicated cycle
#[inline(always)]
fn plan_register_write(&mut self, dst: RegisterId, cycle: Cycle) {
self.latencies[dst.as_usize()] = cycle;
}
/// Figure out which registers will be available at or before the indicated cycle
#[inline(always)]
fn registers_available(&self, cycle: Cycle) -> RegisterSet {
RegisterSet::all().filter(|reg| self.latencies[reg.as_usize()] <= cycle)
RegisterSet::all().filter(
#[inline(always)]
|reg| self.latencies[reg.as_usize()] <= cycle,
)
}
/// Return the overall latency, the [`Cycle`] at which we expect
/// every register to reach its latest simulated state.
#[inline(always)]
fn overall_latency(&self) -> Cycle {
match self.latencies.iter().max() {
Some(cycle) => *cycle,
@ -320,6 +344,7 @@ impl ExecSchedule {
///
/// HashX always searches execution ports in the same order, and it will
/// look ahead up to the entire length of the schedule before failing.
#[inline(always)]
fn micro_plan(&self, begin: Cycle, ports: model::ExecPorts) -> Result<MicroOpPlan, ()> {
let mut cycle = begin;
loop {
@ -338,6 +363,7 @@ impl ExecSchedule {
}
/// Mark the schedule busy according to a previously calculated plan
#[inline(always)]
fn mark_micro_busy(&mut self, plan: MicroOpPlan) {
self.ports[plan.port.index as usize]
.busy
@ -347,6 +373,7 @@ impl ExecSchedule {
/// Calculate a timing plan describing the cycle and execution units
/// we could use for scheduling one entire instruction, based on its Opcode
/// but not its arguments.
#[inline(always)]
fn instruction_plan(&self, begin: Cycle, op: Opcode) -> Result<InstructionPlan, ()> {
match model::micro_operations(op) {
// Single-op instructions
@ -377,6 +404,7 @@ impl ExecSchedule {
}
/// Mark each micro-op in an InstructionPlan as busy in the schedule
#[inline(always)]
fn mark_instruction_busy(&mut self, plan: &InstructionPlan) {
let (first, second) = plan.as_micro_plans();
self.mark_micro_busy(first);
@ -414,11 +442,13 @@ pub(crate) struct InstructionPlan {
impl InstructionPlan {
/// Get the Cycle this whole instruction begins on
#[inline(always)]
pub(crate) fn cycle_issued(&self) -> Cycle {
self.cycle
}
/// Calculate the cycle this instruction is completed by
#[inline(always)]
pub(crate) fn cycle_retired(&self, op: Opcode) -> Cycle {
self.cycle
.add_usize(model::instruction_latency_cycles(op))
@ -426,6 +456,7 @@ impl InstructionPlan {
}
/// Convert this InstructionPlan back to one or two MicroOp plans
#[inline(always)]
fn as_micro_plans(&self) -> (MicroOpPlan, Option<MicroOpPlan>) {
(
MicroOpPlan {
@ -443,6 +474,7 @@ impl InstructionPlan {
/// if they are on matching cycles.
///
/// Returns `Err(())` if the combination is not possible.
#[inline(always)]
fn from_micro_plans(first_op: MicroOpPlan, second_op: Option<MicroOpPlan>) -> Result<Self, ()> {
let second_port = match second_op {
None => None,

View File

@ -59,6 +59,7 @@ impl Rng {
///
/// The internal SipHash1,3 generator is initialized to a supplied
/// internal state, and the counter is reset to zero.
#[inline(always)]
pub(crate) fn new(key: State) -> Self {
Rng {
key,
@ -124,6 +125,7 @@ where
const ITEM_MASK: u64 = (1 << Self::BITS_PER_ITEM) - 1;
/// Construct a new shift register buffer containing no data bits
#[inline(always)]
fn new() -> Self {
Self {
word: 0,
@ -143,6 +145,7 @@ where
/// Refill the buffer and remove the most significant item.
/// Buffer must be empty.
#[inline(always)]
fn refill_and_pop(&mut self, word: u64) -> T {
assert!(self.remaining == 0);
self.word = word;
@ -151,6 +154,7 @@ where
}
/// Remove the most significant item in this buffer
#[inline(always)]
fn pop(&mut self) -> Option<T> {
if self.remaining >= Self::BITS_PER_ITEM {
self.remaining -= Self::BITS_PER_ITEM;