From e142fd988283925660ab109fed3a1d92a36fd0b5 Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott Date: Wed, 16 Aug 2023 15:54:10 -0700 Subject: [PATCH 1/7] hashx: new RegisterWriter format handles more cases transparently There was a special case in writer_pair_allowed for making add and subtract equivalent. This patch changes RegisterWriter's encoding, using per-opcode variants instead of per-format variants. The Add/Sub merge can now happen earlier, when RegisterWriter is constructed. Before and after RegisterWriter sizes are the same, at 8 bytes. This patch removes many uses of Option in favor of using a new RegisterWriter::None default, and passes by value rather than by reference. Wallclock time benchmarks: generate-interp improves, -7.5% generate-x86_64 improves, -5.3% Cachegrind benchmarks: generate_interp_1000x, negligible change in total instructions, improvement in cache footprint: -22.8% L2 accesses Signed-off-by: Micah Elizabeth Scott --- crates/hashx/src/constraints.rs | 117 ++++++++++++++++++-------------- crates/hashx/src/generator.rs | 88 ++++++++++++------------ crates/hashx/src/scheduler.rs | 6 +- 3 files changed, 112 insertions(+), 99 deletions(-) diff --git a/crates/hashx/src/constraints.rs b/crates/hashx/src/constraints.rs index b28d19d66..297b87f78 100644 --- a/crates/hashx/src/constraints.rs +++ b/crates/hashx/src/constraints.rs @@ -85,31 +85,19 @@ mod model { #[inline(always)] pub(super) fn writer_pair_allowed( pass: Pass, - last_writer: Option<&RegisterWriter>, - this_writer: &RegisterWriter, + last_writer: RegisterWriter, + this_writer: RegisterWriter, ) -> bool { match (last_writer, this_writer) { // HashX disallows back-to-back 64-bit multiplies on the // same destination register in Pass::Original but permits // them on the retry if the source register isn't identical. - ( - Some(RegisterWriter::RegSource(Opcode::Mul, _)), - RegisterWriter::RegSource(Opcode::Mul, _), - ) if matches!(pass, Pass::Original) => false, - - // Add/Sub from the same source register can't be paired - // with each other. (They might cancel out) - ( - Some(RegisterWriter::RegSource(Opcode::AddShift, last_src)), - RegisterWriter::RegSource(Opcode::Sub, this_src), - ) if this_src == last_src => false, - ( - Some(RegisterWriter::RegSource(Opcode::Sub, last_src)), - RegisterWriter::RegSource(Opcode::AddShift, this_src), - ) if this_src == last_src => false, + (RegisterWriter::Mul(_), RegisterWriter::Mul(_)) if matches!(pass, Pass::Original) => { + false + } // Other pairings are allowed if the writer info differs at all. - (last_writer, this_writer) => last_writer != Some(this_writer), + (last_writer, this_writer) => last_writer != this_writer, } } @@ -132,29 +120,57 @@ mod model { /// This is conceptually similar to storing the last [`super::Instruction`] /// that wrote to a register, but HashX sometimes needs information for /// constraints which won't end up in the final `Instruction`. - #[derive(Debug, Clone, Eq, PartialEq)] + /// + /// We've chosen the encoding to minimize the code size in + /// writer_pair_allowed. Most pairwise comparisons can just be a register + /// equality test. + /// + /// The instructions here fall into three categories which use their own + /// format for encoding arguments: + /// + /// - Wide Multiply, extra u32 + /// + /// UMulH and SMulH use an additional otherwise unused 32-bit value + /// from the Rng when considering writer collisions. + /// + /// As far as I can tell this is a bug in the original implementation + /// but we can't change the behavior without breaking compatibility. + /// + /// The collisions are rare enough not to be a worthwhile addition + /// to ASIC-resistance. It seems like this was a vestigial feature + /// left over from immediate value matching features which were removed + /// during the development of HashX, but I can't be sure. + /// + /// - Constant source + /// + /// Only considers the opcode itself, not the specific immediate value. + /// + /// - Register source + /// + /// Considers the source register, collapses add/subtract into one op. + /// + #[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] pub(crate) enum RegisterWriter { - /// Special format for wide multiply - /// - /// HashX includes an otherwise unused phantom immediate value which - /// can (very rarely) affect constraint selection if it collides. - /// - /// As far as I can tell this is a bug in the original implementation - /// but we can't change the behavior without breaking compatibility. - /// - /// The collisions are rare enough not to be a worthwhile addition - /// to ASIC-resistance. It seems like this was a vestigial feature - /// left over from immediate value matching features which were removed - /// during the development of HashX, but I can't be sure. - WideMul(Opcode, u32), - - /// Writer for instructions with an immediate source - /// - /// The specific immediate value is not used. - ConstSource(Opcode), - - /// Writer for instructions with register source, unique by source register - RegSource(Opcode, RegisterId), + /// Register not written yet + #[default] + None, + /// Register source writer for [`super::Instruction::Mul`] + Mul(RegisterId), + /// Wide multiply writer for [`super::Instruction::UMulH`] + UMulH(u32), + /// Wide multiply writer for [`super::Instruction::SMulH`] + SMulH(u32), + /// Register source writer for [`super::Instruction::AddShift`] + /// and [`super::Instruction::Sub`] + AddSub(RegisterId), + /// Constant source writer for [`super::Instruction::AddConst`] + AddConst, + /// Register source writer for [`super::Instruction::Xor`] + Xor(RegisterId), + /// Constant source writer for [`super::Instruction::XorConst`] + XorConst, + /// Constant source writer for [`super::Instruction::Rotate`] + Rotate, } } @@ -187,16 +203,13 @@ impl Validator { /// Commit a new instruction to the validator state. #[inline(always)] - pub(crate) fn commit_instruction(&mut self, inst: &Instruction, regw: Option) { + pub(crate) fn commit_instruction(&mut self, inst: &Instruction, regw: RegisterWriter) { if model::is_multiply(inst.opcode()) { self.multiply_count += 1; } match inst.destination() { - None => assert!(regw.is_none()), - Some(dst) => self.writer_map.insert( - dst, - regw.expect("instructions with destination always have a RegisterWriter"), - ), + None => debug_assert_eq!(regw, RegisterWriter::None), + Some(dst) => self.writer_map.insert(dst, regw), } } @@ -228,7 +241,7 @@ impl Validator { available: RegisterSet, op: Opcode, pass: Pass, - writer_info: &RegisterWriter, + writer_info: RegisterWriter, src: Option, ) -> RegisterSet { available.filter( @@ -297,9 +310,9 @@ pub(crate) fn opcode_pair_allowed(previous: Option, proposed: Opcode) -> } } -/// Map each [`RegisterId`] to an [`Option`] +/// Map each [`RegisterId`] to an [`RegisterWriter`] #[derive(Default, Debug, Clone)] -struct RegisterWriterMap([Option; NUM_REGISTERS]); +struct RegisterWriterMap([RegisterWriter; NUM_REGISTERS]); impl RegisterWriterMap { /// Make a new empty register writer map. @@ -313,12 +326,12 @@ impl RegisterWriterMap { /// Write or overwrite the last [`RegisterWriter`] associated with `reg`. #[inline(always)] fn insert(&mut self, reg: RegisterId, writer: RegisterWriter) { - self.0[reg.as_usize()] = Some(writer); + self.0[reg.as_usize()] = writer; } /// Return the most recent mapping for 'reg', if any. #[inline(always)] - fn get(&self, reg: RegisterId) -> Option<&RegisterWriter> { - self.0[reg.as_usize()].as_ref() + fn get(&self, reg: RegisterId) -> RegisterWriter { + self.0[reg.as_usize()] } } diff --git a/crates/hashx/src/generator.rs b/crates/hashx/src/generator.rs index 568346668..e2c59e302 100644 --- a/crates/hashx/src/generator.rs +++ b/crates/hashx/src/generator.rs @@ -235,7 +235,7 @@ impl<'r, R: RngCore> Generator<'r, R> { /// This only returns `Err(())` if we've hit a stopping condition for the /// program. #[inline(always)] - fn generate_instruction(&mut self) -> Result<(Instruction, Option), ()> { + fn generate_instruction(&mut self) -> Result<(Instruction, RegisterWriter), ()> { loop { if let Ok(result) = self.instruction_gen_attempt(Pass::Original) { return Ok(result); @@ -268,15 +268,12 @@ impl<'r, R: RngCore> Generator<'r, R> { /// choosing the opcode-specific parts of the instruction. Each of these /// choices affects the Rng state, and may fail if conditions are not met. #[inline(always)] - fn instruction_gen_attempt( - &mut self, - pass: Pass, - ) -> Result<(Instruction, Option), ()> { + fn instruction_gen_attempt(&mut self, pass: Pass) -> Result<(Instruction, RegisterWriter), ()> { let op = self.choose_opcode(pass); let plan = self.scheduler.instruction_plan(op)?; - let (inst, regw) = self.choose_instruction_with_opcode_plan(op, pass, &plan)?; + let (inst, regw) = self.choose_instruction_with_opcode_plan(op, pass, plan)?; assert_eq!(inst.opcode(), op); - self.scheduler.commit_instruction_plan(&plan, &inst); + self.scheduler.commit_instruction_plan(plan, &inst); Ok((inst, regw)) } @@ -287,17 +284,18 @@ impl<'r, R: RngCore> Generator<'r, R> { &mut self, op: Opcode, pass: Pass, - timing_plan: &InstructionPlan, + writer_info_fn: fn(RegisterId) -> RegisterWriter, + timing_plan: InstructionPlan, ) -> Result<(RegisterId, RegisterId, RegisterWriter), ()> { let avail_set = self .scheduler .registers_available(timing_plan.cycle_issued()); let src_set = constraints::src_registers_allowed(avail_set, op); let src = self.select_register(src_set)?; - let writer_info = RegisterWriter::RegSource(op, src); + let writer_info = writer_info_fn(src); let dst_set = self.validator - .dst_registers_allowed(avail_set, op, pass, &writer_info, Some(src)); + .dst_registers_allowed(avail_set, op, pass, writer_info, Some(src)); let dst = self.select_register(dst_set)?; Ok((src, dst, writer_info)) } @@ -310,8 +308,8 @@ impl<'r, R: RngCore> Generator<'r, R> { &mut self, op: Opcode, pass: Pass, - writer_info: &RegisterWriter, - timing_plan: &InstructionPlan, + writer_info: RegisterWriter, + timing_plan: InstructionPlan, ) -> Result<(RegisterId, RegisterId), ()> { let avail_set = self .scheduler @@ -331,8 +329,8 @@ impl<'r, R: RngCore> Generator<'r, R> { &mut self, op: Opcode, pass: Pass, - writer_info: &RegisterWriter, - timing_plan: &InstructionPlan, + writer_info: RegisterWriter, + timing_plan: InstructionPlan, ) -> Result { let avail_set = self .scheduler @@ -354,79 +352,81 @@ impl<'r, R: RngCore> Generator<'r, R> { &mut self, op: Opcode, pass: Pass, - plan: &InstructionPlan, - ) -> Result<(Instruction, Option), ()> { + plan: InstructionPlan, + ) -> Result<(Instruction, RegisterWriter), ()> { Ok(match op { - Opcode::Target => (Instruction::Target, None), + Opcode::Target => (Instruction::Target, RegisterWriter::None), Opcode::Branch => ( Instruction::Branch { mask: self.select_constant_weight_bit_mask(model::BRANCH_MASK_BIT_WEIGHT), }, - None, + RegisterWriter::None, ), Opcode::UMulH => { - let regw = RegisterWriter::WideMul(op, self.rng.next_u32()); - let (src, dst) = - self.choose_src_dst_regs_with_writer_info(op, pass, ®w, plan)?; - (Instruction::UMulH { src, dst }, Some(regw)) + let regw = RegisterWriter::UMulH(self.rng.next_u32()); + let (src, dst) = self.choose_src_dst_regs_with_writer_info(op, pass, regw, plan)?; + (Instruction::UMulH { src, dst }, regw) } Opcode::SMulH => { - let regw = RegisterWriter::WideMul(op, self.rng.next_u32()); - let (src, dst) = - self.choose_src_dst_regs_with_writer_info(op, pass, ®w, plan)?; - (Instruction::SMulH { src, dst }, Some(regw)) + let regw = RegisterWriter::SMulH(self.rng.next_u32()); + let (src, dst) = self.choose_src_dst_regs_with_writer_info(op, pass, regw, plan)?; + (Instruction::SMulH { src, dst }, regw) } Opcode::Mul => { - let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?; - (Instruction::Mul { src, dst }, Some(regw)) + let regw = RegisterWriter::Mul; + let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?; + (Instruction::Mul { src, dst }, regw) } Opcode::Sub => { - let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?; - (Instruction::Sub { src, dst }, Some(regw)) + let regw = RegisterWriter::AddSub; + let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?; + (Instruction::Sub { src, dst }, regw) } Opcode::Xor => { - let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?; - (Instruction::Xor { src, dst }, Some(regw)) + let regw = RegisterWriter::Xor; + let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?; + (Instruction::Xor { src, dst }, regw) } Opcode::AddShift => { + let regw = RegisterWriter::AddSub; let left_shift = (self.rng.next_u32() & 3) as u8; - let (src, dst, regw) = self.choose_src_dst_regs(op, pass, plan)?; + let (src, dst, regw) = self.choose_src_dst_regs(op, pass, regw, plan)?; ( Instruction::AddShift { src, dst, left_shift, }, - Some(regw), + regw, ) } Opcode::AddConst => { - let regw = RegisterWriter::ConstSource(op); + let regw = RegisterWriter::AddConst; let src = self.select_nonzero_u32(u32::MAX) as i32; - let dst = self.choose_dst_reg(op, pass, ®w, plan)?; - (Instruction::AddConst { src, dst }, Some(regw)) + let dst = self.choose_dst_reg(op, pass, regw, plan)?; + (Instruction::AddConst { src, dst }, regw) } Opcode::XorConst => { - let regw = RegisterWriter::ConstSource(op); + let regw = RegisterWriter::XorConst; let src = self.select_nonzero_u32(u32::MAX) as i32; - let dst = self.choose_dst_reg(op, pass, ®w, plan)?; - (Instruction::XorConst { src, dst }, Some(regw)) + let dst = self.choose_dst_reg(op, pass, regw, plan)?; + (Instruction::XorConst { src, dst }, regw) } Opcode::Rotate => { - let regw = RegisterWriter::ConstSource(op); + let regw = RegisterWriter::Rotate; let right_rotate: u8 = self.select_nonzero_u32(63) as u8; - let dst = self.choose_dst_reg(op, pass, ®w, plan)?; - (Instruction::Rotate { dst, right_rotate }, Some(regw)) + let dst = self.choose_dst_reg(op, pass, regw, plan)?; + (Instruction::Rotate { dst, right_rotate }, regw) } }) } @@ -440,7 +440,7 @@ impl<'r, R: RngCore> Generator<'r, R> { fn commit_instruction_state( &mut self, inst: &Instruction, - regw: Option, + regw: RegisterWriter, ) -> Result<(), ()> { self.validator.commit_instruction(inst, regw); self.scheduler.advance_instruction_stream(inst.opcode()) diff --git a/crates/hashx/src/scheduler.rs b/crates/hashx/src/scheduler.rs index 6d48f17f5..d5f681408 100644 --- a/crates/hashx/src/scheduler.rs +++ b/crates/hashx/src/scheduler.rs @@ -189,7 +189,7 @@ impl Scheduler { /// Marks as busy each execution unit cycle in the plan, and updates the /// latency for the [`Instruction`]'s destination register if it has one. #[inline(always)] - pub(crate) fn commit_instruction_plan(&mut self, plan: &InstructionPlan, inst: &Instruction) { + pub(crate) fn commit_instruction_plan(&mut self, plan: InstructionPlan, inst: &Instruction) { self.exec.mark_instruction_busy(plan); if let Some(dst) = inst.destination() { self.data @@ -422,7 +422,7 @@ impl ExecSchedule { /// Mark each micro-op in an InstructionPlan as busy in the schedule. #[inline(always)] - fn mark_instruction_busy(&mut self, plan: &InstructionPlan) { + fn mark_instruction_busy(&mut self, plan: InstructionPlan) { let (first, second) = plan.as_micro_plans(); self.mark_micro_busy(first); if let Some(second) = second { @@ -447,7 +447,7 @@ struct MicroOpPlan { /// /// This is defined as either one or two micro-operations /// scheduled on the same cycle. -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Clone, Copy, Eq, PartialEq)] pub(crate) struct InstructionPlan { /// The Cycle this whole instruction begins on cycle: Cycle, From ee6acfa5cdd2d79676221f3675daec466cd73e1e Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott Date: Thu, 17 Aug 2023 21:19:51 -0700 Subject: [PATCH 2/7] hashx: Rewrite RegisterSet again to reduce CPU frontend stalls Closer inspection of the CPU counters showed that the branching in RegisterSet::index() was a big problem, contributing to the overall CPU frontend stall bottleneck in program generation. This new version is less general, and closer to the appraoch used by the original C implementation. We store a sorted ArrayVec of in-set registers, and most operations construct the RegisterSet only once using a combined filter predicate. Choosing a register from a set is now cheaper in branches, instructions, and L1 cache space. We now very rarely manipulate an entire RegisterSet in any way other than by selecting a register randomly. (Just for the register R5 special case.) Wallclock time benchmarks: generate-interp improves, -7.0% generate-x86_64 improves, -7.2% Cachegrind benchmarks: generate_interp_1000x, more total instructions run but a large decrease in frontend cache misses. +4.6% instructions, +11% L1 accesses, -99% L2 access, -40% RAM access. generate_compiled_100x, +4.0% instructions, +9.4% L1 access. cache miss improvements: -57% L2 access, -25% RAM access. Signed-off-by: Micah Elizabeth Scott --- crates/hashx/src/compiler/aarch64.rs | 12 +-- crates/hashx/src/compiler/x86_64.rs | 12 +-- crates/hashx/src/constraints.rs | 41 ++++---- crates/hashx/src/generator.rs | 81 ++++++++------- crates/hashx/src/register.rs | 146 +++++---------------------- crates/hashx/src/scheduler.rs | 21 ++-- 6 files changed, 103 insertions(+), 210 deletions(-) diff --git a/crates/hashx/src/compiler/aarch64.rs b/crates/hashx/src/compiler/aarch64.rs index 95cba0686..127301324 100644 --- a/crates/hashx/src/compiler/aarch64.rs +++ b/crates/hashx/src/compiler/aarch64.rs @@ -2,7 +2,7 @@ use crate::compiler::{util, Architecture, Executable}; use crate::program::{self, Instruction, InstructionArray}; -use crate::register::{RegisterFile, RegisterId, RegisterSet}; +use crate::register::{RegisterFile, RegisterId}; use crate::CompilerError; use dynasmrt::{aarch64, DynasmApi, DynasmLabelApi}; use std::mem; @@ -85,19 +85,17 @@ fn emit_init_locals(asm: &mut Assembler) { /// Emit code to move all input values from the RegisterFile into their /// actual hardware registers. fn emit_load_input(asm: &mut Assembler) { - RegisterSet::all().filter(|reg| { + for reg in RegisterId::all() { dynasm!(asm; ldr X(reg.x()), [register_file_ptr, #(reg.offset())]); - true - }); + } } /// Emit code to move all output values from machine registers back into /// their RegisterFile slots. fn emit_store_output(asm: &mut Assembler) { - RegisterSet::all().filter(|reg| { + for reg in RegisterId::all() { dynasm!(asm; str X(reg.x()), [register_file_ptr, #(reg.offset())]); - true - }); + } } /// Emit a return instruction. diff --git a/crates/hashx/src/compiler/x86_64.rs b/crates/hashx/src/compiler/x86_64.rs index ec81f3282..070295b8c 100644 --- a/crates/hashx/src/compiler/x86_64.rs +++ b/crates/hashx/src/compiler/x86_64.rs @@ -2,7 +2,7 @@ use crate::compiler::{util, Architecture, Executable}; use crate::program::{self, Instruction, InstructionArray}; -use crate::register::{RegisterFile, RegisterId, RegisterSet}; +use crate::register::{RegisterFile, RegisterId}; use crate::CompilerError; use dynasmrt::{x64, x64::Rq, DynasmApi, DynasmLabelApi, Register}; use std::mem; @@ -125,19 +125,17 @@ fn emit_restore_regs(asm: &mut Assembler) { /// Emit code to move all input values from the RegisterFile into their /// actual hardware registers. fn emit_load_input(asm: &mut Assembler) { - RegisterSet::all().filter(|reg| { + for reg in RegisterId::all() { dynasm!(asm; mov Rq(reg.rq()), [register_file_ptr + reg.offset()]); - true - }); + } } /// Emit code to move all output values from machine registers back into /// their RegisterFile slots. fn emit_store_output(asm: &mut Assembler) { - RegisterSet::all().filter(|reg| { + for reg in RegisterId::all() { dynasm!(asm; mov [register_file_ptr + reg.offset()], Rq(reg.rq())); - true - }); + } } /// Emit a return instruction. diff --git a/crates/hashx/src/constraints.rs b/crates/hashx/src/constraints.rs index 297b87f78..48911f85a 100644 --- a/crates/hashx/src/constraints.rs +++ b/crates/hashx/src/constraints.rs @@ -233,36 +233,31 @@ impl Validator { } } - /// Figure out the allowed set of destination registers for an op after its - /// source is known, using the current state of the validator. + /// Check whether a destination register is actually allowed for an op after + /// its source is known, using the current state of the validator. #[inline(always)] - pub(crate) fn dst_registers_allowed( + pub(crate) fn dst_register_allowed( &self, - available: RegisterSet, + dst: RegisterId, op: Opcode, pass: Pass, writer_info: RegisterWriter, src: Option, - ) -> RegisterSet { - available.filter( - #[inline(always)] - |dst| { - // One register specified by DISALLOW_REGISTER_FOR_ADDSHIFT can't - // be used as destination for AddShift. - if op == Opcode::AddShift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT { - return false; - } + ) -> bool { + // One register specified by DISALLOW_REGISTER_FOR_ADDSHIFT can't + // be used as destination for AddShift. + if op == Opcode::AddShift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT { + return false; + } - // A few instructions disallow choosing src and dst as the same - if model::disallow_src_is_dst(op) && src == Some(dst) { - return false; - } + // A few instructions disallow choosing src and dst as the same + if model::disallow_src_is_dst(op) && src == Some(dst) { + return false; + } - // Additional constraints are written on the pair of previous and - // current instructions with the same destination. - model::writer_pair_allowed(pass, self.writer_map.get(dst), writer_info) - }, - ) + // Additional constraints are written on the pair of previous and + // current instructions with the same destination. + model::writer_pair_allowed(pass, self.writer_map.get(dst), writer_info) } } @@ -282,7 +277,7 @@ pub(crate) fn src_registers_allowed(available: RegisterSet, op: Opcode) -> Regis && available.contains(model::DISALLOW_REGISTER_FOR_ADDSHIFT) && available.len() == 2 { - available.filter( + RegisterSet::from_filter( #[inline(always)] |reg| reg == model::DISALLOW_REGISTER_FOR_ADDSHIFT, ) diff --git a/crates/hashx/src/generator.rs b/crates/hashx/src/generator.rs index e2c59e302..6b8f0ab31 100644 --- a/crates/hashx/src/generator.rs +++ b/crates/hashx/src/generator.rs @@ -135,7 +135,7 @@ impl<'r, R: RngCore> Generator<'r, R> { /// The choice is perfectly uniform only if the register set is a power of /// two length. Uniformity is not critical here. #[inline(always)] - fn select_register(&mut self, reg_options: RegisterSet) -> Result { + fn select_register(&mut self, reg_options: &RegisterSet) -> Result { match reg_options.len() { 0 => Err(()), 1 => Ok(reg_options.index(0)), @@ -271,12 +271,27 @@ impl<'r, R: RngCore> Generator<'r, R> { fn instruction_gen_attempt(&mut self, pass: Pass) -> Result<(Instruction, RegisterWriter), ()> { let op = self.choose_opcode(pass); let plan = self.scheduler.instruction_plan(op)?; - let (inst, regw) = self.choose_instruction_with_opcode_plan(op, pass, plan)?; - assert_eq!(inst.opcode(), op); - self.scheduler.commit_instruction_plan(plan, &inst); + let (inst, regw) = self.choose_instruction_with_opcode_plan(op, pass, &plan)?; + debug_assert_eq!(inst.opcode(), op); + self.scheduler.commit_instruction_plan(&plan, &inst); Ok((inst, regw)) } + /// Choose only a source register, depending on the opcode and timing plan + #[inline(never)] + fn choose_src_reg( + &mut self, + op: Opcode, + timing_plan: &InstructionPlan, + ) -> Result { + let src_set = RegisterSet::from_filter(|src| { + self.scheduler + .register_available(src, timing_plan.cycle_issued()) + }); + let src_set = constraints::src_registers_allowed(src_set, op); + self.select_register(&src_set) + } + /// Choose both a source and destination register using a normal /// [`RegisterWriter`] for two-operand instructions. #[inline(always)] @@ -285,18 +300,11 @@ impl<'r, R: RngCore> Generator<'r, R> { op: Opcode, pass: Pass, writer_info_fn: fn(RegisterId) -> RegisterWriter, - timing_plan: InstructionPlan, + timing_plan: &InstructionPlan, ) -> Result<(RegisterId, RegisterId, RegisterWriter), ()> { - let avail_set = self - .scheduler - .registers_available(timing_plan.cycle_issued()); - let src_set = constraints::src_registers_allowed(avail_set, op); - let src = self.select_register(src_set)?; + let src = self.choose_src_reg(op, timing_plan)?; let writer_info = writer_info_fn(src); - let dst_set = - self.validator - .dst_registers_allowed(avail_set, op, pass, writer_info, Some(src)); - let dst = self.select_register(dst_set)?; + let dst = self.choose_dst_reg(op, pass, writer_info, Some(src), timing_plan)?; Ok((src, dst, writer_info)) } @@ -309,37 +317,32 @@ impl<'r, R: RngCore> Generator<'r, R> { op: Opcode, pass: Pass, writer_info: RegisterWriter, - timing_plan: InstructionPlan, + timing_plan: &InstructionPlan, ) -> Result<(RegisterId, RegisterId), ()> { - let avail_set = self - .scheduler - .registers_available(timing_plan.cycle_issued()); - let src_set = constraints::src_registers_allowed(avail_set, op); - let src = self.select_register(src_set)?; - let dst_set = - self.validator - .dst_registers_allowed(avail_set, op, pass, writer_info, Some(src)); - let dst = self.select_register(dst_set)?; + let src = self.choose_src_reg(op, timing_plan)?; + let dst = self.choose_dst_reg(op, pass, writer_info, Some(src), timing_plan)?; Ok((src, dst)) } - /// Choose a destination register only. - #[inline(always)] + /// Choose a destination register only, using source and writer info + /// as well as the current state of the validator. + #[inline(never)] fn choose_dst_reg( &mut self, op: Opcode, pass: Pass, writer_info: RegisterWriter, - timing_plan: InstructionPlan, + src: Option, + timing_plan: &InstructionPlan, ) -> Result { - let avail_set = self - .scheduler - .registers_available(timing_plan.cycle_issued()); - let dst_set = self - .validator - .dst_registers_allowed(avail_set, op, pass, writer_info, None); - let dst = self.select_register(dst_set)?; - Ok(dst) + let dst_set = RegisterSet::from_filter(|dst| { + self.scheduler + .register_available(dst, timing_plan.cycle_issued()) + && self + .validator + .dst_register_allowed(dst, op, pass, writer_info, src) + }); + self.select_register(&dst_set) } /// With an [`Opcode`] and an execution unit timing plan already in mind, @@ -352,7 +355,7 @@ impl<'r, R: RngCore> Generator<'r, R> { &mut self, op: Opcode, pass: Pass, - plan: InstructionPlan, + plan: &InstructionPlan, ) -> Result<(Instruction, RegisterWriter), ()> { Ok(match op { Opcode::Target => (Instruction::Target, RegisterWriter::None), @@ -411,21 +414,21 @@ impl<'r, R: RngCore> Generator<'r, R> { Opcode::AddConst => { let regw = RegisterWriter::AddConst; let src = self.select_nonzero_u32(u32::MAX) as i32; - let dst = self.choose_dst_reg(op, pass, regw, plan)?; + let dst = self.choose_dst_reg(op, pass, regw, None, plan)?; (Instruction::AddConst { src, dst }, regw) } Opcode::XorConst => { let regw = RegisterWriter::XorConst; let src = self.select_nonzero_u32(u32::MAX) as i32; - let dst = self.choose_dst_reg(op, pass, regw, plan)?; + let dst = self.choose_dst_reg(op, pass, regw, None, plan)?; (Instruction::XorConst { src, dst }, regw) } Opcode::Rotate => { let regw = RegisterWriter::Rotate; let right_rotate: u8 = self.select_nonzero_u32(63) as u8; - let dst = self.choose_dst_reg(op, pass, regw, plan)?; + let dst = self.choose_dst_reg(op, pass, regw, None, plan)?; (Instruction::Rotate { dst, right_rotate }, regw) } }) diff --git a/crates/hashx/src/register.rs b/crates/hashx/src/register.rs index b5073aa81..a70d39dd9 100644 --- a/crates/hashx/src/register.rs +++ b/crates/hashx/src/register.rs @@ -1,6 +1,7 @@ //! Define HashX's register file, and how it's created and digested. use crate::siphash::{siphash24_ctr, SipState}; +use arrayvec::ArrayVec; use std::fmt; /// Number of virtual registers in the HashX machine @@ -29,6 +30,12 @@ impl RegisterId { pub(crate) fn as_usize(&self) -> usize { self.0 as usize } + + /// Create an iterator over all RegisterId + #[inline(always)] + pub(crate) fn all() -> impl Iterator { + (0_u8..(NUM_REGISTERS as u8)).map(RegisterId) + } } /// Identify a set of RegisterIds @@ -36,15 +43,10 @@ impl RegisterId { /// This could be done compactly as a u8 bitfield for storage purposes, but /// in our program generator this is never stored long-term. Instead, we want /// something the optimizer can reason about as effectively as possible, and -/// let's inline as much as possible in order to resolve special cases in -/// the program generator at compile time. -#[derive(Clone, Copy, Eq, PartialEq)] -pub(crate) struct RegisterSet { - /// Number of registers in the set - len: usize, - /// Array indexed by register Id, indicating registers we've excluded - reg_not_in_set: [bool; 8], -} +/// we want to optimize for an index() implementation that doesn't branch. +/// This uses a fixed-capacity array of registers in-set, always sorted. +#[derive(Default, Clone, Eq, PartialEq)] +pub(crate) struct RegisterSet(ArrayVec); impl fmt::Debug for RegisterSet { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -60,66 +62,29 @@ impl fmt::Debug for RegisterSet { } impl RegisterSet { - /// Construct the set of all registers. - /// - /// This is the main way to construct a new RegisterId, starting with - /// all available registers and filtering them repeatedly. - #[inline(always)] - pub(crate) fn all() -> Self { - Self { - len: NUM_REGISTERS, - reg_not_in_set: Default::default(), - } - } - /// Number of registers still contained in this set #[inline(always)] pub(crate) fn len(&self) -> usize { - self.len + self.0.len() } /// Test if a register is contained in the set. #[inline(always)] pub(crate) fn contains(&self, id: RegisterId) -> bool { - !self.reg_not_in_set[id.0 as usize] + self.0.contains(&id) } - /// Filter this register set through a predicate. - /// - /// Invokes the predicate only for registers in this set, and returns the - /// set of registers for which it returned true. + /// Build a new RegisterSet from each register for which a predicate + /// function returns `true`. #[inline(always)] - pub(crate) fn filter bool>(&self, mut predicate: P) -> Self { - let mut result = Self { - len: 0, - reg_not_in_set: Default::default(), - }; - self.filter_impl(0, &mut predicate, &mut result); - self.filter_impl(1, &mut predicate, &mut result); - self.filter_impl(2, &mut predicate, &mut result); - self.filter_impl(3, &mut predicate, &mut result); - self.filter_impl(4, &mut predicate, &mut result); - self.filter_impl(5, &mut predicate, &mut result); - self.filter_impl(6, &mut predicate, &mut result); - self.filter_impl(7, &mut predicate, &mut result); - result - } - - /// Internal implementation to be unrolled by `filter` - #[inline(always)] - fn filter_impl bool>( - &self, - id: usize, - predicate: &mut P, - result: &mut Self, - ) { - if self.reg_not_in_set[id] { - result.reg_not_in_set[id] = true; - } else if predicate(RegisterId(id as u8)) { - result.len += 1; - } else { - result.reg_not_in_set[id] = true; + pub(crate) fn from_filter bool>(mut predicate: P) -> Self { + let mut result: Self = Default::default(); + for r in RegisterId::all() { + if predicate(r) { + result.0.push(r); + } } + result } /// Return a particular register within this set, counting from R0 to R7. @@ -127,45 +92,8 @@ impl RegisterSet { /// The supplied index must be less than the [`Self::len()`] of this set. /// Panics if the index is out of range. #[inline(always)] - pub(crate) fn index(&self, mut index: usize) -> RegisterId { - if let Some(result) = self.index_impl(0, &mut index) { - return result; - } - if let Some(result) = self.index_impl(1, &mut index) { - return result; - } - if let Some(result) = self.index_impl(2, &mut index) { - return result; - } - if let Some(result) = self.index_impl(3, &mut index) { - return result; - } - if let Some(result) = self.index_impl(4, &mut index) { - return result; - } - if let Some(result) = self.index_impl(5, &mut index) { - return result; - } - if let Some(result) = self.index_impl(6, &mut index) { - return result; - } - if let Some(result) = self.index_impl(7, &mut index) { - return result; - } - unreachable!(); - } - - /// Internal implementation to be unrolled by `index` - #[inline(always)] - fn index_impl(&self, id: usize, index: &mut usize) -> Option { - if self.reg_not_in_set[id] { - None - } else if *index == 0 { - Some(RegisterId(id as u8)) - } else { - *index -= 1; - None - } + pub(crate) fn index(&self, index: usize) -> RegisterId { + self.0[index] } } @@ -224,29 +152,3 @@ impl RegisterFile { [x.v0 ^ y.v0, x.v1 ^ y.v1, x.v2 ^ y.v2, x.v3 ^ y.v3] } } - -#[cfg(test)] -mod test { - use super::RegisterSet; - - #[test] - fn register_set() { - let r = RegisterSet::all().filter(|_reg| true); - assert_eq!(r.len(), 8); - assert_eq!(r.index(7).as_usize(), 7); - assert_eq!(r.index(0).as_usize(), 0); - let r = r.filter(|reg| (reg.as_usize() & 1) != 0); - assert_eq!(r.len(), 4); - assert_eq!(r.index(0).as_usize(), 1); - assert_eq!(r.index(1).as_usize(), 3); - assert_eq!(r.index(2).as_usize(), 5); - assert_eq!(r.index(3).as_usize(), 7); - let r = r.filter(|reg| (reg.as_usize() & 2) != 0); - assert_eq!(r.index(0).as_usize(), 3); - assert_eq!(r.index(1).as_usize(), 7); - let r = r.filter(|_reg| true); - assert_eq!(r.len(), 2); - let r = r.filter(|_reg| false); - assert_eq!(r.len(), 0); - } -} diff --git a/crates/hashx/src/scheduler.rs b/crates/hashx/src/scheduler.rs index d5f681408..f57609c25 100644 --- a/crates/hashx/src/scheduler.rs +++ b/crates/hashx/src/scheduler.rs @@ -5,7 +5,7 @@ //! avoid stalls. use crate::program::{Instruction, Opcode}; -use crate::register::{RegisterId, RegisterSet, NUM_REGISTERS}; +use crate::register::{RegisterId, NUM_REGISTERS}; /// Scheduling information for each opcode mod model { @@ -189,7 +189,7 @@ impl Scheduler { /// Marks as busy each execution unit cycle in the plan, and updates the /// latency for the [`Instruction`]'s destination register if it has one. #[inline(always)] - pub(crate) fn commit_instruction_plan(&mut self, plan: InstructionPlan, inst: &Instruction) { + pub(crate) fn commit_instruction_plan(&mut self, plan: &InstructionPlan, inst: &Instruction) { self.exec.mark_instruction_busy(plan); if let Some(dst) = inst.destination() { self.data @@ -197,10 +197,10 @@ impl Scheduler { } } - /// Figure out which registers will be available at or before the indicated cycle. + /// Look up if a register will be available at or before the indicated cycle. #[inline(always)] - pub(crate) fn registers_available(&self, cycle: Cycle) -> RegisterSet { - self.data.registers_available(cycle) + pub(crate) fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool { + self.data.register_available(reg, cycle) } /// Return the overall data latency. @@ -323,13 +323,10 @@ impl DataSchedule { self.latencies[dst.as_usize()] = cycle; } - /// Figure out which registers will be available at or before the indicated cycle + /// Look up if a register will be available at or before the indicated cycle. #[inline(always)] - fn registers_available(&self, cycle: Cycle) -> RegisterSet { - RegisterSet::all().filter( - #[inline(always)] - |reg| self.latencies[reg.as_usize()] <= cycle, - ) + fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool { + self.latencies[reg.as_usize()] <= cycle } /// Return the overall latency, the [`Cycle`] at which we expect @@ -422,7 +419,7 @@ impl ExecSchedule { /// Mark each micro-op in an InstructionPlan as busy in the schedule. #[inline(always)] - fn mark_instruction_busy(&mut self, plan: InstructionPlan) { + fn mark_instruction_busy(&mut self, plan: &InstructionPlan) { let (first, second) = plan.as_micro_plans(); self.mark_micro_busy(first); if let Some(second) = second { From ceacd5c98808ed8ecd91d6645be3ad32ce606ad0 Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott Date: Tue, 15 Aug 2023 14:35:05 -0700 Subject: [PATCH 3/7] hashx: New approach to avoid memcpy in Program I was trying to eliminate all the places where we copied a Program (about 4100 bytes) except for the one final copy into a Box; but that approach was proving too annoying. Even returning a Program via Result will cause multiple unnecessary copies that don't optimize out. This patch switches approaches, and instead allocates a Vec presized to the correct capacity. This allocation is made as early as possible and retained for the lifetime of the program if necessary. This means we'll never avoid a heap allocation, but we can always avoid extra copies and we don't need a separate Box for interpreted programs. Performance effects are subtle. Overall wallclock time doesn't change much. Cachegrind shows some accesses moving up from RAM to L2 cache. Using GDB to probe memcpy sizes shows that large (>1024b) memcpy are now totally gone in the generate-interp test. Signed-off-by: Micah Elizabeth Scott --- crates/hashx/src/compiler.rs | 6 ++-- crates/hashx/src/compiler/aarch64.rs | 4 +-- crates/hashx/src/compiler/x86_64.rs | 4 +-- crates/hashx/src/constraints.rs | 4 +-- crates/hashx/src/generator.rs | 41 ++++++++++----------------- crates/hashx/src/lib.rs | 11 ++++---- crates/hashx/src/program.rs | 42 +++++++++++++--------------- 7 files changed, 49 insertions(+), 63 deletions(-) diff --git a/crates/hashx/src/compiler.rs b/crates/hashx/src/compiler.rs index d0f610a8d..fd8f7dc1f 100644 --- a/crates/hashx/src/compiler.rs +++ b/crates/hashx/src/compiler.rs @@ -7,7 +7,7 @@ //! `Executable` wraps a mmap buffer from the `dynasmrt` crate and the //! `Architecture` is implemented in a CPU-specific way. -use crate::{program::InstructionArray, register::RegisterFile, CompilerError}; +use crate::{program::Instruction, register::RegisterFile, CompilerError}; #[cfg(all(feature = "compiler", target_arch = "x86_64"))] mod x86_64; @@ -41,7 +41,7 @@ pub(crate) struct Executable { not(any(target_arch = "x86_64", target_arch = "aarch64")) ))] impl Architecture for Executable { - fn compile(_program: &InstructionArray) -> Result { + fn compile(_program: &[Instruction]) -> Result { Err(CompilerError::NotAvailable) } @@ -71,7 +71,7 @@ where Self: Sized, { /// Compile an array of instructions into an Executable - fn compile(program: &InstructionArray) -> Result; + fn compile(program: &[Instruction]) -> Result; /// Run the compiled code, with a RegisterFile for input and output fn invoke(&self, regs: &mut RegisterFile); diff --git a/crates/hashx/src/compiler/aarch64.rs b/crates/hashx/src/compiler/aarch64.rs index 127301324..835fa0af8 100644 --- a/crates/hashx/src/compiler/aarch64.rs +++ b/crates/hashx/src/compiler/aarch64.rs @@ -1,14 +1,14 @@ //! Dynamically emitted HashX assembly code for aarch64 targets use crate::compiler::{util, Architecture, Executable}; -use crate::program::{self, Instruction, InstructionArray}; +use crate::program::{self, Instruction}; use crate::register::{RegisterFile, RegisterId}; use crate::CompilerError; use dynasmrt::{aarch64, DynasmApi, DynasmLabelApi}; use std::mem; impl Architecture for Executable { - fn compile(program: &InstructionArray) -> Result { + fn compile(program: &[Instruction]) -> Result { let mut asm = Assembler::new(); emit_load_input(&mut asm); emit_init_locals(&mut asm); diff --git a/crates/hashx/src/compiler/x86_64.rs b/crates/hashx/src/compiler/x86_64.rs index 070295b8c..8adc3367f 100644 --- a/crates/hashx/src/compiler/x86_64.rs +++ b/crates/hashx/src/compiler/x86_64.rs @@ -1,14 +1,14 @@ //! Dynamically emitted HashX assembly code for x86_64 targets use crate::compiler::{util, Architecture, Executable}; -use crate::program::{self, Instruction, InstructionArray}; +use crate::program::{self, Instruction}; use crate::register::{RegisterFile, RegisterId}; use crate::CompilerError; use dynasmrt::{x64, x64::Rq, DynasmApi, DynasmLabelApi, Register}; use std::mem; impl Architecture for Executable { - fn compile(program: &InstructionArray) -> Result { + fn compile(program: &[Instruction]) -> Result { let mut asm = Assembler::new(); emit_save_regs(&mut asm); emit_load_input(&mut asm); diff --git a/crates/hashx/src/constraints.rs b/crates/hashx/src/constraints.rs index 48911f85a..7d3104736 100644 --- a/crates/hashx/src/constraints.rs +++ b/crates/hashx/src/constraints.rs @@ -9,7 +9,7 @@ //! Generating correct HashX output depends on applying exactly the right //! constraints. -use crate::program::{Instruction, InstructionArray, Opcode}; +use crate::program::{Instruction, Opcode}; use crate::register::{RegisterId, RegisterSet, NUM_REGISTERS}; use crate::scheduler::Scheduler; @@ -221,7 +221,7 @@ impl Validator { pub(crate) fn check_whole_program( &self, scheduler: &Scheduler, - instructions: &InstructionArray, + instructions: &[Instruction], ) -> Result<(), ()> { if instructions.len() == model::REQUIRED_INSTRUCTIONS && scheduler.overall_latency().as_usize() == model::REQUIRED_OVERALL_RESULT_AT_CYCLE diff --git a/crates/hashx/src/generator.rs b/crates/hashx/src/generator.rs index 6b8f0ab31..ede007058 100644 --- a/crates/hashx/src/generator.rs +++ b/crates/hashx/src/generator.rs @@ -1,7 +1,7 @@ //! Pseudorandom generator for hash programs and parts thereof use crate::constraints::{self, Pass, RegisterWriter, Validator}; -use crate::program::{Instruction, InstructionArray, Opcode, Program}; +use crate::program::{Instruction, Opcode}; use crate::rand::RngBuffer; use crate::register::{RegisterId, RegisterSet}; use crate::scheduler::{InstructionPlan, Scheduler}; @@ -84,17 +84,8 @@ mod model { pub(super) const BRANCH_MASK_BIT_WEIGHT: usize = 4; } -/// Generate a hash program from an arbitrary [`RngCore`] implementer. -/// -/// This can return [`Error::ProgramConstraints`] if the HashX post-generation -/// program verification fails. During normal use this will happen once per -/// several thousand random seeds, and the caller should skip to another seed. -pub(crate) fn generate_program(rng: &mut T) -> Result { - Generator::new(rng).generate_program() -} - -/// Internal state for the program generator -struct Generator<'r, R: RngCore> { +/// Program generator +pub(crate) struct Generator<'r, R: RngCore> { /// The program generator wraps a random number generator, via [`RngBuffer`]. rng: RngBuffer<'r, R>, @@ -118,7 +109,7 @@ struct Generator<'r, R: RngCore> { impl<'r, R: RngCore> Generator<'r, R> { /// Create a fresh program generator from a random number generator state. #[inline(always)] - fn new(rng: &'r mut R) -> Self { + pub(crate) fn new(rng: &'r mut R) -> Self { Generator { rng: RngBuffer::new(rng), scheduler: Scheduler::new(), @@ -198,30 +189,28 @@ impl<'r, R: RngCore> Generator<'r, R> { /// Generate an entire program. /// - /// This generates instructions until the state can't be advanced any - /// further. Returns with [`Error::ProgramConstraints`] if the program - /// fails the `HashX` whole-program checks. These constraint failures occur - /// in normal use, on a small fraction of seed values. + /// Generates instructions into a provided [`Vec`] until the generator + /// state can't be advanced any further. Runs the whole-program validator. + /// Returns with [`Error::ProgramConstraints`] if the program fails these + /// checks. This happens in normal use on a small fraction of seed values. #[inline(always)] - fn generate_program(&mut self) -> Result { - let mut array: InstructionArray = Default::default(); - while array.len() < array.capacity() { + pub(crate) fn generate_program(&mut self, output: &mut Vec) -> Result<(), Error> { + assert!(output.is_empty()); + while output.len() < output.capacity() { match self.generate_instruction() { Err(()) => break, Ok((inst, regw)) => { let state_advance = self.commit_instruction_state(&inst, regw); - array.push(inst); + output.push(inst); if let Err(()) = state_advance { break; } } } } - let result = self.validator.check_whole_program(&self.scheduler, &array); - match result { - Err(()) => Err(Error::ProgramConstraints), - Ok(()) => Ok(Program::new(array)), - } + self.validator + .check_whole_program(&self.scheduler, output) + .map_err(|()| Error::ProgramConstraints) } /// Generate the next instruction. diff --git a/crates/hashx/src/lib.rs b/crates/hashx/src/lib.rs index 28d3209fb..f63ab972f 100644 --- a/crates/hashx/src/lib.rs +++ b/crates/hashx/src/lib.rs @@ -51,7 +51,6 @@ mod scheduler; mod siphash; use crate::compiler::{Architecture, Executable}; -use crate::generator::generate_program; use crate::program::Program; use rand_core::RngCore; @@ -111,8 +110,8 @@ pub struct HashX { /// to store the program data. #[derive(Debug)] enum RuntimeProgram { - /// Select the interpreted runtime, and hold a boxed Program for it to run. - Interpret(Box), + /// Select the interpreted runtime, and hold a Program for it to run. + Interpret(Program), /// Select the compiled runtime, and hold an executable code page. Compiled(Executable), } @@ -203,7 +202,7 @@ impl HashXBuilder { rng: &mut R, register_key: SipState, ) -> Result { - let program = generate_program(rng)?; + let program = Program::generate(rng)?; self.build_from_program(program, register_key) } @@ -217,13 +216,13 @@ impl HashXBuilder { Ok(HashX { register_key, program: match self.runtime { - RuntimeOption::InterpretOnly => RuntimeProgram::Interpret(Box::new(program)), + RuntimeOption::InterpretOnly => RuntimeProgram::Interpret(program), RuntimeOption::CompileOnly => { RuntimeProgram::Compiled(Architecture::compile((&program).into())?) } RuntimeOption::TryCompile => match Architecture::compile((&program).into()) { Ok(exec) => RuntimeProgram::Compiled(exec), - Err(_) => RuntimeProgram::Interpret(Box::new(program)), + Err(_) => RuntimeProgram::Interpret(program), }, }, }) diff --git a/crates/hashx/src/program.rs b/crates/hashx/src/program.rs index 0ffa6426b..d843d7ee2 100644 --- a/crates/hashx/src/program.rs +++ b/crates/hashx/src/program.rs @@ -1,7 +1,9 @@ //! Define the internal hash program representation used by HashX. +use crate::generator::Generator; use crate::register::{RegisterFile, RegisterId}; -use arrayvec::ArrayVec; +use crate::Error; +use rand_core::RngCore; use std::fmt; use std::ops::BitXor; @@ -179,24 +181,14 @@ impl Instruction { } } -/// Fixed-size array of instructions, either a complete program or a -/// program under construction -pub(crate) type InstructionArray = ArrayVec; - -/// Generated `HashX` program, as a list of instructions. +/// Generated `HashX` program, as a Vec of instructions. #[derive(Clone, Default)] -pub struct Program { - /// The InstructionArray that this Program wraps - /// - /// InstructionArray provides storage, and this type indicates that the - /// program should be a well-formed HashX function. - instructions: InstructionArray, -} +pub struct Program(Vec); impl fmt::Debug for Program { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "Program {{")?; - for (addr, inst) in self.instructions.iter().enumerate() { + for (addr, inst) in self.0.iter().enumerate() { writeln!(f, " [{:3}]: {:?}", addr, inst)?; } write!(f, "}}") @@ -204,10 +196,16 @@ impl fmt::Debug for Program { } impl Program { - /// Construct a finished `Program` from a list of instructions. - #[inline(always)] - pub(crate) fn new(instructions: InstructionArray) -> Self { - Self { instructions } + /// Generate a new `Program` from an arbitrary [`RngCore`] implementer + /// + /// This can return [`Error::ProgramConstraints`] if the HashX + /// post-generation program verification fails. During normal use this + /// will happen once per several thousand random seeds, and the caller + /// should skip to another seed. + pub(crate) fn generate(rng: &mut T) -> Result { + let mut instructions = Vec::with_capacity(NUM_INSTRUCTIONS); + Generator::new(rng).generate_program(&mut instructions)?; + Ok(Program(instructions)) } /// Reference implementation for `Program` behavior @@ -254,9 +252,9 @@ impl Program { }}; } - while program_counter < self.instructions.len() { + while program_counter < self.0.len() { let next_pc = program_counter + 1; - program_counter = match &self.instructions[program_counter] { + program_counter = match &self.0[program_counter] { Instruction::Target => { branch_target = Some(program_counter); next_pc @@ -305,9 +303,9 @@ impl Program { } } -impl<'a> From<&'a Program> for &'a InstructionArray { +impl<'a> From<&'a Program> for &'a [Instruction] { #[inline(always)] fn from(prog: &'a Program) -> Self { - &prog.instructions + &prog.0 } } From 0af908bcf2b61b67a1f3e876ee938afd2ffd8e56 Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott Date: Fri, 18 Aug 2023 14:33:21 -0700 Subject: [PATCH 4/7] hashx: Rearrange destination register validator for performance This hoists a few decisions out of the innermost portions of choose_dst_reg, by moving what we can out of dst_register_allowed. Wallclock time benchmarks: generate-interp improves, -6.0% Cachegrind benchmarks: generate_interp_1000x, -5.0% instructions, -11.6% L2 access, -6% RAM Signed-off-by: Micah Elizabeth Scott --- crates/hashx/src/constraints.rs | 54 +++++++++++++++++++++++++++------ crates/hashx/src/generator.rs | 7 +++-- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/crates/hashx/src/constraints.rs b/crates/hashx/src/constraints.rs index 7d3104736..4177cfebd 100644 --- a/crates/hashx/src/constraints.rs +++ b/crates/hashx/src/constraints.rs @@ -39,7 +39,7 @@ mod model { matches!(op, Opcode::Mul | Opcode::SMulH | Opcode::UMulH) } - /// Does an instruction prohibit using the same register for source and dest? + /// Does an instruction prohibit using the same register for src and dst? /// /// Meaningful only for ops that have both a source and destination register. #[inline(always)] @@ -233,31 +233,67 @@ impl Validator { } } - /// Check whether a destination register is actually allowed for an op after + /// Begin checking which destination registers are allowed for an op after /// its source is known, using the current state of the validator. + /// + /// Returns a DstRegisterChecker which can be used to test each specific + /// destination RegisterId quickly. #[inline(always)] - pub(crate) fn dst_register_allowed( + pub(crate) fn dst_registers_allowed( &self, - dst: RegisterId, op: Opcode, pass: Pass, writer_info: RegisterWriter, src: Option, - ) -> bool { + ) -> DstRegisterChecker<'_> { + DstRegisterChecker { + pass, + writer_info, + writer_map: &self.writer_map, + op_is_add_shift: op == Opcode::AddShift, + disallow_equal: if model::disallow_src_is_dst(op) { + src + } else { + None + }, + } + } +} + +/// State information returned by [`Validator::dst_registers_allowed`] +#[derive(Debug, Clone)] +pub(crate) struct DstRegisterChecker<'v> { + /// Is this the original or retry pass? + pass: Pass, + /// Reference to a table of [`RegisterWriter`] information for each register + writer_map: &'v RegisterWriterMap, + /// The new [`RegisterWriter`] under consideration + writer_info: RegisterWriter, + /// Was this [`Opcode::AddShift`]? + op_is_add_shift: bool, + /// Optionally disallow one matching register, used to implement [`model::disallow_src_is_dst`] + disallow_equal: Option, +} + +impl<'v> DstRegisterChecker<'v> { + /// Check a single destination register for usability, using context from + /// [`Validator::dst_registers_allowed`] + #[inline(always)] + pub(crate) fn check(&self, dst: RegisterId) -> bool { // One register specified by DISALLOW_REGISTER_FOR_ADDSHIFT can't // be used as destination for AddShift. - if op == Opcode::AddShift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT { + if self.op_is_add_shift && dst == model::DISALLOW_REGISTER_FOR_ADDSHIFT { return false; } // A few instructions disallow choosing src and dst as the same - if model::disallow_src_is_dst(op) && src == Some(dst) { + if Some(dst) == self.disallow_equal { return false; } // Additional constraints are written on the pair of previous and // current instructions with the same destination. - model::writer_pair_allowed(pass, self.writer_map.get(dst), writer_info) + model::writer_pair_allowed(self.pass, self.writer_map.get(dst), self.writer_info) } } @@ -267,7 +303,7 @@ impl Validator { pub(crate) fn src_registers_allowed(available: RegisterSet, op: Opcode) -> RegisterSet { // HashX defines a special case DISALLOW_REGISTER_FOR_ADDSHIFT for // destination registers, and it also includes a look-ahead - // condition here in source register allocation to prevent the dest + // condition here in source register allocation to prevent the dst // allocation from getting stuck as often. If we have only two // remaining registers for AddShift and one is the disallowed reg, // HashX defines that the random choice is short-circuited early diff --git a/crates/hashx/src/generator.rs b/crates/hashx/src/generator.rs index ede007058..c5a50459c 100644 --- a/crates/hashx/src/generator.rs +++ b/crates/hashx/src/generator.rs @@ -324,12 +324,13 @@ impl<'r, R: RngCore> Generator<'r, R> { src: Option, timing_plan: &InstructionPlan, ) -> Result { + let validator = self + .validator + .dst_registers_allowed(op, pass, writer_info, src); let dst_set = RegisterSet::from_filter(|dst| { self.scheduler .register_available(dst, timing_plan.cycle_issued()) - && self - .validator - .dst_register_allowed(dst, op, pass, writer_info, src) + && validator.check(dst) }); self.select_register(&dst_set) } From acf598e7858e87daaf8d0abac300812f26e06aa0 Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott Date: Fri, 18 Aug 2023 17:27:24 -0700 Subject: [PATCH 5/7] hashx: avoid surprising overhead of enum code() method This is a very simple change that avoids a surprising performance pitfall: using the code() method on an enum from another crate caused a non-inlined function call in code where we otherwise expect a high level of compiler optimization. Replacing code() with a cast to u8 avoids this function call and allows more intensive optimization at the call site. Signed-off-by: Micah Elizabeth Scott --- crates/hashx/src/compiler/x86_64.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/hashx/src/compiler/x86_64.rs b/crates/hashx/src/compiler/x86_64.rs index 8adc3367f..c3fbebb7d 100644 --- a/crates/hashx/src/compiler/x86_64.rs +++ b/crates/hashx/src/compiler/x86_64.rs @@ -4,7 +4,7 @@ use crate::compiler::{util, Architecture, Executable}; use crate::program::{self, Instruction}; use crate::register::{RegisterFile, RegisterId}; use crate::CompilerError; -use dynasmrt::{x64, x64::Rq, DynasmApi, DynasmLabelApi, Register}; +use dynasmrt::{x64, x64::Rq, DynasmApi, DynasmLabelApi}; use std::mem; impl Architecture for Executable { @@ -109,7 +109,7 @@ fn emit_save_regs(asm: &mut Assembler) { dynasm!(asm; sub rsp, stack_size()); for (i, reg) in REGS_TO_SAVE.as_ref().iter().enumerate() { let offset = (i * mem::size_of::()) as i32; - dynasm!(asm; mov [rsp + offset], Rq(reg.code())); + dynasm!(asm; mov [rsp + offset], Rq(*reg as u8)); } } @@ -117,7 +117,7 @@ fn emit_save_regs(asm: &mut Assembler) { fn emit_restore_regs(asm: &mut Assembler) { for (i, reg) in REGS_TO_SAVE.as_ref().iter().enumerate() { let offset = (i * mem::size_of::()) as i32; - dynasm!(asm; mov Rq(reg.code()), [rsp + offset]); + dynasm!(asm; mov Rq(*reg as u8), [rsp + offset]); } dynasm!(asm; add rsp, stack_size()); } From 52dbdbea3b9d1c1ca0c0842facbd08bd3106bd59 Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott Date: Fri, 18 Aug 2023 17:21:23 -0700 Subject: [PATCH 6/7] hashx: Assembly buffer sizing and tidying I was looking for ways to optimize out the many redundant capacity checks in the Assembler. I didn't find any promising approaches, but I also saw no evidence that it was an important bottleneck. (A simple unsafe fix didn't improve any important metrics) While I was in there, I tightened up the buffer size definitions for both x86_64 and aarch64, and added assertions to test the limits we set for the size of prologue, epilogue, and single instructions. I kept some of the inlining and data type tweaks, even though benchmarks show no difference. They seem like a step in the right direction, from the disassembly at least. Signed-off-by: Micah Elizabeth Scott --- crates/hashx/src/compiler/aarch64.rs | 58 +++++++++++++++++++------- crates/hashx/src/compiler/util.rs | 26 ++++++++---- crates/hashx/src/compiler/x86_64.rs | 62 +++++++++++++++++++++------- 3 files changed, 108 insertions(+), 38 deletions(-) diff --git a/crates/hashx/src/compiler/aarch64.rs b/crates/hashx/src/compiler/aarch64.rs index 835fa0af8..9fbd4e14c 100644 --- a/crates/hashx/src/compiler/aarch64.rs +++ b/crates/hashx/src/compiler/aarch64.rs @@ -10,13 +10,22 @@ use std::mem; impl Architecture for Executable { fn compile(program: &[Instruction]) -> Result { let mut asm = Assembler::new(); - emit_load_input(&mut asm); - emit_init_locals(&mut asm); - for inst in program { - emit_instruction(&mut asm, inst); + { + emit_load_input(&mut asm); + emit_init_locals(&mut asm); + debug_assert_eq!(asm.len(), PROLOGUE_SIZE); + } + for inst in program { + let prev_len = asm.len(); + emit_instruction(&mut asm, inst); + debug_assert!(asm.len() - prev_len <= INSTRUCTION_SIZE_LIMIT); + } + { + let prev_len = asm.len(); + emit_store_output(&mut asm); + emit_return(&mut asm); + debug_assert_eq!(asm.len() - prev_len, EPILOGUE_SIZE); } - emit_store_output(&mut asm); - emit_return(&mut asm); asm.finalize() } @@ -36,8 +45,19 @@ impl Architecture for Executable { } } -/// Architecture-specific capacity for the temporary output buffer -const BUFFER_CAPACITY: usize = 0x200 + program::NUM_INSTRUCTIONS * 16; +/// Architecture-specific fixed prologue size +const PROLOGUE_SIZE: usize = 0x28; + +/// Architecture-specific fixed epilogue size +const EPILOGUE_SIZE: usize = 0x24; + +/// Architecture-specific maximum size for one instruction +const INSTRUCTION_SIZE_LIMIT: usize = 0x18; + +/// Capacity for the temporary output buffer, before code is copied into +/// a long-lived allocation that can be made executable. +const BUFFER_CAPACITY: usize = + PROLOGUE_SIZE + EPILOGUE_SIZE + program::NUM_INSTRUCTIONS * INSTRUCTION_SIZE_LIMIT; /// Architecture-specific specialization of the Assembler type Assembler = util::Assembler; @@ -51,9 +71,12 @@ trait RegisterMapper { } impl RegisterMapper for RegisterId { + #[inline(always)] fn x(&self) -> u32 { 1 + (self.as_usize() as u32) } + + #[inline(always)] fn offset(&self) -> u32 { (self.as_usize() * mem::size_of::()) as u32 } @@ -75,7 +98,8 @@ macro_rules! dynasm { } /// Emit code to initialize our local variables to default values. -fn emit_init_locals(asm: &mut Assembler) { +#[inline(always)] +fn emit_init_locals(asm: &mut A) { dynasm!(asm ; mov mulh_result32, wzr ; mov branch_prohibit_flag, wzr @@ -84,7 +108,8 @@ fn emit_init_locals(asm: &mut Assembler) { /// Emit code to move all input values from the RegisterFile into their /// actual hardware registers. -fn emit_load_input(asm: &mut Assembler) { +#[inline(always)] +fn emit_load_input(asm: &mut A) { for reg in RegisterId::all() { dynasm!(asm; ldr X(reg.x()), [register_file_ptr, #(reg.offset())]); } @@ -92,20 +117,23 @@ fn emit_load_input(asm: &mut Assembler) { /// Emit code to move all output values from machine registers back into /// their RegisterFile slots. -fn emit_store_output(asm: &mut Assembler) { +#[inline(always)] +fn emit_store_output(asm: &mut A) { for reg in RegisterId::all() { dynasm!(asm; str X(reg.x()), [register_file_ptr, #(reg.offset())]); } } /// Emit a return instruction. -fn emit_return(asm: &mut Assembler) { +#[inline(always)] +fn emit_return(asm: &mut A) { dynasm!(asm; ret); } /// Load a sign extended 32-bit constant into the const_temp_64 /// register, using a movz/movn and movk pair. -fn emit_i32_const_temp_64(asm: &mut Assembler, value: i32) { +#[inline(always)] +fn emit_i32_const_temp_64(asm: &mut A, value: i32) { let high = (value >> 16) as u32; let low = (value & 0xFFFF) as u32; if value < 0 { @@ -117,7 +145,8 @@ fn emit_i32_const_temp_64(asm: &mut Assembler, value: i32) { } /// Load a 32-bit constant into const_temp_32, without extending. -fn emit_u32_const_temp_32(asm: &mut Assembler, value: u32) { +#[inline(always)] +fn emit_u32_const_temp_32(asm: &mut A, value: u32) { let high = value >> 16; let low = value & 0xFFFF; dynasm!(asm @@ -127,6 +156,7 @@ fn emit_u32_const_temp_32(asm: &mut Assembler, value: u32) { } /// Emit code for a single [`Instruction`] in the hash program. +#[inline(always)] fn emit_instruction(asm: &mut Assembler, inst: &Instruction) { /// Common implementation for binary operations on registers macro_rules! reg_op { diff --git a/crates/hashx/src/compiler/util.rs b/crates/hashx/src/compiler/util.rs index 0188211a1..03ec54e43 100644 --- a/crates/hashx/src/compiler/util.rs +++ b/crates/hashx/src/compiler/util.rs @@ -33,14 +33,22 @@ pub(crate) struct Assembler { impl Assembler { /// Return the entry point as an [`AssemblyOffset`]. + #[inline(always)] pub(crate) fn entry() -> AssemblyOffset { AssemblyOffset(0) } + /// Size of the code stored so far, in bytes + #[inline(always)] + pub(crate) fn len(&self) -> usize { + self.buffer.len() + } + /// Make a new assembler with a temporary buffer but no executable buffer. + #[inline(always)] pub(crate) fn new() -> Self { Self { - buffer: Default::default(), + buffer: ArrayVec::new(), target: None, phantom: PhantomData, } @@ -51,6 +59,7 @@ impl Assembler { /// This may fail if we can't allocate some memory, fill it, and mark /// it as executable. For example, a Linux platform with policy to restrict /// `mprotect` will show runtime errors at this point. + #[inline(always)] pub(crate) fn finalize(self) -> Result { // We never execute code from the buffer until it's complete, and we use // a freshly mmap'ed buffer for each program. Because of this, we don't @@ -83,11 +92,13 @@ impl std::fmt::Debug for Executable { impl DynasmLabelApi for Assembler { type Relocation = R; + #[inline(always)] fn local_label(&mut self, name: &'static str) { debug_assert_eq!(name, "target"); self.target = Some(self.offset()); } + #[inline(always)] fn backward_relocation( &mut self, name: &'static str, @@ -154,12 +165,14 @@ impl DynasmLabelApi for Assembler { } impl Extend for Assembler { + #[inline(always)] fn extend>(&mut self, iter: T) { self.buffer.extend(iter); } } impl<'a, R: Relocation, const S: usize> Extend<&'a u8> for Assembler { + #[inline(always)] fn extend>(&mut self, iter: T) { for byte in iter { self.buffer.push(*byte); @@ -168,20 +181,17 @@ impl<'a, R: Relocation, const S: usize> Extend<&'a u8> for Assembler { } impl DynasmApi for Assembler { + #[inline(always)] fn offset(&self) -> AssemblyOffset { AssemblyOffset(self.buffer.len()) } + #[inline(always)] fn push(&mut self, byte: u8) { self.buffer.push(byte); } - fn align(&mut self, alignment: usize, with: u8) { - let offset = self.buffer.len() % alignment; - if offset != 0 { - for _ in offset..alignment { - self.buffer.push(with); - } - } + fn align(&mut self, _alignment: usize, _with: u8) { + unreachable!(); } } diff --git a/crates/hashx/src/compiler/x86_64.rs b/crates/hashx/src/compiler/x86_64.rs index c3fbebb7d..8e5e20305 100644 --- a/crates/hashx/src/compiler/x86_64.rs +++ b/crates/hashx/src/compiler/x86_64.rs @@ -10,15 +10,24 @@ use std::mem; impl Architecture for Executable { fn compile(program: &[Instruction]) -> Result { let mut asm = Assembler::new(); - emit_save_regs(&mut asm); - emit_load_input(&mut asm); - emit_init_locals(&mut asm); - for inst in program { - emit_instruction(&mut asm, inst); + { + emit_save_regs(&mut asm); + emit_load_input(&mut asm); + emit_init_locals(&mut asm); + debug_assert_eq!(asm.len(), PROLOGUE_SIZE); + } + for inst in program { + let prev_len = asm.len(); + emit_instruction(&mut asm, inst); + debug_assert!(asm.len() - prev_len <= INSTRUCTION_SIZE_LIMIT); + } + { + let prev_len = asm.len(); + emit_store_output(&mut asm); + emit_restore_regs(&mut asm); + emit_return(&mut asm); + debug_assert_eq!(asm.len() - prev_len, EPILOGUE_SIZE); } - emit_store_output(&mut asm); - emit_restore_regs(&mut asm); - emit_return(&mut asm); asm.finalize() } @@ -37,8 +46,19 @@ impl Architecture for Executable { } } -/// Architecture-specific capacity for the temporary output buffer -const BUFFER_CAPACITY: usize = 0x200 + program::NUM_INSTRUCTIONS * 16; +/// Architecture-specific fixed prologue size +const PROLOGUE_SIZE: usize = 0x68; + +/// Architecture-specific fixed epilogue size +const EPILOGUE_SIZE: usize = 0x60; + +/// Architecture-specific maximum size for one instruction +const INSTRUCTION_SIZE_LIMIT: usize = 0x11; + +/// Capacity for the temporary output buffer, before code is copied into +/// a long-lived allocation that can be made executable. +const BUFFER_CAPACITY: usize = + PROLOGUE_SIZE + EPILOGUE_SIZE + program::NUM_INSTRUCTIONS * INSTRUCTION_SIZE_LIMIT; /// Architecture-specific specialization of the Assembler type Assembler = util::Assembler; @@ -52,9 +72,12 @@ trait RegisterMapper { } impl RegisterMapper for RegisterId { + #[inline(always)] fn rq(&self) -> u8 { 8 + (self.as_usize() as u8) } + + #[inline(always)] fn offset(&self) -> i32 { (self.as_usize() * mem::size_of::()) as i32 } @@ -77,7 +100,8 @@ macro_rules! dynasm { } /// Emit code to initialize our local variables to default values. -fn emit_init_locals(asm: &mut Assembler) { +#[inline(always)] +fn emit_init_locals(asm: &mut A) { dynasm!(asm ; xor mulh_result64, mulh_result64 ; xor branch_prohibit_flag, branch_prohibit_flag @@ -105,7 +129,8 @@ const fn stack_size() -> i32 { } /// Emit code to allocate stack space and store REGS_TO_SAVE. -fn emit_save_regs(asm: &mut Assembler) { +#[inline(always)] +fn emit_save_regs(asm: &mut A) { dynasm!(asm; sub rsp, stack_size()); for (i, reg) in REGS_TO_SAVE.as_ref().iter().enumerate() { let offset = (i * mem::size_of::()) as i32; @@ -114,7 +139,8 @@ fn emit_save_regs(asm: &mut Assembler) { } /// Emit code to restore REGS_TO_SAVE and deallocate stack space. -fn emit_restore_regs(asm: &mut Assembler) { +#[inline(always)] +fn emit_restore_regs(asm: &mut A) { for (i, reg) in REGS_TO_SAVE.as_ref().iter().enumerate() { let offset = (i * mem::size_of::()) as i32; dynasm!(asm; mov Rq(*reg as u8), [rsp + offset]); @@ -124,7 +150,8 @@ fn emit_restore_regs(asm: &mut Assembler) { /// Emit code to move all input values from the RegisterFile into their /// actual hardware registers. -fn emit_load_input(asm: &mut Assembler) { +#[inline(always)] +fn emit_load_input(asm: &mut A) { for reg in RegisterId::all() { dynasm!(asm; mov Rq(reg.rq()), [register_file_ptr + reg.offset()]); } @@ -132,18 +159,21 @@ fn emit_load_input(asm: &mut Assembler) { /// Emit code to move all output values from machine registers back into /// their RegisterFile slots. -fn emit_store_output(asm: &mut Assembler) { +#[inline(always)] +fn emit_store_output(asm: &mut A) { for reg in RegisterId::all() { dynasm!(asm; mov [register_file_ptr + reg.offset()], Rq(reg.rq())); } } /// Emit a return instruction. -fn emit_return(asm: &mut Assembler) { +#[inline(always)] +fn emit_return(asm: &mut A) { dynasm!(asm; ret); } /// Emit code for a single [`Instruction`] in the hash program. +#[inline(always)] fn emit_instruction(asm: &mut Assembler, inst: &Instruction) { /// Common implementation for binary operations on registers macro_rules! reg_op { From 26b5ae9a3c20e682dde718aae7f54536fea94383 Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott Date: Mon, 21 Aug 2023 15:19:16 -0700 Subject: [PATCH 7/7] hashx: Use a boxed slice for Program storage This is a very small change that converts our Vec cheaply into a boxed slice during program generation. Program generation speed shows no changes, and there's no change when using compiled hashes, but is a surprisingly effective 10% speedup to interpreted hash execution. Signed-off-by: Micah Elizabeth Scott --- crates/hashx/src/program.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/hashx/src/program.rs b/crates/hashx/src/program.rs index d843d7ee2..ea96a0ee0 100644 --- a/crates/hashx/src/program.rs +++ b/crates/hashx/src/program.rs @@ -181,9 +181,9 @@ impl Instruction { } } -/// Generated `HashX` program, as a Vec of instructions. +/// Generated `HashX` program, as a boxed slice of instructions #[derive(Clone, Default)] -pub struct Program(Vec); +pub struct Program(Box<[Instruction]>); impl fmt::Debug for Program { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -205,7 +205,7 @@ impl Program { pub(crate) fn generate(rng: &mut T) -> Result { let mut instructions = Vec::with_capacity(NUM_INSTRUCTIONS); Generator::new(rng).generate_program(&mut instructions)?; - Ok(Program(instructions)) + Ok(Program(instructions.into_boxed_slice())) } /// Reference implementation for `Program` behavior