//! A fuzz testing oracle for roundtrip assembly-disassembly.
//!
//! This contains manual implementations of the `Arbitrary` trait for types
//! throughout this crate to avoid depending on the `arbitrary` crate
//! unconditionally (use the `fuzz` feature instead).
6 
7 use std::string::{String, ToString};
8 use std::vec::Vec;
9 use std::{format, println};
10 
11 use crate::{
12     AmodeOffset, AmodeOffsetPlusKnownOffset, AsReg, CodeSink, DeferredTarget, Fixed, Gpr, Inst,
13     KnownOffset, NonRspGpr, Registers, TrapCode, Xmm,
14 };
15 use arbitrary::{Arbitrary, Result, Unstructured};
16 use capstone::{Capstone, arch::BuildsCapstone, arch::BuildsCapstoneSyntax, arch::x86};
17 
18 /// Take a random assembly instruction and check its encoding and
19 /// pretty-printing against a known-good disassembler.
20 ///
21 /// # Panics
22 ///
23 /// This function panics to express failure as expected by the `arbitrary`
24 /// fuzzer infrastructure. It may fail during assembly, disassembly, or when
25 /// comparing the disassembled strings.
roundtrip(inst: &Inst<FuzzRegs>)26 pub fn roundtrip(inst: &Inst<FuzzRegs>) {
27     // Check that we can actually assemble this instruction.
28     let assembled = assemble(inst);
29     let expected = disassemble(&assembled, inst);
30 
31     // Check that our pretty-printed output matches the known-good output. Trim
32     // off the instruction offset first.
33     let expected = expected.split_once(' ').unwrap().1;
34     let actual = inst.to_string();
35     if expected != actual && expected.trim() != fix_up(&actual) {
36         println!("> {inst}");
37         println!("  debug: {inst:x?}");
38         println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
39         println!("  expected (capstone): {expected}");
40         println!("  actual (to_string):  {actual}");
41         assert_eq!(expected, &actual);
42     }
43 }
44 
45 /// Use this assembler to emit machine code into a byte buffer.
46 ///
47 /// This will skip any traps or label registrations, but this is fine for the
48 /// single-instruction disassembly we're doing here.
assemble(inst: &Inst<FuzzRegs>) -> Vec<u8>49 fn assemble(inst: &Inst<FuzzRegs>) -> Vec<u8> {
50     let mut sink = TestCodeSink::default();
51     inst.encode(&mut sink);
52     sink.patch_labels_as_if_they_referred_to_end();
53     sink.buf
54 }
55 
/// A minimal in-memory code sink used to assemble a single instruction.
#[derive(Default)]
struct TestCodeSink {
    /// The machine code emitted so far.
    buf: Vec<u8>,
    /// Positions in `buf` that were recorded as label uses; each is treated as
    /// the start of a 4-byte little-endian field when patching.
    offsets_using_label: Vec<usize>,
}

impl TestCodeSink {
    /// Rewrite every recorded label reference as if its label were bound to
    /// the end of `buf`.
    ///
    /// References to labels, e.g. RIP-relative addressing, are stored with an
    /// adjustment that takes into account the distance from the relative
    /// offset field to the end of the instruction, which is what the offset is
    /// relative to. To make the offset really point at the end of the
    /// instruction, which is where we pretend all labels are bound, that
    /// adjustment has to be combined with the actual distance.
    ///
    /// For each recorded position, the distance from the end of the 4-byte
    /// field to the end of `buf` is added to the `i32` already present in the
    /// buffer. This effectively undoes the `bytes_at_end` adjustment that is
    /// part of `Amode::RipRelative` addressing.
    ///
    /// # Panics
    ///
    /// Panics if a recorded position does not have four bytes available in
    /// `buf`, or if the buffer length or a position does not fit in an `i32`.
    fn patch_labels_as_if_they_referred_to_end(&mut self) {
        let end = i32::try_from(self.buf.len()).unwrap();
        for &field_start in &self.offsets_using_label {
            let field = self.buf[field_start..].first_chunk_mut::<4>().unwrap();
            // The stored value is relative to the first byte after the field.
            let field_end = i32::try_from(field_start).unwrap() + 4;
            let distance_to_end = end - field_end;
            let stored = i32::from_le_bytes(*field);
            *field = (stored + distance_to_end).to_le_bytes();
        }
    }
}
87 
88 impl CodeSink for TestCodeSink {
put1(&mut self, v: u8)89     fn put1(&mut self, v: u8) {
90         self.buf.extend_from_slice(&[v]);
91     }
92 
put2(&mut self, v: u16)93     fn put2(&mut self, v: u16) {
94         self.buf.extend_from_slice(&v.to_le_bytes());
95     }
96 
put4(&mut self, v: u32)97     fn put4(&mut self, v: u32) {
98         self.buf.extend_from_slice(&v.to_le_bytes());
99     }
100 
put8(&mut self, v: u64)101     fn put8(&mut self, v: u64) {
102         self.buf.extend_from_slice(&v.to_le_bytes());
103     }
104 
add_trap(&mut self, _: TrapCode)105     fn add_trap(&mut self, _: TrapCode) {}
106 
use_target(&mut self, _: DeferredTarget)107     fn use_target(&mut self, _: DeferredTarget) {
108         let offset = self.buf.len();
109         self.offsets_using_label.push(offset);
110     }
111 
known_offset(&self, target: KnownOffset) -> i32112     fn known_offset(&self, target: KnownOffset) -> i32 {
113         panic!("unsupported known target {target:?}")
114     }
115 }
116 
117 /// Building a new `Capstone` each time is suboptimal (TODO).
disassemble(assembled: &[u8], original: &Inst<FuzzRegs>) -> String118 fn disassemble(assembled: &[u8], original: &Inst<FuzzRegs>) -> String {
119     let cs = Capstone::new()
120         .x86()
121         .mode(x86::ArchMode::Mode64)
122         .syntax(x86::ArchSyntax::Att)
123         .detail(true)
124         .build()
125         .expect("failed to create Capstone object");
126     let insts = cs
127         .disasm_all(assembled, 0x0)
128         .expect("failed to disassemble");
129 
130     if insts.len() != 1 {
131         println!("> {original}");
132         println!("  debug: {original:x?}");
133         println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
134         assert_eq!(insts.len(), 1, "not a single instruction");
135     }
136 
137     let inst = insts.first().expect("at least one instruction");
138     if assembled.len() != inst.len() {
139         println!("> {original}");
140         println!("  debug: {original:x?}");
141         println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
142         println!(
143             "  capstone-assembled: {}",
144             pretty_print_hexadecimal(inst.bytes())
145         );
146         assert_eq!(assembled.len(), inst.len(), "extra bytes not disassembled");
147     }
148 
149     inst.to_string()
150 }
151 
/// Render `hex` as one contiguous string of upper-case hexadecimal digits,
/// two digits per byte and without any separator or prefix.
fn pretty_print_hexadecimal(hex: &[u8]) -> String {
    use std::fmt::Write as _;

    let mut rendered = String::with_capacity(hex.len() * 2);
    for byte in hex {
        // Writing into a `String` does not fail.
        write!(rendered, "{byte:02X}").unwrap();
    }
    rendered
}
160 
/// See `replace_signed_immediates`.
///
/// Parses `$hex` as a base-16 `$from`, reinterprets the bits as the signed
/// type `$to`, and renders the result as a `String`: a leading `-` for
/// negative values, then the magnitude in lower-case hexadecimal, with a `0x`
/// prefix unless the magnitude is below 10.
///
/// Panics if `$hex` is not a valid base-16 `$from`.
macro_rules! hex_print_signed_imm {
    ($hex:expr, $from:ty => $to:ty) => {{
        // The cast deliberately reinterprets the unsigned bits as signed.
        let imm = <$from>::from_str_radix($hex, 16).unwrap() as $to;
        let sign = if imm < 0 { "-" } else { "" };
        // `unsigned_abs` is defined for every value, including `MIN`.
        let magnitude = imm.unsigned_abs();
        if magnitude < 10 {
            format!("{sign}{magnitude:x}")
        } else {
            format!("{sign}0x{magnitude:x}")
        }
    }};
}
181 
182 /// Replace signed immediates in the disassembly with their unsigned hexadecimal
183 /// equivalent. This is only necessary to match `capstone`'s complex
184 /// pretty-printing rules; e.g. `capstone` will:
185 /// - omit the `0x` prefix when printing `0x0` as `0`.
186 /// - omit the `0x` prefix when print small values (less than 10)
187 /// - print negative values as `-0x...` (signed hex) instead of `0xff...`
188 ///   (normal hex)
189 /// - print `mov` immediates as base-10 instead of base-16 (?!).
replace_signed_immediates(dis: &str) -> alloc::borrow::Cow<'_, str>190 fn replace_signed_immediates(dis: &str) -> alloc::borrow::Cow<'_, str> {
191     match dis.find('$') {
192         None => dis.into(),
193         Some(idx) => {
194             let (prefix, rest) = dis.split_at(idx + 1); // Skip the '$'.
195             let (_, rest) = chomp("-", rest); // Skip the '-' if it's there.
196             let (_, rest) = chomp("0x", rest); // Skip the '0x' if it's there.
197             let n = rest.chars().take_while(char::is_ascii_hexdigit).count();
198             let (hex, rest) = rest.split_at(n); // Split at next non-hex character.
199             let simm = if dis.starts_with("mov") {
200                 u64::from_str_radix(hex, 16).unwrap().to_string()
201             } else {
202                 match hex.len() {
203                     1 | 2 => hex_print_signed_imm!(hex, u8 => i8),
204                     4 => hex_print_signed_imm!(hex, u16 => i16),
205                     8 => hex_print_signed_imm!(hex, u32 => i32),
206                     16 => hex_print_signed_imm!(hex, u64 => i64),
207                     _ => panic!("unexpected length for hex: {hex}"),
208                 }
209             };
210             format!("{prefix}{simm}{rest}").into()
211         }
212     }
213 }
214 
/// See `replace_signed_immediates`.
///
/// Split `pat` off the front of `s`, returning `(matched, remainder)`. When
/// `s` does not start with `pat`, the result is `("", s)`.
fn chomp<'a>(pat: &str, s: &'a str) -> (&'a str, &'a str) {
    match s.strip_prefix(pat) {
        Some(remainder) => (&s[..pat.len()], remainder),
        None => ("", s),
    }
}
223 
/// Spot-check `replace_signed_immediates` on 32- and 64-bit negative
/// immediates, a positive immediate, and a `mov` immediate.
#[test]
fn replace() {
    let cases = [
        ("andl $0xffffff9a, %r11d", "andl $-0x66, %r11d"),
        (
            "xorq $0xffffffffffffffbc, 0x7f139ecc(%r9)",
            "xorq $-0x44, 0x7f139ecc(%r9)",
        ),
        (
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
        ),
        (
            "movq $0xffffffff864ae103, %rsi",
            "movq $18446744071667638531, %rsi",
        ),
    ];
    for (input, expected) in cases {
        assert_eq!(replace_signed_immediates(input), expected);
    }
}
243 
/// Remove the first semicolon and everything after it from the disassembly,
/// then trim surrounding whitespace from what is left. This is necessary to
/// remove the implicit operands we end up printing for Cranelift's sake.
///
/// Input without a semicolon is returned as-is, without trimming.
fn remove_after_semicolon(dis: &str) -> &str {
    match dis.split_once(';') {
        Some((before, _)) => before.trim(),
        None => dis,
    }
}
256 
/// Check that `remove_after_semicolon` drops a trailing `;; implicit: ...`
/// annotation together with the whitespace in front of it.
#[test]
fn remove_after_parenthesis_test() {
    let annotated = "imulb 0x7658eddd(%rcx) ;; implicit: %ax";
    assert_eq!(remove_after_semicolon(annotated), "imulb 0x7658eddd(%rcx)");
}
264 
265 /// Run some post-processing on the disassembly to make it match Capstone.
fix_up(dis: &str) -> alloc::borrow::Cow<'_, str>266 fn fix_up(dis: &str) -> alloc::borrow::Cow<'_, str> {
267     let dis = remove_after_semicolon(dis);
268     replace_signed_immediates(&dis)
269 }
270 
271 /// Fuzz-specific registers.
272 ///
273 /// For the fuzzer, we do not need any fancy register types; see [`FuzzReg`].
274 #[derive(Clone, Arbitrary, Debug)]
275 pub struct FuzzRegs;
276 
277 impl Registers for FuzzRegs {
278     type ReadGpr = FuzzReg;
279     type ReadWriteGpr = FuzzReg;
280     type WriteGpr = FuzzReg;
281     type ReadXmm = FuzzReg;
282     type ReadWriteXmm = FuzzReg;
283     type WriteXmm = FuzzReg;
284 }
285 
/// A simple `u8` register type for fuzzing only.
///
/// The wrapped value is the register's hardware encoding.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct FuzzReg(u8);
289 
290 impl<'a> Arbitrary<'a> for FuzzReg {
arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self>291     fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
292         Ok(Self(u.int_in_range(0..=15)?))
293     }
294 }
295 
296 impl AsReg for FuzzReg {
new(enc: u8) -> Self297     fn new(enc: u8) -> Self {
298         Self(enc)
299     }
enc(&self) -> u8300     fn enc(&self) -> u8 {
301         self.0
302     }
303 }
304 
305 impl Arbitrary<'_> for AmodeOffset {
arbitrary(u: &mut Unstructured<'_>) -> Result<Self>306     fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
307         // Custom implementation to try to generate some "interesting" offsets.
308         // For example choose either an arbitrary 8-bit or 32-bit number as the
309         // base, and then optionally shift that number to the left to create
310         // multiples of constants. This can help stress some of the more
311         // interesting encodings in EVEX instructions for example.
312         let base = if u.arbitrary()? {
313             i32::from(u.arbitrary::<i8>()?)
314         } else {
315             u.arbitrary::<i32>()?
316         };
317         Ok(match u.int_in_range(0..=5)? {
318             0 => AmodeOffset::ZERO,
319             n => AmodeOffset::new(base << (n - 1)),
320         })
321     }
322 }
323 
324 impl Arbitrary<'_> for AmodeOffsetPlusKnownOffset {
arbitrary(u: &mut Unstructured<'_>) -> Result<Self>325     fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
326         // For now, we don't generate offsets (TODO).
327         Ok(Self {
328             simm32: AmodeOffset::arbitrary(u)?,
329             offset: None,
330         })
331     }
332 }
333 
334 impl<R: AsReg, const E: u8> Arbitrary<'_> for Fixed<R, E> {
arbitrary(_: &mut Unstructured<'_>) -> Result<Self>335     fn arbitrary(_: &mut Unstructured<'_>) -> Result<Self> {
336         Ok(Self::new(E))
337     }
338 }
339 
340 impl<R: AsReg> Arbitrary<'_> for NonRspGpr<R> {
arbitrary(u: &mut Unstructured<'_>) -> Result<Self>341     fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
342         use crate::gpr::enc::*;
343         let gpr = u.choose(&[
344             RAX, RCX, RDX, RBX, RBP, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15,
345         ])?;
346         Ok(Self::new(R::new(*gpr)))
347     }
348 }
349 impl<'a, R: AsReg> Arbitrary<'a> for Gpr<R> {
arbitrary(u: &mut Unstructured<'a>) -> Result<Self>350     fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
351         Ok(Self(R::new(u.int_in_range(0..=15)?)))
352     }
353 }
354 impl<'a, R: AsReg> Arbitrary<'a> for Xmm<R> {
arbitrary(u: &mut Unstructured<'a>) -> Result<Self>355     fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
356         Ok(Self(R::new(u.int_in_range(0..=15)?)))
357     }
358 }
359 
360 /// Helper trait that's used to be the same as `Registers` except with an extra
361 /// `for<'a> Arbitrary<'a>` bound on all of the associated types.
362 pub trait RegistersArbitrary:
363     Registers<
364         ReadGpr: for<'a> Arbitrary<'a>,
365         ReadWriteGpr: for<'a> Arbitrary<'a>,
366         WriteGpr: for<'a> Arbitrary<'a>,
367         ReadXmm: for<'a> Arbitrary<'a>,
368         ReadWriteXmm: for<'a> Arbitrary<'a>,
369         WriteXmm: for<'a> Arbitrary<'a>,
370     >
371 {
372 }
373 
374 impl<R> RegistersArbitrary for R
375 where
376     R: Registers,
377     R::ReadGpr: for<'a> Arbitrary<'a>,
378     R::ReadWriteGpr: for<'a> Arbitrary<'a>,
379     R::WriteGpr: for<'a> Arbitrary<'a>,
380     R::ReadXmm: for<'a> Arbitrary<'a>,
381     R::ReadWriteXmm: for<'a> Arbitrary<'a>,
382     R::WriteXmm: for<'a> Arbitrary<'a>,
383 {
384 }
385 
#[cfg(test)]
mod test {
    use super::*;
    use arbtest::arbtest;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Round-trip arbitrary instructions, printing each one with a running
    /// counter.
    #[test]
    fn smoke() {
        let count = AtomicUsize::new(0);

        // This will run the `roundtrip` fuzzer for one second. To repeatably
        // test a single input, append `.seed(0x<failing seed>)`.
        arbtest(|u| {
            let inst: Inst<FuzzRegs> = u.arbitrary()?;
            roundtrip(&inst);
            println!("#{}: {inst}", count.fetch_add(1, Ordering::SeqCst));
            Ok(())
        })
        .budget_ms(1_000);
    }

    /// Round-trip `callq` for every immediate in `-500..500`.
    #[test]
    fn callq() {
        for i in -500..500 {
            println!("immediate: {i}");
            let inst = crate::inst::callq_d::new(i);
            roundtrip(&inst.into());
        }
    }
}
416