//! Support for maintaining the usefulness of a corpus over time. //! //! Wasmtime's fuzzing strategy in general is to use `wasm-smith` to generate //! modules which interprets fuzz input from libFuzzer as a sort of "DNA". This //! works to generate pretty interesting modules but falls down over time //! because the DNA to generate the same module over time can change. This //! means that maintaining a corpus for Wasmtime is not the most useful thing //! in the world unfortunately and any historical discoveries of coverage need //! to be rediscovered every time the DNA changes. //! //! To help with this the module here implements a scheme where Wasmtime's fuzz //! inputs are highly likely to be of the form: //! //! ```text //! [ ... wasm module ... ][ .. fuzz custom section .. ] //! ``` //! //! The `fuzz custom section` here contains the original fuzz input used to //! generate the `wasm module`, and if the DNA hasn't changed then it should //! still be possible to do that as well. The benefit of this format, though, //! is that if the DNA is changed then the interpretation of the `fuzz custom //! section` will change but the original `wasm module` will not. This enables //! us to populate the corpus, ideally, with a set of interesting `wasm module` //! entries. //! //! Over time the `fuzz custom section` will "bitrot" and will be no longer able //! to generate the original `wasm module`. The main consequence of this is that //! when the original test case is mutated the generated wasm module from the //! mutation will be nothing alike from the original test case's wasm module. //! This means libFuzzer will have to rediscover ways to mutate into //! interesting modules, but we're no worse off than before hopefully. //! Additionally this more easily opens the door to integrate `wasm-mutate` one //! day into mutation here as well. //! //! Currently this is all supported via two methods: //! //! 1. A custom mutator is registered with libfuzzer. This means that all //! inputs generated by the mutator, so long as they fit, will be the //! "envelope" format of this module. This means that the corpus will //! hopefully naturally get populated with wasm files rather than random //! inputs. Note that this is not guaranteed to succeed since sometimes the //! buffer to store the fuzz input in the mutator is not big enough to store //! the final wasm module, in which case a non-enveloped wasm module is //! stored. //! //! 2. If the environment variable `WRITE_FUZZ_INPUT_TO is set then the fuzz //! input, in its envelope format, will be written to the specified file. //! This can be useful in case an input is in its binary form or if a //! preexisting corpus is being rewritten. use std::borrow::Cow; use arbitrary::{Arbitrary, Result, Unstructured}; use wasm_encoder::Section; /// Helper macro for fuzz targets that are single-module fuzzers. /// /// This combines the features of this module into one macro invocation to /// generate the fuzz entry point and mutator in tandem. #[macro_export] macro_rules! single_module_fuzzer { ($execute:ident $generate:ident) => { libfuzzer_sys::fuzz_target!(|data: &[u8]| { $crate::init_fuzzing(); drop($crate::single_module_fuzzer::execute( data, $execute, $generate, )); }); libfuzzer_sys::fuzz_mutator!(|data: &mut [u8], size: usize, max_size: usize, seed: u32| { $crate::single_module_fuzzer::mutate( data, size, max_size, $generate, libfuzzer_sys::fuzzer_mutate, ) }); }; } /// Executes a "single module fuzzer" given the raw `input` from libfuzzer. /// /// This will use the `input` to generate `T`, some configuration, which is /// then used by `gen_module` to generate a WebAssembly module. The module is /// then passed to `run` along with the configuration and remaining data that /// can be used as fuzz input. /// /// The main purpose of this function is to handle when `input` is actually a /// WebAssembly module "envelope". If the `input` is a valid wasm module and /// ends with a specific trailing custom section then the module generated by /// `gen_module` is actually discarded. The purpose of this is to handle the /// case where the input used to generate a module may change over time but /// we're still interested in the historical coverage of the original wasm /// module. pub fn execute<'a, T, U>( input: &'a [u8], run: fn(&[u8], KnownValid, T, &mut Unstructured<'a>) -> Result, gen_module: fn(&mut T, &mut Unstructured<'a>) -> Result<(Vec, KnownValid)>, ) -> Result where T: Arbitrary<'a>, { let (fuzz_data, module_in_input) = match extract_fuzz_input(input) { Ok(input) => { log::debug!("fuzz input was a valid module with trailing custom section"); (input.fuzz_data, Some(input.module)) } Err(e) => { log::debug!("fuzz input not a valid module: {e:?}"); (input, None) } }; let mut u = Unstructured::new(fuzz_data); let mut config = u.arbitrary()?; let (generated, known_valid) = gen_module(&mut config, &mut u)?; let module = module_in_input.unwrap_or(&generated); if let Ok(file) = std::env::var("WRITE_FUZZ_INPUT_TO") { std::fs::write(file, encode_module(&module, &fuzz_data)).unwrap(); } let known_valid = if module_in_input.is_some() { KnownValid::No } else { known_valid }; run(module, known_valid, config, &mut u) } /// Used as part of `execute` above to determine whether a module is known to /// be valid ahead of time. #[derive(Debug, PartialEq, Eq, Copy, Clone)] pub enum KnownValid { /// This module is known to be valid so it should assert compilation /// succeeds for example. Yes, /// This module is not known to be valid and it may not compile /// successfully. Note that it's also not known to compile unsuccessfully. No, } const SECTION_NAME: &str = "wasmtime-fuzz-input"; /// Implementation of a libfuzzer custom mutator for a single-module-fuzzer. /// /// This mutator will take the seed specified in `data` and attempt to mutate /// it with the provided `mutate` function. The `mutate` function may not /// receive the `data` as-specified, but instead may receive only the seed /// that was used to generate `data`. pub fn mutate( data: &mut [u8], mut size: usize, max_size: usize, gen_module: fn(&mut T, &mut Unstructured<'_>) -> Result<(Vec, KnownValid)>, mutate: fn(&mut [u8], usize, usize) -> usize, ) -> usize where T: for<'a> Arbitrary<'a>, { // If `data` is a valid wasm module with the fuzz seed at the end, then // discard the wasm module portion and instead shuffle the seed into the // beginning of the `data` slice. This is the "de-envelope" part of the // seed management here. // // After this the `data` array should contain the raw contents used to // produce the module and is ripe for mutation/minimization/etc. if let Ok(input) = extract_fuzz_input(&data[..size]) { let start = input.fuzz_data.as_ptr() as usize - data.as_ptr() as usize; size = input.fuzz_data.len(); data.copy_within(start..start + input.fuzz_data.len(), 0); } // Delegate to the provided mutation function for standard mutations to // apply. let new_size = mutate(data, size, max_size); // Next the goal of this function is to produce a test case which is an // actual wasm module. To that end this will run module generation over the // input provided. If this is all successful then the custom section // representing the seed is appended to the module, making it a sort of // self-referential module. // // After all this it's copied into `data` if the it fits. If the module // doesn't fit then the seed is left un-perturbed since there's not much // that we can do about that. let mut u = Unstructured::new(&data[..new_size]); match u .arbitrary() .and_then(|mut config| gen_module(&mut config, &mut u)) { Ok((module, _known_valid)) => { let module = encode_module(&module, &data[..new_size]); if module.len() < max_size { log::debug!( "successfully generated mutated module with \ appended input section" ); data[..module.len()].copy_from_slice(&module); return module.len(); } else { log::debug!("mutated module doesn't fit in original slice"); } } // If our new seed can't generate a new module then that's something // for the fuzzer to figure out later when it "officially" executes // this fuzz input. For the purposes of this function it's not too // useful to try to put it in an envelope otherwise so ignore it. Err(e) => { log::debug!("failed to generate module from mutated seed {e:?}"); } } new_size } fn encode_module(module: &[u8], fuzz_data: &[u8]) -> Vec { let mut module = module.to_vec(); wasm_encoder::CustomSection { name: SECTION_NAME.into(), data: Cow::Borrowed(&fuzz_data), } .append_to(&mut module); module } struct FuzzInput<'a> { /// The module extracted from the input, without the fuzz input custom /// section. module: &'a [u8], /// The contents of the fuzz input custom section. fuzz_data: &'a [u8], } /// Attempts to extract a fuzz input from the `data` provided. /// /// This will attempt to read `data` as a WebAssembly binary. If successful /// and the module ends with a custom section indicating it's a fuzz input /// then the contents of the custom section are returned along with the /// contents of the original module. fn extract_fuzz_input(data: &[u8]) -> wasmtime::Result> { use wasmparser::{Parser, Payload}; let mut prev_end = 8; for section in Parser::new(0).parse_all(data) { let section = section?; // If this is a custom section, the end of the section is the end of // the entire module, and it's got the expected name, then this section // is assumed to be the input seed to the fuzzer. // // The section's contents are returned through `fuzz_data` and the wasm // binary format means that we can simply chop off the last custom // section and still have a valid module. if let Payload::CustomSection(s) = §ion { if s.name() == SECTION_NAME && s.range().end == data.len() { return Ok(FuzzInput { module: &data[..prev_end], fuzz_data: s.data(), }); } } // Record each section's end to record what the end of the module is // up to this point. if let Some((_, range)) = section.as_section() { prev_end = range.end; } } wasmtime::bail!("no input found") } #[cfg(test)] mod tests { use super::*; use rand::rngs::SmallRng; use rand::{RngCore, SeedableRng}; #[test] fn changing_configuration_does_not_change_module() { drop(env_logger::try_init()); // This test asserts that if the static configuration associated with a // module changes then the generated module, as sourced from the // original fuzz input, does not change. That's the whole purpose of // this module, to enable our fuzz inputs to be in a format that's // resilient to changes in configuration over time (or at least the // module part of the input). // // This test will execute N=200 iterations where each iteration will // attempt to, with some fresh random data, generate a module. This // module is then "mutated" with a noop mutation to effectively // serialize it into the envelope where the module is preserved. The // now-mutated input, which should be a wasm module, is then passed // as the seed to a second execution which has a different static input. // // This simulates having a fuzzer one day produce an interesting test // case through mutation, and then the next day the configuration of // the fuzzer changes. On both days the module input to the function // should have been the same. let mut rng = SmallRng::seed_from_u64(0); let max_size = 4096; let seed_size = 128; let mut buf = vec![0; max_size]; let mut compares = 0; for _ in 0..200 { rng.fill_bytes(&mut buf[..seed_size]); let run1 = run_config::; let mutate = mutate::; let run2 = run_config::<(u32, u32)>; if let Ok((module, known_valid)) = execute(&buf[..seed_size], run1, generate) { assert_eq!(known_valid, KnownValid::Yes); let new_size = mutate(&mut buf, seed_size, max_size, generate, noop_mutate); if let Ok((module2, known_valid)) = execute(&buf[..new_size], run2, generate) { assert_eq!(known_valid, KnownValid::No); compares += 1; if module != module2 { panic!("modules differ"); } } } } // At least one iteration should have succeeded in the fuzz generation // above. assert!(compares > 0); fn run_config( data: &[u8], known_valid: KnownValid, _: T, _: &mut Unstructured<'_>, ) -> Result<(Vec, KnownValid)> where T: for<'a> Arbitrary<'a>, { Ok((data.to_vec(), known_valid)) } fn generate(_: &mut T, u: &mut Unstructured<'_>) -> Result<(Vec, KnownValid)> where T: for<'a> Arbitrary<'a>, { Ok(( u.arbitrary::()?.to_bytes(), KnownValid::Yes, )) } fn noop_mutate(_buf: &mut [u8], size: usize, _new_size: usize) -> usize { size } } }