//! Support for maintaining the usefulness of a corpus over time.
//!
//! Wasmtime's fuzzing strategy in general is to use `wasm-smith` to generate
//! modules, interpreting the fuzz input from libFuzzer as a sort of "DNA". This
//! works to generate pretty interesting modules but falls down over time
//! because the DNA to generate the same module over time can change. This
//! means that maintaining a corpus for Wasmtime is not the most useful thing
//! in the world unfortunately and any historical discoveries of coverage need
//! to be rediscovered every time the DNA changes.
//!
//! To help with this the module here implements a scheme where Wasmtime's fuzz
//! inputs are highly likely to be of the form:
//!
//! ```text
//! [ ... wasm module ... ][ .. fuzz custom section .. ]
//! ```
//!
//! The `fuzz custom section` here contains the original fuzz input used to
//! generate the `wasm module`, and if the DNA hasn't changed then it should
//! still be possible to do that as well. The benefit of this format, though,
//! is that if the DNA is changed then the interpretation of the `fuzz custom
//! section` will change but the original `wasm module` will not. This enables
//! us to populate the corpus, ideally, with a set of interesting `wasm module`
//! entries.
//!
//! Over time the `fuzz custom section` will "bitrot" and will no longer be able
//! to generate the original `wasm module`. The main consequence of this is that
//! when the original test case is mutated the generated wasm module from the
//! mutation will be nothing like the original test case's wasm module.
//! This means libFuzzer will have to rediscover ways to mutate into
//! interesting modules, but we're no worse off than before hopefully.
//! Additionally this more easily opens the door to integrate `wasm-mutate` one
//! day into mutation here as well.
//!
Currently this is all supported via two methods: 36 //! 37 //! 1. A custom mutator is registered with libfuzzer. This means that all 38 //! inputs generated by the mutator, so long as they fit, will be the 39 //! "envelope" format of this module. This means that the corpus will 40 //! hopefully naturally get populated with wasm files rather than random 41 //! inputs. Note that this is not guaranteed to succeed since sometimes the 42 //! buffer to store the fuzz input in the mutator is not big enough to store 43 //! the final wasm module, in which case a non-enveloped wasm module is 44 //! stored. 45 //! 46 //! 2. If the environment variable `WRITE_FUZZ_INPUT_TO is set then the fuzz 47 //! input, in its envelope format, will be written to the specified file. 48 //! This can be useful in case an input is in its binary form or if a 49 //! preexisting corpus is being rewritten. 50 51 use std::borrow::Cow; 52 53 use arbitrary::{Arbitrary, Result, Unstructured}; 54 use wasm_encoder::Section; 55 56 /// Helper macro for fuzz targets that are single-module fuzzers. 57 /// 58 /// This combines the features of this module into one macro invocation to 59 /// generate the fuzz entry point and mutator in tandem. 60 #[macro_export] 61 macro_rules! single_module_fuzzer { 62 ($execute:ident $generate:ident) => { 63 libfuzzer_sys::fuzz_target!(|data: &[u8]| { 64 $crate::init_fuzzing(); 65 drop($crate::single_module_fuzzer::execute( 66 data, $execute, $generate, 67 )); 68 }); 69 70 libfuzzer_sys::fuzz_mutator!(|data: &mut [u8], size: usize, max_size: usize, seed: u32| { 71 $crate::single_module_fuzzer::mutate( 72 data, 73 size, 74 max_size, 75 $generate, 76 libfuzzer_sys::fuzzer_mutate, 77 ) 78 }); 79 }; 80 } 81 82 /// Executes a "single module fuzzer" given the raw `input` from libfuzzer. 83 /// 84 /// This will use the `input` to generate `T`, some configuration, which is 85 /// then used by `gen_module` to generate a WebAssembly module. 
The module is 86 /// then passed to `run` along with the configuration and remaining data that 87 /// can be used as fuzz input. 88 /// 89 /// The main purpose of this function is to handle when `input` is actually a 90 /// WebAssembly module "envelope". If the `input` is a valid wasm module and 91 /// ends with a specific trailing custom section then the module generated by 92 /// `gen_module` is actually discarded. The purpose of this is to handle the 93 /// case where the input used to generate a module may change over time but 94 /// we're still interested in the historical coverage of the original wasm 95 /// module. 96 pub fn execute<'a, T, U>( 97 input: &'a [u8], 98 run: fn(&[u8], bool, T, &mut Unstructured<'a>) -> Result<U>, 99 gen_module: fn(&mut T, &mut Unstructured<'a>) -> Result<Vec<u8>>, 100 ) -> Result<U> 101 where 102 T: Arbitrary<'a>, 103 { 104 let (fuzz_data, module_in_input) = match extract_fuzz_input(input) { 105 Ok(input) => { 106 log::debug!("fuzz input was a valid module with trailing custom section"); 107 (input.fuzz_data, Some(input.module)) 108 } 109 Err(e) => { 110 log::debug!("fuzz input not a valid module: {e:?}"); 111 (input, None) 112 } 113 }; 114 let mut u = Unstructured::new(fuzz_data); 115 let mut config = u.arbitrary()?; 116 let generated = gen_module(&mut config, &mut u)?; 117 let module = module_in_input.unwrap_or(&generated); 118 if let Ok(file) = std::env::var("WRITE_FUZZ_INPUT_TO") { 119 std::fs::write(file, encode_module(&module, &fuzz_data)).unwrap(); 120 } 121 run(module, module_in_input.is_none(), config, &mut u) 122 } 123 124 const SECTION_NAME: &str = "wasmtime-fuzz-input"; 125 126 /// Implementation of a libfuzzer custom mutator for a single-module-fuzzer. 127 /// 128 /// This mutator will take the seed specified in `data` and attempt to mutate 129 /// it with the provided `mutate` function. 
The `mutate` function may not 130 /// receive the `data` as-specified, but instead may receive only the seed 131 /// that was used to generate `data`. 132 pub fn mutate<T>( 133 data: &mut [u8], 134 mut size: usize, 135 max_size: usize, 136 gen_module: fn(&mut T, &mut Unstructured<'_>) -> Result<Vec<u8>>, 137 mutate: fn(&mut [u8], usize, usize) -> usize, 138 ) -> usize 139 where 140 T: for<'a> Arbitrary<'a>, 141 { 142 // If `data` is a valid wasm module with the fuzz seed at the end, then 143 // discard the wasm module portion and instead shuffle the seed into the 144 // beginning of the `data` slice. This is the "de-envelope" part of the 145 // seed management here. 146 // 147 // After this the `data` array should contain the raw contents used to 148 // produce the module and is ripe for mutation/minimization/etc. 149 if let Ok(input) = extract_fuzz_input(&data[..size]) { 150 let start = input.fuzz_data.as_ptr() as usize - data.as_ptr() as usize; 151 size = input.fuzz_data.len(); 152 data.copy_within(start..start + input.fuzz_data.len(), 0); 153 } 154 155 // Delegate to the provided mutation function for standard mutations to 156 // apply. 157 let new_size = mutate(data, size, max_size); 158 159 // Next the goal of this function is to produce a test case which is an 160 // actual wasm module. To that end this will run module generation over the 161 // input provided. If this is all successful then the custom section 162 // representing the seed is appended to the module, making it a sort of 163 // self-referential module. 164 // 165 // After all this it's copied into `data` if the it fits. If the module 166 // doesn't fit then the seed is left un-perturbed since there's not much 167 // that we can do about that. 
168 let mut u = Unstructured::new(&data[..new_size]); 169 match u 170 .arbitrary() 171 .and_then(|mut config| gen_module(&mut config, &mut u)) 172 { 173 Ok(module) => { 174 let module = encode_module(&module, &data[..new_size]); 175 176 if module.len() < max_size { 177 log::debug!( 178 "successfully generated mutated module with \ 179 appended input section" 180 ); 181 data[..module.len()].copy_from_slice(&module); 182 return module.len(); 183 } else { 184 log::debug!("mutated module doesn't fit in original slice"); 185 } 186 } 187 188 // If our new seed can't generate a new module then that's something 189 // for the fuzzer to figure out later when it "officially" executes 190 // this fuzz input. For the purposes of this function it's not too 191 // useful to try to put it in an envelope otherwise so ignore it. 192 Err(e) => { 193 log::debug!("failed to generate module from mutated seed {e:?}"); 194 } 195 } 196 197 new_size 198 } 199 200 fn encode_module(module: &[u8], fuzz_data: &[u8]) -> Vec<u8> { 201 let mut module = module.to_vec(); 202 wasm_encoder::CustomSection { 203 name: SECTION_NAME.into(), 204 data: Cow::Borrowed(&fuzz_data), 205 } 206 .append_to(&mut module); 207 module 208 } 209 210 struct FuzzInput<'a> { 211 /// The module extracted from the input, without the fuzz input custom 212 /// section. 213 module: &'a [u8], 214 215 /// The contents of the fuzz input custom section. 216 fuzz_data: &'a [u8], 217 } 218 219 /// Attempts to extract a fuzz input from the `data` provided. 220 /// 221 /// This will attempt to read `data` as a WebAssembly binary. If successful 222 /// and the module ends with a custom section indicating it's a fuzz input 223 /// then the contents of the custom section are returned along with the 224 /// contents of the original module. 
225 fn extract_fuzz_input(data: &[u8]) -> anyhow::Result<FuzzInput<'_>> { 226 use wasmparser::{Parser, Payload}; 227 let mut prev_end = 8; 228 for section in Parser::new(0).parse_all(data) { 229 let section = section?; 230 231 // If this is a custom section, the end of the section is the end of 232 // the entire module, and it's got the expected name, then this section 233 // is assumed to be the input seed to the fuzzer. 234 // 235 // The section's contents are returned through `fuzz_data` and the wasm 236 // binary format means that we can simply chop off the last custom 237 // section and still have a valid module. 238 if let Payload::CustomSection(s) = §ion { 239 if s.name() == SECTION_NAME && s.range().end == data.len() { 240 return Ok(FuzzInput { 241 module: &data[..prev_end], 242 fuzz_data: s.data(), 243 }); 244 } 245 } 246 247 // Record each section's end to record what the end of the module is 248 // up to this point. 249 if let Some((_, range)) = section.as_section() { 250 prev_end = range.end; 251 } 252 } 253 anyhow::bail!("no input found") 254 } 255 256 #[cfg(test)] 257 mod tests { 258 use super::*; 259 use rand::rngs::SmallRng; 260 use rand::{RngCore, SeedableRng}; 261 262 #[test] 263 fn changing_configuration_does_not_change_module() { 264 drop(env_logger::try_init()); 265 266 // This test asserts that if the static configuration associated with a 267 // module changes then the generated module, as sourced from the 268 // original fuzz input, does not change. That's the whole purpose of 269 // this module, to enable our fuzz inputs to be in a format that's 270 // resilient to changes in configuration over time (or at least the 271 // module part of the input). 272 // 273 // This test will execute N=200 iterations where each iteration will 274 // attempt to, with some fresh random data, generate a module. This 275 // module is then "mutated" with a noop mutation to effectively 276 // serialize it into the envelope where the module is preserved. 
The 277 // now-mutated input, which should be a wasm module, is then passed 278 // as the seed to a second execution which has a different static input. 279 // 280 // This simulates having a fuzzer one day produce an interesting test 281 // case through mutation, and then the next day the configuration of 282 // the fuzzer changes. On both days the module input to the function 283 // should have been the same. 284 285 let mut rng = SmallRng::seed_from_u64(0); 286 let max_size = 2048; 287 let seed_size = 128; 288 let mut buf = vec![0; max_size]; 289 let mut compares = 0; 290 for _ in 0..200 { 291 rng.fill_bytes(&mut buf[..seed_size]); 292 293 let run1 = run_config::<u32>; 294 let mutate = mutate::<u32>; 295 let run2 = run_config::<(u32, u32)>; 296 297 if let Ok((module, known_valid)) = execute(&buf[..seed_size], run1, gen) { 298 assert!(known_valid); 299 let new_size = mutate(&mut buf, seed_size, max_size, gen, noop_mutate); 300 if let Ok((module2, known_valid)) = execute(&buf[..new_size], run2, gen) { 301 assert!(!known_valid); 302 compares += 1; 303 if module != module2 { 304 panic!("modules differ"); 305 } 306 } 307 } 308 } 309 310 // At least one iteration should have succeeded in the fuzz generation 311 // above. 312 assert!(compares > 0); 313 314 fn run_config<T>( 315 data: &[u8], 316 known_valid: bool, 317 _: T, 318 _: &mut Unstructured<'_>, 319 ) -> Result<(Vec<u8>, bool)> 320 where 321 T: for<'a> Arbitrary<'a>, 322 { 323 Ok((data.to_vec(), known_valid)) 324 } 325 326 fn gen<T>(_: &mut T, u: &mut Unstructured<'_>) -> Result<Vec<u8>> 327 where 328 T: for<'a> Arbitrary<'a>, 329 { 330 Ok(u.arbitrary::<wasm_smith::Module>()?.to_bytes()) 331 } 332 333 fn noop_mutate(_buf: &mut [u8], size: usize, _new_size: usize) -> usize { 334 size 335 } 336 } 337 } 338