1 //! Support for maintaining the usefulness of a corpus over time. 2 //! 3 //! Wasmtime's fuzzing strategy in general is to use `wasm-smith` to generate 4 //! modules which interprets fuzz input from libFuzzer as a sort of "DNA". This 5 //! works to generate pretty interesting modules but falls down over time 6 //! because the DNA to generate the same module over time can change. This 7 //! means that maintaining a corpus for Wasmtime is not the most useful thing 8 //! in the world unfortunately and any historical discoveries of coverage need 9 //! to be rediscovered every time the DNA changes. 10 //! 11 //! To help with this the module here implements a scheme where Wasmtime's fuzz 12 //! inputs are highly likely to be of the form: 13 //! 14 //! ```text 15 //! [ ... wasm module ... ][ .. fuzz custom section .. ] 16 //! ``` 17 //! 18 //! The `fuzz custom section` here contains the original fuzz input used to 19 //! generate the `wasm module`, and if the DNA hasn't changed then it should 20 //! still be possible to do that as well. The benefit of this format, though, 21 //! is that if the DNA is changed then the interpretation of the `fuzz custom 22 //! section` will change but the original `wasm module` will not. This enables 23 //! us to populate the corpus, ideally, with a set of interesting `wasm module` 24 //! entries. 25 //! 26 //! Over time the `fuzz custom section` will "bitrot" and will be no longer able 27 //! to generate the original `wasm module`. The main consequence of this is that 28 //! when the original test case is mutated the generated wasm module from the 29 //! mutation will be nothing alike from the original test case's wasm module. 30 //! This means libFuzzer will have to rediscover ways to mutate into 31 //! interesting modules, but we're no worse off than before hopefully. 32 //! Additionally this more easily opens the door to integrate `wasm-mutate` one 33 //! day into mutation here as well. 34 //! 35 //! Currently this is all supported via two methods: 36 //! 37 //! 1. A custom mutator is registered with libfuzzer. This means that all 38 //! inputs generated by the mutator, so long as they fit, will be the 39 //! "envelope" format of this module. This means that the corpus will 40 //! hopefully naturally get populated with wasm files rather than random 41 //! inputs. Note that this is not guaranteed to succeed since sometimes the 42 //! buffer to store the fuzz input in the mutator is not big enough to store 43 //! the final wasm module, in which case a non-enveloped wasm module is 44 //! stored. 45 //! 46 //! 2. If the environment variable `WRITE_FUZZ_INPUT_TO is set then the fuzz 47 //! input, in its envelope format, will be written to the specified file. 48 //! This can be useful in case an input is in its binary form or if a 49 //! preexisting corpus is being rewritten. 50 51 use std::borrow::Cow; 52 53 use arbitrary::{Arbitrary, Result, Unstructured}; 54 use wasm_encoder::Section; 55 56 /// Helper macro for fuzz targets that are single-module fuzzers. 57 /// 58 /// This combines the features of this module into one macro invocation to 59 /// generate the fuzz entry point and mutator in tandem. 60 #[macro_export] 61 macro_rules! single_module_fuzzer { 62 ($execute:ident $generate:ident) => { 63 libfuzzer_sys::fuzz_target!(|data: &[u8]| { 64 $crate::init_fuzzing(); 65 drop($crate::single_module_fuzzer::execute( 66 data, $execute, $generate, 67 )); 68 }); 69 70 libfuzzer_sys::fuzz_mutator!(|data: &mut [u8], size: usize, max_size: usize, seed: u32| { 71 $crate::single_module_fuzzer::mutate( 72 data, 73 size, 74 max_size, 75 $generate, 76 libfuzzer_sys::fuzzer_mutate, 77 ) 78 }); 79 }; 80 } 81 82 /// Executes a "single module fuzzer" given the raw `input` from libfuzzer. 83 /// 84 /// This will use the `input` to generate `T`, some configuration, which is 85 /// then used by `gen_module` to generate a WebAssembly module. The module is 86 /// then passed to `run` along with the configuration and remaining data that 87 /// can be used as fuzz input. 88 /// 89 /// The main purpose of this function is to handle when `input` is actually a 90 /// WebAssembly module "envelope". If the `input` is a valid wasm module and 91 /// ends with a specific trailing custom section then the module generated by 92 /// `gen_module` is actually discarded. The purpose of this is to handle the 93 /// case where the input used to generate a module may change over time but 94 /// we're still interested in the historical coverage of the original wasm 95 /// module. 96 pub fn execute<'a, T, U>( 97 input: &'a [u8], 98 run: fn(&[u8], KnownValid, T, &mut Unstructured<'a>) -> Result<U>, 99 gen_module: fn(&mut T, &mut Unstructured<'a>) -> Result<(Vec<u8>, KnownValid)>, 100 ) -> Result<U> 101 where 102 T: Arbitrary<'a>, 103 { 104 let (fuzz_data, module_in_input) = match extract_fuzz_input(input) { 105 Ok(input) => { 106 log::debug!("fuzz input was a valid module with trailing custom section"); 107 (input.fuzz_data, Some(input.module)) 108 } 109 Err(e) => { 110 log::debug!("fuzz input not a valid module: {e:?}"); 111 (input, None) 112 } 113 }; 114 let mut u = Unstructured::new(fuzz_data); 115 let mut config = u.arbitrary()?; 116 let (generated, known_valid) = gen_module(&mut config, &mut u)?; 117 let module = module_in_input.unwrap_or(&generated); 118 if let Ok(file) = std::env::var("WRITE_FUZZ_INPUT_TO") { 119 std::fs::write(file, encode_module(&module, &fuzz_data)).unwrap(); 120 } 121 let known_valid = if module_in_input.is_some() { 122 KnownValid::No 123 } else { 124 known_valid 125 }; 126 run(module, known_valid, config, &mut u) 127 } 128 129 /// Used as part of `execute` above to determine whether a module is known to 130 /// be valid ahead of time. 131 #[derive(Debug, PartialEq, Eq, Copy, Clone)] 132 pub enum KnownValid { 133 /// This module is known to be valid so it should assert compilation 134 /// succeeds for example. 135 Yes, 136 /// This module is not known to be valid and it may not compile 137 /// successfully. Note that it's also not known to compile unsuccessfully. 138 No, 139 } 140 141 const SECTION_NAME: &str = "wasmtime-fuzz-input"; 142 143 /// Implementation of a libfuzzer custom mutator for a single-module-fuzzer. 144 /// 145 /// This mutator will take the seed specified in `data` and attempt to mutate 146 /// it with the provided `mutate` function. The `mutate` function may not 147 /// receive the `data` as-specified, but instead may receive only the seed 148 /// that was used to generate `data`. 149 pub fn mutate<T>( 150 data: &mut [u8], 151 mut size: usize, 152 max_size: usize, 153 gen_module: fn(&mut T, &mut Unstructured<'_>) -> Result<(Vec<u8>, KnownValid)>, 154 mutate: fn(&mut [u8], usize, usize) -> usize, 155 ) -> usize 156 where 157 T: for<'a> Arbitrary<'a>, 158 { 159 // If `data` is a valid wasm module with the fuzz seed at the end, then 160 // discard the wasm module portion and instead shuffle the seed into the 161 // beginning of the `data` slice. This is the "de-envelope" part of the 162 // seed management here. 163 // 164 // After this the `data` array should contain the raw contents used to 165 // produce the module and is ripe for mutation/minimization/etc. 166 if let Ok(input) = extract_fuzz_input(&data[..size]) { 167 let start = input.fuzz_data.as_ptr() as usize - data.as_ptr() as usize; 168 size = input.fuzz_data.len(); 169 data.copy_within(start..start + input.fuzz_data.len(), 0); 170 } 171 172 // Delegate to the provided mutation function for standard mutations to 173 // apply. 174 let new_size = mutate(data, size, max_size); 175 176 // Next the goal of this function is to produce a test case which is an 177 // actual wasm module. To that end this will run module generation over the 178 // input provided. If this is all successful then the custom section 179 // representing the seed is appended to the module, making it a sort of 180 // self-referential module. 181 // 182 // After all this it's copied into `data` if the it fits. If the module 183 // doesn't fit then the seed is left un-perturbed since there's not much 184 // that we can do about that. 185 let mut u = Unstructured::new(&data[..new_size]); 186 match u 187 .arbitrary() 188 .and_then(|mut config| gen_module(&mut config, &mut u)) 189 { 190 Ok((module, _known_valid)) => { 191 let module = encode_module(&module, &data[..new_size]); 192 193 if module.len() < max_size { 194 log::debug!( 195 "successfully generated mutated module with \ 196 appended input section" 197 ); 198 data[..module.len()].copy_from_slice(&module); 199 return module.len(); 200 } else { 201 log::debug!("mutated module doesn't fit in original slice"); 202 } 203 } 204 205 // If our new seed can't generate a new module then that's something 206 // for the fuzzer to figure out later when it "officially" executes 207 // this fuzz input. For the purposes of this function it's not too 208 // useful to try to put it in an envelope otherwise so ignore it. 209 Err(e) => { 210 log::debug!("failed to generate module from mutated seed {e:?}"); 211 } 212 } 213 214 new_size 215 } 216 217 fn encode_module(module: &[u8], fuzz_data: &[u8]) -> Vec<u8> { 218 let mut module = module.to_vec(); 219 wasm_encoder::CustomSection { 220 name: SECTION_NAME.into(), 221 data: Cow::Borrowed(&fuzz_data), 222 } 223 .append_to(&mut module); 224 module 225 } 226 227 struct FuzzInput<'a> { 228 /// The module extracted from the input, without the fuzz input custom 229 /// section. 230 module: &'a [u8], 231 232 /// The contents of the fuzz input custom section. 233 fuzz_data: &'a [u8], 234 } 235 236 /// Attempts to extract a fuzz input from the `data` provided. 237 /// 238 /// This will attempt to read `data` as a WebAssembly binary. If successful 239 /// and the module ends with a custom section indicating it's a fuzz input 240 /// then the contents of the custom section are returned along with the 241 /// contents of the original module. 242 fn extract_fuzz_input(data: &[u8]) -> wasmtime::Result<FuzzInput<'_>> { 243 use wasmparser::{Parser, Payload}; 244 let mut prev_end = 8; 245 for section in Parser::new(0).parse_all(data) { 246 let section = section?; 247 248 // If this is a custom section, the end of the section is the end of 249 // the entire module, and it's got the expected name, then this section 250 // is assumed to be the input seed to the fuzzer. 251 // 252 // The section's contents are returned through `fuzz_data` and the wasm 253 // binary format means that we can simply chop off the last custom 254 // section and still have a valid module. 255 if let Payload::CustomSection(s) = §ion { 256 if s.name() == SECTION_NAME && s.range().end == data.len() { 257 return Ok(FuzzInput { 258 module: &data[..prev_end], 259 fuzz_data: s.data(), 260 }); 261 } 262 } 263 264 // Record each section's end to record what the end of the module is 265 // up to this point. 266 if let Some((_, range)) = section.as_section() { 267 prev_end = range.end; 268 } 269 } 270 wasmtime::bail!("no input found") 271 } 272 273 #[cfg(test)] 274 mod tests { 275 use super::*; 276 use rand::rngs::SmallRng; 277 use rand::{RngCore, SeedableRng}; 278 279 #[test] 280 fn changing_configuration_does_not_change_module() { 281 drop(env_logger::try_init()); 282 283 // This test asserts that if the static configuration associated with a 284 // module changes then the generated module, as sourced from the 285 // original fuzz input, does not change. That's the whole purpose of 286 // this module, to enable our fuzz inputs to be in a format that's 287 // resilient to changes in configuration over time (or at least the 288 // module part of the input). 289 // 290 // This test will execute N=200 iterations where each iteration will 291 // attempt to, with some fresh random data, generate a module. This 292 // module is then "mutated" with a noop mutation to effectively 293 // serialize it into the envelope where the module is preserved. The 294 // now-mutated input, which should be a wasm module, is then passed 295 // as the seed to a second execution which has a different static input. 296 // 297 // This simulates having a fuzzer one day produce an interesting test 298 // case through mutation, and then the next day the configuration of 299 // the fuzzer changes. On both days the module input to the function 300 // should have been the same. 301 302 let mut rng = SmallRng::seed_from_u64(0); 303 let max_size = 4096; 304 let seed_size = 128; 305 let mut buf = vec![0; max_size]; 306 let mut compares = 0; 307 for _ in 0..200 { 308 rng.fill_bytes(&mut buf[..seed_size]); 309 310 let run1 = run_config::<u32>; 311 let mutate = mutate::<u32>; 312 let run2 = run_config::<(u32, u32)>; 313 314 if let Ok((module, known_valid)) = execute(&buf[..seed_size], run1, generate) { 315 assert_eq!(known_valid, KnownValid::Yes); 316 let new_size = mutate(&mut buf, seed_size, max_size, generate, noop_mutate); 317 if let Ok((module2, known_valid)) = execute(&buf[..new_size], run2, generate) { 318 assert_eq!(known_valid, KnownValid::No); 319 compares += 1; 320 if module != module2 { 321 panic!("modules differ"); 322 } 323 } 324 } 325 } 326 327 // At least one iteration should have succeeded in the fuzz generation 328 // above. 329 assert!(compares > 0); 330 331 fn run_config<T>( 332 data: &[u8], 333 known_valid: KnownValid, 334 _: T, 335 _: &mut Unstructured<'_>, 336 ) -> Result<(Vec<u8>, KnownValid)> 337 where 338 T: for<'a> Arbitrary<'a>, 339 { 340 Ok((data.to_vec(), known_valid)) 341 } 342 343 fn generate<T>(_: &mut T, u: &mut Unstructured<'_>) -> Result<(Vec<u8>, KnownValid)> 344 where 345 T: for<'a> Arbitrary<'a>, 346 { 347 Ok(( 348 u.arbitrary::<wasm_smith::Module>()?.to_bytes(), 349 KnownValid::Yes, 350 )) 351 } 352 353 fn noop_mutate(_buf: &mut [u8], size: usize, _new_size: usize) -> usize { 354 size 355 } 356 } 357 } 358