1 //! Support for maintaining the usefulness of a corpus over time.
2 //!
//! Wasmtime's fuzzing strategy in general is to use `wasm-smith` to generate
//! modules by interpreting fuzz input from libFuzzer as a sort of "DNA". This
5 //! works to generate pretty interesting modules but falls down over time
6 //! because the DNA to generate the same module over time can change. This
7 //! means that maintaining a corpus for Wasmtime is not the most useful thing
8 //! in the world unfortunately and any historical discoveries of coverage need
9 //! to be rediscovered every time the DNA changes.
10 //!
11 //! To help with this the module here implements a scheme where Wasmtime's fuzz
12 //! inputs are highly likely to be of the form:
13 //!
14 //! ```text
15 //! [ ... wasm module ... ][ .. fuzz custom section .. ]
16 //! ```
17 //!
18 //! The `fuzz custom section` here contains the original fuzz input used to
19 //! generate the `wasm module`, and if the DNA hasn't changed then it should
20 //! still be possible to do that as well. The benefit of this format, though,
21 //! is that if the DNA is changed then the interpretation of the `fuzz custom
22 //! section` will change but the original `wasm module` will not. This enables
23 //! us to populate the corpus, ideally, with a set of interesting `wasm module`
24 //! entries.
25 //!
26 //! Over time the `fuzz custom section` will "bitrot" and will be no longer able
27 //! to generate the original `wasm module`. The main consequence of this is that
//! when the original test case is mutated the wasm module generated from the
//! mutation will be nothing like the original test case's wasm module.
30 //! This means libFuzzer will have to rediscover ways to mutate into
31 //! interesting modules, but we're no worse off than before hopefully.
32 //! Additionally this more easily opens the door to integrate `wasm-mutate` one
33 //! day into mutation here as well.
34 //!
35 //! Currently this is all supported via two methods:
36 //!
37 //! 1. A custom mutator is registered with libfuzzer. This means that all
38 //!    inputs generated by the mutator, so long as they fit, will be the
39 //!    "envelope" format of this module. This means that the corpus will
40 //!    hopefully naturally get populated with wasm files rather than random
41 //!    inputs. Note that this is not guaranteed to succeed since sometimes the
42 //!    buffer to store the fuzz input in the mutator is not big enough to store
43 //!    the final wasm module, in which case a non-enveloped wasm module is
44 //!    stored.
45 //!
//! 2. If the environment variable `WRITE_FUZZ_INPUT_TO` is set then the fuzz
47 //!    input, in its envelope format, will be written to the specified file.
48 //!    This can be useful in case an input is in its binary form or if a
49 //!    preexisting corpus is being rewritten.
50 
51 use std::borrow::Cow;
52 
53 use arbitrary::{Arbitrary, Result, Unstructured};
54 use wasm_encoder::Section;
55 
/// Declares both libFuzzer entry points for a single-module fuzz target.
///
/// The macro takes two identifiers, in order:
///
/// 1. the function that runs a test case, forwarded as the `run` argument of
///    [`execute`](crate::single_module_fuzzer::execute), and
/// 2. the function that generates a module, forwarded as the `gen_module`
///    argument of both `execute` and
///    [`mutate`](crate::single_module_fuzzer::mutate).
///
/// It expands to a `fuzz_target!` that feeds the raw input through `execute`
/// and a `fuzz_mutator!` that delegates to `mutate`, so the two always agree
/// on how a module is generated.
#[macro_export]
macro_rules! single_module_fuzzer {
    ($run:ident $gen_module:ident) => {
        libfuzzer_sys::fuzz_target!(|data: &[u8]| {
            $crate::init_fuzzing();
            // The outcome of a single execution is intentionally discarded
            // here; the result value is simply dropped.
            let _ = $crate::single_module_fuzzer::execute(data, $run, $gen_module);
        });

        libfuzzer_sys::fuzz_mutator!(|data: &mut [u8], size: usize, max_size: usize, seed: u32| {
            // Standard byte-level mutations are delegated to libFuzzer's own
            // mutator; `mutate` wraps that with the envelope handling.
            $crate::single_module_fuzzer::mutate(
                data,
                size,
                max_size,
                $gen_module,
                libfuzzer_sys::fuzzer_mutate,
            )
        });
    };
}
81 
82 /// Executes a "single module fuzzer" given the raw `input` from libfuzzer.
83 ///
84 /// This will use the `input` to generate `T`, some configuration, which is
85 /// then used by `gen_module` to generate a WebAssembly module. The module is
86 /// then passed to `run` along with the configuration and remaining data that
87 /// can be used as fuzz input.
88 ///
89 /// The main purpose of this function is to handle when `input` is actually a
90 /// WebAssembly module "envelope". If the `input` is a valid wasm module and
91 /// ends with a specific trailing custom section then the module generated by
92 /// `gen_module` is actually discarded. The purpose of this is to handle the
93 /// case where the input used to generate a module may change over time but
94 /// we're still interested in the historical coverage of the original wasm
95 /// module.
96 pub fn execute<'a, T, U>(
97     input: &'a [u8],
98     run: fn(&[u8], bool, T, &mut Unstructured<'a>) -> Result<U>,
99     gen_module: fn(&mut T, &mut Unstructured<'a>) -> Result<Vec<u8>>,
100 ) -> Result<U>
101 where
102     T: Arbitrary<'a>,
103 {
104     let (fuzz_data, module_in_input) = match extract_fuzz_input(input) {
105         Ok(input) => {
106             log::debug!("fuzz input was a valid module with trailing custom section");
107             (input.fuzz_data, Some(input.module))
108         }
109         Err(e) => {
110             log::debug!("fuzz input not a valid module: {e:?}");
111             (input, None)
112         }
113     };
114     let mut u = Unstructured::new(fuzz_data);
115     let mut config = u.arbitrary()?;
116     let generated = gen_module(&mut config, &mut u)?;
117     let module = module_in_input.unwrap_or(&generated);
118     if let Ok(file) = std::env::var("WRITE_FUZZ_INPUT_TO") {
119         std::fs::write(file, encode_module(&module, &fuzz_data)).unwrap();
120     }
121     run(module, module_in_input.is_none(), config, &mut u)
122 }
123 
/// Name of the custom section that `encode_module` appends to a module to
/// carry the fuzz data, and that `extract_fuzz_input` looks for as the final
/// section of an input.
const SECTION_NAME: &str = "wasmtime-fuzz-input";
125 
126 /// Implementation of a libfuzzer custom mutator for a single-module-fuzzer.
127 ///
128 /// This mutator will take the seed specified in `data` and attempt to mutate
129 /// it with the provided `mutate` function. The `mutate` function may not
130 /// receive the `data` as-specified, but instead may receive only the seed
131 /// that was used to generate `data`.
132 pub fn mutate<T>(
133     data: &mut [u8],
134     mut size: usize,
135     max_size: usize,
136     gen_module: fn(&mut T, &mut Unstructured<'_>) -> Result<Vec<u8>>,
137     mutate: fn(&mut [u8], usize, usize) -> usize,
138 ) -> usize
139 where
140     T: for<'a> Arbitrary<'a>,
141 {
142     // If `data` is a valid wasm module with the fuzz seed at the end, then
143     // discard the wasm module portion and instead shuffle the seed into the
144     // beginning of the `data` slice. This is the "de-envelope" part of the
145     // seed management here.
146     //
147     // After this the `data` array should contain the raw contents used to
148     // produce the module and is ripe for mutation/minimization/etc.
149     if let Ok(input) = extract_fuzz_input(&data[..size]) {
150         let start = input.fuzz_data.as_ptr() as usize - data.as_ptr() as usize;
151         size = input.fuzz_data.len();
152         data.copy_within(start..start + input.fuzz_data.len(), 0);
153     }
154 
155     // Delegate to the provided mutation function for standard mutations to
156     // apply.
157     let new_size = mutate(data, size, max_size);
158 
159     // Next the goal of this function is to produce a test case which is an
160     // actual wasm module. To that end this will run module generation over the
161     // input provided. If this is all successful then the custom section
162     // representing the seed is appended to the module, making it a sort of
163     // self-referential module.
164     //
165     // After all this it's copied into `data` if the it fits. If the module
166     // doesn't fit then the seed is left un-perturbed since there's not much
167     // that we can do about that.
168     let mut u = Unstructured::new(&data[..new_size]);
169     match u
170         .arbitrary()
171         .and_then(|mut config| gen_module(&mut config, &mut u))
172     {
173         Ok(module) => {
174             let module = encode_module(&module, &data[..new_size]);
175 
176             if module.len() < max_size {
177                 log::debug!(
178                     "successfully generated mutated module with \
179                      appended input section"
180                 );
181                 data[..module.len()].copy_from_slice(&module);
182                 return module.len();
183             } else {
184                 log::debug!("mutated module doesn't fit in original slice");
185             }
186         }
187 
188         // If our new seed can't generate a new module then that's something
189         // for the fuzzer to figure out later when it "officially" executes
190         // this fuzz input. For the purposes of this function it's not too
191         // useful to try to put it in an envelope otherwise so ignore it.
192         Err(e) => {
193             log::debug!("failed to generate module from mutated seed {e:?}");
194         }
195     }
196 
197     new_size
198 }
199 
200 fn encode_module(module: &[u8], fuzz_data: &[u8]) -> Vec<u8> {
201     let mut module = module.to_vec();
202     wasm_encoder::CustomSection {
203         name: SECTION_NAME.into(),
204         data: Cow::Borrowed(&fuzz_data),
205     }
206     .append_to(&mut module);
207     module
208 }
209 
/// The two halves of an enveloped fuzz input, both borrowed from the
/// original input bytes.
struct FuzzInput<'a> {
    /// Payload of the trailing fuzz input custom section.
    fuzz_data: &'a [u8],

    /// The wasm module that precedes the fuzz input custom section, i.e. the
    /// input with that section removed.
    module: &'a [u8],
}
218 
219 /// Attempts to extract a fuzz input from the `data` provided.
220 ///
221 /// This will attempt to read `data` as a WebAssembly binary. If successful
222 /// and the module ends with a custom section indicating it's a fuzz input
223 /// then the contents of the custom section are returned along with the
224 /// contents of the original module.
225 fn extract_fuzz_input(data: &[u8]) -> anyhow::Result<FuzzInput<'_>> {
226     use wasmparser::{Parser, Payload};
227     let mut prev_end = 8;
228     for section in Parser::new(0).parse_all(data) {
229         let section = section?;
230 
231         // If this is a custom section, the end of the section is the end of
232         // the entire module, and it's got the expected name, then this section
233         // is assumed to be the input seed to the fuzzer.
234         //
235         // The section's contents are returned through `fuzz_data` and the wasm
236         // binary format means that we can simply chop off the last custom
237         // section and still have a valid module.
238         if let Payload::CustomSection(s) = &section {
239             if s.name() == SECTION_NAME && s.range().end == data.len() {
240                 return Ok(FuzzInput {
241                     module: &data[..prev_end],
242                     fuzz_data: s.data(),
243                 });
244             }
245         }
246 
247         // Record each section's end to record what the end of the module is
248         // up to this point.
249         if let Some((_, range)) = section.as_section() {
250             prev_end = range.end;
251         }
252     }
253     anyhow::bail!("no input found")
254 }
255 
#[cfg(test)]
mod tests {
    use super::*;
    use rand::rngs::SmallRng;
    use rand::{RngCore, SeedableRng};

    /// Checks that an enveloped module survives a change of configuration.
    ///
    /// The whole point of this module is that fuzz inputs are stored in a
    /// format where the wasm module is resilient to changes in the static
    /// configuration over time. To check that, each of the 200 iterations:
    ///
    /// 1. fills the seed with fresh random data and generates a module from
    ///    it using a `u32` configuration,
    /// 2. "mutates" the seed with a no-op mutation, which effectively
    ///    serializes it into the envelope where the module is preserved, and
    /// 3. executes the enveloped input with a different configuration type,
    ///    `(u32, u32)`, and requires the very same module to come out.
    ///
    /// This simulates a fuzzer producing an interesting test case one day and
    /// the fuzzer's configuration changing the next day: on both days the
    /// module handed to the run function should be the same.
    #[test]
    fn changing_configuration_does_not_change_module() {
        /// Run function which just reports what it was given.
        fn run_config<T>(
            data: &[u8],
            known_valid: bool,
            _: T,
            _: &mut Unstructured<'_>,
        ) -> Result<(Vec<u8>, bool)>
        where
            T: for<'a> Arbitrary<'a>,
        {
            Ok((data.to_vec(), known_valid))
        }

        /// Module generator which ignores the configuration entirely.
        fn generate<T>(_: &mut T, u: &mut Unstructured<'_>) -> Result<Vec<u8>>
        where
            T: for<'a> Arbitrary<'a>,
        {
            let module: wasm_smith::Module = u.arbitrary()?;
            Ok(module.to_bytes())
        }

        /// Mutation which leaves the seed untouched.
        fn noop_mutate(_buf: &mut [u8], size: usize, _new_size: usize) -> usize {
            size
        }

        let _ = env_logger::try_init();

        let max_size = 2048;
        let seed_size = 128;
        let mut rng = SmallRng::seed_from_u64(0);
        let mut buf = vec![0u8; max_size];
        let mut compares = 0;

        for _ in 0..200 {
            rng.fill_bytes(&mut buf[..seed_size]);

            // First day: a raw seed, so the module must be freshly generated.
            let (module, known_valid) =
                match execute(&buf[..seed_size], run_config::<u32>, generate) {
                    Ok(result) => result,
                    Err(_) => continue,
                };
            assert!(known_valid);

            // Wrap the seed in its envelope via a mutation that changes
            // nothing.
            let new_size = mutate::<u32>(&mut buf, seed_size, max_size, generate, noop_mutate);

            // Second day: a different configuration type, and the module must
            // now come from the envelope rather than from generation.
            let (module2, known_valid) =
                match execute(&buf[..new_size], run_config::<(u32, u32)>, generate) {
                    Ok(result) => result,
                    Err(_) => continue,
                };
            assert!(!known_valid);
            compares += 1;
            assert!(module == module2, "modules differ");
        }

        // At least one iteration should have succeeded in the fuzz generation
        // above.
        assert!(compares > 0);
    }
}
338