1 //! Implements a memory pool using a single allocated memory slab.
2 //!
3 //! The pooling instance allocator maps one large slab of memory in advance and
4 //! allocates WebAssembly memories from this slab--a [`MemoryPool`]. Each
5 //! WebAssembly memory is allocated in its own slot (see uses of `index` and
6 //! [`SlotId`] in this module):
7 //!
8 //! ```text
9 //! ┌──────┬──────┬──────┬──────┬──────┐
10 //! │Slot 0│Slot 1│Slot 2│Slot 3│......│
11 //! └──────┴──────┴──────┴──────┴──────┘
12 //! ```
13 //!
14 //! Diving deeper, we note that a [`MemoryPool`] protects Wasmtime from
15 //! out-of-bounds memory accesses by inserting inaccessible guard regions
16 //! between memory slots. These guard regions are configured to raise a signal
17 //! if they are accessed--a WebAssembly out-of-bounds (OOB) memory access. The
18 //! [`MemoryPool`] documentation has a more detailed chart but one can think of
19 //! memory slots being laid out like the following:
20 //!
21 //! ```text
22 //! ┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
23 //! │Guard│Mem 0│Guard│Mem 1│Guard│Mem 2│.....│Guard│
24 //! └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
25 //! ```
26 //!
27 //! But we can be more efficient about guard regions: with memory protection
28 //! keys (MPK) enabled, the interleaved guard regions can be smaller. If we
29 //! surround a memory with memories from other instances and each instance is
30 //! protected by different protection keys, the guard region can be smaller AND
31 //! the pool will still raise a signal on an OOB access. This complicates how we
32 //! lay out memory slots: we must store memories from the same instance in the
33 //! same "stripe". Each stripe is protected by a different protection key.
34 //!
35 //! This concept, dubbed [ColorGuard] in the original paper, relies on careful
36 //! calculation of the memory sizes to prevent any "overlapping access" (see
37 //! [`calculate`]): there are limited protection keys available (15) so the next
38 //! memory using the same key must be at least as far away as the guard region
39 //! we would insert otherwise. This ends up looking like the following, where a
40 //! store for instance 0 (`I0`) "stripes" two memories (`M0` and `M1`) with the
41 //! same protection key 1 and far enough apart to signal an OOB access:
42 //!
43 //! ```text
44 //! ┌─────┬─────┬─────┬─────┬────────────────┬─────┬─────┬─────┐
45 //! │.....│I0:M1│.....│.....│.<enough slots>.│I0:M2│.....│.....│
46 //! ├─────┼─────┼─────┼─────┼────────────────┼─────┼─────┼─────┤
47 //! │.....│key 1│key 2│key 3│..<more keys>...│key 1│key 2│.....│
48 //! └─────┴─────┴─────┴─────┴────────────────┴─────┴─────┴─────┘
49 //! ```
50 //!
51 //! [ColorGuard]: https://plas2022.github.io/files/pdf/SegueColorGuard.pdf
52 
53 use super::{
54     MemoryAllocationIndex,
55     index_allocator::{MemoryInModule, ModuleAffinityIndexAllocator, SlotId},
56 };
57 use crate::prelude::*;
58 use crate::runtime::vm::{
59     CompiledModuleId, InstanceAllocationRequest, InstanceLimits, Memory, MemoryBase,
60     MemoryImageSlot, Mmap, MmapOffset, PoolingInstanceAllocatorConfig, mmap::AlignedLength,
61 };
62 use crate::{
63     Enabled,
64     runtime::vm::mpk::{self, ProtectionKey, ProtectionMask},
65     vm::HostAlignedByteCount,
66 };
67 use std::mem;
68 use std::sync::atomic::{AtomicUsize, Ordering};
69 use std::sync::{Arc, Mutex};
70 use wasmtime_environ::{DefinedMemoryIndex, Module, Tunables};
71 
72 /// A set of allocator slots.
73 ///
74 /// The allocated slots can be split by striping them: e.g., with two stripe
75 /// colors 0 and 1, we would allocate all even slots using stripe 0 and all odd
76 /// slots using stripe 1.
77 ///
78 /// This is helpful for the use of protection keys: (a) if a request comes to
79 /// allocate multiple instances, we can allocate them all from the same stripe
80 /// and (b) if a store wants to allocate more from the same stripe it can.
81 #[derive(Debug)]
82 struct Stripe {
83     allocator: ModuleAffinityIndexAllocator,
84     pkey: Option<ProtectionKey>,
85 }
86 
87 /// Represents a pool of WebAssembly linear memories.
88 ///
89 /// A linear memory is divided into accessible pages and guard pages. A memory
90 /// pool contains linear memories: each memory occupies a slot in an
91 /// allocated slab (i.e., `mapping`):
92 ///
93 /// ```text
94 ///          layout.max_memory_bytes                 layout.slot_bytes
95 ///                    |                                   |
96 ///              ◄─────┴────►                  ◄───────────┴──────────►
97 /// ┌───────────┬────────────┬───────────┐     ┌───────────┬───────────┬───────────┐
98 /// | PROT_NONE |            | PROT_NONE | ... |           | PROT_NONE | PROT_NONE |
99 /// └───────────┴────────────┴───────────┘     └───────────┴───────────┴───────────┘
100 /// |           |◄──────────────────┬─────────────────────────────────► ◄────┬────►
101 /// |           |                   |                                        |
102 /// mapping     |            `layout.num_slots` memories         layout.post_slab_guard_size
103 ///             |
104 ///   layout.pre_slab_guard_size
105 /// ```
106 #[derive(Debug)]
107 pub struct MemoryPool {
108     mapping: Arc<Mmap<AlignedLength>>,
109     /// This memory pool is stripe-aware. If using  memory protection keys, this
110     /// will contain one stripe per available key; otherwise, a single stripe
111     /// with an empty key.
112     stripes: Vec<Stripe>,
113 
114     /// If using a copy-on-write allocation scheme, the slot management. We
115     /// dynamically transfer ownership of a slot to a Memory when in use.
116     image_slots: Vec<Mutex<ImageSlot>>,
117 
118     /// A description of the various memory sizes used in allocating the
119     /// `mapping` slab.
120     layout: SlabLayout,
121 
122     /// The maximum number of memories that a single core module instance may
123     /// use.
124     ///
125     /// NB: this is needed for validation but does not affect the pool's size.
126     memories_per_instance: usize,
127 
128     /// How much linear memory, in bytes, to keep resident after resetting for
129     /// use with the next instance. This much memory will be `memset` to zero
130     /// when a linear memory is deallocated.
131     ///
132     /// Memory exceeding this amount in the wasm linear memory will be released
133     /// with `madvise` back to the kernel.
134     ///
135     /// Only applicable on Linux.
136     pub(super) keep_resident: HostAlignedByteCount,
137 
138     /// Keep track of protection keys handed out to initialized stores; this
139     /// allows us to round-robin the assignment of stores to stripes.
140     next_available_pkey: AtomicUsize,
141 }
142 
143 /// The state of memory for each slot in this pool.
144 #[derive(Debug)]
145 enum ImageSlot {
146     /// This slot is guaranteed to be entirely unmapped.
147     ///
148     /// This is the initial state of all slots.
149     Unmapped,
150 
151     /// The state of this slot is unknown.
152     ///
153     /// This encompasses a number of situations such as:
154     ///
155     /// * The slot is currently in use.
156     /// * The slot was attempted to be in use, but allocation failed.
157     /// * The slot was used but not deallocated properly.
158     ///
159     /// All of these situations are lumped into this one variant indicating
160     /// that, at a base level, no knowledge is known about this slot. Using a
161     /// slot in this state first requires resetting all memory in this slot by
162     /// mapping anonymous memory on top of the entire slot.
163     Unknown,
164 
165     /// This slot was previously used and `MemoryImageSlot` maintains the state
166     /// about what this slot was last configured as.
167     ///
168     /// Future use of this slot will use `MemoryImageSlot` to continue to
169     /// re-instantiate and reuse images and such. This state is entered after
170     /// and allocated slot is successfully deallocated.
171     PreviouslyUsed(MemoryImageSlot),
172 }
173 
174 impl MemoryPool {
175     /// Create a new `MemoryPool`.
new(config: &PoolingInstanceAllocatorConfig, tunables: &Tunables) -> Result<Self>176     pub fn new(config: &PoolingInstanceAllocatorConfig, tunables: &Tunables) -> Result<Self> {
177         if u64::try_from(config.limits.max_memory_size).unwrap() > tunables.memory_reservation {
178             bail!(
179                 "maximum memory size of {:#x} bytes exceeds the configured \
180                  memory reservation of {:#x} bytes",
181                 config.limits.max_memory_size,
182                 tunables.memory_reservation
183             );
184         }
185         let pkeys = match config.memory_protection_keys {
186             Enabled::Auto => {
187                 if mpk::is_supported() {
188                     mpk::keys(config.max_memory_protection_keys)
189                 } else {
190                     &[]
191                 }
192             }
193             Enabled::Yes => {
194                 if mpk::is_supported() {
195                     mpk::keys(config.max_memory_protection_keys)
196                 } else {
197                     bail!("mpk is disabled on this system")
198                 }
199             }
200             Enabled::No => &[],
201         };
202 
203         // This is a tricky bit of global state: when creating a memory pool
204         // that uses memory protection keys, we ensure here that any host code
205         // will have access to all keys (i.e., stripes). It's only when we enter
206         // the WebAssembly guest code (see `StoreInner::call_hook`) that we
207         // enforce which keys/stripes can be accessed. Be forewarned about the
208         // assumptions here:
209         // - we expect this "allow all" configuration to reset the default
210         //   process state (only allow key 0) _before_ any memories are accessed
211         // - and we expect no other code (e.g., host-side code) to modify this
212         //   global MPK configuration
213         if !pkeys.is_empty() {
214             mpk::allow(ProtectionMask::all());
215         }
216 
217         // Create a slab layout and allocate it as a completely inaccessible
218         // region to start--`PROT_NONE`.
219         let constraints = SlabConstraints::new(&config.limits, tunables, pkeys.len())?;
220         let layout = calculate(&constraints)?;
221         log::debug!(
222             "creating memory pool: {constraints:?} -> {layout:?} (total: {})",
223             layout.total_slab_bytes()?
224         );
225         let mut mapping =
226             Mmap::accessible_reserved(HostAlignedByteCount::ZERO, layout.total_slab_bytes()?)
227                 .context("failed to create memory pool mapping")?;
228 
229         // Then, stripe the memory with the available protection keys. This is
230         // unnecessary if there is only one stripe color.
231         if layout.num_stripes >= 2 {
232             let mut cursor = layout.pre_slab_guard_bytes;
233             let pkeys = &pkeys[..layout.num_stripes];
234             for i in 0..constraints.num_slots {
235                 let pkey = &pkeys[i % pkeys.len()];
236                 let region = unsafe {
237                     mapping.slice_mut(
238                         cursor.byte_count()..cursor.byte_count() + layout.slot_bytes.byte_count(),
239                     )
240                 };
241                 pkey.protect(region)?;
242                 cursor = cursor
243                     .checked_add(layout.slot_bytes)
244                     .context("cursor + slot_bytes overflows")?;
245             }
246             debug_assert_eq!(
247                 cursor
248                     .checked_add(layout.post_slab_guard_bytes)
249                     .context("cursor + post_slab_guard_bytes overflows")?,
250                 layout.total_slab_bytes()?
251             );
252         }
253 
254         let image_slots: Vec<_> = std::iter::repeat_with(|| Mutex::new(ImageSlot::Unmapped))
255             .take(constraints.num_slots)
256             .collect();
257 
258         let create_stripe = |i| {
259             let num_slots = constraints.num_slots / layout.num_stripes
260                 + usize::from(constraints.num_slots % layout.num_stripes > i);
261             let allocator = ModuleAffinityIndexAllocator::new(
262                 num_slots.try_into().unwrap(),
263                 config.max_unused_warm_slots,
264             );
265             Stripe {
266                 allocator,
267                 pkey: pkeys.get(i).cloned(),
268             }
269         };
270 
271         debug_assert!(layout.num_stripes > 0);
272         let stripes: Vec<_> = (0..layout.num_stripes).map(create_stripe).collect();
273 
274         let pool = Self {
275             stripes,
276             mapping: Arc::new(mapping),
277             image_slots,
278             layout,
279             memories_per_instance: usize::try_from(config.limits.max_memories_per_module).unwrap(),
280             keep_resident: HostAlignedByteCount::new_rounded_up(
281                 config.linear_memory_keep_resident,
282             )?,
283             next_available_pkey: AtomicUsize::new(0),
284         };
285 
286         Ok(pool)
287     }
288 
289     /// Return a protection key that stores can use for requesting new
next_available_pkey(&self) -> Option<ProtectionKey>290     pub fn next_available_pkey(&self) -> Option<ProtectionKey> {
291         let index = self.next_available_pkey.fetch_add(1, Ordering::SeqCst) % self.stripes.len();
292         debug_assert!(
293             self.stripes.len() < 2 || self.stripes[index].pkey.is_some(),
294             "if we are using stripes, we cannot have an empty protection key"
295         );
296         self.stripes[index].pkey
297     }
298 
299     /// Validate whether this memory pool supports the given module.
validate_memories(&self, module: &Module) -> Result<()>300     pub fn validate_memories(&self, module: &Module) -> Result<()> {
301         let memories = module.num_defined_memories();
302         if memories > self.memories_per_instance {
303             bail!(
304                 "defined memories count of {} exceeds the per-instance limit of {}",
305                 memories,
306                 self.memories_per_instance,
307             );
308         }
309 
310         for (i, memory) in module.memories.iter().skip(module.num_imported_memories) {
311             self.validate_memory(memory).with_context(|| {
312                 format!(
313                     "memory index {} is unsupported in this pooling allocator configuration",
314                     i.as_u32()
315                 )
316             })?;
317         }
318         Ok(())
319     }
320 
321     /// Validate one memory for this pool.
validate_memory(&self, memory: &wasmtime_environ::Memory) -> Result<()>322     pub fn validate_memory(&self, memory: &wasmtime_environ::Memory) -> Result<()> {
323         let min = memory.minimum_byte_size().with_context(|| {
324             format!("memory has a minimum byte size that cannot be represented in a u64",)
325         })?;
326         if min > u64::try_from(self.layout.max_memory_bytes.byte_count()).unwrap() {
327             bail!(
328                 "memory has a minimum byte size of {} which exceeds the limit of {} bytes",
329                 min,
330                 self.layout.max_memory_bytes,
331             );
332         }
333         if memory.shared {
334             // FIXME(#4244): since the pooling allocator owns the memory
335             // allocation (which is torn down with the instance), that
336             // can't be used with shared memory where threads or the host
337             // might persist the memory beyond the lifetime of the instance
338             // itself.
339             bail!("memory is shared which is not supported in the pooling allocator");
340         }
341         Ok(())
342     }
343 
344     /// Are zero slots in use right now?
is_empty(&self) -> bool345     pub fn is_empty(&self) -> bool {
346         self.stripes.iter().all(|s| s.allocator.is_empty())
347     }
348 
349     /// Allocate a single memory for the given instance allocation request.
allocate( &self, request: &mut InstanceAllocationRequest<'_, '_>, ty: &wasmtime_environ::Memory, memory_index: Option<DefinedMemoryIndex>, ) -> Result<(MemoryAllocationIndex, Memory)>350     pub async fn allocate(
351         &self,
352         request: &mut InstanceAllocationRequest<'_, '_>,
353         ty: &wasmtime_environ::Memory,
354         memory_index: Option<DefinedMemoryIndex>,
355     ) -> Result<(MemoryAllocationIndex, Memory)> {
356         let tunables = request.store.engine().tunables();
357         let stripe_index = if let Some(pkey) = request.store.get_pkey() {
358             pkey.as_stripe()
359         } else {
360             debug_assert!(self.stripes.len() < 2);
361             0
362         };
363 
364         let striped_allocation_index = self.stripes[stripe_index]
365             .allocator
366             .alloc(memory_index.and_then(|mem_idx| {
367                 request
368                     .runtime_info
369                     .unique_id()
370                     .map(|id| MemoryInModule(id, mem_idx))
371             }))
372             .map(|slot| StripedAllocationIndex(u32::try_from(slot.index()).unwrap()))
373             .ok_or_else(|| {
374                 super::PoolConcurrencyLimitError::new(
375                     self.stripes[stripe_index].allocator.len(),
376                     format!("memory stripe {stripe_index}"),
377                 )
378             })?;
379         let mut guard = DeallocateIndexGuard {
380             pool: self,
381             stripe_index,
382             striped_allocation_index,
383             active: true,
384         };
385 
386         let allocation_index =
387             striped_allocation_index.as_unstriped_slot_index(stripe_index, self.stripes.len());
388 
389         // Double-check that the runtime requirements of the memory are
390         // satisfied by the configuration of this pooling allocator. This
391         // should be returned as an error through `validate_memory_plans`
392         // but double-check here to be sure.
393         assert!(
394             tunables.memory_reservation + tunables.memory_guard_size
395                 <= u64::try_from(self.layout.bytes_to_next_stripe_slot().byte_count()).unwrap()
396         );
397 
398         let base = self.get_base(allocation_index);
399         let base_capacity = self.layout.max_memory_bytes;
400 
401         let mut slot = self.take_memory_image_slot(allocation_index)?;
402         let image = match memory_index {
403             Some(memory_index) => request.runtime_info.memory_image(memory_index)?,
404             None => None,
405         };
406         let initial_size = ty
407             .minimum_byte_size()
408             .expect("min size checked in validation");
409 
410         // If instantiation fails, we can propagate the error
411         // upward and drop the slot. This will cause the Drop
412         // handler to attempt to map the range with PROT_NONE
413         // memory, to reserve the space while releasing any
414         // stale mappings. The next use of this slot will then
415         // create a new slot that will try to map over
416         // this, returning errors as well if the mapping
417         // errors persist. The unmap-on-drop is best effort;
418         // if it fails, then we can still soundly continue
419         // using the rest of the pool and allowing the rest of
420         // the process to continue, because we never perform a
421         // mmap that would leave an open space for someone
422         // else to come in and map something.
423         let initial_size = usize::try_from(initial_size).unwrap();
424         slot.instantiate(initial_size, image, ty, tunables)?;
425 
426         let memory = Memory::new_static(
427             ty,
428             tunables,
429             MemoryBase::Mmap(base),
430             base_capacity.byte_count(),
431             slot,
432             request.limiter.as_deref_mut(),
433         )
434         .await?;
435         guard.active = false;
436         return Ok((allocation_index, memory));
437 
438         struct DeallocateIndexGuard<'a> {
439             pool: &'a MemoryPool,
440             stripe_index: usize,
441             striped_allocation_index: StripedAllocationIndex,
442             active: bool,
443         }
444 
445         impl Drop for DeallocateIndexGuard<'_> {
446             fn drop(&mut self) {
447                 if !self.active {
448                     return;
449                 }
450                 self.pool.stripes[self.stripe_index]
451                     .allocator
452                     .free(SlotId(self.striped_allocation_index.0), 0);
453             }
454         }
455     }
456 
457     /// Deallocate a previously-allocated memory.
458     ///
459     /// If `image` is `None` then the state of this memory's slot is left
460     /// unknown. Otherwise `image` is used to retain information about the state
461     /// of this slot.
462     ///
463     /// # Safety
464     ///
465     /// The memory must have been previously allocated from this pool and
466     /// assigned the given index, must currently be in an allocated state, and
467     /// must never be used again.
468     ///
469     /// The caller must have already called `clear_and_remain_ready` on the
470     /// memory's image and flushed any enqueued decommits for this memory. Note
471     /// that if `image` is `None` then this is not required.
deallocate( &self, allocation_index: MemoryAllocationIndex, image: Option<MemoryImageSlot>, bytes_resident: usize, )472     pub unsafe fn deallocate(
473         &self,
474         allocation_index: MemoryAllocationIndex,
475         image: Option<MemoryImageSlot>,
476         bytes_resident: usize,
477     ) {
478         self.return_memory_image_slot(allocation_index, image);
479 
480         let (stripe_index, striped_allocation_index) =
481             StripedAllocationIndex::from_unstriped_slot_index(allocation_index, self.stripes.len());
482         self.stripes[stripe_index]
483             .allocator
484             .free(SlotId(striped_allocation_index.0), bytes_resident);
485     }
486 
487     /// Purging everything related to `module`.
purge_module(&self, module: CompiledModuleId)488     pub fn purge_module(&self, module: CompiledModuleId) {
489         // This primarily means clearing out all of its memory images present in
490         // the virtual address space. Go through the index allocator for slots
491         // affine to `module` and reset them, freeing up the index when we're
492         // done.
493         //
494         // Note that this is only called when the specified `module` won't be
495         // allocated further (the module is being dropped) so this shouldn't hit
496         // any sort of infinite loop since this should be the final operation
497         // working with `module`.
498         //
499         // TODO: We are given a module id, but key affinity by pair of module id
500         // and defined memory index. We are missing any defined memory index or
501         // count of how many memories the module defines here. Therefore, we
502         // probe up to the maximum number of memories per instance. This is fine
503         // because that maximum is generally relatively small. If this method
504         // somehow ever gets hot because of unnecessary probing, we should
505         // either pass in the actual number of defined memories for the given
506         // module to this method, or keep a side table of all slots that are
507         // associated with a module (not just module and memory). The latter
508         // would require care to make sure that its maintenance wouldn't be too
509         // expensive for normal allocation/free operations.
510         for (stripe_index, stripe) in self.stripes.iter().enumerate() {
511             for i in 0..self.memories_per_instance {
512                 use wasmtime_environ::EntityRef;
513                 let memory_index = DefinedMemoryIndex::new(i);
514                 while let Some(id) = stripe
515                     .allocator
516                     .alloc_affine_and_clear_affinity(module, memory_index)
517                 {
518                     // Attempt to acquire the `MemoryImageSlot` state for this
519                     // slot, and then if we have that try to remove the image,
520                     // and then if all that succeeds put the slot back in.
521                     //
522                     // If anything fails then the slot will be in an "unknown"
523                     // state which means that on next use it'll be remapped with
524                     // anonymous memory.
525                     let index = StripedAllocationIndex(id.0)
526                         .as_unstriped_slot_index(stripe_index, self.stripes.len());
527                     if let Ok(mut slot) = self.take_memory_image_slot(index) {
528                         if slot.remove_image().is_ok() {
529                             self.return_memory_image_slot(index, Some(slot));
530                         }
531                     }
532 
533                     stripe.allocator.free(id, 0);
534                 }
535             }
536         }
537     }
538 
get_base(&self, allocation_index: MemoryAllocationIndex) -> MmapOffset539     fn get_base(&self, allocation_index: MemoryAllocationIndex) -> MmapOffset {
540         assert!(allocation_index.index() < self.layout.num_slots);
541         let offset = self
542             .layout
543             .slot_bytes
544             .checked_mul(allocation_index.index())
545             .and_then(|c| c.checked_add(self.layout.pre_slab_guard_bytes))
546             .expect("slot_bytes * index + pre_slab_guard_bytes overflows");
547         self.mapping.offset(offset).expect("offset is in bounds")
548     }
549 
550     /// Take ownership of the given image slot.
551     ///
552     /// This method is used when a `MemoryAllocationIndex` has been allocated
553     /// and the state of the slot needs to be acquired. This will lazily
554     /// allocate a `MemoryImageSlot` which describes the current (and possibly
555     /// prior) state of the slot.
556     ///
557     /// During deallocation this structure is passed back to
558     /// `return_memory_image_slot`.
559     ///
560     /// Note that this is a fallible method because using a slot might require
561     /// resetting the memory that was previously there. This reset operation
562     /// is a fallible operation that may not succeed. If it fails then this
563     /// slot cannot be used at this time.
take_memory_image_slot( &self, allocation_index: MemoryAllocationIndex, ) -> Result<MemoryImageSlot>564     fn take_memory_image_slot(
565         &self,
566         allocation_index: MemoryAllocationIndex,
567     ) -> Result<MemoryImageSlot> {
568         let (maybe_slot, needs_reset) = {
569             let mut slot = self.image_slots[allocation_index.index()].lock().unwrap();
570             match mem::replace(&mut *slot, ImageSlot::Unknown) {
571                 ImageSlot::Unmapped => (None, false),
572                 ImageSlot::Unknown => (None, true),
573                 ImageSlot::PreviouslyUsed(state) => (Some(state), false),
574             }
575         };
576         let mut slot = maybe_slot.unwrap_or_else(|| {
577             MemoryImageSlot::create(
578                 self.get_base(allocation_index),
579                 HostAlignedByteCount::ZERO,
580                 self.layout.max_memory_bytes.byte_count(),
581             )
582         });
583 
584         // For `Unknown` slots it means that `slot` is brand new and isn't
585         // actually tracking the state of the previous slot, so reset it
586         // entirely with anonymous memory to wipe the slate clean and start
587         // from zero. This should only happen if allocation of the previous
588         // slot failed, for example.
589         if needs_reset {
590             slot.reset_with_anon_memory()?;
591         }
592         Ok(slot)
593     }
594 
595     /// Return ownership of the given image slot.
596     ///
597     /// If `slot` is not provided then it's reset with `Unknown` meaning a
598     /// future allocation will need to pave over it to use it.
return_memory_image_slot( &self, allocation_index: MemoryAllocationIndex, slot: Option<MemoryImageSlot>, )599     fn return_memory_image_slot(
600         &self,
601         allocation_index: MemoryAllocationIndex,
602         slot: Option<MemoryImageSlot>,
603     ) {
604         let prev = mem::replace(
605             &mut *self.image_slots[allocation_index.index()].lock().unwrap(),
606             match slot {
607                 Some(slot) => {
608                     assert!(!slot.is_dirty());
609                     ImageSlot::PreviouslyUsed(slot)
610                 }
611                 None => ImageSlot::Unknown,
612             },
613         );
614         assert!(matches!(prev, ImageSlot::Unknown));
615     }
616 
unused_warm_slots(&self) -> u32617     pub fn unused_warm_slots(&self) -> u32 {
618         self.stripes
619             .iter()
620             .map(|i| i.allocator.unused_warm_slots())
621             .sum()
622     }
623 
unused_bytes_resident(&self) -> usize624     pub fn unused_bytes_resident(&self) -> usize {
625         self.stripes
626             .iter()
627             .map(|i| i.allocator.unused_bytes_resident())
628             .sum()
629     }
630 }
631 
632 /// The index of a memory allocation within an `InstanceAllocator`.
633 #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
634 pub struct StripedAllocationIndex(u32);
635 
636 impl StripedAllocationIndex {
from_unstriped_slot_index( index: MemoryAllocationIndex, num_stripes: usize, ) -> (usize, Self)637     fn from_unstriped_slot_index(
638         index: MemoryAllocationIndex,
639         num_stripes: usize,
640     ) -> (usize, Self) {
641         let stripe_index = index.index() % num_stripes;
642         let num_stripes: u32 = num_stripes.try_into().unwrap();
643         let index_within_stripe = Self(index.0 / num_stripes);
644         (stripe_index, index_within_stripe)
645     }
646 
as_unstriped_slot_index(self, stripe: usize, num_stripes: usize) -> MemoryAllocationIndex647     fn as_unstriped_slot_index(self, stripe: usize, num_stripes: usize) -> MemoryAllocationIndex {
648         let num_stripes: u32 = num_stripes.try_into().unwrap();
649         let stripe: u32 = stripe.try_into().unwrap();
650         MemoryAllocationIndex(self.0 * num_stripes + stripe)
651     }
652 }
653 
654 #[derive(Clone, Debug)]
655 struct SlabConstraints {
656     /// Essentially, the `static_memory_bound`: this is an assumption that the
657     /// runtime and JIT compiler make about how much space will be guarded
658     /// between slots.
659     expected_slot_bytes: HostAlignedByteCount,
660     /// The maximum size of any memory in the pool. Always a non-zero multiple
661     /// of the page size.
662     max_memory_bytes: HostAlignedByteCount,
663     num_slots: usize,
664     num_pkeys_available: usize,
665     guard_bytes: HostAlignedByteCount,
666     guard_before_slots: bool,
667 }
668 
669 impl SlabConstraints {
new( limits: &InstanceLimits, tunables: &Tunables, num_pkeys_available: usize, ) -> Result<Self>670     fn new(
671         limits: &InstanceLimits,
672         tunables: &Tunables,
673         num_pkeys_available: usize,
674     ) -> Result<Self> {
675         // `memory_reservation` is the configured number of bytes for a
676         // static memory slot (see `Config::memory_reservation`); even
677         // if the memory never grows to this size (e.g., it has a lower memory
678         // maximum), codegen will assume that this unused memory is mapped
679         // `PROT_NONE`. Typically `memory_reservation` is 4GiB which helps
680         // elide most bounds checks. `MemoryPool` must respect this bound,
681         // though not explicitly: if we can achieve the same effect via
682         // MPK-protected stripes, the slot size can be lower than the
683         // `memory_reservation`.
684         let expected_slot_bytes =
685             HostAlignedByteCount::new_rounded_up_u64(tunables.memory_reservation)
686                 .context("memory reservation is too large")?;
687 
688         // Page-align the maximum size of memory since that's the granularity that
689         // permissions are going to be controlled at.
690         let max_memory_bytes = HostAlignedByteCount::new_rounded_up(limits.max_memory_size)
691             .context("maximum size of memory is too large")?;
692 
693         let guard_bytes = HostAlignedByteCount::new_rounded_up_u64(tunables.memory_guard_size)
694             .context("guard region is too large")?;
695 
696         let num_slots = usize::try_from(limits.total_memories).context("too many memories")?;
697 
698         let constraints = SlabConstraints {
699             max_memory_bytes,
700             num_slots,
701             expected_slot_bytes,
702             num_pkeys_available,
703             guard_bytes,
704             guard_before_slots: tunables.guard_before_linear_memory,
705         };
706         Ok(constraints)
707     }
708 }
709 
710 #[derive(Debug)]
711 struct SlabLayout {
712     /// The total number of slots available in the memory pool slab.
713     num_slots: usize,
714     /// The size of each slot in the memory pool; this contains the maximum
715     /// memory size (i.e., from WebAssembly or Wasmtime configuration) plus any
716     /// guard region after the memory to catch OOB access. On these guard
717     /// regions, note that:
718     /// - users can configure how aggressively (or not) to elide bounds checks
719     ///   via `Config::memory_guard_size` (see also:
720     ///   `memory_and_guard_size`)
721     /// - memory protection keys can compress the size of the guard region by
722     ///   placing slots from a different key (i.e., a stripe) in the guard
723     ///   region; this means the slot itself can be smaller and we can allocate
724     ///   more of them.
725     slot_bytes: HostAlignedByteCount,
726     /// The maximum size that can become accessible, in bytes, for each linear
727     /// memory. Guaranteed to be a whole number of Wasm pages.
728     max_memory_bytes: HostAlignedByteCount,
729     /// If necessary, the number of bytes to reserve as a guard region at the
730     /// beginning of the slab.
731     pre_slab_guard_bytes: HostAlignedByteCount,
732     /// Like `pre_slab_guard_bytes`, but at the end of the slab.
733     post_slab_guard_bytes: HostAlignedByteCount,
734     /// The number of stripes needed in the slab layout.
735     num_stripes: usize,
736 }
737 
738 impl SlabLayout {
739     /// Return the total size of the slab, using the final layout (where `n =
740     /// num_slots`):
741     ///
742     /// ```text
743     /// ┌────────────────────┬──────┬──────┬───┬──────┬─────────────────────┐
744     /// │pre_slab_guard_bytes│slot 1│slot 2│...│slot n│post_slab_guard_bytes│
745     /// └────────────────────┴──────┴──────┴───┴──────┴─────────────────────┘
746     /// ```
total_slab_bytes(&self) -> Result<HostAlignedByteCount>747     fn total_slab_bytes(&self) -> Result<HostAlignedByteCount> {
748         self.slot_bytes
749             .checked_mul(self.num_slots)
750             .and_then(|c| c.checked_add(self.pre_slab_guard_bytes))
751             .and_then(|c| c.checked_add(self.post_slab_guard_bytes))
752             .context("total size of memory reservation exceeds addressable memory")
753     }
754 
755     /// Returns the number of Wasm bytes from the beginning of one slot to the
756     /// next slot in the same stripe--this is the striped equivalent of
757     /// `static_memory_bound`. Recall that between slots of the same stripe we
758     /// will see a slot from every other stripe.
759     ///
760     /// For example, in a 3-stripe pool, this function measures the distance
761     /// from the beginning of slot 1 to slot 4, which are of the same stripe:
762     ///
763     /// ```text
764     ///  ◄────────────────────►
765     /// ┌────────┬──────┬──────┬────────┬───┐
766     /// │*slot 1*│slot 2│slot 3│*slot 4*│...|
767     /// └────────┴──────┴──────┴────────┴───┘
768     /// ```
bytes_to_next_stripe_slot(&self) -> HostAlignedByteCount769     fn bytes_to_next_stripe_slot(&self) -> HostAlignedByteCount {
770         self.slot_bytes
771             .checked_mul(self.num_stripes)
772             .expect("constructor checks that self.slot_bytes * self.num_stripes is in bounds")
773     }
774 }
775 
calculate(constraints: &SlabConstraints) -> Result<SlabLayout>776 fn calculate(constraints: &SlabConstraints) -> Result<SlabLayout> {
777     let SlabConstraints {
778         max_memory_bytes,
779         num_slots,
780         expected_slot_bytes,
781         num_pkeys_available,
782         guard_bytes,
783         guard_before_slots,
784     } = *constraints;
785 
786     // If the user specifies a guard region, we always need to allocate a
787     // `PROT_NONE` region for it before any memory slots. Recall that we can
788     // avoid bounds checks for loads and stores with immediates up to
789     // `guard_bytes`, but we rely on Wasmtime to emit bounds checks for any
790     // accesses greater than this.
791     let pre_slab_guard_bytes = if guard_before_slots {
792         guard_bytes
793     } else {
794         HostAlignedByteCount::ZERO
795     };
796 
797     // To calculate the slot size, we start with the default configured size and
798     // attempt to chip away at this via MPK protection. Note here how we begin
799     // to define a slot as "all of the memory and guard region."
800     let faulting_region_bytes = expected_slot_bytes
801         .max(max_memory_bytes)
802         .checked_add(guard_bytes)
803         .context("faulting region is too large")?;
804 
805     let (num_stripes, slot_bytes) = if guard_bytes == 0 || max_memory_bytes == 0 || num_slots == 0 {
806         // In the uncommon case where the memory/guard regions are empty or we don't need any slots , we
807         // will not need any stripes: we just lay out the slots back-to-back
808         // using a single stripe.
809         (1, faulting_region_bytes.byte_count())
810     } else if num_pkeys_available < 2 {
811         // If we do not have enough protection keys to stripe the memory, we do
812         // the same. We can't elide any of the guard bytes because we aren't
813         // overlapping guard regions with other stripes...
814         (1, faulting_region_bytes.byte_count())
815     } else {
816         // ...but if we can create at least two stripes, we can use another
817         // stripe (i.e., a different pkey) as this slot's guard region--this
818         // reduces the guard bytes each slot has to allocate. We must make
819         // sure, though, that if the size of that other stripe(s) does not
820         // fully cover `guard_bytes`, we keep those around to prevent OOB
821         // access.
822 
823         // We first calculate the number of stripes we need: we want to
824         // minimize this so that there is less chance of a single store
825         // running out of slots with its stripe--we need at least two,
826         // though. But this is not just an optimization; we need to handle
827         // the case when there are fewer slots than stripes. E.g., if our
828         // pool is configured with only three slots (`num_memory_slots =
829         // 3`), we will run into failures if we attempt to set up more than
830         // three stripes.
831         let needed_num_stripes = faulting_region_bytes
832             .checked_div(max_memory_bytes)
833             .expect("if condition above implies max_memory_bytes is non-zero")
834             + usize::from(
835                 faulting_region_bytes
836                     .checked_rem(max_memory_bytes)
837                     .expect("if condition above implies max_memory_bytes is non-zero")
838                     != 0,
839             );
840         assert!(needed_num_stripes > 0);
841         let num_stripes = num_pkeys_available.min(needed_num_stripes).min(num_slots);
842 
843         // Next, we try to reduce the slot size by "overlapping" the stripes: we
844         // can make slot `n` smaller since we know that slot `n+1` and following
845         // are in different stripes and will look just like `PROT_NONE` memory.
846         // Recall that codegen expects a guarantee that at least
847         // `faulting_region_bytes` will catch OOB accesses via segfaults.
848         let needed_slot_bytes = faulting_region_bytes
849             .byte_count()
850             .checked_div(num_stripes)
851             .unwrap_or(faulting_region_bytes.byte_count())
852             .max(max_memory_bytes.byte_count());
853         assert!(needed_slot_bytes >= max_memory_bytes.byte_count());
854 
855         (num_stripes, needed_slot_bytes)
856     };
857 
858     // The page-aligned slot size; equivalent to `memory_and_guard_size`.
859     let slot_bytes =
860         HostAlignedByteCount::new_rounded_up(slot_bytes).context("slot size is too large")?;
861 
862     // We may need another guard region (like `pre_slab_guard_bytes`) at the end
863     // of our slab to maintain our `faulting_region_bytes` guarantee. We could
864     // be conservative and just create it as large as `faulting_region_bytes`,
865     // but because we know that the last slot's `slot_bytes` make up the first
866     // part of that region, we reduce the final guard region by that much.
867     let post_slab_guard_bytes = faulting_region_bytes.saturating_sub(slot_bytes);
868 
869     // Check that we haven't exceeded the slab we can calculate given the limits
870     // of `usize`.
871     let layout = SlabLayout {
872         num_slots,
873         slot_bytes,
874         max_memory_bytes,
875         pre_slab_guard_bytes,
876         post_slab_guard_bytes,
877         num_stripes,
878     };
879     match layout.total_slab_bytes() {
880         Ok(_) => Ok(layout),
881         Err(e) => Err(e),
882     }
883 }
884 
885 #[cfg(test)]
886 mod tests {
887     use super::*;
888     use proptest::prelude::*;
889 
890     const WASM_PAGE_SIZE: u32 = wasmtime_environ::Memory::DEFAULT_PAGE_SIZE;
891 
892     #[cfg(target_pointer_width = "64")]
893     #[test]
test_memory_pool() -> Result<()>894     fn test_memory_pool() -> Result<()> {
895         let pool = MemoryPool::new(
896             &PoolingInstanceAllocatorConfig {
897                 limits: InstanceLimits {
898                     total_memories: 5,
899                     max_tables_per_module: 0,
900                     max_memories_per_module: 3,
901                     table_elements: 0,
902                     max_memory_size: WASM_PAGE_SIZE as usize,
903                     ..Default::default()
904                 },
905                 ..Default::default()
906             },
907             &Tunables {
908                 memory_reservation: WASM_PAGE_SIZE as u64,
909                 memory_guard_size: 0,
910                 ..Tunables::default_host()
911             },
912         )?;
913 
914         assert_eq!(pool.layout.slot_bytes, WASM_PAGE_SIZE as usize);
915         assert_eq!(pool.layout.num_slots, 5);
916         assert_eq!(pool.layout.max_memory_bytes, WASM_PAGE_SIZE as usize);
917 
918         let base = pool.mapping.as_ptr() as usize;
919 
920         for i in 0..5 {
921             let index = MemoryAllocationIndex(i);
922             let ptr = pool.get_base(index).as_mut_ptr();
923             assert_eq!(
924                 ptr as usize - base,
925                 i as usize * pool.layout.slot_bytes.byte_count()
926             );
927         }
928 
929         Ok(())
930     }
931 
932     #[test]
933     #[cfg_attr(miri, ignore)]
test_pooling_allocator_striping()934     fn test_pooling_allocator_striping() {
935         if !mpk::is_supported() {
936             println!("skipping `test_pooling_allocator_striping` test; mpk is not supported");
937             return;
938         }
939 
940         // Force the use of MPK.
941         let config = PoolingInstanceAllocatorConfig {
942             memory_protection_keys: Enabled::Yes,
943             ..PoolingInstanceAllocatorConfig::default()
944         };
945         let pool = MemoryPool::new(&config, &Tunables::default_host()).unwrap();
946         assert!(pool.stripes.len() >= 2);
947 
948         let max_memory_slots = config.limits.total_memories;
949         dbg!(pool.stripes[0].allocator.num_empty_slots());
950         dbg!(pool.stripes[1].allocator.num_empty_slots());
951         let available_memory_slots: usize = pool
952             .stripes
953             .iter()
954             .map(|s| s.allocator.num_empty_slots())
955             .sum();
956         assert_eq!(
957             max_memory_slots,
958             u32::try_from(available_memory_slots).unwrap()
959         );
960     }
961 
962     #[test]
check_known_layout_calculations()963     fn check_known_layout_calculations() {
964         for num_pkeys_available in 0..16 {
965             for num_memory_slots in [0, 1, 10, 64] {
966                 for expected_slot_bytes in [0, 1 << 30 /* 1GB */, 4 << 30 /* 4GB */] {
967                     let expected_slot_bytes =
968                         HostAlignedByteCount::new(expected_slot_bytes).unwrap();
969                     for max_memory_bytes in
970                         [0, 1 * WASM_PAGE_SIZE as usize, 10 * WASM_PAGE_SIZE as usize]
971                     {
972                         // Note new rather than new_rounded_up here -- for now,
973                         // WASM_PAGE_SIZE is 64KiB, which is a multiple of the
974                         // host page size on all platforms.
975                         let max_memory_bytes = HostAlignedByteCount::new(max_memory_bytes).unwrap();
976                         for guard_bytes in [0, 2 << 30 /* 2GB */] {
977                             let guard_bytes = HostAlignedByteCount::new(guard_bytes).unwrap();
978                             for guard_before_slots in [true, false] {
979                                 let constraints = SlabConstraints {
980                                     max_memory_bytes,
981                                     num_slots: num_memory_slots,
982                                     expected_slot_bytes,
983                                     num_pkeys_available,
984                                     guard_bytes,
985                                     guard_before_slots,
986                                 };
987                                 match calculate(&constraints) {
988                                     Ok(layout) => {
989                                         assert_slab_layout_invariants(constraints, layout)
990                                     }
991                                     Err(e) => {
992                                         // Only allow failure on 32-bit
993                                         // platforms where the calculation
994                                         // exceeded the size of the address
995                                         // space
996                                         assert!(
997                                             cfg!(target_pointer_width = "32")
998                                                 && e.to_string()
999                                                     .contains("exceeds addressable memory"),
1000                                             "bad error: {e:?}"
1001                                         );
1002                                     }
1003                                 }
1004                             }
1005                         }
1006                     }
1007                 }
1008             }
1009         }
1010     }
1011 
1012     proptest! {
1013         #[test]
1014         #[cfg_attr(miri, ignore)]
1015         fn check_random_layout_calculations(c in constraints()) {
1016             if let Ok(l) = calculate(&c) {
1017                 assert_slab_layout_invariants(c, l);
1018             }
1019         }
1020     }
1021 
constraints() -> impl Strategy<Value = SlabConstraints>1022     fn constraints() -> impl Strategy<Value = SlabConstraints> {
1023         (
1024             any::<HostAlignedByteCount>(),
1025             any::<usize>(),
1026             any::<HostAlignedByteCount>(),
1027             any::<usize>(),
1028             any::<HostAlignedByteCount>(),
1029             any::<bool>(),
1030         )
1031             .prop_map(
1032                 |(
1033                     max_memory_bytes,
1034                     num_memory_slots,
1035                     expected_slot_bytes,
1036                     num_pkeys_available,
1037                     guard_bytes,
1038                     guard_before_slots,
1039                 )| {
1040                     SlabConstraints {
1041                         max_memory_bytes,
1042                         num_slots: num_memory_slots,
1043                         expected_slot_bytes,
1044                         num_pkeys_available,
1045                         guard_bytes,
1046                         guard_before_slots,
1047                     }
1048                 },
1049             )
1050     }
1051 
assert_slab_layout_invariants(c: SlabConstraints, s: SlabLayout)1052     fn assert_slab_layout_invariants(c: SlabConstraints, s: SlabLayout) {
1053         // Check that all the sizes add up.
1054         assert_eq!(
1055             s.total_slab_bytes().unwrap(),
1056             s.pre_slab_guard_bytes
1057                 .checked_add(s.slot_bytes.checked_mul(c.num_slots).unwrap())
1058                 .and_then(|c| c.checked_add(s.post_slab_guard_bytes))
1059                 .unwrap(),
1060             "the slab size does not add up: {c:?} => {s:?}"
1061         );
1062         assert!(
1063             s.slot_bytes >= s.max_memory_bytes,
1064             "slot is not big enough: {c:?} => {s:?}"
1065         );
1066 
1067         // The HostAlignedByteCount newtype wrapper ensures that the various
1068         // byte values are page-aligned.
1069 
1070         // Check that we use no more or less stripes than needed.
1071         assert!(s.num_stripes >= 1, "not enough stripes: {c:?} => {s:?}");
1072         if c.num_pkeys_available == 0 || c.num_slots == 0 {
1073             assert_eq!(
1074                 s.num_stripes, 1,
1075                 "expected at least one stripe: {c:?} => {s:?}"
1076             );
1077         } else {
1078             assert!(
1079                 s.num_stripes <= c.num_pkeys_available,
1080                 "layout has more stripes than available pkeys: {c:?} => {s:?}"
1081             );
1082             assert!(
1083                 s.num_stripes <= c.num_slots,
1084                 "layout has more stripes than memory slots: {c:?} => {s:?}"
1085             );
1086         }
1087 
1088         // Check that we use the minimum number of stripes/protection keys.
1089         // - if the next MPK-protected slot is bigger or the same as the
1090         //   required guard region, we only need two stripes
1091         // - if the next slot is smaller than the guard region, we only need
1092         //   enough stripes to add up to at least that guard region size.
1093         if c.num_pkeys_available > 1 && !c.max_memory_bytes.is_zero() {
1094             assert!(
1095                 s.num_stripes <= (c.guard_bytes.checked_div(c.max_memory_bytes).unwrap() + 2),
1096                 "calculated more stripes than needed: {c:?} => {s:?}"
1097             );
1098         }
1099 
1100         // Check that the memory-striping will not allow OOB access.
1101         // - we may have reduced the slot size from `expected_slot_bytes` to
1102         //   `slot_bytes` assuming MPK striping; we check that our guaranteed
1103         //   "faulting region" is respected
1104         // - the last slot won't have MPK striping after it; we check that the
1105         //   `post_slab_guard_bytes` accounts for this
1106         assert!(
1107             s.bytes_to_next_stripe_slot()
1108                 >= c.expected_slot_bytes
1109                     .max(c.max_memory_bytes)
1110                     .checked_add(c.guard_bytes)
1111                     .unwrap(),
1112             "faulting region not large enough: {c:?} => {s:?}"
1113         );
1114         assert!(
1115             s.slot_bytes.checked_add(s.post_slab_guard_bytes).unwrap() >= c.expected_slot_bytes,
1116             "last slot may allow OOB access: {c:?} => {s:?}"
1117         );
1118     }
1119 }
1120