/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_OBJPOOL_H
#define _LINUX_OBJPOOL_H

#include <linux/types.h>
#include <linux/refcount.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/irqflags.h>
#include <linux/smp.h>

/*
 * objpool: ring-array based lockless MPMC queue
 *
 * Copyright: [email protected],[email protected]
 *
 * objpool is a scalable, high-performance queue implementation for
 * object allocation and reclamation, such as kretprobe instances.
 *
 * By leveraging a percpu ring-array to mitigate hot spots of memory
 * contention, it delivers near-linear scalability for highly parallel
 * scenarios. The objpool is best suited for the following cases:
 * 1) Memory allocation or reclamation is prohibited or too expensive
 * 2) Consumers are of different priorities, such as irqs and threads
 *
 * Limitations:
 * 1) The maximum number of objects (capacity) is fixed after objpool
 *    creation
 * 2) All pre-allocated objects are managed in a percpu ring array,
 *    which consumes more memory than a linked list
 */

/**
 * struct objpool_slot - percpu ring array of objpool
 * @head: head sequence of the local ring array (to retrieve at)
 * @tail: tail sequence of the local ring array (to append at)
 * @last: the last sequence number marked as ready for retrieval
 * @mask: bit mask for modulo capacity to compute array indexes
 * @entries: object entries on this slot
 *
 * Represents a cpu-local, array-based ring buffer whose size is fixed
 * during initialization of the object pool. On NUMA systems each percpu
 * slot is allocated from node-local memory and kept compact in contiguous
 * memory: the objects assigned to this CPU are stored right after the
 * body of the objpool_slot.
 *
 * The real size of the ring array is far smaller than the value range of
 * head and tail, which are uint32_t: [0, 2^32), so only the lower bits
 * (mask) of head and tail are used as the actual position in the ring
 * array. In general the ring array acts like a small sliding window that
 * keeps moving forward within the loop of [0, 2^32).
 */
struct objpool_slot {
	uint32_t head;
	uint32_t tail;
	uint32_t last;
	uint32_t mask;
	void *entries[];
} __packed;
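
/*
 * Indexing sketch (illustrative only, not part of the API): with a slot
 * capacity of 8 the mask is 7, so a 32-bit sequence such as head == 10
 * maps to entries[10 & 7] == entries[2], and the window keeps wrapping
 * as head/tail advance through [0, 2^32):
 *
 *	void *obj = slot->entries[slot->head & slot->mask];
 */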

struct objpool_head;

/*
 * caller-specified callback for initial object setup; it's called only
 * once for each object, just after the object's memory is allocated
 */
typedef int (*objpool_init_obj_cb)(void *obj, void *context);

/* caller-specified cleanup callback for objpool destruction */
typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);

/**
 * struct objpool_head - object pooling metadata
 * @obj_size: object size, aligned to sizeof(void *)
 * @nr_objs: total number of objects pre-allocated with this objpool
 * @nr_possible_cpus: cached value of num_possible_cpus()
 * @capacity: maximum number of objects one objpool_slot can manage
 * @gfp: gfp flags for kmalloc & vmalloc
 * @ref: refcount of objpool
 * @flags: flags for objpool management
 * @cpu_slots: pointer to the array of objpool_slot
 * @release: resource cleanup callback
 * @context: caller-provided context
 */
struct objpool_head {
	int obj_size;
	int nr_objs;
	int nr_possible_cpus;
	int capacity;
	gfp_t gfp;
	refcount_t ref;
	unsigned long flags;
	struct objpool_slot **cpu_slots;
	objpool_fini_cb release;
	void *context;
};

#define OBJPOOL_NR_OBJECT_MAX	(1UL << 24)	/* maximum number of total objects */
#define OBJPOOL_OBJECT_SIZE_MAX	(1UL << 16)	/* maximum size of an object */

/**
 * objpool_init() - initialize objpool and pre-allocate objects
 * @pool: the object pool to be initialized, declared by caller
 * @nr_objs: total objects to be pre-allocated by this object pool
 * @object_size: size of an object (should be > 0)
 * @gfp: flags for memory allocation (via kmalloc or vmalloc)
 * @context: user context for object initialization callback
 * @objinit: object initialization callback for extra setup
 * @release: cleanup callback for extra cleanup tasks
 *
 * return value: 0 for success, otherwise error code
 *
 * All pre-allocated objects are zeroed after memory allocation. The
 * caller can do extra initialization in the objinit callback, which is
 * called just after slot allocation and only once for each object.
 * After that the objpool never touches the contents of the objects;
 * it's the caller's duty to reinitialize an object after each pop
 * (object allocation) or to clear it before each push (object
 * reclamation).
 */
int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
		 objpool_fini_cb release);

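/*
 * Minimal usage sketch (illustrative only; "struct my_obj", my_obj_init()
 * and my_pool are hypothetical names, not part of this API):
 *
 *	struct my_obj {
 *		int id;
 *	};
 *
 *	static int my_obj_init(void *obj, void *context)
 *	{
 *		((struct my_obj *)obj)->id = -1;  // extra setup beyond zeroing
 *		return 0;
 *	}
 *
 *	static struct objpool_head my_pool;
 *
 *	// 64 objects of sizeof(struct my_obj), no context, no release callback
 *	err = objpool_init(&my_pool, 64, sizeof(struct my_obj),
 *			   GFP_KERNEL, NULL, my_obj_init, NULL);
 */
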
/* try to retrieve object from slot */
static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	/* load head snapshot, other cpus may change it */
	uint32_t head = smp_load_acquire(&slot->head);

	while (head != READ_ONCE(slot->last)) {
		void *obj;

		/*
		 * data visibility of 'last' and 'head' could be out of
		 * order since the memory updates of 'last' and 'head' are
		 * performed in push() and pop() independently
		 *
		 * before any retrieval attempt, pop() must guarantee that
		 * 'last' is ahead of 'head', i.e. there are available
		 * objects in the slot, which is ensured by the condition
		 * 'last != head && last - head <= nr_objs'. That is
		 * equivalent to 'last - head - 1 < nr_objs' since 'last'
		 * and 'head' are both unsigned 32-bit integers
		 */
		if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
			head = READ_ONCE(slot->head);
			continue;
		}

		/* obj must be retrieved before moving head forward */
		obj = READ_ONCE(slot->entries[head & slot->mask]);

		/* move head forward to mark the object's consumption */
		if (try_cmpxchg_release(&slot->head, &head, head + 1))
			return obj;
	}

	return NULL;
}

/**
 * objpool_pop() - allocate an object from objpool
 * @pool: object pool
 *
 * return value: object ptr or NULL if failed
 */
static inline void *objpool_pop(struct objpool_head *pool)
{
	void *obj = NULL;
	unsigned long flags;
	int start, cpu;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);

	start = raw_smp_processor_id();
	for_each_possible_cpu_wrap(cpu, start) {
		obj = __objpool_try_get_slot(pool, cpu);
		if (obj)
			break;
	}
	raw_local_irq_restore(flags);

	return obj;
}

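/*
 * Fast-path allocation sketch (continues the hypothetical my_pool example
 * from objpool_init() above; the caller reinitializes after each pop):
 *
 *	struct my_obj *o = objpool_pop(&my_pool);
 *
 *	if (!o)
 *		return -ENOENT;		// pool exhausted
 *	o->id = 0;			// reinitialize before use
 */
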
/* add object to slot; a full slot indicates a bug (caught by the WARN below) */
static inline int
__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	uint32_t head, tail;

	/* loading tail and head as a local snapshot, tail first */
	tail = READ_ONCE(slot->tail);

	do {
		head = READ_ONCE(slot->head);
		/* fault caught: something must be wrong */
		WARN_ON_ONCE(tail - head > pool->nr_objs);
	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));

	/* now the tail position is reserved for the given obj */
	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
	/* update sequence to make this obj available for pop() */
	smp_store_release(&slot->last, tail + 1);

	return 0;
}

/**
 * objpool_push() - reclaim the object and return it to objpool
 * @obj: object ptr to be pushed to objpool
 * @pool: object pool
 *
 * return: 0 or error code (it fails only when the user tries to push
 * the same object multiple times or to push wrong "objects" into the
 * objpool)
 */
static inline int objpool_push(void *obj, struct objpool_head *pool)
{
	unsigned long flags;
	int rc;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);
	rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id());
	raw_local_irq_restore(flags);

	return rc;
}

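/*
 * Reclamation sketch (continues the hypothetical example; the caller
 * clears the object before each push, as noted for objpool_init()):
 *
 *	memset(o, 0, sizeof(*o));
 *	objpool_push(o, &my_pool);
 */
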
/**
 * objpool_drop() - discard the object and deref objpool
 * @obj: object ptr to be discarded
 * @pool: object pool
 *
 * return: 0 if objpool was released; -EAGAIN if there are still
 * outstanding objects
 *
 * objpool_drop() is normally used to release outstanding objects after
 * objpool cleanup (objpool_fini()). Consider this example: a kretprobe
 * is unregistered and objpool_fini() is called to release all remaining
 * objects, but some objects are still in use by unfinished kretprobes
 * (e.g. inside a blockable function such as sys_accept). Only when the
 * last outstanding object is dropped via objpool_drop() can the whole
 * objpool be released.
 */
int objpool_drop(void *obj, struct objpool_head *pool);

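/*
 * Asynchronous release sketch (hypothetical names as above): once
 * objpool_fini() has been called, an outstanding object is handed back
 * with objpool_drop() instead of objpool_push():
 *
 *	if (!objpool_drop(o, &my_pool))
 *		pr_debug("last outstanding object dropped, pool released\n");
 */
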
/**
 * objpool_free() - forcibly release the objpool (all objects are freed)
 * @pool: object pool to be released
 */
void objpool_free(struct objpool_head *pool);

/**
 * objpool_fini() - deref object pool (also releasing unused objects)
 * @pool: object pool to be dereferenced
 *
 * objpool_fini() tries to release all remaining free objects and then
 * drops an extra reference of the objpool. If all objects have already
 * been returned to the objpool (the so-called synchronous use cases),
 * the objpool itself is freed as well. But if there are still
 * outstanding objects (asynchronous use cases, such as blockable
 * kretprobes), the objpool won't be released until all outstanding
 * objects are dropped; the caller must ensure there are no concurrent
 * objpool_push() calls in flight. Normally RCU is required to make sure
 * all ongoing objpool_push() calls have finished before calling
 * objpool_fini(), as done by test_objpool, kretprobe and rethook.
 */
void objpool_fini(struct objpool_head *pool);

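/*
 * Teardown sketch (hypothetical names; RCU is one way to guarantee that
 * no objpool_push() is still in flight, as noted above):
 *
 *	stop_my_producers();	// hypothetical: quiesce pop()/push() users
 *	synchronize_rcu();	// wait for in-flight push() calls
 *	objpool_fini(&my_pool);	// freed now, or when the last obj is dropped
 */
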
#endif /* _LINUX_OBJPOOL_H */