// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contains
 * struct llist_node.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > 4096)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}

#define NUM_CACHES 11
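
/* Editorial sketch (hedged, not part of the allocator): a worked example of
 * the size-to-bucket mapping done by bpf_mem_cache_idx() above. The function
 * name bpf_mem_cache_idx_example() is made up for illustration only.
 */
static void __maybe_unused bpf_mem_cache_idx_example(void)
{
	/* 72 bytes: size_index[(72 - 1) / 8] == size_index[8] == 1, so index 0,
	 * which is the 96-byte bucket in sizes[] below.
	 */
	WARN_ON_ONCE(bpf_mem_cache_idx(72) != 0);
	/* 300 bytes: fls(300 - 1) - 2 == 9 - 2 == 7, the 512-byte bucket. */
	WARN_ON_ONCE(bpf_mem_cache_idx(300) != 7);
	/* Requests above 4096 bytes are rejected. */
	WARN_ON_ONCE(bpf_mem_cache_idx(8192) != -1);
}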

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void **obj = kmalloc_node(c->percpu_size, flags, node);
		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG_KMEM
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
#endif

#ifdef CONFIG_MEMCG
	return root_mem_cgroup;
#else
	return NULL;
#endif
}
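
/* Editorial sketch (hedged, illustrative only): layout of objects produced by
 * __alloc() above. For a regular object, the pointer handed back to callers
 * of bpf_mem_alloc()/bpf_mem_cache_alloc() is obj + LLIST_NODE_SZ:
 *
 *	[ llist_node / owning bpf_mem_cache ptr : 8 ][ user data : unit_size - 8 ]
 *
 * For a per-cpu object the kmalloc-ed box holds only two pointers
 * (percpu_size == LLIST_NODE_SZ + sizeof(void *)) and the data itself lives
 * in per-cpu memory referenced by obj[1]:
 *
 *	[ llist_node : 8 ][ __percpu data pointer : 8 ]
 */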

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu) {
		free_percpu(((void **)obj)[1]);
		kfree(obj);
		return;
	}

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}
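
/* Editorial sketch (hedged): the stages an object freed with
 * bpf_mem_free_rcu() goes through before it can be reused or kfree-d,
 * pieced together from the helpers above:
 *
 *	unit_free_rcu()                 -> free_by_rcu (or free_llist_extra_rcu)
 *	check_free_by_rcu()             -> waiting_for_gp, call_rcu_hurry()
 *	__free_by_rcu() after RCU GP    -> tgt->free_by_rcu_ttrace
 *	do_call_rcu_ttrace()            -> waiting_for_gp_ttrace, call_rcu_tasks_trace()
 *	__free_rcu() after ttrace GP    -> kfree()/free_percpu()
 *
 * alloc_bulk() may pull objects back from free_by_rcu_ttrace and
 * waiting_for_gp_ttrace for reuse before they reach the final kfree().
 */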

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
 * the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using below heuristic.
 * Initialized, but unused bpf allocator (not bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * Typical case will be between 11K and 116K, closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}
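
/* Editorial sketch (hedged, not part of the allocator): the watermark
 * heuristic above worked out for two concrete unit sizes. The function name
 * init_refill_work_example() is made up for illustration; the cache is a
 * throwaway local.
 */
static void __maybe_unused init_refill_work_example(void)
{
	struct bpf_mem_cache c = {};

	/* Small bucket: fixed watermarks, batch == (96 - 32) / 4 * 3 == 48. */
	c.unit_size = 64;
	init_refill_work(&c);
	WARN_ON_ONCE(c.low_watermark != 32 || c.high_watermark != 96 || c.batch != 48);

	/* Order-0 page sized bucket: low == 2, high == 6, batch == 3. */
	c.unit_size = 4096;
	init_refill_work(&c);
	WARN_ON_ONCE(c.low_watermark != 2 || c.high_watermark != 6 || c.batch != 3);
}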

/* When size != 0 allocate a single bpf_mem_cache for each cpu.
 * This is typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	struct bpf_mem_cache *c, __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG_KMEM
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (cpu == 0 && c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}
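
/* Editorial sketch (hedged, illustrative only): how a fixed-size consumer
 * such as a hash map is expected to drive this API. The names
 * bpf_mem_alloc_example_elem and bpf_mem_alloc_fixed_size_example() are made
 * up for this sketch; error handling is minimal and the caller is assumed to
 * run with migration disabled around the alloc/free calls.
 */
struct bpf_mem_alloc_example_elem {
	u64 key;
	u64 val;
};

static int __maybe_unused bpf_mem_alloc_fixed_size_example(void)
{
	struct bpf_mem_alloc_example_elem *e;
	struct bpf_mem_alloc ma = {};
	int err;

	/* One bucket per cpu sized for the element, non per-cpu objects. */
	err = bpf_mem_alloc_init(&ma, sizeof(*e), false);
	if (err)
		return err;

	e = bpf_mem_cache_alloc(&ma);		/* any context, may return NULL */
	if (e)
		bpf_mem_cache_free(&ma, e);	/* back onto this cpu's free_llist */

	/* May fall back to kmalloc() when the cache is empty and sleeping is ok. */
	e = bpf_mem_cache_alloc_flags(&ma, GFP_KERNEL);
	if (e)
		bpf_mem_cache_free(&ma, e);

	bpf_mem_alloc_destroy(&ma);		/* drains caches, waits for RCU callbacks */
	return 0;
}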

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of the map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_unbound_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before other task preempts current task.
	 */
	local_irq_restore(flags);

	return llnode;
}

/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when current
	 * task is preempted by a task which does unit_alloc(), unit_alloc() may
	 * return NULL unexpectedly because the irq work is already pending but
	 * cannot be triggered and free_llist cannot be refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}
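
/* Editorial sketch (hedged, illustrative only): the size == 0 flavour of the
 * allocator as used by bpf_dynptr/bpf_kptr style callers. The function name
 * bpf_mem_alloc_any_size_example() is made up for this sketch; migration is
 * assumed to be disabled around the alloc/free calls, as the comment above
 * bpf_mem_alloc() requires.
 */
static int __maybe_unused bpf_mem_alloc_any_size_example(void)
{
	struct bpf_mem_alloc ma = {};
	void *p;
	int err;

	err = bpf_mem_alloc_init(&ma, 0, false);	/* 11 buckets, 16..4096 bytes */
	if (err)
		return err;

	p = bpf_mem_alloc(&ma, 300);	/* 300 + 8 bytes -> the 512-byte bucket */
	if (p)
		bpf_mem_free(&ma, p);

	bpf_mem_alloc_destroy(&ma);
	return 0;
}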