// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from the cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain
 * struct llist_node.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > 4096)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}
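
/* Illustration only (not used by the code): how a requested size maps to a
 * bucket, given the size_index[] table above and the sizes[] table below.
 *
 *	bpf_mem_cache_idx(28)   == size_index[(28 - 1) / 8] - 1 == 4 - 1 == 3  -> sizes[3] == 32
 *	bpf_mem_cache_idx(96)   == size_index[(96 - 1) / 8] - 1 == 1 - 1 == 0  -> sizes[0] == 96
 *	bpf_mem_cache_idx(1000) == fls(1000 - 1) - 2 == 10 - 2           == 8  -> sizes[8] == 1024
 *
 * bpf_mem_alloc() adds LLIST_NODE_SZ before the lookup, so a 20-byte request
 * lands in the 32-byte bucket via the first line above.
 */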

#define NUM_CACHES 11

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void **obj = kmalloc_node(c->percpu_size, flags, node);
		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}
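
/* Illustration only: the object layout produced by __alloc() above and relied
 * upon by unit_alloc()/unit_free() below. Callers always see the address just
 * past the hidden llist_node, i.e. obj + LLIST_NODE_SZ:
 *
 *	kmalloc-ed object:	[ struct llist_node | unit_size - 8 bytes of data ]
 *	per-cpu object:		[ struct llist_node | void *pptr ]  where obj[1] == pptr
 *				and pptr points at unit_size bytes of per-cpu data
 *
 * While the object sits on a free list the llist_node slot is the list linkage;
 * once handed out, unit_alloc() overwrites it with the owning bpf_mem_cache
 * pointer (*(struct bpf_mem_cache **)llnode = c) so that bpf_mem_free() can
 * recover the right bucket from ptr - LLIST_NODE_SZ.
 */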

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG_KMEM
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
#endif

#ifdef CONFIG_MEMCG
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in a per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of a bpf prog executing on this cpu
		 * while the 'active' counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu) {
		free_percpu(((void **)obj)[1]);
		kfree(obj);
		return;
	}

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload, just ask RCU to hurry up. There could be many
		 * objects on the free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}
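
/* Illustration only: the path an object takes after bpf_mem_free_rcu(), in
 * terms of this file's helpers. Each step is a list move or an RCU callback:
 *
 *	unit_free_rcu()      -> c->free_by_rcu (or free_llist_extra_rcu when 'active' is busy)
 *	check_free_by_rcu()  -> c->waiting_for_gp, then call_rcu_hurry(__free_by_rcu)
 *	__free_by_rcu()      -> tgt->free_by_rcu_ttrace after a regular RCU GP
 *	do_call_rcu_ttrace() -> waiting_for_gp_ttrace, then call_rcu_tasks_trace()
 *	__free_rcu()         -> free_one() on each object once the RCU tasks trace
 *	                        (and, if needed, regular RCU) grace period has passed
 *
 * Objects freed with plain bpf_mem_free() skip the first RCU GP: free_bulk()
 * moves them from free_llist directly into free_by_rcu_ttrace.
 */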

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For the typical bpf map case that uses bpf_mem_cache_alloc and a single
 * bucket, the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used, the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using the below heuristic.
 * An initialized, but unused bpf allocator (not a bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * The typical case will be between 11K and 116K, closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	/* To avoid consuming memory, assume that the 1st run of a bpf
	 * prog won't be doing more than 4 map_update_elem from an irq
	 * disabled region.
	 */
	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
}
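
/* Illustration only: per-cpu watermarks produced by init_refill_work() for a
 * few bucket sizes (values are numbers of objects):
 *
 *	unit_size <= 256:  low == 32, high == 96, batch == (96 - 32) / 4 * 3 == 48
 *	unit_size == 2048: low == 4,  high == 12, batch == (12 - 4) / 4 * 3  == 6
 *	unit_size == 4096: low == 2,  high == 6,  batch == (6 - 2) / 4 * 3   == 3
 */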

/* When size != 0 allocate a bpf_mem_cache for each cpu.
 * This is the typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is the bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	struct bpf_mem_cache *c, __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG_KMEM
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (cpu == 0 && c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}
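
/* Minimal usage sketch (illustration only; 'struct foo' stands in for whatever
 * the caller stores). A fixed-size user such as a hash map with equal-sized
 * elements would do roughly:
 *
 *	struct bpf_mem_alloc ma;
 *	struct foo *obj;
 *	int err;
 *
 *	err = bpf_mem_alloc_init(&ma, sizeof(struct foo), false);
 *	if (err)
 *		return err;
 *	obj = bpf_mem_cache_alloc(&ma);
 *	if (obj)
 *		bpf_mem_cache_free(&ma, obj);
 *	bpf_mem_alloc_destroy(&ma);
 *
 * With size == 0 the size is chosen per allocation instead: pair
 * bpf_mem_alloc(&ma, nbytes) with bpf_mem_free(&ma, obj). Both alloc paths may
 * return NULL from any context; the caller must handle that.
 */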

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for the waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of the map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_unbound_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before another task preempts the current task.
	 */
	local_irq_restore(flags);

	return llnode;
}

/* Though the 'ptr' object could have been allocated on a different cpu,
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by a bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free a few objects from the current cpu into the global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when the current
	 * task is preempted by a task which does unit_alloc(), unit_alloc() may
	 * return NULL unexpectedly because the irq work is already pending but
	 * cannot be triggered and free_llist cannot be refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}
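
/* Illustration only: a sleepable caller that can tolerate blocking may pass
 * GFP_KERNEL so that an empty free_llist falls back to a direct allocation:
 *
 *	obj = bpf_mem_cache_alloc_flags(&ma, GFP_KERNEL);
 *
 * Every other context should stick to bpf_mem_cache_alloc(), which never
 * blocks and simply returns NULL when the per-cpu cache is empty and the
 * irq_work refill has not run yet.
 */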