// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>

struct bpf_struct_ops_value {
	struct bpf_struct_ops_common_value common;
	char data[] ____cacheline_aligned_in_smp;
};

struct bpf_struct_ops_map {
	struct bpf_map map;
	struct rcu_head rcu;
	const struct bpf_struct_ops_desc *st_ops_desc;
	/* protect map_update */
	struct mutex lock;
	/* links has all the bpf_links that are populated
	 * to the func ptrs of the kernel's struct
	 * (in kvalue.data).
	 */
	struct bpf_link **links;
	u32 links_cnt;
	/* image is a page that has all the trampolines
	 * that store the func args before calling the bpf_prog.
	 * A PAGE_SIZE "image" is enough to store all trampolines for
	 * "links[]".
	 */
	void *image;
	/* The owner module's btf. */
	struct btf *btf;
	/* uvalue->data stores the kernel struct
	 * (e.g. tcp_congestion_ops) that is more useful
	 * to userspace than the kvalue.  For example,
	 * the bpf_prog's id is stored instead of the kernel
	 * address of a func ptr.
	 */
	struct bpf_struct_ops_value *uvalue;
	/* kvalue.data stores the actual kernel's struct
	 * (e.g. tcp_congestion_ops) that will be
	 * registered to the kernel subsystem.
	 */
	struct bpf_struct_ops_value kvalue;
};

struct bpf_struct_ops_link {
	struct bpf_link link;
	struct bpf_map __rcu *map;
};

static DEFINE_MUTEX(update_mutex);

#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)

const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
	.test_run = bpf_struct_ops_test_run,
#endif
};

BTF_ID_LIST(st_ops_ids)
BTF_ID(struct, module)
BTF_ID(struct, bpf_struct_ops_common_value)

enum {
	IDX_MODULE_ID,
	IDX_ST_OPS_COMMON_VALUE_ID,
};

extern struct btf *btf_vmlinux;

static bool is_valid_value_type(struct btf *btf, s32 value_id,
				const struct btf_type *type,
				const char *value_name)
{
	const struct btf_type *common_value_type;
	const struct btf_member *member;
	const struct btf_type *vt, *mt;

	vt = btf_type_by_id(btf, value_id);
	if (btf_vlen(vt) != 2) {
		pr_warn("The number of %s's members should be 2, but we get %d\n",
			value_name, btf_vlen(vt));
		return false;
	}
	member = btf_type_member(vt);
	mt = btf_type_by_id(btf, member->type);
	common_value_type = btf_type_by_id(btf_vmlinux,
					   st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]);
	if (mt != common_value_type) {
		pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
			value_name);
		return false;
	}
	member++;
	mt = btf_type_by_id(btf, member->type);
	if (mt != type) {
		pr_warn("The second member of %s should be %s\n",
			value_name, btf_name_by_offset(btf, type->name_off));
		return false;
	}

	return true;
}
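/* For illustration only (an assumption about how subsystems typically
 * define their value type, e.g. through register_bpf_struct_ops()):
 * the value type wrapping "struct tcp_congestion_ops" is expected to
 * look like
 *
 *	struct bpf_struct_ops_tcp_congestion_ops {
 *		struct bpf_struct_ops_common_value common;
 *		struct tcp_congestion_ops data;
 *	};
 *
 * i.e. exactly two members, the common header followed by the wrapped
 * kernel struct, which is what is_valid_value_type() above checks.
 */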
#define MAYBE_NULL_SUFFIX "__nullable"
#define MAX_STUB_NAME 128

/* Return the type info of a stub function, if it exists.
 *
 * The name of a stub function is made up of the name of the struct_ops and
 * the name of the function pointer member, separated by "__". For example,
 * if the struct_ops type is named "foo_ops" and the function pointer
 * member is named "bar", the stub function name would be "foo_ops__bar".
 */
static const struct btf_type *
find_stub_func_proto(const struct btf *btf, const char *st_op_name,
		     const char *member_name)
{
	char stub_func_name[MAX_STUB_NAME];
	const struct btf_type *func_type;
	s32 btf_id;
	int cp;

	cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
		      st_op_name, member_name);
	if (cp >= MAX_STUB_NAME) {
		pr_warn("Stub function name too long\n");
		return NULL;
	}
	btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
	if (btf_id < 0)
		return NULL;
	func_type = btf_type_by_id(btf, btf_id);
	if (!func_type)
		return NULL;

	return btf_type_by_id(btf, func_type->type);	/* FUNC_PROTO */
}
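/* A sketch with made-up names: for a hypothetical struct_ops "foo_ops"
 * with a member
 *
 *	int (*bar)(struct sock *sk);
 *
 * the subsystem could provide a stub
 *
 *	static int foo_ops__bar(struct sock *sk__nullable) { return 0; }
 *
 * The "__nullable" suffix on the stub's argument is what prepare_arg_info()
 * below picks up to tell the verifier that BPF programs attached to "bar"
 * must check "sk" for NULL before dereferencing it.
 */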
/* Prepare argument info for every nullable argument of a member of a
 * struct_ops type.
 *
 * Initialize a struct bpf_struct_ops_arg_info according to type info of
 * the arguments of a stub function. (Check kCFI for more information about
 * stub functions.)
 *
 * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
 * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
 * the information used by the verifier to check the arguments of the
 * BPF struct_ops program assigned to the member. Here, we only care about
 * the arguments that are marked as __nullable.
 *
 * The array of struct bpf_ctx_arg_aux is eventually assigned to
 * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
 * verifier. (See check_struct_ops_btf_id())
 *
 * arg_info->info will be the list of struct bpf_ctx_arg_aux on success.
 * On failure, it is left untouched.
 */
static int prepare_arg_info(struct btf *btf,
			    const char *st_ops_name,
			    const char *member_name,
			    const struct btf_type *func_proto,
			    struct bpf_struct_ops_arg_info *arg_info)
{
	const struct btf_type *stub_func_proto, *pointed_type;
	const struct btf_param *stub_args, *args;
	struct bpf_ctx_arg_aux *info, *info_buf;
	u32 nargs, arg_no, info_cnt = 0;
	u32 arg_btf_id;
	int offset;

	stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
	if (!stub_func_proto)
		return 0;

	/* Check if the number of arguments of the stub function is the same
	 * as the number of arguments of the function pointer.
	 */
	nargs = btf_type_vlen(func_proto);
	if (nargs != btf_type_vlen(stub_func_proto)) {
		pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
			st_ops_name, member_name, member_name, st_ops_name);
		return -EINVAL;
	}

	if (!nargs)
		return 0;

	args = btf_params(func_proto);
	stub_args = btf_params(stub_func_proto);

	info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL);
	if (!info_buf)
		return -ENOMEM;

	/* Prepare info for every nullable argument */
	info = info_buf;
	for (arg_no = 0; arg_no < nargs; arg_no++) {
		/* Skip arguments that are not suffixed with
		 * "__nullable".
		 */
		if (!btf_param_match_suffix(btf, &stub_args[arg_no],
					    MAYBE_NULL_SUFFIX))
			continue;

		/* Should be a pointer to struct */
		pointed_type = btf_type_resolve_ptr(btf,
						    args[arg_no].type,
						    &arg_btf_id);
		if (!pointed_type ||
		    !btf_type_is_struct(pointed_type)) {
			pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
				st_ops_name, member_name, MAYBE_NULL_SUFFIX);
			goto err_out;
		}

		offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
		if (offset < 0) {
			pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
				st_ops_name, member_name, arg_no);
			goto err_out;
		}

		if (args[arg_no].type != stub_args[arg_no].type) {
			pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
				arg_no, st_ops_name, member_name);
			goto err_out;
		}

		/* Fill the information of the new argument */
		info->reg_type =
			PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
		info->btf_id = arg_btf_id;
		info->btf = btf;
		info->offset = offset;

		info++;
		info_cnt++;
	}

	if (info_cnt) {
		arg_info->info = info_buf;
		arg_info->cnt = info_cnt;
	} else {
		kfree(info_buf);
	}

	return 0;

err_out:
	kfree(info_buf);

	return -EINVAL;
}

/* Clean up the arg_info in a struct bpf_struct_ops_desc. */
void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
{
	struct bpf_struct_ops_arg_info *arg_info;
	int i;

	arg_info = st_ops_desc->arg_info;
	for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
		kfree(arg_info[i].info);

	kfree(arg_info);
}
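/* Build the descriptor for one struct_ops type from its BTF: locate the
 * struct and its "bpf_struct_ops_<name>" value type, distill a func_model
 * for every function pointer member and, for members that have a stub
 * function, prepare the nullable-argument info used by the verifier.
 */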
int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
			     struct btf *btf,
			     struct bpf_verifier_log *log)
{
	struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
	struct bpf_struct_ops_arg_info *arg_info;
	const struct btf_member *member;
	const struct btf_type *t;
	s32 type_id, value_id;
	char value_name[128];
	const char *mname;
	int i, err;

	if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
	    sizeof(value_name)) {
		pr_warn("struct_ops name %s is too long\n",
			st_ops->name);
		return -EINVAL;
	}
	sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);

	type_id = btf_find_by_name_kind(btf, st_ops->name,
					BTF_KIND_STRUCT);
	if (type_id < 0) {
		pr_warn("Cannot find struct %s in %s\n",
			st_ops->name, btf_get_name(btf));
		return -EINVAL;
	}
	t = btf_type_by_id(btf, type_id);
	if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
		pr_warn("Cannot support #%u members in struct %s\n",
			btf_type_vlen(t), st_ops->name);
		return -EINVAL;
	}

	value_id = btf_find_by_name_kind(btf, value_name,
					 BTF_KIND_STRUCT);
	if (value_id < 0) {
		pr_warn("Cannot find struct %s in %s\n",
			value_name, btf_get_name(btf));
		return -EINVAL;
	}
	if (!is_valid_value_type(btf, value_id, t, value_name))
		return -EINVAL;

	arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info),
			   GFP_KERNEL);
	if (!arg_info)
		return -ENOMEM;

	st_ops_desc->arg_info = arg_info;
	st_ops_desc->type = t;
	st_ops_desc->type_id = type_id;
	st_ops_desc->value_id = value_id;
	st_ops_desc->value_type = btf_type_by_id(btf, value_id);

	for_each_member(i, t, member) {
		const struct btf_type *func_proto;

		mname = btf_name_by_offset(btf, member->name_off);
		if (!*mname) {
			pr_warn("anon member in struct %s is not supported\n",
				st_ops->name);
			err = -EOPNOTSUPP;
			goto errout;
		}

		if (__btf_member_bitfield_size(t, member)) {
			pr_warn("bit field member %s in struct %s is not supported\n",
				mname, st_ops->name);
			err = -EOPNOTSUPP;
			goto errout;
		}

		func_proto = btf_type_resolve_func_ptr(btf,
						       member->type,
						       NULL);
		if (!func_proto)
			continue;

		if (btf_distill_func_proto(log, btf,
					   func_proto, mname,
					   &st_ops->func_models[i])) {
			pr_warn("Error in parsing func ptr %s in struct %s\n",
				mname, st_ops->name);
			err = -EINVAL;
			goto errout;
		}

		err = prepare_arg_info(btf, st_ops->name, mname,
				       func_proto,
				       arg_info + i);
		if (err)
			goto errout;
	}

	if (st_ops->init(btf)) {
		pr_warn("Error in init bpf_struct_ops %s\n",
			st_ops->name);
		err = -EINVAL;
		goto errout;
	}

	return 0;

errout:
	bpf_struct_ops_desc_release(st_ops_desc);

	return err;
}

static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
					   void *next_key)
{
	if (key && *(u32 *)key == 0)
		return -ENOENT;

	*(u32 *)next_key = 0;
	return 0;
}
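/* A struct_ops map has a single element (key 0).  Its lifecycle is tracked
 * in kvalue.common.state:
 *
 *   INIT     - map created, no value written yet
 *   READY    - value written to a BPF_F_LINK map; st_ops->reg() is deferred
 *              until a bpf_link is created for the map
 *   INUSE    - value written and registered with the subsystem
 *   TOBEFREE - delete_elem() has unregistered it from the subsystem
 *
 * Writers publish the state with smp_store_release() and readers observe
 * it with smp_load_acquire() so that the udata/kdata updates are visible
 * once the new state is seen.
 */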
int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	enum bpf_struct_ops_state state;
	s64 refcnt;

	if (unlikely(*(u32 *)key != 0))
		return -ENOENT;

	kvalue = &st_map->kvalue;
	/* Pair with smp_store_release() during map_update */
	state = smp_load_acquire(&kvalue->common.state);
	if (state == BPF_STRUCT_OPS_STATE_INIT) {
		memset(value, 0, map->value_size);
		return 0;
	}

	/* No lock is needed.  state and refcnt do not need
	 * to be updated together under atomic context.
	 */
	uvalue = value;
	memcpy(uvalue, st_map->uvalue, map->value_size);
	uvalue->common.state = state;

	/* This value offers the user space a general estimate of how
	 * many sockets are still utilizing this struct_ops for TCP
	 * congestion control. The number might not be exact, but it
	 * should sufficiently meet our present goals.
	 */
	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
	refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));

	return 0;
}

static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
	u32 i;

	for (i = 0; i < st_map->links_cnt; i++) {
		if (st_map->links[i]) {
			bpf_link_put(st_map->links[i]);
			st_map->links[i] = NULL;
		}
	}
}

static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{
	const struct btf_member *member;
	u32 i, moff, msize, prev_mend = 0;
	const struct btf_type *mtype;

	for_each_member(i, t, member) {
		moff = __btf_member_bit_offset(t, member) / 8;
		if (moff > prev_mend &&
		    memchr_inv(data + prev_mend, 0, moff - prev_mend))
			return -EINVAL;

		mtype = btf_type_by_id(btf, member->type);
		mtype = btf_resolve_size(btf, mtype, &msize);
		if (IS_ERR(mtype))
			return PTR_ERR(mtype);
		prev_mend = moff + msize;
	}

	if (t->size > prev_mend &&
	    memchr_inv(data + prev_mend, 0, t->size - prev_mend))
		return -EINVAL;

	return 0;
}

static void bpf_struct_ops_link_release(struct bpf_link *link)
{
}

static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
{
	struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);

	kfree(tlink);
}

const struct bpf_link_ops bpf_struct_ops_link_lops = {
	.release = bpf_struct_ops_link_release,
	.dealloc = bpf_struct_ops_link_dealloc,
};
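/* Prepare one trampoline in "image" for one function pointer member.
 *
 * A sketch with made-up names: for a member "void (*bar)(struct sock *sk)"
 * the kernel later calls through kdata->bar, which points into "image".
 * The trampoline saves the arguments according to @model, invokes the BPF
 * program behind @link and, when the member returns a value (ret_size > 0,
 * hence BPF_TRAMP_F_RET_FENTRY_RET), passes the program's return value back
 * to the caller.
 */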
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
				      struct bpf_tramp_link *link,
				      const struct btf_func_model *model,
				      void *stub_func, void *image, void *image_end)
{
	u32 flags = BPF_TRAMP_F_INDIRECT;
	int size;

	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;

	if (model->ret_size > 0)
		flags |= BPF_TRAMP_F_RET_FENTRY_RET;

	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
	if (size < 0)
		return size;
	if (size > (unsigned long)image_end - (unsigned long)image)
		return -E2BIG;
	return arch_prepare_bpf_trampoline(NULL, image, image_end,
					   model, flags, tlinks, stub_func);
}

static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
	const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	const struct btf_type *module_type;
	const struct btf_member *member;
	const struct btf_type *t = st_ops_desc->type;
	struct bpf_tramp_links *tlinks;
	void *udata, *kdata;
	int prog_fd, err;
	void *image, *image_end;
	u32 i;

	if (flags)
		return -EINVAL;

	if (*(u32 *)key != 0)
		return -E2BIG;

	err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value);
	if (err)
		return err;

	uvalue = value;
	err = check_zero_holes(st_map->btf, t, uvalue->data);
	if (err)
		return err;

	if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
		return -EINVAL;

	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
	if (!tlinks)
		return -ENOMEM;

	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;

	mutex_lock(&st_map->lock);

	if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) {
		err = -EBUSY;
		goto unlock;
	}

	memcpy(uvalue, value, map->value_size);

	udata = &uvalue->data;
	kdata = &kvalue->data;
	image = st_map->image;
	image_end = st_map->image + PAGE_SIZE;

	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
	for_each_member(i, t, member) {
		const struct btf_type *mtype, *ptype;
		struct bpf_prog *prog;
		struct bpf_tramp_link *link;
		u32 moff;

		moff = __btf_member_bit_offset(t, member) / 8;
		ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
		if (ptype == module_type) {
			if (*(void **)(udata + moff))
				goto reset_unlock;
			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
			continue;
		}

		err = st_ops->init_member(t, member, kdata, udata);
		if (err < 0)
			goto reset_unlock;

		/* The ->init_member() has handled this member */
		if (err > 0)
			continue;

		/* If st_ops->init_member does not handle it,
		 * we will only handle func ptrs and zero-ed members
		 * here. Reject everything else.
		 */

		/* All non func ptr members must be 0 */
		if (!ptype || !btf_type_is_func_proto(ptype)) {
			u32 msize;

			mtype = btf_type_by_id(st_map->btf, member->type);
			mtype = btf_resolve_size(st_map->btf, mtype, &msize);
			if (IS_ERR(mtype)) {
				err = PTR_ERR(mtype);
				goto reset_unlock;
			}

			if (memchr_inv(udata + moff, 0, msize)) {
				err = -EINVAL;
				goto reset_unlock;
			}

			continue;
		}

		prog_fd = (int)(*(unsigned long *)(udata + moff));
		/* Similar check as the attr->attach_prog_fd */
		if (!prog_fd)
			continue;

		prog = bpf_prog_get(prog_fd);
		if (IS_ERR(prog)) {
			err = PTR_ERR(prog);
			goto reset_unlock;
		}

		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
		    prog->aux->attach_btf_id != st_ops_desc->type_id ||
		    prog->expected_attach_type != i) {
			bpf_prog_put(prog);
			err = -EINVAL;
			goto reset_unlock;
		}

		link = kzalloc(sizeof(*link), GFP_USER);
		if (!link) {
			bpf_prog_put(prog);
			err = -ENOMEM;
			goto reset_unlock;
		}
		bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
			      &bpf_struct_ops_link_lops, prog);
		st_map->links[i] = &link->link;

		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
							&st_ops->func_models[i],
							*(void **)(st_ops->cfi_stubs + moff),
							image, image_end);
		if (err < 0)
			goto reset_unlock;

		*(void **)(kdata + moff) = image + cfi_get_offset();
		image += err;

		/* put prog_id to udata */
		*(unsigned long *)(udata + moff) = prog->aux->id;
	}
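	/* All members are filled in.  There are two ways to bring the
	 * struct_ops online: with BPF_F_LINK the map only transitions to
	 * READY here and the actual st_ops->reg() happens when a bpf_link
	 * is created for it; without BPF_F_LINK the map is registered with
	 * the subsystem right away.
	 */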
	if (st_map->map.map_flags & BPF_F_LINK) {
		err = 0;
		if (st_ops->validate) {
			err = st_ops->validate(kdata);
			if (err)
				goto reset_unlock;
		}
		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
		/* Let bpf_link handle registration & unregistration.
		 *
		 * Pair with smp_load_acquire() during lookup_elem().
		 */
		smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY);
		goto unlock;
	}

	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
	err = st_ops->reg(kdata);
	if (likely(!err)) {
		/* This refcnt increment on the map here after
		 * 'st_ops->reg()' is secure since the state of the
		 * map must be set to INIT at this moment, and thus
		 * bpf_struct_ops_map_delete_elem() can't unregister
		 * or transition it to TOBEFREE concurrently.
		 */
		bpf_map_inc(map);
		/* Pair with smp_load_acquire() during lookup_elem().
		 * It ensures the above udata updates (e.g. prog->aux->id)
		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
		 */
		smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE);
		goto unlock;
	}

	/* Error during st_ops->reg(). Can happen if this struct_ops needs to be
	 * verified as a whole, after all init_member() calls. Can also happen if
	 * there was a race in registering the struct_ops (under the same name) to
	 * a sub-system through different struct_ops's maps.
	 */
	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);

reset_unlock:
	bpf_struct_ops_map_put_progs(st_map);
	memset(uvalue, 0, map->value_size);
	memset(kvalue, 0, map->value_size);
unlock:
	kfree(tlinks);
	mutex_unlock(&st_map->lock);
	return err;
}

static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
	enum bpf_struct_ops_state prev_state;
	struct bpf_struct_ops_map *st_map;

	st_map = (struct bpf_struct_ops_map *)map;
	if (st_map->map.map_flags & BPF_F_LINK)
		return -EOPNOTSUPP;

	prev_state = cmpxchg(&st_map->kvalue.common.state,
			     BPF_STRUCT_OPS_STATE_INUSE,
			     BPF_STRUCT_OPS_STATE_TOBEFREE);
	switch (prev_state) {
	case BPF_STRUCT_OPS_STATE_INUSE:
		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(map);
		return 0;
	case BPF_STRUCT_OPS_STATE_TOBEFREE:
		return -EINPROGRESS;
	case BPF_STRUCT_OPS_STATE_INIT:
		return -ENOENT;
	default:
		WARN_ON_ONCE(1);
		/* Should never happen.  Treat it as not found. */
		return -ENOENT;
	}
}

static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
					     struct seq_file *m)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	void *value;
	int err;

	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		return;

	err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	if (!err) {
		btf_type_seq_show(st_map->btf,
				  map->btf_vmlinux_value_type_id,
				  value, m);
		seq_puts(m, "\n");
	}

	kfree(value);
}

static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	if (st_map->links)
		bpf_struct_ops_map_put_progs(st_map);
	bpf_map_area_free(st_map->links);
	if (st_map->image) {
		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
		bpf_jit_uncharge_modmem(PAGE_SIZE);
	}
	bpf_map_area_free(st_map->uvalue);
	bpf_map_area_free(st_map);
}
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	/* st_ops->owner was acquired during map_alloc to implicitly hold
	 * the btf's refcnt.  The acquire was only done when btf_is_module()
	 * was true.  st_map->btf cannot be NULL here.
	 */
	if (btf_is_module(st_map->btf))
		module_put(st_map->st_ops_desc->st_ops->owner);

	/* The struct_ops's function may switch to another struct_ops.
	 *
	 * For example, bpf_tcp_cc_x->init() may switch to
	 * another tcp_cc_y by calling
	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
	 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
	 * and its refcount may reach 0 which then frees its
	 * trampoline image while tcp_cc_x is still running.
	 *
	 * A vanilla rcu gp is to wait for all bpf-tcp-cc progs
	 * to finish.  bpf-tcp-cc progs are non-sleepable.
	 * A rcu_tasks gp is to wait for the last few insns
	 * in the trampoline image to finish before releasing
	 * the trampoline image.
	 */
	synchronize_rcu_mult(call_rcu, call_rcu_tasks);

	__bpf_struct_ops_map_free(map);
}

static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
	    (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) ||
	    !attr->btf_vmlinux_value_type_id)
		return -EINVAL;
	return 0;
}
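/* A struct_ops map is created with key_size == 4, max_entries == 1 and
 * btf_vmlinux_value_type_id set to the "bpf_struct_ops_<name>" value type.
 * When the struct_ops type is defined in a module, userspace (e.g. libbpf)
 * is expected to additionally pass BPF_F_VTYPE_BTF_OBJ_FD together with
 * value_type_btf_obj_fd so the value type is looked up in the module's BTF
 * rather than in vmlinux BTF.
 */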
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
	const struct bpf_struct_ops_desc *st_ops_desc;
	size_t st_map_size;
	struct bpf_struct_ops_map *st_map;
	const struct btf_type *t, *vt;
	struct module *mod = NULL;
	struct bpf_map *map;
	struct btf *btf;
	int ret;

	if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
		/* The map holds btf for its whole life time. */
		btf = btf_get_by_fd(attr->value_type_btf_obj_fd);
		if (IS_ERR(btf))
			return ERR_CAST(btf);
		if (!btf_is_module(btf)) {
			btf_put(btf);
			return ERR_PTR(-EINVAL);
		}

		mod = btf_try_get_module(btf);
		/* mod holds a refcnt to btf.  We don't need an extra refcnt
		 * here.
		 */
		btf_put(btf);
		if (!mod)
			return ERR_PTR(-EINVAL);
	} else {
		btf = bpf_get_btf_vmlinux();
		if (IS_ERR(btf))
			return ERR_CAST(btf);
		if (!btf)
			return ERR_PTR(-ENOTSUPP);
	}

	st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
	if (!st_ops_desc) {
		ret = -ENOTSUPP;
		goto errout;
	}

	vt = st_ops_desc->value_type;
	if (attr->value_size != vt->size) {
		ret = -EINVAL;
		goto errout;
	}

	t = st_ops_desc->type;

	st_map_size = sizeof(*st_map) +
		      /* kvalue stores the
		       * struct bpf_struct_ops_tcp_congestion_ops
		       */
		      (vt->size - sizeof(struct bpf_struct_ops_value));

	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
	if (!st_map) {
		ret = -ENOMEM;
		goto errout;
	}

	st_map->st_ops_desc = st_ops_desc;
	map = &st_map->map;

	ret = bpf_jit_charge_modmem(PAGE_SIZE);
	if (ret)
		goto errout_free;

	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
	if (!st_map->image) {
		/* __bpf_struct_ops_map_free() uses st_map->image as a flag
		 * for "charged or not".  In this case, we need to uncharge
		 * here.
		 */
		bpf_jit_uncharge_modmem(PAGE_SIZE);
		ret = -ENOMEM;
		goto errout_free;
	}
	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
	st_map->links_cnt = btf_type_vlen(t);
	st_map->links =
		bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_link *),
				   NUMA_NO_NODE);
	if (!st_map->uvalue || !st_map->links) {
		ret = -ENOMEM;
		goto errout_free;
	}
	st_map->btf = btf;

	mutex_init(&st_map->lock);
	bpf_map_init_from_attr(map, attr);

	return map;

errout_free:
	__bpf_struct_ops_map_free(map);
errout:
	module_put(mod);

	return ERR_PTR(ret);
}

static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
	const struct btf_type *vt = st_ops_desc->value_type;
	u64 usage;

	usage = sizeof(*st_map) +
			vt->size - sizeof(struct bpf_struct_ops_value);
	usage += vt->size;
	usage += btf_type_vlen(vt) * sizeof(struct bpf_link *);
	usage += PAGE_SIZE;
	return usage;
}

BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
	.map_alloc_check = bpf_struct_ops_map_alloc_check,
	.map_alloc = bpf_struct_ops_map_alloc,
	.map_free = bpf_struct_ops_map_free,
	.map_get_next_key = bpf_struct_ops_map_get_next_key,
	.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
	.map_delete_elem = bpf_struct_ops_map_delete_elem,
	.map_update_elem = bpf_struct_ops_map_update_elem,
	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
	.map_mem_usage = bpf_struct_ops_map_mem_usage,
	.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
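/* Subsystems normally reach bpf_struct_ops_get()/bpf_struct_ops_put()
 * through bpf_try_module_get() and bpf_module_put(): map_update_elem()
 * above writes the BPF_MODULE_OWNER sentinel into the struct's owner
 * field, which redirects the usual module refcounting to the map's
 * refcount for BPF-implemented struct_ops.
 */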
/* "const void *" because some subsystem is
 * passing a const (e.g. const struct tcp_congestion_ops *)
 */
bool bpf_struct_ops_get(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	map = __bpf_map_inc_not_zero(&st_map->map, false);
	return !IS_ERR(map);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	bpf_map_put(&st_map->map);
}

static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
		map->map_flags & BPF_F_LINK &&
		/* Pair with smp_store_release() during map_update */
		smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY;
}

static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_struct_ops_map *st_map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = (struct bpf_struct_ops_map *)
		rcu_dereference_protected(st_link->map, true);
	if (st_map) {
		/* st_link->map can be NULL if
		 * bpf_struct_ops_link_create() fails to register.
		 */
		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(&st_map->map);
	}
	kfree(st_link);
}

static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
						struct seq_file *seq)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	seq_printf(seq, "map_id:\t%d\n", map->id);
	rcu_read_unlock();
}

static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
						  struct bpf_link_info *info)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	info->struct_ops.map_id = map->id;
	rcu_read_unlock();
	return 0;
}
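/* Serve the update_map operation used by the BPF_LINK_UPDATE command for
 * a struct_ops link: atomically replace the map backing the link with
 * "new_map".  If userspace supplied an expected old map (BPF_F_REPLACE),
 * the currently attached map must match "expected_old_map".
 */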
static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
					  struct bpf_map *expected_old_map)
{
	struct bpf_struct_ops_map *st_map, *old_st_map;
	struct bpf_map *old_map;
	struct bpf_struct_ops_link *st_link;
	int err;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = container_of(new_map, struct bpf_struct_ops_map, map);

	if (!bpf_struct_ops_valid_to_reg(new_map))
		return -EINVAL;

	if (!st_map->st_ops_desc->st_ops->update)
		return -EOPNOTSUPP;

	mutex_lock(&update_mutex);

	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
	if (expected_old_map && old_map != expected_old_map) {
		err = -EPERM;
		goto err_out;
	}

	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
	/* The new and old struct_ops must be the same type. */
	if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
		err = -EINVAL;
		goto err_out;
	}

	err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
	if (err)
		goto err_out;

	bpf_map_inc(new_map);
	rcu_assign_pointer(st_link->map, new_map);
	bpf_map_put(old_map);

err_out:
	mutex_unlock(&update_mutex);

	return err;
}

static const struct bpf_link_ops bpf_struct_ops_map_lops = {
	.dealloc = bpf_struct_ops_map_link_dealloc,
	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
	.update_map = bpf_struct_ops_map_link_update,
};

int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	struct bpf_struct_ops_link *link = NULL;
	struct bpf_link_primer link_primer;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;
	int err;

	map = bpf_map_get(attr->link_create.map_fd);
	if (IS_ERR(map))
		return PTR_ERR(map);

	st_map = (struct bpf_struct_ops_map *)map;

	if (!bpf_struct_ops_valid_to_reg(map)) {
		err = -EINVAL;
		goto err_out;
	}

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto err_out;
	}
	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);

	err = bpf_link_prime(&link->link, &link_primer);
	if (err)
		goto err_out;

	err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
	if (err) {
		bpf_link_cleanup(&link_primer);
		link = NULL;
		goto err_out;
	}
	RCU_INIT_POINTER(link->map, map);

	return bpf_link_settle(&link_primer);

err_out:
	bpf_map_put(map);
	kfree(link);
	return err;
}

void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	info->btf_vmlinux_id = btf_obj_id(st_map->btf);
}