// SPDX-License-Identifier: GPL-2.0
/*
 * FUSE: Filesystem in Userspace
 * Copyright (c) 2023-2024 DataDirect Networks.
 */

#include "fuse_i.h"
#include "dev_uring_i.h"
#include "fuse_dev_i.h"

#include <linux/fs.h>
#include <linux/io_uring/cmd.h>

/* Module knob: fuse-over-io-uring is opt-in */
static bool __read_mostly enable_uring;
module_param(enable_uring, bool, 0644);
MODULE_PARM_DESC(enable_uring,
		 "Enable userspace communication through io-uring");

#define FUSE_URING_IOV_SEGS 2 /* header and payload */


/* Report whether fuse-io-uring is enabled via the module parameter */
bool fuse_uring_enabled(void)
{
	return enable_uring;
}

/* Private data stored in the io_uring cmd, links the cmd to its ring entry */
struct fuse_uring_pdu {
	struct fuse_ring_ent *ent;
};

static const struct fuse_iqueue_ops fuse_io_uring_ops;

/* Stash the ring entry pointer in the command's pdu area */
static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
				   struct fuse_ring_ent *ring_ent)
{
	struct fuse_uring_pdu *pdu =
		io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);

	pdu->ent = ring_ent;
}

/* Retrieve the ring entry previously stored via uring_cmd_set_ring_ent() */
static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd)
{
	struct fuse_uring_pdu *pdu =
		io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);

	return pdu->ent;
}

/*
 * Move queued background requests of this queue to the foreground queue,
 * within the background limits. Caller must hold both queue->lock and
 * fc->bg_lock.
 */
static void fuse_uring_flush_bg(struct fuse_ring_queue *queue)
{
	struct fuse_ring *ring = queue->ring;
	struct fuse_conn *fc = ring->fc;

	lockdep_assert_held(&queue->lock);
	lockdep_assert_held(&fc->bg_lock);

	/*
	 * Allow one bg request per queue, ignoring global fc limits.
	 * This prevents a single queue from consuming all resources and
	 * eliminates the need for remote queue wake-ups when global
	 * limits are met but this queue has no more waiting requests.
	 */
	while ((fc->active_background < fc->max_background ||
		!queue->active_background) &&
	       (!list_empty(&queue->fuse_req_bg_queue))) {
		struct fuse_req *req;

		req = list_first_entry(&queue->fuse_req_bg_queue,
				       struct fuse_req, list);
		fc->active_background++;
		queue->active_background++;

		list_move_tail(&req->list, &queue->fuse_req_queue);
	}
}

/*
 * Finish a request handled through the ring: detach it from its entry,
 * update background accounting and complete it towards fuse core.
 */
static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
			       int error)
{
	struct fuse_ring_queue *queue = ent->queue;
	struct fuse_ring *ring = queue->ring;
	struct fuse_conn *fc = ring->fc;

	lockdep_assert_not_held(&queue->lock);
	spin_lock(&queue->lock);
	ent->fuse_req = NULL;
	if (test_bit(FR_BACKGROUND, &req->flags)) {
		queue->active_background--;
		spin_lock(&fc->bg_lock);
		fuse_uring_flush_bg(queue);
		spin_unlock(&fc->bg_lock);
	}

	spin_unlock(&queue->lock);

	if (error)
		req->out.h.error = error;

	clear_bit(FR_SENT, &req->flags);
	fuse_request_end(req);
}

/* Abort all list queued request on the given ring queue */
static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue)
{
	struct fuse_req *req;
	LIST_HEAD(req_list);

	spin_lock(&queue->lock);
	list_for_each_entry(req, &queue->fuse_req_queue, list)
		clear_bit(FR_PENDING, &req->flags);
	list_splice_init(&queue->fuse_req_queue, &req_list);
	spin_unlock(&queue->lock);

	/* must not hold queue lock to avoid order issues with fi->lock */
	fuse_dev_end_requests(&req_list);
}

/* Abort queued requests on all ring queues (connection abort path) */
void fuse_uring_abort_end_requests(struct fuse_ring *ring)
{
	int qid;
	struct fuse_ring_queue *queue;
	struct fuse_conn *fc = ring->fc;

	for (qid = 0; qid < ring->nr_queues; qid++) {
		queue = READ_ONCE(ring->queues[qid]);
		if (!queue)
			continue;

		queue->stopped = true;

		WARN_ON_ONCE(ring->fc->max_background != UINT_MAX);
		spin_lock(&queue->lock);
		spin_lock(&fc->bg_lock);
		fuse_uring_flush_bg(queue);
		spin_unlock(&fc->bg_lock);
		spin_unlock(&queue->lock);
		fuse_uring_abort_end_queue_requests(queue);
	}
}

/* Check all ring queues for requests that exceeded the request timeout */
bool fuse_uring_request_expired(struct fuse_conn *fc)
{
	struct fuse_ring *ring = fc->ring;
	struct fuse_ring_queue *queue;
	int qid;

	if (!ring)
		return false;

	for (qid = 0; qid < ring->nr_queues; qid++) {
		queue = READ_ONCE(ring->queues[qid]);
		if (!queue)
			continue;

		spin_lock(&queue->lock);
		if (fuse_request_expired(fc, &queue->fuse_req_queue) ||
		    fuse_request_expired(fc, &queue->fuse_req_bg_queue) ||
		    fuse_fpq_processing_expired(fc, queue->fpq.processing)) {
			spin_unlock(&queue->lock);
			return true;
		}
		spin_unlock(&queue->lock);
	}

	return false;
}

/* Free the ring, its queues and released entries on connection destruction */
void fuse_uring_destruct(struct fuse_conn *fc)
{
	struct fuse_ring *ring = fc->ring;
	int qid;

	if (!ring)
		return;

	for (qid = 0; qid < ring->nr_queues; qid++) {
		struct fuse_ring_queue *queue = ring->queues[qid];
		struct fuse_ring_ent *ent, *next;

		if (!queue)
			continue;

		/* all entries must have been torn down by now */
		WARN_ON(!list_empty(&queue->ent_avail_queue));
		WARN_ON(!list_empty(&queue->ent_w_req_queue));
		WARN_ON(!list_empty(&queue->ent_commit_queue));
		WARN_ON(!list_empty(&queue->ent_in_userspace));

		list_for_each_entry_safe(ent, next, &queue->ent_released,
					 list) {
			list_del_init(&ent->list);
			kfree(ent);
		}

		kfree(queue->fpq.processing);
		kfree(queue);
		ring->queues[qid] = NULL;
	}

	kfree(ring->queues);
	kfree(ring);
	fc->ring = NULL;
}

/*
 * Basic ring setup for this connection based on the provided configuration
 */
static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
{
	struct fuse_ring *ring;
	size_t nr_queues = num_possible_cpus();
	struct fuse_ring *res = NULL;
	size_t max_payload_size;

	ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
	if (!ring)
		return NULL;

	ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
			       GFP_KERNEL_ACCOUNT);
	if (!ring->queues)
		goto out_err;

	max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
	max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);

	spin_lock(&fc->lock);
	if (fc->ring) {
		/* race, another thread created the ring in the meantime */
		spin_unlock(&fc->lock);
		res = fc->ring;
		goto out_err;
	}

	init_waitqueue_head(&ring->stop_waitq);

	ring->nr_queues = nr_queues;
	ring->fc = fc;
	ring->max_payload_sz = max_payload_size;
	atomic_set(&ring->queue_refs, 0);
	/* pairs with smp_load_acquire() of fc->ring in readers */
	smp_store_release(&fc->ring, ring);

	spin_unlock(&fc->lock);
	return ring;

out_err:
	kfree(ring->queues);
	kfree(ring);
	return res;
}

/* Allocate and publish the ring queue for the given qid */
static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
						       int qid)
{
	struct fuse_conn *fc = ring->fc;
	struct fuse_ring_queue *queue;
	struct list_head *pq;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
	if (!queue)
		return NULL;
	pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
	if (!pq) {
		kfree(queue);
		return NULL;
	}

	queue->qid = qid;
	queue->ring = ring;
	spin_lock_init(&queue->lock);

	INIT_LIST_HEAD(&queue->ent_avail_queue);
	INIT_LIST_HEAD(&queue->ent_commit_queue);
	INIT_LIST_HEAD(&queue->ent_w_req_queue);
	INIT_LIST_HEAD(&queue->ent_in_userspace);
	INIT_LIST_HEAD(&queue->fuse_req_queue);
	INIT_LIST_HEAD(&queue->fuse_req_bg_queue);
	INIT_LIST_HEAD(&queue->ent_released);

	queue->fpq.processing = pq;
	fuse_pqueue_init(&queue->fpq);

	spin_lock(&fc->lock);
	if (ring->queues[qid]) {
		/* race, another task created this queue in the meantime */
		spin_unlock(&fc->lock);
		kfree(queue->fpq.processing);
		kfree(queue);
		return ring->queues[qid];
	}

	/*
* write_once and lock as the caller mostly doesn't take the lock at all 294 */ 295 WRITE_ONCE(ring->queues[qid], queue); 296 spin_unlock(&fc->lock); 297 298 return queue; 299 } 300 301 static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) 302 { 303 clear_bit(FR_SENT, &req->flags); 304 req->out.h.error = -ECONNABORTED; 305 fuse_request_end(req); 306 } 307 308 /* 309 * Release a request/entry on connection tear down 310 */ 311 static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) 312 { 313 struct fuse_req *req; 314 struct io_uring_cmd *cmd; 315 316 struct fuse_ring_queue *queue = ent->queue; 317 318 spin_lock(&queue->lock); 319 cmd = ent->cmd; 320 ent->cmd = NULL; 321 req = ent->fuse_req; 322 ent->fuse_req = NULL; 323 if (req) { 324 /* remove entry from queue->fpq->processing */ 325 list_del_init(&req->list); 326 } 327 328 /* 329 * The entry must not be freed immediately, due to access of direct 330 * pointer access of entries through IO_URING_F_CANCEL - there is a risk 331 * of race between daemon termination (which triggers IO_URING_F_CANCEL 332 * and accesses entries without checking the list state first 333 */ 334 list_move(&ent->list, &queue->ent_released); 335 ent->state = FRRS_RELEASED; 336 spin_unlock(&queue->lock); 337 338 if (cmd) 339 io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); 340 341 if (req) 342 fuse_uring_stop_fuse_req_end(req); 343 } 344 345 static void fuse_uring_stop_list_entries(struct list_head *head, 346 struct fuse_ring_queue *queue, 347 enum fuse_ring_req_state exp_state) 348 { 349 struct fuse_ring *ring = queue->ring; 350 struct fuse_ring_ent *ent, *next; 351 ssize_t queue_refs = SSIZE_MAX; 352 LIST_HEAD(to_teardown); 353 354 spin_lock(&queue->lock); 355 list_for_each_entry_safe(ent, next, head, list) { 356 if (ent->state != exp_state) { 357 pr_warn("entry teardown qid=%d state=%d expected=%d", 358 queue->qid, ent->state, exp_state); 359 continue; 360 } 361 362 ent->state = FRRS_TEARDOWN; 363 
list_move(&ent->list, &to_teardown); 364 } 365 spin_unlock(&queue->lock); 366 367 /* no queue lock to avoid lock order issues */ 368 list_for_each_entry_safe(ent, next, &to_teardown, list) { 369 fuse_uring_entry_teardown(ent); 370 queue_refs = atomic_dec_return(&ring->queue_refs); 371 WARN_ON_ONCE(queue_refs < 0); 372 } 373 } 374 375 static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) 376 { 377 fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, 378 FRRS_USERSPACE); 379 fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, 380 FRRS_AVAILABLE); 381 } 382 383 /* 384 * Log state debug info 385 */ 386 static void fuse_uring_log_ent_state(struct fuse_ring *ring) 387 { 388 int qid; 389 struct fuse_ring_ent *ent; 390 391 for (qid = 0; qid < ring->nr_queues; qid++) { 392 struct fuse_ring_queue *queue = ring->queues[qid]; 393 394 if (!queue) 395 continue; 396 397 spin_lock(&queue->lock); 398 /* 399 * Log entries from the intermediate queue, the other queues 400 * should be empty 401 */ 402 list_for_each_entry(ent, &queue->ent_w_req_queue, list) { 403 pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", 404 ring, qid, ent, ent->state); 405 } 406 list_for_each_entry(ent, &queue->ent_commit_queue, list) { 407 pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", 408 ring, qid, ent, ent->state); 409 } 410 spin_unlock(&queue->lock); 411 } 412 ring->stop_debug_log = 1; 413 } 414 415 static void fuse_uring_async_stop_queues(struct work_struct *work) 416 { 417 int qid; 418 struct fuse_ring *ring = 419 container_of(work, struct fuse_ring, async_teardown_work.work); 420 421 /* XXX code dup */ 422 for (qid = 0; qid < ring->nr_queues; qid++) { 423 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 424 425 if (!queue) 426 continue; 427 428 fuse_uring_teardown_entries(queue); 429 } 430 431 /* 432 * Some ring entries might be in the middle of IO operations, 433 * i.e. 
in process to get handled by file_operations::uring_cmd 434 * or on the way to userspace - we could handle that with conditions in 435 * run time code, but easier/cleaner to have an async tear down handler 436 * If there are still queue references left 437 */ 438 if (atomic_read(&ring->queue_refs) > 0) { 439 if (time_after(jiffies, 440 ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) 441 fuse_uring_log_ent_state(ring); 442 443 schedule_delayed_work(&ring->async_teardown_work, 444 FUSE_URING_TEARDOWN_INTERVAL); 445 } else { 446 wake_up_all(&ring->stop_waitq); 447 } 448 } 449 450 /* 451 * Stop the ring queues 452 */ 453 void fuse_uring_stop_queues(struct fuse_ring *ring) 454 { 455 int qid; 456 457 for (qid = 0; qid < ring->nr_queues; qid++) { 458 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 459 460 if (!queue) 461 continue; 462 463 fuse_uring_teardown_entries(queue); 464 } 465 466 if (atomic_read(&ring->queue_refs) > 0) { 467 ring->teardown_time = jiffies; 468 INIT_DELAYED_WORK(&ring->async_teardown_work, 469 fuse_uring_async_stop_queues); 470 schedule_delayed_work(&ring->async_teardown_work, 471 FUSE_URING_TEARDOWN_INTERVAL); 472 } else { 473 wake_up_all(&ring->stop_waitq); 474 } 475 } 476 477 /* 478 * Handle IO_URING_F_CANCEL, typically should come on daemon termination. 
 *
 * Releasing the last entry should trigger fuse_dev_release() if
 * the daemon was terminated
 */
static void fuse_uring_cancel(struct io_uring_cmd *cmd,
			      unsigned int issue_flags)
{
	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
	struct fuse_ring_queue *queue;
	bool need_cmd_done = false;

	/*
	 * direct access on ent - it must not be destructed as long as
	 * IO_URING_F_CANCEL might come up
	 */
	queue = ent->queue;
	spin_lock(&queue->lock);
	if (ent->state == FRRS_AVAILABLE) {
		ent->state = FRRS_USERSPACE;
		list_move(&ent->list, &queue->ent_in_userspace);
		need_cmd_done = true;
		ent->cmd = NULL;
	}
	spin_unlock(&queue->lock);

	if (need_cmd_done) {
		/* no queue lock to avoid lock order issues */
		io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
	}
}

/* Link the entry into the cmd's pdu and mark the cmd cancelable */
static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
				      struct fuse_ring_ent *ring_ent)
{
	uring_cmd_set_ring_ent(cmd, ring_ent);
	io_uring_cmd_mark_cancelable(cmd, issue_flags);
}

/*
 * Checks for errors and stores it into the request
 */
static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
					 struct fuse_req *req,
					 struct fuse_conn *fc)
{
	int err;

	err = -EINVAL;
	if (oh->unique == 0) {
		/* Not supported through io-uring yet */
		pr_warn_once("notify through fuse-io-uring not supported\n");
		goto err;
	}

	if (oh->error <= -ERESTARTSYS || oh->error > 0)
		goto err;

	if (oh->error) {
		err = oh->error;
		goto err;
	}

	err = -ENOENT;
	if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) {
		pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n",
				    req->in.h.unique,
				    oh->unique & ~FUSE_INT_REQ_BIT);
		goto err;
	}

	/*
	 * Is it an interrupt reply ID?
	 * XXX: Not supported through fuse-io-uring yet, it should not even
	 * find the request - should not happen.
	 */
	WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT);

	err = 0;
err:
	return err;
}

/* Copy the userspace reply payload from the ring buffer into the request */
static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
				     struct fuse_req *req,
				     struct fuse_ring_ent *ent)
{
	struct fuse_copy_state cs;
	struct fuse_args *args = req->args;
	struct iov_iter iter;
	int err;
	struct fuse_uring_ent_in_out ring_in_out;

	err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out,
			     sizeof(ring_in_out));
	if (err)
		return -EFAULT;

	err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
			  &iter);
	if (err)
		return err;

	fuse_copy_init(&cs, 0, &iter);
	cs.is_uring = 1;
	cs.req = req;

	return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
}

/*
 * Copy data from the req to the ring buffer
 */
static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
				   struct fuse_ring_ent *ent)
{
	struct fuse_copy_state cs;
	struct fuse_args *args = req->args;
	struct fuse_in_arg *in_args = args->in_args;
	int num_args = args->in_numargs;
	int err;
	struct iov_iter iter;
	struct fuse_uring_ent_in_out ent_in_out = {
		.flags = 0,
		.commit_id = req->in.h.unique,
	};

	err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter);
	if (err) {
		pr_info_ratelimited("fuse: Import of user buffer failed\n");
		return err;
	}

	fuse_copy_init(&cs, 1, &iter);
	cs.is_uring = 1;
	cs.req = req;

	if (num_args > 0) {
		/*
		 * Expectation is that the first argument is the per op header.
		 * Some op code have that as zero size.
		 */
		if (args->in_args[0].size > 0) {
			err = copy_to_user(&ent->headers->op_in, in_args->value,
					   in_args->size);
			if (err) {
				pr_info_ratelimited(
					"Copying the header failed.\n");
				return -EFAULT;
			}
		}
		in_args++;
		num_args--;
	}

	/* copy the payload */
	err = fuse_copy_args(&cs, num_args, args->in_pages,
			     (struct fuse_arg *)in_args, 0);
	if (err) {
		pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
		return err;
	}

	ent_in_out.payload_sz = cs.ring.copied_sz;
	err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out,
			   sizeof(ent_in_out));
	return err ? -EFAULT : 0;
}

/* Copy request args and fuse_in_header into the userspace ring buffer */
static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
				   struct fuse_req *req)
{
	struct fuse_ring_queue *queue = ent->queue;
	struct fuse_ring *ring = queue->ring;
	int err;

	err = -EIO;
	if (WARN_ON(ent->state != FRRS_FUSE_REQ)) {
		pr_err("qid=%d ring-req=%p invalid state %d on send\n",
		       queue->qid, ent, ent->state);
		return err;
	}

	err = -EINVAL;
	if (WARN_ON(req->in.h.unique == 0))
		return err;

	/* copy the request */
	err = fuse_uring_args_to_ring(ring, req, ent);
	if (unlikely(err)) {
		pr_info_ratelimited("Copy to ring failed: %d\n", err);
		return err;
	}

	/* copy fuse_in_header */
	err = copy_to_user(&ent->headers->in_out, &req->in.h,
			   sizeof(req->in.h));
	if (err) {
		err = -EFAULT;
		return err;
	}

	return 0;
}

/* Copy the request to the ring; mark it sent, or end it on copy failure */
static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
				   struct fuse_req *req)
{
	int err;

	err = fuse_uring_copy_to_ring(ent, req);
	if (!err)
		set_bit(FR_SENT, &req->flags);
	else
		fuse_uring_req_end(ent, req, err);

	return err;
}

/*
 * Write data to the ring buffer and send the request to userspace,
 * userspace will read it
 * This is comparable with classical read(/dev/fuse)
 */
static
int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
				 struct fuse_req *req,
				 unsigned int issue_flags)
{
	struct fuse_ring_queue *queue = ent->queue;
	int err;
	struct io_uring_cmd *cmd;

	err = fuse_uring_prepare_send(ent, req);
	if (err)
		return err;

	spin_lock(&queue->lock);
	cmd = ent->cmd;
	ent->cmd = NULL;
	ent->state = FRRS_USERSPACE;
	list_move(&ent->list, &queue->ent_in_userspace);
	spin_unlock(&queue->lock);

	/* complete the io-uring command so userspace picks up the request */
	io_uring_cmd_done(cmd, 0, 0, issue_flags);
	return 0;
}

/*
 * Make a ring entry available for fuse_req assignment
 */
static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
				 struct fuse_ring_queue *queue)
{
	WARN_ON_ONCE(!ent->cmd);
	list_move(&ent->list, &queue->ent_avail_queue);
	ent->state = FRRS_AVAILABLE;
}

/* Used to find the request on SQE commit */
static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent,
				 struct fuse_req *req)
{
	struct fuse_ring_queue *queue = ent->queue;
	struct fuse_pqueue *fpq = &queue->fpq;
	unsigned int hash;

	req->ring_entry = ent;
	hash = fuse_req_hash(req->in.h.unique);
	list_move_tail(&req->list, &fpq->processing[hash]);
}

/*
 * Assign a fuse queue entry to the given entry
 */
static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
					   struct fuse_req *req)
{
	struct fuse_ring_queue *queue = ent->queue;

	lockdep_assert_held(&queue->lock);

	if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE &&
			 ent->state != FRRS_COMMIT)) {
		pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid,
			ent->state);
	}

	clear_bit(FR_PENDING, &req->flags);
	ent->fuse_req = req;
	ent->state = FRRS_FUSE_REQ;
	list_move(&ent->list, &queue->ent_w_req_queue);
	fuse_uring_add_to_pq(ent, req);
}

/* Fetch the next fuse request if available */
static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent)
	__must_hold(&queue->lock)
{
	struct fuse_req *req;
	struct fuse_ring_queue *queue = ent->queue;
	struct list_head *req_queue = &queue->fuse_req_queue;

	lockdep_assert_held(&queue->lock);

	/* get and assign the next entry while it is still holding the lock */
	req = list_first_entry_or_null(req_queue, struct fuse_req, list);
	if (req)
		fuse_uring_add_req_to_ring_ent(ent, req);

	return req;
}

/*
 * Read data from the ring buffer, which user space has written to
 * This is comparible with handling of classical write(/dev/fuse).
 * Also make the ring request available again for new fuse requests.
 */
static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
			      unsigned int issue_flags)
{
	struct fuse_ring *ring = ent->queue->ring;
	struct fuse_conn *fc = ring->fc;
	ssize_t err = 0;

	err = copy_from_user(&req->out.h, &ent->headers->in_out,
			     sizeof(req->out.h));
	if (err) {
		req->out.h.error = -EFAULT;
		goto out;
	}

	err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
	if (err) {
		/* req->out.h.error already set */
		goto out;
	}

	err = fuse_uring_copy_from_ring(ring, req, ent);
out:
	fuse_uring_req_end(ent, req, err);
}

/*
 * Get the next fuse req and send it
 */
static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent,
				     struct fuse_ring_queue *queue,
				     unsigned int issue_flags)
{
	int err;
	struct fuse_req *req;

retry:
	spin_lock(&queue->lock);
	fuse_uring_ent_avail(ent, queue);
	req = fuse_uring_ent_assign_req(ent);
	spin_unlock(&queue->lock);

	if (req) {
		err = fuse_uring_send_next_to_ring(ent, req, issue_flags);
		if (err)
			goto retry;
	}
}

/* Move a userspace-held entry to the commit queue; caller holds queue->lock */
static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent)
{
	struct fuse_ring_queue *queue = ent->queue;

	lockdep_assert_held(&queue->lock);

	if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE))
		return -EIO;

	ent->state = FRRS_COMMIT;
	list_move(&ent->list, &queue->ent_commit_queue);

	return 0;
}

/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */
static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
				   struct fuse_conn *fc)
{
	const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
	struct fuse_ring_ent *ent;
	int err;
	struct fuse_ring *ring = fc->ring;
	struct fuse_ring_queue *queue;
	uint64_t commit_id = READ_ONCE(cmd_req->commit_id);
	unsigned int qid = READ_ONCE(cmd_req->qid);
	struct fuse_pqueue *fpq;
	struct fuse_req *req;

	err = -ENOTCONN;
	if (!ring)
		return err;

	if (qid >= ring->nr_queues)
		return -EINVAL;

	queue = ring->queues[qid];
	if (!queue)
		return err;
	fpq = &queue->fpq;

	if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped))
		return err;

	spin_lock(&queue->lock);
	/* Find a request based on the unique ID of the fuse request
	 * This should get revised, as it needs a hash calculation and list
	 * search. And full struct fuse_pqueue is needed (memory overhead).
	 * As well as the link from req to ring_ent.
892 */ 893 req = fuse_request_find(fpq, commit_id); 894 err = -ENOENT; 895 if (!req) { 896 pr_info("qid=%d commit_id %llu not found\n", queue->qid, 897 commit_id); 898 spin_unlock(&queue->lock); 899 return err; 900 } 901 list_del_init(&req->list); 902 ent = req->ring_entry; 903 req->ring_entry = NULL; 904 905 err = fuse_ring_ent_set_commit(ent); 906 if (err != 0) { 907 pr_info_ratelimited("qid=%d commit_id %llu state %d", 908 queue->qid, commit_id, ent->state); 909 spin_unlock(&queue->lock); 910 req->out.h.error = err; 911 clear_bit(FR_SENT, &req->flags); 912 fuse_request_end(req); 913 return err; 914 } 915 916 ent->cmd = cmd; 917 spin_unlock(&queue->lock); 918 919 /* without the queue lock, as other locks are taken */ 920 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 921 fuse_uring_commit(ent, req, issue_flags); 922 923 /* 924 * Fetching the next request is absolutely required as queued 925 * fuse requests would otherwise not get processed - committing 926 * and fetching is done in one step vs legacy fuse, which has separated 927 * read (fetch request) and write (commit result). 
928 */ 929 fuse_uring_next_fuse_req(ent, queue, issue_flags); 930 return 0; 931 } 932 933 static bool is_ring_ready(struct fuse_ring *ring, int current_qid) 934 { 935 int qid; 936 struct fuse_ring_queue *queue; 937 bool ready = true; 938 939 for (qid = 0; qid < ring->nr_queues && ready; qid++) { 940 if (current_qid == qid) 941 continue; 942 943 queue = ring->queues[qid]; 944 if (!queue) { 945 ready = false; 946 break; 947 } 948 949 spin_lock(&queue->lock); 950 if (list_empty(&queue->ent_avail_queue)) 951 ready = false; 952 spin_unlock(&queue->lock); 953 } 954 955 return ready; 956 } 957 958 /* 959 * fuse_uring_req_fetch command handling 960 */ 961 static void fuse_uring_do_register(struct fuse_ring_ent *ent, 962 struct io_uring_cmd *cmd, 963 unsigned int issue_flags) 964 { 965 struct fuse_ring_queue *queue = ent->queue; 966 struct fuse_ring *ring = queue->ring; 967 struct fuse_conn *fc = ring->fc; 968 struct fuse_iqueue *fiq = &fc->iq; 969 970 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 971 972 spin_lock(&queue->lock); 973 ent->cmd = cmd; 974 fuse_uring_ent_avail(ent, queue); 975 spin_unlock(&queue->lock); 976 977 if (!ring->ready) { 978 bool ready = is_ring_ready(ring, queue->qid); 979 980 if (ready) { 981 WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); 982 WRITE_ONCE(ring->ready, true); 983 wake_up_all(&fc->blocked_waitq); 984 } 985 } 986 } 987 988 /* 989 * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] 990 * the payload 991 */ 992 static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, 993 struct iovec iov[FUSE_URING_IOV_SEGS]) 994 { 995 struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); 996 struct iov_iter iter; 997 ssize_t ret; 998 999 if (sqe->len != FUSE_URING_IOV_SEGS) 1000 return -EINVAL; 1001 1002 /* 1003 * Direction for buffer access will actually be READ and WRITE, 1004 * using write for the import should include READ access as well. 
1005 */ 1006 ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, 1007 FUSE_URING_IOV_SEGS, &iov, &iter); 1008 if (ret < 0) 1009 return ret; 1010 1011 return 0; 1012 } 1013 1014 static struct fuse_ring_ent * 1015 fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, 1016 struct fuse_ring_queue *queue) 1017 { 1018 struct fuse_ring *ring = queue->ring; 1019 struct fuse_ring_ent *ent; 1020 size_t payload_size; 1021 struct iovec iov[FUSE_URING_IOV_SEGS]; 1022 int err; 1023 1024 err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); 1025 if (err) { 1026 pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", 1027 err); 1028 return ERR_PTR(err); 1029 } 1030 1031 err = -EINVAL; 1032 if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { 1033 pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); 1034 return ERR_PTR(err); 1035 } 1036 1037 payload_size = iov[1].iov_len; 1038 if (payload_size < ring->max_payload_sz) { 1039 pr_info_ratelimited("Invalid req payload len %zu\n", 1040 payload_size); 1041 return ERR_PTR(err); 1042 } 1043 1044 err = -ENOMEM; 1045 ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); 1046 if (!ent) 1047 return ERR_PTR(err); 1048 1049 INIT_LIST_HEAD(&ent->list); 1050 1051 ent->queue = queue; 1052 ent->headers = iov[0].iov_base; 1053 ent->payload = iov[1].iov_base; 1054 1055 atomic_inc(&ring->queue_refs); 1056 return ent; 1057 } 1058 1059 /* 1060 * Register header and payload buffer with the kernel and puts the 1061 * entry as "ready to get fuse requests" on the queue 1062 */ 1063 static int fuse_uring_register(struct io_uring_cmd *cmd, 1064 unsigned int issue_flags, struct fuse_conn *fc) 1065 { 1066 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); 1067 struct fuse_ring *ring = smp_load_acquire(&fc->ring); 1068 struct fuse_ring_queue *queue; 1069 struct fuse_ring_ent *ent; 1070 int err; 1071 unsigned int qid = READ_ONCE(cmd_req->qid); 1072 1073 err = -ENOMEM; 1074 if (!ring) { 1075 ring = fuse_uring_create(fc); 1076 
		if (!ring)
			return err;
	}

	if (qid >= ring->nr_queues) {
		pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid);
		return -EINVAL;
	}

	queue = ring->queues[qid];
	if (!queue) {
		queue = fuse_uring_create_queue(ring, qid);
		if (!queue)
			return err;
	}

	/*
	 * The created queue above does not need to be destructed in
	 * case of entry errors below, will be done at ring destruction time.
	 */

	ent = fuse_uring_create_ring_ent(cmd, queue);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	fuse_uring_do_register(ent, cmd, issue_flags);

	return 0;
}

/*
 * Entry function from io_uring to handle the given passthrough command
 * (op code IORING_OP_URING_CMD)
 */
int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct fuse_dev *fud;
	struct fuse_conn *fc;
	u32 cmd_op = cmd->cmd_op;
	int err;

	if ((unlikely(issue_flags & IO_URING_F_CANCEL))) {
		fuse_uring_cancel(cmd, issue_flags);
		return 0;
	}

	/* This extra SQE size holds struct fuse_uring_cmd_req */
	if (!(issue_flags & IO_URING_F_SQE128))
		return -EINVAL;

	fud = fuse_get_dev(cmd->file);
	if (!fud) {
		pr_info_ratelimited("No fuse device found\n");
		return -ENOTCONN;
	}
	fc = fud->fc;

	/* Once a connection has io-uring enabled on it, it can't be disabled */
	if (!enable_uring && !fc->io_uring) {
		pr_info_ratelimited("fuse-io-uring is disabled\n");
		return -EOPNOTSUPP;
	}

	if (fc->aborted)
		return -ECONNABORTED;
	if (!fc->connected)
		return -ENOTCONN;

	/*
	 * fuse_uring_register() needs the ring to be initialized,
	 * we need to know the max payload size
	 */
	if (!fc->initialized)
		return -EAGAIN;

	switch (cmd_op) {
	case FUSE_IO_URING_CMD_REGISTER:
		err = fuse_uring_register(cmd, issue_flags, fc);
		if (err) {
			pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n",
				     err);
			fc->io_uring = 0;
			wake_up_all(&fc->blocked_waitq);
			return err;
		}
		break;
	case FUSE_IO_URING_CMD_COMMIT_AND_FETCH:
		err = fuse_uring_commit_fetch(cmd, issue_flags, fc);
		if (err) {
			pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n",
				     err);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	return -EIOCBQUEUED;
}

/* Hand the entry over to userspace and complete the io-uring command */
static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
			    ssize_t ret, unsigned int issue_flags)
{
	struct fuse_ring_queue *queue = ent->queue;

	spin_lock(&queue->lock);
	ent->state = FRRS_USERSPACE;
	list_move(&ent->list, &queue->ent_in_userspace);
	ent->cmd = NULL;
	spin_unlock(&queue->lock);

	io_uring_cmd_done(cmd, ret, 0, issue_flags);
}

/*
 * This prepares and sends the ring request in fuse-uring task context.
 * User buffers are not mapped yet - the application does not have permission
 * to write to it - this has to be executed in ring task context.
 */
static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
				    unsigned int issue_flags)
{
	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
	struct fuse_ring_queue *queue = ent->queue;
	int err;

	if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
		err = fuse_uring_prepare_send(ent, ent->fuse_req);
		if (err) {
			fuse_uring_next_fuse_req(ent, queue, issue_flags);
			return;
		}
	} else {
		err = -ECANCELED;
	}

	fuse_uring_send(ent, cmd, err, issue_flags);
}

/* Map the current CPU to its ring queue (qid 0 fallback on overflow) */
static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
{
	unsigned int qid;
	struct fuse_ring_queue *queue;

	qid = task_cpu(current);

	if (WARN_ONCE(qid >= ring->nr_queues,
		      "Core number (%u) exceeds nr queues (%zu)\n", qid,
		      ring->nr_queues))
		qid = 0;

	queue = ring->queues[qid];
	WARN_ONCE(!queue, "Missing queue for qid %d\n", qid);

	return queue;
}

/* Send the entry's assigned request from ring task context */
static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent)
{
	struct io_uring_cmd *cmd = ent->cmd;

	uring_cmd_set_ring_ent(cmd, ent);
	io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task);
}

/* queue a fuse request and send it if a ring entry is available */
void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
{
	struct fuse_conn *fc = req->fm->fc;
	struct fuse_ring *ring = fc->ring;
	struct fuse_ring_queue *queue;
	struct fuse_ring_ent *ent = NULL;
	int err;

	err = -EINVAL;
	queue = fuse_uring_task_to_queue(ring);
	if (!queue)
		goto err;

	if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
		req->in.h.unique = fuse_get_unique(fiq);

	spin_lock(&queue->lock);
	err = -ENOTCONN;
	if (unlikely(queue->stopped))
		goto err_unlock;

	set_bit(FR_URING, &req->flags);
	req->ring_queue = queue;
	ent = list_first_entry_or_null(&queue->ent_avail_queue,
				       struct fuse_ring_ent, list);
	if (ent)
		fuse_uring_add_req_to_ring_ent(ent, req);
	else
		list_add_tail(&req->list, &queue->fuse_req_queue);
	spin_unlock(&queue->lock);

	if (ent)
		fuse_uring_dispatch_ent(ent);

	return;

err_unlock:
	spin_unlock(&queue->lock);
err:
	req->out.h.error = err;
	clear_bit(FR_PENDING, &req->flags);
	fuse_request_end(req);
}

/* Queue a background request; returns false if the ring cannot take it */
bool fuse_uring_queue_bq_req(struct fuse_req *req)
{
	struct fuse_conn *fc = req->fm->fc;
	struct fuse_ring *ring = fc->ring;
	struct fuse_ring_queue *queue;
	struct fuse_ring_ent *ent = NULL;

	queue = fuse_uring_task_to_queue(ring);
	if (!queue)
		return false;

	spin_lock(&queue->lock);
	if (unlikely(queue->stopped)) {
		spin_unlock(&queue->lock);
		return false;
	}

	set_bit(FR_URING, &req->flags);
	req->ring_queue = queue;
	list_add_tail(&req->list, &queue->fuse_req_bg_queue);

	ent = list_first_entry_or_null(&queue->ent_avail_queue,
				       struct fuse_ring_ent, list);
	spin_lock(&fc->bg_lock);
	fc->num_background++;
	if (fc->num_background == fc->max_background)
		fc->blocked = 1;
	fuse_uring_flush_bg(queue);
	spin_unlock(&fc->bg_lock);

	/*
	 * Due to bg_queue flush limits there might be other bg requests
	 * in the queue that need to be handled first. Or no further req
	 * might be available.
1321 */ 1322 req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, 1323 list); 1324 if (ent && req) { 1325 fuse_uring_add_req_to_ring_ent(ent, req); 1326 spin_unlock(&queue->lock); 1327 1328 fuse_uring_dispatch_ent(ent); 1329 } else { 1330 spin_unlock(&queue->lock); 1331 } 1332 1333 return true; 1334 } 1335 1336 bool fuse_uring_remove_pending_req(struct fuse_req *req) 1337 { 1338 struct fuse_ring_queue *queue = req->ring_queue; 1339 1340 return fuse_remove_pending_req(req, &queue->lock); 1341 } 1342 1343 static const struct fuse_iqueue_ops fuse_io_uring_ops = { 1344 /* should be send over io-uring as enhancement */ 1345 .send_forget = fuse_dev_queue_forget, 1346 1347 /* 1348 * could be send over io-uring, but interrupts should be rare, 1349 * no need to make the code complex 1350 */ 1351 .send_interrupt = fuse_dev_queue_interrupt, 1352 .send_req = fuse_uring_queue_fuse_req, 1353 }; 1354