xref: /linux-6.15/fs/fuse/dev_uring.c (revision 1dfe2a22)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * FUSE: Filesystem in Userspace
4  * Copyright (c) 2023-2024 DataDirect Networks.
5  */
6 
7 #include "fuse_i.h"
8 #include "dev_uring_i.h"
9 #include "fuse_dev_i.h"
10 
11 #include <linux/fs.h>
12 #include <linux/io_uring/cmd.h>
13 
/*
 * Backing variable of the "enable_uring" module parameter (bool, mode 0644).
 * Returned by fuse_uring_enabled() and checked in fuse_uring_cmd().
 */
static bool __read_mostly enable_uring;
module_param(enable_uring, bool, 0644);
MODULE_PARM_DESC(enable_uring,
		 "Enable userspace communication through io-uring");

/* Number of iovec segments a registration SQE has to provide */
#define FUSE_URING_IOV_SEGS 2 /* header and payload */
20 
21 
22 bool fuse_uring_enabled(void)
23 {
24 	return enable_uring;
25 }
26 
/* Private per-command data stored in the pdu area of struct io_uring_cmd */
struct fuse_uring_pdu {
	/* ring entry recorded by uring_cmd_set_ring_ent() */
	struct fuse_ring_ent *ent;
};

/* Installed as fiq->ops by fuse_uring_do_register() once the ring is ready */
static const struct fuse_iqueue_ops fuse_io_uring_ops;
32 
33 static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
34 				   struct fuse_ring_ent *ring_ent)
35 {
36 	struct fuse_uring_pdu *pdu =
37 		io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
38 
39 	pdu->ent = ring_ent;
40 }
41 
42 static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd)
43 {
44 	struct fuse_uring_pdu *pdu =
45 		io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
46 
47 	return pdu->ent;
48 }
49 
50 static void fuse_uring_flush_bg(struct fuse_ring_queue *queue)
51 {
52 	struct fuse_ring *ring = queue->ring;
53 	struct fuse_conn *fc = ring->fc;
54 
55 	lockdep_assert_held(&queue->lock);
56 	lockdep_assert_held(&fc->bg_lock);
57 
58 	/*
59 	 * Allow one bg request per queue, ignoring global fc limits.
60 	 * This prevents a single queue from consuming all resources and
61 	 * eliminates the need for remote queue wake-ups when global
62 	 * limits are met but this queue has no more waiting requests.
63 	 */
64 	while ((fc->active_background < fc->max_background ||
65 		!queue->active_background) &&
66 	       (!list_empty(&queue->fuse_req_bg_queue))) {
67 		struct fuse_req *req;
68 
69 		req = list_first_entry(&queue->fuse_req_bg_queue,
70 				       struct fuse_req, list);
71 		fc->active_background++;
72 		queue->active_background++;
73 
74 		list_move_tail(&req->list, &queue->fuse_req_queue);
75 	}
76 }
77 
78 static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
79 			       int error)
80 {
81 	struct fuse_ring_queue *queue = ent->queue;
82 	struct fuse_ring *ring = queue->ring;
83 	struct fuse_conn *fc = ring->fc;
84 
85 	lockdep_assert_not_held(&queue->lock);
86 	spin_lock(&queue->lock);
87 	ent->fuse_req = NULL;
88 	if (test_bit(FR_BACKGROUND, &req->flags)) {
89 		queue->active_background--;
90 		spin_lock(&fc->bg_lock);
91 		fuse_uring_flush_bg(queue);
92 		spin_unlock(&fc->bg_lock);
93 	}
94 
95 	spin_unlock(&queue->lock);
96 
97 	if (error)
98 		req->out.h.error = error;
99 
100 	clear_bit(FR_SENT, &req->flags);
101 	fuse_request_end(req);
102 }
103 
104 /* Abort all list queued request on the given ring queue */
105 static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue)
106 {
107 	struct fuse_req *req;
108 	LIST_HEAD(req_list);
109 
110 	spin_lock(&queue->lock);
111 	list_for_each_entry(req, &queue->fuse_req_queue, list)
112 		clear_bit(FR_PENDING, &req->flags);
113 	list_splice_init(&queue->fuse_req_queue, &req_list);
114 	spin_unlock(&queue->lock);
115 
116 	/* must not hold queue lock to avoid order issues with fi->lock */
117 	fuse_dev_end_requests(&req_list);
118 }
119 
120 void fuse_uring_abort_end_requests(struct fuse_ring *ring)
121 {
122 	int qid;
123 	struct fuse_ring_queue *queue;
124 	struct fuse_conn *fc = ring->fc;
125 
126 	for (qid = 0; qid < ring->nr_queues; qid++) {
127 		queue = READ_ONCE(ring->queues[qid]);
128 		if (!queue)
129 			continue;
130 
131 		queue->stopped = true;
132 
133 		WARN_ON_ONCE(ring->fc->max_background != UINT_MAX);
134 		spin_lock(&queue->lock);
135 		spin_lock(&fc->bg_lock);
136 		fuse_uring_flush_bg(queue);
137 		spin_unlock(&fc->bg_lock);
138 		spin_unlock(&queue->lock);
139 		fuse_uring_abort_end_queue_requests(queue);
140 	}
141 }
142 
143 bool fuse_uring_request_expired(struct fuse_conn *fc)
144 {
145 	struct fuse_ring *ring = fc->ring;
146 	struct fuse_ring_queue *queue;
147 	int qid;
148 
149 	if (!ring)
150 		return false;
151 
152 	for (qid = 0; qid < ring->nr_queues; qid++) {
153 		queue = READ_ONCE(ring->queues[qid]);
154 		if (!queue)
155 			continue;
156 
157 		spin_lock(&queue->lock);
158 		if (fuse_request_expired(fc, &queue->fuse_req_queue) ||
159 		    fuse_request_expired(fc, &queue->fuse_req_bg_queue) ||
160 		    fuse_fpq_processing_expired(fc, queue->fpq.processing)) {
161 			spin_unlock(&queue->lock);
162 			return true;
163 		}
164 		spin_unlock(&queue->lock);
165 	}
166 
167 	return false;
168 }
169 
170 void fuse_uring_destruct(struct fuse_conn *fc)
171 {
172 	struct fuse_ring *ring = fc->ring;
173 	int qid;
174 
175 	if (!ring)
176 		return;
177 
178 	for (qid = 0; qid < ring->nr_queues; qid++) {
179 		struct fuse_ring_queue *queue = ring->queues[qid];
180 		struct fuse_ring_ent *ent, *next;
181 
182 		if (!queue)
183 			continue;
184 
185 		WARN_ON(!list_empty(&queue->ent_avail_queue));
186 		WARN_ON(!list_empty(&queue->ent_w_req_queue));
187 		WARN_ON(!list_empty(&queue->ent_commit_queue));
188 		WARN_ON(!list_empty(&queue->ent_in_userspace));
189 
190 		list_for_each_entry_safe(ent, next, &queue->ent_released,
191 					 list) {
192 			list_del_init(&ent->list);
193 			kfree(ent);
194 		}
195 
196 		kfree(queue->fpq.processing);
197 		kfree(queue);
198 		ring->queues[qid] = NULL;
199 	}
200 
201 	kfree(ring->queues);
202 	kfree(ring);
203 	fc->ring = NULL;
204 }
205 
206 /*
207  * Basic ring setup for this connection based on the provided configuration
208  */
209 static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
210 {
211 	struct fuse_ring *ring;
212 	size_t nr_queues = num_possible_cpus();
213 	struct fuse_ring *res = NULL;
214 	size_t max_payload_size;
215 
216 	ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
217 	if (!ring)
218 		return NULL;
219 
220 	ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
221 			       GFP_KERNEL_ACCOUNT);
222 	if (!ring->queues)
223 		goto out_err;
224 
225 	max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
226 	max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);
227 
228 	spin_lock(&fc->lock);
229 	if (fc->ring) {
230 		/* race, another thread created the ring in the meantime */
231 		spin_unlock(&fc->lock);
232 		res = fc->ring;
233 		goto out_err;
234 	}
235 
236 	init_waitqueue_head(&ring->stop_waitq);
237 
238 	ring->nr_queues = nr_queues;
239 	ring->fc = fc;
240 	ring->max_payload_sz = max_payload_size;
241 	atomic_set(&ring->queue_refs, 0);
242 	smp_store_release(&fc->ring, ring);
243 
244 	spin_unlock(&fc->lock);
245 	return ring;
246 
247 out_err:
248 	kfree(ring->queues);
249 	kfree(ring);
250 	return res;
251 }
252 
253 static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
254 						       int qid)
255 {
256 	struct fuse_conn *fc = ring->fc;
257 	struct fuse_ring_queue *queue;
258 	struct list_head *pq;
259 
260 	queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
261 	if (!queue)
262 		return NULL;
263 	pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
264 	if (!pq) {
265 		kfree(queue);
266 		return NULL;
267 	}
268 
269 	queue->qid = qid;
270 	queue->ring = ring;
271 	spin_lock_init(&queue->lock);
272 
273 	INIT_LIST_HEAD(&queue->ent_avail_queue);
274 	INIT_LIST_HEAD(&queue->ent_commit_queue);
275 	INIT_LIST_HEAD(&queue->ent_w_req_queue);
276 	INIT_LIST_HEAD(&queue->ent_in_userspace);
277 	INIT_LIST_HEAD(&queue->fuse_req_queue);
278 	INIT_LIST_HEAD(&queue->fuse_req_bg_queue);
279 	INIT_LIST_HEAD(&queue->ent_released);
280 
281 	queue->fpq.processing = pq;
282 	fuse_pqueue_init(&queue->fpq);
283 
284 	spin_lock(&fc->lock);
285 	if (ring->queues[qid]) {
286 		spin_unlock(&fc->lock);
287 		kfree(queue->fpq.processing);
288 		kfree(queue);
289 		return ring->queues[qid];
290 	}
291 
292 	/*
293 	 * write_once and lock as the caller mostly doesn't take the lock at all
294 	 */
295 	WRITE_ONCE(ring->queues[qid], queue);
296 	spin_unlock(&fc->lock);
297 
298 	return queue;
299 }
300 
301 static void fuse_uring_stop_fuse_req_end(struct fuse_req *req)
302 {
303 	clear_bit(FR_SENT, &req->flags);
304 	req->out.h.error = -ECONNABORTED;
305 	fuse_request_end(req);
306 }
307 
308 /*
309  * Release a request/entry on connection tear down
310  */
311 static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
312 {
313 	struct fuse_req *req;
314 	struct io_uring_cmd *cmd;
315 
316 	struct fuse_ring_queue *queue = ent->queue;
317 
318 	spin_lock(&queue->lock);
319 	cmd = ent->cmd;
320 	ent->cmd = NULL;
321 	req = ent->fuse_req;
322 	ent->fuse_req = NULL;
323 	if (req) {
324 		/* remove entry from queue->fpq->processing */
325 		list_del_init(&req->list);
326 	}
327 
328 	/*
329 	 * The entry must not be freed immediately, due to access of direct
330 	 * pointer access of entries through IO_URING_F_CANCEL - there is a risk
331 	 * of race between daemon termination (which triggers IO_URING_F_CANCEL
332 	 * and accesses entries without checking the list state first
333 	 */
334 	list_move(&ent->list, &queue->ent_released);
335 	ent->state = FRRS_RELEASED;
336 	spin_unlock(&queue->lock);
337 
338 	if (cmd)
339 		io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
340 
341 	if (req)
342 		fuse_uring_stop_fuse_req_end(req);
343 }
344 
345 static void fuse_uring_stop_list_entries(struct list_head *head,
346 					 struct fuse_ring_queue *queue,
347 					 enum fuse_ring_req_state exp_state)
348 {
349 	struct fuse_ring *ring = queue->ring;
350 	struct fuse_ring_ent *ent, *next;
351 	ssize_t queue_refs = SSIZE_MAX;
352 	LIST_HEAD(to_teardown);
353 
354 	spin_lock(&queue->lock);
355 	list_for_each_entry_safe(ent, next, head, list) {
356 		if (ent->state != exp_state) {
357 			pr_warn("entry teardown qid=%d state=%d expected=%d",
358 				queue->qid, ent->state, exp_state);
359 			continue;
360 		}
361 
362 		ent->state = FRRS_TEARDOWN;
363 		list_move(&ent->list, &to_teardown);
364 	}
365 	spin_unlock(&queue->lock);
366 
367 	/* no queue lock to avoid lock order issues */
368 	list_for_each_entry_safe(ent, next, &to_teardown, list) {
369 		fuse_uring_entry_teardown(ent);
370 		queue_refs = atomic_dec_return(&ring->queue_refs);
371 		WARN_ON_ONCE(queue_refs < 0);
372 	}
373 }
374 
375 static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue)
376 {
377 	fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue,
378 				     FRRS_USERSPACE);
379 	fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue,
380 				     FRRS_AVAILABLE);
381 }
382 
383 /*
384  * Log state debug info
385  */
386 static void fuse_uring_log_ent_state(struct fuse_ring *ring)
387 {
388 	int qid;
389 	struct fuse_ring_ent *ent;
390 
391 	for (qid = 0; qid < ring->nr_queues; qid++) {
392 		struct fuse_ring_queue *queue = ring->queues[qid];
393 
394 		if (!queue)
395 			continue;
396 
397 		spin_lock(&queue->lock);
398 		/*
399 		 * Log entries from the intermediate queue, the other queues
400 		 * should be empty
401 		 */
402 		list_for_each_entry(ent, &queue->ent_w_req_queue, list) {
403 			pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n",
404 				ring, qid, ent, ent->state);
405 		}
406 		list_for_each_entry(ent, &queue->ent_commit_queue, list) {
407 			pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n",
408 				ring, qid, ent, ent->state);
409 		}
410 		spin_unlock(&queue->lock);
411 	}
412 	ring->stop_debug_log = 1;
413 }
414 
415 static void fuse_uring_async_stop_queues(struct work_struct *work)
416 {
417 	int qid;
418 	struct fuse_ring *ring =
419 		container_of(work, struct fuse_ring, async_teardown_work.work);
420 
421 	/* XXX code dup */
422 	for (qid = 0; qid < ring->nr_queues; qid++) {
423 		struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
424 
425 		if (!queue)
426 			continue;
427 
428 		fuse_uring_teardown_entries(queue);
429 	}
430 
431 	/*
432 	 * Some ring entries might be in the middle of IO operations,
433 	 * i.e. in process to get handled by file_operations::uring_cmd
434 	 * or on the way to userspace - we could handle that with conditions in
435 	 * run time code, but easier/cleaner to have an async tear down handler
436 	 * If there are still queue references left
437 	 */
438 	if (atomic_read(&ring->queue_refs) > 0) {
439 		if (time_after(jiffies,
440 			       ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT))
441 			fuse_uring_log_ent_state(ring);
442 
443 		schedule_delayed_work(&ring->async_teardown_work,
444 				      FUSE_URING_TEARDOWN_INTERVAL);
445 	} else {
446 		wake_up_all(&ring->stop_waitq);
447 	}
448 }
449 
450 /*
451  * Stop the ring queues
452  */
453 void fuse_uring_stop_queues(struct fuse_ring *ring)
454 {
455 	int qid;
456 
457 	for (qid = 0; qid < ring->nr_queues; qid++) {
458 		struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
459 
460 		if (!queue)
461 			continue;
462 
463 		fuse_uring_teardown_entries(queue);
464 	}
465 
466 	if (atomic_read(&ring->queue_refs) > 0) {
467 		ring->teardown_time = jiffies;
468 		INIT_DELAYED_WORK(&ring->async_teardown_work,
469 				  fuse_uring_async_stop_queues);
470 		schedule_delayed_work(&ring->async_teardown_work,
471 				      FUSE_URING_TEARDOWN_INTERVAL);
472 	} else {
473 		wake_up_all(&ring->stop_waitq);
474 	}
475 }
476 
477 /*
478  * Handle IO_URING_F_CANCEL, typically should come on daemon termination.
479  *
480  * Releasing the last entry should trigger fuse_dev_release() if
481  * the daemon was terminated
482  */
483 static void fuse_uring_cancel(struct io_uring_cmd *cmd,
484 			      unsigned int issue_flags)
485 {
486 	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
487 	struct fuse_ring_queue *queue;
488 	bool need_cmd_done = false;
489 
490 	/*
491 	 * direct access on ent - it must not be destructed as long as
492 	 * IO_URING_F_CANCEL might come up
493 	 */
494 	queue = ent->queue;
495 	spin_lock(&queue->lock);
496 	if (ent->state == FRRS_AVAILABLE) {
497 		ent->state = FRRS_USERSPACE;
498 		list_move(&ent->list, &queue->ent_in_userspace);
499 		need_cmd_done = true;
500 		ent->cmd = NULL;
501 	}
502 	spin_unlock(&queue->lock);
503 
504 	if (need_cmd_done) {
505 		/* no queue lock to avoid lock order issues */
506 		io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
507 	}
508 }
509 
/*
 * Make @cmd cancelable: store @ring_ent in the command's private data,
 * where fuse_uring_cancel() looks it up, then mark the command cancelable.
 */
static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
				      struct fuse_ring_ent *ring_ent)
{
	/* the entry has to be in place before the command can get cancelled */
	uring_cmd_set_ring_ent(cmd, ring_ent);

	io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
516 
517 /*
518  * Checks for errors and stores it into the request
519  */
520 static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
521 					 struct fuse_req *req,
522 					 struct fuse_conn *fc)
523 {
524 	int err;
525 
526 	err = -EINVAL;
527 	if (oh->unique == 0) {
528 		/* Not supported through io-uring yet */
529 		pr_warn_once("notify through fuse-io-uring not supported\n");
530 		goto err;
531 	}
532 
533 	if (oh->error <= -ERESTARTSYS || oh->error > 0)
534 		goto err;
535 
536 	if (oh->error) {
537 		err = oh->error;
538 		goto err;
539 	}
540 
541 	err = -ENOENT;
542 	if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) {
543 		pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n",
544 				    req->in.h.unique,
545 				    oh->unique & ~FUSE_INT_REQ_BIT);
546 		goto err;
547 	}
548 
549 	/*
550 	 * Is it an interrupt reply ID?
551 	 * XXX: Not supported through fuse-io-uring yet, it should not even
552 	 *      find the request - should not happen.
553 	 */
554 	WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT);
555 
556 	err = 0;
557 err:
558 	return err;
559 }
560 
561 static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
562 				     struct fuse_req *req,
563 				     struct fuse_ring_ent *ent)
564 {
565 	struct fuse_copy_state cs;
566 	struct fuse_args *args = req->args;
567 	struct iov_iter iter;
568 	int err;
569 	struct fuse_uring_ent_in_out ring_in_out;
570 
571 	err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out,
572 			     sizeof(ring_in_out));
573 	if (err)
574 		return -EFAULT;
575 
576 	err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
577 			  &iter);
578 	if (err)
579 		return err;
580 
581 	fuse_copy_init(&cs, 0, &iter);
582 	cs.is_uring = 1;
583 	cs.req = req;
584 
585 	return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
586 }
587 
588  /*
589   * Copy data from the req to the ring buffer
590   */
591 static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
592 				   struct fuse_ring_ent *ent)
593 {
594 	struct fuse_copy_state cs;
595 	struct fuse_args *args = req->args;
596 	struct fuse_in_arg *in_args = args->in_args;
597 	int num_args = args->in_numargs;
598 	int err;
599 	struct iov_iter iter;
600 	struct fuse_uring_ent_in_out ent_in_out = {
601 		.flags = 0,
602 		.commit_id = req->in.h.unique,
603 	};
604 
605 	err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter);
606 	if (err) {
607 		pr_info_ratelimited("fuse: Import of user buffer failed\n");
608 		return err;
609 	}
610 
611 	fuse_copy_init(&cs, 1, &iter);
612 	cs.is_uring = 1;
613 	cs.req = req;
614 
615 	if (num_args > 0) {
616 		/*
617 		 * Expectation is that the first argument is the per op header.
618 		 * Some op code have that as zero size.
619 		 */
620 		if (args->in_args[0].size > 0) {
621 			err = copy_to_user(&ent->headers->op_in, in_args->value,
622 					   in_args->size);
623 			if (err) {
624 				pr_info_ratelimited(
625 					"Copying the header failed.\n");
626 				return -EFAULT;
627 			}
628 		}
629 		in_args++;
630 		num_args--;
631 	}
632 
633 	/* copy the payload */
634 	err = fuse_copy_args(&cs, num_args, args->in_pages,
635 			     (struct fuse_arg *)in_args, 0);
636 	if (err) {
637 		pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
638 		return err;
639 	}
640 
641 	ent_in_out.payload_sz = cs.ring.copied_sz;
642 	err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out,
643 			   sizeof(ent_in_out));
644 	return err ? -EFAULT : 0;
645 }
646 
647 static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
648 				   struct fuse_req *req)
649 {
650 	struct fuse_ring_queue *queue = ent->queue;
651 	struct fuse_ring *ring = queue->ring;
652 	int err;
653 
654 	err = -EIO;
655 	if (WARN_ON(ent->state != FRRS_FUSE_REQ)) {
656 		pr_err("qid=%d ring-req=%p invalid state %d on send\n",
657 		       queue->qid, ent, ent->state);
658 		return err;
659 	}
660 
661 	err = -EINVAL;
662 	if (WARN_ON(req->in.h.unique == 0))
663 		return err;
664 
665 	/* copy the request */
666 	err = fuse_uring_args_to_ring(ring, req, ent);
667 	if (unlikely(err)) {
668 		pr_info_ratelimited("Copy to ring failed: %d\n", err);
669 		return err;
670 	}
671 
672 	/* copy fuse_in_header */
673 	err = copy_to_user(&ent->headers->in_out, &req->in.h,
674 			   sizeof(req->in.h));
675 	if (err) {
676 		err = -EFAULT;
677 		return err;
678 	}
679 
680 	return 0;
681 }
682 
683 static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
684 				   struct fuse_req *req)
685 {
686 	int err;
687 
688 	err = fuse_uring_copy_to_ring(ent, req);
689 	if (!err)
690 		set_bit(FR_SENT, &req->flags);
691 	else
692 		fuse_uring_req_end(ent, req, err);
693 
694 	return err;
695 }
696 
697 /*
698  * Write data to the ring buffer and send the request to userspace,
699  * userspace will read it
700  * This is comparable with classical read(/dev/fuse)
701  */
702 static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
703 					struct fuse_req *req,
704 					unsigned int issue_flags)
705 {
706 	struct fuse_ring_queue *queue = ent->queue;
707 	int err;
708 	struct io_uring_cmd *cmd;
709 
710 	err = fuse_uring_prepare_send(ent, req);
711 	if (err)
712 		return err;
713 
714 	spin_lock(&queue->lock);
715 	cmd = ent->cmd;
716 	ent->cmd = NULL;
717 	ent->state = FRRS_USERSPACE;
718 	list_move(&ent->list, &queue->ent_in_userspace);
719 	spin_unlock(&queue->lock);
720 
721 	io_uring_cmd_done(cmd, 0, 0, issue_flags);
722 	return 0;
723 }
724 
725 /*
726  * Make a ring entry available for fuse_req assignment
727  */
728 static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
729 				 struct fuse_ring_queue *queue)
730 {
731 	WARN_ON_ONCE(!ent->cmd);
732 	list_move(&ent->list, &queue->ent_avail_queue);
733 	ent->state = FRRS_AVAILABLE;
734 }
735 
736 /* Used to find the request on SQE commit */
737 static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent,
738 				 struct fuse_req *req)
739 {
740 	struct fuse_ring_queue *queue = ent->queue;
741 	struct fuse_pqueue *fpq = &queue->fpq;
742 	unsigned int hash;
743 
744 	req->ring_entry = ent;
745 	hash = fuse_req_hash(req->in.h.unique);
746 	list_move_tail(&req->list, &fpq->processing[hash]);
747 }
748 
749 /*
750  * Assign a fuse queue entry to the given entry
751  */
752 static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
753 					   struct fuse_req *req)
754 {
755 	struct fuse_ring_queue *queue = ent->queue;
756 
757 	lockdep_assert_held(&queue->lock);
758 
759 	if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE &&
760 			 ent->state != FRRS_COMMIT)) {
761 		pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid,
762 			ent->state);
763 	}
764 
765 	clear_bit(FR_PENDING, &req->flags);
766 	ent->fuse_req = req;
767 	ent->state = FRRS_FUSE_REQ;
768 	list_move(&ent->list, &queue->ent_w_req_queue);
769 	fuse_uring_add_to_pq(ent, req);
770 }
771 
772 /* Fetch the next fuse request if available */
773 static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent)
774 	__must_hold(&queue->lock)
775 {
776 	struct fuse_req *req;
777 	struct fuse_ring_queue *queue = ent->queue;
778 	struct list_head *req_queue = &queue->fuse_req_queue;
779 
780 	lockdep_assert_held(&queue->lock);
781 
782 	/* get and assign the next entry while it is still holding the lock */
783 	req = list_first_entry_or_null(req_queue, struct fuse_req, list);
784 	if (req)
785 		fuse_uring_add_req_to_ring_ent(ent, req);
786 
787 	return req;
788 }
789 
790 /*
791  * Read data from the ring buffer, which user space has written to
792  * This is comparible with handling of classical write(/dev/fuse).
793  * Also make the ring request available again for new fuse requests.
794  */
795 static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
796 			      unsigned int issue_flags)
797 {
798 	struct fuse_ring *ring = ent->queue->ring;
799 	struct fuse_conn *fc = ring->fc;
800 	ssize_t err = 0;
801 
802 	err = copy_from_user(&req->out.h, &ent->headers->in_out,
803 			     sizeof(req->out.h));
804 	if (err) {
805 		req->out.h.error = -EFAULT;
806 		goto out;
807 	}
808 
809 	err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
810 	if (err) {
811 		/* req->out.h.error already set */
812 		goto out;
813 	}
814 
815 	err = fuse_uring_copy_from_ring(ring, req, ent);
816 out:
817 	fuse_uring_req_end(ent, req, err);
818 }
819 
820 /*
821  * Get the next fuse req and send it
822  */
823 static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent,
824 				     struct fuse_ring_queue *queue,
825 				     unsigned int issue_flags)
826 {
827 	int err;
828 	struct fuse_req *req;
829 
830 retry:
831 	spin_lock(&queue->lock);
832 	fuse_uring_ent_avail(ent, queue);
833 	req = fuse_uring_ent_assign_req(ent);
834 	spin_unlock(&queue->lock);
835 
836 	if (req) {
837 		err = fuse_uring_send_next_to_ring(ent, req, issue_flags);
838 		if (err)
839 			goto retry;
840 	}
841 }
842 
843 static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent)
844 {
845 	struct fuse_ring_queue *queue = ent->queue;
846 
847 	lockdep_assert_held(&queue->lock);
848 
849 	if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE))
850 		return -EIO;
851 
852 	ent->state = FRRS_COMMIT;
853 	list_move(&ent->list, &queue->ent_commit_queue);
854 
855 	return 0;
856 }
857 
858 /* FUSE_URING_CMD_COMMIT_AND_FETCH handler */
859 static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
860 				   struct fuse_conn *fc)
861 {
862 	const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
863 	struct fuse_ring_ent *ent;
864 	int err;
865 	struct fuse_ring *ring = fc->ring;
866 	struct fuse_ring_queue *queue;
867 	uint64_t commit_id = READ_ONCE(cmd_req->commit_id);
868 	unsigned int qid = READ_ONCE(cmd_req->qid);
869 	struct fuse_pqueue *fpq;
870 	struct fuse_req *req;
871 
872 	err = -ENOTCONN;
873 	if (!ring)
874 		return err;
875 
876 	if (qid >= ring->nr_queues)
877 		return -EINVAL;
878 
879 	queue = ring->queues[qid];
880 	if (!queue)
881 		return err;
882 	fpq = &queue->fpq;
883 
884 	if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped))
885 		return err;
886 
887 	spin_lock(&queue->lock);
888 	/* Find a request based on the unique ID of the fuse request
889 	 * This should get revised, as it needs a hash calculation and list
890 	 * search. And full struct fuse_pqueue is needed (memory overhead).
891 	 * As well as the link from req to ring_ent.
892 	 */
893 	req = fuse_request_find(fpq, commit_id);
894 	err = -ENOENT;
895 	if (!req) {
896 		pr_info("qid=%d commit_id %llu not found\n", queue->qid,
897 			commit_id);
898 		spin_unlock(&queue->lock);
899 		return err;
900 	}
901 	list_del_init(&req->list);
902 	ent = req->ring_entry;
903 	req->ring_entry = NULL;
904 
905 	err = fuse_ring_ent_set_commit(ent);
906 	if (err != 0) {
907 		pr_info_ratelimited("qid=%d commit_id %llu state %d",
908 				    queue->qid, commit_id, ent->state);
909 		spin_unlock(&queue->lock);
910 		req->out.h.error = err;
911 		clear_bit(FR_SENT, &req->flags);
912 		fuse_request_end(req);
913 		return err;
914 	}
915 
916 	ent->cmd = cmd;
917 	spin_unlock(&queue->lock);
918 
919 	/* without the queue lock, as other locks are taken */
920 	fuse_uring_prepare_cancel(cmd, issue_flags, ent);
921 	fuse_uring_commit(ent, req, issue_flags);
922 
923 	/*
924 	 * Fetching the next request is absolutely required as queued
925 	 * fuse requests would otherwise not get processed - committing
926 	 * and fetching is done in one step vs legacy fuse, which has separated
927 	 * read (fetch request) and write (commit result).
928 	 */
929 	fuse_uring_next_fuse_req(ent, queue, issue_flags);
930 	return 0;
931 }
932 
933 static bool is_ring_ready(struct fuse_ring *ring, int current_qid)
934 {
935 	int qid;
936 	struct fuse_ring_queue *queue;
937 	bool ready = true;
938 
939 	for (qid = 0; qid < ring->nr_queues && ready; qid++) {
940 		if (current_qid == qid)
941 			continue;
942 
943 		queue = ring->queues[qid];
944 		if (!queue) {
945 			ready = false;
946 			break;
947 		}
948 
949 		spin_lock(&queue->lock);
950 		if (list_empty(&queue->ent_avail_queue))
951 			ready = false;
952 		spin_unlock(&queue->lock);
953 	}
954 
955 	return ready;
956 }
957 
958 /*
959  * fuse_uring_req_fetch command handling
960  */
961 static void fuse_uring_do_register(struct fuse_ring_ent *ent,
962 				   struct io_uring_cmd *cmd,
963 				   unsigned int issue_flags)
964 {
965 	struct fuse_ring_queue *queue = ent->queue;
966 	struct fuse_ring *ring = queue->ring;
967 	struct fuse_conn *fc = ring->fc;
968 	struct fuse_iqueue *fiq = &fc->iq;
969 
970 	fuse_uring_prepare_cancel(cmd, issue_flags, ent);
971 
972 	spin_lock(&queue->lock);
973 	ent->cmd = cmd;
974 	fuse_uring_ent_avail(ent, queue);
975 	spin_unlock(&queue->lock);
976 
977 	if (!ring->ready) {
978 		bool ready = is_ring_ready(ring, queue->qid);
979 
980 		if (ready) {
981 			WRITE_ONCE(fiq->ops, &fuse_io_uring_ops);
982 			WRITE_ONCE(ring->ready, true);
983 			wake_up_all(&fc->blocked_waitq);
984 		}
985 	}
986 }
987 
988 /*
989  * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1]
990  * the payload
991  */
992 static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
993 					 struct iovec iov[FUSE_URING_IOV_SEGS])
994 {
995 	struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr));
996 	struct iov_iter iter;
997 	ssize_t ret;
998 
999 	if (sqe->len != FUSE_URING_IOV_SEGS)
1000 		return -EINVAL;
1001 
1002 	/*
1003 	 * Direction for buffer access will actually be READ and WRITE,
1004 	 * using write for the import should include READ access as well.
1005 	 */
1006 	ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS,
1007 			   FUSE_URING_IOV_SEGS, &iov, &iter);
1008 	if (ret < 0)
1009 		return ret;
1010 
1011 	return 0;
1012 }
1013 
1014 static struct fuse_ring_ent *
1015 fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
1016 			   struct fuse_ring_queue *queue)
1017 {
1018 	struct fuse_ring *ring = queue->ring;
1019 	struct fuse_ring_ent *ent;
1020 	size_t payload_size;
1021 	struct iovec iov[FUSE_URING_IOV_SEGS];
1022 	int err;
1023 
1024 	err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov);
1025 	if (err) {
1026 		pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n",
1027 				    err);
1028 		return ERR_PTR(err);
1029 	}
1030 
1031 	err = -EINVAL;
1032 	if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) {
1033 		pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len);
1034 		return ERR_PTR(err);
1035 	}
1036 
1037 	payload_size = iov[1].iov_len;
1038 	if (payload_size < ring->max_payload_sz) {
1039 		pr_info_ratelimited("Invalid req payload len %zu\n",
1040 				    payload_size);
1041 		return ERR_PTR(err);
1042 	}
1043 
1044 	err = -ENOMEM;
1045 	ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
1046 	if (!ent)
1047 		return ERR_PTR(err);
1048 
1049 	INIT_LIST_HEAD(&ent->list);
1050 
1051 	ent->queue = queue;
1052 	ent->headers = iov[0].iov_base;
1053 	ent->payload = iov[1].iov_base;
1054 
1055 	atomic_inc(&ring->queue_refs);
1056 	return ent;
1057 }
1058 
1059 /*
1060  * Register header and payload buffer with the kernel and puts the
1061  * entry as "ready to get fuse requests" on the queue
1062  */
1063 static int fuse_uring_register(struct io_uring_cmd *cmd,
1064 			       unsigned int issue_flags, struct fuse_conn *fc)
1065 {
1066 	const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
1067 	struct fuse_ring *ring = smp_load_acquire(&fc->ring);
1068 	struct fuse_ring_queue *queue;
1069 	struct fuse_ring_ent *ent;
1070 	int err;
1071 	unsigned int qid = READ_ONCE(cmd_req->qid);
1072 
1073 	err = -ENOMEM;
1074 	if (!ring) {
1075 		ring = fuse_uring_create(fc);
1076 		if (!ring)
1077 			return err;
1078 	}
1079 
1080 	if (qid >= ring->nr_queues) {
1081 		pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid);
1082 		return -EINVAL;
1083 	}
1084 
1085 	queue = ring->queues[qid];
1086 	if (!queue) {
1087 		queue = fuse_uring_create_queue(ring, qid);
1088 		if (!queue)
1089 			return err;
1090 	}
1091 
1092 	/*
1093 	 * The created queue above does not need to be destructed in
1094 	 * case of entry errors below, will be done at ring destruction time.
1095 	 */
1096 
1097 	ent = fuse_uring_create_ring_ent(cmd, queue);
1098 	if (IS_ERR(ent))
1099 		return PTR_ERR(ent);
1100 
1101 	fuse_uring_do_register(ent, cmd, issue_flags);
1102 
1103 	return 0;
1104 }
1105 
1106 /*
1107  * Entry function from io_uring to handle the given passthrough command
1108  * (op code IORING_OP_URING_CMD)
1109  */
1110 int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
1111 {
1112 	struct fuse_dev *fud;
1113 	struct fuse_conn *fc;
1114 	u32 cmd_op = cmd->cmd_op;
1115 	int err;
1116 
1117 	if ((unlikely(issue_flags & IO_URING_F_CANCEL))) {
1118 		fuse_uring_cancel(cmd, issue_flags);
1119 		return 0;
1120 	}
1121 
1122 	/* This extra SQE size holds struct fuse_uring_cmd_req */
1123 	if (!(issue_flags & IO_URING_F_SQE128))
1124 		return -EINVAL;
1125 
1126 	fud = fuse_get_dev(cmd->file);
1127 	if (!fud) {
1128 		pr_info_ratelimited("No fuse device found\n");
1129 		return -ENOTCONN;
1130 	}
1131 	fc = fud->fc;
1132 
1133 	/* Once a connection has io-uring enabled on it, it can't be disabled */
1134 	if (!enable_uring && !fc->io_uring) {
1135 		pr_info_ratelimited("fuse-io-uring is disabled\n");
1136 		return -EOPNOTSUPP;
1137 	}
1138 
1139 	if (fc->aborted)
1140 		return -ECONNABORTED;
1141 	if (!fc->connected)
1142 		return -ENOTCONN;
1143 
1144 	/*
1145 	 * fuse_uring_register() needs the ring to be initialized,
1146 	 * we need to know the max payload size
1147 	 */
1148 	if (!fc->initialized)
1149 		return -EAGAIN;
1150 
1151 	switch (cmd_op) {
1152 	case FUSE_IO_URING_CMD_REGISTER:
1153 		err = fuse_uring_register(cmd, issue_flags, fc);
1154 		if (err) {
1155 			pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n",
1156 				     err);
1157 			fc->io_uring = 0;
1158 			wake_up_all(&fc->blocked_waitq);
1159 			return err;
1160 		}
1161 		break;
1162 	case FUSE_IO_URING_CMD_COMMIT_AND_FETCH:
1163 		err = fuse_uring_commit_fetch(cmd, issue_flags, fc);
1164 		if (err) {
1165 			pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n",
1166 				     err);
1167 			return err;
1168 		}
1169 		break;
1170 	default:
1171 		return -EINVAL;
1172 	}
1173 
1174 	return -EIOCBQUEUED;
1175 }
1176 
1177 static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
1178 			    ssize_t ret, unsigned int issue_flags)
1179 {
1180 	struct fuse_ring_queue *queue = ent->queue;
1181 
1182 	spin_lock(&queue->lock);
1183 	ent->state = FRRS_USERSPACE;
1184 	list_move(&ent->list, &queue->ent_in_userspace);
1185 	ent->cmd = NULL;
1186 	spin_unlock(&queue->lock);
1187 
1188 	io_uring_cmd_done(cmd, ret, 0, issue_flags);
1189 }
1190 
1191 /*
1192  * This prepares and sends the ring request in fuse-uring task context.
1193  * User buffers are not mapped yet - the application does not have permission
1194  * to write to it - this has to be executed in ring task context.
1195  */
1196 static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
1197 				    unsigned int issue_flags)
1198 {
1199 	struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
1200 	struct fuse_ring_queue *queue = ent->queue;
1201 	int err;
1202 
1203 	if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
1204 		err = fuse_uring_prepare_send(ent, ent->fuse_req);
1205 		if (err) {
1206 			fuse_uring_next_fuse_req(ent, queue, issue_flags);
1207 			return;
1208 		}
1209 	} else {
1210 		err = -ECANCELED;
1211 	}
1212 
1213 	fuse_uring_send(ent, cmd, err, issue_flags);
1214 }
1215 
1216 static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
1217 {
1218 	unsigned int qid;
1219 	struct fuse_ring_queue *queue;
1220 
1221 	qid = task_cpu(current);
1222 
1223 	if (WARN_ONCE(qid >= ring->nr_queues,
1224 		      "Core number (%u) exceeds nr queues (%zu)\n", qid,
1225 		      ring->nr_queues))
1226 		qid = 0;
1227 
1228 	queue = ring->queues[qid];
1229 	WARN_ONCE(!queue, "Missing queue for qid %d\n", qid);
1230 
1231 	return queue;
1232 }
1233 
1234 static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent)
1235 {
1236 	struct io_uring_cmd *cmd = ent->cmd;
1237 
1238 	uring_cmd_set_ring_ent(cmd, ent);
1239 	io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task);
1240 }
1241 
1242 /* queue a fuse request and send it if a ring entry is available */
1243 void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
1244 {
1245 	struct fuse_conn *fc = req->fm->fc;
1246 	struct fuse_ring *ring = fc->ring;
1247 	struct fuse_ring_queue *queue;
1248 	struct fuse_ring_ent *ent = NULL;
1249 	int err;
1250 
1251 	err = -EINVAL;
1252 	queue = fuse_uring_task_to_queue(ring);
1253 	if (!queue)
1254 		goto err;
1255 
1256 	if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
1257 		req->in.h.unique = fuse_get_unique(fiq);
1258 
1259 	spin_lock(&queue->lock);
1260 	err = -ENOTCONN;
1261 	if (unlikely(queue->stopped))
1262 		goto err_unlock;
1263 
1264 	set_bit(FR_URING, &req->flags);
1265 	req->ring_queue = queue;
1266 	ent = list_first_entry_or_null(&queue->ent_avail_queue,
1267 				       struct fuse_ring_ent, list);
1268 	if (ent)
1269 		fuse_uring_add_req_to_ring_ent(ent, req);
1270 	else
1271 		list_add_tail(&req->list, &queue->fuse_req_queue);
1272 	spin_unlock(&queue->lock);
1273 
1274 	if (ent)
1275 		fuse_uring_dispatch_ent(ent);
1276 
1277 	return;
1278 
1279 err_unlock:
1280 	spin_unlock(&queue->lock);
1281 err:
1282 	req->out.h.error = err;
1283 	clear_bit(FR_PENDING, &req->flags);
1284 	fuse_request_end(req);
1285 }
1286 
1287 bool fuse_uring_queue_bq_req(struct fuse_req *req)
1288 {
1289 	struct fuse_conn *fc = req->fm->fc;
1290 	struct fuse_ring *ring = fc->ring;
1291 	struct fuse_ring_queue *queue;
1292 	struct fuse_ring_ent *ent = NULL;
1293 
1294 	queue = fuse_uring_task_to_queue(ring);
1295 	if (!queue)
1296 		return false;
1297 
1298 	spin_lock(&queue->lock);
1299 	if (unlikely(queue->stopped)) {
1300 		spin_unlock(&queue->lock);
1301 		return false;
1302 	}
1303 
1304 	set_bit(FR_URING, &req->flags);
1305 	req->ring_queue = queue;
1306 	list_add_tail(&req->list, &queue->fuse_req_bg_queue);
1307 
1308 	ent = list_first_entry_or_null(&queue->ent_avail_queue,
1309 				       struct fuse_ring_ent, list);
1310 	spin_lock(&fc->bg_lock);
1311 	fc->num_background++;
1312 	if (fc->num_background == fc->max_background)
1313 		fc->blocked = 1;
1314 	fuse_uring_flush_bg(queue);
1315 	spin_unlock(&fc->bg_lock);
1316 
1317 	/*
1318 	 * Due to bg_queue flush limits there might be other bg requests
1319 	 * in the queue that need to be handled first. Or no further req
1320 	 * might be available.
1321 	 */
1322 	req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req,
1323 				       list);
1324 	if (ent && req) {
1325 		fuse_uring_add_req_to_ring_ent(ent, req);
1326 		spin_unlock(&queue->lock);
1327 
1328 		fuse_uring_dispatch_ent(ent);
1329 	} else {
1330 		spin_unlock(&queue->lock);
1331 	}
1332 
1333 	return true;
1334 }
1335 
1336 bool fuse_uring_remove_pending_req(struct fuse_req *req)
1337 {
1338 	struct fuse_ring_queue *queue = req->ring_queue;
1339 
1340 	return fuse_remove_pending_req(req, &queue->lock);
1341 }
1342 
1343 static const struct fuse_iqueue_ops fuse_io_uring_ops = {
1344 	/* should be send over io-uring as enhancement */
1345 	.send_forget = fuse_dev_queue_forget,
1346 
1347 	/*
1348 	 * could be send over io-uring, but interrupts should be rare,
1349 	 * no need to make the code complex
1350 	 */
1351 	.send_interrupt = fuse_dev_queue_interrupt,
1352 	.send_req = fuse_uring_queue_fuse_req,
1353 };
1354