1*ba6e0e5cSBreno Leitao /* SPDX-License-Identifier: MIT */
2*ba6e0e5cSBreno Leitao 
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
9*ba6e0e5cSBreno Leitao 
/*
 * Raw view of the submission queue ring as mapped from the kernel.
 * NOTE(review): not referenced by the helpers in this file — presumably
 * retained from the original liburing layout; confirm before removing.
 */
struct io_sq_ring {
	unsigned int *head;		/* consumer index */
	unsigned int *tail;		/* producer index */
	unsigned int *ring_mask;	/* entries - 1, for index wrapping */
	unsigned int *ring_entries;	/* number of entries in the ring */
	unsigned int *flags;		/* ring state flags */
	unsigned int *array;		/* indices into the sqe array */
};
18*ba6e0e5cSBreno Leitao 
/*
 * Raw view of the completion queue ring as mapped from the kernel.
 * NOTE(review): like io_sq_ring above, not referenced by the helpers
 * in this file — presumably retained from the liburing layout; confirm.
 */
struct io_cq_ring {
	unsigned int *head;		/* consumer index */
	unsigned int *tail;		/* producer index */
	unsigned int *ring_mask;	/* entries - 1, for index wrapping */
	unsigned int *ring_entries;	/* number of entries in the ring */
	struct io_uring_cqe *cqes;	/* completion entry array */
};
26*ba6e0e5cSBreno Leitao 
/*
 * Application-side state for the submission queue.  The k* members
 * point into the kernel-shared SQ ring mapping set up by
 * io_uring_mmap(); sqe_head/sqe_tail track sqes handed out locally
 * (io_uring_get_sqe()) but not yet published (io_uring_submit()).
 */
struct io_uring_sq {
	unsigned int *khead;		/* consumer index, read in io_uring_submit() */
	unsigned int *ktail;		/* producer index, advanced in io_uring_submit() */
	unsigned int *kring_mask;	/* ring size - 1, for index wrapping */
	unsigned int *kring_entries;	/* ring size */
	unsigned int *kflags;		/* ring flags (unused in this file) */
	unsigned int *kdropped;		/* dropped-sqe counter (unused in this file) */
	unsigned int *array;		/* sqe index array published to the kernel */
	struct io_uring_sqe *sqes;	/* separately mmap'ed sqe array */

	unsigned int sqe_head;		/* first locally queued, unsubmitted sqe */
	unsigned int sqe_tail;		/* next free local sqe slot */

	size_t ring_sz;			/* byte size of the SQ ring mapping */
};
42*ba6e0e5cSBreno Leitao 
/*
 * Application-side state for the completion queue.  The k* members
 * point into the kernel-shared CQ ring mapping set up by io_uring_mmap().
 */
struct io_uring_cq {
	unsigned int *khead;		/* consumer index, advanced in io_uring_cqe_seen() */
	unsigned int *ktail;		/* producer index, polled in io_uring_wait_cqe() */
	unsigned int *kring_mask;	/* ring size - 1, for index wrapping */
	unsigned int *kring_entries;	/* ring size */
	unsigned int *koverflow;	/* overflow counter (unused in this file) */
	struct io_uring_cqe *cqes;	/* cqe array inside the CQ ring mapping */

	size_t ring_sz;			/* byte size of the CQ ring mapping */
};
53*ba6e0e5cSBreno Leitao 
/* A fully set-up ring: SQ and CQ state plus the ring file descriptor. */
struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;	/* fd returned by io_uring_setup() */
};
59*ba6e0e5cSBreno Leitao 
/*
 * Memory barriers used to order ring index loads/stores against the
 * kernel's accesses to the shared rings.  On x86/x86-64 a compiler
 * barrier is used; elsewhere fall back to a full hardware barrier.
 *
 * Fix: the original tested __x86_64 (no trailing underscores), which
 * is not defined in strict ISO modes (e.g. -std=c11), so the x86 path
 * was silently skipped there.  __x86_64__ is the canonical macro;
 * keep __x86_64 too for older/odd toolchains.
 */
#if defined(__x86_64__) || defined(__x86_64) || defined(__i386__)
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#else
#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#endif
67*ba6e0e5cSBreno Leitao 
io_uring_mmap(int fd,struct io_uring_params * p,struct io_uring_sq * sq,struct io_uring_cq * cq)68*ba6e0e5cSBreno Leitao static inline int io_uring_mmap(int fd, struct io_uring_params *p,
69*ba6e0e5cSBreno Leitao 				struct io_uring_sq *sq, struct io_uring_cq *cq)
70*ba6e0e5cSBreno Leitao {
71*ba6e0e5cSBreno Leitao 	size_t size;
72*ba6e0e5cSBreno Leitao 	void *ptr;
73*ba6e0e5cSBreno Leitao 	int ret;
74*ba6e0e5cSBreno Leitao 
75*ba6e0e5cSBreno Leitao 	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned int);
76*ba6e0e5cSBreno Leitao 	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
77*ba6e0e5cSBreno Leitao 		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
78*ba6e0e5cSBreno Leitao 	if (ptr == MAP_FAILED)
79*ba6e0e5cSBreno Leitao 		return -errno;
80*ba6e0e5cSBreno Leitao 	sq->khead = ptr + p->sq_off.head;
81*ba6e0e5cSBreno Leitao 	sq->ktail = ptr + p->sq_off.tail;
82*ba6e0e5cSBreno Leitao 	sq->kring_mask = ptr + p->sq_off.ring_mask;
83*ba6e0e5cSBreno Leitao 	sq->kring_entries = ptr + p->sq_off.ring_entries;
84*ba6e0e5cSBreno Leitao 	sq->kflags = ptr + p->sq_off.flags;
85*ba6e0e5cSBreno Leitao 	sq->kdropped = ptr + p->sq_off.dropped;
86*ba6e0e5cSBreno Leitao 	sq->array = ptr + p->sq_off.array;
87*ba6e0e5cSBreno Leitao 
88*ba6e0e5cSBreno Leitao 	size = p->sq_entries * sizeof(struct io_uring_sqe);
89*ba6e0e5cSBreno Leitao 	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
90*ba6e0e5cSBreno Leitao 			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
91*ba6e0e5cSBreno Leitao 	if (sq->sqes == MAP_FAILED) {
92*ba6e0e5cSBreno Leitao 		ret = -errno;
93*ba6e0e5cSBreno Leitao err:
94*ba6e0e5cSBreno Leitao 		munmap(sq->khead, sq->ring_sz);
95*ba6e0e5cSBreno Leitao 		return ret;
96*ba6e0e5cSBreno Leitao 	}
97*ba6e0e5cSBreno Leitao 
98*ba6e0e5cSBreno Leitao 	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
99*ba6e0e5cSBreno Leitao 	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
100*ba6e0e5cSBreno Leitao 		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
101*ba6e0e5cSBreno Leitao 	if (ptr == MAP_FAILED) {
102*ba6e0e5cSBreno Leitao 		ret = -errno;
103*ba6e0e5cSBreno Leitao 		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
104*ba6e0e5cSBreno Leitao 		goto err;
105*ba6e0e5cSBreno Leitao 	}
106*ba6e0e5cSBreno Leitao 	cq->khead = ptr + p->cq_off.head;
107*ba6e0e5cSBreno Leitao 	cq->ktail = ptr + p->cq_off.tail;
108*ba6e0e5cSBreno Leitao 	cq->kring_mask = ptr + p->cq_off.ring_mask;
109*ba6e0e5cSBreno Leitao 	cq->kring_entries = ptr + p->cq_off.ring_entries;
110*ba6e0e5cSBreno Leitao 	cq->koverflow = ptr + p->cq_off.overflow;
111*ba6e0e5cSBreno Leitao 	cq->cqes = ptr + p->cq_off.cqes;
112*ba6e0e5cSBreno Leitao 	return 0;
113*ba6e0e5cSBreno Leitao }
114*ba6e0e5cSBreno Leitao 
/*
 * Create an io_uring with at least `entries` sq entries via the raw
 * io_uring_setup(2) syscall; the kernel fills *p with the ring layout.
 * Returns the ring fd, or the raw syscall failure result.
 */
static inline int io_uring_setup(unsigned int entries,
				 struct io_uring_params *p)
{
	long fd = syscall(__NR_io_uring_setup, entries, p);

	return (int)fd;
}
120*ba6e0e5cSBreno Leitao 
/*
 * Submit and/or wait on a ring via the raw io_uring_enter(2) syscall.
 * The trailing argument is the sigset size in bytes that the raw
 * syscall interface expects alongside `sig`.
 */
static inline int io_uring_enter(int fd, unsigned int to_submit,
				 unsigned int min_complete,
				 unsigned int flags, sigset_t *sig)
{
	long res = syscall(__NR_io_uring_enter, fd, to_submit,
			   min_complete, flags, sig, _NSIG / 8);

	return (int)res;
}
128*ba6e0e5cSBreno Leitao 
io_uring_queue_init(unsigned int entries,struct io_uring * ring,unsigned int flags)129*ba6e0e5cSBreno Leitao static inline int io_uring_queue_init(unsigned int entries,
130*ba6e0e5cSBreno Leitao 				      struct io_uring *ring,
131*ba6e0e5cSBreno Leitao 				      unsigned int flags)
132*ba6e0e5cSBreno Leitao {
133*ba6e0e5cSBreno Leitao 	struct io_uring_params p;
134*ba6e0e5cSBreno Leitao 	int fd, ret;
135*ba6e0e5cSBreno Leitao 
136*ba6e0e5cSBreno Leitao 	memset(ring, 0, sizeof(*ring));
137*ba6e0e5cSBreno Leitao 	memset(&p, 0, sizeof(p));
138*ba6e0e5cSBreno Leitao 	p.flags = flags;
139*ba6e0e5cSBreno Leitao 
140*ba6e0e5cSBreno Leitao 	fd = io_uring_setup(entries, &p);
141*ba6e0e5cSBreno Leitao 	if (fd < 0)
142*ba6e0e5cSBreno Leitao 		return fd;
143*ba6e0e5cSBreno Leitao 	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
144*ba6e0e5cSBreno Leitao 	if (!ret)
145*ba6e0e5cSBreno Leitao 		ring->ring_fd = fd;
146*ba6e0e5cSBreno Leitao 	else
147*ba6e0e5cSBreno Leitao 		close(fd);
148*ba6e0e5cSBreno Leitao 	return ret;
149*ba6e0e5cSBreno Leitao }
150*ba6e0e5cSBreno Leitao 
151*ba6e0e5cSBreno Leitao /* Get a sqe */
io_uring_get_sqe(struct io_uring * ring)152*ba6e0e5cSBreno Leitao static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
153*ba6e0e5cSBreno Leitao {
154*ba6e0e5cSBreno Leitao 	struct io_uring_sq *sq = &ring->sq;
155*ba6e0e5cSBreno Leitao 
156*ba6e0e5cSBreno Leitao 	if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
157*ba6e0e5cSBreno Leitao 		return NULL;
158*ba6e0e5cSBreno Leitao 	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
159*ba6e0e5cSBreno Leitao }
160*ba6e0e5cSBreno Leitao 
/*
 * Wait for one completion: poll the CQ ring, entering the kernel with
 * IORING_ENTER_GETEVENTS whenever the ring is empty.  On success,
 * *cqe_ptr points at the head cqe; the entry is NOT consumed — the
 * caller must follow up with io_uring_cqe_seen().  Returns 0 on
 * success or -errno if io_uring_enter() fails.
 */
static inline int io_uring_wait_cqe(struct io_uring *ring,
				    struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned int mask = *cq->kring_mask;
	unsigned int head = *cq->khead;
	int ret;

	*cqe_ptr = NULL;
	do {
		/* Order the ktail load before reading the cqe contents. */
		read_barrier();
		if (head != *cq->ktail) {
			/* Ring non-empty: hand back the entry at head. */
			*cqe_ptr = &cq->cqes[head & mask];
			break;
		}
		/* Ring empty: block until at least one completion. */
		ret = io_uring_enter(ring->ring_fd, 0, 1,
				     IORING_ENTER_GETEVENTS, NULL);
		if (ret < 0)
			return -errno;
	} while (1);

	return 0;
}
184*ba6e0e5cSBreno Leitao 
/*
 * Publish locally queued sqes (from io_uring_get_sqe()) to the shared
 * SQ ring and submit them via io_uring_enter().  Returns the kernel's
 * submitted count, 0 when nothing was pending, or -errno on failure.
 */
static inline int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned int mask = *sq->kring_mask;
	unsigned int ktail, submitted, to_submit;
	int ret;

	read_barrier();
	if (*sq->khead != *sq->ktail) {
		/*
		 * The ring already holds entries the kernel has not
		 * consumed; just ask it to pick them up.
		 * NOTE(review): passes the full ring size rather than the
		 * actual pending count — presumably acceptable for this
		 * test helper; confirm against io_uring_enter() semantics.
		 */
		submitted = *sq->kring_entries;
		goto submit;
	}
	if (sq->sqe_head == sq->sqe_tail)
		return 0;

	/* Copy the locally queued sqe indices into the shared array. */
	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	if (*sq->ktail != ktail) {
		/* Order the array stores before publishing the new tail. */
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
			     IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}
219*ba6e0e5cSBreno Leitao 
io_uring_queue_exit(struct io_uring * ring)220*ba6e0e5cSBreno Leitao static inline void io_uring_queue_exit(struct io_uring *ring)
221*ba6e0e5cSBreno Leitao {
222*ba6e0e5cSBreno Leitao 	struct io_uring_sq *sq = &ring->sq;
223*ba6e0e5cSBreno Leitao 
224*ba6e0e5cSBreno Leitao 	munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
225*ba6e0e5cSBreno Leitao 	munmap(sq->khead, sq->ring_sz);
226*ba6e0e5cSBreno Leitao 	close(ring->ring_fd);
227*ba6e0e5cSBreno Leitao }
228*ba6e0e5cSBreno Leitao 
229*ba6e0e5cSBreno Leitao /* Prepare and send the SQE */
io_uring_prep_cmd(struct io_uring_sqe * sqe,int op,int sockfd,int level,int optname,const void * optval,int optlen)230*ba6e0e5cSBreno Leitao static inline void io_uring_prep_cmd(struct io_uring_sqe *sqe, int op,
231*ba6e0e5cSBreno Leitao 				     int sockfd,
232*ba6e0e5cSBreno Leitao 				     int level, int optname,
233*ba6e0e5cSBreno Leitao 				     const void *optval,
234*ba6e0e5cSBreno Leitao 				     int optlen)
235*ba6e0e5cSBreno Leitao {
236*ba6e0e5cSBreno Leitao 	memset(sqe, 0, sizeof(*sqe));
237*ba6e0e5cSBreno Leitao 	sqe->opcode = (__u8)IORING_OP_URING_CMD;
238*ba6e0e5cSBreno Leitao 	sqe->fd = sockfd;
239*ba6e0e5cSBreno Leitao 	sqe->cmd_op = op;
240*ba6e0e5cSBreno Leitao 
241*ba6e0e5cSBreno Leitao 	sqe->level = level;
242*ba6e0e5cSBreno Leitao 	sqe->optname = optname;
243*ba6e0e5cSBreno Leitao 	sqe->optval = (unsigned long long)optval;
244*ba6e0e5cSBreno Leitao 	sqe->optlen = optlen;
245*ba6e0e5cSBreno Leitao }
246*ba6e0e5cSBreno Leitao 
io_uring_register_buffers(struct io_uring * ring,const struct iovec * iovecs,unsigned int nr_iovecs)247*ba6e0e5cSBreno Leitao static inline int io_uring_register_buffers(struct io_uring *ring,
248*ba6e0e5cSBreno Leitao 					    const struct iovec *iovecs,
249*ba6e0e5cSBreno Leitao 					    unsigned int nr_iovecs)
250*ba6e0e5cSBreno Leitao {
251*ba6e0e5cSBreno Leitao 	int ret;
252*ba6e0e5cSBreno Leitao 
253*ba6e0e5cSBreno Leitao 	ret = syscall(__NR_io_uring_register, ring->ring_fd,
254*ba6e0e5cSBreno Leitao 		      IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
255*ba6e0e5cSBreno Leitao 	return (ret < 0) ? -errno : ret;
256*ba6e0e5cSBreno Leitao }
257*ba6e0e5cSBreno Leitao 
/*
 * Prepare a plain IORING_OP_SEND of len bytes at buf on sockfd.
 * All other sqe fields are cleared.
 */
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	memset(sqe, 0, sizeof(*sqe));

	sqe->fd = sockfd;
	sqe->opcode = (__u8)IORING_OP_SEND;
	sqe->msg_flags = (__u32)flags;
	sqe->addr = (unsigned long)buf;
	sqe->len = len;
}
268*ba6e0e5cSBreno Leitao 
io_uring_prep_sendzc(struct io_uring_sqe * sqe,int sockfd,const void * buf,size_t len,int flags,unsigned int zc_flags)269*ba6e0e5cSBreno Leitao static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
270*ba6e0e5cSBreno Leitao 					const void *buf, size_t len, int flags,
271*ba6e0e5cSBreno Leitao 					unsigned int zc_flags)
272*ba6e0e5cSBreno Leitao {
273*ba6e0e5cSBreno Leitao 	io_uring_prep_send(sqe, sockfd, buf, len, flags);
274*ba6e0e5cSBreno Leitao 	sqe->opcode = (__u8)IORING_OP_SEND_ZC;
275*ba6e0e5cSBreno Leitao 	sqe->ioprio = zc_flags;
276*ba6e0e5cSBreno Leitao }
277*ba6e0e5cSBreno Leitao 
io_uring_cqe_seen(struct io_uring * ring)278*ba6e0e5cSBreno Leitao static inline void io_uring_cqe_seen(struct io_uring *ring)
279*ba6e0e5cSBreno Leitao {
280*ba6e0e5cSBreno Leitao 	*(&ring->cq)->khead += 1;
281*ba6e0e5cSBreno Leitao 	write_barrier();
282*ba6e0e5cSBreno Leitao }
283