1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21 
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37 
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41 
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
43 #define NB_OTHER_INTR               1
44 
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46 
47 /**
48  * union for pipe fds.
49  */
50 union intr_pipefds{
51 	struct {
52 		int pipefd[2];
53 	};
54 	struct {
55 		int readfd;
56 		int writefd;
57 	};
58 };
59 
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64 	int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66 	uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68 	uint64_t timerfd_num;            /* for timerfd */
69 	char charbuf[16];                /* for others */
70 };
71 
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74 
75 struct rte_intr_callback {
76 	TAILQ_ENTRY(rte_intr_callback) next;
77 	rte_intr_callback_fn cb_fn;  /**< callback address */
78 	void *cb_arg;                /**< parameter for callback */
79 	uint8_t pending_delete;      /**< delete after callback is called */
80 	rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82 
83 struct rte_intr_source {
84 	TAILQ_ENTRY(rte_intr_source) next;
85 	struct rte_intr_handle intr_handle; /**< interrupt handle */
86 	struct rte_intr_cb_list callbacks;  /**< user callbacks */
87 	uint32_t active;
88 };
89 
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92 
93 /* pipe fds used to wake up the interrupt handling thread */
94 static union intr_pipefds intr_pipe;
95 
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98 
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101 
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104 
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108 			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
109 
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113 	struct vfio_irq_set *irq_set;
114 	char irq_set_buf[IRQ_SET_BUF_LEN];
115 	int len, ret;
116 	int *fd_ptr;
117 
118 	len = sizeof(irq_set_buf);
119 
120 	/* enable INTx */
121 	irq_set = (struct vfio_irq_set *) irq_set_buf;
122 	irq_set->argsz = len;
123 	irq_set->count = 1;
124 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126 	irq_set->start = 0;
127 	fd_ptr = (int *) &irq_set->data;
128 	*fd_ptr = intr_handle->fd;
129 
130 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
131 
132 	if (ret) {
133 		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
134 						intr_handle->fd);
135 		return -1;
136 	}
137 
138 	/* unmask INTx after enabling */
139 	memset(irq_set, 0, len);
140 	len = sizeof(struct vfio_irq_set);
141 	irq_set->argsz = len;
142 	irq_set->count = 1;
143 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
144 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
145 	irq_set->start = 0;
146 
147 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
148 
149 	if (ret) {
150 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
151 						intr_handle->fd);
152 		return -1;
153 	}
154 	return 0;
155 }
156 
157 /* disable legacy (INTx) interrupts */
158 static int
159 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
160 	struct vfio_irq_set *irq_set;
161 	char irq_set_buf[IRQ_SET_BUF_LEN];
162 	int len, ret;
163 
164 	len = sizeof(struct vfio_irq_set);
165 
166 	/* mask interrupts before disabling */
167 	irq_set = (struct vfio_irq_set *) irq_set_buf;
168 	irq_set->argsz = len;
169 	irq_set->count = 1;
170 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
171 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
172 	irq_set->start = 0;
173 
174 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
175 
176 	if (ret) {
177 		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
178 						intr_handle->fd);
179 		return -1;
180 	}
181 
182 	/* disable INTx */
183 	memset(irq_set, 0, len);
184 	irq_set->argsz = len;
185 	irq_set->count = 0;
186 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
187 	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
188 	irq_set->start = 0;
189 
190 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
191 
192 	if (ret) {
193 		RTE_LOG(ERR, EAL,
194 			"Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
195 		return -1;
196 	}
197 	return 0;
198 }
199 
200 /* unmask/ack legacy (INTx) interrupts */
201 static int
202 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
203 {
204 	struct vfio_irq_set irq_set;
205 
206 	/* unmask INTx */
207 	memset(&irq_set, 0, sizeof(irq_set));
208 	irq_set.argsz = sizeof(irq_set);
209 	irq_set.count = 1;
210 	irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
211 	irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
212 	irq_set.start = 0;
213 
214 	if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
215 		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
216 			intr_handle->fd);
217 		return -1;
218 	}
219 	return 0;
220 }
221 
222 /* enable MSI interrupts */
223 static int
224 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
225 	int len, ret;
226 	char irq_set_buf[IRQ_SET_BUF_LEN];
227 	struct vfio_irq_set *irq_set;
228 	int *fd_ptr;
229 
230 	len = sizeof(irq_set_buf);
231 
232 	irq_set = (struct vfio_irq_set *) irq_set_buf;
233 	irq_set->argsz = len;
234 	irq_set->count = 1;
235 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
236 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
237 	irq_set->start = 0;
238 	fd_ptr = (int *) &irq_set->data;
239 	*fd_ptr = intr_handle->fd;
240 
241 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
242 
243 	if (ret) {
244 		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
245 						intr_handle->fd);
246 		return -1;
247 	}
248 	return 0;
249 }
250 
251 /* disable MSI interrupts */
252 static int
253 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
254 	struct vfio_irq_set *irq_set;
255 	char irq_set_buf[IRQ_SET_BUF_LEN];
256 	int len, ret;
257 
258 	len = sizeof(struct vfio_irq_set);
259 
260 	irq_set = (struct vfio_irq_set *) irq_set_buf;
261 	irq_set->argsz = len;
262 	irq_set->count = 0;
263 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
264 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
265 	irq_set->start = 0;
266 
267 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
268 
269 	if (ret)
270 		RTE_LOG(ERR, EAL,
271 			"Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
272 
273 	return ret;
274 }
275 
276 /* enable MSI-X interrupts */
277 static int
278 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
279 	int len, ret;
280 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
281 	struct vfio_irq_set *irq_set;
282 	int *fd_ptr;
283 
284 	len = sizeof(irq_set_buf);
285 
286 	irq_set = (struct vfio_irq_set *) irq_set_buf;
287 	irq_set->argsz = len;
288 	/* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
289 	irq_set->count = intr_handle->max_intr ?
290 		(intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
291 		RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
292 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
293 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
294 	irq_set->start = 0;
295 	fd_ptr = (int *) &irq_set->data;
296 	/* INTR vector offset 0 is reserved for non-efd mapping */
297 	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
298 	memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
299 		sizeof(*intr_handle->efds) * intr_handle->nb_efd);
300 
301 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
302 
303 	if (ret) {
304 		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
305 						intr_handle->fd);
306 		return -1;
307 	}
308 
309 	return 0;
310 }
311 
312 /* disable MSI-X interrupts */
313 static int
314 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
315 	struct vfio_irq_set *irq_set;
316 	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
317 	int len, ret;
318 
319 	len = sizeof(struct vfio_irq_set);
320 
321 	irq_set = (struct vfio_irq_set *) irq_set_buf;
322 	irq_set->argsz = len;
323 	irq_set->count = 0;
324 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
325 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
326 	irq_set->start = 0;
327 
328 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
329 
330 	if (ret)
331 		RTE_LOG(ERR, EAL,
332 			"Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
333 
334 	return ret;
335 }
336 
337 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
338 /* enable req notifier */
339 static int
340 vfio_enable_req(const struct rte_intr_handle *intr_handle)
341 {
342 	int len, ret;
343 	char irq_set_buf[IRQ_SET_BUF_LEN];
344 	struct vfio_irq_set *irq_set;
345 	int *fd_ptr;
346 
347 	len = sizeof(irq_set_buf);
348 
349 	irq_set = (struct vfio_irq_set *) irq_set_buf;
350 	irq_set->argsz = len;
351 	irq_set->count = 1;
352 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
353 			 VFIO_IRQ_SET_ACTION_TRIGGER;
354 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
355 	irq_set->start = 0;
356 	fd_ptr = (int *) &irq_set->data;
357 	*fd_ptr = intr_handle->fd;
358 
359 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
360 
361 	if (ret) {
362 		RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
363 						intr_handle->fd);
364 		return -1;
365 	}
366 
367 	return 0;
368 }
369 
370 /* disable req notifier */
371 static int
372 vfio_disable_req(const struct rte_intr_handle *intr_handle)
373 {
374 	struct vfio_irq_set *irq_set;
375 	char irq_set_buf[IRQ_SET_BUF_LEN];
376 	int len, ret;
377 
378 	len = sizeof(struct vfio_irq_set);
379 
380 	irq_set = (struct vfio_irq_set *) irq_set_buf;
381 	irq_set->argsz = len;
382 	irq_set->count = 0;
383 	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
384 	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
385 	irq_set->start = 0;
386 
387 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
388 
389 	if (ret)
390 		RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
391 			intr_handle->fd);
392 
393 	return ret;
394 }
395 #endif
396 #endif
397 
398 static int
399 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
400 {
401 	unsigned char command_high;
402 
403 	/* use UIO config file descriptor for uio_pci_generic */
404 	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
405 		RTE_LOG(ERR, EAL,
406 			"Error reading interrupts status for fd %d\n",
407 			intr_handle->uio_cfg_fd);
408 		return -1;
409 	}
410 	/* disable interrupts */
411 	command_high |= 0x4;
412 	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
413 		RTE_LOG(ERR, EAL,
414 			"Error disabling interrupts for fd %d\n",
415 			intr_handle->uio_cfg_fd);
416 		return -1;
417 	}
418 
419 	return 0;
420 }
421 
422 static int
423 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
424 {
425 	unsigned char command_high;
426 
427 	/* use UIO config file descriptor for uio_pci_generic */
428 	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
429 		RTE_LOG(ERR, EAL,
430 			"Error reading interrupts status for fd %d\n",
431 			intr_handle->uio_cfg_fd);
432 		return -1;
433 	}
434 	/* enable interrupts */
435 	command_high &= ~0x4;
436 	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
437 		RTE_LOG(ERR, EAL,
438 			"Error enabling interrupts for fd %d\n",
439 			intr_handle->uio_cfg_fd);
440 		return -1;
441 	}
442 
443 	return 0;
444 }
445 
446 static int
447 uio_intr_disable(const struct rte_intr_handle *intr_handle)
448 {
449 	const int value = 0;
450 
451 	if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
452 		RTE_LOG(ERR, EAL,
453 			"Error disabling interrupts for fd %d (%s)\n",
454 			intr_handle->fd, strerror(errno));
455 		return -1;
456 	}
457 	return 0;
458 }
459 
460 static int
461 uio_intr_enable(const struct rte_intr_handle *intr_handle)
462 {
463 	const int value = 1;
464 
465 	if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
466 		RTE_LOG(ERR, EAL,
467 			"Error enabling interrupts for fd %d (%s)\n",
468 			intr_handle->fd, strerror(errno));
469 		return -1;
470 	}
471 	return 0;
472 }
473 
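/**
 * Register a callback for an interrupt source. The source entry for the
 * fd is created on first registration, and the interrupt thread is woken
 * up through the pipe so that it rebuilds its epoll wait list.
 * Returns 0 on success or a negative errno value on failure.
 */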
474 int
475 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
476 			rte_intr_callback_fn cb, void *cb_arg)
477 {
478 	int ret, wake_thread;
479 	struct rte_intr_source *src;
480 	struct rte_intr_callback *callback;
481 
482 	wake_thread = 0;
483 
484 	/* first do parameter checking */
485 	if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
486 		RTE_LOG(ERR, EAL,
487 			"Registering with invalid input parameter\n");
488 		return -EINVAL;
489 	}
490 
491 	/* allocate a new interrupt callback entity */
492 	callback = calloc(1, sizeof(*callback));
493 	if (callback == NULL) {
494 		RTE_LOG(ERR, EAL, "Can not allocate memory\n");
495 		return -ENOMEM;
496 	}
497 	callback->cb_fn = cb;
498 	callback->cb_arg = cb_arg;
499 	callback->pending_delete = 0;
500 	callback->ucb_fn = NULL;
501 
502 	rte_spinlock_lock(&intr_lock);
503 
504 	/* check if there is at least one callback registered for the fd */
505 	TAILQ_FOREACH(src, &intr_sources, next) {
506 		if (src->intr_handle.fd == intr_handle->fd) {
507 			/* we had no interrupts for this */
508 			if (TAILQ_EMPTY(&src->callbacks))
509 				wake_thread = 1;
510 
511 			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
512 			ret = 0;
513 			break;
514 		}
515 	}
516 
517 	/* no existing callbacks for this - add new source */
518 	if (src == NULL) {
519 		src = calloc(1, sizeof(*src));
520 		if (src == NULL) {
521 			RTE_LOG(ERR, EAL, "Can not allocate memory\n");
522 			free(callback);
523 			ret = -ENOMEM;
524 		} else {
525 			src->intr_handle = *intr_handle;
526 			TAILQ_INIT(&src->callbacks);
527 			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
528 			TAILQ_INSERT_TAIL(&intr_sources, src, next);
529 			wake_thread = 1;
530 			ret = 0;
531 		}
532 	}
533 
534 	rte_spinlock_unlock(&intr_lock);
535 
536 	/**
537 	 * check whether we need to notify the pipe fd waited on by
538 	 * epoll_wait so that it rebuilds the wait list.
539 	 */
540 	if (wake_thread)
541 		if (write(intr_pipe.writefd, "1", 1) < 0)
542 			ret = -EPIPE;
543 
544 	rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
545 	return ret;
546 }
547 
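/**
 * Mark matching callbacks of a currently active interrupt source for
 * deferred removal; the interrupt thread frees them once they have
 * finished executing. A cb_arg of (void *)-1 matches any argument.
 * Returns the number of callbacks marked, or a negative errno value.
 */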
548 int
549 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
550 				rte_intr_callback_fn cb_fn, void *cb_arg,
551 				rte_intr_unregister_callback_fn ucb_fn)
552 {
553 	int ret;
554 	struct rte_intr_source *src;
555 	struct rte_intr_callback *cb, *next;
556 
557 	/* do parameter checking first */
558 	if (intr_handle == NULL || intr_handle->fd < 0) {
559 		RTE_LOG(ERR, EAL,
560 		"Unregistering with invalid input parameter\n");
561 		return -EINVAL;
562 	}
563 
564 	rte_spinlock_lock(&intr_lock);
565 
566 	/* check if the interrupt source for the fd exists */
567 	TAILQ_FOREACH(src, &intr_sources, next)
568 		if (src->intr_handle.fd == intr_handle->fd)
569 			break;
570 
571 	/* No interrupt source registered for the fd */
572 	if (src == NULL) {
573 		ret = -ENOENT;
574 
575 	/* only usable if the source is active */
576 	} else if (src->active == 0) {
577 		ret = -EAGAIN;
578 
579 	} else {
580 		ret = 0;
581 
582 		/* walk through the callbacks and mark all that match. */
583 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
584 			next = TAILQ_NEXT(cb, next);
585 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
586 					cb->cb_arg == cb_arg)) {
587 				cb->pending_delete = 1;
588 				cb->ucb_fn = ucb_fn;
589 				ret++;
590 			}
591 		}
592 	}
593 
594 	rte_spinlock_unlock(&intr_lock);
595 
596 	return ret;
597 }
598 
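/**
 * Remove matching callbacks of an interrupt source immediately. Fails
 * with -EAGAIN while the source is servicing callbacks (use the
 * _pending variant from within a callback in that case).
 * Returns the number of callbacks removed, or a negative errno value.
 */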
599 int
600 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
601 			rte_intr_callback_fn cb_fn, void *cb_arg)
602 {
603 	int ret;
604 	struct rte_intr_source *src;
605 	struct rte_intr_callback *cb, *next;
606 
607 	/* do parameter checking first */
608 	if (intr_handle == NULL || intr_handle->fd < 0) {
609 		RTE_LOG(ERR, EAL,
610 		"Unregistering with invalid input parameter\n");
611 		return -EINVAL;
612 	}
613 
614 	rte_spinlock_lock(&intr_lock);
615 
616 	/* check if the interrupt source for the fd exists */
617 	TAILQ_FOREACH(src, &intr_sources, next)
618 		if (src->intr_handle.fd == intr_handle->fd)
619 			break;
620 
621 	/* No interrupt source registered for the fd */
622 	if (src == NULL) {
623 		ret = -ENOENT;
624 
625 	/* interrupt source has some active callbacks right now. */
626 	} else if (src->active != 0) {
627 		ret = -EAGAIN;
628 
629 	/* ok to remove. */
630 	} else {
631 		ret = 0;
632 
633 		/* walk through the callbacks and remove all that match. */
634 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
635 
636 			next = TAILQ_NEXT(cb, next);
637 
638 			if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
639 					cb->cb_arg == cb_arg)) {
640 				TAILQ_REMOVE(&src->callbacks, cb, next);
641 				free(cb);
642 				ret++;
643 			}
644 		}
645 
646 		/* all callbacks for that source are removed. */
647 		if (TAILQ_EMPTY(&src->callbacks)) {
648 			TAILQ_REMOVE(&intr_sources, src, next);
649 			free(src);
650 		}
651 	}
652 
653 	rte_spinlock_unlock(&intr_lock);
654 
655 	/* notify the pipe fd waited by epoll_wait to rebuild the wait list */
656 	if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
657 		ret = -EPIPE;
658 	}
659 
660 	rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
661 		ret);
662 	return ret;
663 }
664 
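/**
 * Enable the interrupt for the given handle, dispatching on the handle
 * type (UIO, VFIO INTx/MSI/MSI-X, ...). VDEV handles need no action here.
 * Returns 0 on success, -1 on failure.
 */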
665 int
666 rte_intr_enable(const struct rte_intr_handle *intr_handle)
667 {
668 	int rc = 0;
669 
670 	if (intr_handle == NULL)
671 		return -1;
672 
673 	if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
674 		rc = 0;
675 		goto out;
676 	}
677 
678 	if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
679 		rc = -1;
680 		goto out;
681 	}
682 
683 	switch (intr_handle->type){
684 	/* write to the uio fd to enable the interrupt */
685 	case RTE_INTR_HANDLE_UIO:
686 		if (uio_intr_enable(intr_handle))
687 			rc = -1;
688 		break;
689 	case RTE_INTR_HANDLE_UIO_INTX:
690 		if (uio_intx_intr_enable(intr_handle))
691 			rc = -1;
692 		break;
693 	/* not used at this moment */
694 	case RTE_INTR_HANDLE_ALARM:
695 		rc = -1;
696 		break;
697 #ifdef VFIO_PRESENT
698 	case RTE_INTR_HANDLE_VFIO_MSIX:
699 		if (vfio_enable_msix(intr_handle))
700 			rc = -1;
701 		break;
702 	case RTE_INTR_HANDLE_VFIO_MSI:
703 		if (vfio_enable_msi(intr_handle))
704 			rc = -1;
705 		break;
706 	case RTE_INTR_HANDLE_VFIO_LEGACY:
707 		if (vfio_enable_intx(intr_handle))
708 			rc = -1;
709 		break;
710 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
711 	case RTE_INTR_HANDLE_VFIO_REQ:
712 		if (vfio_enable_req(intr_handle))
713 			rc = -1;
714 		break;
715 #endif
716 #endif
717 	/* not used at this moment */
718 	case RTE_INTR_HANDLE_DEV_EVENT:
719 		rc = -1;
720 		break;
721 	/* unknown handle type */
722 	default:
723 		RTE_LOG(ERR, EAL,
724 			"Unknown handle type of fd %d\n",
725 					intr_handle->fd);
726 		rc = -1;
727 		break;
728 	}
729 out:
730 	rte_eal_trace_intr_enable(intr_handle, rc);
731 	return rc;
732 }
733 
734 /**
735  * PMD generally calls this function at the end of its IRQ callback.
736  * Internally, it unmasks the interrupt if possible.
737  *
738  * For INTx, unmasking is required as the interrupt is auto-masked prior to
739  * invoking callback.
740  *
741  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
742  * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
743  * this function is a no-op.
744  */
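/*
 * Illustrative usage sketch (hypothetical driver and helper names): a PMD
 * services the device in its interrupt callback and then acks the
 * interrupt so that a further INTx interrupt can be delivered.
 *
 *	static void
 *	mydrv_intr_handler(void *cb_arg)
 *	{
 *		struct rte_eth_dev *dev = cb_arg;
 *
 *		mydrv_handle_link_event(dev);
 *		rte_intr_ack(dev->intr_handle);
 *	}
 */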
745 int
746 rte_intr_ack(const struct rte_intr_handle *intr_handle)
747 {
748 	if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
749 		return 0;
750 
751 	if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
752 		return -1;
753 
754 	switch (intr_handle->type) {
755 	/* Both acking and enabling are the same for UIO */
756 	case RTE_INTR_HANDLE_UIO:
757 		if (uio_intr_enable(intr_handle))
758 			return -1;
759 		break;
760 	case RTE_INTR_HANDLE_UIO_INTX:
761 		if (uio_intx_intr_enable(intr_handle))
762 			return -1;
763 		break;
764 	/* not used at this moment */
765 	case RTE_INTR_HANDLE_ALARM:
766 		return -1;
767 #ifdef VFIO_PRESENT
768 	/* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
769 	case RTE_INTR_HANDLE_VFIO_MSIX:
770 	case RTE_INTR_HANDLE_VFIO_MSI:
771 		return 0;
772 	case RTE_INTR_HANDLE_VFIO_LEGACY:
773 		if (vfio_ack_intx(intr_handle))
774 			return -1;
775 		break;
776 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
777 	case RTE_INTR_HANDLE_VFIO_REQ:
778 		return -1;
779 #endif
780 #endif
781 	/* not used at this moment */
782 	case RTE_INTR_HANDLE_DEV_EVENT:
783 		return -1;
784 	/* unknown handle type */
785 	default:
786 		RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
787 			intr_handle->fd);
788 		return -1;
789 	}
790 
791 	return 0;
792 }
793 
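/**
 * Disable the interrupt for the given handle, dispatching on the handle
 * type. VDEV handles need no action here.
 * Returns 0 on success, -1 on failure.
 */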
794 int
795 rte_intr_disable(const struct rte_intr_handle *intr_handle)
796 {
797 	int rc = 0;
798 
799 	if (intr_handle == NULL)
800 		return -1;
801 
802 	if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
803 		rc = 0;
804 		goto out;
805 	}
806 
807 	if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
808 		rc = -1;
809 		goto out;
810 	}
811 
812 	switch (intr_handle->type){
813 	/* write to the uio fd to disable the interrupt */
814 	case RTE_INTR_HANDLE_UIO:
815 		if (uio_intr_disable(intr_handle))
816 			rc = -1;
817 		break;
818 	case RTE_INTR_HANDLE_UIO_INTX:
819 		if (uio_intx_intr_disable(intr_handle))
820 			rc = -1;
821 		break;
822 	/* not used at this moment */
823 	case RTE_INTR_HANDLE_ALARM:
824 		rc = -1;
825 		break;
826 #ifdef VFIO_PRESENT
827 	case RTE_INTR_HANDLE_VFIO_MSIX:
828 		if (vfio_disable_msix(intr_handle))
829 			rc = -1;
830 		break;
831 	case RTE_INTR_HANDLE_VFIO_MSI:
832 		if (vfio_disable_msi(intr_handle))
833 			rc = -1;
834 		break;
835 	case RTE_INTR_HANDLE_VFIO_LEGACY:
836 		if (vfio_disable_intx(intr_handle))
837 			rc = -1;
838 		break;
839 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
840 	case RTE_INTR_HANDLE_VFIO_REQ:
841 		if (vfio_disable_req(intr_handle))
842 			rc = -1;
843 		break;
844 #endif
845 #endif
846 	/* not used at this moment */
847 	case RTE_INTR_HANDLE_DEV_EVENT:
848 		rc = -1;
849 		break;
850 	/* unknown handle type */
851 	default:
852 		RTE_LOG(ERR, EAL,
853 			"Unknown handle type of fd %d\n",
854 					intr_handle->fd);
855 		rc = -1;
856 		break;
857 	}
858 out:
859 	rte_eal_trace_intr_disable(intr_handle, rc);
860 	return rc;
861 }
862 
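/**
 * Process one batch of events returned by epoll_wait in the interrupt
 * thread: drain each ready fd and invoke its registered callbacks.
 * Returns -1 when the epoll wait list must be rebuilt (pipe notification
 * or a dead fd), 0 otherwise.
 */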
863 static int
864 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
865 {
866 	bool call = false;
867 	int n, bytes_read, rv;
868 	struct rte_intr_source *src;
869 	struct rte_intr_callback *cb, *next;
870 	union rte_intr_read_buffer buf;
871 	struct rte_intr_callback active_cb;
872 
873 	for (n = 0; n < nfds; n++) {
874 
875 		/**
876 		 * if the pipe fd is ready to read, return so that the
877 		 * wait list can be rebuilt.
878 		 */
879 		if (events[n].data.fd == intr_pipe.readfd){
880 			int r = read(intr_pipe.readfd, buf.charbuf,
881 					sizeof(buf.charbuf));
882 			RTE_SET_USED(r);
883 			return -1;
884 		}
885 		rte_spinlock_lock(&intr_lock);
886 		TAILQ_FOREACH(src, &intr_sources, next)
887 			if (src->intr_handle.fd ==
888 					events[n].data.fd)
889 				break;
890 		if (src == NULL){
891 			rte_spinlock_unlock(&intr_lock);
892 			continue;
893 		}
894 
895 		/* mark this interrupt source as active and release the lock. */
896 		src->active = 1;
897 		rte_spinlock_unlock(&intr_lock);
898 
899 		/* set the length to be read for different handle types */
900 		switch (src->intr_handle.type) {
901 		case RTE_INTR_HANDLE_UIO:
902 		case RTE_INTR_HANDLE_UIO_INTX:
903 			bytes_read = sizeof(buf.uio_intr_count);
904 			break;
905 		case RTE_INTR_HANDLE_ALARM:
906 			bytes_read = sizeof(buf.timerfd_num);
907 			break;
908 #ifdef VFIO_PRESENT
909 		case RTE_INTR_HANDLE_VFIO_MSIX:
910 		case RTE_INTR_HANDLE_VFIO_MSI:
911 		case RTE_INTR_HANDLE_VFIO_LEGACY:
912 			bytes_read = sizeof(buf.vfio_intr_count);
913 			break;
914 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
915 		case RTE_INTR_HANDLE_VFIO_REQ:
916 			bytes_read = 0;
917 			call = true;
918 			break;
919 #endif
920 #endif
921 		case RTE_INTR_HANDLE_VDEV:
922 		case RTE_INTR_HANDLE_EXT:
923 			bytes_read = 0;
924 			call = true;
925 			break;
926 		case RTE_INTR_HANDLE_DEV_EVENT:
927 			bytes_read = 0;
928 			call = true;
929 			break;
930 		default:
931 			bytes_read = 1;
932 			break;
933 		}
934 
935 		if (bytes_read > 0) {
936 			/**
937 			 * read out to clear the ready-to-be-read flag
938 			 * for epoll_wait.
939 			 */
940 			bytes_read = read(events[n].data.fd, &buf, bytes_read);
941 			if (bytes_read < 0) {
942 				if (errno == EINTR || errno == EWOULDBLOCK)
943 					continue;
944 
945 				RTE_LOG(ERR, EAL, "Error reading from file "
946 					"descriptor %d: %s\n",
947 					events[n].data.fd,
948 					strerror(errno));
949 				/*
950 				 * The device is unplugged or buggy, remove
951 				 * it as an interrupt source and return to
952 				 * force the wait list to be rebuilt.
953 				 */
954 				rte_spinlock_lock(&intr_lock);
955 				TAILQ_REMOVE(&intr_sources, src, next);
956 				rte_spinlock_unlock(&intr_lock);
957 
958 				for (cb = TAILQ_FIRST(&src->callbacks); cb;
959 							cb = next) {
960 					next = TAILQ_NEXT(cb, next);
961 					TAILQ_REMOVE(&src->callbacks, cb, next);
962 					free(cb);
963 				}
964 				free(src);
965 				return -1;
966 			} else if (bytes_read == 0)
967 				RTE_LOG(ERR, EAL, "Read nothing from file "
968 					"descriptor %d\n", events[n].data.fd);
969 			else
970 				call = true;
971 		}
972 
973 		/* grab the lock again to call callbacks and update status. */
974 		rte_spinlock_lock(&intr_lock);
975 
976 		if (call) {
977 
978 			/* Finally, call all callbacks. */
979 			TAILQ_FOREACH(cb, &src->callbacks, next) {
980 
981 				/* make a copy and unlock. */
982 				active_cb = *cb;
983 				rte_spinlock_unlock(&intr_lock);
984 
985 				/* call the actual callback */
986 				active_cb.cb_fn(active_cb.cb_arg);
987 
988 				/* get the lock back. */
989 				rte_spinlock_lock(&intr_lock);
990 			}
991 		}
992 		/* we are done with this interrupt source, release it. */
993 		src->active = 0;
994 
995 		rv = 0;
996 
997 		/* check if any callbacks are supposed to be removed */
998 		for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
999 			next = TAILQ_NEXT(cb, next);
1000 			if (cb->pending_delete) {
1001 				TAILQ_REMOVE(&src->callbacks, cb, next);
1002 				if (cb->ucb_fn)
1003 					cb->ucb_fn(&src->intr_handle, cb->cb_arg);
1004 				free(cb);
1005 				rv++;
1006 			}
1007 		}
1008 
1009 		/* all callbacks for that source are removed. */
1010 		if (TAILQ_EMPTY(&src->callbacks)) {
1011 			TAILQ_REMOVE(&intr_sources, src, next);
1012 			free(src);
1013 		}
1014 
1015 		/* notify the pipe fd waited by epoll_wait to rebuild the wait list */
1016 		if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1017 			rte_spinlock_unlock(&intr_lock);
1018 			return -EPIPE;
1019 		}
1020 
1021 		rte_spinlock_unlock(&intr_lock);
1022 	}
1023 
1024 	return 0;
1025 }
1026 
1027 /**
1028  * It handles all the interrupts.
1029  *
1030  * @param pfd
1031  *  epoll file descriptor.
1032  * @param totalfds
1033  *  The number of file descriptors added in epoll.
1034  *
1035  * @return
1036  *  void
1037  */
1038 static void
1039 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1040 {
1041 	struct epoll_event events[totalfds];
1042 	int nfds = 0;
1043 
1044 	for(;;) {
1045 		nfds = epoll_wait(pfd, events, totalfds,
1046 			EAL_INTR_EPOLL_WAIT_FOREVER);
1047 		/* epoll_wait fail */
1048 		if (nfds < 0) {
1049 			if (errno == EINTR)
1050 				continue;
1051 			RTE_LOG(ERR, EAL,
1052 				"epoll_wait returns with fail\n");
1053 			return;
1054 		}
1055 		/* epoll_wait timeout, will never happen here */
1056 		else if (nfds == 0)
1057 			continue;
1058 		/* epoll_wait has at least one fd ready to read */
1059 		if (eal_intr_process_interrupts(events, nfds) < 0)
1060 			return;
1061 	}
1062 }
1063 
1064 /**
1065  * It builds/rebuilds up the epoll file descriptor with all the
1066  * file descriptors being waited on. Then handles the interrupts.
1067  *
1068  * @param arg
1069  *  pointer. (unused)
1070  *
1071  * @return
1072  *  never returns
1073  */
1074 static __rte_noreturn void *
1075 eal_intr_thread_main(__rte_unused void *arg)
1076 {
1077 	/* host thread, never break out */
1078 	for (;;) {
1079 		/* build up the epoll fd with all descriptors we are to
1080 		 * wait on then pass it to the handle_interrupts function
1081 		 */
1082 		static struct epoll_event pipe_event = {
1083 			.events = EPOLLIN | EPOLLPRI,
1084 		};
1085 		struct rte_intr_source *src;
1086 		unsigned numfds = 0;
1087 
1088 		/* create epoll fd */
1089 		int pfd = epoll_create(1);
1090 		if (pfd < 0)
1091 			rte_panic("Cannot create epoll instance\n");
1092 
1093 		pipe_event.data.fd = intr_pipe.readfd;
1094 		/**
1095 		 * add pipe fd into wait list, this pipe is used to
1096 		 * rebuild the wait list.
1097 		 */
1098 		if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1099 						&pipe_event) < 0) {
1100 			rte_panic("Error adding fd to %d epoll_ctl, %s\n",
1101 					intr_pipe.readfd, strerror(errno));
1102 		}
1103 		numfds++;
1104 
1105 		rte_spinlock_lock(&intr_lock);
1106 
1107 		TAILQ_FOREACH(src, &intr_sources, next) {
1108 			struct epoll_event ev;
1109 
1110 			if (src->callbacks.tqh_first == NULL)
1111 				continue; /* skip those with no callbacks */
1112 			memset(&ev, 0, sizeof(ev));
1113 			ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1114 			ev.data.fd = src->intr_handle.fd;
1115 
1116 			/**
1117 			 * add the file descriptor of each interrupt source
1118 			 * into the wait list.
1119 			 */
1120 			if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1121 					src->intr_handle.fd, &ev) < 0){
1122 				rte_panic("Error adding fd %d epoll_ctl, %s\n",
1123 					src->intr_handle.fd, strerror(errno));
1124 			}
1125 			else
1126 				numfds++;
1127 		}
1128 		rte_spinlock_unlock(&intr_lock);
1129 		/* serve the interrupt */
1130 		eal_intr_handle_interrupts(pfd, numfds);
1131 
1132 		/**
1133 		 * when we return, we need to rebuild the
1134 		 * list of fds to monitor.
1135 		 */
1136 		close(pfd);
1137 	}
1138 }
1139 
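/**
 * Initialize the interrupt handling subsystem: set up the interrupt source
 * list, create the notification pipe and spawn the "eal-intr-thread"
 * control thread.
 */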
1140 int
1141 rte_eal_intr_init(void)
1142 {
1143 	int ret = 0;
1144 
1145 	/* init the global interrupt source head */
1146 	TAILQ_INIT(&intr_sources);
1147 
1148 	/**
1149 	 * create a pipe that will be waited on by epoll and written to
1150 	 * when the epoll wait list needs to be rebuilt.
1151 	 */
1152 	if (pipe(intr_pipe.pipefd) < 0) {
1153 		rte_errno = errno;
1154 		return -1;
1155 	}
1156 
1157 	/* create the host thread to wait/handle the interrupt */
1158 	ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1159 			eal_intr_thread_main, NULL);
1160 	if (ret != 0) {
1161 		rte_errno = -ret;
1162 		RTE_LOG(ERR, EAL,
1163 			"Failed to create thread for interrupt handling\n");
1164 	}
1165 
1166 	return ret;
1167 }
1168 
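/*
 * Drain the fd backing an Rx/Tx interrupt vector so that epoll does not
 * report it again; used as the rte_epoll event callback registered by
 * rte_intr_rx_ctl().
 */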
1169 static void
1170 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1171 {
1172 	union rte_intr_read_buffer buf;
1173 	int bytes_read = 0;
1174 	int nbytes;
1175 
1176 	switch (intr_handle->type) {
1177 	case RTE_INTR_HANDLE_UIO:
1178 	case RTE_INTR_HANDLE_UIO_INTX:
1179 		bytes_read = sizeof(buf.uio_intr_count);
1180 		break;
1181 #ifdef VFIO_PRESENT
1182 	case RTE_INTR_HANDLE_VFIO_MSIX:
1183 	case RTE_INTR_HANDLE_VFIO_MSI:
1184 	case RTE_INTR_HANDLE_VFIO_LEGACY:
1185 		bytes_read = sizeof(buf.vfio_intr_count);
1186 		break;
1187 #endif
1188 	case RTE_INTR_HANDLE_VDEV:
1189 		bytes_read = intr_handle->efd_counter_size;
1190 		/* For vdev, number of bytes to read is set by driver */
1191 		break;
1192 	case RTE_INTR_HANDLE_EXT:
1193 		return;
1194 	default:
1195 		bytes_read = 1;
1196 		RTE_LOG(INFO, EAL, "unexpected intr type\n");
1197 		break;
1198 	}
1199 
1200 	/**
1201 	 * read out to clear the ready-to-be-read flag
1202 	 * for epoll_wait.
1203 	 */
1204 	if (bytes_read == 0)
1205 		return;
1206 	do {
1207 		nbytes = read(fd, &buf, bytes_read);
1208 		if (nbytes < 0) {
1209 			if (errno == EINTR || errno == EWOULDBLOCK ||
1210 			    errno == EAGAIN)
1211 				continue;
1212 			RTE_LOG(ERR, EAL,
1213 				"Error reading from fd %d: %s\n",
1214 				fd, strerror(errno));
1215 		} else if (nbytes == 0)
1216 			RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1217 		return;
1218 	} while (1);
1219 }
1220 
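/*
 * Convert raw epoll events into rte_epoll_events and run their callbacks.
 * The compare-and-swap on the event status (VALID -> EXEC) acts as a lock
 * against a concurrent rte_epoll_ctl(DEL)/free of the event data.
 */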
1221 static int
1222 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1223 			struct rte_epoll_event *events)
1224 {
1225 	unsigned int i, count = 0;
1226 	struct rte_epoll_event *rev;
1227 	uint32_t valid_status;
1228 
1229 	for (i = 0; i < n; i++) {
1230 		rev = evs[i].data.ptr;
1231 		valid_status =  RTE_EPOLL_VALID;
1232 		/* ACQUIRE memory ordering here pairs with RELEASE
1233 		 * ordering below acting as a lock to synchronize
1234 		 * the event data updating.
1235 		 */
1236 		if (!rev || !__atomic_compare_exchange_n(&rev->status,
1237 				    &valid_status, RTE_EPOLL_EXEC, 0,
1238 				    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1239 			continue;
1240 
1241 		events[count].status        = RTE_EPOLL_VALID;
1242 		events[count].fd            = rev->fd;
1243 		events[count].epfd          = rev->epfd;
1244 		events[count].epdata.event  = rev->epdata.event;
1245 		events[count].epdata.data   = rev->epdata.data;
1246 		if (rev->epdata.cb_fun)
1247 			rev->epdata.cb_fun(rev->fd,
1248 					   rev->epdata.cb_arg);
1249 
1250 		/* the status update should be observed after
1251 		 * the other fields change.
1252 		 */
1253 		__atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1254 				__ATOMIC_RELEASE);
1255 		count++;
1256 	}
1257 	return count;
1258 }
1259 
1260 static inline int
1261 eal_init_tls_epfd(void)
1262 {
1263 	int pfd = epoll_create(255);
1264 
1265 	if (pfd < 0) {
1266 		RTE_LOG(ERR, EAL,
1267 			"Cannot create epoll instance\n");
1268 		return -1;
1269 	}
1270 	return pfd;
1271 }
1272 
1273 int
1274 rte_intr_tls_epfd(void)
1275 {
1276 	if (RTE_PER_LCORE(_epfd) == -1)
1277 		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1278 
1279 	return RTE_PER_LCORE(_epfd);
1280 }
1281 
1282 static int
1283 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1284 	       int maxevents, int timeout, bool interruptible)
1285 {
1286 	struct epoll_event evs[maxevents];
1287 	int rc;
1288 
1289 	if (!events) {
1290 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1291 		return -1;
1292 	}
1293 
1294 	/* using per thread epoll fd */
1295 	if (epfd == RTE_EPOLL_PER_THREAD)
1296 		epfd = rte_intr_tls_epfd();
1297 
1298 	while (1) {
1299 		rc = epoll_wait(epfd, evs, maxevents, timeout);
1300 		if (likely(rc > 0)) {
1301 			/* epoll_wait has at least one fd ready to read */
1302 			rc = eal_epoll_process_event(evs, rc, events);
1303 			break;
1304 		} else if (rc < 0) {
1305 			if (errno == EINTR) {
1306 				if (interruptible)
1307 					return -1;
1308 				else
1309 					continue;
1310 			}
1311 			/* epoll_wait fail */
1312 			RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1313 				strerror(errno));
1314 			rc = -1;
1315 			break;
1316 		} else {
1317 			/* rc == 0, epoll_wait timed out */
1318 			break;
1319 		}
1320 	}
1321 
1322 	return rc;
1323 }
1324 
1325 int
1326 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1327 	       int maxevents, int timeout)
1328 {
1329 	return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1330 }
1331 
1332 int
1333 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1334 			     int maxevents, int timeout)
1335 {
1336 	return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1337 }
1338 
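/*
 * Atomically invalidate an rte_epoll_event, spinning until any callback
 * currently executing on it has finished, then clear its data.
 */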
1339 static inline void
1340 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1341 {
1342 	uint32_t valid_status = RTE_EPOLL_VALID;
1343 
1344 	while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1345 		    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1346 		while (__atomic_load_n(&ev->status,
1347 				__ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1348 			rte_pause();
1349 		valid_status = RTE_EPOLL_VALID;
1350 	}
1351 	memset(&ev->epdata, 0, sizeof(ev->epdata));
1352 	ev->fd = -1;
1353 	ev->epfd = -1;
1354 }
1355 
1356 int
1357 rte_epoll_ctl(int epfd, int op, int fd,
1358 	      struct rte_epoll_event *event)
1359 {
1360 	struct epoll_event ev;
1361 
1362 	if (!event) {
1363 		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1364 		return -1;
1365 	}
1366 
1367 	/* using per thread epoll fd */
1368 	if (epfd == RTE_EPOLL_PER_THREAD)
1369 		epfd = rte_intr_tls_epfd();
1370 
1371 	if (op == EPOLL_CTL_ADD) {
1372 		__atomic_store_n(&event->status, RTE_EPOLL_VALID,
1373 				__ATOMIC_RELAXED);
1374 		event->fd = fd;  /* ignore fd in event */
1375 		event->epfd = epfd;
1376 		ev.data.ptr = (void *)event;
1377 	}
1378 
1379 	ev.events = event->epdata.event;
1380 	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1381 		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1382 			op, fd, strerror(errno));
1383 		if (op == EPOLL_CTL_ADD)
1384 			/* rollback status when CTL_ADD fail */
1385 			__atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1386 					__ATOMIC_RELAXED);
1387 		return -1;
1388 	}
1389 
1390 	if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1391 			__ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1392 		eal_epoll_data_safe_free(event);
1393 
1394 	return 0;
1395 }
1396 
1397 int
1398 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1399 		int op, unsigned int vec, void *data)
1400 {
1401 	struct rte_epoll_event *rev;
1402 	struct rte_epoll_data *epdata;
1403 	int epfd_op;
1404 	unsigned int efd_idx;
1405 	int rc = 0;
1406 
1407 	efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1408 		(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1409 
1410 	if (!intr_handle || intr_handle->nb_efd == 0 ||
1411 	    efd_idx >= intr_handle->nb_efd) {
1412 		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1413 		return -EPERM;
1414 	}
1415 
1416 	switch (op) {
1417 	case RTE_INTR_EVENT_ADD:
1418 		epfd_op = EPOLL_CTL_ADD;
1419 		rev = &intr_handle->elist[efd_idx];
1420 		if (__atomic_load_n(&rev->status,
1421 				__ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1422 			RTE_LOG(INFO, EAL, "Event already been added.\n");
1423 			return -EEXIST;
1424 		}
1425 
1426 		/* attach to intr vector fd */
1427 		epdata = &rev->epdata;
1428 		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1429 		epdata->data   = data;
1430 		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1431 		epdata->cb_arg = (void *)intr_handle;
1432 		rc = rte_epoll_ctl(epfd, epfd_op,
1433 				   intr_handle->efds[efd_idx], rev);
1434 		if (!rc)
1435 			RTE_LOG(DEBUG, EAL,
1436 				"efd %d associated with vec %d added on epfd %d"
1437 				"\n", rev->fd, vec, epfd);
1438 		else
1439 			rc = -EPERM;
1440 		break;
1441 	case RTE_INTR_EVENT_DEL:
1442 		epfd_op = EPOLL_CTL_DEL;
1443 		rev = &intr_handle->elist[efd_idx];
1444 		if (__atomic_load_n(&rev->status,
1445 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1446 			RTE_LOG(INFO, EAL, "Event does not exist.\n");
1447 			return -EPERM;
1448 		}
1449 
1450 		rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1451 		if (rc)
1452 			rc = -EPERM;
1453 		break;
1454 	default:
1455 		RTE_LOG(ERR, EAL, "event op type mismatch\n");
1456 		rc = -EPERM;
1457 	}
1458 
1459 	return rc;
1460 }
1461 
1462 void
1463 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1464 {
1465 	uint32_t i;
1466 	struct rte_epoll_event *rev;
1467 
1468 	for (i = 0; i < intr_handle->nb_efd; i++) {
1469 		rev = &intr_handle->elist[i];
1470 		if (__atomic_load_n(&rev->status,
1471 				__ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1472 			continue;
1473 		if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1474 			/* force free if the entry valid */
1475 			eal_epoll_data_safe_free(rev);
1476 		}
1477 	}
1478 }
1479 
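/*
 * Set up the event fds used for Rx/Tx interrupt vectors: one eventfd per
 * vector for VFIO MSI-X, only a counter-size check for VDEV (the driver
 * creates the fds itself), and the plain interrupt fd otherwise.
 */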
1480 int
1481 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1482 {
1483 	uint32_t i;
1484 	int fd;
1485 	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1486 
1487 	assert(nb_efd != 0);
1488 
1489 	if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
1490 		for (i = 0; i < n; i++) {
1491 			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1492 			if (fd < 0) {
1493 				RTE_LOG(ERR, EAL,
1494 					"can't setup eventfd, error %i (%s)\n",
1495 					errno, strerror(errno));
1496 				return -errno;
1497 			}
1498 			intr_handle->efds[i] = fd;
1499 		}
1500 		intr_handle->nb_efd   = n;
1501 		intr_handle->max_intr = NB_OTHER_INTR + n;
1502 	} else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
1503 		/* only check; initialization is done in the vdev driver. */
1504 		if (intr_handle->efd_counter_size >
1505 		    sizeof(union rte_intr_read_buffer)) {
1506 			RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1507 			return -EINVAL;
1508 		}
1509 	} else {
1510 		intr_handle->efds[0]  = intr_handle->fd;
1511 		intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
1512 		intr_handle->max_intr = NB_OTHER_INTR;
1513 	}
1514 
1515 	return 0;
1516 }
1517 
1518 void
1519 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1520 {
1521 	uint32_t i;
1522 
1523 	rte_intr_free_epoll_fd(intr_handle);
1524 	if (intr_handle->max_intr > intr_handle->nb_efd) {
1525 		for (i = 0; i < intr_handle->nb_efd; i++)
1526 			close(intr_handle->efds[i]);
1527 	}
1528 	intr_handle->nb_efd = 0;
1529 	intr_handle->max_intr = 0;
1530 }
1531 
1532 int
1533 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1534 {
1535 	return !(!intr_handle->nb_efd);
1536 }
1537 
1538 int
1539 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1540 {
1541 	if (!rte_intr_dp_is_en(intr_handle))
1542 		return 1;
1543 	else
1544 		return !!(intr_handle->max_intr - intr_handle->nb_efd);
1545 }
1546 
1547 int
1548 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1549 {
1550 	if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
1551 		return 1;
1552 
1553 	if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
1554 		return 1;
1555 
1556 	return 0;
1557 }
1558 
1559 int rte_thread_is_intr(void)
1560 {
1561 	return pthread_equal(intr_thread, pthread_self());
1562 }
1563