xref: /freebsd-14.2/sys/dev/netmap/netmap.c (revision f0ea3689)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this (see also the sketch after this comment):
39  * 1. a process/thread issues one or more open() calls on /dev/netmap, to create
40  *    select()able file descriptors on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   protecting multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94  the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When a port is configured or deleted, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
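/*
 * Illustrative userspace sketch of steps 1-6 above (error handling omitted).
 * This is only an example, not part of the module: the structures and
 * helpers (struct nmreq, NIOCREGIF, NIOCTXSYNC, NETMAP_IF, NETMAP_TXRING,
 * nm_ring_next) are assumed to come from <net/netmap.h> and
 * <net/netmap_user.h> as documented in netmap(4).
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <poll.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <net/netmap.h>
 *	#include <net/netmap_user.h>
 *
 *	static void
 *	tx_one_frame(const char *ifname)
 *	{
 *		struct nmreq req;
 *		int fd = open("/dev/netmap", O_RDWR);		// step 1
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nr_version = NETMAP_API;
 *		strncpy(req.nr_name, ifname, sizeof(req.nr_name) - 1);
 *		ioctl(fd, NIOCREGIF, &req);			// step 2: bind fd to ifname
 *		void *mem = mmap(NULL, req.nr_memsize,		// step 3: map the shared region
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *		struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *		struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *		poll(&pfd, 1, -1);				// step 6: wait for tx space
 *		// step 4: a frame is assumed to be already in
 *		// NETMAP_BUF(ring, ring->slot[ring->cur].buf_idx)
 *		ring->slot[ring->cur].len = 60;
 *		ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *		ioctl(fd, NIOCTXSYNC, NULL);			// step 5: push it to the NIC
 *	}
 */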
127 /*
128  * OS-specific code that is used only within this file.
129  * Other OS-specific code that must be accessed by drivers
130  * is present in netmap_kern.h
131  */
132 
133 #if defined(__FreeBSD__)
134 #include <sys/cdefs.h> /* prerequisite */
135 #include <sys/types.h>
136 #include <sys/errno.h>
137 #include <sys/param.h>	/* defines used in kernel.h */
138 #include <sys/kernel.h>	/* types used in module initialization */
139 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
140 #include <sys/sockio.h>
141 #include <sys/socketvar.h>	/* struct socket */
142 #include <sys/malloc.h>
143 #include <sys/poll.h>
144 #include <sys/rwlock.h>
145 #include <sys/socket.h> /* sockaddrs */
146 #include <sys/selinfo.h>
147 #include <sys/sysctl.h>
148 #include <sys/jail.h>
149 #include <net/vnet.h>
150 #include <net/if.h>
151 #include <net/if_var.h>
152 #include <net/bpf.h>		/* BIOCIMMEDIATE */
153 #include <machine/bus.h>	/* bus_dmamap_* */
154 #include <sys/endian.h>
155 #include <sys/refcount.h>
156 
157 
158 /* reduce conditional code */
159 // linux API, used for the knlist in FreeBSD
160 #define init_waitqueue_head(x)	knlist_init_mtx(&(x)->si_note, NULL)
161 
162 void freebsd_selwakeup(struct selinfo *si, int pri);
163 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
164 
165 #elif defined(linux)
166 
167 #include "bsd_glue.h"
168 
169 
170 
171 #elif defined(__APPLE__)
172 
173 #warning OSX support is only partial
174 #include "osx_glue.h"
175 
176 #else
177 
178 #error	Unsupported platform
179 
180 #endif /* unsupported */
181 
182 /*
183  * common headers
184  */
185 #include <net/netmap.h>
186 #include <dev/netmap/netmap_kern.h>
187 #include <dev/netmap/netmap_mem2.h>
188 
189 
190 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
191 
192 /*
193  * The following variables are used by the drivers and replicate
194  * fields in the global memory pool. They only refer to buffers
195  * used by physical interfaces.
196  */
197 u_int netmap_total_buffers;
198 u_int netmap_buf_size;
199 char *netmap_buffer_base;	/* also address of an invalid buffer */
200 
201 /* user-controlled variables */
202 int netmap_verbose;
203 
204 static int netmap_no_timestamp; /* don't timestamp on rxsync */
205 
206 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
207 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
208     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
209 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
210     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
211 int netmap_mitigate = 1;
212 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
213 int netmap_no_pendintr = 1;
214 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
215     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
216 int netmap_txsync_retry = 2;
217 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
218     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
219 
220 int netmap_flags = 0;	/* debug flags */
221 int netmap_fwd = 0;	/* force transparent mode */
222 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
223 
224 /*
225  * netmap_admode selects the netmap mode to use.
226  * Invalid values are reset to NETMAP_ADMODE_BEST
227  */
228 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
229 	NETMAP_ADMODE_NATIVE,	/* either native or none */
230 	NETMAP_ADMODE_GENERIC,	/* force generic */
231 	NETMAP_ADMODE_LAST };
232 static int netmap_admode = NETMAP_ADMODE_BEST;
233 
234 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
235 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
236 int netmap_generic_rings = 1;   /* number of queues in generic. */
237 
238 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
239 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
240 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
241 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
242 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
243 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
244 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
245 
246 NMG_LOCK_T	netmap_global_lock;
247 
248 
249 static void
250 nm_kr_get(struct netmap_kring *kr)
251 {
252 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
253 		tsleep(kr, 0, "NM_KR_GET", 4);
254 }
255 
256 
257 /*
258  * mark the ring as stopped, and run through the locks
259  * to make sure other users get to see it.
260  */
261 void
262 netmap_disable_ring(struct netmap_kring *kr)
263 {
264 	kr->nkr_stopped = 1;
265 	nm_kr_get(kr);
266 	mtx_lock(&kr->q_lock);
267 	mtx_unlock(&kr->q_lock);
268 	nm_kr_put(kr);
269 }
270 
271 
272 static void
273 netmap_set_all_rings(struct ifnet *ifp, int stopped)
274 {
275 	struct netmap_adapter *na;
276 	int i;
277 	u_int ntx, nrx;
278 
279 	if (!(ifp->if_capenable & IFCAP_NETMAP))
280 		return;
281 
282 	na = NA(ifp);
283 
284 	ntx = netmap_real_tx_rings(na);
285 	nrx = netmap_real_rx_rings(na);
286 
287 	for (i = 0; i < ntx; i++) {
288 		if (stopped)
289 			netmap_disable_ring(na->tx_rings + i);
290 		else
291 			na->tx_rings[i].nkr_stopped = 0;
292 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
293 	}
294 
295 	for (i = 0; i < nrx; i++) {
296 		if (stopped)
297 			netmap_disable_ring(na->rx_rings + i);
298 		else
299 			na->rx_rings[i].nkr_stopped = 0;
300 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
301 	}
302 }
303 
304 
305 void
306 netmap_disable_all_rings(struct ifnet *ifp)
307 {
308 	netmap_set_all_rings(ifp, 1 /* stopped */);
309 }
310 
311 
312 void
313 netmap_enable_all_rings(struct ifnet *ifp)
314 {
315 	netmap_set_all_rings(ifp, 0 /* enabled */);
316 }
317 
318 
319 /*
320  * generic bounds-checking function
321  */
322 u_int
323 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
324 {
325 	u_int oldv = *v;
326 	const char *op = NULL;
327 
328 	if (dflt < lo)
329 		dflt = lo;
330 	if (dflt > hi)
331 		dflt = hi;
332 	if (oldv < lo) {
333 		*v = dflt;
334 		op = "Bump";
335 	} else if (oldv > hi) {
336 		*v = hi;
337 		op = "Clamp";
338 	}
339 	if (op && msg)
340 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
341 	return *v;
342 }
343 
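/*
 * Typical (hypothetical) use of nm_bound_var(): clamp a user-settable
 * tunable before using it; the bounds below are made up for illustration.
 *
 *	u_int ringsize = netmap_generic_ringsize;
 *
 *	nm_bound_var(&ringsize, 1024, 64, 16384, "generic_ringsize");
 *
 * Out-of-range values are bumped to the default or clamped to the upper
 * bound, and the adjustment is logged when a message string is given.
 */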
344 
345 /*
346  * packet-dump function, user-supplied or static buffer.
347  * The destination buffer must be at least 30+4*len
348  */
349 const char *
350 nm_dump_buf(char *p, int len, int lim, char *dst)
351 {
352 	static char _dst[8192];
353 	int i, j, i0;
354 	static char hex[] ="0123456789abcdef";
355 	char *o;	/* output position */
356 
357 #define P_HI(x)	hex[((x) & 0xf0)>>4]
358 #define P_LO(x)	hex[((x) & 0xf)]
359 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
360 	if (!dst)
361 		dst = _dst;
362 	if (lim <= 0 || lim > len)
363 		lim = len;
364 	o = dst;
365 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
366 	o += strlen(o);
367 	/* hexdump routine */
368 	for (i = 0; i < lim; ) {
369 		sprintf(o, "%5d: ", i);
370 		o += strlen(o);
371 		memset(o, ' ', 48);
372 		i0 = i;
373 		for (j=0; j < 16 && i < lim; i++, j++) {
374 			o[j*3] = P_HI(p[i]);
375 			o[j*3+1] = P_LO(p[i]);
376 		}
377 		i = i0;
378 		for (j=0; j < 16 && i < lim; i++, j++)
379 			o[j + 48] = P_C(p[i]);
380 		o[j+48] = '\n';
381 		o += j+49;
382 	}
383 	*o = '\0';
384 #undef P_HI
385 #undef P_LO
386 #undef P_C
387 	return dst;
388 }
389 
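/*
 * Illustrative use of nm_dump_buf() (the same pattern appears in
 * netmap_rxsync_from_host() below), with buf/len being a packet buffer and
 * its length: dump at most the first 128 bytes into the static scratch
 * area and log it.
 *
 *	D("%s", nm_dump_buf(buf, len, 128, NULL));
 */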
390 
391 /*
392  * Fetch configuration from the device, to cope with dynamic
393  * reconfigurations after loading the module.
394  */
395 int
396 netmap_update_config(struct netmap_adapter *na)
397 {
398 	struct ifnet *ifp = na->ifp;
399 	u_int txr, txd, rxr, rxd;
400 
401 	txr = txd = rxr = rxd = 0;
402 	if (na->nm_config) {
403 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
404 	} else {
405 		/* take whatever we had at init time */
406 		txr = na->num_tx_rings;
407 		txd = na->num_tx_desc;
408 		rxr = na->num_rx_rings;
409 		rxd = na->num_rx_desc;
410 	}
411 
412 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
413 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
414 		return 0; /* nothing changed */
415 	if (netmap_verbose || na->active_fds > 0) {
416 		D("stored config %s: txring %d x %d, rxring %d x %d",
417 			NM_IFPNAME(ifp),
418 			na->num_tx_rings, na->num_tx_desc,
419 			na->num_rx_rings, na->num_rx_desc);
420 		D("new config %s: txring %d x %d, rxring %d x %d",
421 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
422 	}
423 	if (na->active_fds == 0) {
424 		D("configuration changed (but fine)");
425 		na->num_tx_rings = txr;
426 		na->num_tx_desc = txd;
427 		na->num_rx_rings = rxr;
428 		na->num_rx_desc = rxd;
429 		return 0;
430 	}
431 	D("configuration changed while active, this is bad...");
432 	return 1;
433 }
434 
435 static int
436 netmap_txsync_compat(struct netmap_kring *kring, int flags)
437 {
438 	struct netmap_adapter *na = kring->na;
439 	return na->nm_txsync(na, kring->ring_id, flags);
440 }
441 
442 static int
443 netmap_rxsync_compat(struct netmap_kring *kring, int flags)
444 {
445 	struct netmap_adapter *na = kring->na;
446 	return na->nm_rxsync(na, kring->ring_id, flags);
447 }
448 
449 static int
450 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
451 {
452 	(void)flags;
453 	netmap_txsync_to_host(kring->na);
454 	return 0;
455 }
456 
457 static int
458 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
459 {
460 	(void)flags;
461 	netmap_rxsync_from_host(kring->na, NULL, NULL);
462 	return 0;
463 }
464 
465 
466 
467 /* create the krings array and initialize the fields common to all adapters.
468  * The array layout is this:
469  *
470  *                    +----------+
471  * na->tx_rings ----->|          | \
472  *                    |          |  } na->num_tx_rings
473  *                    |          | /
474  *                    +----------+
475  *                    |          |    host tx kring
476  * na->rx_rings ----> +----------+
477  *                    |          | \
478  *                    |          |  } na->num_rx_rings
479  *                    |          | /
480  *                    +----------+
481  *                    |          |    host rx kring
482  *                    +----------+
483  * na->tailroom ----->|          | \
484  *                    |          |  } tailroom bytes
485  *                    |          | /
486  *                    +----------+
487  *
488  * Note: for compatibility, host krings are created even when not needed.
489  * The tailroom space is currently used by vale ports for allocating leases.
490  */
491 int
492 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
493 {
494 	u_int i, len, ndesc;
495 	struct netmap_kring *kring;
496 	u_int ntx, nrx;
497 
498 	/* account for the (possibly fake) host rings */
499 	ntx = na->num_tx_rings + 1;
500 	nrx = na->num_rx_rings + 1;
501 
502 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
503 
504 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
505 	if (na->tx_rings == NULL) {
506 		D("Cannot allocate krings");
507 		return ENOMEM;
508 	}
509 	na->rx_rings = na->tx_rings + ntx;
510 
511 	/*
512 	 * All fields in krings are 0 except the ones initialized below,
513 	 * but it is better to be explicit on important kring fields.
514 	 */
515 	ndesc = na->num_tx_desc;
516 	for (i = 0; i < ntx; i++) { /* Transmit rings */
517 		kring = &na->tx_rings[i];
518 		bzero(kring, sizeof(*kring));
519 		kring->na = na;
520 		kring->ring_id = i;
521 		kring->nkr_num_slots = ndesc;
522 		if (i < na->num_tx_rings) {
523 			kring->nm_sync = netmap_txsync_compat; // XXX
524 		} else if (i == na->num_tx_rings) {
525 			kring->nm_sync = netmap_txsync_to_host_compat;
526 		}
527 		/*
528 		 * IMPORTANT: Always keep one slot empty.
529 		 */
530 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
531 		kring->rtail = kring->nr_hwtail = ndesc - 1;
532 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
533 		ND("ktx %s h %d c %d t %d",
534 			kring->name, kring->rhead, kring->rcur, kring->rtail);
535 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
536 		init_waitqueue_head(&kring->si);
537 	}
538 
539 	ndesc = na->num_rx_desc;
540 	for (i = 0; i < nrx; i++) { /* Receive rings */
541 		kring = &na->rx_rings[i];
542 		bzero(kring, sizeof(*kring));
543 		kring->na = na;
544 		kring->ring_id = i;
545 		kring->nkr_num_slots = ndesc;
546 		if (i < na->num_rx_rings) {
547 			kring->nm_sync = netmap_rxsync_compat; // XXX
548 		} else if (i == na->num_rx_rings) {
549 			kring->nm_sync = netmap_rxsync_from_host_compat;
550 		}
551 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
552 		kring->rtail = kring->nr_hwtail = 0;
553 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
554 		ND("krx %s h %d c %d t %d",
555 			kring->name, kring->rhead, kring->rcur, kring->rtail);
556 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
557 		init_waitqueue_head(&kring->si);
558 	}
559 	init_waitqueue_head(&na->tx_si);
560 	init_waitqueue_head(&na->rx_si);
561 
562 	na->tailroom = na->rx_rings + nrx;
563 
564 	return 0;
565 }
566 
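/*
 * With the layout above, the host rings are simply the last entry of each
 * array, e.g. (as used by netmap_hw_krings_delete() and by the host sync
 * routines further below):
 *
 *	struct netmap_kring *host_tx = &na->tx_rings[na->num_tx_rings];
 *	struct netmap_kring *host_rx = &na->rx_rings[na->num_rx_rings];
 *
 * host_tx/host_rx are just illustrative local names.
 */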
567 
568 /* undo the actions performed by netmap_krings_create */
569 void
570 netmap_krings_delete(struct netmap_adapter *na)
571 {
572 	struct netmap_kring *kring = na->tx_rings;
573 
574 	/* we rely on the krings layout described above */
575 	for ( ; kring != na->tailroom; kring++) {
576 		mtx_destroy(&kring->q_lock);
577 	}
578 	free(na->tx_rings, M_DEVBUF);
579 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
580 }
581 
582 
583 /*
584  * Destructor for NIC ports. They also have an mbuf queue
585  * on the rings connected to the host so we need to purge
586  * them first.
587  */
588 static void
589 netmap_hw_krings_delete(struct netmap_adapter *na)
590 {
591 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
592 
593 	ND("destroy sw mbq with len %d", mbq_len(q));
594 	mbq_purge(q);
595 	mbq_safe_destroy(q);
596 	netmap_krings_delete(na);
597 }
598 
599 
600 static struct netmap_if*
601 netmap_if_new(const char *ifname, struct netmap_adapter *na)
602 {
603 	struct netmap_if *nifp;
604 
605 	if (netmap_update_config(na)) {
606 		/* configuration mismatch, report and fail */
607 		return NULL;
608 	}
609 
610 	if (na->active_fds)
611 		goto final;
612 
613 	if (na->nm_krings_create(na))
614 		goto cleanup;
615 
616 	if (netmap_mem_rings_create(na))
617 		goto cleanup;
618 
619 final:
620 
621 	nifp = netmap_mem_if_new(ifname, na);
622 	if (nifp == NULL)
623 		goto cleanup;
624 
625 	return (nifp);
626 
627 cleanup:
628 
629 	if (na->active_fds == 0) {
630 		netmap_mem_rings_delete(na);
631 		na->nm_krings_delete(na);
632 	}
633 
634 	return NULL;
635 }
636 
637 
638 /* grab a reference to the memory allocator, if we don't have one already.  The
639  * reference is taken from the netmap_adapter registered with the priv.
640  *
641  */
642 static int
643 netmap_get_memory_locked(struct netmap_priv_d* p)
644 {
645 	struct netmap_mem_d *nmd;
646 	int error = 0;
647 
648 	if (p->np_na == NULL) {
649 		if (!netmap_mmap_unreg)
650 			return ENODEV;
651 		/* for compatibility with older versions of the API
652  		 * we use the global allocator when no interface has been
653  		 * registered
654  		 */
655 		nmd = &nm_mem;
656 	} else {
657 		nmd = p->np_na->nm_mem;
658 	}
659 	if (p->np_mref == NULL) {
660 		error = netmap_mem_finalize(nmd);
661 		if (!error)
662 			p->np_mref = nmd;
663 	} else if (p->np_mref != nmd) {
664 		/* a virtual port has been registered, but previous
665  		 * syscalls already used the global allocator.
666  		 * We cannot continue
667  		 */
668 		error = ENODEV;
669 	}
670 	return error;
671 }
672 
673 
674 int
675 netmap_get_memory(struct netmap_priv_d* p)
676 {
677 	int error;
678 	NMG_LOCK();
679 	error = netmap_get_memory_locked(p);
680 	NMG_UNLOCK();
681 	return error;
682 }
683 
684 
685 static int
686 netmap_have_memory_locked(struct netmap_priv_d* p)
687 {
688 	return p->np_mref != NULL;
689 }
690 
691 
692 static void
693 netmap_drop_memory_locked(struct netmap_priv_d* p)
694 {
695 	if (p->np_mref) {
696 		netmap_mem_deref(p->np_mref);
697 		p->np_mref = NULL;
698 	}
699 }
700 
701 
702 /*
703  * File descriptor's private data destructor.
704  *
705  * Call nm_register(ifp,0) to stop netmap mode on the interface and
706  * revert to normal operation. We expect that np_na->ifp has not gone.
707  * The second argument is the nifp to work on. In some cases it is
708  * not attached yet to the netmap_priv_d so we need to pass it as
709  * a separate argument.
710  */
711 /* call with NMG_LOCK held */
712 static void
713 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
714 {
715 	struct netmap_adapter *na = priv->np_na;
716 	struct ifnet *ifp = na->ifp;
717 
718 	NMG_LOCK_ASSERT();
719 	na->active_fds--;
720 	if (na->active_fds <= 0) {	/* last instance */
721 
722 		if (netmap_verbose)
723 			D("deleting last instance for %s", NM_IFPNAME(ifp));
724 		/*
725 		 * (TO CHECK) This function is only called
726 		 * when the last reference to this file descriptor goes
727 		 * away. This means we cannot have any pending poll()
728 		 * or interrupt routine operating on the structure.
729 		 * XXX The file may be closed in a thread while
730 		 * another thread is using it.
731 		 * Linux keeps the file opened until the last reference
732 		 * by any outstanding ioctl/poll or mmap is gone.
733 		 * FreeBSD does not track mmap()s (but we do) and
734 		 * wakes up any sleeping poll(). Need to check what
735 		 * happens if the close() occurs while a concurrent
736 		 * syscall is running.
737 		 */
738 		if (ifp)
739 			na->nm_register(na, 0); /* off, clear flags */
740 		/* Wake up any sleeping threads. netmap_poll will
741 		 * then return POLLERR
742 		 * XXX The wake up now must happen during *_down(), when
743 		 * we order all activities to stop. -gl
744 		 */
745 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
746 		/* knlist_destroy(&na->tx_si.si_note); */
747 		/* knlist_destroy(&na->rx_si.si_note); */
748 
749 		/* delete rings and buffers */
750 		netmap_mem_rings_delete(na);
751 		na->nm_krings_delete(na);
752 	}
753 	/* delete the nifp */
754 	netmap_mem_if_delete(na, nifp);
755 }
756 
757 static __inline int
758 nm_tx_si_user(struct netmap_priv_d *priv)
759 {
760 	return (priv->np_na != NULL &&
761 		(priv->np_txqlast - priv->np_txqfirst > 1));
762 }
763 
764 static __inline int
765 nm_rx_si_user(struct netmap_priv_d *priv)
766 {
767 	return (priv->np_na != NULL &&
768 		(priv->np_rxqlast - priv->np_rxqfirst > 1));
769 }
770 
771 
772 /*
773  * returns 1 if this is the last instance and we can free priv
774  */
775 int
776 netmap_dtor_locked(struct netmap_priv_d *priv)
777 {
778 	struct netmap_adapter *na = priv->np_na;
779 
780 #ifdef __FreeBSD__
781 	/*
782 	 * np_refcount is the number of active mmaps on
783 	 * this file descriptor
784 	 */
785 	if (--priv->np_refcount > 0) {
786 		return 0;
787 	}
788 #endif /* __FreeBSD__ */
789 	if (!na) {
790 	    return 1; //XXX is it correct?
791 	}
792 	netmap_do_unregif(priv, priv->np_nifp);
793 	priv->np_nifp = NULL;
794 	netmap_drop_memory_locked(priv);
795 	if (priv->np_na) {
796 		if (nm_tx_si_user(priv))
797 			na->tx_si_users--;
798 		if (nm_rx_si_user(priv))
799 			na->rx_si_users--;
800 		netmap_adapter_put(na);
801 		priv->np_na = NULL;
802 	}
803 	return 1;
804 }
805 
806 
807 void
808 netmap_dtor(void *data)
809 {
810 	struct netmap_priv_d *priv = data;
811 	int last_instance;
812 
813 	NMG_LOCK();
814 	last_instance = netmap_dtor_locked(priv);
815 	NMG_UNLOCK();
816 	if (last_instance) {
817 		bzero(priv, sizeof(*priv));	/* for safety */
818 		free(priv, M_DEVBUF);
819 	}
820 }
821 
822 
823 
824 
825 /*
826  * Handlers for synchronization of the queues from/to the host.
827  * Netmap has two operating modes:
828  * - in the default mode, the rings connected to the host stack are
829  *   just another ring pair managed by userspace;
830  * - in transparent mode (XXX to be defined) incoming packets
831  *   (from the host or the NIC) are marked as NS_FORWARD upon
832  *   arrival, and the user application has a chance to reset the
833  *   flag for packets that should be dropped.
834  *   On the RXSYNC or poll(), packets in RX rings between
835  *   kring->nr_hwcur and ring->head with NS_FORWARD still set are moved
836  *   to the other side.
837  * The transfer NIC --> host is relatively easy, just encapsulate
838  * into mbufs and we are done. The host --> NIC side is slightly
839  * harder because there might not be room in the tx ring so it
840  * might take a while before releasing the buffer.
841  */
842 
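/*
 * Illustrative user-side fragment for transparent mode (a sketch, not part
 * of the module): before releasing slots, make sure NS_FORWARD is set only
 * on those that should be passed to the other side; netmap_grab_packets()
 * and netmap_sw_to_nic() below implement the kernel side.  nm_ring_next()
 * and want_to_forward() are respectively a netmap_user.h helper and a
 * hypothetical application predicate.
 *
 *	for (i = ring->cur; i != ring->tail; i = nm_ring_next(ring, i)) {
 *		struct netmap_slot *slot = &ring->slot[i];
 *
 *		if (want_to_forward(slot))
 *			slot->flags |= NS_FORWARD;
 *		else
 *			slot->flags &= ~NS_FORWARD;
 *	}
 *	ring->head = ring->cur = ring->tail;	// release all inspected slots
 */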
843 
844 /*
845  * pass a chain of buffers to the host stack as coming from 'dst'.
846  * We do not need to lock because the queue is private.
847  */
848 static void
849 netmap_send_up(struct ifnet *dst, struct mbq *q)
850 {
851 	struct mbuf *m;
852 
853 	/* send packets up, outside the lock */
854 	while ((m = mbq_dequeue(q)) != NULL) {
855 		if (netmap_verbose & NM_VERB_HOST)
856 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
857 		NM_SEND_UP(dst, m);
858 	}
859 	mbq_destroy(q);
860 }
861 
862 
863 /*
864  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
865  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
866  * and pass them up. Drop remaining packets in the unlikely event
867  * of an mbuf shortage.
868  */
869 static void
870 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
871 {
872 	u_int const lim = kring->nkr_num_slots - 1;
873 	u_int const head = kring->ring->head;
874 	u_int n;
875 	struct netmap_adapter *na = kring->na;
876 
877 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
878 		struct mbuf *m;
879 		struct netmap_slot *slot = &kring->ring->slot[n];
880 
881 		if ((slot->flags & NS_FORWARD) == 0 && !force)
882 			continue;
883 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
884 			RD(5, "bad pkt at %d len %d", n, slot->len);
885 			continue;
886 		}
887 		slot->flags &= ~NS_FORWARD; // XXX needed ?
888 		/* XXX TODO: adapt to the case of a multisegment packet */
889 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
890 
891 		if (m == NULL)
892 			break;
893 		mbq_enqueue(q, m);
894 	}
895 }
896 
897 
898 /*
899  * Send to the NIC rings packets marked NS_FORWARD between
900  * kring->nr_hwcur and kring->rhead.
901  * Called under kring->rx_queue.lock on the sw rx ring.
902  */
903 static u_int
904 netmap_sw_to_nic(struct netmap_adapter *na)
905 {
906 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
907 	struct netmap_slot *rxslot = kring->ring->slot;
908 	u_int i, rxcur = kring->nr_hwcur;
909 	u_int const head = kring->rhead;
910 	u_int const src_lim = kring->nkr_num_slots - 1;
911 	u_int sent = 0;
912 
913 	/* scan rings to find space, then fill as much as possible */
914 	for (i = 0; i < na->num_tx_rings; i++) {
915 		struct netmap_kring *kdst = &na->tx_rings[i];
916 		struct netmap_ring *rdst = kdst->ring;
917 		u_int const dst_lim = kdst->nkr_num_slots - 1;
918 
919 		/* XXX do we trust ring or kring->rcur,rtail ? */
920 		for (; rxcur != head && !nm_ring_empty(rdst);
921 		     rxcur = nm_next(rxcur, src_lim) ) {
922 			struct netmap_slot *src, *dst, tmp;
923 			u_int dst_cur = rdst->cur;
924 
925 			src = &rxslot[rxcur];
926 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
927 				continue;
928 
929 			sent++;
930 
931 			dst = &rdst->slot[dst_cur];
932 
933 			tmp = *src;
934 
935 			src->buf_idx = dst->buf_idx;
936 			src->flags = NS_BUF_CHANGED;
937 
938 			dst->buf_idx = tmp.buf_idx;
939 			dst->len = tmp.len;
940 			dst->flags = NS_BUF_CHANGED;
941 
942 			rdst->cur = nm_next(dst_cur, dst_lim);
943 		}
944 		/* if (sent) XXX txsync ? */
945 	}
946 	return sent;
947 }
948 
949 
950 /*
951  * netmap_txsync_to_host() passes packets up. We are called from a
952  * system call in user process context, and the only contention
953  * can be among multiple user threads erroneously calling
954  * this routine concurrently.
955  */
956 void
957 netmap_txsync_to_host(struct netmap_adapter *na)
958 {
959 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
960 	struct netmap_ring *ring = kring->ring;
961 	u_int const lim = kring->nkr_num_slots - 1;
962 	u_int const head = kring->rhead;
963 	struct mbq q;
964 
965 	/* Take packets from hwcur to head and pass them up.
966 	 * force ring->cur = head since netmap_grab_packets() stops at head
967 	 * In case of no buffers we give up. At the end of the loop,
968 	 * the queue is drained in all cases.
969 	 */
970 	mbq_init(&q);
971 	ring->cur = head;
972 	netmap_grab_packets(kring, &q, 1 /* force */);
973 	ND("have %d pkts in queue", mbq_len(&q));
974 	kring->nr_hwcur = head;
975 	kring->nr_hwtail = head + lim;
976 	if (kring->nr_hwtail > lim)
977 		kring->nr_hwtail -= lim + 1;
978 	nm_txsync_finalize(kring);
979 
980 	netmap_send_up(na->ifp, &q);
981 }
982 
983 
984 /*
985  * rxsync backend for packets coming from the host stack.
986  * They have been put in kring->rx_queue by netmap_transmit().
987  * We protect access to the kring using kring->rx_queue.lock
988  *
989  * This routine also does the selrecord if called from the poll handler
990  * (we know because td != NULL).
991  *
992  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
993  *     as an additional hidden argument.
994  * Returns the number of packets delivered to tx queues in
995  * transparent mode, or a negative value on error.
996  */
997 int
998 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
999 {
1000 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1001 	struct netmap_ring *ring = kring->ring;
1002 	u_int nm_i, n;
1003 	u_int const lim = kring->nkr_num_slots - 1;
1004 	u_int const head = kring->rhead;
1005 	int ret = 0;
1006 	struct mbq *q = &kring->rx_queue;
1007 
1008 	(void)pwait;	/* disable unused warnings */
1009 	(void)td;
1010 
1011 	mtx_lock(&q->lock);
1012 
1013 	/* First part: import newly received packets */
1014 	n = mbq_len(q);
1015 	if (n) { /* grab packets from the queue */
1016 		struct mbuf *m;
1017 		uint32_t stop_i;
1018 
1019 		nm_i = kring->nr_hwtail;
1020 		stop_i = nm_prev(nm_i, lim);
1021 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1022 			int len = MBUF_LEN(m);
1023 			struct netmap_slot *slot = &ring->slot[nm_i];
1024 
1025 			m_copydata(m, 0, len, BDG_NMB(na, slot));
1026 			ND("nm %d len %d", nm_i, len);
1027 			if (netmap_verbose)
1028                                 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
1029 
1030 			slot->len = len;
1031 			slot->flags = kring->nkr_slot_flags;
1032 			nm_i = nm_next(nm_i, lim);
1033 		}
1034 		kring->nr_hwtail = nm_i;
1035 	}
1036 
1037 	/*
1038 	 * Second part: skip past packets that userspace has released.
1039 	 */
1040 	nm_i = kring->nr_hwcur;
1041 	if (nm_i != head) { /* something was released */
1042 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1043 			ret = netmap_sw_to_nic(na);
1044 		kring->nr_hwcur = head;
1045 	}
1046 
1047 	nm_rxsync_finalize(kring);
1048 
1049 	/* access copies of cur,tail in the kring */
1050 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1051 		selrecord(td, &kring->si);
1052 
1053 	mtx_unlock(&q->lock);
1054 	return ret;
1055 }
1056 
1057 
1058 /* Get a netmap adapter for the port.
1059  *
1060  * If it is possible to satisfy the request, return 0
1061  * with *na containing the netmap adapter found.
1062  * Otherwise return an error code, with *na containing NULL.
1063  *
1064  * When the port is attached to a bridge, we always return
1065  * EBUSY.
1066  * Otherwise, if the port is already bound to a file descriptor,
1067  * then we unconditionally return the existing adapter into *na.
1068  * In all the other cases, we return (into *na) either native,
1069  * generic or NULL, according to the following table:
1070  *
1071  *					native_support
1072  * active_fds   dev.netmap.admode         YES     NO
1073  * -------------------------------------------------------
1074  *    >0              *                 NA(ifp) NA(ifp)
1075  *
1076  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1077  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1078  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1079  *
1080  */
1081 
1082 int
1083 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1084 {
1085 	/* generic support */
1086 	int i = netmap_admode;	/* Take a snapshot. */
1087 	int error = 0;
1088 	struct netmap_adapter *prev_na;
1089 	struct netmap_generic_adapter *gna;
1090 
1091 	*na = NULL; /* default */
1092 
1093 	/* reset in case of invalid value */
1094 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1095 		i = netmap_admode = NETMAP_ADMODE_BEST;
1096 
1097 	if (NETMAP_CAPABLE(ifp)) {
1098 		/* If an adapter already exists, but is
1099 		 * attached to a vale port, we report that the
1100 		 * port is busy.
1101 		 */
1102 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
1103 			return EBUSY;
1104 
1105 		/* If an adapter already exists, return it if
1106 		 * there are active file descriptors or if
1107 		 * netmap is not forced to use generic
1108 		 * adapters.
1109 		 */
1110 		if (NA(ifp)->active_fds > 0 ||
1111 				i != NETMAP_ADMODE_GENERIC) {
1112 			*na = NA(ifp);
1113 			return 0;
1114 		}
1115 	}
1116 
1117 	/* If there isn't native support and netmap is not allowed
1118 	 * to use generic adapters, we cannot satisfy the request.
1119 	 */
1120 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1121 		return EOPNOTSUPP;
1122 
1123 	/* Otherwise, create a generic adapter and return it,
1124 	 * saving the previously used netmap adapter, if any.
1125 	 *
1126 	 * Note that here 'prev_na', if not NULL, MUST be a
1127 	 * native adapter, and CANNOT be a generic one. This is
1128 	 * true because generic adapters are created on demand, and
1129 	 * destroyed when not used anymore. Therefore, if the adapter
1130 	 * currently attached to an interface 'ifp' is generic, it
1131 	 * must be that
1132 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1133 	 * Consequently, if NA(ifp) is generic, we will enter one of
1134 	 * the branches above. This ensures that we never override
1135 	 * a generic adapter with another generic adapter.
1136 	 */
1137 	prev_na = NA(ifp);
1138 	error = generic_netmap_attach(ifp);
1139 	if (error)
1140 		return error;
1141 
1142 	*na = NA(ifp);
1143 	gna = (struct netmap_generic_adapter*)NA(ifp);
1144 	gna->prev = prev_na; /* save old na */
1145 	if (prev_na != NULL) {
1146 		ifunit_ref(ifp->if_xname);
1147 		// XXX add a refcount ?
1148 		netmap_adapter_get(prev_na);
1149 	}
1150 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1151 
1152 	return 0;
1153 }
1154 
1155 
1156 /*
1157  * MUST BE CALLED UNDER NMG_LOCK()
1158  *
1159  * Get a refcounted reference to a netmap adapter attached
1160  * to the interface specified by nmr.
1161  * This is always called in the execution of an ioctl().
1162  *
1163  * Return ENXIO if the interface specified by the request does
1164  * not exist, ENOTSUP if netmap is not supported by the interface,
1165  * EBUSY if the interface is already attached to a bridge,
1166  * EINVAL if parameters are invalid, ENOMEM if needed resources
1167  * could not be allocated.
1168  * If successful, hold a reference to the netmap adapter.
1169  *
1170  * No reference is kept on the real interface, which may then
1171  * disappear at any time.
1172  */
1173 int
1174 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1175 {
1176 	struct ifnet *ifp = NULL;
1177 	int error = 0;
1178 	struct netmap_adapter *ret = NULL;
1179 
1180 	*na = NULL;     /* default return value */
1181 
1182 	/* first try to see if this is a bridge port. */
1183 	NMG_LOCK_ASSERT();
1184 
1185 	error = netmap_get_pipe_na(nmr, na, create);
1186 	if (error || *na != NULL)
1187 		return error;
1188 
1189 	error = netmap_get_bdg_na(nmr, na, create);
1190 	if (error)
1191 		return error;
1192 
1193 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1194 		goto pipes;
1195 
1196 	ifp = ifunit_ref(nmr->nr_name);
1197 	if (ifp == NULL) {
1198 	        return ENXIO;
1199 	}
1200 
1201 	error = netmap_get_hw_na(ifp, &ret);
1202 	if (error)
1203 		goto out;
1204 
1205 	/* Users cannot use the NIC attached to a bridge directly */
1206 	if (NETMAP_OWNED_BY_KERN(ret)) {
1207 		error = EBUSY;
1208 		goto out;
1209 	}
1210 	*na = ret;
1211 	netmap_adapter_get(ret);
1212 
1213 pipes:
1214 	error = netmap_pipe_alloc(*na, nmr);
1215 
1216 out:
1217 	if (error && ret != NULL)
1218 		netmap_adapter_put(ret);
1219 
1220 	if (ifp)
1221 		if_rele(ifp);
1222 
1223 	return error;
1224 }
1225 
1226 
1227 /*
1228  * validate parameters on entry for *_txsync()
1229  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1230  * in case of error.
1231  *
1232  * rhead, rcur and rtail=hwtail are stored from previous round.
1233  * hwcur is the next packet to send to the ring.
1234  *
1235  * We want
1236  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1237  *
1238  * hwcur, rhead, rtail and hwtail are reliable
1239  */
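/*
 * Worked example (illustrative, with n = 8 slots): if nr_hwcur = rhead = 2
 * and rtail = 6, a caller that advances ring->head to 4 and ring->cur to 5
 * satisfies
 *	hwcur(2) <= rhead(2) <= head(4) <= cur(5) <= tail = rtail(6)
 * and the function below returns 4.  head = 7 or cur = 1 would instead hit
 * the error branch, and the caller would then reinitialize the ring via
 * netmap_ring_reinit().
 */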
1240 u_int
1241 nm_txsync_prologue(struct netmap_kring *kring)
1242 {
1243 	struct netmap_ring *ring = kring->ring;
1244 	u_int head = ring->head; /* read only once */
1245 	u_int cur = ring->cur; /* read only once */
1246 	u_int n = kring->nkr_num_slots;
1247 
1248 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1249 		kring->name,
1250 		kring->nr_hwcur, kring->nr_hwtail,
1251 		ring->head, ring->cur, ring->tail);
1252 #if 1 /* kernel sanity checks; but we can trust the kring. */
1253 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1254 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1255 		goto error;
1256 #endif /* kernel sanity checks */
1257 	/*
1258 	 * user sanity checks on head and cur.
1259 	 * A, B, ... are possible positions for cur:
1260 	 *
1261 	 *  0    A  cur   B  tail  C  n-1
1262 	 *  0    D  tail  E  cur   F  n-1
1263 	 *
1264 	 * B, F, D are valid. A, C, E are wrong
1265 	 */
1266 	if (kring->rtail >= kring->rhead) {
1267 		/* want rhead <= head <= rtail */
1268 		if (head < kring->rhead || head > kring->rtail)
1269 			goto error;
1270 		/* and also head <= cur <= rtail */
1271 		if (cur < head || cur > kring->rtail)
1272 			goto error;
1273 	} else { /* here rtail < rhead */
1274 		/* we need head outside rtail .. rhead */
1275 		if (head > kring->rtail && head < kring->rhead)
1276 			goto error;
1277 
1278 		/* two cases now: head <= rtail or head >= rhead  */
1279 		if (head <= kring->rtail) {
1280 			/* want head <= cur <= rtail */
1281 			if (cur < head || cur > kring->rtail)
1282 				goto error;
1283 		} else { /* head >= rhead */
1284 			/* cur must be outside rtail..head */
1285 			if (cur > kring->rtail && cur < head)
1286 				goto error;
1287 		}
1288 	}
1289 	if (ring->tail != kring->rtail) {
1290 		RD(5, "tail overwritten was %d need %d",
1291 			ring->tail, kring->rtail);
1292 		ring->tail = kring->rtail;
1293 	}
1294 	kring->rhead = head;
1295 	kring->rcur = cur;
1296 	return head;
1297 
1298 error:
1299 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1300 		kring->name,
1301 		kring->nr_hwcur,
1302 		kring->rcur, kring->nr_hwtail,
1303 		cur, ring->tail);
1304 	return n;
1305 }
1306 
1307 
1308 /*
1309  * validate parameters on entry for *_rxsync()
1310  * Returns ring->head if ok, kring->nkr_num_slots on error.
1311  *
1312  * For a valid configuration,
1313  * hwcur <= head <= cur <= tail <= hwtail
1314  *
1315  * We only consider head and cur.
1316  * hwcur and hwtail are reliable.
1317  *
1318  */
1319 u_int
1320 nm_rxsync_prologue(struct netmap_kring *kring)
1321 {
1322 	struct netmap_ring *ring = kring->ring;
1323 	uint32_t const n = kring->nkr_num_slots;
1324 	uint32_t head, cur;
1325 
1326 	ND("%s kc %d kt %d h %d c %d t %d",
1327 		kring->name,
1328 		kring->nr_hwcur, kring->nr_hwtail,
1329 		ring->head, ring->cur, ring->tail);
1330 	/*
1331 	 * Before storing the new values, we should check they do not
1332 	 * move backwards. However:
1333 	 * - head is not an issue because the previous value is hwcur;
1334 	 * - cur could in principle go back, however it does not matter
1335 	 *   because we are processing a brand new rxsync()
1336 	 */
1337 	cur = kring->rcur = ring->cur;	/* read only once */
1338 	head = kring->rhead = ring->head;	/* read only once */
1339 #if 1 /* kernel sanity checks */
1340 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1341 		goto error;
1342 #endif /* kernel sanity checks */
1343 	/* user sanity checks */
1344 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1345 		/* want hwcur <= rhead <= hwtail */
1346 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1347 			goto error;
1348 		/* and also rhead <= rcur <= hwtail */
1349 		if (cur < head || cur > kring->nr_hwtail)
1350 			goto error;
1351 	} else {
1352 		/* we need rhead outside hwtail..hwcur */
1353 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1354 			goto error;
1355 		/* two cases now: head <= hwtail or head >= hwcur  */
1356 		if (head <= kring->nr_hwtail) {
1357 			/* want head <= cur <= hwtail */
1358 			if (cur < head || cur > kring->nr_hwtail)
1359 				goto error;
1360 		} else {
1361 			/* cur must be outside hwtail..head */
1362 			if (cur < head && cur > kring->nr_hwtail)
1363 				goto error;
1364 		}
1365 	}
1366 	if (ring->tail != kring->rtail) {
1367 		RD(5, "%s tail overwritten was %d need %d",
1368 			kring->name,
1369 			ring->tail, kring->rtail);
1370 		ring->tail = kring->rtail;
1371 	}
1372 	return head;
1373 
1374 error:
1375 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1376 		kring->nr_hwcur,
1377 		kring->rcur, kring->nr_hwtail,
1378 		kring->rhead, kring->rcur, ring->tail);
1379 	return n;
1380 }
1381 
1382 
1383 /*
1384  * Error routine called when txsync/rxsync detects an error.
1385  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1386  * Return 1 on reinit.
1387  *
1388  * This routine is only called by the upper half of the kernel.
1389  * It only reads hwcur (which is changed only by the upper half, too)
1390  * and hwtail (which may be changed by the lower half, but only on
1391  * a tx ring and only to increase it, so any error will be recovered
1392  * on the next call). For the above, we don't strictly need to call
1393  * it under lock.
1394  */
1395 int
1396 netmap_ring_reinit(struct netmap_kring *kring)
1397 {
1398 	struct netmap_ring *ring = kring->ring;
1399 	u_int i, lim = kring->nkr_num_slots - 1;
1400 	int errors = 0;
1401 
1402 	// XXX KASSERT nm_kr_tryget
1403 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1404 	// XXX probably wrong to trust userspace
1405 	kring->rhead = ring->head;
1406 	kring->rcur  = ring->cur;
1407 	kring->rtail = ring->tail;
1408 
1409 	if (ring->cur > lim)
1410 		errors++;
1411 	if (ring->head > lim)
1412 		errors++;
1413 	if (ring->tail > lim)
1414 		errors++;
1415 	for (i = 0; i <= lim; i++) {
1416 		u_int idx = ring->slot[i].buf_idx;
1417 		u_int len = ring->slot[i].len;
1418 		if (idx < 2 || idx >= netmap_total_buffers) {
1419 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1420 			ring->slot[i].buf_idx = 0;
1421 			ring->slot[i].len = 0;
1422 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1423 			ring->slot[i].len = 0;
1424 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1425 		}
1426 	}
1427 	if (errors) {
1428 		RD(10, "total %d errors", errors);
1429 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1430 			kring->name,
1431 			ring->cur, kring->nr_hwcur,
1432 			ring->tail, kring->nr_hwtail);
1433 		ring->head = kring->rhead = kring->nr_hwcur;
1434 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1435 		ring->tail = kring->rtail = kring->nr_hwtail;
1436 	}
1437 	return (errors ? 1 : 0);
1438 }
1439 
1440 
1441 /*
1442  * Set the ring ID. For devices with a single queue, a request
1443  * for all rings is the same as a single ring.
1444  */
1445 static int
1446 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1447 {
1448 	struct netmap_adapter *na = priv->np_na;
1449 	u_int j, i = ringid & NETMAP_RING_MASK;
1450 	u_int reg = flags & NR_REG_MASK;
1451 
1452 	if (reg == NR_REG_DEFAULT) {
1453 		/* convert from old ringid to flags */
1454 		if (ringid & NETMAP_SW_RING) {
1455 			reg = NR_REG_SW;
1456 		} else if (ringid & NETMAP_HW_RING) {
1457 			reg = NR_REG_ONE_NIC;
1458 		} else {
1459 			reg = NR_REG_ALL_NIC;
1460 		}
1461 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1462 	}
1463 	switch (reg) {
1464 	case NR_REG_ALL_NIC:
1465 	case NR_REG_PIPE_MASTER:
1466 	case NR_REG_PIPE_SLAVE:
1467 		priv->np_txqfirst = 0;
1468 		priv->np_txqlast = na->num_tx_rings;
1469 		priv->np_rxqfirst = 0;
1470 		priv->np_rxqlast = na->num_rx_rings;
1471 		ND("%s %d %d", "ALL/PIPE",
1472 			priv->np_rxqfirst, priv->np_rxqlast);
1473 		break;
1474 	case NR_REG_SW:
1475 	case NR_REG_NIC_SW:
1476 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1477 			D("host rings not supported");
1478 			return EINVAL;
1479 		}
1480 		priv->np_txqfirst = (reg == NR_REG_SW ?
1481 			na->num_tx_rings : 0);
1482 		priv->np_txqlast = na->num_tx_rings + 1;
1483 		priv->np_rxqfirst = (reg == NR_REG_SW ?
1484 			na->num_rx_rings : 0);
1485 		priv->np_rxqlast = na->num_rx_rings + 1;
1486 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1487 			priv->np_rxqfirst, priv->np_rxqlast);
1488 		break;
1489 	case NR_REG_ONE_NIC:
1490 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1491 			D("invalid ring id %d", i);
1492 			return EINVAL;
1493 		}
1494 		/* if not enough rings, use the first one */
1495 		j = i;
1496 		if (j >= na->num_tx_rings)
1497 			j = 0;
1498 		priv->np_txqfirst = j;
1499 		priv->np_txqlast = j + 1;
1500 		j = i;
1501 		if (j >= na->num_rx_rings)
1502 			j = 0;
1503 		priv->np_rxqfirst = j;
1504 		priv->np_rxqlast = j + 1;
1505 		break;
1506 	default:
1507 		D("invalid regif type %d", reg);
1508 		return EINVAL;
1509 	}
1510 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1511 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1512 	if (nm_tx_si_user(priv))
1513 		na->tx_si_users++;
1514 	if (nm_rx_si_user(priv))
1515 		na->rx_si_users++;
1516 	if (netmap_verbose) {
1517 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1518 			NM_IFPNAME(na->ifp),
1519 			priv->np_txqfirst,
1520 			priv->np_txqlast,
1521 			priv->np_rxqfirst,
1522 			priv->np_rxqlast,
1523 			i);
1524 	}
1525 	return 0;
1526 }
1527 
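/*
 * Example (illustrative): a process that wants to work only on hardware
 * ring 2 (assuming the NIC has more than two rings) passes
 * nr_flags = NR_REG_ONE_NIC and nr_ringid = 2, which yields
 * np_txqfirst = np_rxqfirst = 2 and np_txqlast = np_rxqlast = 3.
 * The deprecated encoding nr_ringid = NETMAP_HW_RING | 2 is converted
 * by the code above to the same result.
 */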
1528 /*
1529  * possibly move the interface to netmap-mode.
1530  * On success it returns a pointer to netmap_if, otherwise NULL.
1531  * This must be called with NMG_LOCK held.
1532  */
1533 struct netmap_if *
1534 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1535 	uint16_t ringid, uint32_t flags, int *err)
1536 {
1537 	struct ifnet *ifp = na->ifp;
1538 	struct netmap_if *nifp = NULL;
1539 	int error, need_mem = 0;
1540 
1541 	NMG_LOCK_ASSERT();
1542 	/* ring configuration may have changed, fetch from the card */
1543 	netmap_update_config(na);
1544 	priv->np_na = na;     /* store the reference */
1545 	error = netmap_set_ringid(priv, ringid, flags);
1546 	if (error)
1547 		goto out;
1548 	/* ensure allocators are ready */
1549 	need_mem = !netmap_have_memory_locked(priv);
1550 	if (need_mem) {
1551 		error = netmap_get_memory_locked(priv);
1552 		ND("get_memory returned %d", error);
1553 		if (error)
1554 			goto out;
1555 	}
1556 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1557 	if (nifp == NULL) { /* allocation failed */
1558 		/* we should drop the allocator, but only
1559 		 * if we were the ones who grabbed it
1560 		 */
1561 		error = ENOMEM;
1562 		goto out;
1563 	}
1564 	na->active_fds++;
1565 	if (ifp->if_capenable & IFCAP_NETMAP) {
1566 		/* was already set */
1567 	} else {
1568 		/* Otherwise set the card in netmap mode
1569 		 * and make it use the shared buffers.
1570 		 *
1571 		 * do not core lock because the race is harmless here,
1572 		 * there cannot be any traffic to netmap_transmit()
1573 		 */
1574 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1575 		ND("%p->na_lut == %p", na, na->na_lut);
1576 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1577 		error = na->nm_register(na, 1); /* mode on */
1578 		if (error) {
1579 			netmap_do_unregif(priv, nifp);
1580 			nifp = NULL;
1581 		}
1582 	}
1583 out:
1584 	*err = error;
1585 	if (error) {
1586 		priv->np_na = NULL;
1587 		if (need_mem)
1588 			netmap_drop_memory_locked(priv);
1589 	}
1590 	if (nifp != NULL) {
1591 		/*
1592 		 * advertise that the interface is ready by setting np_nifp.
1593 		 * The barrier is needed because readers (poll and *SYNC)
1594 		 * check for priv->np_nifp != NULL without locking
1595 		 */
1596 		wmb(); /* make sure previous writes are visible to all CPUs */
1597 		priv->np_nifp = nifp;
1598 	}
1599 	return nifp;
1600 }
1601 
1602 
1603 
1604 /*
1605  * ioctl(2) support for the "netmap" device.
1606  *
1607  * The following is a list of accepted commands:
1608  * - NIOCGINFO
1609  * - SIOCGIFADDR	just for convenience
1610  * - NIOCREGIF
1611  * - NIOCTXSYNC
1612  * - NIOCRXSYNC
1613  *
1614  * Return 0 on success, errno otherwise.
1615  */
1616 int
1617 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1618 	int fflag, struct thread *td)
1619 {
1620 	struct netmap_priv_d *priv = NULL;
1621 	struct ifnet *ifp = NULL;
1622 	struct nmreq *nmr = (struct nmreq *) data;
1623 	struct netmap_adapter *na = NULL;
1624 	int error;
1625 	u_int i, qfirst, qlast;
1626 	struct netmap_if *nifp;
1627 	struct netmap_kring *krings;
1628 
1629 	(void)dev;	/* UNUSED */
1630 	(void)fflag;	/* UNUSED */
1631 
1632 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
1633 		/* truncate name */
1634 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
1635 		if (nmr->nr_version != NETMAP_API) {
1636 			D("API mismatch for %s got %d need %d",
1637 				nmr->nr_name,
1638 				nmr->nr_version, NETMAP_API);
1639 			nmr->nr_version = NETMAP_API;
1640 		}
1641 		if (nmr->nr_version < NETMAP_MIN_API ||
1642 		    nmr->nr_version > NETMAP_MAX_API) {
1643 			return EINVAL;
1644 		}
1645 	}
1646 	CURVNET_SET(TD_TO_VNET(td));
1647 
1648 	error = devfs_get_cdevpriv((void **)&priv);
1649 	if (error) {
1650 		CURVNET_RESTORE();
1651 		/* XXX ENOENT should be impossible, since the priv
1652 		 * is now created in the open */
1653 		return (error == ENOENT ? ENXIO : error);
1654 	}
1655 
1656 	switch (cmd) {
1657 	case NIOCGINFO:		/* return capabilities etc */
1658 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1659 			error = netmap_bdg_ctl(nmr, NULL);
1660 			break;
1661 		}
1662 
1663 		NMG_LOCK();
1664 		do {
1665 			/* memsize is always valid */
1666 			struct netmap_mem_d *nmd = &nm_mem;
1667 			u_int memflags;
1668 
1669 			if (nmr->nr_name[0] != '\0') {
1670 				/* get a refcount */
1671 				error = netmap_get_na(nmr, &na, 1 /* create */);
1672 				if (error)
1673 					break;
1674 				nmd = na->nm_mem; /* get memory allocator */
1675 			}
1676 
1677 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
1678 				&nmr->nr_arg2);
1679 			if (error)
1680 				break;
1681 			if (na == NULL) /* only memory info */
1682 				break;
1683 			nmr->nr_offset = 0;
1684 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1685 			netmap_update_config(na);
1686 			nmr->nr_rx_rings = na->num_rx_rings;
1687 			nmr->nr_tx_rings = na->num_tx_rings;
1688 			nmr->nr_rx_slots = na->num_rx_desc;
1689 			nmr->nr_tx_slots = na->num_tx_desc;
1690 			netmap_adapter_put(na);
1691 		} while (0);
1692 		NMG_UNLOCK();
1693 		break;
1694 
1695 	case NIOCREGIF:
1696 		/* possibly attach/detach NIC and VALE switch */
1697 		i = nmr->nr_cmd;
1698 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
1699 				|| i == NETMAP_BDG_VNET_HDR) {
1700 			error = netmap_bdg_ctl(nmr, NULL);
1701 			break;
1702 		} else if (i != 0) {
1703 			D("nr_cmd must be 0 not %d", i);
1704 			error = EINVAL;
1705 			break;
1706 		}
1707 
1708 		/* protect access to priv from concurrent NIOCREGIF */
1709 		NMG_LOCK();
1710 		do {
1711 			u_int memflags;
1712 
1713 			if (priv->np_na != NULL) {	/* thread already registered */
1714 				error = EBUSY;
1715 				break;
1716 			}
1717 			/* find the interface and a reference */
1718 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1719 			if (error)
1720 				break;
1721 			ifp = na->ifp;
1722 			if (NETMAP_OWNED_BY_KERN(na)) {
1723 				netmap_adapter_put(na);
1724 				error = EBUSY;
1725 				break;
1726 			}
1727 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
1728 			if (!nifp) {    /* reg. failed, release priv and ref */
1729 				netmap_adapter_put(na);
1730 				priv->np_nifp = NULL;
1731 				break;
1732 			}
1733 			priv->np_td = td; // XXX kqueue, debugging only
1734 
1735 			/* return the offset of the netmap_if object */
1736 			nmr->nr_rx_rings = na->num_rx_rings;
1737 			nmr->nr_tx_rings = na->num_tx_rings;
1738 			nmr->nr_rx_slots = na->num_rx_desc;
1739 			nmr->nr_tx_slots = na->num_tx_desc;
1740 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
1741 				&nmr->nr_arg2);
1742 			if (error) {
1743 				netmap_adapter_put(na);
1744 				break;
1745 			}
1746 			if (memflags & NETMAP_MEM_PRIVATE) {
1747 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1748 			}
1749 			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
1750 				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
1751 			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
1752 				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
1753 
1754 			if (nmr->nr_arg3) {
1755 				D("requested %d extra buffers", nmr->nr_arg3);
1756 				nmr->nr_arg3 = netmap_extra_alloc(na,
1757 					&nifp->ni_bufs_head, nmr->nr_arg3);
1758 				D("got %d extra buffers", nmr->nr_arg3);
1759 			}
1760 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1761 		} while (0);
1762 		NMG_UNLOCK();
1763 		break;
1764 
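	/*
	 * NIOCTXSYNC/NIOCRXSYNC force a txsync/rxsync on the rings bound
	 * to this file descriptor. A minimal userspace sketch (illustrative
	 * only, not part of this file): it assumes the netmap_user.h
	 * helpers NETMAP_TXRING(), NETMAP_BUF(), nm_ring_space() and
	 * nm_ring_next(), plus hypothetical have_packet()/fill_packet()
	 * routines, on a descriptor already bound with NIOCREGIF:
	 *
	 *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
	 *
	 *	while (nm_ring_space(ring) > 0 && have_packet()) {
	 *		struct netmap_slot *slot = &ring->slot[ring->cur];
	 *		slot->len = fill_packet(NETMAP_BUF(ring, slot->buf_idx));
	 *		ring->head = ring->cur = nm_ring_next(ring, ring->cur);
	 *	}
	 *	ioctl(fd, NIOCTXSYNC, NULL);	// tell the kernel to transmit
	 */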
1765 	case NIOCTXSYNC:
1766 	case NIOCRXSYNC:
1767 		nifp = priv->np_nifp;
1768 
1769 		if (nifp == NULL) {
1770 			error = ENXIO;
1771 			break;
1772 		}
1773 		rmb(); /* make sure following reads are not from cache */
1774 
1775 		na = priv->np_na;      /* we have a reference */
1776 
1777 		if (na == NULL) {
1778 			D("Internal error: nifp != NULL && na == NULL");
1779 			error = ENXIO;
1780 			break;
1781 		}
1782 
1783 		ifp = na->ifp;
1784 		if (ifp == NULL) {
1785 			RD(1, "the ifp is gone");
1786 			error = ENXIO;
1787 			break;
1788 		}
1789 
1790 		if (cmd == NIOCTXSYNC) {
1791 			krings = na->tx_rings;
1792 			qfirst = priv->np_txqfirst;
1793 			qlast = priv->np_txqlast;
1794 		} else {
1795 			krings = na->rx_rings;
1796 			qfirst = priv->np_rxqfirst;
1797 			qlast = priv->np_rxqlast;
1798 		}
1799 
1800 		for (i = qfirst; i < qlast; i++) {
1801 			struct netmap_kring *kring = krings + i;
1802 			if (nm_kr_tryget(kring)) {
1803 				error = EBUSY;
1804 				goto out;
1805 			}
1806 			if (cmd == NIOCTXSYNC) {
1807 				if (netmap_verbose & NM_VERB_TXSYNC)
1808 					D("pre txsync ring %d cur %d hwcur %d",
1809 					    i, kring->ring->cur,
1810 					    kring->nr_hwcur);
1811 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1812 					netmap_ring_reinit(kring);
1813 				} else {
1814 					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
1815 				}
1816 				if (netmap_verbose & NM_VERB_TXSYNC)
1817 					D("post txsync ring %d cur %d hwcur %d",
1818 					    i, kring->ring->cur,
1819 					    kring->nr_hwcur);
1820 			} else {
1821 				kring->nm_sync(kring, NAF_FORCE_READ);
1822 				microtime(&na->rx_rings[i].ring->ts);
1823 			}
1824 			nm_kr_put(kring);
1825 		}
1826 
1827 		break;
1828 
1829 #ifdef __FreeBSD__
1830 	case BIOCIMMEDIATE:
1831 	case BIOCGHDRCMPLT:
1832 	case BIOCSHDRCMPLT:
1833 	case BIOCSSEESENT:
1834 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1835 		break;
1836 
1837 	default:	/* allow device-specific ioctls */
1838 	    {
1839 		struct socket so;
1840 
1841 		bzero(&so, sizeof(so));
1842 		NMG_LOCK();
1843 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1844 		if (error) {
1845 			netmap_adapter_put(na);
1846 			NMG_UNLOCK();
1847 			break;
1848 		}
1849 		ifp = na->ifp;
1850 		so.so_vnet = ifp->if_vnet;
1851 		// XXX so.so_proto is left NULL here.
1852 		error = ifioctl(&so, cmd, data, td);
1853 		netmap_adapter_put(na);
1854 		NMG_UNLOCK();
1855 		break;
1856 	    }
1857 
1858 #else /* linux */
1859 	default:
1860 		error = EOPNOTSUPP;
1861 #endif /* linux */
1862 	}
1863 out:
1864 
1865 	CURVNET_RESTORE();
1866 	return (error);
1867 }
1868 
1869 
1870 /*
1871  * select(2) and poll(2) handlers for the "netmap" device.
1872  *
1873  * Can be called for one or more queues.
1874  * Return the event mask corresponding to ready events.
1875  * If there are no ready events, do a selrecord on either individual
1876  * selinfo or on the global one.
1877  * Device-dependent parts (locking and sync of tx/rx rings)
1878  * are done through callbacks.
1879  *
1880  * On Linux, 'dev' is really the poll table and 'td' is a struct file *.
1881  * The first argument is copied into the local 'pwait' because
1882  * selrecord() uses that name as a hidden argument.
1883  */
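/*
 * Illustrative userspace sketch (not part of this file): waiting for input
 * with poll(2) on a /dev/netmap descriptor that has already been bound with
 * NIOCREGIF and mmap()ed.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
 *		// at least one bound rx ring has slots available between
 *		// ring->head and ring->tail; consume them and advance
 *		// ring->head and ring->cur before the next poll().
 *	}
 */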
1884 int
1885 netmap_poll(struct cdev *dev, int events, struct thread *td)
1886 {
1887 	struct netmap_priv_d *priv = NULL;
1888 	struct netmap_adapter *na;
1889 	struct ifnet *ifp;
1890 	struct netmap_kring *kring;
1891 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1892 	struct mbq q;		/* packets from hw queues to host stack */
1893 	void *pwait = dev;	/* linux compatibility */
1894 	int is_kevent = 0;
1895 
1896 	/*
1897 	 * In order to avoid nested locks, we need to "double check"
1898 	 * txsync and rxsync if we decide to do a selrecord().
1899 	 * retry_tx (and retry_rx, later) prevent looping forever.
1900 	 */
1901 	int retry_tx = 1, retry_rx = 1;
1902 
1903 	(void)pwait;
1904 	mbq_init(&q);
1905 
1906 	/*
1907 	 * XXX kevent has curthread->td_fpop == NULL,
1908 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
1909 	 * priv as the first argument, which is also useful to avoid
1910 	 * the selrecord() calls, which are not necessary in that case.
1911 	 */
1912 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
1913 		is_kevent = 1;
1914 		if (netmap_verbose)
1915 			D("called from kevent");
1916 		priv = (struct netmap_priv_d *)dev;
1917 	}
1918 	if (priv == NULL)
1919 		return POLLERR;
1920 
1921 	if (priv->np_nifp == NULL) {
1922 		D("No if registered");
1923 		return POLLERR;
1924 	}
1925 	rmb(); /* make sure following reads are not from cache */
1926 
1927 	na = priv->np_na;
1928 	ifp = na->ifp;
1929 	// check for deleted
1930 	if (ifp == NULL) {
1931 		RD(1, "the ifp is gone");
1932 		return POLLERR;
1933 	}
1934 
1935 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1936 		return POLLERR;
1937 
1938 	if (netmap_verbose & 0x8000)
1939 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1940 	want_tx = events & (POLLOUT | POLLWRNORM);
1941 	want_rx = events & (POLLIN | POLLRDNORM);
1942 
1943 
1944 	/*
1945 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1946 	 * the file descriptor is bound to all of them. If so, we sleep on
1947 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1948 	 * (FreeBSD only allows two selinfos per file descriptor).
1949 	 * The interrupt routine in the driver wakes one or the other
1950 	 * (or both) depending on which clients are active.
1951 	 *
1952 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1953 	 * txsync() is called if we run out of buffers on POLLOUT, or
1954 	 * there are pending packets to send. The latter can be disabled
1955 	 * passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
1956 	 */
1957 	check_all_tx = nm_tx_si_user(priv);
1958 	check_all_rx = nm_rx_si_user(priv);
1959 
1960 	/*
1961 	 * We start with a lock-free round, which is cheap if we have
1962 	 * slots available. If this fails, then lock and call the sync
1963 	 * routines.
1964 	 */
1965 	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
1966 		kring = &na->rx_rings[i];
1967 		/* XXX compare ring->cur and kring->tail */
1968 		if (!nm_ring_empty(kring->ring)) {
1969 			revents |= want_rx;
1970 			want_rx = 0;	/* also breaks the loop */
1971 		}
1972 	}
1973 	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
1974 		kring = &na->tx_rings[i];
1975 		/* XXX compare ring->cur and kring->tail */
1976 		if (!nm_ring_empty(kring->ring)) {
1977 			revents |= want_tx;
1978 			want_tx = 0;	/* also breaks the loop */
1979 		}
1980 	}
1981 
1982 	/*
1983 	 * If we want to push packets out (priv->np_txpoll) or
1984 	 * want_tx is still set, we must issue txsync calls
1985 	 * (on all rings, to prevent the tx rings from stalling).
1986 	 * XXX should also check cur != hwcur on the tx rings.
1987 	 * Fortunately, normal tx mode has np_txpoll set.
1988 	 */
1989 	if (priv->np_txpoll || want_tx) {
1990 		/*
1991 		 * The first round checks if anyone is ready, if not
1992 		 * do a selrecord and another round to handle races.
1993 		 * want_tx goes to 0 if any space is found, and is
1994 		 * used to skip rings with no pending transmissions.
1995 		 */
1996 flush_tx:
1997 		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
1998 			int found = 0;
1999 
2000 			kring = &na->tx_rings[i];
2001 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2002 				continue;
2003 			/* only one thread does txsync */
2004 			if (nm_kr_tryget(kring)) {
2005 				D("%p lost race on txring %d, ok", priv, i);
2006 				continue;
2007 			}
2008 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2009 				netmap_ring_reinit(kring);
2010 				revents |= POLLERR;
2011 			} else {
2012 				if (kring->nm_sync(kring, 0))
2013 					revents |= POLLERR;
2014 			}
2015 
2016 			/*
2017 			 * If we found new slots, notify potential
2018 			 * listeners on the same ring.
2019 			 * Since we just did a txsync, look at the copies
2020 			 * of cur,tail in the kring.
2021 			 */
2022 			found = kring->rcur != kring->rtail;
2023 			nm_kr_put(kring);
2024 			if (found) { /* notify other listeners */
2025 				revents |= want_tx;
2026 				want_tx = 0;
2027 				na->nm_notify(na, i, NR_TX, 0);
2028 			}
2029 		}
2030 		if (want_tx && retry_tx && !is_kevent) {
2031 			selrecord(td, check_all_tx ?
2032 			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2033 			retry_tx = 0;
2034 			goto flush_tx;
2035 		}
2036 	}
2037 
2038 	/*
2039 	 * If want_rx is still set scan receive rings.
2040 	 * Do it on all rings because otherwise we starve.
2041 	 */
2042 	if (want_rx) {
2043 		int send_down = 0; /* transparent mode */
2044 		/* two rounds here for race avoidance */
2045 do_retry_rx:
2046 		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2047 			int found = 0;
2048 
2049 			kring = &na->rx_rings[i];
2050 
2051 			if (nm_kr_tryget(kring)) {
2052 				D("%p lost race on rxring %d, ok", priv, i);
2053 				continue;
2054 			}
2055 
2056 			/*
2057 			 * transparent mode support: collect packets
2058 			 * from the rxring(s).
2059 			 * XXX NR_FORWARD should only be read on
2060 			 * physical or NIC ports
2061 			 */
2062 			if (netmap_fwd || (kring->ring->flags & NR_FORWARD)) {
2063 				ND(10, "forwarding some buffers up %d to %d",
2064 				    kring->nr_hwcur, kring->ring->cur);
2065 				netmap_grab_packets(kring, &q, netmap_fwd);
2066 			}
2067 
2068 			if (kring->nm_sync(kring, 0))
2069 				revents |= POLLERR;
2070 			if (netmap_no_timestamp == 0 ||
2071 					kring->ring->flags & NR_TIMESTAMP) {
2072 				microtime(&kring->ring->ts);
2073 			}
2074 			/* after an rxsync we can use kring->rcur, rtail */
2075 			found = kring->rcur != kring->rtail;
2076 			nm_kr_put(kring);
2077 			if (found) {
2078 				revents |= want_rx;
2079 				retry_rx = 0;
2080 				na->nm_notify(na, i, NR_RX, 0);
2081 			}
2082 		}
2083 
2084 		/* transparent mode XXX only during first pass ? */
2085 		if (na->na_flags & NAF_HOST_RINGS) {
2086 			kring = &na->rx_rings[na->num_rx_rings];
2087 			if (check_all_rx
2088 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2089 				/* XXX fix to use kring fields */
2090 				if (nm_ring_empty(kring->ring))
2091 					send_down = netmap_rxsync_from_host(na, td, dev);
2092 				if (!nm_ring_empty(kring->ring))
2093 					revents |= want_rx;
2094 			}
2095 		}
2096 
2097 		if (retry_rx && !is_kevent)
2098 			selrecord(td, check_all_rx ?
2099 			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2100 		if (send_down > 0 || retry_rx) {
2101 			retry_rx = 0;
2102 			if (send_down)
2103 				goto flush_tx; /* and retry_rx */
2104 			else
2105 				goto do_retry_rx;
2106 		}
2107 	}
2108 
2109 	/*
2110 	 * Transparent mode: marked bufs on rx rings between
2111 	 * kring->nr_hwcur and ring->head
2112 	 * are passed to the other endpoint.
2113 	 *
2114 	 * In this mode we also scan the sw rxring, which in
2115 	 * turn passes packets up.
2116 	 *
2117 	 * XXX Transparent mode at the moment requires binding all
2118 	 * rings to a single file descriptor.
2119 	 */
2120 
2121 	if (q.head)
2122 		netmap_send_up(na->ifp, &q);
2123 
2124 	return (revents);
2125 }
2126 
2127 
2128 /*-------------------- driver support routines -------------------*/
2129 
2130 static int netmap_hw_krings_create(struct netmap_adapter *);
2131 
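/*
 * Default notify callback: wake up the threads selecting/polling on the
 * ring-specific selinfo and, when some file descriptor is bound to more
 * than one ring, also on the adapter-wide one.
 */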
2132 static int
2133 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2134 	enum txrx tx, int flags)
2135 {
2136 	struct netmap_kring *kring;
2137 
2138 	if (tx == NR_TX) {
2139 		kring = na->tx_rings + n_ring;
2140 		OS_selwakeup(&kring->si, PI_NET);
2141 		if (na->tx_si_users > 0)
2142 			OS_selwakeup(&na->tx_si, PI_NET);
2143 	} else {
2144 		kring = na->rx_rings + n_ring;
2145 		OS_selwakeup(&kring->si, PI_NET);
2146 		if (na->rx_si_users > 0)
2147 			OS_selwakeup(&na->rx_si, PI_NET);
2148 	}
2149 	return 0;
2150 }
2151 
2152 
2153 // XXX check handling of failures
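/*
 * Common initialization of a netmap_adapter: validate the ring counts,
 * link the adapter to its ifnet and mark the latter as netmap-capable,
 * install the default kring constructor/destructor and notify callbacks
 * unless the driver provided its own, and fall back to the global memory
 * allocator when no private one is set.
 */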
2154 int
2155 netmap_attach_common(struct netmap_adapter *na)
2156 {
2157 	struct ifnet *ifp = na->ifp;
2158 
2159 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2160 		D("%s: invalid rings tx %d rx %d",
2161 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
2162 		return EINVAL;
2163 	}
2164 	WNA(ifp) = na;
2165 
2166 	/* the following is only needed for adapters that use the host port.
2167 	 * XXX do we have something similar for linux ?
2168 	 */
2169 #ifdef __FreeBSD__
2170 	na->if_input = ifp->if_input; /* for netmap_send_up */
2171 #endif /* __FreeBSD__ */
2172 
2173 	NETMAP_SET_CAPABLE(ifp);
2174 	if (na->nm_krings_create == NULL) {
2175 		na->nm_krings_create = netmap_hw_krings_create;
2176 		na->nm_krings_delete = netmap_hw_krings_delete;
2177 	}
2178 	if (na->nm_notify == NULL)
2179 		na->nm_notify = netmap_notify;
2180 	na->active_fds = 0;
2181 
2182 	if (na->nm_mem == NULL)
2183 		na->nm_mem = &nm_mem;
2184 	return 0;
2185 }
2186 
2187 
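/*
 * Undo netmap_attach_common(): unlink the adapter from its ifnet, free any
 * leftover krings and pipes, release a privately owned memory allocator,
 * and finally free the adapter itself.
 */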
2188 void
2189 netmap_detach_common(struct netmap_adapter *na)
2190 {
2191 	if (na->ifp)
2192 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2193 
2194 	if (na->tx_rings) { /* XXX should not happen */
2195 		D("freeing leftover tx_rings");
2196 		na->nm_krings_delete(na);
2197 	}
2198 	netmap_pipe_dealloc(na);
2199 	if (na->na_flags & NAF_MEM_OWNER)
2200 		netmap_mem_private_delete(na->nm_mem);
2201 	bzero(na, sizeof(*na));
2202 	free(na, M_DEVBUF);
2203 }
2204 
2205 
2206 /*
2207  * Initialize a ``netmap_adapter`` object created by driver on attach.
2208  * We allocate a block of memory with room for a struct netmap_adapter
2209  * plus two sets of N+2 struct netmap_kring (where N is the number
2210  * of hardware rings):
2211  * krings	0..N-1	are for the hardware queues.
2212  * kring	N	is for the host stack queue
2213  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2214  * Return 0 on success, ENOMEM otherwise.
2215  */
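/*
 * Illustrative driver-side sketch (not part of this file): how a NIC driver
 * typically fills a netmap_adapter before calling netmap_attach(). The
 * 'adapter' softc fields and the foo_netmap_{txsync,rxsync,reg} callbacks
 * are hypothetical names; the per-driver if_*_netmap.h glue follows this
 * pattern.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = adapter->ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	netmap_attach(&na);
 */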
2216 int
2217 netmap_attach(struct netmap_adapter *arg)
2218 {
2219 	struct netmap_hw_adapter *hwna = NULL;
2220 	// XXX when is arg == NULL ?
2221 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2222 
2223 	if (arg == NULL || ifp == NULL)
2224 		goto fail;
2225 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2226 	if (hwna == NULL)
2227 		goto fail;
2228 	hwna->up = *arg;
2229 	hwna->up.na_flags |= NAF_HOST_RINGS;
2230 	if (netmap_attach_common(&hwna->up)) {
2231 		free(hwna, M_DEVBUF);
2232 		goto fail;
2233 	}
2234 	netmap_adapter_get(&hwna->up);
2235 
2236 #ifdef linux
2237 	if (ifp->netdev_ops) {
2238 		/* prepare a clone of the netdev ops */
2239 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2240 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2241 #else
2242 		hwna->nm_ndo = *ifp->netdev_ops;
2243 #endif
2244 	}
2245 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2246 #endif /* linux */
2247 
2248 	D("success for %s", NM_IFPNAME(ifp));
2249 	return 0;
2250 
2251 fail:
2252 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2253 	netmap_detach(ifp);
2254 	return (hwna ? EINVAL : ENOMEM);
2255 }
2256 
2257 
2258 void
2259 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2260 {
2261 	if (!na) {
2262 		return;
2263 	}
2264 
2265 	refcount_acquire(&na->na_refcount);
2266 }
2267 
2268 
2269 /* returns 1 iff the netmap_adapter is destroyed */
2270 int
2271 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2272 {
2273 	if (!na)
2274 		return 1;
2275 
2276 	if (!refcount_release(&na->na_refcount))
2277 		return 0;
2278 
2279 	if (na->nm_dtor)
2280 		na->nm_dtor(na);
2281 
2282 	netmap_detach_common(na);
2283 
2284 	return 1;
2285 }
2286 
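/*
 * Default kring constructor for hardware adapters: create the standard
 * krings and additionally initialize the mbuf queue backing the host
 * (software) rx ring.
 */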
2287 int
2288 netmap_hw_krings_create(struct netmap_adapter *na)
2289 {
2290 	int ret = netmap_krings_create(na, 0);
2291 	if (ret == 0) {
2292 		/* initialize the mbq for the sw rx ring */
2293 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2294 		ND("initialized sw rx queue %d", na->num_rx_rings);
2295 	}
2296 	return ret;
2297 }
2298 
2299 
2300 
2301 /*
2302  * Free the allocated memory linked to the given ``netmap_adapter``
2303  * object.
2304  */
2305 void
2306 netmap_detach(struct ifnet *ifp)
2307 {
2308 	struct netmap_adapter *na = NA(ifp);
2309 
2310 	if (!na)
2311 		return;
2312 
2313 	NMG_LOCK();
2314 	netmap_disable_all_rings(ifp);
2315 	if (!netmap_adapter_put(na)) {
2316 		/* someone is still using the adapter,
2317 		 * tell them that the interface is gone
2318 		 */
2319 		na->ifp = NULL;
2320 		/* give them a chance to notice */
2321 		netmap_enable_all_rings(ifp);
2322 	}
2323 	NMG_UNLOCK();
2324 }
2325 
2326 
2327 /*
2328  * Intercept packets from the network stack and pass them
2329  * to netmap as incoming packets on the 'software' ring.
2330  *
2331  * We only store packets in a bounded mbq and then copy them
2332  * in the relevant rxsync routine.
2333  *
2334  * We rely on the OS to make sure that the ifp and na do not go
2335  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2336  * In nm_register() or whenever there is a reinitialization,
2337  * we make sure to make the mode change visible here.
2338  */
2339 int
2340 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2341 {
2342 	struct netmap_adapter *na = NA(ifp);
2343 	struct netmap_kring *kring;
2344 	u_int len = MBUF_LEN(m);
2345 	u_int error = ENOBUFS;
2346 	struct mbq *q;
2347 	int space;
2348 
2349 	// XXX [Linux] we do not need this lock
2350 	// if we follow the down/configure/up protocol -gl
2351 	// mtx_lock(&na->core_lock);
2352 
2353 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2354 		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
2355 		error = ENXIO;
2356 		goto done;
2357 	}
2358 
2359 	kring = &na->rx_rings[na->num_rx_rings];
2360 	q = &kring->rx_queue;
2361 
2362 	// XXX reconsider long packets if we handle fragments
2363 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2364 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2365 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2366 		goto done;
2367 	}
2368 
2369 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2370 	 * and maybe other instances of netmap_transmit (the latter
2371 	 * not possible on Linux).
2372 	 * Also avoid overflowing the queue.
2373 	 */
2374 	mtx_lock(&q->lock);
2375 
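	/* 'space' counts the slots already pending in the host rx ring
	 * (hwcur..hwtail, modulo the ring size); together with the mbufs
	 * already queued it must stay below the ring capacity, otherwise
	 * the packet is dropped.
	 */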
2376 	space = kring->nr_hwtail - kring->nr_hwcur;
2377 	if (space < 0)
2378 		space += kring->nkr_num_slots;
2379 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2380 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2381 			 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2382 			len, m);
2383 	} else {
2384 		mbq_enqueue(q, m);
2385 		ND(10, "%s %d bufs in queue len %d m %p",
2386 			NM_IFPNAME(ifp), mbq_len(q), len, m);
2387 		/* notify outside the lock */
2388 		m = NULL;
2389 		error = 0;
2390 	}
2391 	mtx_unlock(&q->lock);
2392 
2393 done:
2394 	if (m)
2395 		m_freem(m);
2396 	/* unconditionally wake up listeners */
2397 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2398 
2399 	return (error);
2400 }
2401 
2402 
2403 /*
2404  * netmap_reset() is called by the driver routines when reinitializing
2405  * a ring. The driver is in charge of locking to protect the kring.
2406  * If native netmap mode is not set just return NULL.
2407  */
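/*
 * Illustrative driver-side sketch (not part of this file): the ring
 * (re)initialization path of a driver calls netmap_reset() and, if a slot
 * array is returned, programs the NIC descriptors from the netmap buffers
 * instead of allocating its own. 'ring_nr' is a hypothetical queue index.
 *
 *	struct netmap_slot *slot = netmap_reset(NA(ifp), NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {
 *		// the ring is in netmap mode: point the NIC rx descriptors
 *		// at the netmap buffers referenced by slot[] instead of
 *		// freshly allocated mbufs (driver-specific code omitted).
 *	}
 */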
2408 struct netmap_slot *
2409 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2410 	u_int new_cur)
2411 {
2412 	struct netmap_kring *kring;
2413 	int new_hwofs, lim;
2414 
2415 	if (na == NULL) {
2416 		D("NULL na, should not happen");
2417 		return NULL;	/* no netmap support here */
2418 	}
2419 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
2420 		ND("interface not in netmap mode");
2421 		return NULL;	/* nothing to reinitialize */
2422 	}
2423 
2424 	/* XXX note: in the new scheme, we are not guaranteed to be
2425 	 * under lock (e.g. when called on a device reset).
2426 	 * In this case, we should set a flag and not trust the
2427 	 * values too much. In practice: TODO
2428 	 * - set a RESET flag somewhere in the kring
2429 	 * - do the processing in a conservative way
2430 	 * - let the *sync() routines fix things up at the end.
2431 	 */
2432 	if (tx == NR_TX) {
2433 		if (n >= na->num_tx_rings)
2434 			return NULL;
2435 		kring = na->tx_rings + n;
2436 		// XXX check whether we should use hwcur or rcur
2437 		new_hwofs = kring->nr_hwcur - new_cur;
2438 	} else {
2439 		if (n >= na->num_rx_rings)
2440 			return NULL;
2441 		kring = na->rx_rings + n;
2442 		new_hwofs = kring->nr_hwtail - new_cur;
2443 	}
2444 	lim = kring->nkr_num_slots - 1;
2445 	if (new_hwofs > lim)
2446 		new_hwofs -= lim + 1;
2447 
2448 	/* Always set the new offset value and realign the ring. */
2449 	if (netmap_verbose)
2450 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2451 		NM_IFPNAME(na->ifp),
2452 		tx == NR_TX ? "TX" : "RX", n,
2453 		kring->nkr_hwofs, new_hwofs,
2454 		kring->nr_hwtail,
2455 		tx == NR_TX ? lim : kring->nr_hwtail);
2456 	kring->nkr_hwofs = new_hwofs;
2457 	if (tx == NR_TX) {
2458 		kring->nr_hwtail = kring->nr_hwcur + lim;
2459 		if (kring->nr_hwtail > lim)
2460 			kring->nr_hwtail -= lim + 1;
2461 	}
2462 
2463 #if 0 // def linux
2464 	/* XXX check that the mappings are correct */
2465 	/* need ring_nr, adapter->pdev, direction */
2466 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2467 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2468 		D("error mapping rx netmap buffer %d", i);
2469 		// XXX fix error handling
2470 	}
2471 
2472 #endif /* linux */
2473 	/*
2474 	 * Wakeup on the individual and global selwait
2475 	 * We do the wakeup here, but the ring is not yet reconfigured.
2476 	 * However, we are under lock so there are no races.
2477 	 */
2478 	na->nm_notify(na, n, tx, 0);
2479 	return kring->ring->slot;
2480 }
2481 
2482 
2483 /*
2484  * Dispatch rx/tx interrupts to the netmap rings.
2485  *
2486  * "work_done" is non-null on the RX path, NULL for the TX path.
2487  * We rely on the OS to make sure that there is only one active
2488  * instance per queue, and that there is appropriate locking.
2489  *
2490  * The 'notify' routine depends on what the ring is attached to.
2491  * - for a netmap file descriptor, do a selwakeup on the individual
2492  *   waitqueue, plus one on the global one if needed
2493  * - for a switch, call the proper forwarding routine
2494  * - XXX more ?
2495  */
2496 void
2497 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2498 {
2499 	struct netmap_adapter *na = NA(ifp);
2500 	struct netmap_kring *kring;
2501 
2502 	q &= NETMAP_RING_MASK;
2503 
2504 	if (netmap_verbose) {
2505 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2506 	}
2507 
2508 	if (work_done) { /* RX path */
2509 		if (q >= na->num_rx_rings)
2510 			return;	// not a physical queue
2511 		kring = na->rx_rings + q;
2512 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2513 		na->nm_notify(na, q, NR_RX, 0);
2514 		*work_done = 1; /* do not fire napi again */
2515 	} else { /* TX path */
2516 		if (q >= na->num_tx_rings)
2517 			return;	// not a physical queue
2518 		kring = na->tx_rings + q;
2519 		na->nm_notify(na, q, NR_TX, 0);
2520 	}
2521 }
2522 
2523 
2524 /*
2525  * Default functions to handle rx/tx interrupts from a physical device.
2526  * "work_done" is non-null on the RX path, NULL for the TX path.
2527  *
2528  * If the card is not in netmap mode, simply return 0,
2529  * so that the caller proceeds with regular processing.
2530  * Otherwise call netmap_common_irq() and return 1.
2531  *
2532  * If the card is connected to a netmap file descriptor,
2533  * do a selwakeup on the individual queue, plus one on the global one
2534  * if needed (multiqueue card _and_ there are multiqueue listeners),
2535  * and return 1.
2536  *
2537  * Finally, if called on rx from an interface connected to a switch,
2538  * it calls the proper forwarding routine and returns 1.
2539  */
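/*
 * Illustrative driver-side sketch (not part of this file): an rx interrupt
 * handler tries netmap first and skips its regular processing when the
 * queue is in netmap mode. 'rxr->me' is a hypothetical queue index.
 *
 *	u_int work_done = 0;
 *
 *	if (netmap_rx_irq(ifp, rxr->me, &work_done))
 *		return;		// handled by netmap
 *	// ... regular mbuf-based rx processing follows ...
 */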
2540 int
2541 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2542 {
2543 	// XXX could we check NAF_NATIVE_ON ?
2544 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2545 		return 0;
2546 
2547 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2548 		ND("use regular interrupt");
2549 		return 0;
2550 	}
2551 
2552 	netmap_common_irq(ifp, q, work_done);
2553 	return 1;
2554 }
2555 
2556 
2557 /*
2558  * Module loader and unloader
2559  *
2560  * netmap_init() creates the /dev/netmap device and initializes
2561  * all global variables. Returns 0 on success, errno on failure
2562  * (though in practice failure is not expected).
2563  *
2564  * netmap_fini() destroys everything.
2565  */
2566 
2567 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2568 extern struct cdevsw netmap_cdevsw;
2569 
2570 
2571 void
2572 netmap_fini(void)
2573 {
2574 	// XXX destroy_bridges() ?
2575 	if (netmap_dev)
2576 		destroy_dev(netmap_dev);
2577 	netmap_mem_fini();
2578 	NMG_LOCK_DESTROY();
2579 	printf("netmap: unloaded module.\n");
2580 }
2581 
2582 
2583 int
2584 netmap_init(void)
2585 {
2586 	int error;
2587 
2588 	NMG_LOCK_INIT();
2589 
2590 	error = netmap_mem_init();
2591 	if (error != 0)
2592 		goto fail;
2593 	/* XXX could use make_dev_credv() to get error number */
2594 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2595 			      "netmap");
2596 	if (!netmap_dev)
2597 		goto fail;
2598 
2599 	netmap_init_bridges();
2600 	printf("netmap: loaded module\n");
2601 	return (0);
2602 fail:
2603 	netmap_fini();
2604 	return (EINVAL); /* may be incorrect */
2605 }
2606