xref: /freebsd-12.1/sys/dev/netmap/netmap.c (revision 2c6d18eb)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large, memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    select()able file descriptor on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   protecting multiple active senders for the same destination)
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(), this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring or deleting a new port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur in a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slot in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
127 /*
128  * OS-specific code that is used only within this file.
129  * Other OS-specific code that must be accessed by drivers
130  * is present in netmap_kern.h
131  */
132 
133 #if defined(__FreeBSD__)
134 #include <sys/cdefs.h> /* prerequisite */
135 #include <sys/types.h>
136 #include <sys/errno.h>
137 #include <sys/param.h>	/* defines used in kernel.h */
138 #include <sys/kernel.h>	/* types used in module initialization */
139 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
140 #include <sys/sockio.h>
141 #include <sys/socketvar.h>	/* struct socket */
142 #include <sys/malloc.h>
143 #include <sys/poll.h>
144 #include <sys/rwlock.h>
145 #include <sys/socket.h> /* sockaddrs */
146 #include <sys/selinfo.h>
147 #include <sys/sysctl.h>
148 #include <net/if.h>
149 #include <net/if_var.h>
150 #include <net/bpf.h>		/* BIOCIMMEDIATE */
151 #include <machine/bus.h>	/* bus_dmamap_* */
152 #include <sys/endian.h>
153 #include <sys/refcount.h>
154 
155 
156 /* reduce conditional code */
157 #define init_waitqueue_head(x)	// only needed in linux
158 
159 
160 
161 #elif defined(linux)
162 
163 #include "bsd_glue.h"
164 
165 
166 
167 #elif defined(__APPLE__)
168 
169 #warning OSX support is only partial
170 #include "osx_glue.h"
171 
172 #else
173 
174 #error	Unsupported platform
175 
176 #endif /* unsupported */
177 
178 /*
179  * common headers
180  */
181 #include <net/netmap.h>
182 #include <dev/netmap/netmap_kern.h>
183 #include <dev/netmap/netmap_mem2.h>
184 
185 
186 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
187 
188 /*
189  * The following variables are used by the drivers and replicate
190  * fields in the global memory pool. They only refer to buffers
191  * used by physical interfaces.
192  */
193 u_int netmap_total_buffers;
194 u_int netmap_buf_size;
195 char *netmap_buffer_base;	/* also address of an invalid buffer */
196 
197 /* user-controlled variables */
198 int netmap_verbose;
199 
200 static int netmap_no_timestamp; /* don't timestamp on rxsync */
201 
202 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
203 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
204     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
205 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
206     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
207 int netmap_mitigate = 1;
208 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
209 int netmap_no_pendintr = 1;
210 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
211     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
212 int netmap_txsync_retry = 2;
213 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
214     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
215 
216 int netmap_flags = 0;	/* debug flags */
217 int netmap_fwd = 0;	/* force transparent mode */
218 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
219 
220 /*
221  * netmap_admode selects the netmap mode to use.
222  * Invalid values are reset to NETMAP_ADMODE_BEST
223  */
224 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
225 	NETMAP_ADMODE_NATIVE,	/* either native or none */
226 	NETMAP_ADMODE_GENERIC,	/* force generic */
227 	NETMAP_ADMODE_LAST };
228 static int netmap_admode = NETMAP_ADMODE_BEST;
229 
230 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
231 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
232 
233 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
234 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
235 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
236 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
237 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
238 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
239 
240 NMG_LOCK_T	netmap_global_lock;
241 
242 
243 static void
244 nm_kr_get(struct netmap_kring *kr)
245 {
246 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
247 		tsleep(kr, 0, "NM_KR_GET", 4);
248 }
249 
250 
251 /*
252  * mark the ring as stopped, and run through the locks
253  * to make sure other users get to see it.
254  */
255 void
256 netmap_disable_ring(struct netmap_kring *kr)
257 {
258 	kr->nkr_stopped = 1;
259 	nm_kr_get(kr);
260 	mtx_lock(&kr->q_lock);
261 	mtx_unlock(&kr->q_lock);
262 	nm_kr_put(kr);
263 }
264 
265 
266 static void
267 netmap_set_all_rings(struct ifnet *ifp, int stopped)
268 {
269 	struct netmap_adapter *na;
270 	int i;
271 
272 	if (!(ifp->if_capenable & IFCAP_NETMAP))
273 		return;
274 
275 	na = NA(ifp);
276 
277 	for (i = 0; i <= na->num_tx_rings; i++) {
278 		if (stopped)
279 			netmap_disable_ring(na->tx_rings + i);
280 		else
281 			na->tx_rings[i].nkr_stopped = 0;
282 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
283 			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
284 	}
285 
286 	for (i = 0; i <= na->num_rx_rings; i++) {
287 		if (stopped)
288 			netmap_disable_ring(na->rx_rings + i);
289 		else
290 			na->rx_rings[i].nkr_stopped = 0;
291 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
292 			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
293 	}
294 }
295 
296 
297 void
298 netmap_disable_all_rings(struct ifnet *ifp)
299 {
300 	netmap_set_all_rings(ifp, 1 /* stopped */);
301 }
302 
303 
304 void
305 netmap_enable_all_rings(struct ifnet *ifp)
306 {
307 	netmap_set_all_rings(ifp, 0 /* enabled */);
308 }
309 
310 
311 /*
312  * generic bound_checking function
313  */
314 u_int
315 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
316 {
317 	u_int oldv = *v;
318 	const char *op = NULL;
319 
320 	if (dflt < lo)
321 		dflt = lo;
322 	if (dflt > hi)
323 		dflt = hi;
324 	if (oldv < lo) {
325 		*v = dflt;
326 		op = "Bump";
327 	} else if (oldv > hi) {
328 		*v = hi;
329 		op = "Clamp";
330 	}
331 	if (op && msg)
332 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
333 	return *v;
334 }
335 
336 
337 /*
338  * packet-dump function, user-supplied or static buffer.
339  * The destination buffer must be at least 30+4*len
340  */
341 const char *
342 nm_dump_buf(char *p, int len, int lim, char *dst)
343 {
344 	static char _dst[8192];
345 	int i, j, i0;
346 	static char hex[] ="0123456789abcdef";
347 	char *o;	/* output position */
348 
349 #define P_HI(x)	hex[((x) & 0xf0)>>4]
350 #define P_LO(x)	hex[((x) & 0xf)]
351 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
352 	if (!dst)
353 		dst = _dst;
354 	if (lim <= 0 || lim > len)
355 		lim = len;
356 	o = dst;
357 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
358 	o += strlen(o);
359 	/* hexdump routine */
360 	for (i = 0; i < lim; ) {
361 		sprintf(o, "%5d: ", i);
362 		o += strlen(o);
363 		memset(o, ' ', 48);
364 		i0 = i;
365 		for (j=0; j < 16 && i < lim; i++, j++) {
366 			o[j*3] = P_HI(p[i]);
367 			o[j*3+1] = P_LO(p[i]);
368 		}
369 		i = i0;
370 		for (j=0; j < 16 && i < lim; i++, j++)
371 			o[j + 48] = P_C(p[i]);
372 		o[j+48] = '\n';
373 		o += j+49;
374 	}
375 	*o = '\0';
376 #undef P_HI
377 #undef P_LO
378 #undef P_C
379 	return dst;
380 }
381 
382 
383 /*
384  * Fetch configuration from the device, to cope with dynamic
385  * reconfigurations after loading the module.
386  */
387 int
388 netmap_update_config(struct netmap_adapter *na)
389 {
390 	struct ifnet *ifp = na->ifp;
391 	u_int txr, txd, rxr, rxd;
392 
393 	txr = txd = rxr = rxd = 0;
394 	if (na->nm_config) {
395 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
396 	} else {
397 		/* take whatever we had at init time */
398 		txr = na->num_tx_rings;
399 		txd = na->num_tx_desc;
400 		rxr = na->num_rx_rings;
401 		rxd = na->num_rx_desc;
402 	}
403 
404 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
405 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
406 		return 0; /* nothing changed */
407 	if (netmap_verbose || na->active_fds > 0) {
408 		D("stored config %s: txring %d x %d, rxring %d x %d",
409 			NM_IFPNAME(ifp),
410 			na->num_tx_rings, na->num_tx_desc,
411 			na->num_rx_rings, na->num_rx_desc);
412 		D("new config %s: txring %d x %d, rxring %d x %d",
413 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
414 	}
415 	if (na->active_fds == 0) {
416 		D("configuration changed (but fine)");
417 		na->num_tx_rings = txr;
418 		na->num_tx_desc = txd;
419 		na->num_rx_rings = rxr;
420 		na->num_rx_desc = rxd;
421 		return 0;
422 	}
423 	D("configuration changed while active, this is bad...");
424 	return 1;
425 }
426 
427 
428 int
429 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
430 {
431 	u_int i, len, ndesc;
432 	struct netmap_kring *kring;
433 
434 	// XXX additional space for extra rings ?
435 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
436 
437 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
438 	if (na->tx_rings == NULL) {
439 		D("Cannot allocate krings");
440 		return ENOMEM;
441 	}
442 	na->rx_rings = na->tx_rings + ntx;
443 
444 	/*
445 	 * All fields in krings are 0 except the one initialized below.
446 	 * but better be explicit on important kring fields.
447 	 */
448 	ndesc = na->num_tx_desc;
449 	for (i = 0; i < ntx; i++) { /* Transmit rings */
450 		kring = &na->tx_rings[i];
451 		bzero(kring, sizeof(*kring));
452 		kring->na = na;
453 		kring->ring_id = i;
454 		kring->nkr_num_slots = ndesc;
455 		/*
456 		 * IMPORTANT: Always keep one slot empty.
457 		 */
458 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
459 		kring->rtail = kring->nr_hwtail = ndesc - 1;
460 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
461 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
462 		init_waitqueue_head(&kring->si);
463 	}
464 
465 	ndesc = na->num_rx_desc;
466 	for (i = 0; i < nrx; i++) { /* Receive rings */
467 		kring = &na->rx_rings[i];
468 		bzero(kring, sizeof(*kring));
469 		kring->na = na;
470 		kring->ring_id = i;
471 		kring->nkr_num_slots = ndesc;
472 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
473 		kring->rtail = kring->nr_hwtail = 0;
474 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
475 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
476 		init_waitqueue_head(&kring->si);
477 	}
478 	init_waitqueue_head(&na->tx_si);
479 	init_waitqueue_head(&na->rx_si);
480 
481 	na->tailroom = na->rx_rings + nrx;
482 
483 	return 0;
484 }
485 
486 
487 /* XXX check boundaries */
488 void
489 netmap_krings_delete(struct netmap_adapter *na)
490 {
491 	int i;
492 
493 	for (i = 0; i < na->num_tx_rings + 1; i++) {
494 		mtx_destroy(&na->tx_rings[i].q_lock);
495 	}
496 	for (i = 0; i < na->num_rx_rings + 1; i++) {
497 		mtx_destroy(&na->rx_rings[i].q_lock);
498 	}
499 	free(na->tx_rings, M_DEVBUF);
500 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
501 }
502 
503 
504 /*
505  * Destructor for NIC ports. They also have an mbuf queue
506  * on the rings connected to the host so we need to purge
507  * them first.
508  */
509 static void
510 netmap_hw_krings_delete(struct netmap_adapter *na)
511 {
512 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
513 
514 	ND("destroy sw mbq with len %d", mbq_len(q));
515 	mbq_purge(q);
516 	mbq_safe_destroy(q);
517 	netmap_krings_delete(na);
518 }
519 
520 
521 static struct netmap_if*
522 netmap_if_new(const char *ifname, struct netmap_adapter *na)
523 {
524 	struct netmap_if *nifp;
525 
526 	if (netmap_update_config(na)) {
527 		/* configuration mismatch, report and fail */
528 		return NULL;
529 	}
530 
531 	if (na->active_fds)
532 		goto final;
533 
534 	if (na->nm_krings_create(na))
535 		goto cleanup;
536 
537 	if (netmap_mem_rings_create(na))
538 		goto cleanup;
539 
540 final:
541 
542 	nifp = netmap_mem_if_new(ifname, na);
543 	if (nifp == NULL)
544 		goto cleanup;
545 
546 	return (nifp);
547 
548 cleanup:
549 
550 	if (na->active_fds == 0) {
551 		netmap_mem_rings_delete(na);
552 		na->nm_krings_delete(na);
553 	}
554 
555 	return NULL;
556 }
557 
558 
559 /* grab a reference to the memory allocator, if we don't have one already.  The
560  * reference is taken from the netmap_adapter registered with the priv.
561  *
562  */
563 static int
564 netmap_get_memory_locked(struct netmap_priv_d* p)
565 {
566 	struct netmap_mem_d *nmd;
567 	int error = 0;
568 
569 	if (p->np_na == NULL) {
570 		if (!netmap_mmap_unreg)
571 			return ENODEV;
572 		/* for compatibility with older versions of the API
573  		 * we use the global allocator when no interface has been
574  		 * registered
575  		 */
576 		nmd = &nm_mem;
577 	} else {
578 		nmd = p->np_na->nm_mem;
579 	}
580 	if (p->np_mref == NULL) {
581 		error = netmap_mem_finalize(nmd);
582 		if (!error)
583 			p->np_mref = nmd;
584 	} else if (p->np_mref != nmd) {
585 		/* a virtual port has been registered, but previous
586  		 * syscalls already used the global allocator.
587  		 * We cannot continue
588  		 */
589 		error = ENODEV;
590 	}
591 	return error;
592 }
593 
594 
595 int
596 netmap_get_memory(struct netmap_priv_d* p)
597 {
598 	int error;
599 	NMG_LOCK();
600 	error = netmap_get_memory_locked(p);
601 	NMG_UNLOCK();
602 	return error;
603 }
604 
605 
606 static int
607 netmap_have_memory_locked(struct netmap_priv_d* p)
608 {
609 	return p->np_mref != NULL;
610 }
611 
612 
613 static void
614 netmap_drop_memory_locked(struct netmap_priv_d* p)
615 {
616 	if (p->np_mref) {
617 		netmap_mem_deref(p->np_mref);
618 		p->np_mref = NULL;
619 	}
620 }
621 
622 
623 /*
624  * File descriptor's private data destructor.
625  *
626  * Call nm_register(ifp,0) to stop netmap mode on the interface and
627  * revert to normal operation. We expect that np_na->ifp has not gone.
628  * The second argument is the nifp to work on. In some cases it is
629  * not attached yet to the netmap_priv_d so we need to pass it as
630  * a separate argument.
631  */
632 /* call with NMG_LOCK held */
633 static void
634 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
635 {
636 	struct netmap_adapter *na = priv->np_na;
637 	struct ifnet *ifp = na->ifp;
638 
639 	NMG_LOCK_ASSERT();
640 	na->active_fds--;
641 	if (na->active_fds <= 0) {	/* last instance */
642 
643 		if (netmap_verbose)
644 			D("deleting last instance for %s", NM_IFPNAME(ifp));
645 		/*
646 		 * (TO CHECK) This function is only called
647 		 * when the last reference to this file descriptor goes
648 		 * away. This means we cannot have any pending poll()
649 		 * or interrupt routine operating on the structure.
650 		 * XXX The file may be closed in a thread while
651 		 * another thread is using it.
652 		 * Linux keeps the file opened until the last reference
653 		 * by any outstanding ioctl/poll or mmap is gone.
654 		 * FreeBSD does not track mmap()s (but we do) and
655 		 * wakes up any sleeping poll(). Need to check what
656 		 * happens if the close() occurs while a concurrent
657 		 * syscall is running.
658 		 */
659 		if (ifp)
660 			na->nm_register(na, 0); /* off, clear flags */
661 		/* Wake up any sleeping threads. netmap_poll will
662 		 * then return POLLERR
663 		 * XXX The wake up now must happen during *_down(), when
664 		 * we order all activities to stop. -gl
665 		 */
666 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
667 		/* knlist_destroy(&na->tx_si.si_note); */
668 		/* knlist_destroy(&na->rx_si.si_note); */
669 
670 		/* delete rings and buffers */
671 		netmap_mem_rings_delete(na);
672 		na->nm_krings_delete(na);
673 	}
674 	/* delete the nifp */
675 	netmap_mem_if_delete(na, nifp);
676 }
677 
678 
679 /*
680  * returns 1 if this is the last instance and we can free priv
681  */
682 int
683 netmap_dtor_locked(struct netmap_priv_d *priv)
684 {
685 	struct netmap_adapter *na = priv->np_na;
686 
687 #ifdef __FreeBSD__
688 	/*
689 	 * np_refcount is the number of active mmaps on
690 	 * this file descriptor
691 	 */
692 	if (--priv->np_refcount > 0) {
693 		return 0;
694 	}
695 #endif /* __FreeBSD__ */
696 	if (!na) {
697 	    return 1; //XXX is it correct?
698 	}
699 	netmap_do_unregif(priv, priv->np_nifp);
700 	priv->np_nifp = NULL;
701 	netmap_drop_memory_locked(priv);
702 	if (priv->np_na) {
703 		netmap_adapter_put(na);
704 		priv->np_na = NULL;
705 	}
706 	return 1;
707 }
708 
709 
710 void
711 netmap_dtor(void *data)
712 {
713 	struct netmap_priv_d *priv = data;
714 	int last_instance;
715 
716 	NMG_LOCK();
717 	last_instance = netmap_dtor_locked(priv);
718 	NMG_UNLOCK();
719 	if (last_instance) {
720 		bzero(priv, sizeof(*priv));	/* for safety */
721 		free(priv, M_DEVBUF);
722 	}
723 }
724 
725 
726 
727 
728 /*
729  * Handlers for synchronization of the queues from/to the host.
730  * Netmap has two operating modes:
731  * - in the default mode, the rings connected to the host stack are
732  *   just another ring pair managed by userspace;
733  * - in transparent mode (XXX to be defined) incoming packets
734  *   (from the host or the NIC) are marked as NS_FORWARD upon
735  *   arrival, and the user application has a chance to reset the
736  *   flag for packets that should be dropped.
737  *   On the RXSYNC or poll(), packets in RX rings between
738  *   kring->nr_kcur and ring->cur with NS_FORWARD still set are moved
739  *   to the other side.
740  * The transfer NIC --> host is relatively easy, just encapsulate
741  * into mbufs and we are done. The host --> NIC side is slightly
742  * harder because there might not be room in the tx ring so it
743  * might take a while before releasing the buffer.
744  */
745 
746 
747 /*
748  * pass a chain of buffers to the host stack as coming from 'dst'
749  * We do not need to lock because the queue is private.
750  */
751 static void
752 netmap_send_up(struct ifnet *dst, struct mbq *q)
753 {
754 	struct mbuf *m;
755 
756 	/* send packets up, outside the lock */
757 	while ((m = mbq_dequeue(q)) != NULL) {
758 		if (netmap_verbose & NM_VERB_HOST)
759 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
760 		NM_SEND_UP(dst, m);
761 	}
762 	mbq_destroy(q);
763 }
764 
765 
766 /*
767  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
768  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
769  * and pass them up. Drop remaining packets in the unlikely event
770  * of an mbuf shortage.
771  */
772 static void
773 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
774 {
775 	u_int const lim = kring->nkr_num_slots - 1;
776 	u_int const head = kring->ring->head;
777 	u_int n;
778 	struct netmap_adapter *na = kring->na;
779 
780 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
781 		struct mbuf *m;
782 		struct netmap_slot *slot = &kring->ring->slot[n];
783 
784 		if ((slot->flags & NS_FORWARD) == 0 && !force)
785 			continue;
786 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
787 			RD(5, "bad pkt at %d len %d", n, slot->len);
788 			continue;
789 		}
790 		slot->flags &= ~NS_FORWARD; // XXX needed ?
791 		/* XXX TODO: adapt to the case of a multisegment packet */
792 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
793 
794 		if (m == NULL)
795 			break;
796 		mbq_enqueue(q, m);
797 	}
798 }
799 
800 
801 /*
802  * Send to the NIC rings packets marked NS_FORWARD between
803  * kring->nr_hwcur and kring->rhead
804  * Called under kring->rx_queue.lock on the sw rx ring,
805  */
806 static u_int
807 netmap_sw_to_nic(struct netmap_adapter *na)
808 {
809 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
810 	struct netmap_slot *rxslot = kring->ring->slot;
811 	u_int i, rxcur = kring->nr_hwcur;
812 	u_int const head = kring->rhead;
813 	u_int const src_lim = kring->nkr_num_slots - 1;
814 	u_int sent = 0;
815 
816 	/* scan rings to find space, then fill as much as possible */
817 	for (i = 0; i < na->num_tx_rings; i++) {
818 		struct netmap_kring *kdst = &na->tx_rings[i];
819 		struct netmap_ring *rdst = kdst->ring;
820 		u_int const dst_lim = kdst->nkr_num_slots - 1;
821 
822 		/* XXX do we trust ring or kring->rcur,rtail ? */
823 		for (; rxcur != head && !nm_ring_empty(rdst);
824 		     rxcur = nm_next(rxcur, src_lim) ) {
825 			struct netmap_slot *src, *dst, tmp;
826 			u_int dst_cur = rdst->cur;
827 
828 			src = &rxslot[rxcur];
829 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
830 				continue;
831 
832 			sent++;
833 
834 			dst = &rdst->slot[dst_cur];
835 
836 			tmp = *src;
837 
838 			src->buf_idx = dst->buf_idx;
839 			src->flags = NS_BUF_CHANGED;
840 
841 			dst->buf_idx = tmp.buf_idx;
842 			dst->len = tmp.len;
843 			dst->flags = NS_BUF_CHANGED;
844 
845 			rdst->cur = nm_next(dst_cur, dst_lim);
846 		}
847 		/* if (sent) XXX txsync ? */
848 	}
849 	return sent;
850 }
851 
852 
853 /*
854  * netmap_txsync_to_host() passes packets up. We are called from a
855  * system call in user process context, and the only contention
856  * can be among multiple user threads erroneously calling
857  * this routine concurrently.
858  */
859 void
860 netmap_txsync_to_host(struct netmap_adapter *na)
861 {
862 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
863 	struct netmap_ring *ring = kring->ring;
864 	u_int const lim = kring->nkr_num_slots - 1;
865 	u_int const head = nm_txsync_prologue(kring);
866 	struct mbq q;
867 	int error;
868 
869 	error = nm_kr_tryget(kring);
870 	if (error) {
871 		if (error == NM_KR_BUSY)
872 			D("ring %p busy (user error)", kring);
873 		return;
874 	}
875 	if (head > lim) {
876 		D("invalid ring index in stack TX kring %p", kring);
877 		netmap_ring_reinit(kring);
878 		nm_kr_put(kring);
879 		return;
880 	}
881 
882 	/* Take packets from hwcur to head and pass them up.
883 	 * force head = cur since netmap_grab_packets() stops at head
884 	 * In case of no buffers we give up. At the end of the loop,
885 	 * the queue is drained in all cases.
886 	 */
887 	mbq_init(&q);
888 	ring->cur = head;
889 	netmap_grab_packets(kring, &q, 1 /* force */);
890 	ND("have %d pkts in queue", mbq_len(&q));
891 	kring->nr_hwcur = head;
892 	kring->nr_hwtail = head + lim;
893 	if (kring->nr_hwtail > lim)
894 		kring->nr_hwtail -= lim + 1;
895 	nm_txsync_finalize(kring);
896 
897 	nm_kr_put(kring);
898 	netmap_send_up(na->ifp, &q);
899 }
900 
901 
902 /*
903  * rxsync backend for packets coming from the host stack.
904  * They have been put in kring->rx_queue by netmap_transmit().
905  * We protect access to the kring using kring->rx_queue.lock
906  *
907  * This routine also does the selrecord if called from the poll handler
908  * (we know because td != NULL).
909  *
910  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
911  *     as an additional hidden argument.
912  * returns the number of packets delivered to tx queues in
913  * transparent mode, or a negative value if error
914  */
915 int
916 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
917 {
918 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
919 	struct netmap_ring *ring = kring->ring;
920 	u_int nm_i, n;
921 	u_int const lim = kring->nkr_num_slots - 1;
922 	u_int const head = nm_rxsync_prologue(kring);
923 	int ret = 0;
924 	struct mbq *q = &kring->rx_queue;
925 
926 	(void)pwait;	/* disable unused warnings */
927 
928 	if (head > lim) {
929 		netmap_ring_reinit(kring);
930 		return EINVAL;
931 	}
932 
933 	if (kring->nkr_stopped) /* check a first time without lock */
934 		return EBUSY;
935 
936 	mtx_lock(&q->lock);
937 
938 	if (kring->nkr_stopped) {  /* check again with lock held */
939 		ret = EBUSY;
940 		goto unlock_out;
941 	}
942 
943 	/* First part: import newly received packets */
944 	n = mbq_len(q);
945 	if (n) { /* grab packets from the queue */
946 		struct mbuf *m;
947 		uint32_t stop_i;
948 
949 		nm_i = kring->nr_hwtail;
950 		stop_i = nm_prev(nm_i, lim);
951 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
952 			int len = MBUF_LEN(m);
953 			struct netmap_slot *slot = &ring->slot[nm_i];
954 
955 			m_copydata(m, 0, len, BDG_NMB(na, slot));
956 			ND("nm %d len %d", nm_i, len);
957 			if (netmap_verbose)
958                                 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
959 
960 			slot->len = len;
961 			slot->flags = kring->nkr_slot_flags;
962 			nm_i = nm_next(nm_i, lim);
963 		}
964 		kring->nr_hwtail = nm_i;
965 	}
966 
967 	/*
968 	 * Second part: skip past packets that userspace has released.
969 	 */
970 	nm_i = kring->nr_hwcur;
971 	if (nm_i != head) { /* something was released */
972 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
973 			ret = netmap_sw_to_nic(na);
974 		kring->nr_hwcur = head;
975 	}
976 
977 	nm_rxsync_finalize(kring);
978 
979 	/* access copies of cur,tail in the kring */
980 	if (kring->rcur == kring->rtail && td) /* no bufs available */
981 		selrecord(td, &kring->si);
982 
983 unlock_out:
984 
985 	mtx_unlock(&q->lock);
986 	return ret;
987 }
988 
989 
990 /* Get a netmap adapter for the port.
991  *
992  * If it is possible to satisfy the request, return 0
993  * with *na containing the netmap adapter found.
994  * Otherwise return an error code, with *na containing NULL.
995  *
996  * When the port is attached to a bridge, we always return
997  * EBUSY.
998  * Otherwise, if the port is already bound to a file descriptor,
999  * then we unconditionally return the existing adapter into *na.
1000  * In all the other cases, we return (into *na) either native,
1001  * generic or NULL, according to the following table:
1002  *
1003  *					native_support
1004  * active_fds   dev.netmap.admode         YES     NO
1005  * -------------------------------------------------------
1006  *    >0              *                 NA(ifp) NA(ifp)
1007  *
1008  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1009  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1010  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1011  *
1012  */
1013 
1014 int
1015 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1016 {
1017 	/* generic support */
1018 	int i = netmap_admode;	/* Take a snapshot. */
1019 	int error = 0;
1020 	struct netmap_adapter *prev_na;
1021 	struct netmap_generic_adapter *gna;
1022 
1023 	*na = NULL; /* default */
1024 
1025 	/* reset in case of invalid value */
1026 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1027 		i = netmap_admode = NETMAP_ADMODE_BEST;
1028 
1029 	if (NETMAP_CAPABLE(ifp)) {
1030 		/* If an adapter already exists, but is
1031 		 * attached to a vale port, we report that the
1032 		 * port is busy.
1033 		 */
1034 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
1035 			return EBUSY;
1036 
1037 		/* If an adapter already exists, return it if
1038 		 * there are active file descriptors or if
1039 		 * netmap is not forced to use generic
1040 		 * adapters.
1041 		 */
1042 		if (NA(ifp)->active_fds > 0 ||
1043 				i != NETMAP_ADMODE_GENERIC) {
1044 			*na = NA(ifp);
1045 			return 0;
1046 		}
1047 	}
1048 
1049 	/* If there isn't native support and netmap is not allowed
1050 	 * to use generic adapters, we cannot satisfy the request.
1051 	 */
1052 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1053 		return EINVAL;
1054 
1055 	/* Otherwise, create a generic adapter and return it,
1056 	 * saving the previously used netmap adapter, if any.
1057 	 *
1058 	 * Note that here 'prev_na', if not NULL, MUST be a
1059 	 * native adapter, and CANNOT be a generic one. This is
1060 	 * true because generic adapters are created on demand, and
1061 	 * destroyed when not used anymore. Therefore, if the adapter
1062 	 * currently attached to an interface 'ifp' is generic, it
1063 	 * must be that
1064 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1065 	 * Consequently, if NA(ifp) is generic, we will enter one of
1066 	 * the branches above. This ensures that we never override
1067 	 * a generic adapter with another generic adapter.
1068 	 */
1069 	prev_na = NA(ifp);
1070 	error = generic_netmap_attach(ifp);
1071 	if (error)
1072 		return error;
1073 
1074 	*na = NA(ifp);
1075 	gna = (struct netmap_generic_adapter*)NA(ifp);
1076 	gna->prev = prev_na; /* save old na */
1077 	if (prev_na != NULL) {
1078 		ifunit_ref(ifp->if_xname);
1079 		// XXX add a refcount ?
1080 		netmap_adapter_get(prev_na);
1081 	}
1082 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1083 
1084 	return 0;
1085 }
1086 
1087 
1088 /*
1089  * MUST BE CALLED UNDER NMG_LOCK()
1090  *
1091  * get a refcounted reference to an interface.
1092  * This is always called in the execution of an ioctl().
1093  *
1094  * Return ENXIO if the interface does not exist, EINVAL if netmap
1095  * is not supported by the interface.
1096  * If successful, hold a reference.
1097  *
1098  * When the NIC is attached to a bridge, reference is managed
1099  * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
1100  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1101  * is detached from the bridge, then ifp's refcount is dropped (this
1102  * is equivalent to that ifp is destroyed in case of virtual ports.
1103  *
1104  * This function uses if_rele() when we want to prevent the NIC from
1105  * being detached from the bridge in error handling.  But once refcount
1106  * is acquired by this function, it must be released using nm_if_rele().
1107  */
1108 int
1109 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1110 {
1111 	struct ifnet *ifp;
1112 	int error = 0;
1113 	struct netmap_adapter *ret;
1114 
1115 	*na = NULL;     /* default return value */
1116 
1117 	/* first try to see if this is a bridge port. */
1118 	NMG_LOCK_ASSERT();
1119 
1120 	error = netmap_get_bdg_na(nmr, na, create);
1121 	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
1122 		return error;
1123 
1124 	ifp = ifunit_ref(nmr->nr_name);
1125 	if (ifp == NULL) {
1126 	        return ENXIO;
1127 	}
1128 
1129 	error = netmap_get_hw_na(ifp, &ret);
1130 	if (error)
1131 		goto out;
1132 
1133 	if (ret != NULL) {
1134 		/* Users cannot use the NIC attached to a bridge directly */
1135 		if (NETMAP_OWNED_BY_KERN(ret)) {
1136 			error = EINVAL;
1137 			goto out;
1138 		}
1139 		error = 0;
1140 		*na = ret;
1141 		netmap_adapter_get(ret);
1142 	}
1143 out:
1144 	if_rele(ifp);
1145 
1146 	return error;
1147 }
1148 
1149 
1150 /*
1151  * validate parameters on entry for *_txsync()
1152  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1153  * in case of error.
1154  *
1155  * rhead, rcur and rtail=hwtail are stored from previous round.
1156  * hwcur is the next packet to send to the ring.
1157  *
1158  * We want
1159  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1160  *
1161  * hwcur, rhead, rtail and hwtail are reliable
1162  */
1163 u_int
1164 nm_txsync_prologue(struct netmap_kring *kring)
1165 {
1166 	struct netmap_ring *ring = kring->ring;
1167 	u_int head = ring->head; /* read only once */
1168 	u_int cur = ring->cur; /* read only once */
1169 	u_int n = kring->nkr_num_slots;
1170 
1171 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1172 		kring->name,
1173 		kring->nr_hwcur, kring->nr_hwtail,
1174 		ring->head, ring->cur, ring->tail);
1175 #if 1 /* kernel sanity checks; but we can trust the kring. */
1176 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1177 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1178 		goto error;
1179 #endif /* kernel sanity checks */
1180 	/*
1181 	 * user sanity checks. We only use 'cur',
1182 	 * A, B, ... are possible positions for cur:
1183 	 *
1184 	 *  0    A  cur   B  tail  C  n-1
1185 	 *  0    D  tail  E  cur   F  n-1
1186 	 *
1187 	 * B, F, D are valid. A, C, E are wrong
1188 	 */
1189 	if (kring->rtail >= kring->rhead) {
1190 		/* want rhead <= head <= rtail */
1191 		if (head < kring->rhead || head > kring->rtail)
1192 			goto error;
1193 		/* and also head <= cur <= rtail */
1194 		if (cur < head || cur > kring->rtail)
1195 			goto error;
1196 	} else { /* here rtail < rhead */
1197 		/* we need head outside rtail .. rhead */
1198 		if (head > kring->rtail && head < kring->rhead)
1199 			goto error;
1200 
1201 		/* two cases now: head <= rtail or head >= rhead  */
1202 		if (head <= kring->rtail) {
1203 			/* want head <= cur <= rtail */
1204 			if (cur < head || cur > kring->rtail)
1205 				goto error;
1206 		} else { /* head >= rhead */
1207 			/* cur must be outside rtail..head */
1208 			if (cur > kring->rtail && cur < head)
1209 				goto error;
1210 		}
1211 	}
1212 	if (ring->tail != kring->rtail) {
1213 		RD(5, "tail overwritten was %d need %d",
1214 			ring->tail, kring->rtail);
1215 		ring->tail = kring->rtail;
1216 	}
1217 	kring->rhead = head;
1218 	kring->rcur = cur;
1219 	return head;
1220 
1221 error:
1222 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1223 		kring->name,
1224 		kring->nr_hwcur,
1225 		kring->rcur, kring->nr_hwtail,
1226 		cur, ring->tail);
1227 	return n;
1228 }
1229 
1230 
1231 /*
1232  * validate parameters on entry for *_rxsync()
1233  * Returns ring->head if ok, kring->nkr_num_slots on error.
1234  *
1235  * For a valid configuration,
1236  * hwcur <= head <= cur <= tail <= hwtail
1237  *
1238  * We only consider head and cur.
1239  * hwcur and hwtail are reliable.
1240  *
1241  */
1242 u_int
1243 nm_rxsync_prologue(struct netmap_kring *kring)
1244 {
1245 	struct netmap_ring *ring = kring->ring;
1246 	uint32_t const n = kring->nkr_num_slots;
1247 	uint32_t head, cur;
1248 
1249 	ND("%s kc %d kt %d h %d c %d t %d",
1250 		kring->name,
1251 		kring->nr_hwcur, kring->nr_hwtail,
1252 		ring->head, ring->cur, ring->tail);
1253 	/*
1254 	 * Before storing the new values, we should check they do not
1255 	 * move backwards. However:
1256 	 * - head is not an issue because the previous value is hwcur;
1257 	 * - cur could in principle go back, however it does not matter
1258 	 *   because we are processing a brand new rxsync()
1259 	 */
1260 	cur = kring->rcur = ring->cur;	/* read only once */
1261 	head = kring->rhead = ring->head;	/* read only once */
1262 #if 1 /* kernel sanity checks */
1263 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1264 		goto error;
1265 #endif /* kernel sanity checks */
1266 	/* user sanity checks */
1267 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1268 		/* want hwcur <= rhead <= hwtail */
1269 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1270 			goto error;
1271 		/* and also rhead <= rcur <= hwtail */
1272 		if (cur < head || cur > kring->nr_hwtail)
1273 			goto error;
1274 	} else {
1275 		/* we need rhead outside hwtail..hwcur */
1276 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1277 			goto error;
1278 		/* two cases now: head <= hwtail or head >= hwcur  */
1279 		if (head <= kring->nr_hwtail) {
1280 			/* want head <= cur <= hwtail */
1281 			if (cur < head || cur > kring->nr_hwtail)
1282 				goto error;
1283 		} else {
1284 			/* cur must be outside hwtail..head */
1285 			if (cur < head && cur > kring->nr_hwtail)
1286 				goto error;
1287 		}
1288 	}
1289 	if (ring->tail != kring->rtail) {
1290 		RD(5, "%s tail overwritten was %d need %d",
1291 			kring->name,
1292 			ring->tail, kring->rtail);
1293 		ring->tail = kring->rtail;
1294 	}
1295 	return head;
1296 
1297 error:
1298 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1299 		kring->nr_hwcur,
1300 		kring->rcur, kring->nr_hwtail,
1301 		kring->rhead, kring->rcur, ring->tail);
1302 	return n;
1303 }
1304 
1305 
1306 /*
1307  * Error routine called when txsync/rxsync detects an error.
1308  * Can't do much more than resetting head =cur = hwcur, tail = hwtail
1309  * Return 1 on reinit.
1310  *
1311  * This routine is only called by the upper half of the kernel.
1312  * It only reads hwcur (which is changed only by the upper half, too)
1313  * and hwtail (which may be changed by the lower half, but only on
1314  * a tx ring and only to increase it, so any error will be recovered
1315  * on the next call). For the above, we don't strictly need to call
1316  * it under lock.
1317  */
1318 int
1319 netmap_ring_reinit(struct netmap_kring *kring)
1320 {
1321 	struct netmap_ring *ring = kring->ring;
1322 	u_int i, lim = kring->nkr_num_slots - 1;
1323 	int errors = 0;
1324 
1325 	// XXX KASSERT nm_kr_tryget
1326 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1327 	// XXX probably wrong to trust userspace
1328 	kring->rhead = ring->head;
1329 	kring->rcur  = ring->cur;
1330 	kring->rtail = ring->tail;
1331 
1332 	if (ring->cur > lim)
1333 		errors++;
1334 	if (ring->head > lim)
1335 		errors++;
1336 	if (ring->tail > lim)
1337 		errors++;
1338 	for (i = 0; i <= lim; i++) {
1339 		u_int idx = ring->slot[i].buf_idx;
1340 		u_int len = ring->slot[i].len;
1341 		if (idx < 2 || idx >= netmap_total_buffers) {
1342 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1343 			ring->slot[i].buf_idx = 0;
1344 			ring->slot[i].len = 0;
1345 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1346 			ring->slot[i].len = 0;
1347 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1348 		}
1349 	}
1350 	if (errors) {
1351 		RD(10, "total %d errors", errors);
1352 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1353 			kring->name,
1354 			ring->cur, kring->nr_hwcur,
1355 			ring->tail, kring->nr_hwtail);
1356 		ring->head = kring->rhead = kring->nr_hwcur;
1357 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1358 		ring->tail = kring->rtail = kring->nr_hwtail;
1359 	}
1360 	return (errors ? 1 : 0);
1361 }
1362 
1363 
1364 /*
1365  * Set the ring ID. For devices with a single queue, a request
1366  * for all rings is the same as a single ring.
1367  */
1368 static int
1369 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1370 {
1371 	struct netmap_adapter *na = priv->np_na;
1372 	struct ifnet *ifp = na->ifp;
1373 	u_int i = ringid & NETMAP_RING_MASK;
1374 	/* initially (np_qfirst == np_qlast) we don't want to lock */
1375 	u_int lim = na->num_rx_rings;
1376 
1377 	if (na->num_tx_rings > lim)
1378 		lim = na->num_tx_rings;
1379 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1380 		D("invalid ring id %d", i);
1381 		return (EINVAL);
1382 	}
1383 	priv->np_ringid = ringid;
1384 	if (ringid & NETMAP_SW_RING) {
1385 		priv->np_qfirst = NETMAP_SW_RING;
1386 		priv->np_qlast = 0;
1387 	} else if (ringid & NETMAP_HW_RING) {
1388 		priv->np_qfirst = i;
1389 		priv->np_qlast = i + 1;
1390 	} else {
1391 		priv->np_qfirst = 0;
1392 		priv->np_qlast = NETMAP_HW_RING ;
1393 	}
1394 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1395     if (netmap_verbose) {
1396 	if (ringid & NETMAP_SW_RING)
1397 		D("ringid %s set to SW RING", NM_IFPNAME(ifp));
1398 	else if (ringid & NETMAP_HW_RING)
1399 		D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
1400 			priv->np_qfirst);
1401 	else
1402 		D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
1403     }
1404 	return 0;
1405 }
1406 
1407 
1408 /*
1409  * possibly move the interface to netmap-mode.
1410  * If success it returns a pointer to netmap_if, otherwise NULL.
1411  * This must be called with NMG_LOCK held.
1412  */
1413 struct netmap_if *
1414 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1415 	uint16_t ringid, int *err)
1416 {
1417 	struct ifnet *ifp = na->ifp;
1418 	struct netmap_if *nifp = NULL;
1419 	int error, need_mem = 0;
1420 
1421 	NMG_LOCK_ASSERT();
1422 	/* ring configuration may have changed, fetch from the card */
1423 	netmap_update_config(na);
1424 	priv->np_na = na;     /* store the reference */
1425 	error = netmap_set_ringid(priv, ringid);
1426 	if (error)
1427 		goto out;
1428 	/* ensure allocators are ready */
1429 	need_mem = !netmap_have_memory_locked(priv);
1430 	if (need_mem) {
1431 		error = netmap_get_memory_locked(priv);
1432 		ND("get_memory returned %d", error);
1433 		if (error)
1434 			goto out;
1435 	}
1436 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1437 	if (nifp == NULL) { /* allocation failed */
1438 		/* we should drop the allocator, but only
1439 		 * if we were the ones who grabbed it
1440 		 */
1441 		error = ENOMEM;
1442 		goto out;
1443 	}
1444 	na->active_fds++;
1445 	if (ifp->if_capenable & IFCAP_NETMAP) {
1446 		/* was already set */
1447 	} else {
1448 		/* Otherwise set the card in netmap mode
1449 		 * and make it use the shared buffers.
1450 		 *
1451 		 * do not core lock because the race is harmless here,
1452 		 * there cannot be any traffic to netmap_transmit()
1453 		 */
1454 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1455 		ND("%p->na_lut == %p", na, na->na_lut);
1456 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1457 		error = na->nm_register(na, 1); /* mode on */
1458 		if (error) {
1459 			netmap_do_unregif(priv, nifp);
1460 			nifp = NULL;
1461 		}
1462 	}
1463 out:
1464 	*err = error;
1465 	if (error) {
1466 		priv->np_na = NULL;
1467 		if (need_mem)
1468 			netmap_drop_memory_locked(priv);
1469 	}
1470 	if (nifp != NULL) {
1471 		/*
1472 		 * advertise that the interface is ready bt setting ni_nifp.
1473 		 * The barrier is needed because readers (poll and *SYNC)
1474 		 * check for priv->np_nifp != NULL without locking
1475 		 */
1476 		wmb(); /* make sure previous writes are visible to all CPUs */
1477 		priv->np_nifp = nifp;
1478 	}
1479 	return nifp;
1480 }
1481 
1482 
1483 
1484 /*
1485  * ioctl(2) support for the "netmap" device.
1486  *
1487  * Following a list of accepted commands:
1488  * - NIOCGINFO
1489  * - SIOCGIFADDR	just for convenience
1490  * - NIOCREGIF
1491  * - NIOCTXSYNC
1492  * - NIOCRXSYNC
1493  *
1494  * Return 0 on success, errno otherwise.
1495  */
1496 int
1497 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1498 	int fflag, struct thread *td)
1499 {
1500 	struct netmap_priv_d *priv = NULL;
1501 	struct ifnet *ifp = NULL;
1502 	struct nmreq *nmr = (struct nmreq *) data;
1503 	struct netmap_adapter *na = NULL;
1504 	int error;
1505 	u_int i, lim;
1506 	struct netmap_if *nifp;
1507 	struct netmap_kring *krings;
1508 
1509 	(void)dev;	/* UNUSED */
1510 	(void)fflag;	/* UNUSED */
1511 #ifdef linux
1512 #define devfs_get_cdevpriv(pp)				\
1513 	({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; 	\
1514 		(*pp ? 0 : ENOENT); })
1515 
1516 /* devfs_set_cdevpriv cannot fail on linux */
1517 #define devfs_set_cdevpriv(p, fn)				\
1518 	({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
1519 
1520 
1521 #define devfs_clear_cdevpriv()	do {				\
1522 		netmap_dtor(priv); ((struct file *)td)->private_data = 0;	\
1523 	} while (0)
1524 #endif /* linux */
1525 
1526 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
1527 		/* truncate name */
1528 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
1529 		if (nmr->nr_version != NETMAP_API) {
1530 			D("API mismatch for %s got %d need %d",
1531 				nmr->nr_name,
1532 				nmr->nr_version, NETMAP_API);
1533 			nmr->nr_version = NETMAP_API;
1534 			return EINVAL;
1535 		}
1536 	}
1537 	CURVNET_SET(TD_TO_VNET(td));
1538 
1539 	error = devfs_get_cdevpriv((void **)&priv);
1540 	if (error) {
1541 		CURVNET_RESTORE();
1542 		/* XXX ENOENT should be impossible, since the priv
1543 		 * is now created in the open */
1544 		return (error == ENOENT ? ENXIO : error);
1545 	}
1546 
1547 	switch (cmd) {
1548 	case NIOCGINFO:		/* return capabilities etc */
1549 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1550 			error = netmap_bdg_ctl(nmr, NULL);
1551 			break;
1552 		}
1553 
1554 		NMG_LOCK();
1555 		do {
1556 			/* memsize is always valid */
1557 			struct netmap_mem_d *nmd = &nm_mem;
1558 			u_int memflags;
1559 
1560 			if (nmr->nr_name[0] != '\0') {
1561 				/* get a refcount */
1562 				error = netmap_get_na(nmr, &na, 1 /* create */);
1563 				if (error)
1564 					break;
1565 				nmd = na->nm_mem; /* get memory allocator */
1566 			}
1567 
1568 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
1569 			if (error)
1570 				break;
1571 			if (na == NULL) /* only memory info */
1572 				break;
1573 			nmr->nr_offset = 0;
1574 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1575 			netmap_update_config(na);
1576 			nmr->nr_rx_rings = na->num_rx_rings;
1577 			nmr->nr_tx_rings = na->num_tx_rings;
1578 			nmr->nr_rx_slots = na->num_rx_desc;
1579 			nmr->nr_tx_slots = na->num_tx_desc;
1580 			if (memflags & NETMAP_MEM_PRIVATE)
1581 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1582 			netmap_adapter_put(na);
1583 		} while (0);
1584 		NMG_UNLOCK();
1585 		break;
1586 
1587 	case NIOCREGIF:
1588 		/* possibly attach/detach NIC and VALE switch */
1589 		i = nmr->nr_cmd;
1590 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
1591 				|| i == NETMAP_BDG_OFFSET) {
1592 			error = netmap_bdg_ctl(nmr, NULL);
1593 			break;
1594 		} else if (i != 0) {
1595 			D("nr_cmd must be 0 not %d", i);
1596 			error = EINVAL;
1597 			break;
1598 		}
1599 
1600 		/* protect access to priv from concurrent NIOCREGIF */
1601 		NMG_LOCK();
1602 		do {
1603 			u_int memflags;
1604 
1605 			if (priv->np_na != NULL) {	/* thread already registered */
1606 				error = netmap_set_ringid(priv, nmr->nr_ringid);
1607 				break;
1608 			}
1609 			/* find the interface and a reference */
1610 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1611 			if (error)
1612 				break;
1613 			ifp = na->ifp;
1614 			if (NETMAP_OWNED_BY_KERN(na)) {
1615 				netmap_adapter_put(na);
1616 				error = EBUSY;
1617 				break;
1618 			}
1619 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
1620 			if (!nifp) {    /* reg. failed, release priv and ref */
1621 				netmap_adapter_put(na);
1622 				priv->np_nifp = NULL;
1623 				break;
1624 			}
1625 
1626 			/* return the offset of the netmap_if object */
1627 			nmr->nr_rx_rings = na->num_rx_rings;
1628 			nmr->nr_tx_rings = na->num_tx_rings;
1629 			nmr->nr_rx_slots = na->num_rx_desc;
1630 			nmr->nr_tx_slots = na->num_tx_desc;
1631 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
1632 			if (error) {
1633 				netmap_adapter_put(na);
1634 				break;
1635 			}
1636 			if (memflags & NETMAP_MEM_PRIVATE) {
1637 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1638 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1639 			}
1640 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1641 		} while (0);
1642 		NMG_UNLOCK();
1643 		break;
1644 
1645 	case NIOCTXSYNC:
1646 	case NIOCRXSYNC:
1647 		nifp = priv->np_nifp;
1648 
1649 		if (nifp == NULL) {
1650 			error = ENXIO;
1651 			break;
1652 		}
1653 		rmb(); /* make sure following reads are not from cache */
1654 
1655 		na = priv->np_na;      /* we have a reference */
1656 
1657 		if (na == NULL) {
1658 			D("Internal error: nifp != NULL && na == NULL");
1659 			error = ENXIO;
1660 			break;
1661 		}
1662 
1663 		ifp = na->ifp;
1664 		if (ifp == NULL) {
1665 			RD(1, "the ifp is gone");
1666 			error = ENXIO;
1667 			break;
1668 		}
1669 
1670 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
1671 			if (cmd == NIOCTXSYNC)
1672 				netmap_txsync_to_host(na);
1673 			else
1674 				netmap_rxsync_from_host(na, NULL, NULL);
1675 			break;
1676 		}
1677 		/* find the last ring to scan */
1678 		lim = priv->np_qlast;
1679 		if (lim == NETMAP_HW_RING)
1680 			lim = (cmd == NIOCTXSYNC) ?
1681 			    na->num_tx_rings : na->num_rx_rings;
1682 
1683 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
1684 		for (i = priv->np_qfirst; i < lim; i++) {
1685 			struct netmap_kring *kring = krings + i;
1686 			if (nm_kr_tryget(kring)) {
1687 				error = EBUSY;
1688 				goto out;
1689 			}
1690 			if (cmd == NIOCTXSYNC) {
1691 				if (netmap_verbose & NM_VERB_TXSYNC)
1692 					D("pre txsync ring %d cur %d hwcur %d",
1693 					    i, kring->ring->cur,
1694 					    kring->nr_hwcur);
1695 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1696 					netmap_ring_reinit(kring);
1697 				} else {
1698 					na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
1699 				}
1700 				if (netmap_verbose & NM_VERB_TXSYNC)
1701 					D("post txsync ring %d cur %d hwcur %d",
1702 					    i, kring->ring->cur,
1703 					    kring->nr_hwcur);
1704 			} else {
1705 				na->nm_rxsync(na, i, NAF_FORCE_READ);
1706 				microtime(&na->rx_rings[i].ring->ts);
1707 			}
1708 			nm_kr_put(kring);
1709 		}
1710 
1711 		break;
1712 
1713 #ifdef __FreeBSD__
1714 	case BIOCIMMEDIATE:
1715 	case BIOCGHDRCMPLT:
1716 	case BIOCSHDRCMPLT:
1717 	case BIOCSSEESENT:
1718 		D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1719 		break;
1720 
1721 	default:	/* allow device-specific ioctls */
1722 	    {
1723 		struct socket so;
1724 
1725 		bzero(&so, sizeof(so));
1726 		NMG_LOCK();
1727 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1728 		if (error) {
1729 			netmap_adapter_put(na);
1730 			NMG_UNLOCK();
1731 			break;
1732 		}
1733 		ifp = na->ifp;
1734 		so.so_vnet = ifp->if_vnet;
1735 		// so->so_proto not null.
1736 		error = ifioctl(&so, cmd, data, td);
1737 		netmap_adapter_put(na);
1738 		NMG_UNLOCK();
1739 		break;
1740 	    }
1741 
1742 #else /* linux */
1743 	default:
1744 		error = EOPNOTSUPP;
1745 #endif /* linux */
1746 	}
1747 out:
1748 
1749 	CURVNET_RESTORE();
1750 	return (error);
1751 }
1752 
1753 
1754 /*
1755  * select(2) and poll(2) handlers for the "netmap" device.
1756  *
1757  * Can be called for one or more queues.
1758  * Return true the event mask corresponding to ready events.
1759  * If there are no ready events, do a selrecord on either individual
1760  * selinfo or on the global one.
1761  * Device-dependent parts (locking and sync of tx/rx rings)
1762  * are done through callbacks.
1763  *
1764  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
1765  * The first one is remapped to pwait as selrecord() uses the name as an
1766  * hidden argument.
1767  */
1768 int
1769 netmap_poll(struct cdev *dev, int events, struct thread *td)
1770 {
1771 	struct netmap_priv_d *priv = NULL;
1772 	struct netmap_adapter *na;
1773 	struct ifnet *ifp;
1774 	struct netmap_kring *kring;
1775 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1776 	u_int lim_tx, lim_rx;
1777 	struct mbq q;		/* packets from hw queues to host stack */
1778 	void *pwait = dev;	/* linux compatibility */
1779 
1780 	/*
1781 	 * In order to avoid nested locks, we need to "double check"
1782 	 * txsync and rxsync if we decide to do a selrecord().
1783 	 * retry_tx (and retry_rx, later) prevent looping forever.
1784 	 */
1785 	int retry_tx = 1, retry_rx = 1;
1786 
1787 	(void)pwait;
1788 	mbq_init(&q);
1789 
1790 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
1791 		return POLLERR;
1792 
1793 	if (priv->np_nifp == NULL) {
1794 		D("No if registered");
1795 		return POLLERR;
1796 	}
1797 	rmb(); /* make sure following reads are not from cache */
1798 
1799 	na = priv->np_na;
1800 	ifp = na->ifp;
1801 	// check for deleted
1802 	if (ifp == NULL) {
1803 		RD(1, "the ifp is gone");
1804 		return POLLERR;
1805 	}
1806 
1807 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1808 		return POLLERR;
1809 
1810 	if (netmap_verbose & 0x8000)
1811 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1812 	want_tx = events & (POLLOUT | POLLWRNORM);
1813 	want_rx = events & (POLLIN | POLLRDNORM);
1814 
1815 	lim_tx = na->num_tx_rings;
1816 	lim_rx = na->num_rx_rings;
1817 
1818 	if (priv->np_qfirst == NETMAP_SW_RING) {
1819 		// XXX locking ?
1820 		/* handle the host stack ring */
1821 		if (priv->np_txpoll || want_tx) {
1822 			/* push any packets up, then we are always ready */
1823 			netmap_txsync_to_host(na);
1824 			revents |= want_tx;
1825 		}
1826 		if (want_rx) {
1827 			kring = &na->rx_rings[lim_rx];
1828 			/* XXX replace with rxprologue etc. */
1829 			if (nm_ring_empty(kring->ring))
1830 				netmap_rxsync_from_host(na, td, dev);
1831 			if (!nm_ring_empty(kring->ring))
1832 				revents |= want_rx;
1833 		}
1834 		return (revents);
1835 	}
1836 
1837 
1838 	/*
1839 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1840 	 * the file descriptor is bound to all of them. If so, we sleep on
1841 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1842 	 * (FreeBSD only allows two selinfo's per file descriptor).
1843 	 * The interrupt routine in the driver wake one or the other
1844 	 * (or both) depending on which clients are active.
1845 	 *
1846 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1847 	 * txsync() is called if we run out of buffers on POLLOUT, or
1848 	 * there are pending packets to send. The latter can be disabled
1849 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
1850 	 */
1851 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
1852 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
1853 
1854 	if (priv->np_qlast != NETMAP_HW_RING) {
1855 		lim_tx = lim_rx = priv->np_qlast;
1856 	}
1857 
1858 	/*
1859 	 * We start with a lock free round which is cheap if we have
1860 	 * slots available. If this fails, then lock and call the sync
1861 	 * routines.
1862 	 */
1863 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
1864 		kring = &na->rx_rings[i];
1865 		/* XXX compare ring->cur and kring->tail */
1866 		if (!nm_ring_empty(kring->ring)) {
1867 			revents |= want_rx;
1868 			want_rx = 0;	/* also breaks the loop */
1869 		}
1870 	}
1871 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
1872 		kring = &na->tx_rings[i];
1873 		/* XXX compare ring->cur and kring->tail */
1874 		if (!nm_ring_empty(kring->ring)) {
1875 			revents |= want_tx;
1876 			want_tx = 0;	/* also breaks the loop */
1877 		}
1878 	}
1879 
1880 	/*
1881 	 * If we want to push packets out (priv->np_txpoll) or
1882 	 * want_tx is still set, we must issue txsync calls
1883 	 * (on all rings, to avoid that the tx rings stall).
1884 	 * XXX should also check cur != hwcur on the tx rings.
1885 	 * Fortunately, normal tx mode has np_txpoll set.
1886 	 */
1887 	if (priv->np_txpoll || want_tx) {
1888 		/*
1889 		 * The first round checks if anyone is ready, if not
1890 		 * do a selrecord and another round to handle races.
1891 		 * want_tx goes to 0 if any space is found, and is
1892 		 * used to skip rings with no pending transmissions.
1893 		 */
1894 flush_tx:
1895 		for (i = priv->np_qfirst; i < lim_tx; i++) {
1896 			int found = 0;
1897 
1898 			kring = &na->tx_rings[i];
1899 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1900 				continue;
1901 			/* only one thread does txsync */
1902 			if (nm_kr_tryget(kring)) {
1903 				D("%p lost race on txring %d, ok", priv, i);
1904 				continue;
1905 			}
1906 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1907 				netmap_ring_reinit(kring);
1908 				revents |= POLLERR;
1909 			} else {
1910 				if (na->nm_txsync(na, i, 0))
1911 					revents |= POLLERR;
1912 			}
1913 
1914 			/*
1915 			 * If we found new slots, notify potential
1916 			 * listeners on the same ring.
1917 			 * Since we just did a txsync, look at the copies
1918 			 * of cur,tail in the kring.
1919 			 */
1920 			found = kring->rcur != kring->rtail;
1921 			nm_kr_put(kring);
1922 			if (found) { /* notify other listeners */
1923 				revents |= want_tx;
1924 				want_tx = 0;
1925 				na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY);
1926 			}
1927 		}
1928 		if (want_tx && retry_tx) {
1929 			selrecord(td, check_all_tx ?
1930 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
1931 			retry_tx = 0;
1932 			goto flush_tx;
1933 		}
1934 	}
1935 
1936 	/*
1937 	 * If want_rx is still set scan receive rings.
1938 	 * Do it on all rings because otherwise we starve.
1939 	 */
1940 	if (want_rx) {
1941 		int send_down = 0; /* transparent mode */
1942 		/* two rounds here to for race avoidance */
1943 do_retry_rx:
1944 		for (i = priv->np_qfirst; i < lim_rx; i++) {
1945 			int found = 0;
1946 
1947 			kring = &na->rx_rings[i];
1948 
1949 			if (nm_kr_tryget(kring)) {
1950 				D("%p lost race on rxring %d, ok", priv, i);
1951 				continue;
1952 			}
1953 
1954 			/*
1955 			 * transparent mode support: collect packets
1956 			 * from the rxring(s).
1957 			 * XXX NR_FORWARD should only be read on
1958 			 * physical or NIC ports
1959 			 */
1960 			if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
1961 				ND(10, "forwarding some buffers up %d to %d",
1962 				    kring->nr_hwcur, kring->ring->cur);
1963 				netmap_grab_packets(kring, &q, netmap_fwd);
1964 			}
1965 
1966 			if (na->nm_rxsync(na, i, 0))
1967 				revents |= POLLERR;
1968 			if (netmap_no_timestamp == 0 ||
1969 					kring->ring->flags & NR_TIMESTAMP) {
1970 				microtime(&kring->ring->ts);
1971 			}
1972 			/* after an rxsync we can use kring->rcur, rtail */
1973 			found = kring->rcur != kring->rtail;
1974 			nm_kr_put(kring);
1975 			if (found) {
1976 				revents |= want_rx;
1977 				retry_rx = 0;
1978 				na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY);
1979 			}
1980 		}
1981 
1982 		/* transparent mode XXX only during first pass ? */
1983 		kring = &na->rx_rings[lim_rx];
1984 		if (check_all_rx
1985 		    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
1986 			/* XXX fix to use kring fields */
1987 			if (nm_ring_empty(kring->ring))
1988 				send_down = netmap_rxsync_from_host(na, td, dev);
1989 			if (!nm_ring_empty(kring->ring))
1990 				revents |= want_rx;
1991 		}
1992 
1993 		if (retry_rx)
1994 			selrecord(td, check_all_rx ?
1995 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
1996 		if (send_down > 0 || retry_rx) {
1997 			retry_rx = 0;
1998 			if (send_down)
1999 				goto flush_tx; /* and retry_rx */
2000 			else
2001 				goto do_retry_rx;
2002 		}
2003 	}
2004 
2005 	/*
2006 	 * Transparent mode: marked bufs on rx rings between
2007 	 * kring->nr_hwcur and ring->head
2008 	 * are passed to the other endpoint.
2009 	 *
2010 	 * In this mode we also scan the sw rxring, which in
2011 	 * turn passes packets up.
2012 	 *
2013 	 * XXX Transparent mode at the moment requires to bind all
2014  	 * rings to a single file descriptor.
2015 	 */
2016 
2017 	if (q.head)
2018 		netmap_send_up(na->ifp, &q);
2019 
2020 	return (revents);
2021 }
2022 
2023 
2024 /*-------------------- driver support routines -------------------*/
2025 
2026 static int netmap_hw_krings_create(struct netmap_adapter *);
2027 
2028 static int
2029 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2030 	enum txrx tx, int flags)
2031 {
2032 	struct netmap_kring *kring;
2033 
2034 	if (tx == NR_TX) {
2035 		kring = na->tx_rings + n_ring;
2036 		selwakeuppri(&kring->si, PI_NET);
2037 		if (flags & NAF_GLOBAL_NOTIFY)
2038 			selwakeuppri(&na->tx_si, PI_NET);
2039 	} else {
2040 		kring = na->rx_rings + n_ring;
2041 		selwakeuppri(&kring->si, PI_NET);
2042 		if (flags & NAF_GLOBAL_NOTIFY)
2043 			selwakeuppri(&na->rx_si, PI_NET);
2044 	}
2045 	return 0;
2046 }
2047 
2048 
2049 // XXX check handling of failures
2050 int
2051 netmap_attach_common(struct netmap_adapter *na)
2052 {
2053 	struct ifnet *ifp = na->ifp;
2054 
2055 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2056 		D("%s: invalid rings tx %d rx %d",
2057 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
2058 		return EINVAL;
2059 	}
2060 	WNA(ifp) = na;
2061 
2062 	/* the following is only needed for na that use the host port.
2063 	 * XXX do we have something similar for linux ?
2064 	 */
2065 #ifdef __FreeBSD__
2066 	na->if_input = ifp->if_input; /* for netmap_send_up */
2067 #endif /* __FreeBSD__ */
2068 
2069 	NETMAP_SET_CAPABLE(ifp);
2070 	if (na->nm_krings_create == NULL) {
2071 		na->nm_krings_create = netmap_hw_krings_create;
2072 		na->nm_krings_delete = netmap_hw_krings_delete;
2073 	}
2074 	if (na->nm_notify == NULL)
2075 		na->nm_notify = netmap_notify;
2076 	na->active_fds = 0;
2077 
2078 	if (na->nm_mem == NULL)
2079 		na->nm_mem = &nm_mem;
2080 	return 0;
2081 }
2082 
2083 
2084 void
2085 netmap_detach_common(struct netmap_adapter *na)
2086 {
2087 	if (na->ifp)
2088 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2089 
2090 	if (na->tx_rings) { /* XXX should not happen */
2091 		D("freeing leftover tx_rings");
2092 		na->nm_krings_delete(na);
2093 	}
2094 	if (na->na_flags & NAF_MEM_OWNER)
2095 		netmap_mem_private_delete(na->nm_mem);
2096 	bzero(na, sizeof(*na));
2097 	free(na, M_DEVBUF);
2098 }
2099 
2100 
2101 /*
2102  * Initialize a ``netmap_adapter`` object created by driver on attach.
2103  * We allocate a block of memory with room for a struct netmap_adapter
2104  * plus two sets of N+2 struct netmap_kring (where N is the number
2105  * of hardware rings):
2106  * krings	0..N-1	are for the hardware queues.
2107  * kring	N	is for the host stack queue
2108  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2109  * Return 0 on success, ENOMEM otherwise.
2110  */
2111 int
2112 netmap_attach(struct netmap_adapter *arg)
2113 {
2114 	struct netmap_hw_adapter *hwna = NULL;
2115 	// XXX when is arg == NULL ?
2116 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2117 
2118 	if (arg == NULL || ifp == NULL)
2119 		goto fail;
2120 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2121 	if (hwna == NULL)
2122 		goto fail;
2123 	hwna->up = *arg;
2124 	if (netmap_attach_common(&hwna->up)) {
2125 		free(hwna, M_DEVBUF);
2126 		goto fail;
2127 	}
2128 	netmap_adapter_get(&hwna->up);
2129 
2130 #ifdef linux
2131 	if (ifp->netdev_ops) {
2132 		/* prepare a clone of the netdev ops */
2133 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2134 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2135 #else
2136 		hwna->nm_ndo = *ifp->netdev_ops;
2137 #endif
2138 	}
2139 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2140 #endif /* linux */
2141 
2142 	D("success for %s", NM_IFPNAME(ifp));
2143 	return 0;
2144 
2145 fail:
2146 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2147 	netmap_detach(ifp);
2148 	return (hwna ? EINVAL : ENOMEM);
2149 }
2150 
2151 
2152 void
2153 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2154 {
2155 	if (!na) {
2156 		return;
2157 	}
2158 
2159 	refcount_acquire(&na->na_refcount);
2160 }
2161 
2162 
2163 /* returns 1 iff the netmap_adapter is destroyed */
2164 int
2165 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2166 {
2167 	if (!na)
2168 		return 1;
2169 
2170 	if (!refcount_release(&na->na_refcount))
2171 		return 0;
2172 
2173 	if (na->nm_dtor)
2174 		na->nm_dtor(na);
2175 
2176 	netmap_detach_common(na);
2177 
2178 	return 1;
2179 }
2180 
2181 
2182 int
2183 netmap_hw_krings_create(struct netmap_adapter *na)
2184 {
2185 	int ret = netmap_krings_create(na,
2186 		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
2187 	if (ret == 0) {
2188 		/* initialize the mbq for the sw rx ring */
2189 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2190 		ND("initialized sw rx queue %d", na->num_rx_rings);
2191 	}
2192 	return ret;
2193 }
2194 
2195 
2196 
2197 /*
2198  * Free the allocated memory linked to the given ``netmap_adapter``
2199  * object.
2200  */
2201 void
2202 netmap_detach(struct ifnet *ifp)
2203 {
2204 	struct netmap_adapter *na = NA(ifp);
2205 
2206 	if (!na)
2207 		return;
2208 
2209 	NMG_LOCK();
2210 	netmap_disable_all_rings(ifp);
2211 	if (!netmap_adapter_put(na)) {
2212 		/* someone is still using the adapter,
2213 		 * tell them that the interface is gone
2214 		 */
2215 		na->ifp = NULL;
2216 		/* give them a chance to notice */
2217 		netmap_enable_all_rings(ifp);
2218 	}
2219 	NMG_UNLOCK();
2220 }
2221 
2222 
2223 /*
2224  * Intercept packets from the network stack and pass them
2225  * to netmap as incoming packets on the 'software' ring.
2226  *
2227  * We only store packets in a bounded mbq and then copy them
2228  * in the relevant rxsync routine.
2229  *
2230  * We rely on the OS to make sure that the ifp and na do not go
2231  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2232  * In nm_register() or whenever there is a reinitialization,
2233  * we make sure to make the mode change visible here.
2234  */
2235 int
2236 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2237 {
2238 	struct netmap_adapter *na = NA(ifp);
2239 	struct netmap_kring *kring;
2240 	u_int len = MBUF_LEN(m);
2241 	u_int error = ENOBUFS;
2242 	struct mbq *q;
2243 	int space;
2244 
2245 	// XXX [Linux] we do not need this lock
2246 	// if we follow the down/configure/up protocol -gl
2247 	// mtx_lock(&na->core_lock);
2248 
2249 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2250 		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
2251 		error = ENXIO;
2252 		goto done;
2253 	}
2254 
2255 	kring = &na->rx_rings[na->num_rx_rings];
2256 	q = &kring->rx_queue;
2257 
2258 	// XXX reconsider long packets if we handle fragments
2259 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2260 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2261 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2262 		goto done;
2263 	}
2264 
2265 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2266 	 * and maybe other instances of netmap_transmit (the latter
2267 	 * not possible on Linux).
2268 	 * Also avoid overflowing the queue.
2269 	 */
2270 	mtx_lock(&q->lock);
2271 
2272         space = kring->nr_hwtail - kring->nr_hwcur;
2273         if (space < 0)
2274                 space += kring->nkr_num_slots;
2275 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2276 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2277 			 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2278 			len, m);
2279 	} else {
2280 		mbq_enqueue(q, m);
2281 		ND(10, "%s %d bufs in queue len %d m %p",
2282 			NM_IFPNAME(ifp), mbq_len(q), len, m);
2283 		/* notify outside the lock */
2284 		m = NULL;
2285 		error = 0;
2286 	}
2287 	mtx_unlock(&q->lock);
2288 
2289 done:
2290 	if (m)
2291 		m_freem(m);
2292 	/* unconditionally wake up listeners */
2293 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2294 
2295 	return (error);
2296 }
2297 
2298 
2299 /*
2300  * netmap_reset() is called by the driver routines when reinitializing
2301  * a ring. The driver is in charge of locking to protect the kring.
2302  * If native netmap mode is not set just return NULL.
2303  */
2304 struct netmap_slot *
2305 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2306 	u_int new_cur)
2307 {
2308 	struct netmap_kring *kring;
2309 	int new_hwofs, lim;
2310 
2311 	if (na == NULL) {
2312 		D("NULL na, should not happen");
2313 		return NULL;	/* no netmap support here */
2314 	}
2315 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
2316 		ND("interface not in netmap mode");
2317 		return NULL;	/* nothing to reinitialize */
2318 	}
2319 
2320 	/* XXX note- in the new scheme, we are not guaranteed to be
2321 	 * under lock (e.g. when called on a device reset).
2322 	 * In this case, we should set a flag and do not trust too
2323 	 * much the values. In practice: TODO
2324 	 * - set a RESET flag somewhere in the kring
2325 	 * - do the processing in a conservative way
2326 	 * - let the *sync() fixup at the end.
2327 	 */
2328 	if (tx == NR_TX) {
2329 		if (n >= na->num_tx_rings)
2330 			return NULL;
2331 		kring = na->tx_rings + n;
2332 		// XXX check whether we should use hwcur or rcur
2333 		new_hwofs = kring->nr_hwcur - new_cur;
2334 	} else {
2335 		if (n >= na->num_rx_rings)
2336 			return NULL;
2337 		kring = na->rx_rings + n;
2338 		new_hwofs = kring->nr_hwtail - new_cur;
2339 	}
2340 	lim = kring->nkr_num_slots - 1;
2341 	if (new_hwofs > lim)
2342 		new_hwofs -= lim + 1;
2343 
2344 	/* Always set the new offset value and realign the ring. */
2345 	if (netmap_verbose)
2346 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2347 		NM_IFPNAME(na->ifp),
2348 		tx == NR_TX ? "TX" : "RX", n,
2349 		kring->nkr_hwofs, new_hwofs,
2350 		kring->nr_hwtail,
2351 		tx == NR_TX ? lim : kring->nr_hwtail);
2352 	kring->nkr_hwofs = new_hwofs;
2353 	if (tx == NR_TX) {
2354 		kring->nr_hwtail = kring->nr_hwcur + lim;
2355 		if (kring->nr_hwtail > lim)
2356 			kring->nr_hwtail -= lim + 1;
2357 	}
2358 
2359 #if 0 // def linux
2360 	/* XXX check that the mappings are correct */
2361 	/* need ring_nr, adapter->pdev, direction */
2362 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2363 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2364 		D("error mapping rx netmap buffer %d", i);
2365 		// XXX fix error handling
2366 	}
2367 
2368 #endif /* linux */
2369 	/*
2370 	 * Wakeup on the individual and global selwait
2371 	 * We do the wakeup here, but the ring is not yet reconfigured.
2372 	 * However, we are under lock so there are no races.
2373 	 */
2374 	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
2375 	return kring->ring->slot;
2376 }
2377 
2378 
2379 /*
2380  * Dispatch rx/tx interrupts to the netmap rings.
2381  *
2382  * "work_done" is non-null on the RX path, NULL for the TX path.
2383  * We rely on the OS to make sure that there is only one active
2384  * instance per queue, and that there is appropriate locking.
2385  *
2386  * The 'notify' routine depends on what the ring is attached to.
2387  * - for a netmap file descriptor, do a selwakeup on the individual
2388  *   waitqueue, plus one on the global one if needed
2389  * - for a switch, call the proper forwarding routine
2390  * - XXX more ?
2391  */
2392 void
2393 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2394 {
2395 	struct netmap_adapter *na = NA(ifp);
2396 	struct netmap_kring *kring;
2397 
2398 	q &= NETMAP_RING_MASK;
2399 
2400 	if (netmap_verbose) {
2401 	        RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
2402 	}
2403 
2404 	if (work_done) { /* RX path */
2405 		if (q >= na->num_rx_rings)
2406 			return;	// not a physical queue
2407 		kring = na->rx_rings + q;
2408 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2409 		na->nm_notify(na, q, NR_RX,
2410 			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2411 		*work_done = 1; /* do not fire napi again */
2412 	} else { /* TX path */
2413 		if (q >= na->num_tx_rings)
2414 			return;	// not a physical queue
2415 		kring = na->tx_rings + q;
2416 		na->nm_notify(na, q, NR_TX,
2417 			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2418 	}
2419 }
2420 
2421 
2422 /*
2423  * Default functions to handle rx/tx interrupts from a physical device.
2424  * "work_done" is non-null on the RX path, NULL for the TX path.
2425  *
2426  * If the card is not in netmap mode, simply return 0,
2427  * so that the caller proceeds with regular processing.
2428  * Otherwise call netmap_common_irq() and return 1.
2429  *
2430  * If the card is connected to a netmap file descriptor,
2431  * do a selwakeup on the individual queue, plus one on the global one
2432  * if needed (multiqueue card _and_ there are multiqueue listeners),
2433  * and return 1.
2434  *
2435  * Finally, if called on rx from an interface connected to a switch,
2436  * calls the proper forwarding routine, and return 1.
2437  */
2438 int
2439 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2440 {
2441 	// XXX could we check NAF_NATIVE_ON ?
2442 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2443 		return 0;
2444 
2445 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2446 		ND("use regular interrupt");
2447 		return 0;
2448 	}
2449 
2450 	netmap_common_irq(ifp, q, work_done);
2451 	return 1;
2452 }
2453 
2454 
2455 /*
2456  * Module loader and unloader
2457  *
2458  * netmap_init() creates the /dev/netmap device and initializes
2459  * all global variables. Returns 0 on success, errno on failure
2460  * (but there is no chance)
2461  *
2462  * netmap_fini() destroys everything.
2463  */
2464 
2465 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2466 extern struct cdevsw netmap_cdevsw;
2467 
2468 
2469 void
2470 netmap_fini(void)
2471 {
2472 	// XXX destroy_bridges() ?
2473 	if (netmap_dev)
2474 		destroy_dev(netmap_dev);
2475 	netmap_mem_fini();
2476 	NMG_LOCK_DESTROY();
2477 	printf("netmap: unloaded module.\n");
2478 }
2479 
2480 
2481 int
2482 netmap_init(void)
2483 {
2484 	int error;
2485 
2486 	NMG_LOCK_INIT();
2487 
2488 	error = netmap_mem_init();
2489 	if (error != 0)
2490 		goto fail;
2491 	/* XXX could use make_dev_credv() to get error number */
2492 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2493 			      "netmap");
2494 	if (!netmap_dev)
2495 		goto fail;
2496 
2497 	netmap_init_bridges();
2498 	printf("netmap: loaded module\n");
2499 	return (0);
2500 fail:
2501 	netmap_fini();
2502 	return (EINVAL); /* may be incorrect */
2503 }
2504