xref: /freebsd-14.2/sys/dev/netmap/netmap.c (revision 8947c298)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    a select()able file descriptor on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   protecting multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring or deleting a port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
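
/*
 * A minimal userspace sketch of steps 1..6 above (illustration only, not
 * part of this kernel module; it relies on the public macros/helpers from
 * net/netmap_user.h, omits error handling and the standard includes, and
 * "em0" plus build_frame() are just placeholders):
 *
 *	#include <net/netmap_user.h>
 *
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	void *mem = mmap(NULL, req.nr_memsize,			// step 3
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
 *	while (!nm_ring_empty(ring)) {				// step 4
 *		struct netmap_slot *slot = &ring->slot[ring->cur];
 *		char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *		slot->len = build_frame(buf);			// user-supplied
 *		ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *	}
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);					// step 6
 */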
126 
127 
128 /* --- internals ----
129  *
130  * Roadmap to the code that implements the above.
131  *
132  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
133  * >    a select()able file descriptor on which events are reported.
134  *
135  *  	Internally, we allocate a netmap_priv_d structure that will be
136  *  	initialized on ioctl(NIOCREGIF).
137  *
138  *      os-specific:
139  *  	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140  *  		     per-thread.
141  *  	    linux:   linux_netmap_open (netmap_linux.c). The priv is
142  *  		     per-open.
143  *
144  * > 2. on each descriptor, the process issues an ioctl() to identify
145  * >    the interface that should report events to the file descriptor.
146  *
147  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148  * 	Most important things happen in netmap_get_na() and
149  * 	netmap_do_regif(), called from there. Additional details can be
150  * 	found in the comments above those functions.
151  *
152  * 	In all cases, this action creates/takes-a-reference-to a
153  * 	netmap_*_adapter describing the port, and allocates a netmap_if
154  * 	and all necessary netmap rings, filling them with netmap buffers.
155  *
156  *      In this phase, the sync callbacks for each ring are set (these are used
157  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
158  *      The adapter creation/initialization code puts them in the
159  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
160  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
161  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
162  * 	actually call netmap_krings_create() to perform this and the other
163  * 	common stuff. netmap_krings_create() also takes care of the host rings,
164  * 	if needed, by setting their sync callbacks appropriately.
165  *
166  * 	Additional actions depend on the kind of netmap_adapter that has been
167  * 	registered:
168  *
169  * 	- netmap_hw_adapter:  	     [netmap.c]
170  * 	     This is a system netdev/ifp with native netmap support.
171  * 	     The ifp is detached from the host stack by redirecting:
172  * 	       - transmissions (from the network stack) to netmap_transmit()
173  * 	       - receive notifications to the nm_notify() callback for
174  * 	         this adapter. The callback is normally netmap_notify(), unless
175  * 	         the ifp is attached to a bridge using bwrap, in which case it
176  * 	         is netmap_bwrap_intr_notify().
177  *
178  * 	- netmap_generic_adapter:      [netmap_generic.c]
179  * 	      A system netdev/ifp without native netmap support.
180  *
181  * 	(the decision about native/non native support is taken in
182  * 	 netmap_get_hw_na(), called by netmap_get_na())
183  *
184  * 	- netmap_vp_adapter 		[netmap_vale.c]
185  * 	      Returned by netmap_get_bdg_na().
186  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
187  * 	      are created on the fly if they don't already exist, and are
188  * 	      always attached to a bridge.
189  * 	      Persistent VALE ports must be created separately, and
190  * 	      then attached like normal NICs. The NIOCREGIF we are examining
191  * 	      will find them only if they had previously been created and
192  * 	      attached (see VALE_CTL below).
193  *
194  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
195  * 	      Returned by netmap_get_pipe_na().
196  * 	      Both pipe ends are created, if they didn't already exist.
197  *
198  * 	- netmap_monitor_adapter      [netmap_monitor.c]
199  * 	      Returned by netmap_get_monitor_na().
200  * 	      If successful, the nm_sync callbacks of the monitored adapter
201  * 	      will be intercepted by the returned monitor.
202  *
203  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
204  * 	      Cannot be obtained in this way, see VALE_CTL below
205  *
206  *
207  * 	os-specific:
208  * 	    linux: we first go through linux_netmap_ioctl() to
209  * 	           adapt the FreeBSD interface to the linux one.
210  *
211  *
212  * > 3. on each descriptor, the process issues an mmap() request to
213  * >    map the shared memory region within the process' address space.
214  * >    The list of interesting queues is indicated by a location in
215  * >    the shared memory region.
216  *
217  *      os-specific:
218  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
220  *
221  * > 4. using the functions in the netmap(4) userspace API, a process
222  * >    can look up the occupation state of a queue, access memory buffers,
223  * >    and retrieve received packets or enqueue packets to transmit.
224  *
225  * 	these actions do not involve the kernel.
226  *
227  * > 5. using some ioctl()s the process can synchronize the userspace view
228  * >    of the queue with the actual status in the kernel. This includes both
229  * >    receiving the notification of new packets, and transmitting new
230  * >    packets on the output interface.
231  *
232  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
234  * 	structures, as initialized in step 2 and maybe later modified
235  * 	by a monitor. Monitors, however, will always call the original
236  * 	callback before doing anything else.
237  *
238  *
239  * > 6. select() or poll() can be used to wait for events on individual
240  * >    transmit or receive queues (or all queues for a given interface).
241  *
242  * 	Implemented in netmap_poll(). This will call the same nm_sync()
243  * 	callbacks as in step 5 above.
244  *
245  * 	os-specific:
246  * 		linux: we first go through linux_netmap_poll() to adapt
247  * 		       the FreeBSD interface to the linux one.
248  *
249  *
250  *  ----  VALE_CTL -----
251  *
252  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
253  *  nr_cmd in the nmreq structure. These subcommands are handled by
254  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256  *  subcommands, respectively.
257  *
258  *  Any network interface known to the system (including a persistent VALE
259  *  port) can be attached to a VALE switch by issuing the
260  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
262  *  attachment of other interfaces, instead, requires the creation of a
263  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
264  *  netmap mode. This may require the creation of a netmap_generic_adapter if
265  *  we have no native support for the interface, or if generic adapters have
266  *  been forced by sysctl.
267  *
268  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270  *  callback.  In the case of the bwrap, the callback creates the
271  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
272  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274  *  A generic adapter for the wrapped ifp will be created if needed, when
275  *  netmap_get_bdg_na() calls netmap_get_hw_na().
276  *
277  *
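 *  As an illustration only (error handling and the surrounding
 *  open("/dev/netmap") are omitted, and the "vale0:em0" name is just a
 *  placeholder, not a normative recipe), attaching an interface to a VALE
 *  switch from userspace looks roughly like:
 *
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	strncpy(req.nr_name, "vale0:em0", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;		// handled by netmap_bdg_ctl()
 *	ioctl(fd, NIOCREGIF, &req);
 *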
278  *  ---- DATAPATHS -----
279  *
280  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281  *
282  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283  *
284  *    - tx from netmap userspace:
285  *	 concurrently:
286  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287  *                kring->nm_sync() == DEVICE_netmap_txsync()
288  *           2) device interrupt handler
289  *                na->nm_notify()  == netmap_notify()
290  *    - rx from netmap userspace:
291  *       concurrently:
292  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293  *                kring->nm_sync() == DEVICE_netmap_rxsync()
294  *           2) device interrupt handler
295  *                na->nm_notify()  == netmap_notify()
296  *    - rx from host stack
297  *       concurrently:
298  *           1) host stack
299  *                netmap_transmit()
300  *                  na->nm_notify  == netmap_notify()
301  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302  *                kring->nm_sync() == netmap_rxsync_from_host_compat
303  *                  netmap_rxsync_from_host(na, NULL, NULL)
304  *    - tx to host stack
305  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
306  *             kring->nm_sync() == netmap_txsync_to_host_compat
307  *               netmap_txsync_to_host(na)
308  *                 NM_SEND_UP()
309  *                   FreeBSD: na->if_input() == ?? XXX
310  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311  *
312  *
313  *
314  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315  *
316  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
317  *
318  *    - tx from netmap userspace:
319  *       concurrently:
320  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
321  *               kring->nm_sync() == generic_netmap_txsync()
322  *                   linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
323  *                       generic_ndo_start_xmit()
324  *                           orig. dev. start_xmit
325  *                   FreeBSD: na->if_transmit() == orig. dev if_transmit
326  *           2) generic_mbuf_destructor()
327  *                   na->nm_notify() == netmap_notify()
328  *    - rx from netmap userspace:
329  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
330  *               kring->nm_sync() == generic_netmap_rxsync()
331  *                   mbq_safe_dequeue()
332  *           2) device driver
333  *               generic_rx_handler()
334  *                   mbq_safe_enqueue()
335  *                   na->nm_notify() == netmap_notify()
336  *    - rx from host stack:
337  *        concurrently:
338  *           1) host stack
339  *               linux: generic_ndo_start_xmit()
340  *                   netmap_transmit()
341  *               FreeBSD: ifp->if_input() == netmap_transmit
342  *               both:
343  *                       na->nm_notify() == netmap_notify()
344  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
345  *                kring->nm_sync() == netmap_rxsync_from_host_compat
346  *                  netmap_rxsync_from_host(na, NULL, NULL)
347  *    - tx to host stack:
348  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
349  *             kring->nm_sync() == netmap_txsync_to_host_compat
350  *               netmap_txsync_to_host(na)
351  *                 NM_SEND_UP()
352  *                   FreeBSD: na->if_input() == ??? XXX
353  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
354  *
355  *
356  *                           -= VALE =-
357  *
358  *   INCOMING:
359  *
360  *      - VALE ports:
361  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
362  *              kring->nm_sync() == netmap_vp_txsync()
363  *
364  *      - system device with native support:
365  *         from cable:
366  *             interrupt
367  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
368  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
369  *                     netmap_vp_txsync()
370  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
371  *         from host stack:
372  *             netmap_transmit()
373  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
374  *                     kring->nm_sync() == netmap_rxsync_from_host_compat()
375  *                     netmap_vp_txsync()
376  *
377  *      - system device with generic support:
378  *         from device driver:
379  *            generic_rx_handler()
380  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
381  *                     kring->nm_sync() == generic_netmap_rxsync()
382  *                     netmap_vp_txsync()
383  *                     kring->nm_sync() == generic_netmap_rxsync()
384  *         from host stack:
385  *            netmap_transmit()
386  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
387  *                     kring->nm_sync() == netmap_rxsync_from_host_compat()
388  *                     netmap_vp_txsync()
389  *
390  *   (all cases) --> nm_bdg_flush()
391  *                      dest_na->nm_notify() == (see below)
392  *
393  *   OUTGOING:
394  *
395  *      - VALE ports:
396  *         concurrently:
397  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
398  *                    kring->nm_sync() == netmap_vp_rxsync()
399  *             2) from nm_bdg_flush()
400  *                    na->nm_notify() == netmap_notify()
401  *
402  *      - system device with native support:
403  *          to cable:
404  *             na->nm_notify() == netmap_bwrap_notify()
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == DEVICE_netmap_txsync()
407  *                 netmap_vp_rxsync()
408  *          to host stack:
409  *                 netmap_vp_rxsync()
410  *                 kring->nm_sync() == netmap_txsync_to_host_compat
411  *                 netmap_vp_rxsync_locked()
412  *
413  *      - system device with generic adapter:
414  *          to device driver:
415  *             na->nm_notify() == netmap_bwrap_notify()
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == generic_netmap_txsync()
418  *                 netmap_vp_rxsync()
419  *          to host stack:
420  *                 netmap_vp_rxsync()
421  *                 kring->nm_sync() == netmap_txsync_to_host_compat
422  *                 netmap_vp_rxsync()
423  *
424  */
425 
426 /*
427  * OS-specific code that is used only within this file.
428  * Other OS-specific code that must be accessed by drivers
429  * is present in netmap_kern.h
430  */
431 
432 #if defined(__FreeBSD__)
433 #include <sys/cdefs.h> /* prerequisite */
434 #include <sys/types.h>
435 #include <sys/errno.h>
436 #include <sys/param.h>	/* defines used in kernel.h */
437 #include <sys/kernel.h>	/* types used in module initialization */
438 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
439 #include <sys/filio.h>	/* FIONBIO */
440 #include <sys/sockio.h>
441 #include <sys/socketvar.h>	/* struct socket */
442 #include <sys/malloc.h>
443 #include <sys/poll.h>
444 #include <sys/rwlock.h>
445 #include <sys/socket.h> /* sockaddrs */
446 #include <sys/selinfo.h>
447 #include <sys/sysctl.h>
448 #include <sys/jail.h>
449 #include <net/vnet.h>
450 #include <net/if.h>
451 #include <net/if_var.h>
452 #include <net/bpf.h>		/* BIOCIMMEDIATE */
453 #include <machine/bus.h>	/* bus_dmamap_* */
454 #include <sys/endian.h>
455 #include <sys/refcount.h>
456 
457 
458 /* reduce conditional code */
459 // linux API, used for the knlist in FreeBSD
460 /* use a private mutex for the knlist */
461 #define init_waitqueue_head(x) do {			\
462 	struct mtx *m = &(x)->m;			\
463 	mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);	\
464 	knlist_init_mtx(&(x)->si.si_note, m);		\
465     } while (0)
466 
467 #define OS_selrecord(a, b)	selrecord(a, &((b)->si))
468 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
469 
470 #elif defined(linux)
471 
472 #include "bsd_glue.h"
473 
474 
475 
476 #elif defined(__APPLE__)
477 
478 #warning OSX support is only partial
479 #include "osx_glue.h"
480 
481 #else
482 
483 #error	Unsupported platform
484 
485 #endif /* unsupported */
486 
487 /*
488  * common headers
489  */
490 #include <net/netmap.h>
491 #include <dev/netmap/netmap_kern.h>
492 #include <dev/netmap/netmap_mem2.h>
493 
494 
495 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
496 
497 /* user-controlled variables */
498 int netmap_verbose;
499 
500 static int netmap_no_timestamp; /* don't timestamp on rxsync */
501 
502 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
503 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
504     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
505 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
506     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
507 int netmap_mitigate = 1;
508 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
509 int netmap_no_pendintr = 1;
510 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
511     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
512 int netmap_txsync_retry = 2;
513 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
514     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
515 
516 int netmap_adaptive_io = 0;
517 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
518     &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
519 
520 int netmap_flags = 0;	/* debug flags */
521 int netmap_fwd = 0;	/* force transparent mode */
522 
523 /*
524  * netmap_admode selects the netmap mode to use.
525  * Invalid values are reset to NETMAP_ADMODE_BEST
526  */
527 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
528 	NETMAP_ADMODE_NATIVE,	/* either native or none */
529 	NETMAP_ADMODE_GENERIC,	/* force generic */
530 	NETMAP_ADMODE_LAST };
531 static int netmap_admode = NETMAP_ADMODE_BEST;
532 
533 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
534 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
535 int netmap_generic_rings = 1;   /* number of queues in generic. */
536 
537 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
538 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
539 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
540 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
541 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
543 
544 NMG_LOCK_T	netmap_global_lock;
545 
546 /*
547  * mark the ring as stopped, and run through the locks
548  * to make sure other users get to see it.
549  */
550 static void
551 netmap_disable_ring(struct netmap_kring *kr)
552 {
553 	kr->nkr_stopped = 1;
554 	nm_kr_get(kr);
555 	mtx_lock(&kr->q_lock);
556 	mtx_unlock(&kr->q_lock);
557 	nm_kr_put(kr);
558 }
559 
560 /* stop or enable a single ring */
561 void
562 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
563 {
564 	if (stopped)
565 		netmap_disable_ring(NMR(na, t) + ring_id);
566 	else
567 		NMR(na, t)[ring_id].nkr_stopped = 0;
568 }
569 
570 
571 /* stop or enable all the rings of na */
572 void
573 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
574 {
575 	int i;
576 	enum txrx t;
577 
578 	if (!nm_netmap_on(na))
579 		return;
580 
581 	for_rx_tx(t) {
582 		for (i = 0; i < netmap_real_rings(na, t); i++) {
583 			netmap_set_ring(na, i, t, stopped);
584 		}
585 	}
586 }
587 
588 /*
589  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
590  * to finish and prevents any new one from starting.  Call this before turning
591  * netmap mode off, or before removing the hardware rings (e.g., on module
592  * unload).  As a rule of thumb for linux drivers, this should be placed near
593  * each napi_disable().
594  */
595 void
596 netmap_disable_all_rings(struct ifnet *ifp)
597 {
598 	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
599 }
600 
601 /*
602  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
603  * adapter's rings. In linux drivers, this should be placed near each
604  * napi_enable().
605  */
606 void
607 netmap_enable_all_rings(struct ifnet *ifp)
608 {
609 	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
610 }
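
/*
 * A hypothetical driver-side sketch of how the two helpers above are meant
 * to be used around a device reinit path (DRIVER_reinit_locked() and 'sc'
 * are placeholders for the driver's own reset routine and softc):
 *
 *	netmap_disable_all_rings(ifp);	// wait for pending *_sync()s to drain
 *	DRIVER_reinit_locked(sc);	// reset/reprogram the hardware rings
 *	netmap_enable_all_rings(ifp);	// allow new txsync/rxsync again
 */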
611 
612 
613 /*
614  * generic bounds-checking function
615  */
616 u_int
617 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
618 {
619 	u_int oldv = *v;
620 	const char *op = NULL;
621 
622 	if (dflt < lo)
623 		dflt = lo;
624 	if (dflt > hi)
625 		dflt = hi;
626 	if (oldv < lo) {
627 		*v = dflt;
628 		op = "Bump";
629 	} else if (oldv > hi) {
630 		*v = hi;
631 		op = "Clamp";
632 	}
633 	if (op && msg)
634 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
635 	return *v;
636 }
637 
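/*
 * Example use of the helper above (the range and the variable are
 * illustrative only): clamp a tunable into a sane range before using it,
 * logging any adjustment:
 *
 *	u_int ringsize = requested_ringsize;	// e.g. taken from a sysctl
 *	nm_bound_var(&ringsize, 1024, 64, 16384, "ring size");
 */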
638 
639 /*
640  * packet-dump function, user-supplied or static buffer.
641  * The destination buffer must be at least 30+4*len
642  */
643 const char *
644 nm_dump_buf(char *p, int len, int lim, char *dst)
645 {
646 	static char _dst[8192];
647 	int i, j, i0;
648 	static char hex[] ="0123456789abcdef";
649 	char *o;	/* output position */
650 
651 #define P_HI(x)	hex[((x) & 0xf0)>>4]
652 #define P_LO(x)	hex[((x) & 0xf)]
653 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
654 	if (!dst)
655 		dst = _dst;
656 	if (lim <= 0 || lim > len)
657 		lim = len;
658 	o = dst;
659 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
660 	o += strlen(o);
661 	/* hexdump routine */
662 	for (i = 0; i < lim; ) {
663 		sprintf(o, "%5d: ", i);
664 		o += strlen(o);
665 		memset(o, ' ', 48);
666 		i0 = i;
667 		for (j=0; j < 16 && i < lim; i++, j++) {
668 			o[j*3] = P_HI(p[i]);
669 			o[j*3+1] = P_LO(p[i]);
670 		}
671 		i = i0;
672 		for (j=0; j < 16 && i < lim; i++, j++)
673 			o[j + 48] = P_C(p[i]);
674 		o[j+48] = '\n';
675 		o += j+49;
676 	}
677 	*o = '\0';
678 #undef P_HI
679 #undef P_LO
680 #undef P_C
681 	return dst;
682 }
683 
684 
685 /*
686  * Fetch configuration from the device, to cope with dynamic
687  * reconfigurations after loading the module.
688  */
689 /* call with NMG_LOCK held */
690 int
691 netmap_update_config(struct netmap_adapter *na)
692 {
693 	u_int txr, txd, rxr, rxd;
694 
695 	txr = txd = rxr = rxd = 0;
696 	if (na->nm_config == NULL ||
697 	    na->nm_config(na, &txr, &txd, &rxr, &rxd))
698 	{
699 		/* take whatever we had at init time */
700 		txr = na->num_tx_rings;
701 		txd = na->num_tx_desc;
702 		rxr = na->num_rx_rings;
703 		rxd = na->num_rx_desc;
704 	}
705 
706 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
707 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
708 		return 0; /* nothing changed */
709 	if (netmap_verbose || na->active_fds > 0) {
710 		D("stored config %s: txring %d x %d, rxring %d x %d",
711 			na->name,
712 			na->num_tx_rings, na->num_tx_desc,
713 			na->num_rx_rings, na->num_rx_desc);
714 		D("new config %s: txring %d x %d, rxring %d x %d",
715 			na->name, txr, txd, rxr, rxd);
716 	}
717 	if (na->active_fds == 0) {
718 		D("configuration changed (but fine)");
719 		na->num_tx_rings = txr;
720 		na->num_tx_desc = txd;
721 		na->num_rx_rings = rxr;
722 		na->num_rx_desc = rxd;
723 		return 0;
724 	}
725 	D("configuration changed while active, this is bad...");
726 	return 1;
727 }
728 
729 static void netmap_txsync_to_host(struct netmap_adapter *na);
730 static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
731 
732 /* kring->nm_sync callback for the host tx ring */
733 static int
734 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
735 {
736 	(void)flags; /* unused */
737 	netmap_txsync_to_host(kring->na);
738 	return 0;
739 }
740 
741 /* kring->nm_sync callback for the host rx ring */
742 static int
743 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
744 {
745 	(void)flags; /* unused */
746 	netmap_rxsync_from_host(kring->na, NULL, NULL);
747 	return 0;
748 }
749 
750 
751 
752 /* create the krings array and initialize the fields common to all adapters.
753  * The array layout is this:
754  *
755  *                    +----------+
756  * na->tx_rings ----->|          | \
757  *                    |          |  } na->num_tx_rings
758  *                    |          | /
759  *                    +----------+
760  *                    |          |    host tx kring
761  * na->rx_rings ----> +----------+
762  *                    |          | \
763  *                    |          |  } na->num_rx_rings
764  *                    |          | /
765  *                    +----------+
766  *                    |          |    host rx kring
767  *                    +----------+
768  * na->tailroom ----->|          | \
769  *                    |          |  } tailroom bytes
770  *                    |          | /
771  *                    +----------+
772  *
773  * Note: for compatibility, host krings are created even when not needed.
774  * The tailroom space is currently used by vale ports for allocating leases.
775  */
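/*
 * Worked example of the layout above (ring counts are illustrative):
 * with num_tx_rings = 4 and num_rx_rings = 2 the array holds 4+1 tx
 * krings followed by 2+1 rx krings, so the host tx kring is
 * na->tx_rings[4], na->rx_rings == na->tx_rings + 5, the host rx kring
 * is na->rx_rings[2], and na->tailroom == na->rx_rings + 3.
 */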
776 /* call with NMG_LOCK held */
777 int
778 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
779 {
780 	u_int i, len, ndesc;
781 	struct netmap_kring *kring;
782 	u_int n[NR_TXRX];
783 	enum txrx t;
784 
785 	/* account for the (possibly fake) host rings */
786 	n[NR_TX] = na->num_tx_rings + 1;
787 	n[NR_RX] = na->num_rx_rings + 1;
788 
789 	len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom;
790 
791 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
792 	if (na->tx_rings == NULL) {
793 		D("Cannot allocate krings");
794 		return ENOMEM;
795 	}
796 	na->rx_rings = na->tx_rings + n[NR_TX];
797 
798 	/*
799 	 * All fields in krings are 0 except the ones initialized below,
800 	 * but better be explicit on important kring fields.
801 	 */
802 	for_rx_tx(t) {
803 		ndesc = nma_get_ndesc(na, t);
804 		for (i = 0; i < n[t]; i++) {
805 			kring = &NMR(na, t)[i];
806 			bzero(kring, sizeof(*kring));
807 			kring->na = na;
808 			kring->ring_id = i;
809 			kring->tx = t;
810 			kring->nkr_num_slots = ndesc;
811 			if (i < nma_get_nrings(na, t)) {
812 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
813 			} else if (i == na->num_tx_rings) {
814 				kring->nm_sync = (t == NR_TX ?
815 						netmap_txsync_to_host_compat :
816 						netmap_rxsync_from_host_compat);
817 			}
818 			kring->nm_notify = na->nm_notify;
819 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
820 			/*
821 			 * IMPORTANT: Always keep one slot empty.
822 			 */
823 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
824 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
825 					nm_txrx2str(t), i);
826 			ND("ktx %s h %d c %d t %d",
827 				kring->name, kring->rhead, kring->rcur, kring->rtail);
828 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
829 			init_waitqueue_head(&kring->si);
830 		}
831 		init_waitqueue_head(&na->si[t]);
832 	}
833 
834 	na->tailroom = na->rx_rings + n[NR_RX];
835 
836 	return 0;
837 }
838 
839 
840 #ifdef __FreeBSD__
841 static void
842 netmap_knlist_destroy(NM_SELINFO_T *si)
843 {
844 	/* XXX kqueue(9) needed; these will mirror knlist_init. */
845 	knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
846 	knlist_destroy(&si->si.si_note);
847 	/* now we don't need the mutex anymore */
848 	mtx_destroy(&si->m);
849 }
850 #endif /* __FreeBSD__ */
851 
852 
853 /* undo the actions performed by netmap_krings_create */
854 /* call with NMG_LOCK held */
855 void
856 netmap_krings_delete(struct netmap_adapter *na)
857 {
858 	struct netmap_kring *kring = na->tx_rings;
859 	enum txrx t;
860 
861 	for_rx_tx(t)
862 		netmap_knlist_destroy(&na->si[t]);
863 
864 	/* we rely on the krings layout described above */
865 	for ( ; kring != na->tailroom; kring++) {
866 		mtx_destroy(&kring->q_lock);
867 		netmap_knlist_destroy(&kring->si);
868 	}
869 	free(na->tx_rings, M_DEVBUF);
870 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
871 }
872 
873 
874 /*
875  * Destructor for NIC ports. They also have an mbuf queue
876  * on the rings connected to the host so we need to purge
877  * them first.
878  */
879 /* call with NMG_LOCK held */
880 static void
881 netmap_hw_krings_delete(struct netmap_adapter *na)
882 {
883 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
884 
885 	ND("destroy sw mbq with len %d", mbq_len(q));
886 	mbq_purge(q);
887 	mbq_safe_destroy(q);
888 	netmap_krings_delete(na);
889 }
890 
891 
892 
893 /*
894  * Undo everything that was done in netmap_do_regif(). In particular,
895  * call nm_register(ifp,0) to stop netmap mode on the interface and
896  * revert to normal operation.
897  */
898 /* call with NMG_LOCK held */
899 static void netmap_unset_ringid(struct netmap_priv_d *);
900 static void netmap_rel_exclusive(struct netmap_priv_d *);
901 static void
902 netmap_do_unregif(struct netmap_priv_d *priv)
903 {
904 	struct netmap_adapter *na = priv->np_na;
905 
906 	NMG_LOCK_ASSERT();
907 	na->active_fds--;
908 	/* release exclusive use if it was requested on regif */
909 	netmap_rel_exclusive(priv);
910 	if (na->active_fds <= 0) {	/* last instance */
911 
912 		if (netmap_verbose)
913 			D("deleting last instance for %s", na->name);
914 
915 #ifdef	WITH_MONITOR
916 		/* walk through all the rings and tell any monitor
917 		 * that the port is going to exit netmap mode
918 		 */
919 		netmap_monitor_stop(na);
920 #endif
921 		/*
922 		 * (TO CHECK) This function is only called
923 		 * when the last reference to this file descriptor goes
924 		 * away. This means we cannot have any pending poll()
925 		 * or interrupt routine operating on the structure.
926 		 * XXX The file may be closed in a thread while
927 		 * another thread is using it.
928 		 * Linux keeps the file opened until the last reference
929 		 * by any outstanding ioctl/poll or mmap is gone.
930 		 * FreeBSD does not track mmap()s (but we do) and
931 		 * wakes up any sleeping poll(). Need to check what
932 		 * happens if the close() occurs while a concurrent
933 		 * syscall is running.
934 		 */
935 		na->nm_register(na, 0); /* off, clear flags */
936 		/* Wake up any sleeping threads. netmap_poll will
937 		 * then return POLLERR
938 		 * XXX The wake up now must happen during *_down(), when
939 		 * we order all activities to stop. -gl
940 		 */
941 		/* delete rings and buffers */
942 		netmap_mem_rings_delete(na);
943 		na->nm_krings_delete(na);
944 	}
945 	/* possibly decrement counter of tx_si/rx_si users */
946 	netmap_unset_ringid(priv);
947 	/* delete the nifp */
948 	netmap_mem_if_delete(na, priv->np_nifp);
949 	/* drop the allocator */
950 	netmap_mem_deref(na->nm_mem, na);
951 	/* mark the priv as unregistered */
952 	priv->np_na = NULL;
953 	priv->np_nifp = NULL;
954 }
955 
956 /* call with NMG_LOCK held */
957 static __inline int
958 nm_si_user(struct netmap_priv_d *priv, enum txrx t)
959 {
960 	return (priv->np_na != NULL &&
961 		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
962 }
963 
964 /*
965  * Destructor of the netmap_priv_d, called when the fd is closed.
966  * Action: undo all the things done by NIOCREGIF.
967  * On FreeBSD we need to track whether there are active mmap()s,
968  * and we use np_active_mmaps for that. On linux, the field is always 0.
969  * Return: 1 if we can free priv, 0 otherwise.
970  *
971  */
972 /* call with NMG_LOCK held */
973 int
974 netmap_dtor_locked(struct netmap_priv_d *priv)
975 {
976 	struct netmap_adapter *na = priv->np_na;
977 
978 	/* number of active mmaps on this fd (FreeBSD only) */
979 	if (--priv->np_refs > 0) {
980 		return 0;
981 	}
982 
983 	if (!na) {
984 		return 1; //XXX is it correct?
985 	}
986 	netmap_do_unregif(priv);
987 	netmap_adapter_put(na);
988 	return 1;
989 }
990 
991 
992 /* call with NMG_LOCK *not* held */
993 void
994 netmap_dtor(void *data)
995 {
996 	struct netmap_priv_d *priv = data;
997 	int last_instance;
998 
999 	NMG_LOCK();
1000 	last_instance = netmap_dtor_locked(priv);
1001 	NMG_UNLOCK();
1002 	if (last_instance) {
1003 		bzero(priv, sizeof(*priv));	/* for safety */
1004 		free(priv, M_DEVBUF);
1005 	}
1006 }
1007 
1008 
1009 
1010 
1011 /*
1012  * Handlers for synchronization of the queues from/to the host.
1013  * Netmap has two operating modes:
1014  * - in the default mode, the rings connected to the host stack are
1015  *   just another ring pair managed by userspace;
1016  * - in transparent mode (XXX to be defined) incoming packets
1017  *   (from the host or the NIC) are marked as NS_FORWARD upon
1018  *   arrival, and the user application has a chance to reset the
1019  *   flag for packets that should be dropped.
1020  *   On the RXSYNC or poll(), packets in RX rings between
1021  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1022  *   to the other side.
1023  * The transfer NIC --> host is relatively easy, just encapsulate
1024  * into mbufs and we are done. The host --> NIC side is slightly
1025  * harder because there might not be room in the tx ring so it
1026  * might take a while before releasing the buffer.
1027  */
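
/*
 * Userspace sketch of the transparent mode just described (illustration
 * only; "ring" is assumed to be the host RX ring obtained with
 * NETMAP_RXRING(), and should_forward() is a placeholder filter):
 *
 *	ring->flags |= NR_FORWARD;	// or set the dev.netmap.fwd sysctl
 *	while (!nm_ring_empty(ring)) {
 *		struct netmap_slot *slot = &ring->slot[ring->cur];
 *		if (!should_forward(NETMAP_BUF(ring, slot->buf_idx), slot->len))
 *			slot->flags &= ~NS_FORWARD;	// this packet is dropped
 *		ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);	// released slots still marked
 *					// NS_FORWARD go to the other side
 */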
1028 
1029 
1030 /*
1031  * pass a chain of buffers to the host stack as coming from 'dst'
1032  * We do not need to lock because the queue is private.
1033  */
1034 static void
1035 netmap_send_up(struct ifnet *dst, struct mbq *q)
1036 {
1037 	struct mbuf *m;
1038 
1039 	/* send packets up, outside the lock */
1040 	while ((m = mbq_dequeue(q)) != NULL) {
1041 		if (netmap_verbose & NM_VERB_HOST)
1042 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1043 		NM_SEND_UP(dst, m);
1044 	}
1045 	mbq_destroy(q);
1046 }
1047 
1048 
1049 /*
1050  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1051  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1052  * and pass them up. Drop remaining packets in the unlikely event
1053  * of an mbuf shortage.
1054  */
1055 static void
1056 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1057 {
1058 	u_int const lim = kring->nkr_num_slots - 1;
1059 	u_int const head = kring->rhead;
1060 	u_int n;
1061 	struct netmap_adapter *na = kring->na;
1062 
1063 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1064 		struct mbuf *m;
1065 		struct netmap_slot *slot = &kring->ring->slot[n];
1066 
1067 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1068 			continue;
1069 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1070 			RD(5, "bad pkt at %d len %d", n, slot->len);
1071 			continue;
1072 		}
1073 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1074 		/* XXX TODO: adapt to the case of a multisegment packet */
1075 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1076 
1077 		if (m == NULL)
1078 			break;
1079 		mbq_enqueue(q, m);
1080 	}
1081 }
1082 
1083 
1084 /*
1085  * Send to the NIC rings packets marked NS_FORWARD between
1086  * kring->nr_hwcur and kring->rhead
1087  * Called under kring->rx_queue.lock on the sw rx ring,
1088  */
1089 static u_int
1090 netmap_sw_to_nic(struct netmap_adapter *na)
1091 {
1092 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1093 	struct netmap_slot *rxslot = kring->ring->slot;
1094 	u_int i, rxcur = kring->nr_hwcur;
1095 	u_int const head = kring->rhead;
1096 	u_int const src_lim = kring->nkr_num_slots - 1;
1097 	u_int sent = 0;
1098 
1099 	/* scan rings to find space, then fill as much as possible */
1100 	for (i = 0; i < na->num_tx_rings; i++) {
1101 		struct netmap_kring *kdst = &na->tx_rings[i];
1102 		struct netmap_ring *rdst = kdst->ring;
1103 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1104 
1105 		/* XXX do we trust ring or kring->rcur,rtail ? */
1106 		for (; rxcur != head && !nm_ring_empty(rdst);
1107 		     rxcur = nm_next(rxcur, src_lim) ) {
1108 			struct netmap_slot *src, *dst, tmp;
1109 			u_int dst_cur = rdst->cur;
1110 
1111 			src = &rxslot[rxcur];
1112 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1113 				continue;
1114 
1115 			sent++;
1116 
1117 			dst = &rdst->slot[dst_cur];
1118 
1119 			tmp = *src;
1120 
1121 			src->buf_idx = dst->buf_idx;
1122 			src->flags = NS_BUF_CHANGED;
1123 
1124 			dst->buf_idx = tmp.buf_idx;
1125 			dst->len = tmp.len;
1126 			dst->flags = NS_BUF_CHANGED;
1127 
1128 			rdst->cur = nm_next(dst_cur, dst_lim);
1129 		}
1130 		/* if (sent) XXX txsync ? */
1131 	}
1132 	return sent;
1133 }
1134 
1135 
1136 /*
1137  * netmap_txsync_to_host() passes packets up. We are called from a
1138  * system call in user process context, and the only contention
1139  * can be among multiple user threads erroneously calling
1140  * this routine concurrently.
1141  */
1142 static void
1143 netmap_txsync_to_host(struct netmap_adapter *na)
1144 {
1145 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1146 	u_int const lim = kring->nkr_num_slots - 1;
1147 	u_int const head = kring->rhead;
1148 	struct mbq q;
1149 
1150 	/* Take packets from hwcur to head and pass them up.
1151 	 * force head = cur since netmap_grab_packets() stops at head
1152 	 * In case of no buffers we give up. At the end of the loop,
1153 	 * the queue is drained in all cases.
1154 	 */
1155 	mbq_init(&q);
1156 	netmap_grab_packets(kring, &q, 1 /* force */);
1157 	ND("have %d pkts in queue", mbq_len(&q));
1158 	kring->nr_hwcur = head;
1159 	kring->nr_hwtail = head + lim;
1160 	if (kring->nr_hwtail > lim)
1161 		kring->nr_hwtail -= lim + 1;
1162 
1163 	netmap_send_up(na->ifp, &q);
1164 }
1165 
1166 
1167 /*
1168  * rxsync backend for packets coming from the host stack.
1169  * They have been put in kring->rx_queue by netmap_transmit().
1170  * We protect access to the kring using kring->rx_queue.lock
1171  *
1172  * This routine also does the selrecord if called from the poll handler
1173  * (we know because td != NULL).
1174  *
1175  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1176  *     as an additional hidden argument.
1177  * returns the number of packets delivered to tx queues in
1178  * transparent mode, or a negative value if error
1179  */
1180 static int
1181 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1182 {
1183 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1184 	struct netmap_ring *ring = kring->ring;
1185 	u_int nm_i, n;
1186 	u_int const lim = kring->nkr_num_slots - 1;
1187 	u_int const head = kring->rhead;
1188 	int ret = 0;
1189 	struct mbq *q = &kring->rx_queue, fq;
1190 
1191 	(void)pwait;	/* disable unused warnings */
1192 	(void)td;
1193 
1194 	mbq_init(&fq); /* fq holds packets to be freed */
1195 
1196 	mbq_lock(q);
1197 
1198 	/* First part: import newly received packets */
1199 	n = mbq_len(q);
1200 	if (n) { /* grab packets from the queue */
1201 		struct mbuf *m;
1202 		uint32_t stop_i;
1203 
1204 		nm_i = kring->nr_hwtail;
1205 		stop_i = nm_prev(nm_i, lim);
1206 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1207 			int len = MBUF_LEN(m);
1208 			struct netmap_slot *slot = &ring->slot[nm_i];
1209 
1210 			m_copydata(m, 0, len, NMB(na, slot));
1211 			ND("nm %d len %d", nm_i, len);
1212 			if (netmap_verbose)
1213                                 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1214 
1215 			slot->len = len;
1216 			slot->flags = kring->nkr_slot_flags;
1217 			nm_i = nm_next(nm_i, lim);
1218 			mbq_enqueue(&fq, m);
1219 		}
1220 		kring->nr_hwtail = nm_i;
1221 	}
1222 
1223 	/*
1224 	 * Second part: skip past packets that userspace has released.
1225 	 */
1226 	nm_i = kring->nr_hwcur;
1227 	if (nm_i != head) { /* something was released */
1228 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1229 			ret = netmap_sw_to_nic(na);
1230 		kring->nr_hwcur = head;
1231 	}
1232 
1233 	/* access copies of cur,tail in the kring */
1234 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1235 		OS_selrecord(td, &kring->si);
1236 
1237 	mbq_unlock(q);
1238 
1239 	mbq_purge(&fq);
1240 	mbq_destroy(&fq);
1241 
1242 	return ret;
1243 }
1244 
1245 
1246 /* Get a netmap adapter for the port.
1247  *
1248  * If it is possible to satisfy the request, return 0
1249  * with *na containing the netmap adapter found.
1250  * Otherwise return an error code, with *na containing NULL.
1251  *
1252  * When the port is attached to a bridge, we always return
1253  * EBUSY.
1254  * Otherwise, if the port is already bound to a file descriptor,
1255  * then we unconditionally return the existing adapter into *na.
1256  * In all the other cases, we return (into *na) either native,
1257  * generic or NULL, according to the following table:
1258  *
1259  *					native_support
1260  * active_fds   dev.netmap.admode         YES     NO
1261  * -------------------------------------------------------
1262  *    >0              *                 NA(ifp) NA(ifp)
1263  *
1264  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1265  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1266  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1267  *
1268  */
1269 
1270 int
1271 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1272 {
1273 	/* generic support */
1274 	int i = netmap_admode;	/* Take a snapshot. */
1275 	struct netmap_adapter *prev_na;
1276 #ifdef WITH_GENERIC
1277 	struct netmap_generic_adapter *gna;
1278 	int error = 0;
1279 #endif
1280 
1281 	*na = NULL; /* default */
1282 
1283 	/* reset in case of invalid value */
1284 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1285 		i = netmap_admode = NETMAP_ADMODE_BEST;
1286 
1287 	if (NETMAP_CAPABLE(ifp)) {
1288 		prev_na = NA(ifp);
1289 		/* If an adapter already exists, return it if
1290 		 * there are active file descriptors or if
1291 		 * netmap is not forced to use generic
1292 		 * adapters.
1293 		 */
1294 		if (NETMAP_OWNED_BY_ANY(prev_na)
1295 			|| i != NETMAP_ADMODE_GENERIC
1296 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1297 #ifdef WITH_PIPES
1298 			/* ugly, but we cannot allow an adapter switch
1299 			 * if some pipe is referring to this one
1300 			 */
1301 			|| prev_na->na_next_pipe > 0
1302 #endif
1303 		) {
1304 			*na = prev_na;
1305 			return 0;
1306 		}
1307 	}
1308 
1309 	/* If there isn't native support and netmap is not allowed
1310 	 * to use generic adapters, we cannot satisfy the request.
1311 	 */
1312 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1313 		return EOPNOTSUPP;
1314 
1315 #ifdef WITH_GENERIC
1316 	/* Otherwise, create a generic adapter and return it,
1317 	 * saving the previously used netmap adapter, if any.
1318 	 *
1319 	 * Note that here 'prev_na', if not NULL, MUST be a
1320 	 * native adapter, and CANNOT be a generic one. This is
1321 	 * true because generic adapters are created on demand, and
1322 	 * destroyed when not used anymore. Therefore, if the adapter
1323 	 * currently attached to an interface 'ifp' is generic, it
1324 	 * must be that
1325 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1326 	 * Consequently, if NA(ifp) is generic, we will enter one of
1327 	 * the branches above. This ensures that we never override
1328 	 * a generic adapter with another generic adapter.
1329 	 */
1330 	prev_na = NA(ifp);
1331 	error = generic_netmap_attach(ifp);
1332 	if (error)
1333 		return error;
1334 
1335 	*na = NA(ifp);
1336 	gna = (struct netmap_generic_adapter*)NA(ifp);
1337 	gna->prev = prev_na; /* save old na */
1338 	if (prev_na != NULL) {
1339 		ifunit_ref(ifp->if_xname);
1340 		// XXX add a refcount ?
1341 		netmap_adapter_get(prev_na);
1342 	}
1343 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1344 
1345 	return 0;
1346 #else /* !WITH_GENERIC */
1347 	return EOPNOTSUPP;
1348 #endif
1349 }
1350 
1351 
1352 /*
1353  * MUST BE CALLED UNDER NMG_LOCK()
1354  *
1355  * Get a refcounted reference to a netmap adapter attached
1356  * to the interface specified by nmr.
1357  * This is always called in the execution of an ioctl().
1358  *
1359  * Return ENXIO if the interface specified by the request does
1360  * not exist, ENOTSUP if netmap is not supported by the interface,
1361  * EBUSY if the interface is already attached to a bridge,
1362  * EINVAL if parameters are invalid, ENOMEM if needed resources
1363  * could not be allocated.
1364  * If successful, hold a reference to the netmap adapter.
1365  *
1366  * No reference is kept on the real interface, which may then
1367  * disappear at any time.
1368  */
1369 int
1370 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1371 {
1372 	struct ifnet *ifp = NULL;
1373 	int error = 0;
1374 	struct netmap_adapter *ret = NULL;
1375 
1376 	*na = NULL;     /* default return value */
1377 
1378 	NMG_LOCK_ASSERT();
1379 
1380 	/* we cascade through all possible types of netmap adapter.
1381 	 * All netmap_get_*_na() functions return an error and an na,
1382 	 * with the following combinations:
1383 	 *
1384 	 * error    na
1385 	 *   0	   NULL		type doesn't match
1386 	 *  !0	   NULL		type matches, but na creation/lookup failed
1387 	 *   0	  !NULL		type matches and na created/found
1388 	 *  !0    !NULL		impossible
1389 	 */
1390 
1391 	/* try to see if this is a monitor port */
1392 	error = netmap_get_monitor_na(nmr, na, create);
1393 	if (error || *na != NULL)
1394 		return error;
1395 
1396 	/* try to see if this is a pipe port */
1397 	error = netmap_get_pipe_na(nmr, na, create);
1398 	if (error || *na != NULL)
1399 		return error;
1400 
1401 	/* try to see if this is a bridge port */
1402 	error = netmap_get_bdg_na(nmr, na, create);
1403 	if (error)
1404 		return error;
1405 
1406 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1407 		goto out;
1408 
1409 	/*
1410 	 * This must be a hardware na, lookup the name in the system.
1411 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1412 	 * This may still be a tap, a veth/epair, or even a
1413 	 * persistent VALE port.
1414 	 */
1415 	ifp = ifunit_ref(nmr->nr_name);
1416 	if (ifp == NULL) {
1417 	        return ENXIO;
1418 	}
1419 
1420 	error = netmap_get_hw_na(ifp, &ret);
1421 	if (error)
1422 		goto out;
1423 
1424 	*na = ret;
1425 	netmap_adapter_get(ret);
1426 
1427 out:
1428 	if (error && ret != NULL)
1429 		netmap_adapter_put(ret);
1430 
1431 	if (ifp)
1432 		if_rele(ifp); /* allow live unloading of drivers modules */
1433 
1434 	return error;
1435 }
1436 
1437 
1438 /*
1439  * validate parameters on entry for *_txsync()
1440  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1441  * in case of error.
1442  *
1443  * rhead, rcur and rtail=hwtail are stored from previous round.
1444  * hwcur is the next packet to send to the ring.
1445  *
1446  * We want
1447  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1448  *
1449  * hwcur, rhead, rtail and hwtail are reliable
1450  */
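/*
 * Worked example for the checks below (illustrative numbers), on a ring
 * with nkr_num_slots = 8: if hwcur = rhead = 2 and rtail = hwtail = 6,
 * then head = 4, cur = 5 passes (2 <= 4 <= 5 <= 6), while head = 7 or
 * cur = 1 is rejected. When rtail < rhead the same ordering is applied
 * modulo the ring size (the wraparound branches below).
 */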
1451 static u_int
1452 nm_txsync_prologue(struct netmap_kring *kring)
1453 {
1454 #define NM_ASSERT(t) if (t) { D("fail " #t); goto error; }
1455 	struct netmap_ring *ring = kring->ring;
1456 	u_int head = ring->head; /* read only once */
1457 	u_int cur = ring->cur; /* read only once */
1458 	u_int n = kring->nkr_num_slots;
1459 
1460 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1461 		kring->name,
1462 		kring->nr_hwcur, kring->nr_hwtail,
1463 		ring->head, ring->cur, ring->tail);
1464 #if 1 /* kernel sanity checks; but we can trust the kring. */
1465 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1466 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1467 		goto error;
1468 #endif /* kernel sanity checks */
1469 	/*
1470 	 * user sanity checks. We only use 'cur',
1471 	 * A, B, ... are possible positions for cur:
1472 	 *
1473 	 *  0    A  cur   B  tail  C  n-1
1474 	 *  0    D  tail  E  cur   F  n-1
1475 	 *
1476 	 * B, F, D are valid. A, C, E are wrong
1477 	 */
1478 	if (kring->rtail >= kring->rhead) {
1479 		/* want rhead <= head <= rtail */
1480 		NM_ASSERT(head < kring->rhead || head > kring->rtail);
1481 		/* and also head <= cur <= rtail */
1482 		NM_ASSERT(cur < head || cur > kring->rtail);
1483 	} else { /* here rtail < rhead */
1484 		/* we need head outside rtail .. rhead */
1485 		NM_ASSERT(head > kring->rtail && head < kring->rhead);
1486 
1487 		/* two cases now: head <= rtail or head >= rhead  */
1488 		if (head <= kring->rtail) {
1489 			/* want head <= cur <= rtail */
1490 			NM_ASSERT(cur < head || cur > kring->rtail);
1491 		} else { /* head >= rhead */
1492 			/* cur must be outside rtail..head */
1493 			NM_ASSERT(cur > kring->rtail && cur < head);
1494 		}
1495 	}
1496 	if (ring->tail != kring->rtail) {
1497 		RD(5, "tail overwritten was %d need %d",
1498 			ring->tail, kring->rtail);
1499 		ring->tail = kring->rtail;
1500 	}
1501 	kring->rhead = head;
1502 	kring->rcur = cur;
1503 	return head;
1504 
1505 error:
1506 	RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d",
1507 		kring->name,
1508 		head, cur, ring->tail,
1509 		kring->rhead, kring->rcur, kring->rtail,
1510 		kring->nr_hwcur, kring->nr_hwtail);
1511 	return n;
1512 #undef NM_ASSERT
1513 }
1514 
1515 
1516 /*
1517  * validate parameters on entry for *_rxsync()
1518  * Returns ring->head if ok, kring->nkr_num_slots on error.
1519  *
1520  * For a valid configuration,
1521  * hwcur <= head <= cur <= tail <= hwtail
1522  *
1523  * We only consider head and cur.
1524  * hwcur and hwtail are reliable.
1525  *
1526  */
1527 static u_int
1528 nm_rxsync_prologue(struct netmap_kring *kring)
1529 {
1530 	struct netmap_ring *ring = kring->ring;
1531 	uint32_t const n = kring->nkr_num_slots;
1532 	uint32_t head, cur;
1533 
1534 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1535 		kring->name,
1536 		kring->nr_hwcur, kring->nr_hwtail,
1537 		ring->head, ring->cur, ring->tail);
1538 	/*
1539 	 * Before storing the new values, we should check they do not
1540 	 * move backwards. However:
1541 	 * - head is not an issue because the previous value is hwcur;
1542 	 * - cur could in principle go back, however it does not matter
1543 	 *   because we are processing a brand new rxsync()
1544 	 */
1545 	cur = kring->rcur = ring->cur;	/* read only once */
1546 	head = kring->rhead = ring->head;	/* read only once */
1547 #if 1 /* kernel sanity checks */
1548 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1549 		goto error;
1550 #endif /* kernel sanity checks */
1551 	/* user sanity checks */
1552 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1553 		/* want hwcur <= rhead <= hwtail */
1554 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1555 			goto error;
1556 		/* and also rhead <= rcur <= hwtail */
1557 		if (cur < head || cur > kring->nr_hwtail)
1558 			goto error;
1559 	} else {
1560 		/* we need rhead outside hwtail..hwcur */
1561 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1562 			goto error;
1563 		/* two cases now: head <= hwtail or head >= hwcur  */
1564 		if (head <= kring->nr_hwtail) {
1565 			/* want head <= cur <= hwtail */
1566 			if (cur < head || cur > kring->nr_hwtail)
1567 				goto error;
1568 		} else {
1569 			/* cur must be outside hwtail..head */
1570 			if (cur < head && cur > kring->nr_hwtail)
1571 				goto error;
1572 		}
1573 	}
1574 	if (ring->tail != kring->rtail) {
1575 		RD(5, "%s tail overwritten was %d need %d",
1576 			kring->name,
1577 			ring->tail, kring->rtail);
1578 		ring->tail = kring->rtail;
1579 	}
1580 	return head;
1581 
1582 error:
1583 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1584 		kring->nr_hwcur,
1585 		kring->rcur, kring->nr_hwtail,
1586 		kring->rhead, kring->rcur, ring->tail);
1587 	return n;
1588 }
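
/*
 * Illustrative only (userspace side): the matching receive pattern that
 * nm_rxsync_prologue() validates. After an RXSYNC the slots between cur
 * and tail hold new packets; the consumer releases them by advancing
 * head (and cur) past the processed slots:
 *
 *	ioctl(fd, NIOCRXSYNC, NULL);
 *	while (!nm_ring_empty(rxring)) {
 *		struct netmap_slot *slot = &rxring->slot[rxring->cur];
 *		// process slot->len bytes at NETMAP_BUF(rxring, slot->buf_idx)
 *		rxring->head = rxring->cur = nm_ring_next(rxring, rxring->cur);
 *	}
 */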
1589 
1590 
1591 /*
1592  * Error routine called when txsync/rxsync detects an error.
1593  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1594  * Return 1 on reinit.
1595  *
1596  * This routine is only called by the upper half of the kernel.
1597  * It only reads hwcur (which is changed only by the upper half, too)
1598  * and hwtail (which may be changed by the lower half, but only on
1599  * a tx ring and only to increase it, so any error will be recovered
1600  * on the next call). For the above, we don't strictly need to call
1601  * it under lock.
1602  */
1603 int
1604 netmap_ring_reinit(struct netmap_kring *kring)
1605 {
1606 	struct netmap_ring *ring = kring->ring;
1607 	u_int i, lim = kring->nkr_num_slots - 1;
1608 	int errors = 0;
1609 
1610 	// XXX KASSERT nm_kr_tryget
1611 	RD(10, "called for %s", kring->name);
1612 	// XXX probably wrong to trust userspace
1613 	kring->rhead = ring->head;
1614 	kring->rcur  = ring->cur;
1615 	kring->rtail = ring->tail;
1616 
1617 	if (ring->cur > lim)
1618 		errors++;
1619 	if (ring->head > lim)
1620 		errors++;
1621 	if (ring->tail > lim)
1622 		errors++;
1623 	for (i = 0; i <= lim; i++) {
1624 		u_int idx = ring->slot[i].buf_idx;
1625 		u_int len = ring->slot[i].len;
1626 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1627 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1628 			ring->slot[i].buf_idx = 0;
1629 			ring->slot[i].len = 0;
1630 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1631 			ring->slot[i].len = 0;
1632 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1633 		}
1634 	}
1635 	if (errors) {
1636 		RD(10, "total %d errors", errors);
1637 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1638 			kring->name,
1639 			ring->cur, kring->nr_hwcur,
1640 			ring->tail, kring->nr_hwtail);
1641 		ring->head = kring->rhead = kring->nr_hwcur;
1642 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1643 		ring->tail = kring->rtail = kring->nr_hwtail;
1644 	}
1645 	return (errors ? 1 : 0);
1646 }
1647 
1648 /* interpret the ringid and flags fields of an nmreq, by translating them
1649  * into a pair of intervals of ring indices:
1650  *
1651  * [priv->np_qfirst[NR_TX], priv->np_qlast[NR_TX]) and
1652  * [priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX])
1653  *
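 * For example (illustrative), on an adapter with 4 hardware rings:
 * NR_REG_ALL_NIC yields [0,4) in both directions, NR_REG_ONE_NIC with
 * ring id 2 yields [2,3), NR_REG_SW selects only the host rings, [4,5),
 * and NR_REG_NIC_SW selects hardware and host rings together, [0,5).
 *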
1654  */
1655 int
1656 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1657 {
1658 	struct netmap_adapter *na = priv->np_na;
1659 	u_int j, i = ringid & NETMAP_RING_MASK;
1660 	u_int reg = flags & NR_REG_MASK;
1661 	enum txrx t;
1662 
1663 	if (reg == NR_REG_DEFAULT) {
1664 		/* convert from old ringid to flags */
1665 		if (ringid & NETMAP_SW_RING) {
1666 			reg = NR_REG_SW;
1667 		} else if (ringid & NETMAP_HW_RING) {
1668 			reg = NR_REG_ONE_NIC;
1669 		} else {
1670 			reg = NR_REG_ALL_NIC;
1671 		}
1672 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1673 	}
1674 	switch (reg) {
1675 	case NR_REG_ALL_NIC:
1676 	case NR_REG_PIPE_MASTER:
1677 	case NR_REG_PIPE_SLAVE:
1678 		for_rx_tx(t) {
1679 			priv->np_qfirst[t] = 0;
1680 			priv->np_qlast[t] = nma_get_nrings(na, t);
1681 		}
1682 		ND("%s %d %d", "ALL/PIPE",
1683 			priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
1684 		break;
1685 	case NR_REG_SW:
1686 	case NR_REG_NIC_SW:
1687 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1688 			D("host rings not supported");
1689 			return EINVAL;
1690 		}
1691 		for_rx_tx(t) {
1692 			priv->np_qfirst[t] = (reg == NR_REG_SW ?
1693 				nma_get_nrings(na, t) : 0);
1694 			priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
1695 		}
1696 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1697 			priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
1698 		break;
1699 	case NR_REG_ONE_NIC:
1700 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1701 			D("invalid ring id %d", i);
1702 			return EINVAL;
1703 		}
1704 		for_rx_tx(t) {
1705 			/* if not enough rings, use the first one */
1706 			j = i;
1707 			if (j >= nma_get_nrings(na, t))
1708 				j = 0;
1709 			priv->np_qfirst[t] = j;
1710 			priv->np_qlast[t] = j + 1;
1711 		}
1712 		break;
1713 	default:
1714 		D("invalid regif type %d", reg);
1715 		return EINVAL;
1716 	}
1717 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1718 
1719 	if (netmap_verbose) {
1720 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1721 			na->name,
1722 			priv->np_qfirst[NR_TX],
1723 			priv->np_qlast[NR_TX],
1724 			priv->np_qfirst[NR_RX],
1725 			priv->np_qlast[NR_RX],
1726 			i);
1727 	}
1728 	return 0;
1729 }
1730 
1731 
1732 /*
1733  * Set the ring ID. For devices with a single queue, a request
1734  * for all rings is the same as a single ring.
1735  */
1736 static int
1737 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1738 {
1739 	struct netmap_adapter *na = priv->np_na;
1740 	int error;
1741 	enum txrx t;
1742 
1743 	error = netmap_interp_ringid(priv, ringid, flags);
1744 	if (error) {
1745 		return error;
1746 	}
1747 
1748 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1749 
1750 	/* optimization: count the users registered for more than
1751 	 * one ring, which are the ones sleeping on the global queue.
1752 	 * The default netmap_notify() callback will then
1753 	 * avoid signaling the global queue if nobody is using it
1754 	 */
1755 	for_rx_tx(t) {
1756 		if (nm_si_user(priv, t))
1757 			na->si_users[t]++;
1758 	}
1759 	return 0;
1760 }
1761 
1762 static void
1763 netmap_unset_ringid(struct netmap_priv_d *priv)
1764 {
1765 	struct netmap_adapter *na = priv->np_na;
1766 	enum txrx t;
1767 
1768 	for_rx_tx(t) {
1769 		if (nm_si_user(priv, t))
1770 			na->si_users[t]--;
1771 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1772 	}
1773 	priv->np_flags = 0;
1774 	priv->np_txpoll = 0;
1775 }
1776 
1777 
1778 /* check that the rings we want to bind are not exclusively owned by a previous
1779  * bind.  If exclusive ownership has been requested, we also mark the rings.
1780  */
1781 static int
1782 netmap_get_exclusive(struct netmap_priv_d *priv)
1783 {
1784 	struct netmap_adapter *na = priv->np_na;
1785 	u_int i;
1786 	struct netmap_kring *kring;
1787 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1788 	enum txrx t;
1789 
1790 	ND("%s: grabbing tx [%d, %d) rx [%d, %d)",
1791 			na->name,
1792 			priv->np_qfirst[NR_TX],
1793 			priv->np_qlast[NR_TX],
1794 			priv->np_qfirst[NR_RX],
1795 			priv->np_qlast[NR_RX]);
1796 
1797 	/* first round: check that all the requested rings
1798 	 * are not already exclusively owned, and that, if we
1799 	 * ask for exclusive ownership, they are not already in use
1800 	 */
1801 	for_rx_tx(t) {
1802 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1803 			kring = &NMR(na, t)[i];
1804 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1805 			    (kring->users && excl))
1806 			{
1807 				ND("ring %s busy", kring->name);
1808 				return EBUSY;
1809 			}
1810 		}
1811 	}
1812 
1813 	/* second round: increment usage count and possibly
1814 	 * mark as exclusive
1815 	 */
1816 
1817 	for_rx_tx(t) {
1818 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1819 			kring = &NMR(na, t)[i];
1820 			kring->users++;
1821 			if (excl)
1822 				kring->nr_kflags |= NKR_EXCLUSIVE;
1823 		}
1824 	}
1825 
1826 	return 0;
1827 
1828 }
1829 
1830 /* undo netmap_get_exclusive() */
1831 static void
1832 netmap_rel_exclusive(struct netmap_priv_d *priv)
1833 {
1834 	struct netmap_adapter *na = priv->np_na;
1835 	u_int i;
1836 	struct netmap_kring *kring;
1837 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1838 	enum txrx t;
1839 
1840 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1841 			na->name,
1842 			priv->np_qfirst[NR_TX],
1843 			priv->np_qlast[NR_TX],
1844 			priv->np_qfirst[NR_RX],
1845 			priv->np_qlast[NR_RX]);
1846 
1847 
1848 	for_rx_tx(t) {
1849 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1850 			kring = &NMR(na, t)[i];
1851 			if (excl)
1852 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
1853 			kring->users--;
1854 		}
1855 	}
1856 }
1857 
1858 /*
1859  * possibly move the interface to netmap-mode.
1860  * On success it sets priv->np_nifp and returns 0, otherwise an error code.
1861  * This must be called with NMG_LOCK held.
1862  *
1863  * The following na callbacks are called in the process:
1864  *
1865  * na->nm_config()			[by netmap_update_config]
1866  * (get current number and size of rings)
1867  *
1868  *  	We have a generic one for linux (netmap_linux_config).
1869  *  	The bwrap has to override this, since it has to forward
1870  *  	the request to the wrapped adapter (netmap_bwrap_config).
1871  *
1872  *
1873  * na->nm_krings_create()
1874  * (create and init the krings array)
1875  *
1876  * 	One of the following:
1877  *
1878  *	* netmap_hw_krings_create, 			(hw ports)
1879  *		creates the standard layout for the krings
1880  * 		and adds the mbq (used for the host rings).
1881  *
1882  * 	* netmap_vp_krings_create			(VALE ports)
1883  * 		add leases and scratchpads
1884  *
1885  * 	* netmap_pipe_krings_create			(pipes)
1886  * 		create the krings and rings of both ends and
1887  * 		cross-link them
1888  *
1889  *      * netmap_monitor_krings_create 			(monitors)
1890  *      	avoid allocating the mbq
1891  *
1892  *      * netmap_bwrap_krings_create			(bwraps)
1893  *      	create both the bwrap krings array,
1894  *      	the krings array of the wrapped adapter, and
1895  *      	(if needed) the fake array for the host adapter
1896  *
1897  * na->nm_register(, 1)
1898  * (put the adapter in netmap mode)
1899  *
1900  * 	This may be one of the following:
1901  * 	(XXX these should be either all *_register or all *_reg 2014-03-15)
1902  *
1903  * 	* netmap_hw_register				(hw ports)
1904  * 		checks that the ifp is still there, then calls
1905  * 		the hardware specific callback;
1906  *
1907  * 	* netmap_vp_reg					(VALE ports)
1908  *		If the port is connected to a bridge,
1909  *		set the NAF_NETMAP_ON flag under the
1910  *		bridge write lock.
1911  *
1912  *	* netmap_pipe_reg				(pipes)
1913  *		inform the other pipe end that it is no
1914  *		longer responsible for the lifetime of this
1915  *		pipe end
1916  *
1917  *	* netmap_monitor_reg				(monitors)
1918  *		intercept the sync callbacks of the monitored
1919  *		rings
1920  *
1921  *	* netmap_bwrap_register				(bwraps)
1922  *		cross-link the bwrap and hwna rings,
1923  *		forward the request to the hwna, override
1924  *		the hwna notify callback (to get the frames
1925  *		coming from outside go through the bridge).
1926  *
1927  *
1928  */
1929 int
1930 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1931 	uint16_t ringid, uint32_t flags)
1932 {
1933 	struct netmap_if *nifp = NULL;
1934 	int error;
1935 
1936 	NMG_LOCK_ASSERT();
1937 	/* ring configuration may have changed, fetch from the card */
1938 	netmap_update_config(na);
1939 	priv->np_na = na;     /* store the reference */
1940 	error = netmap_set_ringid(priv, ringid, flags);
1941 	if (error)
1942 		goto err;
1943 	error = netmap_mem_finalize(na->nm_mem, na);
1944 	if (error)
1945 		goto err;
1946 
1947 	if (na->active_fds == 0) {
1948 		/*
1949 		 * If this is the first registration of the adapter,
1950 		 * also create the netmap rings and their in-kernel view,
1951 		 * the netmap krings.
1952 		 */
1953 
1954 		/*
1955 		 * Depending on the adapter, this may also create
1956 		 * the netmap rings themselves
1957 		 */
1958 		error = na->nm_krings_create(na);
1959 		if (error)
1960 			goto err_drop_mem;
1961 
1962 		/* create all missing netmap rings */
1963 		error = netmap_mem_rings_create(na);
1964 		if (error)
1965 			goto err_del_krings;
1966 	}
1967 
1968 	/* now the kring must exist and we can check whether some
1969 	 * previous bind has exclusive ownership on them
1970 	 */
1971 	error = netmap_get_exclusive(priv);
1972 	if (error)
1973 		goto err_del_rings;
1974 
1975 	/* in all cases, create a new netmap if */
1976 	nifp = netmap_mem_if_new(na);
1977 	if (nifp == NULL) {
1978 		error = ENOMEM;
1979 		goto err_rel_excl;
1980 	}
1981 
1982 	na->active_fds++;
1983 	if (!nm_netmap_on(na)) {
1984 		/* Netmap not active, set the card in netmap mode
1985 		 * and make it use the shared buffers.
1986 		 */
1987 		/* cache the allocator info in the na */
1988 		netmap_mem_get_lut(na->nm_mem, &na->na_lut);
1989 		ND("%p->na_lut == %p", na, na->na_lut.lut);
1990 		error = na->nm_register(na, 1); /* mode on */
1991 		if (error)
1992 			goto err_del_if;
1993 	}
1994 
1995 	/*
1996 	 * advertise that the interface is ready by setting np_nifp.
1997 	 * The barrier is needed because readers (poll, *SYNC and mmap)
1998 	 * check for priv->np_nifp != NULL without locking
1999 	 */
2000 	mb(); /* make sure previous writes are visible to all CPUs */
2001 	priv->np_nifp = nifp;
2002 
2003 	return 0;
2004 
2005 err_del_if:
2006 	memset(&na->na_lut, 0, sizeof(na->na_lut));
2007 	na->active_fds--;
2008 	netmap_mem_if_delete(na, nifp);
2009 err_rel_excl:
2010 	netmap_rel_exclusive(priv);
2011 err_del_rings:
2012 	if (na->active_fds == 0)
2013 		netmap_mem_rings_delete(na);
2014 err_del_krings:
2015 	if (na->active_fds == 0)
2016 		na->nm_krings_delete(na);
2017 err_drop_mem:
2018 	netmap_mem_deref(na->nm_mem, na);
2019 err:
2020 	priv->np_na = NULL;
2021 	return error;
2022 }
2023 
2024 
2025 /*
2026  * update kring and ring at the end of txsync.
2027  */
2028 static inline void
2029 nm_txsync_finalize(struct netmap_kring *kring)
2030 {
2031 	/* update ring tail to what the kernel knows */
2032 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2033 
2034 	/* note, head/rhead/hwcur might be behind cur/rcur
2035 	 * if no carrier
2036 	 */
2037 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2038 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2039 		kring->rhead, kring->rcur, kring->rtail);
2040 }
2041 
2042 
2043 /*
2044  * update kring and ring at the end of rxsync
2045  */
2046 static inline void
2047 nm_rxsync_finalize(struct netmap_kring *kring)
2048 {
2049 	/* tell userspace that there might be new packets */
2050 	//struct netmap_ring *ring = kring->ring;
2051 	ND("head %d cur %d tail %d -> %d", kring->ring->head, kring->ring->cur,
2052 		kring->ring->tail, kring->nr_hwtail);
2053 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2054 	/* make a copy of the state for next round */
2055 	kring->rhead = kring->ring->head;
2056 	kring->rcur = kring->ring->cur;
2057 }
2058 
2059 
2060 
2061 /*
2062  * ioctl(2) support for the "netmap" device.
2063  *
2064  * Following a list of accepted commands:
2065  * - NIOCGINFO
2066  * - SIOCGIFADDR	just for convenience
2067  * - NIOCREGIF
2068  * - NIOCTXSYNC
2069  * - NIOCRXSYNC
2070  *
2071  * Return 0 on success, errno otherwise.
2072  */
2073 int
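/*
 * Illustrative only (userspace side): a minimal NIOCREGIF request using
 * the legacy nmreq ABI handled below; "em0" is just a placeholder name:
 *
 *	struct nmreq req;
 *
 *	bzero(&req, sizeof(req));
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	req.nr_flags = NR_REG_ALL_NIC;
 *	ioctl(fd, NIOCREGIF, &req);
 */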
2074 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2075 	int fflag, struct thread *td)
2076 {
2077 	struct netmap_priv_d *priv = NULL;
2078 	struct nmreq *nmr = (struct nmreq *) data;
2079 	struct netmap_adapter *na = NULL;
2080 	int error;
2081 	u_int i, qfirst, qlast;
2082 	struct netmap_if *nifp;
2083 	struct netmap_kring *krings;
2084 	enum txrx t;
2085 
2086 	(void)dev;	/* UNUSED */
2087 	(void)fflag;	/* UNUSED */
2088 
2089 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2090 		/* truncate name */
2091 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2092 		if (nmr->nr_version != NETMAP_API) {
2093 			D("API mismatch for %s got %d need %d",
2094 				nmr->nr_name,
2095 				nmr->nr_version, NETMAP_API);
2096 			nmr->nr_version = NETMAP_API;
2097 		}
2098 		if (nmr->nr_version < NETMAP_MIN_API ||
2099 		    nmr->nr_version > NETMAP_MAX_API) {
2100 			return EINVAL;
2101 		}
2102 	}
2103 	CURVNET_SET(TD_TO_VNET(td));
2104 
2105 	error = devfs_get_cdevpriv((void **)&priv);
2106 	if (error) {
2107 		CURVNET_RESTORE();
2108 		/* XXX ENOENT should be impossible, since the priv
2109 		 * is now created in the open */
2110 		return (error == ENOENT ? ENXIO : error);
2111 	}
2112 
2113 	switch (cmd) {
2114 	case NIOCGINFO:		/* return capabilities etc */
2115 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2116 			error = netmap_bdg_ctl(nmr, NULL);
2117 			break;
2118 		}
2119 
2120 		NMG_LOCK();
2121 		do {
2122 			/* memsize is always valid */
2123 			struct netmap_mem_d *nmd = &nm_mem;
2124 			u_int memflags;
2125 
2126 			if (nmr->nr_name[0] != '\0') {
2127 				/* get a refcount */
2128 				error = netmap_get_na(nmr, &na, 1 /* create */);
2129 				if (error)
2130 					break;
2131 				nmd = na->nm_mem; /* get memory allocator */
2132 			}
2133 
2134 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2135 				&nmr->nr_arg2);
2136 			if (error)
2137 				break;
2138 			if (na == NULL) /* only memory info */
2139 				break;
2140 			nmr->nr_offset = 0;
2141 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2142 			netmap_update_config(na);
2143 			nmr->nr_rx_rings = na->num_rx_rings;
2144 			nmr->nr_tx_rings = na->num_tx_rings;
2145 			nmr->nr_rx_slots = na->num_rx_desc;
2146 			nmr->nr_tx_slots = na->num_tx_desc;
2147 			netmap_adapter_put(na);
2148 		} while (0);
2149 		NMG_UNLOCK();
2150 		break;
2151 
2152 	case NIOCREGIF:
2153 		/* possibly attach/detach NIC and VALE switch */
2154 		i = nmr->nr_cmd;
2155 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2156 				|| i == NETMAP_BDG_VNET_HDR
2157 				|| i == NETMAP_BDG_NEWIF
2158 				|| i == NETMAP_BDG_DELIF) {
2159 			error = netmap_bdg_ctl(nmr, NULL);
2160 			break;
2161 		} else if (i != 0) {
2162 			D("nr_cmd must be 0 not %d", i);
2163 			error = EINVAL;
2164 			break;
2165 		}
2166 
2167 		/* protect access to priv from concurrent NIOCREGIF */
2168 		NMG_LOCK();
2169 		do {
2170 			u_int memflags;
2171 
2172 			if (priv->np_nifp != NULL) {	/* thread already registered */
2173 				error = EBUSY;
2174 				break;
2175 			}
2176 			/* find the interface and a reference */
2177 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2178 			if (error)
2179 				break;
2180 			if (NETMAP_OWNED_BY_KERN(na)) {
2181 				netmap_adapter_put(na);
2182 				error = EBUSY;
2183 				break;
2184 			}
2185 			error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags);
2186 			if (error) {    /* reg. failed, release priv and ref */
2187 				netmap_adapter_put(na);
2188 				break;
2189 			}
2190 			nifp = priv->np_nifp;
2191 			priv->np_td = td; // XXX kqueue, debugging only
2192 
2193 			/* return the offset of the netmap_if object */
2194 			nmr->nr_rx_rings = na->num_rx_rings;
2195 			nmr->nr_tx_rings = na->num_tx_rings;
2196 			nmr->nr_rx_slots = na->num_rx_desc;
2197 			nmr->nr_tx_slots = na->num_tx_desc;
2198 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2199 				&nmr->nr_arg2);
2200 			if (error) {
2201 				netmap_do_unregif(priv);
2202 				netmap_adapter_put(na);
2203 				break;
2204 			}
2205 			if (memflags & NETMAP_MEM_PRIVATE) {
2206 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2207 			}
2208 			for_rx_tx(t) {
2209 				priv->np_si[t] = nm_si_user(priv, t) ?
2210 					&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si;
2211 			}
2212 
2213 			if (nmr->nr_arg3) {
2214 				D("requested %d extra buffers", nmr->nr_arg3);
2215 				nmr->nr_arg3 = netmap_extra_alloc(na,
2216 					&nifp->ni_bufs_head, nmr->nr_arg3);
2217 				D("got %d extra buffers", nmr->nr_arg3);
2218 			}
2219 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2220 		} while (0);
2221 		NMG_UNLOCK();
2222 		break;
2223 
2224 	case NIOCTXSYNC:
2225 	case NIOCRXSYNC:
2226 		nifp = priv->np_nifp;
2227 
2228 		if (nifp == NULL) {
2229 			error = ENXIO;
2230 			break;
2231 		}
2232 		mb(); /* make sure following reads are not from cache */
2233 
2234 		na = priv->np_na;      /* we have a reference */
2235 
2236 		if (na == NULL) {
2237 			D("Internal error: nifp != NULL && na == NULL");
2238 			error = ENXIO;
2239 			break;
2240 		}
2241 
2242 		if (!nm_netmap_on(na)) {
2243 			error = ENXIO;
2244 			break;
2245 		}
2246 
2247 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2248 		krings = NMR(na, t);
2249 		qfirst = priv->np_qfirst[t];
2250 		qlast = priv->np_qlast[t];
2251 
2252 		for (i = qfirst; i < qlast; i++) {
2253 			struct netmap_kring *kring = krings + i;
2254 			if (nm_kr_tryget(kring)) {
2255 				error = EBUSY;
2256 				goto out;
2257 			}
2258 			if (cmd == NIOCTXSYNC) {
2259 				if (netmap_verbose & NM_VERB_TXSYNC)
2260 					D("pre txsync ring %d cur %d hwcur %d",
2261 					    i, kring->ring->cur,
2262 					    kring->nr_hwcur);
2263 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2264 					netmap_ring_reinit(kring);
2265 				} else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) {
2266 					nm_txsync_finalize(kring);
2267 				}
2268 				if (netmap_verbose & NM_VERB_TXSYNC)
2269 					D("post txsync ring %d cur %d hwcur %d",
2270 					    i, kring->ring->cur,
2271 					    kring->nr_hwcur);
2272 			} else {
2273 				if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
2274 					netmap_ring_reinit(kring);
2275 				} else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) {
2276 					nm_rxsync_finalize(kring);
2277 				}
2278 				microtime(&na->rx_rings[i].ring->ts);
2279 			}
2280 			nm_kr_put(kring);
2281 		}
2282 
2283 		break;
2284 
2285 #ifdef WITH_VALE
2286 	case NIOCCONFIG:
2287 		error = netmap_bdg_config(nmr);
2288 		break;
2289 #endif
2290 #ifdef __FreeBSD__
2291 	case FIONBIO:
2292 	case FIOASYNC:
2293 		ND("FIONBIO/FIOASYNC are no-ops");
2294 		break;
2295 
2296 	case BIOCIMMEDIATE:
2297 	case BIOCGHDRCMPLT:
2298 	case BIOCSHDRCMPLT:
2299 	case BIOCSSEESENT:
2300 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2301 		break;
2302 
2303 	default:	/* allow device-specific ioctls */
2304 	    {
2305 		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2306 		if (ifp == NULL) {
2307 			error = ENXIO;
2308 		} else {
2309 			struct socket so;
2310 
2311 			bzero(&so, sizeof(so));
2312 			so.so_vnet = ifp->if_vnet;
2313 			// so->so_proto not null.
2314 			error = ifioctl(&so, cmd, data, td);
2315 			if_rele(ifp);
2316 		}
2317 		break;
2318 	    }
2319 
2320 #else /* linux */
2321 	default:
2322 		error = EOPNOTSUPP;
2323 #endif /* linux */
2324 	}
2325 out:
2326 
2327 	CURVNET_RESTORE();
2328 	return (error);
2329 }
2330 
2331 
2332 /*
2333  * select(2) and poll(2) handlers for the "netmap" device.
2334  *
2335  * Can be called for one or more queues.
2336  * Return the event mask corresponding to ready events.
2337  * If there are no ready events, do a selrecord on either individual
2338  * selinfo or on the global one.
2339  * Device-dependent parts (locking and sync of tx/rx rings)
2340  * are done through callbacks.
2341  *
2342  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *.
2343  * The first one is remapped to pwait as selrecord() uses the name as a
2344  * hidden argument.
2345  */
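/*
 * Illustrative only (userspace side): a typical event loop on a netmap
 * file descriptor served by this handler:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, 1000) > 0) {
 *		// pfd.revents & POLLIN:  new packets in the rx rings
 *		// pfd.revents & POLLOUT: free slots in the tx rings
 *	}
 */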
2346 int
2347 netmap_poll(struct cdev *dev, int events, struct thread *td)
2348 {
2349 	struct netmap_priv_d *priv = NULL;
2350 	struct netmap_adapter *na;
2351 	struct netmap_kring *kring;
2352 	u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
2353 #define want_tx want[NR_TX]
2354 #define want_rx want[NR_RX]
2355 	struct mbq q;		/* packets from hw queues to host stack */
2356 	void *pwait = dev;	/* linux compatibility */
2357 	int is_kevent = 0;
2358 	enum txrx t;
2359 
2360 	/*
2361 	 * In order to avoid nested locks, we need to "double check"
2362 	 * txsync and rxsync if we decide to do a selrecord().
2363 	 * retry_tx (and retry_rx, later) prevent looping forever.
2364 	 */
2365 	int retry_tx = 1, retry_rx = 1;
2366 
2367 	(void)pwait;
2368 	mbq_init(&q);
2369 
2370 	/*
2371 	 * XXX kevent has curthread->td_fpop == NULL,
2372 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2373 	 * priv as the first argument, which is also useful to avoid
2374 	 * the selrecord() calls, which are not necessary in that case.
2375 	 */
2376 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2377 		is_kevent = 1;
2378 		if (netmap_verbose)
2379 			D("called from kevent");
2380 		priv = (struct netmap_priv_d *)dev;
2381 	}
2382 	if (priv == NULL)
2383 		return POLLERR;
2384 
2385 	if (priv->np_nifp == NULL) {
2386 		D("No if registered");
2387 		return POLLERR;
2388 	}
2389 	mb(); /* make sure following reads are not from cache */
2390 
2391 	na = priv->np_na;
2392 
2393 	if (!nm_netmap_on(na))
2394 		return POLLERR;
2395 
2396 	if (netmap_verbose & 0x8000)
2397 		D("device %s events 0x%x", na->name, events);
2398 	want_tx = events & (POLLOUT | POLLWRNORM);
2399 	want_rx = events & (POLLIN | POLLRDNORM);
2400 
2401 
2402 	/*
2403 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2404 	 * the file descriptor is bound to all of them. If so, we sleep on
2405 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2406 	 * (FreeBSD only allows two selinfo's per file descriptor).
2407 	 * The interrupt routine in the driver wakes one or the other
2408 	 * (or both) depending on which clients are active.
2409 	 *
2410 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2411 	 * txsync() is called if we run out of buffers on POLLOUT, or
2412 	 * there are pending packets to send. The latter can be disabled
2413 	 * passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2414 	 */
2415 	check_all_tx = nm_si_user(priv, NR_TX);
2416 	check_all_rx = nm_si_user(priv, NR_RX);
2417 
2418 	/*
2419 	 * We start with a lock free round which is cheap if we have
2420 	 * slots available. If this fails, then lock and call the sync
2421 	 * routines.
2422 	 */
2423 	for_rx_tx(t) {
2424 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2425 			kring = &NMR(na, t)[i];
2426 			/* XXX compare ring->cur and kring->tail */
2427 			if (!nm_ring_empty(kring->ring)) {
2428 				revents |= want[t];
2429 				want[t] = 0;	/* also breaks the loop */
2430 			}
2431 		}
2432 	}
2433 
2434 	/*
2435 	 * If we want to push packets out (priv->np_txpoll) or
2436 	 * want_tx is still set, we must issue txsync calls
2437 	 * (on all rings, to avoid that the tx rings stall).
2438 	 * XXX should also check cur != hwcur on the tx rings.
2439 	 * Fortunately, normal tx mode has np_txpoll set.
2440 	 */
2441 	if (priv->np_txpoll || want_tx) {
2442 		/*
2443 		 * The first round checks if anyone is ready, if not
2444 		 * do a selrecord and another round to handle races.
2445 		 * want_tx goes to 0 if any space is found, and is
2446 		 * used to skip rings with no pending transmissions.
2447 		 */
2448 flush_tx:
2449 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
2450 			int found = 0;
2451 
2452 			kring = &na->tx_rings[i];
2453 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2454 				continue;
2455 			/* only one thread does txsync */
2456 			if (nm_kr_tryget(kring)) {
2457 				/* either busy or stopped
2458 				 * XXX if the ring is stopped, sleeping would
2459 				 * be better. In current code, however, we only
2460 				 * stop the rings for brief intervals (2014-03-14)
2461 				 */
2462 				if (netmap_verbose)
2463 					RD(2, "%p lost race on txring %d, ok",
2464 					    priv, i);
2465 				continue;
2466 			}
2467 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2468 				netmap_ring_reinit(kring);
2469 				revents |= POLLERR;
2470 			} else {
2471 				if (kring->nm_sync(kring, 0))
2472 					revents |= POLLERR;
2473 				else
2474 					nm_txsync_finalize(kring);
2475 			}
2476 
2477 			/*
2478 			 * If we found new slots, notify potential
2479 			 * listeners on the same ring.
2480 			 * Since we just did a txsync, look at the copies
2481 			 * of cur,tail in the kring.
2482 			 */
2483 			found = kring->rcur != kring->rtail;
2484 			nm_kr_put(kring);
2485 			if (found) { /* notify other listeners */
2486 				revents |= want_tx;
2487 				want_tx = 0;
2488 				kring->nm_notify(kring, 0);
2489 			}
2490 		}
2491 		if (want_tx && retry_tx && !is_kevent) {
2492 			OS_selrecord(td, check_all_tx ?
2493 			    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2494 			retry_tx = 0;
2495 			goto flush_tx;
2496 		}
2497 	}
2498 
2499 	/*
2500 	 * If want_rx is still set scan receive rings.
2501 	 * Do it on all rings because otherwise we starve.
2502 	 */
2503 	if (want_rx) {
2504 		int send_down = 0; /* transparent mode */
2505 		/* two rounds here for race avoidance */
2506 do_retry_rx:
2507 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
2508 			int found = 0;
2509 
2510 			kring = &na->rx_rings[i];
2511 
2512 			if (nm_kr_tryget(kring)) {
2513 				if (netmap_verbose)
2514 					RD(2, "%p lost race on rxring %d, ok",
2515 					    priv, i);
2516 				continue;
2517 			}
2518 
2519 			if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
2520 				netmap_ring_reinit(kring);
2521 				revents |= POLLERR;
2522 			}
2523 			/* now we can use kring->rcur, rtail */
2524 
2525 			/*
2526 			 * transparent mode support: collect packets
2527 			 * from the rxring(s).
2528 			 * XXX NR_FORWARD should only be read on
2529 			 * physical or NIC ports
2530 			 */
2531 			if (netmap_fwd || (kring->ring->flags & NR_FORWARD)) {
2532 				ND(10, "forwarding some buffers up %d to %d",
2533 				    kring->nr_hwcur, kring->ring->cur);
2534 				netmap_grab_packets(kring, &q, netmap_fwd);
2535 			}
2536 
2537 			if (kring->nm_sync(kring, 0))
2538 				revents |= POLLERR;
2539 			else
2540 				nm_rxsync_finalize(kring);
2541 			if (netmap_no_timestamp == 0 ||
2542 					kring->ring->flags & NR_TIMESTAMP) {
2543 				microtime(&kring->ring->ts);
2544 			}
2545 			found = kring->rcur != kring->rtail;
2546 			nm_kr_put(kring);
2547 			if (found) {
2548 				revents |= want_rx;
2549 				retry_rx = 0;
2550 				kring->nm_notify(kring, 0);
2551 			}
2552 		}
2553 
2554 		/* transparent mode XXX only during first pass ? */
2555 		if (na->na_flags & NAF_HOST_RINGS) {
2556 			kring = &na->rx_rings[na->num_rx_rings];
2557 			if (check_all_rx
2558 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2559 				/* XXX fix to use kring fields */
2560 				if (nm_ring_empty(kring->ring))
2561 					send_down = netmap_rxsync_from_host(na, td, dev);
2562 				if (!nm_ring_empty(kring->ring))
2563 					revents |= want_rx;
2564 			}
2565 		}
2566 
2567 		if (retry_rx && !is_kevent)
2568 			OS_selrecord(td, check_all_rx ?
2569 			    &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
2570 		if (send_down > 0 || retry_rx) {
2571 			retry_rx = 0;
2572 			if (send_down)
2573 				goto flush_tx; /* and retry_rx */
2574 			else
2575 				goto do_retry_rx;
2576 		}
2577 	}
2578 
2579 	/*
2580 	 * Transparent mode: marked bufs on rx rings between
2581 	 * kring->nr_hwcur and ring->head
2582 	 * are passed to the other endpoint.
2583 	 *
2584 	 * In this mode we also scan the sw rxring, which in
2585 	 * turn passes packets up.
2586 	 *
2587 	 * XXX Transparent mode at the moment requires binding all
2588 	 * rings to a single file descriptor.
2589 	 */
2590 
2591 	if (q.head && na->ifp != NULL)
2592 		netmap_send_up(na->ifp, &q);
2593 
2594 	return (revents);
2595 #undef want_tx
2596 #undef want_rx
2597 }
2598 
2599 
2600 /*-------------------- driver support routines -------------------*/
2601 
2602 static int netmap_hw_krings_create(struct netmap_adapter *);
2603 
2604 /* default notify callback */
2605 static int
2606 netmap_notify(struct netmap_kring *kring, int flags)
2607 {
2608 	struct netmap_adapter *na = kring->na;
2609 	enum txrx t = kring->tx;
2610 
2611 	OS_selwakeup(&kring->si, PI_NET);
2612 	/* optimization: avoid a wake up on the global
2613 	 * queue if nobody has registered for more
2614 	 * than one ring
2615 	 */
2616 	if (na->si_users[t] > 0)
2617 		OS_selwakeup(&na->si[t], PI_NET);
2618 
2619 	return 0;
2620 }
2621 
2622 
2623 /* called by all routines that create netmap_adapters.
2624  * Attach na to the ifp (if any) and provide defaults
2625  * for optional callbacks. Defaults assume that we
2626  * are creating a hardware netmap_adapter.
2627  */
2628 int
2629 netmap_attach_common(struct netmap_adapter *na)
2630 {
2631 	struct ifnet *ifp = na->ifp;
2632 
2633 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2634 		D("%s: invalid rings tx %d rx %d",
2635 			na->name, na->num_tx_rings, na->num_rx_rings);
2636 		return EINVAL;
2637 	}
2638 	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2639 	 * pipes, monitors). For bwrap we actually have a non-null ifp for
2640 	 * use by the external modules, but that is set after this
2641 	 * function has been called.
2642 	 * XXX this is ugly, maybe split this function in two (2014-03-14)
2643 	 */
2644 	if (ifp != NULL) {
2645 		WNA(ifp) = na;
2646 
2647 	/* the following is only needed for na that use the host port.
2648 	 * XXX do we have something similar for linux ?
2649 	 */
2650 #ifdef __FreeBSD__
2651 		na->if_input = ifp->if_input; /* for netmap_send_up */
2652 #endif /* __FreeBSD__ */
2653 
2654 		NETMAP_SET_CAPABLE(ifp);
2655 	}
2656 	if (na->nm_krings_create == NULL) {
2657 		/* we assume that we have been called by a driver,
2658 		 * since other port types all provide their own
2659 		 * nm_krings_create
2660 		 */
2661 		na->nm_krings_create = netmap_hw_krings_create;
2662 		na->nm_krings_delete = netmap_hw_krings_delete;
2663 	}
2664 	if (na->nm_notify == NULL)
2665 		na->nm_notify = netmap_notify;
2666 	na->active_fds = 0;
2667 
2668 	if (na->nm_mem == NULL)
2669 		/* use the global allocator */
2670 		na->nm_mem = &nm_mem;
2671 	netmap_mem_get(na->nm_mem);
2672 #ifdef WITH_VALE
2673 	if (na->nm_bdg_attach == NULL)
2674 		/* no special nm_bdg_attach callback. On VALE
2675 		 * attach, we need to interpose a bwrap
2676 		 */
2677 		na->nm_bdg_attach = netmap_bwrap_attach;
2678 #endif
2679 	return 0;
2680 }
2681 
2682 
2683 /* standard cleanup, called by all destructors */
2684 void
2685 netmap_detach_common(struct netmap_adapter *na)
2686 {
2687 	if (na->ifp != NULL)
2688 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2689 
2690 	if (na->tx_rings) { /* XXX should not happen */
2691 		D("freeing leftover tx_rings");
2692 		na->nm_krings_delete(na);
2693 	}
2694 	netmap_pipe_dealloc(na);
2695 	if (na->nm_mem)
2696 		netmap_mem_put(na->nm_mem);
2697 	bzero(na, sizeof(*na));
2698 	free(na, M_DEVBUF);
2699 }
2700 
2701 /* Wrapper for the register callback provided by hardware drivers.
2702  * na->ifp == NULL means that the driver module has been
2703  * unloaded, so we cannot call into it.
2704  * Note that module unloading, in our patched linux drivers,
2705  * happens under NMG_LOCK and after having stopped all the
2706  * nic rings (see netmap_detach). This provides sufficient
2707  * protection for the other driver-provided callbacks
2708  * (i.e., nm_config and nm_*xsync), which therefore don't need
2709  * to be wrapped.
2710  */
2711 static int
2712 netmap_hw_register(struct netmap_adapter *na, int onoff)
2713 {
2714 	struct netmap_hw_adapter *hwna =
2715 		(struct netmap_hw_adapter*)na;
2716 
2717 	if (na->ifp == NULL)
2718 		return onoff ? ENXIO : 0;
2719 
2720 	return hwna->nm_hw_register(na, onoff);
2721 }
2722 
2723 
2724 /*
2725  * Initialize a ``netmap_adapter`` object created by driver on attach.
2726  * We allocate a block of memory with room for a struct netmap_adapter
2727  * plus two sets of N+2 struct netmap_kring (where N is the number
2728  * of hardware rings):
2729  * krings	0..N-1	are for the hardware queues.
2730  * kring	N	is for the host stack queue
2731  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2732  * Return 0 on success, ENOMEM otherwise.
2733  */
2734 int
2735 netmap_attach(struct netmap_adapter *arg)
2736 {
2737 	struct netmap_hw_adapter *hwna = NULL;
2738 	// XXX when is arg == NULL ?
2739 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2740 
2741 	if (arg == NULL || ifp == NULL)
2742 		goto fail;
2743 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2744 	if (hwna == NULL)
2745 		goto fail;
2746 	hwna->up = *arg;
2747 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
2748 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2749 	hwna->nm_hw_register = hwna->up.nm_register;
2750 	hwna->up.nm_register = netmap_hw_register;
2751 	if (netmap_attach_common(&hwna->up)) {
2752 		free(hwna, M_DEVBUF);
2753 		goto fail;
2754 	}
2755 	netmap_adapter_get(&hwna->up);
2756 
2757 #ifdef linux
2758 	if (ifp->netdev_ops) {
2759 		/* prepare a clone of the netdev ops */
2760 #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS
2761 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2762 #else
2763 		hwna->nm_ndo = *ifp->netdev_ops;
2764 #endif
2765 	}
2766 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2767 	if (ifp->ethtool_ops) {
2768 		hwna->nm_eto = *ifp->ethtool_ops;
2769 	}
2770 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2771 #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
2772 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2773 #endif
2774 	if (arg->nm_config == NULL) {
2775 		hwna->up.nm_config = netmap_linux_config;
2776 	}
2777 #endif /* linux */
2778 
2779 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
2780 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2781 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
2782 	return 0;
2783 
2784 fail:
2785 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2786 	if (ifp)
2787 		netmap_detach(ifp);
2788 	return (hwna ? EINVAL : ENOMEM);
2789 }
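
/*
 * Illustrative only: how a FreeBSD driver typically fills the adapter
 * descriptor before calling netmap_attach(); the foo_* callbacks and
 * the "adapter" softc fields are hypothetical driver names:
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = adapter->ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na);
 */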
2790 
2791 
2792 void
2793 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2794 {
2795 	if (!na) {
2796 		return;
2797 	}
2798 
2799 	refcount_acquire(&na->na_refcount);
2800 }
2801 
2802 
2803 /* returns 1 iff the netmap_adapter is destroyed */
2804 int
2805 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2806 {
2807 	if (!na)
2808 		return 1;
2809 
2810 	if (!refcount_release(&na->na_refcount))
2811 		return 0;
2812 
2813 	if (na->nm_dtor)
2814 		na->nm_dtor(na);
2815 
2816 	netmap_detach_common(na);
2817 
2818 	return 1;
2819 }
2820 
2821 /* nm_krings_create callback for all hardware native adapters */
2822 int
2823 netmap_hw_krings_create(struct netmap_adapter *na)
2824 {
2825 	int ret = netmap_krings_create(na, 0);
2826 	if (ret == 0) {
2827 		/* initialize the mbq for the sw rx ring */
2828 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2829 		ND("initialized sw rx queue %d", na->num_rx_rings);
2830 	}
2831 	return ret;
2832 }
2833 
2834 
2835 
2836 /*
2837  * Called on module unload by the netmap-enabled drivers
2838  */
2839 void
2840 netmap_detach(struct ifnet *ifp)
2841 {
2842 	struct netmap_adapter *na = NA(ifp);
2843 
2844 	if (!na)
2845 		return;
2846 
2847 	NMG_LOCK();
2848 	netmap_disable_all_rings(ifp);
2849 	na->ifp = NULL;
2850 	na->na_flags &= ~NAF_NETMAP_ON;
2851 	/*
2852 	 * if the netmap adapter is not native, somebody
2853 	 * changed it, so we can not release it here.
2854 	 * The NULL na->ifp will notify the new owner that
2855 	 * the driver is gone.
2856 	 */
2857 	if (na->na_flags & NAF_NATIVE) {
2858 	        netmap_adapter_put(na);
2859 	}
2860 	/* give them a chance to notice */
2861 	netmap_enable_all_rings(ifp);
2862 	NMG_UNLOCK();
2863 }
2864 
2865 
2866 /*
2867  * Intercept packets from the network stack and pass them
2868  * to netmap as incoming packets on the 'software' ring.
2869  *
2870  * We only store packets in a bounded mbq and then copy them
2871  * in the relevant rxsync routine.
2872  *
2873  * We rely on the OS to make sure that the ifp and na do not go
2874  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2875  * In nm_register() or whenever there is a reinitialization,
2876  * we make sure to make the mode change visible here.
2877  */
2878 int
2879 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2880 {
2881 	struct netmap_adapter *na = NA(ifp);
2882 	struct netmap_kring *kring;
2883 	u_int len = MBUF_LEN(m);
2884 	u_int error = ENOBUFS;
2885 	struct mbq *q;
2886 	int space;
2887 
2888 	kring = &na->rx_rings[na->num_rx_rings];
2889 	// XXX [Linux] we do not need this lock
2890 	// if we follow the down/configure/up protocol -gl
2891 	// mtx_lock(&na->core_lock);
2892 
2893 	if (!nm_netmap_on(na)) {
2894 		D("%s not in netmap mode anymore", na->name);
2895 		error = ENXIO;
2896 		goto done;
2897 	}
2898 
2899 	q = &kring->rx_queue;
2900 
2901 	// XXX reconsider long packets if we handle fragments
2902 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2903 		D("%s from_host, drop packet size %d > %d", na->name,
2904 			len, NETMAP_BUF_SIZE(na));
2905 		goto done;
2906 	}
2907 
2908 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2909 	 * and maybe other instances of netmap_transmit (the latter
2910 	 * not possible on Linux).
2911 	 * Also avoid overflowing the queue.
2912 	 */
2913 	mbq_lock(q);
2914 
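	/* count slots already occupied in the ring, i.e. hwcur..hwtail modulo the ring size */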
2915 	space = kring->nr_hwtail - kring->nr_hwcur;
2916 	if (space < 0)
2917 		space += kring->nkr_num_slots;
2918 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2919 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2920 			na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2921 			len, m);
2922 	} else {
2923 		mbq_enqueue(q, m);
2924 		ND(10, "%s %d bufs in queue len %d m %p",
2925 			na->name, mbq_len(q), len, m);
2926 		/* notify outside the lock */
2927 		m = NULL;
2928 		error = 0;
2929 	}
2930 	mbq_unlock(q);
2931 
2932 done:
2933 	if (m)
2934 		m_freem(m);
2935 	/* unconditionally wake up listeners */
2936 	kring->nm_notify(kring, 0);
2937 	/* this is normally netmap_notify(), but for nics
2938 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2939 	 * that possibly forwards the frames through the switch
2940 	 */
2941 
2942 	return (error);
2943 }
2944 
2945 
2946 /*
2947  * netmap_reset() is called by the driver routines when reinitializing
2948  * a ring. The driver is in charge of locking to protect the kring.
2949  * If native netmap mode is not set just return NULL.
2950  */
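/*
 * Typical driver-side use (illustrative; "ring_nr" is a hypothetical
 * driver variable): on ring (re)initialization a native driver calls
 *
 *	slot = netmap_reset(na, NR_TX, ring_nr, 0);
 *	if (slot != NULL) {
 *		// netmap mode: program the NIC descriptors from the
 *		// netmap buffers referenced by slot[]
 *	}
 */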
2951 struct netmap_slot *
2952 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2953 	u_int new_cur)
2954 {
2955 	struct netmap_kring *kring;
2956 	int new_hwofs, lim;
2957 
2958 	if (!nm_native_on(na)) {
2959 		ND("interface not in native netmap mode");
2960 		return NULL;	/* nothing to reinitialize */
2961 	}
2962 
2963 	/* XXX note- in the new scheme, we are not guaranteed to be
2964 	 * under lock (e.g. when called on a device reset).
2965 	 * In this case, we should set a flag and do not trust too
2966 	 * much the values. In practice: TODO
2967 	 * - set a RESET flag somewhere in the kring
2968 	 * - do the processing in a conservative way
2969 	 * - let the *sync() fixup at the end.
2970 	 */
2971 	if (tx == NR_TX) {
2972 		if (n >= na->num_tx_rings)
2973 			return NULL;
2974 		kring = na->tx_rings + n;
2975 		// XXX check whether we should use hwcur or rcur
2976 		new_hwofs = kring->nr_hwcur - new_cur;
2977 	} else {
2978 		if (n >= na->num_rx_rings)
2979 			return NULL;
2980 		kring = na->rx_rings + n;
2981 		new_hwofs = kring->nr_hwtail - new_cur;
2982 	}
2983 	lim = kring->nkr_num_slots - 1;
2984 	if (new_hwofs > lim)
2985 		new_hwofs -= lim + 1;
2986 
2987 	/* Always set the new offset value and realign the ring. */
2988 	if (netmap_verbose)
2989 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2990 		na->name,
2991 		tx == NR_TX ? "TX" : "RX", n,
2992 		kring->nkr_hwofs, new_hwofs,
2993 		kring->nr_hwtail,
2994 		tx == NR_TX ? lim : kring->nr_hwtail);
2995 	kring->nkr_hwofs = new_hwofs;
2996 	if (tx == NR_TX) {
2997 		kring->nr_hwtail = kring->nr_hwcur + lim;
2998 		if (kring->nr_hwtail > lim)
2999 			kring->nr_hwtail -= lim + 1;
3000 	}
3001 
3002 #if 0 // def linux
3003 	/* XXX check that the mappings are correct */
3004 	/* need ring_nr, adapter->pdev, direction */
3005 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3006 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3007 		D("error mapping rx netmap buffer %d", i);
3008 		// XXX fix error handling
3009 	}
3010 
3011 #endif /* linux */
3012 	/*
3013 	 * Wakeup on the individual and global selwait
3014 	 * We do the wakeup here, but the ring is not yet reconfigured.
3015 	 * However, we are under lock so there are no races.
3016 	 */
3017 	kring->nm_notify(kring, 0);
3018 	return kring->ring->slot;
3019 }
3020 
3021 
3022 /*
3023  * Dispatch rx/tx interrupts to the netmap rings.
3024  *
3025  * "work_done" is non-null on the RX path, NULL for the TX path.
3026  * We rely on the OS to make sure that there is only one active
3027  * instance per queue, and that there is appropriate locking.
3028  *
3029  * The 'notify' routine depends on what the ring is attached to.
3030  * - for a netmap file descriptor, do a selwakeup on the individual
3031  *   waitqueue, plus one on the global one if needed
3032  *   (see netmap_notify)
3033  * - for a nic connected to a switch, call the proper forwarding routine
3034  *   (see netmap_bwrap_intr_notify)
3035  */
3036 void
3037 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3038 {
3039 	struct netmap_adapter *na = NA(ifp);
3040 	struct netmap_kring *kring;
3041 	enum txrx t = (work_done ? NR_RX : NR_TX);
3042 
3043 	q &= NETMAP_RING_MASK;
3044 
3045 	if (netmap_verbose) {
3046 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
3047 	}
3048 
3049 	if (q >= nma_get_nrings(na, t))
3050 		return;	// not a physical queue
3051 
3052 	kring = NMR(na, t) + q;
3053 
3054 	if (t == NR_RX) {
3055 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3056 		*work_done = 1; /* do not fire napi again */
3057 	}
3058 	kring->nm_notify(kring, 0);
3059 }
3060 
3061 
3062 /*
3063  * Default functions to handle rx/tx interrupts from a physical device.
3064  * "work_done" is non-null on the RX path, NULL for the TX path.
3065  *
3066  * If the card is not in netmap mode, simply return 0,
3067  * so that the caller proceeds with regular processing.
3068  * Otherwise call netmap_common_irq() and return 1.
3069  *
3070  * If the card is connected to a netmap file descriptor,
3071  * do a selwakeup on the individual queue, plus one on the global one
3072  * if needed (multiqueue card _and_ there are multiqueue listeners),
3073  * and return 1.
3074  *
3075  * Finally, if called on rx from an interface connected to a switch,
3076  * calls the proper forwarding routine, and return 1.
3077  */
3078 int
3079 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3080 {
3081 	struct netmap_adapter *na = NA(ifp);
3082 
3083 	/*
3084 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3085 	 * we still use the regular driver even though the previous
3086 	 * check fails. It is unclear whether we should use
3087 	 * nm_native_on() here.
3088 	 */
3089 	if (!nm_netmap_on(na))
3090 		return 0;
3091 
3092 	if (na->na_flags & NAF_SKIP_INTR) {
3093 		ND("use regular interrupt");
3094 		return 0;
3095 	}
3096 
3097 	netmap_common_irq(ifp, q, work_done);
3098 	return 1;
3099 }
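
/*
 * Illustrative only: how a native driver typically hooks its interrupt
 * handlers into netmap ("ring_nr" and "work_done" are hypothetical
 * driver-side names):
 *
 *	// rx interrupt / deferred handler
 *	if (netmap_rx_irq(ifp, ring_nr, &work_done))
 *		return;		// netmap mode: skip the regular rx path
 *
 *	// tx interrupt handler: work_done == NULL selects the tx path
 *	if (netmap_rx_irq(ifp, ring_nr, NULL))
 *		return;
 */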
3100 
3101 
3102 /*
3103  * Module loader and unloader
3104  *
3105  * netmap_init() creates the /dev/netmap device and initializes
3106  * all global variables. Returns 0 on success, errno on failure
3107  * (but there is no chance)
3108  *
3109  * netmap_fini() destroys everything.
3110  */
3111 
3112 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3113 extern struct cdevsw netmap_cdevsw;
3114 
3115 
3116 void
3117 netmap_fini(void)
3118 {
3119 	netmap_uninit_bridges();
3120 	if (netmap_dev)
3121 		destroy_dev(netmap_dev);
3122 	netmap_mem_fini();
3123 	NMG_LOCK_DESTROY();
3124 	printf("netmap: unloaded module.\n");
3125 }
3126 
3127 
3128 int
3129 netmap_init(void)
3130 {
3131 	int error;
3132 
3133 	NMG_LOCK_INIT();
3134 
3135 	error = netmap_mem_init();
3136 	if (error != 0)
3137 		goto fail;
3138 	/*
3139 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3140 	 * when the module is compiled in.
3141 	 * XXX could use make_dev_credv() to get error number
3142 	 */
3143 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3144 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3145 			      "netmap");
3146 	if (!netmap_dev)
3147 		goto fail;
3148 
3149 	error = netmap_init_bridges();
3150 	if (error)
3151 		goto fail;
3152 
3153 #ifdef __FreeBSD__
3154 	nm_vi_init_index();
3155 #endif
3156 
3157 	printf("netmap: loaded module\n");
3158 	return (0);
3159 fail:
3160 	netmap_fini();
3161 	return (EINVAL); /* may be incorrect */
3162 }
3163