xref: /freebsd-13.1/sys/dev/netmap/netmap.c (revision 19c4ec08)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2011-2014 Matteo Landi
5  * Copyright (C) 2011-2016 Luigi Rizzo
6  * Copyright (C) 2011-2016 Giuseppe Lettieri
7  * Copyright (C) 2011-2016 Vincenzo Maffione
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *   1. Redistributions of source code must retain the above copyright
14  *      notice, this list of conditions and the following disclaimer.
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * $FreeBSD$
35  *
36  * This module supports memory mapped access to network devices,
37  * see netmap(4).
38  *
39  * The module uses a large memory pool allocated by the kernel
40  * and accessible as mmapped memory by multiple userspace threads/processes.
41  * The memory pool contains packet buffers and "netmap rings",
42  * i.e. user-accessible copies of the interface's queues.
43  *
44  * Access to the network card works like this (see the sketch below the list):
45  * 1. a process/thread issues one or more open() calls on /dev/netmap, to
46  *    create select()able file descriptors on which events are reported.
47  * 2. on each descriptor, the process issues an ioctl() to identify
48  *    the interface that should report events to the file descriptor.
49  * 3. on each descriptor, the process issues an mmap() request to
50  *    map the shared memory region within the process' address space.
51  *    The list of interesting queues is indicated by a location in
52  *    the shared memory region.
53  * 4. using the functions in the netmap(4) userspace API, a process
54  *    can look up the occupation state of a queue, access memory buffers,
55  *    and retrieve received packets or enqueue packets to transmit.
56  * 5. using some ioctl()s the process can synchronize the userspace view
57  *    of the queue with the actual status in the kernel. This includes both
58  *    receiving the notification of new packets, and transmitting new
59  *    packets on the output interface.
60  * 6. select() or poll() can be used to wait for events on individual
61  *    transmit or receive queues (or all queues for a given interface).
62  *
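 *    A minimal usage sketch of the six steps above (illustrative only:
 *    error handling is omitted, "em0" is just an example interface name,
 *    and real applications normally use the nm_open()/nm_close() helpers
 *    declared in <net/netmap_user.h>):
 *
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_flags = NR_REG_ALL_NIC;
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	char *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, fd, 0);			// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);	// step 4
 *	... fill tx slots, advance txr->head and txr->cur ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, -1);					// step 6
 *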
63 
64 		SYNCHRONIZATION (USER)
65 
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73 
74 		LOCKING (INTERNAL)
75 
76 Within the kernel, access to the netmap rings is protected as follows:
77 
78 - a spinlock on each ring, to handle producer/consumer races on
79   RX rings attached to the host stack (against multiple host
80   threads writing from the host stack to the same ring),
81   and on 'destination' rings attached to a VALE switch
82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
83   protecting against multiple active senders to the same destination.
84 
85 - an atomic variable to guarantee that there is at most one
86   instance of *_*xsync() on the ring at any time.
87   For rings connected to user file
88   descriptors, an atomic_test_and_set() protects this, and the
89   lock on the ring is not actually used.
90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91   is also used to prevent multiple executions (the driver might indeed
92   already guarantee this).
93   For NIC TX rings connected to a VALE switch, the lock arbitrates
94   access to the queue (both when allocating buffers and when pushing
95   them out).
96 
97 - *xsync() should be protected against initializations of the card.
98   On FreeBSD most devices have the reset routine protected by
99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100   the RING protection on rx_reset(); this should be added.
101 
102   On linux there is an external lock on the tx path, which probably
103   also arbitrates access to the reset routine. XXX to be revised
104 
105 - a per-interface core_lock protecting access from the host stack
106   while interfaces may be detached from netmap mode.
107   XXX there should be no need for this lock if we detach the interfaces
108   only while they are down.
109 
110 
111 --- VALE SWITCH ---
112 
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115 
116 For each switch, an SX lock (RWlock on linux) protects
117 deletion of ports. When configuring a new port or deleting one, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120 The lock is held throughout the entire forwarding cycle,
121 during which the thread may incur a page fault.
122 Hence it is important that sleepable shared locks are used.
123 
124 On the rx ring, the per-port lock is grabbed initially to reserve
125 a number of slots in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch)
130 
131  */
132 
133 
134 /* --- internals ----
135  *
136  * Roadmap to the code that implements the above.
137  *
138  * > 1. a process/thread issues one or more open() calls on /dev/netmap, to
139  * >    create select()able file descriptors on which events are reported.
140  *
141  *  	Internally, we allocate a netmap_priv_d structure, that will be
142  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143  *  	structure for each open().
144  *
145  *      os-specific:
146  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148  *
149  * > 2. on each descriptor, the process issues an ioctl() to identify
150  * >    the interface that should report events to the file descriptor.
151  *
152  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153  * 	Most important things happen in netmap_get_na() and
154  * 	netmap_do_regif(), called from there. Additional details can be
155  * 	found in the comments above those functions.
156  *
157  * 	In all cases, this action creates/takes-a-reference-to a
158  * 	netmap_*_adapter describing the port, and allocates a netmap_if
159  * 	and all necessary netmap rings, filling them with netmap buffers.
160  *
161  *      In this phase, the sync callbacks for each ring are set (these are used
162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163  *      The adapter creation/initialization code puts them in the
164  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167  * 	actually call netmap_krings_create() to perform this and the other
168  * 	common stuff. netmap_krings_create() also takes care of the host rings,
169  * 	if needed, by setting their sync callbacks appropriately.
170  *
171  * 	Additional actions depend on the kind of netmap_adapter that has been
172  * 	registered:
173  *
174  * 	- netmap_hw_adapter:  	     [netmap.c]
175  * 	     This is a system netdev/ifp with native netmap support.
176  * 	     The ifp is detached from the host stack by redirecting:
177  * 	       - transmissions (from the network stack) to netmap_transmit()
178  * 	       - receive notifications to the nm_notify() callback for
179  * 	         this adapter. The callback is normally netmap_notify(), unless
180  * 	         the ifp is attached to a bridge using bwrap, in which case it
181  * 	         is netmap_bwrap_intr_notify().
182  *
183  * 	- netmap_generic_adapter:      [netmap_generic.c]
184  * 	      A system netdev/ifp without native netmap support.
185  *
186  * 	(the decision about native/non native support is taken in
187  * 	 netmap_get_hw_na(), called by netmap_get_na())
188  *
189  * 	- netmap_vp_adapter 		[netmap_vale.c]
190  * 	      Returned by netmap_get_bdg_na().
191  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192  * 	      are created on the fly if they don't already exist, and are
193  * 	      always attached to a bridge.
194  * 	      Persistent VALE ports must be created separately, and
195  * 	      then attached like normal NICs. The NIOCREGIF we are examining
196  * 	      will find them only if they have previously been created and
197  * 	      attached (see VALE_CTL below).
198  *
199  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200  * 	      Returned by netmap_get_pipe_na().
201  * 	      Both pipe ends are created, if they didn't already exist.
202  *
203  * 	- netmap_monitor_adapter      [netmap_monitor.c]
204  * 	      Returned by netmap_get_monitor_na().
205  * 	      If successful, the nm_sync callbacks of the monitored adapter
206  * 	      will be intercepted by the returned monitor.
207  *
208  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209  * 	      Cannot be obtained in this way, see VALE_CTL below
210  *
211  *
212  * 	os-specific:
213  * 	    linux: we first go through linux_netmap_ioctl() to
214  * 	           adapt the FreeBSD interface to the linux one.
215  *
216  *
217  * > 3. on each descriptor, the process issues an mmap() request to
218  * >    map the shared memory region within the process' address space.
219  * >    The list of interesting queues is indicated by a location in
220  * >    the shared memory region.
221  *
222  *      os-specific:
223  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225  *
226  * > 4. using the functions in the netmap(4) userspace API, a process
227  * >    can look up the occupation state of a queue, access memory buffers,
228  * >    and retrieve received packets or enqueue packets to transmit.
229  *
230  * 	These actions do not involve the kernel.
231  *
232  * > 5. using some ioctl()s the process can synchronize the userspace view
233  * >    of the queue with the actual status in the kernel. This includes both
234  * >    receiving the notification of new packets, and transmitting new
235  * >    packets on the output interface.
236  *
237  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239  * 	structures, as initialized in step 2 and maybe later modified
240  * 	by a monitor. Monitors, however, will always call the original
241  * 	callback before doing anything else.
242  *
243  *
244  * > 6. select() or poll() can be used to wait for events on individual
245  * >    transmit or receive queues (or all queues for a given interface).
246  *
247  * 	Implemented in netmap_poll(). This will call the same nm_sync()
248  * 	callbacks as in step 5 above.
249  *
250  * 	os-specific:
251  * 		linux: we first go through linux_netmap_poll() to adapt
252  * 		       the FreeBSD interface to the linux one.
253  *
254  *
255  *  ----  VALE_CTL -----
256  *
257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258  *  nr_cmd in the nmreq structure. These subcommands are handled by
259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261  *  subcommands, respectively.
262  *
263  *  Any network interface known to the system (including a persistent VALE
264  *  port) can be attached to a VALE switch by issuing the
265  *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267  *  attachment of other interfaces, instead, requires the creation of a
268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
270  *  we have no native support for the interface, or if generic adapters have
271  *  been forced by sysctl.
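 *
 *  As an illustration of the attach request (struct and field names as
 *  declared in net/netmap.h for the new control API; treat this as a
 *  sketch and check the header in your tree, "vale0:em0" is just an
 *  example switch:port name):
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_attach vreq;
 *	bzero(&hdr, sizeof(hdr)); bzero(&vreq, sizeof(vreq));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_ATTACH;
 *	strlcpy(hdr.nr_name, "vale0:em0", sizeof(hdr.nr_name));
 *	vreq.reg.nr_mode = NR_REG_ALL_NIC;
 *	hdr.nr_body = (uintptr_t)&vreq;
 *	ioctl(fd, NIOCCTRL, &hdr);	// fd is an open /dev/netmap descriptor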
272  *
273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275  *  callback.  In the case of the bwrap, the callback creates the
276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279  *  A generic adapter for the wrapped ifp will be created if needed, when
280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
281  *
282  *
283  *  ---- DATAPATHS -----
284  *
285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286  *
287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288  *
289  *    - tx from netmap userspace:
290  *	 concurrently:
291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292  *                kring->nm_sync() == DEVICE_netmap_txsync()
293  *           2) device interrupt handler
294  *                na->nm_notify()  == netmap_notify()
295  *    - rx from netmap userspace:
296  *       concurrently:
297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
299  *           2) device interrupt handler
300  *                na->nm_notify()  == netmap_notify()
301  *    - rx from host stack
302  *       concurrently:
303  *           1) host stack
304  *                netmap_transmit()
305  *                  na->nm_notify  == netmap_notify()
306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307  *                kring->nm_sync() == netmap_rxsync_from_host
308  *                  netmap_rxsync_from_host(na, NULL, NULL)
309  *    - tx to host stack
310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311  *             kring->nm_sync() == netmap_txsync_to_host
312  *               netmap_txsync_to_host(na)
313  *                 nm_os_send_up()
314  *                   FreeBSD: na->if_input() == ether_input()
315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316  *
317  *
318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319  *
320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321  *
322  *    - tx from netmap userspace:
323  *       concurrently:
324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325  *               kring->nm_sync() == generic_netmap_txsync()
326  *                   nm_os_generic_xmit_frame()
327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329  *                               gna->save_start_xmit == orig. dev. start_xmit
330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331  *           2) generic_mbuf_destructor()
332  *                   na->nm_notify() == netmap_notify()
333  *    - rx from netmap userspace:
334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335  *               kring->nm_sync() == generic_netmap_rxsync()
336  *                   mbq_safe_dequeue()
337  *           2) device driver
338  *               generic_rx_handler()
339  *                   mbq_safe_enqueue()
340  *                   na->nm_notify() == netmap_notify()
341  *    - rx from host stack
342  *        FreeBSD: same as native
343  *        Linux: same as native except:
344  *           1) host stack
345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347  *                       netmap_transmit()
348  *                           na->nm_notify() == netmap_notify()
349  *    - tx to host stack (same as native):
350  *
351  *
352  *                           -= VALE =-
353  *
354  *   INCOMING:
355  *
356  *      - VALE ports:
357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358  *              kring->nm_sync() == netmap_vp_txsync()
359  *
360  *      - system device with native support:
361  *         from cable:
362  *             interrupt
363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *                     netmap_vp_txsync()
366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367  *         from host stack:
368  *             netmap_transmit()
369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370  *                     kring->nm_sync() == netmap_rxsync_from_host()
371  *                     netmap_vp_txsync()
372  *
373  *      - system device with generic support:
374  *         from device driver:
375  *            generic_rx_handler()
376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *                     netmap_vp_txsync()
379  *                     kring->nm_sync() == generic_netmap_rxsync()
380  *         from host stack:
381  *            netmap_transmit()
382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383  *                     kring->nm_sync() == netmap_rxsync_from_host()
384  *                     netmap_vp_txsync()
385  *
386  *   (all cases) --> nm_bdg_flush()
387  *                      dest_na->nm_notify() == (see below)
388  *
389  *   OUTGOING:
390  *
391  *      - VALE ports:
392  *         concurrently:
393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394  *                    kring->nm_sync() == netmap_vp_rxsync()
395  *             2) from nm_bdg_flush()
396  *                    na->nm_notify() == netmap_notify()
397  *
398  *      - system device with native support:
399  *          to cable:
400  *             na->nm_notify() == netmap_bwrap_notify()
401  *                 netmap_vp_rxsync()
402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
403  *                 netmap_vp_rxsync()
404  *          to host stack:
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == netmap_txsync_to_host
407  *                 netmap_vp_rxsync_locked()
408  *
409  *      - system device with generic adapter:
410  *          to device driver:
411  *             na->nm_notify() == netmap_bwrap_notify()
412  *                 netmap_vp_rxsync()
413  *                 kring->nm_sync() == generic_netmap_txsync()
414  *                 netmap_vp_rxsync()
415  *          to host stack:
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == netmap_txsync_to_host
418  *                 netmap_vp_rxsync()
419  *
420  */
421 
422 /*
423  * OS-specific code that is used only within this file.
424  * Other OS-specific code that must be accessed by drivers
425  * is present in netmap_kern.h
426  */
427 
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h>	/* defines used in kernel.h */
433 #include <sys/kernel.h>	/* types used in module initialization */
434 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
435 #include <sys/filio.h>	/* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h>	/* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/rwlock.h>
441 #include <sys/socket.h> /* sockaddrs */
442 #include <sys/selinfo.h>
443 #include <sys/sysctl.h>
444 #include <sys/jail.h>
445 #include <net/vnet.h>
446 #include <net/if.h>
447 #include <net/if_var.h>
448 #include <net/bpf.h>		/* BIOCIMMEDIATE */
449 #include <machine/bus.h>	/* bus_dmamap_* */
450 #include <sys/endian.h>
451 #include <sys/refcount.h>
452 #include <net/ethernet.h>	/* ETHER_BPF_MTAP */
453 
454 
455 #elif defined(linux)
456 
457 #include "bsd_glue.h"
458 
459 #elif defined(__APPLE__)
460 
461 #warning OSX support is only partial
462 #include "osx_glue.h"
463 
464 #elif defined (_WIN32)
465 
466 #include "win_glue.h"
467 
468 #else
469 
470 #error	Unsupported platform
471 
472 #endif /* unsupported */
473 
474 /*
475  * common headers
476  */
477 #include <net/netmap.h>
478 #include <dev/netmap/netmap_kern.h>
479 #include <dev/netmap/netmap_mem2.h>
480 
481 
482 /* user-controlled variables */
483 int netmap_verbose;
484 #ifdef CONFIG_NETMAP_DEBUG
485 int netmap_debug;
486 #endif /* CONFIG_NETMAP_DEBUG */
487 
488 static int netmap_no_timestamp; /* don't timestamp on rxsync */
489 int netmap_no_pendintr = 1;
490 int netmap_txsync_retry = 2;
491 static int netmap_fwd = 0;	/* force transparent forwarding */
492 
493 /*
494  * netmap_admode selects the netmap mode to use.
495  * Invalid values are reset to NETMAP_ADMODE_BEST
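 * (the value can be changed at run time through the dev.netmap.admode
 * sysctl declared below: 0 = best, 1 = force native, 2 = force generic)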
496  */
497 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
498 	NETMAP_ADMODE_NATIVE,	/* either native or none */
499 	NETMAP_ADMODE_GENERIC,	/* force generic */
500 	NETMAP_ADMODE_LAST };
501 static int netmap_admode = NETMAP_ADMODE_BEST;
502 
503 /* netmap_generic_mit controls mitigation of RX notifications for
504  * the generic netmap adapter. The value is a time interval in
505  * nanoseconds. */
506 int netmap_generic_mit = 100*1000;
507 
508 /* By default we use netmap-aware qdiscs with generic netmap adapters,
509  * even though this can cost a little performance with hardware NICs.
510  * However, using the qdisc is the safer approach, for two reasons:
511  * 1) it prevents non-fifo qdiscs from breaking the TX notification
512  *    scheme, which is based on mbuf destructors when txqdisc is
513  *    not used.
514  * 2) it makes it possible to transmit over software devices that
515  *    change skb->dev, like bridge, veth, ...
516  *
517  * In any case, users looking for the best performance should
518  * use native adapters.
519  */
520 #ifdef linux
521 int netmap_generic_txqdisc = 1;
522 #endif
523 
524 /* Default number of slots and queues for generic adapters. */
525 int netmap_generic_ringsize = 1024;
526 int netmap_generic_rings = 1;
527 
528 /* Non-zero to enable checksum offloading in NIC drivers */
529 int netmap_generic_hwcsum = 0;
530 
531 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
532 int ptnet_vnet_hdr = 1;
533 
534 /*
535  * SYSCTL calls are grouped between SYSBEGIN and SYSEND so that they
536  * can be emulated on other operating systems
537  */
538 SYSBEGIN(main_init);
539 
540 SYSCTL_DECL(_dev_netmap);
541 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
543 		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
544 #ifdef CONFIG_NETMAP_DEBUG
545 SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
546 		CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
547 #endif /* CONFIG_NETMAP_DEBUG */
548 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
549 		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
550 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
551 		0, "Always look for new received packets.");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
553 		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
554 
555 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
556 		"Force NR_FORWARD mode");
557 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
558 		"Adapter mode. 0 selects the best option available,"
559 		"1 forces native adapter, 2 forces emulated adapter");
560 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
561 		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default),"
562 		"1 to enable checksum generation by the NIC");
563 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
564 		0, "RX notification interval in nanoseconds");
565 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
566 		&netmap_generic_ringsize, 0,
567 		"Number of per-ring slots for emulated netmap mode");
568 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
569 		&netmap_generic_rings, 0,
570 		"Number of TX/RX queues for emulated netmap adapters");
571 #ifdef linux
572 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
573 		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
574 #endif
575 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
576 		0, "Allow ptnet devices to use virtio-net headers");
577 
578 SYSEND;
579 
580 NMG_LOCK_T	netmap_global_lock;
581 
582 /*
583  * mark the ring as stopped, and run through the locks
584  * to make sure other users get to see it.
585  * stopped must be either NM_KR_STOPPED (for an unbounded stop)
586  * or NM_KR_LOCKED (a brief stop for mutual exclusion purposes)
587  */
588 static void
589 netmap_disable_ring(struct netmap_kring *kr, int stopped)
590 {
591 	nm_kr_stop(kr, stopped);
592 	// XXX check if nm_kr_stop is sufficient
593 	mtx_lock(&kr->q_lock);
594 	mtx_unlock(&kr->q_lock);
595 	nm_kr_put(kr);
596 }
597 
598 /* stop or enable a single ring */
599 void
600 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
601 {
602 	if (stopped)
603 		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
604 	else
605 		NMR(na, t)[ring_id]->nkr_stopped = 0;
606 }
607 
608 
609 /* stop or enable all the rings of na */
610 void
611 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
612 {
613 	int i;
614 	enum txrx t;
615 
616 	if (!nm_netmap_on(na))
617 		return;
618 
619 	for_rx_tx(t) {
620 		for (i = 0; i < netmap_real_rings(na, t); i++) {
621 			netmap_set_ring(na, i, t, stopped);
622 		}
623 	}
624 }
625 
626 /*
627  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
628  * to finish and prevents any new one from starting.  Call this before turning
629  * netmap mode off, or before removing the hardware rings (e.g., on module
630  * unload).
631  */
632 void
633 netmap_disable_all_rings(struct ifnet *ifp)
634 {
635 	if (NM_NA_VALID(ifp)) {
636 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
637 	}
638 }
639 
640 /*
641  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
642  * adapter's rings. In linux drivers, this should be placed near each
643  * napi_enable().
644  */
645 void
646 netmap_enable_all_rings(struct ifnet *ifp)
647 {
648 	if (NM_NA_VALID(ifp)) {
649 		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
650 	}
651 }
652 
653 void
654 netmap_make_zombie(struct ifnet *ifp)
655 {
656 	if (NM_NA_VALID(ifp)) {
657 		struct netmap_adapter *na = NA(ifp);
658 		netmap_set_all_rings(na, NM_KR_LOCKED);
659 		na->na_flags |= NAF_ZOMBIE;
660 		netmap_set_all_rings(na, 0);
661 	}
662 }
663 
664 void
665 netmap_undo_zombie(struct ifnet *ifp)
666 {
667 	if (NM_NA_VALID(ifp)) {
668 		struct netmap_adapter *na = NA(ifp);
669 		if (na->na_flags & NAF_ZOMBIE) {
670 			netmap_set_all_rings(na, NM_KR_LOCKED);
671 			na->na_flags &= ~NAF_ZOMBIE;
672 			netmap_set_all_rings(na, 0);
673 		}
674 	}
675 }
676 
677 /*
678  * generic bounds-checking function
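 *
 * A hypothetical call (values are only an example) is
 *
 *	nm_bound_var(&num_descs, 1024, 64, 4096, "number of descriptors");
 *
 * which resets num_descs to the default (1024) if it is below 64, clamps
 * it to 4096 if it is above, and logs the adjustment.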
679  */
680 u_int
681 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
682 {
683 	u_int oldv = *v;
684 	const char *op = NULL;
685 
686 	if (dflt < lo)
687 		dflt = lo;
688 	if (dflt > hi)
689 		dflt = hi;
690 	if (oldv < lo) {
691 		*v = dflt;
692 		op = "Bump";
693 	} else if (oldv > hi) {
694 		*v = hi;
695 		op = "Clamp";
696 	}
697 	if (op && msg)
698 		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
699 	return *v;
700 }
701 
702 
703 /*
704  * packet-dump function, user-supplied or static buffer.
705  * The destination buffer must be at least 30+4*len bytes long.
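 *
 * A typical use, as in netmap_rxsync_from_host() below, dumps at most
 * 128 bytes of a netmap buffer through the logging macros:
 *
 *	nm_prinf("%s", nm_dump_buf(NMB(na, slot), len, 128, NULL));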
706  */
707 const char *
708 nm_dump_buf(char *p, int len, int lim, char *dst)
709 {
710 	static char _dst[8192];
711 	int i, j, i0;
712 	static char hex[] ="0123456789abcdef";
713 	char *o;	/* output position */
714 
715 #define P_HI(x)	hex[((x) & 0xf0)>>4]
716 #define P_LO(x)	hex[((x) & 0xf)]
717 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
718 	if (!dst)
719 		dst = _dst;
720 	if (lim <= 0 || lim > len)
721 		lim = len;
722 	o = dst;
723 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
724 	o += strlen(o);
725 	/* hexdump routine */
726 	for (i = 0; i < lim; ) {
727 		sprintf(o, "%5d: ", i);
728 		o += strlen(o);
729 		memset(o, ' ', 48);
730 		i0 = i;
731 		for (j=0; j < 16 && i < lim; i++, j++) {
732 			o[j*3] = P_HI(p[i]);
733 			o[j*3+1] = P_LO(p[i]);
734 		}
735 		i = i0;
736 		for (j=0; j < 16 && i < lim; i++, j++)
737 			o[j + 48] = P_C(p[i]);
738 		o[j+48] = '\n';
739 		o += j+49;
740 	}
741 	*o = '\0';
742 #undef P_HI
743 #undef P_LO
744 #undef P_C
745 	return dst;
746 }
747 
748 
749 /*
750  * Fetch configuration from the device, to cope with dynamic
751  * reconfigurations after loading the module.
752  */
753 /* call with NMG_LOCK held */
754 int
755 netmap_update_config(struct netmap_adapter *na)
756 {
757 	struct nm_config_info info;
758 
759 	bzero(&info, sizeof(info));
760 	if (na->nm_config == NULL ||
761 	    na->nm_config(na, &info)) {
762 		/* take whatever we had at init time */
763 		info.num_tx_rings = na->num_tx_rings;
764 		info.num_tx_descs = na->num_tx_desc;
765 		info.num_rx_rings = na->num_rx_rings;
766 		info.num_rx_descs = na->num_rx_desc;
767 		info.rx_buf_maxsize = na->rx_buf_maxsize;
768 	}
769 
770 	if (na->num_tx_rings == info.num_tx_rings &&
771 	    na->num_tx_desc == info.num_tx_descs &&
772 	    na->num_rx_rings == info.num_rx_rings &&
773 	    na->num_rx_desc == info.num_rx_descs &&
774 	    na->rx_buf_maxsize == info.rx_buf_maxsize)
775 		return 0; /* nothing changed */
776 	if (na->active_fds == 0) {
777 		na->num_tx_rings = info.num_tx_rings;
778 		na->num_tx_desc = info.num_tx_descs;
779 		na->num_rx_rings = info.num_rx_rings;
780 		na->num_rx_desc = info.num_rx_descs;
781 		na->rx_buf_maxsize = info.rx_buf_maxsize;
782 		if (netmap_verbose)
783 			nm_prinf("configuration changed for %s: txring %d x %d, "
784 				"rxring %d x %d, rxbufsz %d",
785 				na->name, na->num_tx_rings, na->num_tx_desc,
786 				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
787 		return 0;
788 	}
789 	nm_prerr("WARNING: configuration changed for %s while active: "
790 		"txring %d x %d, rxring %d x %d, rxbufsz %d",
791 		na->name, info.num_tx_rings, info.num_tx_descs,
792 		info.num_rx_rings, info.num_rx_descs,
793 		info.rx_buf_maxsize);
794 	return 1;
795 }
796 
797 /* nm_sync callbacks for the host rings */
798 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
799 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
800 
801 /* create the krings array and initialize the fields common to all adapters.
802  * The array layout is this:
803  *
804  *                    +----------+
805  * na->tx_rings ----->|          | \
806  *                    |          |  } na->num_tx_rings
807  *                    |          | /
808  *                    +----------+
809  *                    |          |    host tx kring
810  * na->rx_rings ----> +----------+
811  *                    |          | \
812  *                    |          |  } na->num_rx_rings
813  *                    |          | /
814  *                    +----------+
815  *                    |          |    host rx kring
816  *                    +----------+
817  * na->tailroom ----->|          | \
818  *                    |          |  } tailroom bytes
819  *                    |          | /
820  *                    +----------+
821  *
822  * Note: for compatibility, host krings are created even when not needed.
823  * The tailroom space is currently used by vale ports for allocating leases.
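 * In the rest of the code a kring is then reached either directly, as
 * na->tx_rings[i] or na->rx_rings[i], or generically as NMR(na, t)[i]
 * with t in {NR_TX, NR_RX} (see netmap_kern.h).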
824  */
825 /* call with NMG_LOCK held */
826 int
827 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
828 {
829 	u_int i, len, ndesc;
830 	struct netmap_kring *kring;
831 	u_int n[NR_TXRX];
832 	enum txrx t;
833 	int err = 0;
834 
835 	if (na->tx_rings != NULL) {
836 		if (netmap_debug & NM_DEBUG_ON)
837 			nm_prerr("warning: krings were already created");
838 		return 0;
839 	}
840 
841 	/* account for the (possibly fake) host rings */
842 	n[NR_TX] = netmap_all_rings(na, NR_TX);
843 	n[NR_RX] = netmap_all_rings(na, NR_RX);
844 
845 	len = (n[NR_TX] + n[NR_RX]) *
846 		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
847 		+ tailroom;
848 
849 	na->tx_rings = nm_os_malloc((size_t)len);
850 	if (na->tx_rings == NULL) {
851 		nm_prerr("Cannot allocate krings");
852 		return ENOMEM;
853 	}
854 	na->rx_rings = na->tx_rings + n[NR_TX];
855 	na->tailroom = na->rx_rings + n[NR_RX];
856 
857 	/* link the krings in the krings array */
858 	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
859 	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
860 		na->tx_rings[i] = kring;
861 		kring++;
862 	}
863 
864 	/*
865 	 * All fields in krings are 0 except the ones initialized below,
866 	 * but it is better to be explicit about the important kring fields.
867 	 */
868 	for_rx_tx(t) {
869 		ndesc = nma_get_ndesc(na, t);
870 		for (i = 0; i < n[t]; i++) {
871 			kring = NMR(na, t)[i];
872 			bzero(kring, sizeof(*kring));
873 			kring->notify_na = na;
874 			kring->ring_id = i;
875 			kring->tx = t;
876 			kring->nkr_num_slots = ndesc;
877 			kring->nr_mode = NKR_NETMAP_OFF;
878 			kring->nr_pending_mode = NKR_NETMAP_OFF;
879 			if (i < nma_get_nrings(na, t)) {
880 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
881 			} else {
882 				if (!(na->na_flags & NAF_HOST_RINGS))
883 					kring->nr_kflags |= NKR_FAKERING;
884 				kring->nm_sync = (t == NR_TX ?
885 						netmap_txsync_to_host:
886 						netmap_rxsync_from_host);
887 			}
888 			kring->nm_notify = na->nm_notify;
889 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
890 			/*
891 			 * IMPORTANT: Always keep one slot empty.
892 			 */
893 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
894 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
895 					nm_txrx2str(t), i);
896 			ND("ktx %s h %d c %d t %d",
897 				kring->name, kring->rhead, kring->rcur, kring->rtail);
898 			err = nm_os_selinfo_init(&kring->si, kring->name);
899 			if (err) {
900 				netmap_krings_delete(na);
901 				return err;
902 			}
903 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
904 			kring->na = na;	/* setting this field marks the mutex as initialized */
905 		}
906 		err = nm_os_selinfo_init(&na->si[t], na->name);
907 		if (err) {
908 			netmap_krings_delete(na);
909 			return err;
910 		}
911 	}
912 
913 	return 0;
914 }
915 
916 
917 /* undo the actions performed by netmap_krings_create */
918 /* call with NMG_LOCK held */
919 void
920 netmap_krings_delete(struct netmap_adapter *na)
921 {
922 	struct netmap_kring **kring = na->tx_rings;
923 	enum txrx t;
924 
925 	if (na->tx_rings == NULL) {
926 		if (netmap_debug & NM_DEBUG_ON)
927 			nm_prerr("warning: krings were already deleted");
928 		return;
929 	}
930 
931 	for_rx_tx(t)
932 		nm_os_selinfo_uninit(&na->si[t]);
933 
934 	/* we rely on the krings layout described above */
935 	for ( ; kring != na->tailroom; kring++) {
936 		if ((*kring)->na != NULL)
937 			mtx_destroy(&(*kring)->q_lock);
938 		nm_os_selinfo_uninit(&(*kring)->si);
939 	}
940 	nm_os_free(na->tx_rings);
941 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
942 }
943 
944 
945 /*
946  * Destructor for NIC ports. They also have an mbuf queue
947  * on the rings connected to the host so we need to purge
948  * them first.
949  */
950 /* call with NMG_LOCK held */
951 void
952 netmap_hw_krings_delete(struct netmap_adapter *na)
953 {
954 	u_int lim = netmap_real_rings(na, NR_RX), i;
955 
956 	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
957 		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
958 		ND("destroy sw mbq with len %d", mbq_len(q));
959 		mbq_purge(q);
960 		mbq_safe_fini(q);
961 	}
962 	netmap_krings_delete(na);
963 }
964 
965 static void
966 netmap_mem_drop(struct netmap_adapter *na)
967 {
968 	int last = netmap_mem_deref(na->nm_mem, na);
969 	/* if the native allocator had been overridden on regif,
970 	 * restore it now and drop the temporary one
971 	 */
972 	if (last && na->nm_mem_prev) {
973 		netmap_mem_put(na->nm_mem);
974 		na->nm_mem = na->nm_mem_prev;
975 		na->nm_mem_prev = NULL;
976 	}
977 }
978 
979 /*
980  * Undo everything that was done in netmap_do_regif(). In particular,
981  * call na->nm_register(na, 0) to stop netmap mode on the interface and
982  * revert to normal operation.
983  */
984 /* call with NMG_LOCK held */
985 static void netmap_unset_ringid(struct netmap_priv_d *);
986 static void netmap_krings_put(struct netmap_priv_d *);
987 void
988 netmap_do_unregif(struct netmap_priv_d *priv)
989 {
990 	struct netmap_adapter *na = priv->np_na;
991 
992 	NMG_LOCK_ASSERT();
993 	na->active_fds--;
994 	/* unset nr_pending_mode and possibly release exclusive mode */
995 	netmap_krings_put(priv);
996 
997 #ifdef	WITH_MONITOR
998 	/* XXX check whether we have to do something with monitor
999 	 * when rings change nr_mode. */
1000 	if (na->active_fds <= 0) {
1001 		/* walk through all the rings and tell any monitor
1002 		 * that the port is going to exit netmap mode
1003 		 */
1004 		netmap_monitor_stop(na);
1005 	}
1006 #endif
1007 
1008 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
1009 		na->nm_register(na, 0);
1010 	}
1011 
1012 	/* delete rings and buffers that are no longer needed */
1013 	netmap_mem_rings_delete(na);
1014 
1015 	if (na->active_fds <= 0) {	/* last instance */
1016 		/*
1017 		 * (TO CHECK) We enter here
1018 		 * when the last reference to this file descriptor goes
1019 		 * away. This means we cannot have any pending poll()
1020 		 * or interrupt routine operating on the structure.
1021 		 * XXX The file may be closed in a thread while
1022 		 * another thread is using it.
1023 		 * Linux keeps the file opened until the last reference
1024 		 * by any outstanding ioctl/poll or mmap is gone.
1025 		 * FreeBSD does not track mmap()s (but we do) and
1026 		 * wakes up any sleeping poll(). Need to check what
1027 		 * happens if the close() occurs while a concurrent
1028 		 * syscall is running.
1029 		 */
1030 		if (netmap_debug & NM_DEBUG_ON)
1031 			nm_prinf("deleting last instance for %s", na->name);
1032 
1033 		if (nm_netmap_on(na)) {
1034 			nm_prerr("BUG: netmap on while going to delete the krings");
1035 		}
1036 
1037 		na->nm_krings_delete(na);
1038 	}
1039 
1040 	/* possibly decrement the counter of tx_si/rx_si users */
1041 	netmap_unset_ringid(priv);
1042 	/* delete the nifp */
1043 	netmap_mem_if_delete(na, priv->np_nifp);
1044 	/* drop the allocator */
1045 	netmap_mem_drop(na);
1046 	/* mark the priv as unregistered */
1047 	priv->np_na = NULL;
1048 	priv->np_nifp = NULL;
1049 }
1050 
1051 struct netmap_priv_d*
1052 netmap_priv_new(void)
1053 {
1054 	struct netmap_priv_d *priv;
1055 
1056 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1057 	if (priv == NULL)
1058 		return NULL;
1059 	priv->np_refs = 1;
1060 	nm_os_get_module();
1061 	return priv;
1062 }
1063 
1064 /*
1065  * Destructor of the netmap_priv_d, called when the fd is closed.
1066  * Action: undo all the things done by NIOCREGIF.
1067  * On FreeBSD we need to track whether there are active mmap()s,
1068  * and we use np_active_mmaps for that. On linux, the field is always 0.
1069  * The priv itself is freed here once the last reference to it is gone.
1070  *
1071  */
1072 /* call with NMG_LOCK held */
1073 void
1074 netmap_priv_delete(struct netmap_priv_d *priv)
1075 {
1076 	struct netmap_adapter *na = priv->np_na;
1077 
1078 	/* number of active references to this fd */
1079 	if (--priv->np_refs > 0) {
1080 		return;
1081 	}
1082 	nm_os_put_module();
1083 	if (na) {
1084 		netmap_do_unregif(priv);
1085 	}
1086 	netmap_unget_na(na, priv->np_ifp);
1087 	bzero(priv, sizeof(*priv));	/* for safety */
1088 	nm_os_free(priv);
1089 }
1090 
1091 
1092 /* call with NMG_LOCK *not* held */
1093 void
1094 netmap_dtor(void *data)
1095 {
1096 	struct netmap_priv_d *priv = data;
1097 
1098 	NMG_LOCK();
1099 	netmap_priv_delete(priv);
1100 	NMG_UNLOCK();
1101 }
1102 
1103 
1104 /*
1105  * Handlers for synchronization of the rings from/to the host stack.
1106  * These are associated to a network interface and are just another
1107  * ring pair managed by userspace.
1108  *
1109  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1110  * flags):
1111  *
1112  * - Before releasing buffers on hw RX rings, the application can mark
1113  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1114  *   will be forwarded to the host stack, similarly to what happened if
1115  *   the application moved them to the host TX ring.
1116  *
1117  * - Before releasing buffers on the host RX ring, the application can
1118  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1119  *   they will be forwarded to the hw TX rings, saving the application
1120  *   from doing the same task in user-space.
1121  *
1122  * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1123  * flag, or globally with the netmap_fwd sysctl.
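 *
 * For illustration (userspace side), to have the packet in slot i of a
 * hw RX ring copied to the host stack during the next rxsync/poll, an
 * application can do
 *
 *	ring->flags |= NR_FORWARD;		// once per ring
 *	ring->slot[i].flags |= NS_FORWARD;	// then advance head past i
 *
 * or enable the behaviour globally with the dev.netmap.fwd sysctl.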
1124  *
1125  * The transfer NIC --> host is relatively easy, just encapsulate
1126  * into mbufs and we are done. The host --> NIC side is slightly
1127  * harder because there might not be room in the tx ring so it
1128  * might take a while before releasing the buffer.
1129  */
1130 
1131 
1132 /*
1133  * Pass a whole queue of mbufs to the host stack as coming from 'dst'.
1134  * We do not need to lock because the queue is private.
1135  * After this call the queue is empty.
1136  */
1137 static void
1138 netmap_send_up(struct ifnet *dst, struct mbq *q)
1139 {
1140 	struct mbuf *m;
1141 	struct mbuf *head = NULL, *prev = NULL;
1142 
1143 	/* Send packets up, outside the lock; head/prev machinery
1144 	 * is only useful for Windows. */
1145 	while ((m = mbq_dequeue(q)) != NULL) {
1146 		if (netmap_debug & NM_DEBUG_HOST)
1147 			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1148 		prev = nm_os_send_up(dst, m, prev);
1149 		if (head == NULL)
1150 			head = prev;
1151 	}
1152 	if (head)
1153 		nm_os_send_up(dst, NULL, head);
1154 	mbq_fini(q);
1155 }
1156 
1157 
1158 /*
1159  * Scan the buffers from hwcur to ring->head, and put a copy of those
1160  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1161  * Drop remaining packets in the unlikely event
1162  * of an mbuf shortage.
1163  */
1164 static void
1165 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1166 {
1167 	u_int const lim = kring->nkr_num_slots - 1;
1168 	u_int const head = kring->rhead;
1169 	u_int n;
1170 	struct netmap_adapter *na = kring->na;
1171 
1172 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1173 		struct mbuf *m;
1174 		struct netmap_slot *slot = &kring->ring->slot[n];
1175 
1176 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1177 			continue;
1178 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1179 			RD(5, "bad pkt at %d len %d", n, slot->len);
1180 			continue;
1181 		}
1182 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1183 		/* XXX TODO: adapt to the case of a multisegment packet */
1184 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1185 
1186 		if (m == NULL)
1187 			break;
1188 		mbq_enqueue(q, m);
1189 	}
1190 }
1191 
1192 static inline int
1193 _nm_may_forward(struct netmap_kring *kring)
1194 {
1195 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1196 		 kring->na->na_flags & NAF_HOST_RINGS &&
1197 		 kring->tx == NR_RX);
1198 }
1199 
1200 static inline int
1201 nm_may_forward_up(struct netmap_kring *kring)
1202 {
1203 	return	_nm_may_forward(kring) &&
1204 		 kring->ring_id != kring->na->num_rx_rings;
1205 }
1206 
1207 static inline int
1208 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1209 {
1210 	return	_nm_may_forward(kring) &&
1211 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1212 		 kring->ring_id == kring->na->num_rx_rings;
1213 }
1214 
1215 /*
1216  * Send to the NIC rings packets marked NS_FORWARD between
1217  * kring->nr_hwcur and kring->rhead.
1218  * Called under kring->rx_queue.lock on the sw rx ring.
1219  *
1220  * It can only be called if the user opened all the TX hw rings,
1221  * see NAF_CAN_FORWARD_DOWN flag.
1222  * We can touch the TX netmap rings (slots, head and cur) since
1223  * we are in poll/ioctl system call context, and the application
1224  * is not supposed to touch the ring (using a different thread)
1225  * during the execution of the system call.
1226  */
1227 static u_int
1228 netmap_sw_to_nic(struct netmap_adapter *na)
1229 {
1230 	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1231 	struct netmap_slot *rxslot = kring->ring->slot;
1232 	u_int i, rxcur = kring->nr_hwcur;
1233 	u_int const head = kring->rhead;
1234 	u_int const src_lim = kring->nkr_num_slots - 1;
1235 	u_int sent = 0;
1236 
1237 	/* scan rings to find space, then fill as much as possible */
1238 	for (i = 0; i < na->num_tx_rings; i++) {
1239 		struct netmap_kring *kdst = na->tx_rings[i];
1240 		struct netmap_ring *rdst = kdst->ring;
1241 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1242 
1243 		/* XXX do we trust ring or kring->rcur,rtail ? */
1244 		for (; rxcur != head && !nm_ring_empty(rdst);
1245 		     rxcur = nm_next(rxcur, src_lim) ) {
1246 			struct netmap_slot *src, *dst, tmp;
1247 			u_int dst_head = rdst->head;
1248 
1249 			src = &rxslot[rxcur];
1250 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1251 				continue;
1252 
1253 			sent++;
1254 
1255 			dst = &rdst->slot[dst_head];
1256 
1257 			tmp = *src;
1258 
1259 			src->buf_idx = dst->buf_idx;
1260 			src->flags = NS_BUF_CHANGED;
1261 
1262 			dst->buf_idx = tmp.buf_idx;
1263 			dst->len = tmp.len;
1264 			dst->flags = NS_BUF_CHANGED;
1265 
1266 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1267 		}
1268 		/* if (sent) XXX txsync ? it would be just an optimization */
1269 	}
1270 	return sent;
1271 }
1272 
1273 
1274 /*
1275  * netmap_txsync_to_host() passes packets up. We are called from a
1276  * system call in user process context, and the only contention
1277  * can be among multiple user threads erroneously calling
1278  * this routine concurrently.
1279  */
1280 static int
1281 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1282 {
1283 	struct netmap_adapter *na = kring->na;
1284 	u_int const lim = kring->nkr_num_slots - 1;
1285 	u_int const head = kring->rhead;
1286 	struct mbq q;
1287 
1288 	/* Take packets from hwcur to head and pass them up.
1289 	 * Force hwcur = head since netmap_grab_packets() stops at head
1290 	 */
1291 	mbq_init(&q);
1292 	netmap_grab_packets(kring, &q, 1 /* force */);
1293 	ND("have %d pkts in queue", mbq_len(&q));
1294 	kring->nr_hwcur = head;
1295 	kring->nr_hwtail = head + lim;
1296 	if (kring->nr_hwtail > lim)
1297 		kring->nr_hwtail -= lim + 1;
1298 
1299 	netmap_send_up(na->ifp, &q);
1300 	return 0;
1301 }
1302 
1303 
1304 /*
1305  * rxsync backend for packets coming from the host stack.
1306  * They have been put in kring->rx_queue by netmap_transmit().
1307  * We protect access to the kring using kring->rx_queue.lock.
1308  *
1309  * It also moves to the nic hw rings any packet the user has marked
1310  * for transparent-mode forwarding, then sets the NR_FORWARD
1311  * flag in the kring to let the caller push them out.
1312  */
1313 static int
1314 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1315 {
1316 	struct netmap_adapter *na = kring->na;
1317 	struct netmap_ring *ring = kring->ring;
1318 	u_int nm_i, n;
1319 	u_int const lim = kring->nkr_num_slots - 1;
1320 	u_int const head = kring->rhead;
1321 	int ret = 0;
1322 	struct mbq *q = &kring->rx_queue, fq;
1323 
1324 	mbq_init(&fq); /* fq holds packets to be freed */
1325 
1326 	mbq_lock(q);
1327 
1328 	/* First part: import newly received packets */
1329 	n = mbq_len(q);
1330 	if (n) { /* grab packets from the queue */
1331 		struct mbuf *m;
1332 		uint32_t stop_i;
1333 
1334 		nm_i = kring->nr_hwtail;
1335 		stop_i = nm_prev(kring->nr_hwcur, lim);
1336 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1337 			int len = MBUF_LEN(m);
1338 			struct netmap_slot *slot = &ring->slot[nm_i];
1339 
1340 			m_copydata(m, 0, len, NMB(na, slot));
1341 			ND("nm %d len %d", nm_i, len);
1342 			if (netmap_debug & NM_DEBUG_HOST)
1343 				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1344 
1345 			slot->len = len;
1346 			slot->flags = 0;
1347 			nm_i = nm_next(nm_i, lim);
1348 			mbq_enqueue(&fq, m);
1349 		}
1350 		kring->nr_hwtail = nm_i;
1351 	}
1352 
1353 	/*
1354 	 * Second part: skip past packets that userspace has released.
1355 	 */
1356 	nm_i = kring->nr_hwcur;
1357 	if (nm_i != head) { /* something was released */
1358 		if (nm_may_forward_down(kring, flags)) {
1359 			ret = netmap_sw_to_nic(na);
1360 			if (ret > 0) {
1361 				kring->nr_kflags |= NR_FORWARD;
1362 				ret = 0;
1363 			}
1364 		}
1365 		kring->nr_hwcur = head;
1366 	}
1367 
1368 	mbq_unlock(q);
1369 
1370 	mbq_purge(&fq);
1371 	mbq_fini(&fq);
1372 
1373 	return ret;
1374 }
1375 
1376 
1377 /* Get a netmap adapter for the port.
1378  *
1379  * If it is possible to satisfy the request, return 0
1380  * with *na containing the netmap adapter found.
1381  * Otherwise return an error code, with *na containing NULL.
1382  *
1383  * When the port is attached to a bridge, we always return
1384  * EBUSY.
1385  * Otherwise, if the port is already bound to a file descriptor,
1386  * then we unconditionally return the existing adapter into *na.
1387  * In all the other cases, we return (into *na) either native,
1388  * generic or NULL, according to the following table:
1389  *
1390  *					native_support
1391  * active_fds   dev.netmap.admode         YES     NO
1392  * -------------------------------------------------------
1393  *    >0              *                 NA(ifp) NA(ifp)
1394  *
1395  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1396  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1397  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1398  *
1399  */
1400 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1401 int
1402 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1403 {
1404 	/* generic support */
1405 	int i = netmap_admode;	/* Take a snapshot. */
1406 	struct netmap_adapter *prev_na;
1407 	int error = 0;
1408 
1409 	*na = NULL; /* default */
1410 
1411 	/* reset in case of invalid value */
1412 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1413 		i = netmap_admode = NETMAP_ADMODE_BEST;
1414 
1415 	if (NM_NA_VALID(ifp)) {
1416 		prev_na = NA(ifp);
1417 		/* If an adapter already exists, return it if
1418 		 * there are active file descriptors or if
1419 		 * netmap is not forced to use generic
1420 		 * adapters.
1421 		 */
1422 		if (NETMAP_OWNED_BY_ANY(prev_na)
1423 			|| i != NETMAP_ADMODE_GENERIC
1424 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1425 #ifdef WITH_PIPES
1426 			/* ugly, but we cannot allow an adapter switch
1427 			 * if some pipe is referring to this one
1428 			 */
1429 			|| prev_na->na_next_pipe > 0
1430 #endif
1431 		) {
1432 			*na = prev_na;
1433 			goto assign_mem;
1434 		}
1435 	}
1436 
1437 	/* If there isn't native support and netmap is not allowed
1438 	 * to use generic adapters, we cannot satisfy the request.
1439 	 */
1440 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1441 		return EOPNOTSUPP;
1442 
1443 	/* Otherwise, create a generic adapter and return it,
1444 	 * saving the previously used netmap adapter, if any.
1445 	 *
1446 	 * Note that here 'prev_na', if not NULL, MUST be a
1447 	 * native adapter, and CANNOT be a generic one. This is
1448 	 * true because generic adapters are created on demand, and
1449 	 * destroyed when not used anymore. Therefore, if the adapter
1450 	 * currently attached to an interface 'ifp' is generic, it
1451 	 * must be that
1452 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1453 	 * Consequently, if NA(ifp) is generic, we will enter one of
1454 	 * the branches above. This ensures that we never override
1455 	 * a generic adapter with another generic adapter.
1456 	 */
1457 	error = generic_netmap_attach(ifp);
1458 	if (error)
1459 		return error;
1460 
1461 	*na = NA(ifp);
1462 
1463 assign_mem:
1464 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1465 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1466 		(*na)->nm_mem_prev = (*na)->nm_mem;
1467 		(*na)->nm_mem = netmap_mem_get(nmd);
1468 	}
1469 
1470 	return 0;
1471 }
1472 
1473 /*
1474  * MUST BE CALLED UNDER NMG_LOCK()
1475  *
1476  * Get a refcounted reference to a netmap adapter attached
1477  * to the interface specified by req.
1478  * This is always called in the execution of an ioctl().
1479  *
1480  * Return ENXIO if the interface specified by the request does
1481  * not exist, ENOTSUP if netmap is not supported by the interface,
1482  * EBUSY if the interface is already attached to a bridge,
1483  * EINVAL if parameters are invalid, ENOMEM if needed resources
1484  * could not be allocated.
1485  * If successful, hold a reference to the netmap adapter.
1486  *
1487  * If the interface specified by req is a system one, also keep
1488  * a reference to it and return a valid *ifp.
1489  */
1490 int
1491 netmap_get_na(struct nmreq_header *hdr,
1492 	      struct netmap_adapter **na, struct ifnet **ifp,
1493 	      struct netmap_mem_d *nmd, int create)
1494 {
1495 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1496 	int error = 0;
1497 	struct netmap_adapter *ret = NULL;
1498 	int nmd_ref = 0;
1499 
1500 	*na = NULL;     /* default return value */
1501 	*ifp = NULL;
1502 
1503 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1504 		return EINVAL;
1505 	}
1506 
1507 	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1508 			req->nr_mode == NR_REG_PIPE_SLAVE) {
1509 		/* Do not accept deprecated pipe modes. */
1510 		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1511 		return EINVAL;
1512 	}
1513 
1514 	NMG_LOCK_ASSERT();
1515 
1516 	/* if the request contains a memid, try to find the
1517 	 * corresponding memory region
1518 	 */
1519 	if (nmd == NULL && req->nr_mem_id) {
1520 		nmd = netmap_mem_find(req->nr_mem_id);
1521 		if (nmd == NULL)
1522 			return EINVAL;
1523 		/* keep the reference */
1524 		nmd_ref = 1;
1525 	}
1526 
1527 	/* We cascade through all possible types of netmap adapter.
1528 	 * All netmap_get_*_na() functions return an error and an na,
1529 	 * with the following combinations:
1530 	 *
1531 	 * error    na
1532 	 *   0	   NULL		type doesn't match
1533 	 *  !0	   NULL		type matches, but na creation/lookup failed
1534 	 *   0	  !NULL		type matches and na created/found
1535 	 *  !0    !NULL		impossible
1536 	 */
1537 	error = netmap_get_null_na(hdr, na, nmd, create);
1538 	if (error || *na != NULL)
1539 		goto out;
1540 
1541 	/* try to see if this is a monitor port */
1542 	error = netmap_get_monitor_na(hdr, na, nmd, create);
1543 	if (error || *na != NULL)
1544 		goto out;
1545 
1546 	/* try to see if this is a pipe port */
1547 	error = netmap_get_pipe_na(hdr, na, nmd, create);
1548 	if (error || *na != NULL)
1549 		goto out;
1550 
1551 	/* try to see if this is a bridge port */
1552 	error = netmap_get_vale_na(hdr, na, nmd, create);
1553 	if (error)
1554 		goto out;
1555 
1556 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1557 		goto out;
1558 
1559 	/*
1560 	 * This must be a hardware na, lookup the name in the system.
1561 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1562 	 * This may still be a tap, a veth/epair, or even a
1563 	 * persistent VALE port.
1564 	 */
1565 	*ifp = ifunit_ref(hdr->nr_name);
1566 	if (*ifp == NULL) {
1567 		error = ENXIO;
1568 		goto out;
1569 	}
1570 
1571 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1572 	if (error)
1573 		goto out;
1574 
1575 	*na = ret;
1576 	netmap_adapter_get(ret);
1577 
1578 out:
1579 	if (error) {
1580 		if (ret)
1581 			netmap_adapter_put(ret);
1582 		if (*ifp) {
1583 			if_rele(*ifp);
1584 			*ifp = NULL;
1585 		}
1586 	}
1587 	if (nmd_ref)
1588 		netmap_mem_put(nmd);
1589 
1590 	return error;
1591 }
1592 
1593 /* undo netmap_get_na() */
1594 void
1595 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1596 {
1597 	if (ifp)
1598 		if_rele(ifp);
1599 	if (na)
1600 		netmap_adapter_put(na);
1601 }
1602 
1603 
1604 #define NM_FAIL_ON(t) do {						\
1605 	if (unlikely(t)) {						\
1606 		RD(5, "%s: fail '" #t "' "				\
1607 			"h %d c %d t %d "				\
1608 			"rh %d rc %d rt %d "				\
1609 			"hc %d ht %d",					\
1610 			kring->name,					\
1611 			head, cur, ring->tail,				\
1612 			kring->rhead, kring->rcur, kring->rtail,	\
1613 			kring->nr_hwcur, kring->nr_hwtail);		\
1614 		return kring->nkr_num_slots;				\
1615 	}								\
1616 } while (0)
1617 
1618 /*
1619  * validate parameters on entry for *_txsync()
1620  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1621  * in case of error.
1622  *
1623  * rhead, rcur and rtail=hwtail are stored from previous round.
1624  * hwcur is the next packet to send to the ring.
1625  *
1626  * We want
1627  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1628  *
1629  * hwcur, rhead, rtail and hwtail are reliable
1630  */
1631 u_int
1632 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1633 {
1634 	u_int head = ring->head; /* read only once */
1635 	u_int cur = ring->cur; /* read only once */
1636 	u_int n = kring->nkr_num_slots;
1637 
1638 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1639 		kring->name,
1640 		kring->nr_hwcur, kring->nr_hwtail,
1641 		ring->head, ring->cur, ring->tail);
1642 #if 1 /* kernel sanity checks; but we can trust the kring. */
1643 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1644 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1645 #endif /* kernel sanity checks */
1646 	/*
1647 	 * user sanity checks. We only use head;
1648 	 * A, B, ... are possible positions for head:
1649 	 *
1650 	 *  0    A  rhead   B  rtail   C  n-1
1651 	 *  0    D  rtail   E  rhead   F  n-1
1652 	 *
1653 	 * B, F, D are valid. A, C, E are wrong
1654 	 */
1655 	if (kring->rtail >= kring->rhead) {
1656 		/* want rhead <= head <= rtail */
1657 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1658 		/* and also head <= cur <= rtail */
1659 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1660 	} else { /* here rtail < rhead */
1661 		/* we need head outside rtail .. rhead */
1662 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1663 
1664 		/* two cases now: head <= rtail or head >= rhead  */
1665 		if (head <= kring->rtail) {
1666 			/* want head <= cur <= rtail */
1667 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1668 		} else { /* head >= rhead */
1669 			/* cur must be outside rtail..head */
1670 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1671 		}
1672 	}
1673 	if (ring->tail != kring->rtail) {
1674 		RD(5, "%s tail overwritten was %d need %d", kring->name,
1675 			ring->tail, kring->rtail);
1676 		ring->tail = kring->rtail;
1677 	}
1678 	kring->rhead = head;
1679 	kring->rcur = cur;
1680 	return head;
1681 }
1682 
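/*
 * Illustrative sketch (hypothetical helper, not a netmap API): the user
 * sanity checks in nm_txsync_prologue() above test membership in circular
 * intervals. For a ring of n slots, an index x lies in the circular
 * interval [lo, hi] iff (x - lo) mod n <= (hi - lo) mod n.
 */
#if 0	/* documentation only, never compiled */
static int
nm_idx_in_circular_interval(u_int x, u_int lo, u_int hi, u_int n)
{
	/* distance of x from lo, wrapping around the end of the ring */
	return ((x + n - lo) % n) <= ((hi + n - lo) % n);
}
/*
 * The user sanity checks above are then equivalent to requiring
 *	nm_idx_in_circular_interval(head, kring->rhead, kring->rtail, n) &&
 *	nm_idx_in_circular_interval(cur, head, kring->rtail, n)
 */
#endif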
1683 
1684 /*
1685  * validate parameters on entry for *_rxsync()
1686  * Returns ring->head if ok, kring->nkr_num_slots on error.
1687  *
1688  * For a valid configuration,
1689  * hwcur <= head <= cur <= tail <= hwtail
1690  *
1691  * We only consider head and cur.
1692  * hwcur and hwtail are reliable.
1693  *
1694  */
1695 u_int
1696 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1697 {
1698 	uint32_t const n = kring->nkr_num_slots;
1699 	uint32_t head, cur;
1700 
1701 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1702 		kring->name,
1703 		kring->nr_hwcur, kring->nr_hwtail,
1704 		ring->head, ring->cur, ring->tail);
1705 	/*
1706 	 * Before storing the new values, we should check they do not
1707 	 * move backwards. However:
1708 	 * - head is not an issue because the previous value is hwcur;
1709 	 * - cur could in principle go back, however it does not matter
1710 	 *   because we are processing a brand new rxsync()
1711 	 */
1712 	cur = kring->rcur = ring->cur;	/* read only once */
1713 	head = kring->rhead = ring->head;	/* read only once */
1714 #if 1 /* kernel sanity checks */
1715 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1716 #endif /* kernel sanity checks */
1717 	/* user sanity checks */
1718 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1719 		/* want hwcur <= rhead <= hwtail */
1720 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1721 		/* and also rhead <= rcur <= hwtail */
1722 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1723 	} else {
1724 		/* we need rhead outside hwtail..hwcur */
1725 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1726 		/* two cases now: head <= hwtail or head >= hwcur  */
1727 		if (head <= kring->nr_hwtail) {
1728 			/* want head <= cur <= hwtail */
1729 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1730 		} else {
1731 			/* cur must be outside hwtail..head */
1732 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1733 		}
1734 	}
1735 	if (ring->tail != kring->rtail) {
1736 		RD(5, "%s tail overwritten was %d need %d",
1737 			kring->name,
1738 			ring->tail, kring->rtail);
1739 		ring->tail = kring->rtail;
1740 	}
1741 	return head;
1742 }
1743 
1744 
1745 /*
1746  * Error routine called when txsync/rxsync detects an error.
1747  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1748  * Return 1 on reinit.
1749  *
1750  * This routine is only called by the upper half of the kernel.
1751  * It only reads hwcur (which is changed only by the upper half, too)
1752  * and hwtail (which may be changed by the lower half, but only on
1753  * a tx ring and only to increase it, so any error will be recovered
1754  * on the next call). For the above, we don't strictly need to call
1755  * it under lock.
1756  */
1757 int
1758 netmap_ring_reinit(struct netmap_kring *kring)
1759 {
1760 	struct netmap_ring *ring = kring->ring;
1761 	u_int i, lim = kring->nkr_num_slots - 1;
1762 	int errors = 0;
1763 
1764 	// XXX KASSERT nm_kr_tryget
1765 	RD(10, "called for %s", kring->name);
1766 	// XXX probably wrong to trust userspace
1767 	kring->rhead = ring->head;
1768 	kring->rcur  = ring->cur;
1769 	kring->rtail = ring->tail;
1770 
1771 	if (ring->cur > lim)
1772 		errors++;
1773 	if (ring->head > lim)
1774 		errors++;
1775 	if (ring->tail > lim)
1776 		errors++;
1777 	for (i = 0; i <= lim; i++) {
1778 		u_int idx = ring->slot[i].buf_idx;
1779 		u_int len = ring->slot[i].len;
1780 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1781 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1782 			ring->slot[i].buf_idx = 0;
1783 			ring->slot[i].len = 0;
1784 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1785 			ring->slot[i].len = 0;
1786 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1787 		}
1788 	}
1789 	if (errors) {
1790 		RD(10, "total %d errors", errors);
1791 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1792 			kring->name,
1793 			ring->cur, kring->nr_hwcur,
1794 			ring->tail, kring->nr_hwtail);
1795 		ring->head = kring->rhead = kring->nr_hwcur;
1796 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1797 		ring->tail = kring->rtail = kring->nr_hwtail;
1798 	}
1799 	return (errors ? 1 : 0);
1800 }
1801 
1802 /* interpret the ringid and flags fields of an nmreq, by translating them
1803  * into a pair of intervals of ring indices:
1804  *
1805  * [priv->np_txqfirst, priv->np_txqlast) and
1806  * [priv->np_rxqfirst, priv->np_rxqlast)
1807  *
1808  */
1809 int
1810 netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1811 			uint16_t nr_ringid, uint64_t nr_flags)
1812 {
1813 	struct netmap_adapter *na = priv->np_na;
1814 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1815 	enum txrx t;
1816 	u_int j;
1817 
1818 	for_rx_tx(t) {
1819 		if (nr_flags & excluded_direction[t]) {
1820 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1821 			continue;
1822 		}
1823 		switch (nr_mode) {
1824 		case NR_REG_ALL_NIC:
1825 		case NR_REG_NULL:
1826 			priv->np_qfirst[t] = 0;
1827 			priv->np_qlast[t] = nma_get_nrings(na, t);
1828 			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1829 				priv->np_qfirst[t], priv->np_qlast[t]);
1830 			break;
1831 		case NR_REG_SW:
1832 		case NR_REG_NIC_SW:
1833 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1834 				nm_prerr("host rings not supported");
1835 				return EINVAL;
1836 			}
1837 			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1838 				nma_get_nrings(na, t) : 0);
1839 			priv->np_qlast[t] = netmap_all_rings(na, t);
1840 			ND("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1841 				nm_txrx2str(t),
1842 				priv->np_qfirst[t], priv->np_qlast[t]);
1843 			break;
1844 		case NR_REG_ONE_NIC:
1845 			if (nr_ringid >= na->num_tx_rings &&
1846 					nr_ringid >= na->num_rx_rings) {
1847 				nm_prerr("invalid ring id %d", nr_ringid);
1848 				return EINVAL;
1849 			}
1850 			/* if not enough rings, use the first one */
1851 			j = nr_ringid;
1852 			if (j >= nma_get_nrings(na, t))
1853 				j = 0;
1854 			priv->np_qfirst[t] = j;
1855 			priv->np_qlast[t] = j + 1;
1856 			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
1857 				priv->np_qfirst[t], priv->np_qlast[t]);
1858 			break;
1859 		default:
1860 			nm_prerr("invalid regif type %d", nr_mode);
1861 			return EINVAL;
1862 		}
1863 	}
1864 	priv->np_flags = nr_flags;
1865 
1866 	/* Allow transparent forwarding mode in the host --> nic
1867 	 * direction only if all the TX hw rings have been opened. */
1868 	if (priv->np_qfirst[NR_TX] == 0 &&
1869 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1870 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1871 	}
1872 
1873 	if (netmap_verbose) {
1874 		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1875 			na->name,
1876 			priv->np_qfirst[NR_TX],
1877 			priv->np_qlast[NR_TX],
1878 			priv->np_qfirst[NR_RX],
1879 			priv->np_qlast[NR_RX],
1880 			nr_ringid);
1881 	}
1882 	return 0;
1883 }
1884 
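/*
 * Worked example (hypothetical adapter): with 4 hw TX rings, 4 hw RX rings
 * and one host ring per direction, netmap_interp_ringid() above yields,
 * for each non-excluded direction:
 *	NR_REG_ALL_NIC              -> [0, 4)
 *	NR_REG_SW                   -> [4, 5)	(host ring only)
 *	NR_REG_NIC_SW               -> [0, 5)	(hw rings + host ring)
 *	NR_REG_ONE_NIC, nr_ringid 2 -> [2, 3)
 * A direction excluded by NR_TX_RINGS_ONLY/NR_RX_RINGS_ONLY gets the
 * empty interval [0, 0).
 */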
1885 
1886 /*
1887  * Set the ring ID. For devices with a single queue, a request
1888  * for all rings is the same as a single ring.
1889  */
1890 static int
1891 netmap_set_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1892 		uint16_t nr_ringid, uint64_t nr_flags)
1893 {
1894 	struct netmap_adapter *na = priv->np_na;
1895 	int error;
1896 	enum txrx t;
1897 
1898 	error = netmap_interp_ringid(priv, nr_mode, nr_ringid, nr_flags);
1899 	if (error) {
1900 		return error;
1901 	}
1902 
1903 	priv->np_txpoll = (nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1904 
1905 	/* optimization: count the users registered for more than
1906 	 * one ring, which are the ones sleeping on the global queue.
1907 	 * The default netmap_notify() callback will then
1908 	 * avoid signaling the global queue if nobody is using it
1909 	 */
1910 	for_rx_tx(t) {
1911 		if (nm_si_user(priv, t))
1912 			na->si_users[t]++;
1913 	}
1914 	return 0;
1915 }
1916 
1917 static void
1918 netmap_unset_ringid(struct netmap_priv_d *priv)
1919 {
1920 	struct netmap_adapter *na = priv->np_na;
1921 	enum txrx t;
1922 
1923 	for_rx_tx(t) {
1924 		if (nm_si_user(priv, t))
1925 			na->si_users[t]--;
1926 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1927 	}
1928 	priv->np_flags = 0;
1929 	priv->np_txpoll = 0;
1930 	priv->np_kloop_state = 0;
1931 }
1932 
1933 
1934 /* Set the nr_pending_mode for the requested rings.
1935  * If requested, also try to get exclusive access to the rings, provided
1936  * the rings we want to bind are not exclusively owned by a previous bind.
1937  */
1938 static int
1939 netmap_krings_get(struct netmap_priv_d *priv)
1940 {
1941 	struct netmap_adapter *na = priv->np_na;
1942 	u_int i;
1943 	struct netmap_kring *kring;
1944 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1945 	enum txrx t;
1946 
1947 	if (netmap_debug & NM_DEBUG_ON)
1948 		nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
1949 			na->name,
1950 			priv->np_qfirst[NR_TX],
1951 			priv->np_qlast[NR_TX],
1952 			priv->np_qfirst[NR_RX],
1953 			priv->np_qlast[NR_RX]);
1954 
1955 	/* first round: check that none of the requested rings
1956 	 * is already exclusively owned, and that we are not
1957 	 * asking for exclusive ownership of rings already in use
1958 	 */
1959 	for_rx_tx(t) {
1960 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1961 			kring = NMR(na, t)[i];
1962 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1963 			    (kring->users && excl))
1964 			{
1965 				ND("ring %s busy", kring->name);
1966 				return EBUSY;
1967 			}
1968 		}
1969 	}
1970 
1971 	/* second round: increment usage count (possibly marking them
1972 	 * as exclusive) and set the nr_pending_mode
1973 	 */
1974 	for_rx_tx(t) {
1975 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1976 			kring = NMR(na, t)[i];
1977 			kring->users++;
1978 			if (excl)
1979 				kring->nr_kflags |= NKR_EXCLUSIVE;
1980 			kring->nr_pending_mode = NKR_NETMAP_ON;
1981 		}
1982 	}
1983 
1984 	return 0;
1985 
1986 }
1987 
1988 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1989  * if it was requested at regif time, and by unsetting the nr_pending_mode
1990  * if we are the last user of the involved rings. */
1991 static void
1992 netmap_krings_put(struct netmap_priv_d *priv)
1993 {
1994 	struct netmap_adapter *na = priv->np_na;
1995 	u_int i;
1996 	struct netmap_kring *kring;
1997 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1998 	enum txrx t;
1999 
2000 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
2001 			na->name,
2002 			priv->np_qfirst[NR_TX],
2003 			priv->np_qlast[NR_TX],
2004 			priv->np_qfirst[NR_RX],
2005 			priv->np_qlast[NR_RX]);
2006 
2007 	for_rx_tx(t) {
2008 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2009 			kring = NMR(na, t)[i];
2010 			if (excl)
2011 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
2012 			kring->users--;
2013 			if (kring->users == 0)
2014 				kring->nr_pending_mode = NKR_NETMAP_OFF;
2015 		}
2016 	}
2017 }
2018 
2019 static int
2020 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2021 {
2022 	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2023 }
2024 
2025 /* Validate the CSB entries for both directions (atok and ktoa).
2026  * To be called under NMG_LOCK(). */
2027 static int
2028 netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2029 {
2030 	struct nm_csb_atok *csb_atok_base =
2031 		(struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2032 	struct nm_csb_ktoa *csb_ktoa_base =
2033 		(struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2034 	enum txrx t;
2035 	int num_rings[NR_TXRX], tot_rings;
2036 	size_t entry_size[2];
2037 	void *csb_start[2];
2038 	int i;
2039 
2040 	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2041 		nm_prerr("Cannot update CSB while kloop is running");
2042 		return EBUSY;
2043 	}
2044 
2045 	tot_rings = 0;
2046 	for_rx_tx(t) {
2047 		num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2048 		tot_rings += num_rings[t];
2049 	}
2050 	if (tot_rings <= 0)
2051 		return 0;
2052 
2053 	if (!(priv->np_flags & NR_EXCLUSIVE)) {
2054 		nm_prerr("CSB mode requires NR_EXCLUSIVE");
2055 		return EINVAL;
2056 	}
2057 
2058 	entry_size[0] = sizeof(*csb_atok_base);
2059 	entry_size[1] = sizeof(*csb_ktoa_base);
2060 	csb_start[0] = (void *)csb_atok_base;
2061 	csb_start[1] = (void *)csb_ktoa_base;
2062 
2063 	for (i = 0; i < 2; i++) {
2064 		/* On Linux we could use access_ok() to simplify
2065 		 * the validation. However, the advantage of
2066 		 * this approach is that it also works on
2067 		 * FreeBSD. */
2068 		size_t csb_size = tot_rings * entry_size[i];
2069 		void *tmp;
2070 		int err;
2071 
2072 		if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2073 			nm_prerr("Unaligned CSB address");
2074 			return EINVAL;
2075 		}
2076 
2077 		tmp = nm_os_malloc(csb_size);
2078 		if (!tmp)
2079 			return ENOMEM;
2080 		if (i == 0) {
2081 			/* Application --> kernel direction. */
2082 			err = copyin(csb_start[i], tmp, csb_size);
2083 		} else {
2084 			/* Kernel --> application direction. */
2085 			memset(tmp, 0, csb_size);
2086 			err = copyout(tmp, csb_start[i], csb_size);
2087 		}
2088 		nm_os_free(tmp);
2089 		if (err) {
2090 			nm_prerr("Invalid CSB address");
2091 			return err;
2092 		}
2093 	}
2094 
2095 	priv->np_csb_atok_base = csb_atok_base;
2096 	priv->np_csb_ktoa_base = csb_ktoa_base;
2097 
2098 	/* Initialize the CSB. */
2099 	for_rx_tx(t) {
2100 		for (i = 0; i < num_rings[t]; i++) {
2101 			struct netmap_kring *kring =
2102 				NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2103 			struct nm_csb_atok *csb_atok = csb_atok_base + i;
2104 			struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2105 
2106 			if (t == NR_RX) {
2107 				csb_atok += num_rings[NR_TX];
2108 				csb_ktoa += num_rings[NR_TX];
2109 			}
2110 
2111 			CSB_WRITE(csb_atok, head, kring->rhead);
2112 			CSB_WRITE(csb_atok, cur, kring->rcur);
2113 			CSB_WRITE(csb_atok, appl_need_kick, 1);
2114 			CSB_WRITE(csb_atok, sync_flags, 1);
2115 			CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2116 			CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2117 			CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2118 
2119 			nm_prinf("csb_init for kring %s: head %u, cur %u, "
2120 				"hwcur %u, hwtail %u", kring->name,
2121 				kring->rhead, kring->rcur, kring->nr_hwcur,
2122 				kring->nr_hwtail);
2123 		}
2124 	}
2125 
2126 	return 0;
2127 }
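
/*
 * Layout example (hypothetical ring counts): if the file descriptor is
 * bound to TX rings [0, 2) and RX rings [0, 2), the loop above initializes
 * csb_atok_base[0..1] and csb_ktoa_base[0..1] for the two TX krings and
 * entries 2..3 for the two RX krings, i.e. all TX entries first, then all
 * RX entries, in kring order.
 */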
2128 
2129 /* Ensure that the netmap adapter can support the given MTU.
2130  * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2131  */
2132 int
2133 netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
2134 	unsigned nbs = NETMAP_BUF_SIZE(na);
2135 
2136 	if (mtu <= na->rx_buf_maxsize) {
2137 		/* The MTU fits a single NIC slot. We only
2138 		 * need to check that netmap buffers are
2139 		 * large enough to hold an MTU. NS_MOREFRAG
2140 		 * cannot be used in this case. */
2141 		if (nbs < mtu) {
2142 			nm_prerr("error: netmap buf size (%u) "
2143 				 "< device MTU (%u)", nbs, mtu);
2144 			return EINVAL;
2145 		}
2146 	} else {
2147 		/* More NIC slots may be needed to receive
2148 		 * or transmit a single packet. Check that
2149 		 * the adapter supports NS_MOREFRAG and that
2150 		 * netmap buffers are large enough to hold
2151 		 * the maximum per-slot size. */
2152 		if (!(na->na_flags & NAF_MOREFRAG)) {
2153 			nm_prerr("error: large MTU (%d) needed "
2154 				 "but %s does not support "
2155 				 "NS_MOREFRAG", mtu,
2156 				 na->ifp->if_xname);
2157 			return EINVAL;
2158 		} else if (nbs < na->rx_buf_maxsize) {
2159 			nm_prerr("error: using NS_MOREFRAG on "
2160 				 "%s requires netmap buf size "
2161 				 ">= %u", na->ifp->if_xname,
2162 				 na->rx_buf_maxsize);
2163 			return EINVAL;
2164 		} else {
2165 			nm_prinf("info: netmap application on "
2166 				 "%s needs to support "
2167 				 "NS_MOREFRAG "
2168 				 "(MTU=%u,netmap_buf_size=%u)",
2169 				 na->ifp->if_xname, mtu, nbs);
2170 		}
2171 	}
2172 	return 0;
2173 }
2174 
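/*
 * Example (hypothetical numbers): with rx_buf_maxsize = 2048 and
 * netmap_buf_size = 2048, an MTU of 1500 is accepted as is, while an MTU
 * of 9000 is accepted only if the adapter advertises NAF_MOREFRAG, so
 * that a single packet may span several slots chained via NS_MOREFRAG.
 */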
2175 
2176 /*
2177  * possibly move the interface to netmap-mode.
2178  * On success it returns a pointer to the netmap_if, otherwise NULL.
2179  * This must be called with NMG_LOCK held.
2180  *
2181  * The following na callbacks are called in the process:
2182  *
2183  * na->nm_config()			[by netmap_update_config]
2184  * (get current number and size of rings)
2185  *
2186  *  	We have a generic one for linux (netmap_linux_config).
2187  *  	The bwrap has to override this, since it has to forward
2188  *  	the request to the wrapped adapter (netmap_bwrap_config).
2189  *
2190  *
2191  * na->nm_krings_create()
2192  * (create and init the krings array)
2193  *
2194  * 	One of the following:
2195  *
2196  *	* netmap_hw_krings_create, 			(hw ports)
2197  *		creates the standard layout for the krings
2198  * 		and adds the mbq (used for the host rings).
2199  *
2200  * 	* netmap_vp_krings_create			(VALE ports)
2201  * 		add leases and scratchpads
2202  *
2203  * 	* netmap_pipe_krings_create			(pipes)
2204  * 		create the krings and rings of both ends and
2205  * 		cross-link them
2206  *
2207  *      * netmap_monitor_krings_create 			(monitors)
2208  *      	avoid allocating the mbq
2209  *
2210  *      * netmap_bwrap_krings_create			(bwraps)
2211  *      	create the bwrap krings array,
2212  *      	the krings array of the wrapped adapter, and
2213  *      	(if needed) the fake array for the host adapter
2214  *
2215  * na->nm_register(, 1)
2216  * (put the adapter in netmap mode)
2217  *
2218  * 	This may be one of the following:
2219  *
2220  * 	* netmap_hw_reg				        (hw ports)
2221  * 		checks that the ifp is still there, then calls
2222  * 		the hardware specific callback;
2223  *
2224  * 	* netmap_vp_reg					(VALE ports)
2225  *		If the port is connected to a bridge,
2226  *		set the NAF_NETMAP_ON flag under the
2227  *		bridge write lock.
2228  *
2229  *	* netmap_pipe_reg				(pipes)
2230  *		inform the other pipe end that it is no
2231  *		longer responsible for the lifetime of this
2232  *		pipe end
2233  *
2234  *	* netmap_monitor_reg				(monitors)
2235  *		intercept the sync callbacks of the monitored
2236  *		rings
2237  *
2238  *	* netmap_bwrap_reg				(bwraps)
2239  *		cross-link the bwrap and hwna rings,
2240  *		forward the request to the hwna, override
2241  *		the hwna notify callback (to get the frames
2242  *		coming from outside go through the bridge).
2243  *
2244  *
2245  */
2246 int
2247 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2248 	uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags)
2249 {
2250 	struct netmap_if *nifp = NULL;
2251 	int error;
2252 
2253 	NMG_LOCK_ASSERT();
2254 	priv->np_na = na;     /* store the reference */
2255 	error = netmap_mem_finalize(na->nm_mem, na);
2256 	if (error)
2257 		goto err;
2258 
2259 	if (na->active_fds == 0) {
2260 
2261 		/* cache the allocator info in the na */
2262 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2263 		if (error)
2264 			goto err_drop_mem;
2265 		ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2266 					    na->na_lut.objsize);
2267 
2268 		/* ring configuration may have changed, fetch from the card */
2269 		netmap_update_config(na);
2270 	}
2271 
2272 	/* compute the range of tx and rx rings to monitor */
2273 	error = netmap_set_ringid(priv, nr_mode, nr_ringid, nr_flags);
2274 	if (error)
2275 		goto err_put_lut;
2276 
2277 	if (na->active_fds == 0) {
2278 		/*
2279 		 * If this is the first registration of the adapter,
2280 		 * perform sanity checks and create the in-kernel view
2281 		 * of the netmap rings (the netmap krings).
2282 		 */
2283 		if (na->ifp && nm_priv_rx_enabled(priv)) {
2284 			/* This netmap adapter is attached to an ifnet. */
2285 			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2286 
2287 			ND("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2288 				na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2289 
2290 			if (na->rx_buf_maxsize == 0) {
2291 				nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2292 				error = EIO;
2293 				goto err_drop_mem;
2294 			}
2295 
2296 			error = netmap_buf_size_validate(na, mtu);
2297 			if (error)
2298 				goto err_drop_mem;
2299 		}
2300 
2301 		/*
2302 		 * Depending on the adapter, this may also create
2303 		 * the netmap rings themselves
2304 		 */
2305 		error = na->nm_krings_create(na);
2306 		if (error)
2307 			goto err_put_lut;
2308 
2309 	}
2310 
2311 	/* now the krings must exist and we can check whether some
2312 	 * previous bind has exclusive ownership on them, and set
2313 	 * nr_pending_mode
2314 	 */
2315 	error = netmap_krings_get(priv);
2316 	if (error)
2317 		goto err_del_krings;
2318 
2319 	/* create all needed missing netmap rings */
2320 	error = netmap_mem_rings_create(na);
2321 	if (error)
2322 		goto err_rel_excl;
2323 
2324 	/* in all cases, create a new netmap if */
2325 	nifp = netmap_mem_if_new(na, priv);
2326 	if (nifp == NULL) {
2327 		error = ENOMEM;
2328 		goto err_rel_excl;
2329 	}
2330 
2331 	if (nm_kring_pending(priv)) {
2332 		/* Some kring is switching mode, tell the adapter to
2333 		 * react on this. */
2334 		error = na->nm_register(na, 1);
2335 		if (error)
2336 			goto err_del_if;
2337 	}
2338 
2339 	/* Commit the reference. */
2340 	na->active_fds++;
2341 
2342 	/*
2343 	 * advertise that the interface is ready by setting np_nifp.
2344 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2345 	 * check for priv->np_nifp != NULL without locking
2346 	 */
2347 	mb(); /* make sure previous writes are visible to all CPUs */
2348 	priv->np_nifp = nifp;
2349 
2350 	return 0;
2351 
2352 err_del_if:
2353 	netmap_mem_if_delete(na, nifp);
2354 err_rel_excl:
2355 	netmap_krings_put(priv);
2356 	netmap_mem_rings_delete(na);
2357 err_del_krings:
2358 	if (na->active_fds == 0)
2359 		na->nm_krings_delete(na);
2360 err_put_lut:
2361 	if (na->active_fds == 0)
2362 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2363 err_drop_mem:
2364 	netmap_mem_drop(na);
2365 err:
2366 	priv->np_na = NULL;
2367 	return error;
2368 }
2369 
2370 
2371 /*
2372  * update kring and ring at the end of rxsync/txsync.
2373  */
2374 static inline void
2375 nm_sync_finalize(struct netmap_kring *kring)
2376 {
2377 	/*
2378 	 * Update ring tail to what the kernel knows
2379 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2380 	 * if no carrier.
2381 	 */
2382 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2383 
2384 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2385 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2386 		kring->rhead, kring->rcur, kring->rtail);
2387 }
2388 
2389 /* set ring timestamp */
2390 static inline void
2391 ring_timestamp_set(struct netmap_ring *ring)
2392 {
2393 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2394 		microtime(&ring->ts);
2395 	}
2396 }
2397 
2398 static int nmreq_copyin(struct nmreq_header *, int);
2399 static int nmreq_copyout(struct nmreq_header *, int);
2400 static int nmreq_checkoptions(struct nmreq_header *);
2401 
2402 /*
2403  * ioctl(2) support for the "netmap" device.
2404  *
2405  * The following commands are accepted:
2406  * - NIOCCTRL		device control API
2407  * - NIOCTXSYNC		sync TX rings
2408  * - NIOCRXSYNC		sync RX rings
2409  * - SIOCGIFADDR	just for convenience
2410  * - NIOCGINFO		deprecated (legacy API)
2411  * - NIOCREGIF		deprecated (legacy API)
2412  *
2413  * Return 0 on success, errno otherwise.
2414  */
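
/*
 * Illustrative userspace sketch (assumptions: interface name passed by the
 * caller, direct use of ioctl(2) without libnetmap): a minimal NIOCCTRL
 * request that registers all hw rings of a port and is then dispatched by
 * netmap_ioctl() below.
 */
#if 0	/* example only, never compiled with the kernel */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/netmap.h>

static int
example_register(const char *ifname)	/* hypothetical helper */
{
	struct nmreq_header hdr;
	struct nmreq_register req;
	int fd;

	memset(&hdr, 0, sizeof(hdr));
	memset(&req, 0, sizeof(req));
	hdr.nr_version = NETMAP_API;
	hdr.nr_reqtype = NETMAP_REQ_REGISTER;
	strlcpy(hdr.nr_name, ifname, sizeof(hdr.nr_name));
	hdr.nr_body = (uintptr_t)&req;	/* copied to kernel space by nmreq_copyin() */
	req.nr_mode = NR_REG_ALL_NIC;

	fd = open("/dev/netmap", O_RDWR);
	if (fd < 0)
		return -1;
	return ioctl(fd, NIOCCTRL, &hdr);
}
#endif
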
2415 int
2416 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2417 		struct thread *td, int nr_body_is_user)
2418 {
2419 	struct mbq q;	/* packets from RX hw queues to host stack */
2420 	struct netmap_adapter *na = NULL;
2421 	struct netmap_mem_d *nmd = NULL;
2422 	struct ifnet *ifp = NULL;
2423 	int error = 0;
2424 	u_int i, qfirst, qlast;
2425 	struct netmap_kring **krings;
2426 	int sync_flags;
2427 	enum txrx t;
2428 
2429 	switch (cmd) {
2430 	case NIOCCTRL: {
2431 		struct nmreq_header *hdr = (struct nmreq_header *)data;
2432 
2433 		if (hdr->nr_version < NETMAP_MIN_API ||
2434 		    hdr->nr_version > NETMAP_MAX_API) {
2435 			nm_prerr("API mismatch: got %d need %d",
2436 				hdr->nr_version, NETMAP_API);
2437 			return EINVAL;
2438 		}
2439 
2440 		/* Make a kernel-space copy of the user-space nr_body.
2441 		 * For convenience, the nr_body pointer and the pointers
2442 		 * in the options list will be replaced with their
2443 		 * kernel-space counterparts. The original pointers are
2444 		 * saved internally and later restored by nmreq_copyout().
2445 		 */
2446 		error = nmreq_copyin(hdr, nr_body_is_user);
2447 		if (error) {
2448 			return error;
2449 		}
2450 
2451 		/* Sanitize hdr->nr_name. */
2452 		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2453 
2454 		switch (hdr->nr_reqtype) {
2455 		case NETMAP_REQ_REGISTER: {
2456 			struct nmreq_register *req =
2457 				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2458 			struct netmap_if *nifp;
2459 
2460 			/* Protect access to priv from concurrent requests. */
2461 			NMG_LOCK();
2462 			do {
2463 				struct nmreq_option *opt;
2464 				u_int memflags;
2465 
2466 				if (priv->np_nifp != NULL) {	/* thread already registered */
2467 					error = EBUSY;
2468 					break;
2469 				}
2470 
2471 #ifdef WITH_EXTMEM
2472 				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2473 						NETMAP_REQ_OPT_EXTMEM);
2474 				if (opt != NULL) {
2475 					struct nmreq_opt_extmem *e =
2476 						(struct nmreq_opt_extmem *)opt;
2477 
2478 					error = nmreq_checkduplicate(opt);
2479 					if (error) {
2480 						opt->nro_status = error;
2481 						break;
2482 					}
2483 					nmd = netmap_mem_ext_create(e->nro_usrptr,
2484 							&e->nro_info, &error);
2485 					opt->nro_status = error;
2486 					if (nmd == NULL)
2487 						break;
2488 				}
2489 #endif /* WITH_EXTMEM */
2490 
2491 				if (nmd == NULL && req->nr_mem_id) {
2492 					/* find the allocator and get a reference */
2493 					nmd = netmap_mem_find(req->nr_mem_id);
2494 					if (nmd == NULL) {
2495 						if (netmap_verbose) {
2496 							nm_prerr("%s: failed to find mem_id %u",
2497 									hdr->nr_name, req->nr_mem_id);
2498 						}
2499 						error = EINVAL;
2500 						break;
2501 					}
2502 				}
2503 				/* find the interface and a reference */
2504 				error = netmap_get_na(hdr, &na, &ifp, nmd,
2505 						      1 /* create */); /* keep reference */
2506 				if (error)
2507 					break;
2508 				if (NETMAP_OWNED_BY_KERN(na)) {
2509 					error = EBUSY;
2510 					break;
2511 				}
2512 
2513 				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2514 					nm_prerr("virt_hdr_len=%d, but application does "
2515 						"not accept it", na->virt_hdr_len);
2516 					error = EIO;
2517 					break;
2518 				}
2519 
2520 				error = netmap_do_regif(priv, na, req->nr_mode,
2521 							req->nr_ringid, req->nr_flags);
2522 				if (error) {    /* reg. failed, release priv and ref */
2523 					break;
2524 				}
2525 
2526 				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2527 							NETMAP_REQ_OPT_CSB);
2528 				if (opt != NULL) {
2529 					struct nmreq_opt_csb *csbo =
2530 						(struct nmreq_opt_csb *)opt;
2531 					error = nmreq_checkduplicate(opt);
2532 					if (!error) {
2533 						error = netmap_csb_validate(priv, csbo);
2534 					}
2535 					opt->nro_status = error;
2536 					if (error) {
2537 						netmap_do_unregif(priv);
2538 						break;
2539 					}
2540 				}
2541 
2542 				nifp = priv->np_nifp;
2543 
2544 				/* return the offset of the netmap_if object */
2545 				req->nr_rx_rings = na->num_rx_rings;
2546 				req->nr_tx_rings = na->num_tx_rings;
2547 				req->nr_rx_slots = na->num_rx_desc;
2548 				req->nr_tx_slots = na->num_tx_desc;
2549 				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2550 					&req->nr_mem_id);
2551 				if (error) {
2552 					netmap_do_unregif(priv);
2553 					break;
2554 				}
2555 				if (memflags & NETMAP_MEM_PRIVATE) {
2556 					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2557 				}
2558 				for_rx_tx(t) {
2559 					priv->np_si[t] = nm_si_user(priv, t) ?
2560 						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2561 				}
2562 
2563 				if (req->nr_extra_bufs) {
2564 					if (netmap_verbose)
2565 						nm_prinf("requested %d extra buffers",
2566 							req->nr_extra_bufs);
2567 					req->nr_extra_bufs = netmap_extra_alloc(na,
2568 						&nifp->ni_bufs_head, req->nr_extra_bufs);
2569 					if (netmap_verbose)
2570 						nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2571 				}
2572 				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2573 
2574 				error = nmreq_checkoptions(hdr);
2575 				if (error) {
2576 					netmap_do_unregif(priv);
2577 					break;
2578 				}
2579 
2580 				/* store ifp reference so that priv destructor may release it */
2581 				priv->np_ifp = ifp;
2582 			} while (0);
2583 			if (error) {
2584 				netmap_unget_na(na, ifp);
2585 			}
2586 			/* release the reference from netmap_mem_find() or
2587 			 * netmap_mem_ext_create()
2588 			 */
2589 			if (nmd)
2590 				netmap_mem_put(nmd);
2591 			NMG_UNLOCK();
2592 			break;
2593 		}
2594 
2595 		case NETMAP_REQ_PORT_INFO_GET: {
2596 			struct nmreq_port_info_get *req =
2597 				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2598 
2599 			NMG_LOCK();
2600 			do {
2601 				u_int memflags;
2602 
2603 				if (hdr->nr_name[0] != '\0') {
2604 					/* Build a nmreq_register out of the nmreq_port_info_get,
2605 					 * so that we can call netmap_get_na(). */
2606 					struct nmreq_register regreq;
2607 					bzero(&regreq, sizeof(regreq));
2608 					regreq.nr_mode = NR_REG_ALL_NIC;
2609 					regreq.nr_tx_slots = req->nr_tx_slots;
2610 					regreq.nr_rx_slots = req->nr_rx_slots;
2611 					regreq.nr_tx_rings = req->nr_tx_rings;
2612 					regreq.nr_rx_rings = req->nr_rx_rings;
2613 					regreq.nr_mem_id = req->nr_mem_id;
2614 
2615 					/* get a refcount */
2616 					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2617 					hdr->nr_body = (uintptr_t)&regreq;
2618 					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2619 					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2620 					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2621 					if (error) {
2622 						na = NULL;
2623 						ifp = NULL;
2624 						break;
2625 					}
2626 					nmd = na->nm_mem; /* get memory allocator */
2627 				} else {
2628 					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2629 					if (nmd == NULL) {
2630 						if (netmap_verbose)
2631 							nm_prerr("%s: failed to find mem_id %u",
2632 									hdr->nr_name,
2633 									req->nr_mem_id ? req->nr_mem_id : 1);
2634 						error = EINVAL;
2635 						break;
2636 					}
2637 				}
2638 
2639 				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2640 					&req->nr_mem_id);
2641 				if (error)
2642 					break;
2643 				if (na == NULL) /* only memory info */
2644 					break;
2645 				netmap_update_config(na);
2646 				req->nr_rx_rings = na->num_rx_rings;
2647 				req->nr_tx_rings = na->num_tx_rings;
2648 				req->nr_rx_slots = na->num_rx_desc;
2649 				req->nr_tx_slots = na->num_tx_desc;
2650 			} while (0);
2651 			netmap_unget_na(na, ifp);
2652 			NMG_UNLOCK();
2653 			break;
2654 		}
2655 #ifdef WITH_VALE
2656 		case NETMAP_REQ_VALE_ATTACH: {
2657 			error = netmap_vale_attach(hdr, NULL /* userspace request */);
2658 			break;
2659 		}
2660 
2661 		case NETMAP_REQ_VALE_DETACH: {
2662 			error = netmap_vale_detach(hdr, NULL /* userspace request */);
2663 			break;
2664 		}
2665 
2666 		case NETMAP_REQ_VALE_LIST: {
2667 			error = netmap_vale_list(hdr);
2668 			break;
2669 		}
2670 
2671 		case NETMAP_REQ_PORT_HDR_SET: {
2672 			struct nmreq_port_hdr *req =
2673 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2674 			/* Build a nmreq_register out of the nmreq_port_hdr,
2675 			 * so that we can call netmap_get_vale_na(). */
2676 			struct nmreq_register regreq;
2677 			bzero(&regreq, sizeof(regreq));
2678 			regreq.nr_mode = NR_REG_ALL_NIC;
2679 
2680 			/* For now we only support virtio-net headers, and only for
2681 			 * VALE ports, but this may change in the future. Valid lengths
2682 			 * for the virtio-net header are 0 (no header), 10 and 12. */
2683 			if (req->nr_hdr_len != 0 &&
2684 				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2685 					req->nr_hdr_len != 12) {
2686 				if (netmap_verbose)
2687 					nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
2688 				error = EINVAL;
2689 				break;
2690 			}
2691 			NMG_LOCK();
2692 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2693 			hdr->nr_body = (uintptr_t)&regreq;
2694 			error = netmap_get_vale_na(hdr, &na, NULL, 0);
2695 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2696 			hdr->nr_body = (uintptr_t)req;
2697 			if (na && !error) {
2698 				struct netmap_vp_adapter *vpna =
2699 					(struct netmap_vp_adapter *)na;
2700 				na->virt_hdr_len = req->nr_hdr_len;
2701 				if (na->virt_hdr_len) {
2702 					vpna->mfs = NETMAP_BUF_SIZE(na);
2703 				}
2704 				if (netmap_verbose)
2705 					nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2706 				netmap_adapter_put(na);
2707 			} else if (!na) {
2708 				error = ENXIO;
2709 			}
2710 			NMG_UNLOCK();
2711 			break;
2712 		}
2713 
2714 		case NETMAP_REQ_PORT_HDR_GET: {
2715 			/* Get vnet-header length for this netmap port */
2716 			struct nmreq_port_hdr *req =
2717 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2718 			/* Build a nmreq_register out of the nmreq_port_hdr,
2719 			 * so that we can call netmap_get_na(). */
2720 			struct nmreq_register regreq;
2721 			struct ifnet *ifp;
2722 
2723 			bzero(&regreq, sizeof(regreq));
2724 			regreq.nr_mode = NR_REG_ALL_NIC;
2725 			NMG_LOCK();
2726 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2727 			hdr->nr_body = (uintptr_t)&regreq;
2728 			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2729 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2730 			hdr->nr_body = (uintptr_t)req;
2731 			if (na && !error) {
2732 				req->nr_hdr_len = na->virt_hdr_len;
2733 			}
2734 			netmap_unget_na(na, ifp);
2735 			NMG_UNLOCK();
2736 			break;
2737 		}
2738 
2739 		case NETMAP_REQ_VALE_NEWIF: {
2740 			error = nm_vi_create(hdr);
2741 			break;
2742 		}
2743 
2744 		case NETMAP_REQ_VALE_DELIF: {
2745 			error = nm_vi_destroy(hdr->nr_name);
2746 			break;
2747 		}
2748 
2749 		case NETMAP_REQ_VALE_POLLING_ENABLE:
2750 		case NETMAP_REQ_VALE_POLLING_DISABLE: {
2751 			error = nm_bdg_polling(hdr);
2752 			break;
2753 		}
2754 #endif  /* WITH_VALE */
2755 		case NETMAP_REQ_POOLS_INFO_GET: {
2756 			/* Get information from the memory allocator used for
2757 			 * hdr->nr_name. */
2758 			struct nmreq_pools_info *req =
2759 				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2760 			NMG_LOCK();
2761 			do {
2762 				/* Build a nmreq_register out of the nmreq_pools_info,
2763 				 * so that we can call netmap_get_na(). */
2764 				struct nmreq_register regreq;
2765 				bzero(&regreq, sizeof(regreq));
2766 				regreq.nr_mem_id = req->nr_mem_id;
2767 				regreq.nr_mode = NR_REG_ALL_NIC;
2768 
2769 				hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2770 				hdr->nr_body = (uintptr_t)&regreq;
2771 				error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2772 				hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
2773 				hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2774 				if (error) {
2775 					na = NULL;
2776 					ifp = NULL;
2777 					break;
2778 				}
2779 				nmd = na->nm_mem; /* grab the memory allocator */
2780 				if (nmd == NULL) {
2781 					error = EINVAL;
2782 					break;
2783 				}
2784 
2785 				/* Finalize the memory allocator, get the pools
2786 				 * information and release the allocator. */
2787 				error = netmap_mem_finalize(nmd, na);
2788 				if (error) {
2789 					break;
2790 				}
2791 				error = netmap_mem_pools_info_get(req, nmd);
2792 				netmap_mem_drop(na);
2793 			} while (0);
2794 			netmap_unget_na(na, ifp);
2795 			NMG_UNLOCK();
2796 			break;
2797 		}
2798 
2799 		case NETMAP_REQ_CSB_ENABLE: {
2800 			struct nmreq_option *opt;
2801 
2802 			opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2803 						NETMAP_REQ_OPT_CSB);
2804 			if (opt == NULL) {
2805 				error = EINVAL;
2806 			} else {
2807 				struct nmreq_opt_csb *csbo =
2808 					(struct nmreq_opt_csb *)opt;
2809 				error = nmreq_checkduplicate(opt);
2810 				if (!error) {
2811 					NMG_LOCK();
2812 					error = netmap_csb_validate(priv, csbo);
2813 					NMG_UNLOCK();
2814 				}
2815 				opt->nro_status = error;
2816 			}
2817 			break;
2818 		}
2819 
2820 		case NETMAP_REQ_SYNC_KLOOP_START: {
2821 			error = netmap_sync_kloop(priv, hdr);
2822 			break;
2823 		}
2824 
2825 		case NETMAP_REQ_SYNC_KLOOP_STOP: {
2826 			error = netmap_sync_kloop_stop(priv);
2827 			break;
2828 		}
2829 
2830 		default: {
2831 			error = EINVAL;
2832 			break;
2833 		}
2834 		}
2835 		/* Write back request body to userspace and reset the
2836 		 * user-space pointer. */
2837 		error = nmreq_copyout(hdr, error);
2838 		break;
2839 	}
2840 
2841 	case NIOCTXSYNC:
2842 	case NIOCRXSYNC: {
2843 		if (unlikely(priv->np_nifp == NULL)) {
2844 			error = ENXIO;
2845 			break;
2846 		}
2847 		mb(); /* make sure following reads are not from cache */
2848 
2849 		if (unlikely(priv->np_csb_atok_base)) {
2850 			nm_prerr("Invalid sync in CSB mode");
2851 			error = EBUSY;
2852 			break;
2853 		}
2854 
2855 		na = priv->np_na;      /* we have a reference */
2856 
2857 		mbq_init(&q);
2858 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2859 		krings = NMR(na, t);
2860 		qfirst = priv->np_qfirst[t];
2861 		qlast = priv->np_qlast[t];
2862 		sync_flags = priv->np_sync_flags;
2863 
2864 		for (i = qfirst; i < qlast; i++) {
2865 			struct netmap_kring *kring = krings[i];
2866 			struct netmap_ring *ring = kring->ring;
2867 
2868 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2869 				error = (error ? EIO : 0);
2870 				continue;
2871 			}
2872 
2873 			if (cmd == NIOCTXSYNC) {
2874 				if (netmap_debug & NM_DEBUG_TXSYNC)
2875 					nm_prinf("pre txsync ring %d cur %d hwcur %d",
2876 					    i, ring->cur,
2877 					    kring->nr_hwcur);
2878 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2879 					netmap_ring_reinit(kring);
2880 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2881 					nm_sync_finalize(kring);
2882 				}
2883 				if (netmap_debug & NM_DEBUG_TXSYNC)
2884 					nm_prinf("post txsync ring %d cur %d hwcur %d",
2885 					    i, ring->cur,
2886 					    kring->nr_hwcur);
2887 			} else {
2888 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2889 					netmap_ring_reinit(kring);
2890 				}
2891 				if (nm_may_forward_up(kring)) {
2892 					/* transparent forwarding, see netmap_poll() */
2893 					netmap_grab_packets(kring, &q, netmap_fwd);
2894 				}
2895 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2896 					nm_sync_finalize(kring);
2897 				}
2898 				ring_timestamp_set(ring);
2899 			}
2900 			nm_kr_put(kring);
2901 		}
2902 
2903 		if (mbq_peek(&q)) {
2904 			netmap_send_up(na->ifp, &q);
2905 		}
2906 
2907 		break;
2908 	}
2909 
2910 	default: {
2911 		return netmap_ioctl_legacy(priv, cmd, data, td);
2912 		break;
2913 	}
2914 	}
2915 
2916 	return (error);
2917 }
2918 
2919 size_t
2920 nmreq_size_by_type(uint16_t nr_reqtype)
2921 {
2922 	switch (nr_reqtype) {
2923 	case NETMAP_REQ_REGISTER:
2924 		return sizeof(struct nmreq_register);
2925 	case NETMAP_REQ_PORT_INFO_GET:
2926 		return sizeof(struct nmreq_port_info_get);
2927 	case NETMAP_REQ_VALE_ATTACH:
2928 		return sizeof(struct nmreq_vale_attach);
2929 	case NETMAP_REQ_VALE_DETACH:
2930 		return sizeof(struct nmreq_vale_detach);
2931 	case NETMAP_REQ_VALE_LIST:
2932 		return sizeof(struct nmreq_vale_list);
2933 	case NETMAP_REQ_PORT_HDR_SET:
2934 	case NETMAP_REQ_PORT_HDR_GET:
2935 		return sizeof(struct nmreq_port_hdr);
2936 	case NETMAP_REQ_VALE_NEWIF:
2937 		return sizeof(struct nmreq_vale_newif);
2938 	case NETMAP_REQ_VALE_DELIF:
2939 	case NETMAP_REQ_SYNC_KLOOP_STOP:
2940 	case NETMAP_REQ_CSB_ENABLE:
2941 		return 0;
2942 	case NETMAP_REQ_VALE_POLLING_ENABLE:
2943 	case NETMAP_REQ_VALE_POLLING_DISABLE:
2944 		return sizeof(struct nmreq_vale_polling);
2945 	case NETMAP_REQ_POOLS_INFO_GET:
2946 		return sizeof(struct nmreq_pools_info);
2947 	case NETMAP_REQ_SYNC_KLOOP_START:
2948 		return sizeof(struct nmreq_sync_kloop_start);
2949 	}
2950 	return 0;
2951 }
2952 
2953 static size_t
2954 nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
2955 {
2956 	size_t rv = sizeof(struct nmreq_option);
2957 #ifdef NETMAP_REQ_OPT_DEBUG
2958 	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
2959 		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
2960 #endif /* NETMAP_REQ_OPT_DEBUG */
2961 	switch (nro_reqtype) {
2962 #ifdef WITH_EXTMEM
2963 	case NETMAP_REQ_OPT_EXTMEM:
2964 		rv = sizeof(struct nmreq_opt_extmem);
2965 		break;
2966 #endif /* WITH_EXTMEM */
2967 	case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
2968 		if (nro_size >= rv)
2969 			rv = nro_size;
2970 		break;
2971 	case NETMAP_REQ_OPT_CSB:
2972 		rv = sizeof(struct nmreq_opt_csb);
2973 		break;
2974 	}
2975 	/* subtract the common header */
2976 	return rv - sizeof(struct nmreq_option);
2977 }
2978 
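/*
 * nmreq_copyin() below bundles the request body and the whole option list
 * into a single kernel buffer, saving each original user pointer right
 * before the object it referred to, so that nmreq_copyout() can restore
 * them. Sketch of the resulting layout:
 *
 *   +----------+------------+------+----------+------------+----------+---
 *   | user     | user       | body | user     | option 1   | user     | ..
 *   | nr_body  | nr_options |      | nro_next | hdr + body | nro_next |
 *   +----------+------------+------+----------+------------+----------+---
 */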
2979 int
2980 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
2981 {
2982 	size_t rqsz, optsz, bufsz;
2983 	int error;
2984 	char *ker = NULL, *p;
2985 	struct nmreq_option **next, *src;
2986 	struct nmreq_option buf;
2987 	uint64_t *ptrs;
2988 
2989 	if (hdr->nr_reserved) {
2990 		if (netmap_verbose)
2991 			nm_prerr("nr_reserved must be zero");
2992 		return EINVAL;
2993 	}
2994 
2995 	if (!nr_body_is_user)
2996 		return 0;
2997 
2998 	hdr->nr_reserved = nr_body_is_user;
2999 
3000 	/* compute the total size of the buffer */
3001 	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
3002 	if (rqsz > NETMAP_REQ_MAXSIZE) {
3003 		error = EMSGSIZE;
3004 		goto out_err;
3005 	}
3006 	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
3007 		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3008 		/* Request body expected, but not found; or
3009 		 * request body found but unexpected. */
3010 		if (netmap_verbose)
3011 			nm_prerr("nr_body expected but not found, or vice versa");
3012 		error = EINVAL;
3013 		goto out_err;
3014 	}
3015 
3016 	bufsz = 2 * sizeof(void *) + rqsz;
3017 	optsz = 0;
3018 	for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
3019 	     src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
3020 	{
3021 		error = copyin(src, &buf, sizeof(*src));
3022 		if (error)
3023 			goto out_err;
3024 		optsz += sizeof(*src);
3025 		optsz += nmreq_opt_size_by_type(buf.nro_reqtype, buf.nro_size);
3026 		if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
3027 			error = EMSGSIZE;
3028 			goto out_err;
3029 		}
3030 		bufsz += optsz + sizeof(void *);
3031 	}
3032 
3033 	ker = nm_os_malloc(bufsz);
3034 	if (ker == NULL) {
3035 		error = ENOMEM;
3036 		goto out_err;
3037 	}
3038 	p = ker;
3039 
3040 	/* make a copy of the user pointers */
3041 	ptrs = (uint64_t*)p;
3042 	*ptrs++ = hdr->nr_body;
3043 	*ptrs++ = hdr->nr_options;
3044 	p = (char *)ptrs;
3045 
3046 	/* copy the body */
3047 	error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3048 	if (error)
3049 		goto out_restore;
3050 	/* overwrite the user pointer with the in-kernel one */
3051 	hdr->nr_body = (uintptr_t)p;
3052 	p += rqsz;
3053 
3054 	/* copy the options */
3055 	next = (struct nmreq_option **)&hdr->nr_options;
3056 	src = *next;
3057 	while (src) {
3058 		struct nmreq_option *opt;
3059 
3060 		/* copy the option header */
3061 		ptrs = (uint64_t *)p;
3062 		opt = (struct nmreq_option *)(ptrs + 1);
3063 		error = copyin(src, opt, sizeof(*src));
3064 		if (error)
3065 			goto out_restore;
3066 		/* make a copy of the user next pointer */
3067 		*ptrs = opt->nro_next;
3068 		/* overwrite the user pointer with the in-kernel one */
3069 		*next = opt;
3070 
3071 		/* initialize the option as not supported.
3072 		 * Recognized options will update this field.
3073 		 */
3074 		opt->nro_status = EOPNOTSUPP;
3075 
3076 		p = (char *)(opt + 1);
3077 
3078 		/* copy the option body */
3079 		optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3080 						opt->nro_size);
3081 		if (optsz) {
3082 			/* the option body follows the option header */
3083 			error = copyin(src + 1, p, optsz);
3084 			if (error)
3085 				goto out_restore;
3086 			p += optsz;
3087 		}
3088 
3089 		/* move to next option */
3090 		next = (struct nmreq_option **)&opt->nro_next;
3091 		src = *next;
3092 	}
3093 	return 0;
3094 
3095 out_restore:
3096 	ptrs = (uint64_t *)ker;
3097 	hdr->nr_body = *ptrs++;
3098 	hdr->nr_options = *ptrs++;
3099 	hdr->nr_reserved = 0;
3100 	nm_os_free(ker);
3101 out_err:
3102 	return error;
3103 }
3104 
3105 static int
3106 nmreq_copyout(struct nmreq_header *hdr, int rerror)
3107 {
3108 	struct nmreq_option *src, *dst;
3109 	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3110 	uint64_t *ptrs;
3111 	size_t bodysz;
3112 	int error;
3113 
3114 	if (!hdr->nr_reserved)
3115 		return rerror;
3116 
3117 	/* restore the user pointers in the header */
3118 	ptrs = (uint64_t *)ker - 2;
3119 	bufstart = ptrs;
3120 	hdr->nr_body = *ptrs++;
3121 	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3122 	hdr->nr_options = *ptrs;
3123 
3124 	if (!rerror) {
3125 		/* copy the body */
3126 		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3127 		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3128 		if (error) {
3129 			rerror = error;
3130 			goto out;
3131 		}
3132 	}
3133 
3134 	/* copy the options */
3135 	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3136 	while (src) {
3137 		size_t optsz;
3138 		uint64_t next;
3139 
3140 		/* restore the user pointer */
3141 		next = src->nro_next;
3142 		ptrs = (uint64_t *)src - 1;
3143 		src->nro_next = *ptrs;
3144 
3145 		/* always copy the option header */
3146 		error = copyout(src, dst, sizeof(*src));
3147 		if (error) {
3148 			rerror = error;
3149 			goto out;
3150 		}
3151 
3152 		/* copy the option body only if there was no error */
3153 		if (!rerror && !src->nro_status) {
3154 			optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3155 							src->nro_size);
3156 			if (optsz) {
3157 				error = copyout(src + 1, dst + 1, optsz);
3158 				if (error) {
3159 					rerror = error;
3160 					goto out;
3161 				}
3162 			}
3163 		}
3164 		src = (struct nmreq_option *)(uintptr_t)next;
3165 		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3166 	}
3167 
3168 
3169 out:
3170 	hdr->nr_reserved = 0;
3171 	nm_os_free(bufstart);
3172 	return rerror;
3173 }
3174 
3175 struct nmreq_option *
3176 nmreq_findoption(struct nmreq_option *opt, uint16_t reqtype)
3177 {
3178 	for ( ; opt; opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3179 		if (opt->nro_reqtype == reqtype)
3180 			return opt;
3181 	return NULL;
3182 }
3183 
3184 int
3185 nmreq_checkduplicate(struct nmreq_option *opt) {
3186 	uint16_t type = opt->nro_reqtype;
3187 	int dup = 0;
3188 
3189 	while ((opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)opt->nro_next,
3190 			type))) {
3191 		dup++;
3192 		opt->nro_status = EINVAL;
3193 	}
3194 	return (dup ? EINVAL : 0);
3195 }
3196 
3197 static int
3198 nmreq_checkoptions(struct nmreq_header *hdr)
3199 {
3200 	struct nmreq_option *opt;
3201 	/* return error if there is still any option
3202 	 * marked as not supported
3203 	 */
3204 
3205 	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3206 	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3207 		if (opt->nro_status == EOPNOTSUPP)
3208 			return EOPNOTSUPP;
3209 
3210 	return 0;
3211 }
3212 
3213 /*
3214  * select(2) and poll(2) handlers for the "netmap" device.
3215  *
3216  * Can be called for one or more queues.
3217  * Return the event mask corresponding to ready events.
3218  * If there are no ready events (and 'sr' is not NULL), do a
3219  * selrecord on either individual selinfo or on the global one.
3220  * Device-dependent parts (locking and sync of tx/rx rings)
3221  * are done through callbacks.
3222  *
3223  * On Linux, the arguments are really pwait (the poll table), and 'td' is a
3224  * struct file *. The first one is remapped to pwait because selrecord() uses
3225  * the name as a hidden argument.
3226  */
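
/*
 * Illustrative userspace sketch (hypothetical helper, the fd is assumed to
 * be already registered via NIOCCTRL): waiting for received packets only
 * requires poll(2) with POLLIN on the netmap file descriptor.
 */
#if 0	/* example only, never compiled with the kernel */
#include <poll.h>

static int
example_wait_rx(int nm_fd)
{
	struct pollfd pfd = { .fd = nm_fd, .events = POLLIN };

	/* ends up in netmap_poll() below; returns when the bound RX rings
	 * have new packets or the one second timeout expires */
	return poll(&pfd, 1, 1000);
}
#endif
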
3227 int
3228 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3229 {
3230 	struct netmap_adapter *na;
3231 	struct netmap_kring *kring;
3232 	struct netmap_ring *ring;
3233 	u_int i, want[NR_TXRX], revents = 0;
3234 	NM_SELINFO_T *si[NR_TXRX];
3235 #define want_tx want[NR_TX]
3236 #define want_rx want[NR_RX]
3237 	struct mbq q;	/* packets from RX hw queues to host stack */
3238 
3239 	/*
3240 	 * In order to avoid nested locks, we need to "double check"
3241 	 * txsync and rxsync if we decide to do a selrecord().
3242 	 * retry_tx (and retry_rx, later) prevent looping forever.
3243 	 */
3244 	int retry_tx = 1, retry_rx = 1;
3245 
3246 	/* Transparent mode: send_down is 1 if we have found some
3247 	 * packets to forward (host RX ring --> NIC) during the rx
3248 	 * scan and we have not sent them down to the NIC yet.
3249 	 * Transparent mode requires to bind all rings to a single
3250 	 * file descriptor.
3251 	 */
3252 	int send_down = 0;
3253 	int sync_flags = priv->np_sync_flags;
3254 
3255 	mbq_init(&q);
3256 
3257 	if (unlikely(priv->np_nifp == NULL)) {
3258 		return POLLERR;
3259 	}
3260 	mb(); /* make sure following reads are not from cache */
3261 
3262 	na = priv->np_na;
3263 
3264 	if (unlikely(!nm_netmap_on(na)))
3265 		return POLLERR;
3266 
3267 	if (unlikely(priv->np_csb_atok_base)) {
3268 		nm_prerr("Invalid poll in CSB mode");
3269 		return POLLERR;
3270 	}
3271 
3272 	if (netmap_debug & NM_DEBUG_ON)
3273 		nm_prinf("device %s events 0x%x", na->name, events);
3274 	want_tx = events & (POLLOUT | POLLWRNORM);
3275 	want_rx = events & (POLLIN | POLLRDNORM);
3276 
3277 	/*
3278 	 * If the card has more than one queue AND the file descriptor is
3279 	 * bound to all of them, we sleep on the "global" selinfo, otherwise
3280 	 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3281 	 * per file descriptor).
3282 	 * The interrupt routine in the driver wakes one or the other
3283 	 * (or both) depending on which clients are active.
3284 	 *
3285 	 * rxsync() is only called if we run out of buffers on a POLLIN.
3286 	 * txsync() is called if we run out of buffers on POLLOUT, or
3287 	 * there are pending packets to send. The latter can be disabled
3288 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
3289 	 */
3290 	si[NR_RX] = nm_si_user(priv, NR_RX) ? &na->si[NR_RX] :
3291 				&na->rx_rings[priv->np_qfirst[NR_RX]]->si;
3292 	si[NR_TX] = nm_si_user(priv, NR_TX) ? &na->si[NR_TX] :
3293 				&na->tx_rings[priv->np_qfirst[NR_TX]]->si;
3294 
3295 #ifdef __FreeBSD__
3296 	/*
3297 	 * We start with a lock free round which is cheap if we have
3298 	 * slots available. If this fails, then lock and call the sync
3299 	 * routines. We can't do this on Linux, as the contract says
3300 	 * that we must call nm_os_selrecord() unconditionally.
3301 	 */
3302 	if (want_tx) {
3303 		const enum txrx t = NR_TX;
3304 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3305 			kring = NMR(na, t)[i];
3306 			if (kring->ring->cur != kring->ring->tail) {
3307 				/* Some unseen TX space is available, so
3308 				 * we don't need to run txsync. */
3309 				revents |= want[t];
3310 				want[t] = 0;
3311 				break;
3312 			}
3313 		}
3314 	}
3315 	if (want_rx) {
3316 		const enum txrx t = NR_RX;
3317 		int rxsync_needed = 0;
3318 
3319 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3320 			kring = NMR(na, t)[i];
3321 			if (kring->ring->cur == kring->ring->tail
3322 				|| kring->rhead != kring->ring->head) {
3323 				/* There are no unseen packets on this ring,
3324 				 * or there are some buffers to be returned
3325 				 * to the netmap port. We therefore go ahead
3326 				 * and run rxsync. */
3327 				rxsync_needed = 1;
3328 				break;
3329 			}
3330 		}
3331 		if (!rxsync_needed) {
3332 			revents |= want_rx;
3333 			want_rx = 0;
3334 		}
3335 	}
3336 #endif
3337 
3338 #ifdef linux
3339 	/* The selrecord must be unconditional on linux. */
3340 	nm_os_selrecord(sr, si[NR_RX]);
3341 	nm_os_selrecord(sr, si[NR_TX]);
3342 #endif /* linux */
3343 
3344 	/*
3345 	 * If we want to push packets out (priv->np_txpoll) or
3346 	 * want_tx is still set, we must issue txsync calls
3347 	 * (on all rings, to avoid that the tx rings stall).
3348 	 * Fortunately, normal tx mode has np_txpoll set.
3349 	 */
3350 	if (priv->np_txpoll || want_tx) {
3351 		/*
3352 		 * The first round checks if anyone is ready, if not
3353 		 * do a selrecord and another round to handle races.
3354 		 * want_tx goes to 0 if any space is found, and is
3355 		 * used to skip rings with no pending transmissions.
3356 		 */
3357 flush_tx:
3358 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3359 			int found = 0;
3360 
3361 			kring = na->tx_rings[i];
3362 			ring = kring->ring;
3363 
3364 			/*
3365 			 * Don't try to txsync this TX ring if we already found some
3366 			 * space in some of the TX rings (want_tx == 0) and there are no
3367 			 * TX slots in this ring that need to be flushed to the NIC
3368 			 * (head == hwcur).
3369 			 */
3370 			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3371 				continue;
3372 
3373 			if (nm_kr_tryget(kring, 1, &revents))
3374 				continue;
3375 
3376 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3377 				netmap_ring_reinit(kring);
3378 				revents |= POLLERR;
3379 			} else {
3380 				if (kring->nm_sync(kring, sync_flags))
3381 					revents |= POLLERR;
3382 				else
3383 					nm_sync_finalize(kring);
3384 			}
3385 
3386 			/*
3387 			 * If we found new slots, notify potential
3388 			 * listeners on the same ring.
3389 			 * Since we just did a txsync, look at the copies
3390 			 * of cur,tail in the kring.
3391 			 */
3392 			found = kring->rcur != kring->rtail;
3393 			nm_kr_put(kring);
3394 			if (found) { /* notify other listeners */
3395 				revents |= want_tx;
3396 				want_tx = 0;
3397 #ifndef linux
3398 				kring->nm_notify(kring, 0);
3399 #endif /* !linux */
3400 			}
3401 		}
3402 		/* if there were any packets to forward, we must have handled them by now */
3403 		send_down = 0;
3404 		if (want_tx && retry_tx && sr) {
3405 #ifndef linux
3406 			nm_os_selrecord(sr, si[NR_TX]);
3407 #endif /* !linux */
3408 			retry_tx = 0;
3409 			goto flush_tx;
3410 		}
3411 	}
3412 
3413 	/*
3414 	 * If want_rx is still set scan receive rings.
3415 	 * Do it on all rings because otherwise we starve.
3416 	 */
3417 	if (want_rx) {
3418 		/* two rounds here for race avoidance */
3419 do_retry_rx:
3420 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3421 			int found = 0;
3422 
3423 			kring = na->rx_rings[i];
3424 			ring = kring->ring;
3425 
3426 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3427 				continue;
3428 
3429 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3430 				netmap_ring_reinit(kring);
3431 				revents |= POLLERR;
3432 			}
3433 			/* now we can use kring->rcur, rtail */
3434 
3435 			/*
3436 			 * transparent mode support: collect packets from
3437 			 * hw rxring(s) that have been released by the user
3438 			 */
3439 			if (nm_may_forward_up(kring)) {
3440 				netmap_grab_packets(kring, &q, netmap_fwd);
3441 			}
3442 
3443 			/* Clear the NR_FORWARD flag anyway, it may be set by
3444 			 * the nm_sync() below only for the host RX ring (see
3445 			 * netmap_rxsync_from_host()). */
3446 			kring->nr_kflags &= ~NR_FORWARD;
3447 			if (kring->nm_sync(kring, sync_flags))
3448 				revents |= POLLERR;
3449 			else
3450 				nm_sync_finalize(kring);
3451 			send_down |= (kring->nr_kflags & NR_FORWARD);
3452 			ring_timestamp_set(ring);
3453 			found = kring->rcur != kring->rtail;
3454 			nm_kr_put(kring);
3455 			if (found) {
3456 				revents |= want_rx;
3457 				retry_rx = 0;
3458 #ifndef linux
3459 				kring->nm_notify(kring, 0);
3460 #endif /* !linux */
3461 			}
3462 		}
3463 
3464 #ifndef linux
3465 		if (retry_rx && sr) {
3466 			nm_os_selrecord(sr, si[NR_RX]);
3467 		}
3468 #endif /* !linux */
3469 		if (send_down || retry_rx) {
3470 			retry_rx = 0;
3471 			if (send_down)
3472 				goto flush_tx; /* and retry_rx */
3473 			else
3474 				goto do_retry_rx;
3475 		}
3476 	}
3477 
3478 	/*
3479 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3480 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3481 	 * to the host stack.
3482 	 */
3483 
3484 	if (mbq_peek(&q)) {
3485 		netmap_send_up(na->ifp, &q);
3486 	}
3487 
3488 	return (revents);
3489 #undef want_tx
3490 #undef want_rx
3491 }
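
/*
 * Usage sketch (userspace, not part of this module): a minimal RX loop that
 * waits for packets with poll(), following the semantics described in the
 * comments above. The interface name "netmap:em0" and the process_pkt()
 * routine are placeholders; error handling is omitted.
 *
 *	#include <poll.h>
 *	#define NETMAP_WITH_LIBS
 *	#include <net/netmap_user.h>
 *
 *	struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
 *	struct pollfd pfd = { .fd = NETMAP_FD(d), .events = POLLIN };
 *
 *	for (;;) {
 *		struct nm_pkthdr h;
 *		unsigned char *buf;
 *
 *		poll(&pfd, 1, 2500);	// also flushes pending TX unless
 *					// NETMAP_NO_TX_POLL was requested
 *		while ((buf = nm_nextpkt(d, &h)) != NULL)
 *			process_pkt(buf, h.len);
 *	}
 *	nm_close(d);
 */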
3492 
3493 int
3494 nma_intr_enable(struct netmap_adapter *na, int onoff)
3495 {
3496 	bool changed = false;
3497 	enum txrx t;
3498 	int i;
3499 
3500 	for_rx_tx(t) {
3501 		for (i = 0; i < nma_get_nrings(na, t); i++) {
3502 			struct netmap_kring *kring = NMR(na, t)[i];
3503 			int on = !(kring->nr_kflags & NKR_NOINTR);
3504 
3505 			if (!!onoff != !!on) {
3506 				changed = true;
3507 			}
3508 			if (onoff) {
3509 				kring->nr_kflags &= ~NKR_NOINTR;
3510 			} else {
3511 				kring->nr_kflags |= NKR_NOINTR;
3512 			}
3513 		}
3514 	}
3515 
3516 	if (!changed) {
3517 		return 0; /* nothing to do */
3518 	}
3519 
3520 	if (!na->nm_intr) {
3521 		nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3522 		  na->name);
3523 		return -1;
3524 	}
3525 
3526 	na->nm_intr(na, onoff);
3527 
3528 	return 0;
3529 }
3530 
3531 
3532 /*-------------------- driver support routines -------------------*/
3533 
3534 /* default notify callback */
3535 static int
3536 netmap_notify(struct netmap_kring *kring, int flags)
3537 {
3538 	struct netmap_adapter *na = kring->notify_na;
3539 	enum txrx t = kring->tx;
3540 
3541 	nm_os_selwakeup(&kring->si);
3542 	/* optimization: avoid a wake up on the global
3543 	 * queue if nobody has registered for more
3544 	 * than one ring
3545 	 */
3546 	if (na->si_users[t] > 0)
3547 		nm_os_selwakeup(&na->si[t]);
3548 
3549 	return NM_IRQ_COMPLETED;
3550 }
3551 
3552 /* Called by all routines that create netmap_adapters.
3553  * It provides some defaults and gets a reference to the
3554  * memory allocator.
3555  */
3556 int
3557 netmap_attach_common(struct netmap_adapter *na)
3558 {
3559 	if (!na->rx_buf_maxsize) {
3560 		/* Set a conservative default (larger is safer). */
3561 		na->rx_buf_maxsize = PAGE_SIZE;
3562 	}
3563 
3564 #ifdef __FreeBSD__
3565 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3566 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
3567 	}
3568 	na->pdev = na; /* make sure netmap_mem_map() is called */
3569 #endif /* __FreeBSD__ */
3570 	if (na->na_flags & NAF_HOST_RINGS) {
3571 		if (na->num_host_rx_rings == 0)
3572 			na->num_host_rx_rings = 1;
3573 		if (na->num_host_tx_rings == 0)
3574 			na->num_host_tx_rings = 1;
3575 	}
3576 	if (na->nm_krings_create == NULL) {
3577 		/* we assume that we have been called by a driver,
3578 		 * since other port types all provide their own
3579 		 * nm_krings_create
3580 		 */
3581 		na->nm_krings_create = netmap_hw_krings_create;
3582 		na->nm_krings_delete = netmap_hw_krings_delete;
3583 	}
3584 	if (na->nm_notify == NULL)
3585 		na->nm_notify = netmap_notify;
3586 	na->active_fds = 0;
3587 
3588 	if (na->nm_mem == NULL) {
3589 		/* use the global allocator */
3590 		na->nm_mem = netmap_mem_get(&nm_mem);
3591 	}
3592 #ifdef WITH_VALE
3593 	if (na->nm_bdg_attach == NULL)
3594 		/* no special nm_bdg_attach callback. On VALE
3595 		 * attach, we need to interpose a bwrap
3596 		 */
3597 		na->nm_bdg_attach = netmap_default_bdg_attach;
3598 #endif
3599 
3600 	return 0;
3601 }
3602 
3603 /* Wrapper for the register callback provided by netmap-enabled
3604  * hardware drivers.
3605  * nm_iszombie(na) means that the driver module has been
3606  * unloaded, so we cannot call into it.
3607  * nm_os_ifnet_lock() must guarantee mutual exclusion with
3608  * module unloading.
3609  */
3610 static int
3611 netmap_hw_reg(struct netmap_adapter *na, int onoff)
3612 {
3613 	struct netmap_hw_adapter *hwna =
3614 		(struct netmap_hw_adapter*)na;
3615 	int error = 0;
3616 
3617 	nm_os_ifnet_lock();
3618 
3619 	if (nm_iszombie(na)) {
3620 		if (onoff) {
3621 			error = ENXIO;
3622 		} else if (na != NULL) {
3623 			na->na_flags &= ~NAF_NETMAP_ON;
3624 		}
3625 		goto out;
3626 	}
3627 
3628 	error = hwna->nm_hw_register(na, onoff);
3629 
3630 out:
3631 	nm_os_ifnet_unlock();
3632 
3633 	return error;
3634 }
3635 
3636 static void
3637 netmap_hw_dtor(struct netmap_adapter *na)
3638 {
3639 	if (na->ifp == NULL)
3640 		return;
3641 
3642 	NM_DETACH_NA(na->ifp);
3643 }
3644 
3645 
3646 /*
3647  * Allocate a netmap_adapter object, and initialize it from the
3648  * 'arg' passed by the driver on attach.
3649  * We allocate a block of memory of 'size' bytes, which has room
3650  * for struct netmap_adapter plus additional room private to
3651  * the caller.
3652  * Return 0 on success, ENOMEM otherwise.
3653  */
3654 int
3655 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3656 {
3657 	struct netmap_hw_adapter *hwna = NULL;
3658 	struct ifnet *ifp = NULL;
3659 
3660 	if (size < sizeof(struct netmap_hw_adapter)) {
3661 		if (netmap_debug & NM_DEBUG_ON)
3662 			nm_prerr("Invalid netmap adapter size %d", (int)size);
3663 		return EINVAL;
3664 	}
3665 
3666 	if (arg == NULL || arg->ifp == NULL) {
3667 		if (netmap_debug & NM_DEBUG_ON)
3668 			nm_prerr("either arg or arg->ifp is NULL");
3669 		return EINVAL;
3670 	}
3671 
3672 	if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
3673 		if (netmap_debug & NM_DEBUG_ON)
3674 			nm_prerr("%s: invalid rings tx %d rx %d",
3675 				arg->name, arg->num_tx_rings, arg->num_rx_rings);
3676 		return EINVAL;
3677 	}
3678 
3679 	ifp = arg->ifp;
3680 	if (NM_NA_CLASH(ifp)) {
3681 		/* If NA(ifp) is not null but there is no valid netmap
3682 		 * adapter it means that someone else is using the same
3683 		 * pointer (e.g. ax25_ptr on linux). This happens for
3684 		 * instance when also PF_RING is in use. */
3685 		nm_prerr("Error: netmap adapter hook is busy");
3686 		return EBUSY;
3687 	}
3688 
3689 	hwna = nm_os_malloc(size);
3690 	if (hwna == NULL)
3691 		goto fail;
3692 	hwna->up = *arg;
3693 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3694 	strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3695 	if (override_reg) {
3696 		hwna->nm_hw_register = hwna->up.nm_register;
3697 		hwna->up.nm_register = netmap_hw_reg;
3698 	}
3699 	if (netmap_attach_common(&hwna->up)) {
3700 		nm_os_free(hwna);
3701 		goto fail;
3702 	}
3703 	netmap_adapter_get(&hwna->up);
3704 
3705 	NM_ATTACH_NA(ifp, &hwna->up);
3706 
3707 	nm_os_onattach(ifp);
3708 
3709 	if (arg->nm_dtor == NULL) {
3710 		hwna->up.nm_dtor = netmap_hw_dtor;
3711 	}
3712 
3713 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3714 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3715 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3716 	return 0;
3717 
3718 fail:
3719 	nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3720 	return (hwna ? EINVAL : ENOMEM);
3721 }
3722 
3723 
3724 int
3725 netmap_attach(struct netmap_adapter *arg)
3726 {
3727 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3728 			1 /* override nm_reg */);
3729 }
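
/*
 * Attach sketch (illustrative, not compiled here): how a NIC driver would
 * typically fill in a struct netmap_adapter and call netmap_attach() at the
 * end of its own attach routine. 'sc', 'num_queues' and the my_netmap_*()
 * callbacks are hypothetical driver names.
 *
 *	static void
 *	my_netmap_attach(struct my_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.nm_register = my_netmap_reg;
 *		na.nm_txsync = my_netmap_txsync;
 *		na.nm_rxsync = my_netmap_rxsync;
 *		netmap_attach(&na);
 *	}
 *
 * The matching call in the driver detach path is netmap_detach(sc->ifp),
 * see below.
 */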
3730 
3731 
3732 void
3733 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3734 {
3735 	if (!na) {
3736 		return;
3737 	}
3738 
3739 	refcount_acquire(&na->na_refcount);
3740 }
3741 
3742 
3743 /* returns 1 iff the netmap_adapter is destroyed */
3744 int
3745 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3746 {
3747 	if (!na)
3748 		return 1;
3749 
3750 	if (!refcount_release(&na->na_refcount))
3751 		return 0;
3752 
3753 	if (na->nm_dtor)
3754 		na->nm_dtor(na);
3755 
3756 	if (na->tx_rings) { /* XXX should not happen */
3757 		if (netmap_debug & NM_DEBUG_ON)
3758 			nm_prerr("freeing leftover tx_rings");
3759 		na->nm_krings_delete(na);
3760 	}
3761 	netmap_pipe_dealloc(na);
3762 	if (na->nm_mem)
3763 		netmap_mem_put(na->nm_mem);
3764 	bzero(na, sizeof(*na));
3765 	nm_os_free(na);
3766 
3767 	return 1;
3768 }
3769 
3770 /* nm_krings_create callback for all hardware native adapters */
3771 int
3772 netmap_hw_krings_create(struct netmap_adapter *na)
3773 {
3774 	int ret = netmap_krings_create(na, 0);
3775 	if (ret == 0) {
3776 		/* initialize the mbq for the sw rx ring */
3777 		u_int lim = netmap_real_rings(na, NR_RX), i;
3778 		for (i = na->num_rx_rings; i < lim; i++) {
3779 			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3780 		}
3781 		ND("initialized sw rx queue %d", na->num_rx_rings);
3782 	}
3783 	return ret;
3784 }
3785 
3786 
3787 
3788 /*
3789  * Called on module unload by the netmap-enabled drivers
3790  */
3791 void
3792 netmap_detach(struct ifnet *ifp)
3793 {
3794 	struct netmap_adapter *na = NA(ifp);
3795 
3796 	if (!na)
3797 		return;
3798 
3799 	NMG_LOCK();
3800 	netmap_set_all_rings(na, NM_KR_LOCKED);
3801 	/*
3802 	 * if the netmap adapter is not native, somebody
3803 	 * changed it, so we cannot release it here.
3804 	 * The NAF_ZOMBIE flag will notify the new owner that
3805 	 * the driver is gone.
3806 	 */
3807 	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3808 		na->na_flags |= NAF_ZOMBIE;
3809 	}
3810 	/* give active users a chance to notice that NAF_ZOMBIE has been
3811 	 * turned on, so that they can stop and return an error to userspace.
3812 	 * Note that this becomes a NOP if there are no active users and,
3813 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3814 	 * NULL.
3815 	 */
3816 	netmap_enable_all_rings(ifp);
3817 	NMG_UNLOCK();
3818 }
3819 
3820 
3821 /*
3822  * Intercept packets from the network stack and pass them
3823  * to netmap as incoming packets on the 'software' ring.
3824  *
3825  * We only store packets in a bounded mbq and then copy them
3826  * in the relevant rxsync routine.
3827  *
3828  * We rely on the OS to make sure that the ifp and na do not go
3829  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3830  * In nm_register() or whenever there is a reinitialization,
3831  * we make sure to make the mode change visible here.
3832  */
3833 int
3834 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3835 {
3836 	struct netmap_adapter *na = NA(ifp);
3837 	struct netmap_kring *kring, *tx_kring;
3838 	u_int len = MBUF_LEN(m);
3839 	u_int error = ENOBUFS;
3840 	unsigned int txr;
3841 	struct mbq *q;
3842 	int busy;
3843 	u_int i;
3844 
3845 	i = MBUF_TXQ(m);
3846 	if (i >= na->num_host_rx_rings) {
3847 		i = i % na->num_host_rx_rings;
3848 	}
3849 	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3850 
3851 	// XXX [Linux] we do not need this lock
3852 	// if we follow the down/configure/up protocol -gl
3853 	// mtx_lock(&na->core_lock);
3854 
3855 	if (!nm_netmap_on(na)) {
3856 		nm_prerr("%s not in netmap mode anymore", na->name);
3857 		error = ENXIO;
3858 		goto done;
3859 	}
3860 
3861 	txr = MBUF_TXQ(m);
3862 	if (txr >= na->num_tx_rings) {
3863 		txr %= na->num_tx_rings;
3864 	}
3865 	tx_kring = NMR(na, NR_TX)[txr];
3866 
3867 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3868 		return MBUF_TRANSMIT(na, ifp, m);
3869 	}
3870 
3871 	q = &kring->rx_queue;
3872 
3873 	// XXX reconsider long packets if we handle fragments
3874 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3875 		nm_prerr("%s from_host, drop packet size %d > %d", na->name,
3876 			len, NETMAP_BUF_SIZE(na));
3877 		goto done;
3878 	}
3879 
3880 	if (!netmap_generic_hwcsum) {
3881 		if (nm_os_mbuf_has_csum_offld(m)) {
3882 			RD(1, "%s drop mbuf that needs checksum offload", na->name);
3883 			goto done;
3884 		}
3885 	}
3886 
3887 	if (nm_os_mbuf_has_seg_offld(m)) {
3888 		RD(1, "%s drop mbuf that needs generic segmentation offload", na->name);
3889 		goto done;
3890 	}
3891 
3892 #ifdef __FreeBSD__
3893 	ETHER_BPF_MTAP(ifp, m);
3894 #endif /* __FreeBSD__ */
3895 
3896 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3897 	 * and maybe other instances of netmap_transmit (the latter
3898 	 * not possible on Linux).
3899 	 * We enqueue the mbuf only if we are sure there is going to be
3900 	 * enough room in the host RX ring, otherwise we drop it.
3901 	 */
3902 	mbq_lock(q);
3903 
3904 	busy = kring->nr_hwtail - kring->nr_hwcur;
3905 	if (busy < 0)
3906 		busy += kring->nkr_num_slots;
3907 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3908 		RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3909 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3910 	} else {
3911 		mbq_enqueue(q, m);
3912 		ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3913 		/* notify outside the lock */
3914 		m = NULL;
3915 		error = 0;
3916 	}
3917 	mbq_unlock(q);
3918 
3919 done:
3920 	if (m)
3921 		m_freem(m);
3922 	/* unconditionally wake up listeners */
3923 	kring->nm_notify(kring, 0);
3924 	/* this is normally netmap_notify(), but for nics
3925 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3926 	 * which possibly forwards the frames through the switch.
3927 	 */
3928 
3929 	return (error);
3930 }
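
/*
 * For reference, the drop test above restated as a small helper (hypothetical
 * name, not used by this file): the occupied portion of the host RX kring is
 * the distance from nr_hwcur to nr_hwtail, modulo the ring size, and the mbuf
 * is queued only if that, plus the mbufs already queued, leaves at least one
 * free slot.
 *
 *	static inline u_int
 *	example_kring_busy(const struct netmap_kring *kring)
 *	{
 *		int busy = kring->nr_hwtail - kring->nr_hwcur;
 *
 *		if (busy < 0)
 *			busy += kring->nkr_num_slots;	// wrap around
 *		return (busy);
 *	}
 *
 *	// enqueue iff:
 *	//	example_kring_busy(kring) + mbq_len(q) < kring->nkr_num_slots - 1
 */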
3931 
3932 
3933 /*
3934  * netmap_reset() is called by the driver routines when reinitializing
3935  * a ring. The driver is in charge of locking to protect the kring.
3936  * If native netmap mode is not set just return NULL.
3937  * If native netmap mode is set, in particular, we have to set nr_mode to
3938  * NKR_NETMAP_ON.
3939  */
3940 struct netmap_slot *
3941 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3942 	u_int new_cur)
3943 {
3944 	struct netmap_kring *kring;
3945 	int new_hwofs, lim;
3946 
3947 	if (!nm_native_on(na)) {
3948 		ND("interface not in native netmap mode");
3949 		return NULL;	/* nothing to reinitialize */
3950 	}
3951 
3952 	/* XXX note- in the new scheme, we are not guaranteed to be
3953 	 * under lock (e.g. when called on a device reset).
3954 	 * In this case, we should set a flag and not trust the
3955 	 * values too much. In practice: TODO
3956 	 * - set a RESET flag somewhere in the kring
3957 	 * - do the processing in a conservative way
3958 	 * - let the *sync() fixup at the end.
3959 	 */
3960 	if (tx == NR_TX) {
3961 		if (n >= na->num_tx_rings)
3962 			return NULL;
3963 
3964 		kring = na->tx_rings[n];
3965 
3966 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3967 			kring->nr_mode = NKR_NETMAP_OFF;
3968 			return NULL;
3969 		}
3970 
3971 		// XXX check whether we should use hwcur or rcur
3972 		new_hwofs = kring->nr_hwcur - new_cur;
3973 	} else {
3974 		if (n >= na->num_rx_rings)
3975 			return NULL;
3976 		kring = na->rx_rings[n];
3977 
3978 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3979 			kring->nr_mode = NKR_NETMAP_OFF;
3980 			return NULL;
3981 		}
3982 
3983 		new_hwofs = kring->nr_hwtail - new_cur;
3984 	}
3985 	lim = kring->nkr_num_slots - 1;
3986 	if (new_hwofs > lim)
3987 		new_hwofs -= lim + 1;
3988 
3989 	/* Always set the new offset value and realign the ring. */
3990 	if (netmap_debug & NM_DEBUG_ON)
3991 	    nm_prinf("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3992 		na->name,
3993 		tx == NR_TX ? "TX" : "RX", n,
3994 		kring->nkr_hwofs, new_hwofs,
3995 		kring->nr_hwtail,
3996 		tx == NR_TX ? lim : kring->nr_hwtail);
3997 	kring->nkr_hwofs = new_hwofs;
3998 	if (tx == NR_TX) {
3999 		kring->nr_hwtail = kring->nr_hwcur + lim;
4000 		if (kring->nr_hwtail > lim)
4001 			kring->nr_hwtail -= lim + 1;
4002 	}
4003 
4004 	/*
4005 	 * Wakeup on the individual and global selwait
4006 	 * We do the wakeup here, but the ring is not yet reconfigured.
4007 	 * However, we are under lock so there are no races.
4008 	 */
4009 	kring->nr_mode = NKR_NETMAP_ON;
4010 	kring->nm_notify(kring, 0);
4011 	return kring->ring->slot;
4012 }
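
/*
 * Usage sketch (illustrative): a driver calls netmap_reset() from its ring
 * (re)initialization code and, if the ring is in native netmap mode, points
 * the NIC descriptors at the netmap buffers. 'sc', 'rxr', 'ring_nr' and
 * 'num_desc' are hypothetical driver names.
 *
 *	struct netmap_adapter *na = NA(sc->ifp);
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {	// NULL means not in native netmap mode
 *		for (i = 0; i < rxr->num_desc; i++) {
 *			int si = netmap_idx_n2k(na->rx_rings[ring_nr], i);
 *			uint64_t paddr;
 *
 *			PNMB(na, slot + si, &paddr);	// also returns the buffer kva
 *			// program NIC RX descriptor 'i' with 'paddr'
 *		}
 *	}
 */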
4013 
4014 
4015 /*
4016  * Dispatch rx/tx interrupts to the netmap rings.
4017  *
4018  * "work_done" is non-null on the RX path, NULL for the TX path.
4019  * We rely on the OS to make sure that there is only one active
4020  * instance per queue, and that there is appropriate locking.
4021  *
4022  * The 'notify' routine depends on what the ring is attached to.
4023  * - for a netmap file descriptor, do a selwakeup on the individual
4024  *   waitqueue, plus one on the global one if needed
4025  *   (see netmap_notify)
4026  * - for a nic connected to a switch, call the proper forwarding routine
4027  *   (see netmap_bwrap_intr_notify)
4028  */
4029 int
4030 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
4031 {
4032 	struct netmap_kring *kring;
4033 	enum txrx t = (work_done ? NR_RX : NR_TX);
4034 
4035 	q &= NETMAP_RING_MASK;
4036 
4037 	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
4038 	        nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
4039 	}
4040 
4041 	if (q >= nma_get_nrings(na, t))
4042 		return NM_IRQ_PASS; // not a physical queue
4043 
4044 	kring = NMR(na, t)[q];
4045 
4046 	if (kring->nr_mode == NKR_NETMAP_OFF) {
4047 		return NM_IRQ_PASS;
4048 	}
4049 
4050 	if (t == NR_RX) {
4051 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
4052 		*work_done = 1; /* do not fire napi again */
4053 	}
4054 
4055 	return kring->nm_notify(kring, 0);
4056 }
4057 
4058 
4059 /*
4060  * Default functions to handle rx/tx interrupts from a physical device.
4061  * "work_done" is non-null on the RX path, NULL for the TX path.
4062  *
4063  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4064  * so that the caller proceeds with regular processing.
4065  * Otherwise call netmap_common_irq().
4066  *
4067  * If the card is connected to a netmap file descriptor,
4068  * do a selwakeup on the individual queue, plus one on the global one
4069  * if needed (multiqueue card _and_ there are multiqueue listeners),
4070  * and return NR_IRQ_COMPLETED.
4071  *
4072  * Finally, if called on rx from an interface connected to a switch,
4073  * calls the proper forwarding routine.
4074  */
4075 int
4076 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
4077 {
4078 	struct netmap_adapter *na = NA(ifp);
4079 
4080 	/*
4081 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4082 	 * we still use the regular driver even though the previous
4083 	 * check fails. It is unclear whether we should use
4084 	 * nm_native_on() here.
4085 	 */
4086 	if (!nm_netmap_on(na))
4087 		return NM_IRQ_PASS;
4088 
4089 	if (na->na_flags & NAF_SKIP_INTR) {
4090 		ND("use regular interrupt");
4091 		return NM_IRQ_PASS;
4092 	}
4093 
4094 	return netmap_common_irq(na, q, work_done);
4095 }
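
/*
 * Usage sketch (illustrative): how a driver interrupt/cleanup routine
 * dispatches to netmap and skips the regular datapath when netmap has
 * consumed the event. 'rxr->me' and 'txr->me' stand for the queue index
 * (hypothetical field names); netmap_tx_irq() is the TX-side wrapper that
 * passes a NULL work_done.
 *
 *	// RX cleanup
 *	u_int work_done = 0;
 *	if (netmap_rx_irq(ifp, rxr->me, &work_done) != NM_IRQ_PASS)
 *		return;		// netmap handled it, skip mbuf processing
 *
 *	// TX completion
 *	if (netmap_tx_irq(ifp, txr->me) != NM_IRQ_PASS)
 *		return;
 */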
4096 
4097 /* set/clear native flags and if_transmit/netdev_ops */
4098 void
4099 nm_set_native_flags(struct netmap_adapter *na)
4100 {
4101 	struct ifnet *ifp = na->ifp;
4102 
4103 	/* We do the setup for intercepting packets only if we are the
4104 	 * first user of this adapter. */
4105 	if (na->active_fds > 0) {
4106 		return;
4107 	}
4108 
4109 	na->na_flags |= NAF_NETMAP_ON;
4110 	nm_os_onenter(ifp);
4111 	nm_update_hostrings_mode(na);
4112 }
4113 
4114 void
4115 nm_clear_native_flags(struct netmap_adapter *na)
4116 {
4117 	struct ifnet *ifp = na->ifp;
4118 
4119 	/* We undo the setup for intercepting packets only if we are the
4120 	 * last user of this adapter. */
4121 	if (na->active_fds > 0) {
4122 		return;
4123 	}
4124 
4125 	nm_update_hostrings_mode(na);
4126 	nm_os_onexit(ifp);
4127 
4128 	na->na_flags &= ~NAF_NETMAP_ON;
4129 }
4130 
4131 /*
4132  * Module loader and unloader
4133  *
4134  * netmap_init() creates the /dev/netmap device and initializes
4135  * all global variables. Returns 0 on success, errno on failure
4136  * (failure is not expected in practice).
4137  *
4138  * netmap_fini() destroys everything.
4139  */
4140 
4141 static struct cdev *netmap_dev; /* /dev/netmap character device. */
4142 extern struct cdevsw netmap_cdevsw;
4143 
4144 
4145 void
4146 netmap_fini(void)
4147 {
4148 	if (netmap_dev)
4149 		destroy_dev(netmap_dev);
4150 	/* we assume that there are no netmap users left */
4151 	nm_os_ifnet_fini();
4152 	netmap_uninit_bridges();
4153 	netmap_mem_fini();
4154 	NMG_LOCK_DESTROY();
4155 	nm_prinf("netmap: unloaded module.");
4156 }
4157 
4158 
4159 int
4160 netmap_init(void)
4161 {
4162 	int error;
4163 
4164 	NMG_LOCK_INIT();
4165 
4166 	error = netmap_mem_init();
4167 	if (error != 0)
4168 		goto fail;
4169 	/*
4170 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4171 	 * when the module is compiled in.
4172 	 * XXX could use make_dev_credv() to get error number
4173 	 */
4174 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4175 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4176 			      "netmap");
4177 	if (!netmap_dev)
4178 		goto fail;
4179 
4180 	error = netmap_init_bridges();
4181 	if (error)
4182 		goto fail;
4183 
4184 #ifdef __FreeBSD__
4185 	nm_os_vi_init_index();
4186 #endif
4187 
4188 	error = nm_os_ifnet_init();
4189 	if (error)
4190 		goto fail;
4191 
4192 	nm_prinf("netmap: loaded module");
4193 	return (0);
4194 fail:
4195 	netmap_fini();
4196 	return (EINVAL); /* may be incorrect */
4197 }
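
/*
 * Hook-up sketch (illustrative): netmap_init()/netmap_fini() are invoked from
 * OS-specific glue code; on FreeBSD this is typically a module event handler
 * along these lines (the handler name is hypothetical, the real glue lives
 * elsewhere):
 *
 *	static int
 *	example_netmap_modevent(module_t mod, int event, void *arg)
 *	{
 *		switch (event) {
 *		case MOD_LOAD:
 *			return (netmap_init());
 *		case MOD_UNLOAD:
 *			netmap_fini();
 *			return (0);
 *		default:
 *			return (EOPNOTSUPP);
 *		}
 *	}
 */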
4198