xref: /freebsd-14.2/sys/dev/netmap/netmap.c (revision 954dca4c)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 #ifdef __FreeBSD__
28 #define TEST_STUFF	// test code, does not compile yet on linux
29 #endif /* __FreeBSD__ */
30 
31 /*
32  * This module supports memory-mapped access to network devices;
33  * see netmap(4).
34  *
35  * The module uses a large memory pool allocated by the kernel
36  * and accessible as mmapped memory by multiple userspace threads/processes.
37  * The memory pool contains packet buffers and "netmap rings",
38  * i.e. user-accessible copies of the interface's queues.
39  *
40  * Access to the network card works like this:
41  * 1. a process/thread issues one or more open() calls on /dev/netmap,
42  *    to create select()able file descriptors on which events are reported.
43  * 2. on each descriptor, the process issues an ioctl() to identify
44  *    the interface that should report events to the file descriptor.
45  * 3. on each descriptor, the process issues an mmap() request to
46  *    map the shared memory region within the process' address space.
47  *    The list of interesting queues is indicated by a location in
48  *    the shared memory region.
49  * 4. using the functions in the netmap(4) userspace API, a process
50  *    can look up the occupation state of a queue, access memory buffers,
51  *    and retrieve received packets or enqueue packets to transmit.
52  * 5. using some ioctl()s the process can synchronize the userspace view
53  *    of the queue with the actual status in the kernel. This includes both
54  *    receiving the notification of new packets, and transmitting new
55  *    packets on the output interface.
56  * 6. select() or poll() can be used to wait for events on individual
57  *    transmit or receive queues (or all queues for a given interface).
58  *
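 * As a minimal illustration of steps 1-6 above (a simplified sketch:
 * error handling is omitted, "em0" is just an example interface name,
 * and the authoritative API is documented in netmap(4) and
 * net/netmap_user.h; userspace also needs <fcntl.h>, <string.h>,
 * <sys/ioctl.h>, <sys/mman.h> and <poll.h>):
 *
 *	struct nmreq req;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *txring;
 *	struct pollfd pfd;
 *	void *mem;
 *	int fd;
 *
 *	fd = open("/dev/netmap", O_RDWR);			// step 1
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);					// step 3
 *	nifp = NETMAP_IF(mem, req.nr_offset);			// step 4
 *	txring = NETMAP_TXRING(nifp, 0);
 *	// ... fill slots, advance txring->cur, decrease txring->avail ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	pfd.fd = fd;
 *	pfd.events = POLLOUT;
 *	poll(&pfd, 1, -1);					// step 6
 *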
59 
60 		SYNCHRONIZATION (USER)
61 
62 The netmap rings and data structures may be shared among multiple
63 user threads or even independent processes.
64 Any synchronization among those threads/processes is delegated
65 to the threads themselves. Only one thread at a time can be in
66 a system call on the same netmap ring. The OS does not enforce
67 this; it only guarantees that the system will not crash in case of
68 invalid usage.
69 
70 		LOCKING (INTERNAL)
71 
72 Within the kernel, access to the netmap rings is protected as follows:
73 
74 - a spinlock on each ring, to handle producer/consumer races on
75   RX rings attached to the host stack (against multiple host
76   threads writing from the host stack to the same ring),
77   and on 'destination' rings attached to a VALE switch
78   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
79   protecting against multiple active senders for the same destination.
80 
81 - an atomic variable to guarantee that there is at most one
82   instance of *_txsync() or *_rxsync() running on the ring at any time.
83   For rings connected to user file
84   descriptors, an atomic_test_and_set() protects this, and the
85   lock on the ring is not actually used.
86   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
87   is also used to prevent multiple executions (the driver might indeed
88   already guarantee this).
89   For NIC TX rings connected to a VALE switch, the lock arbitrates
90   access to the queue (both when allocating buffers and when pushing
91   them out).
92 
93 - *xsync() should be protected against initializations of the card.
94   On FreeBSD most devices have the reset routine protected by
95   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
96   the RING protection on rx_reset(); this should be added.
97 
98   On linux there is an external lock on the tx path, which probably
99   also arbitrates access to the reset routine. XXX to be revised
100 
101 - a per-interface core_lock protecting access from the host stack
102   while interfaces may be detached from netmap mode.
103   XXX there should be no need for this lock if we detach the interfaces
104   only while they are down.
105 
106 
107 --- VALE SWITCH ---
108 
109 NMG_LOCK() serializes all modifications to switches and ports.
110 A switch cannot be deleted until all ports are gone.
111 
112 For each switch, an SX lock (RWlock on linux) protects
113 deletion of ports. When adding or deleting a port, the
114 lock is acquired in exclusive mode (after holding NMG_LOCK).
115 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
116 The lock is held throughout the entire forwarding cycle,
117 during which the thread may incur a page fault.
118 Hence it is important that sleepable shared locks are used.
119 
120 On the rx ring, the per-port lock is grabbed initially to reserve
121 a number of slots in the ring, then the lock is released,
122 packets are copied from source to destination, and then
123 the lock is acquired again and the receive ring is updated.
124 (A similar thing is done on the tx ring for NIC and host stack
125 ports attached to the switch)
126 
127  */
128 
129 /*
130  * OS-specific code that is used only within this file.
131  * Other OS-specific code that must be accessed by drivers
132  * is present in netmap_kern.h
133  */
134 
135 #if defined(__FreeBSD__)
136 #include <sys/cdefs.h> /* prerequisite */
137 __FBSDID("$FreeBSD$");
138 
139 #include <sys/types.h>
140 #include <sys/module.h>
141 #include <sys/errno.h>
142 #include <sys/param.h>	/* defines used in kernel.h */
143 #include <sys/jail.h>
144 #include <sys/kernel.h>	/* types used in module initialization */
145 #include <sys/conf.h>	/* cdevsw struct */
146 #include <sys/uio.h>	/* uio struct */
147 #include <sys/sockio.h>
148 #include <sys/socketvar.h>	/* struct socket */
149 #include <sys/malloc.h>
150 #include <sys/mman.h>	/* PROT_EXEC */
151 #include <sys/poll.h>
152 #include <sys/proc.h>
153 #include <sys/rwlock.h>
154 #include <vm/vm.h>	/* vtophys */
155 #include <vm/pmap.h>	/* vtophys */
156 #include <vm/vm_param.h>
157 #include <vm/vm_object.h>
158 #include <vm/vm_page.h>
159 #include <vm/vm_pager.h>
160 #include <vm/uma.h>
161 #include <sys/socket.h> /* sockaddrs */
162 #include <sys/selinfo.h>
163 #include <sys/sysctl.h>
164 #include <net/if.h>
165 #include <net/if_var.h>
166 #include <net/bpf.h>		/* BIOCIMMEDIATE */
167 #include <net/vnet.h>
168 #include <machine/bus.h>	/* bus_dmamap_* */
169 #include <sys/endian.h>
170 #include <sys/refcount.h>
171 
172 #define prefetch(x)	__builtin_prefetch(x)
173 
174 #define BDG_RWLOCK_T		struct rwlock // struct rwlock
175 
176 #define	BDG_RWINIT(b)		\
177 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
178 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
179 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
180 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
181 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
182 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
183 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
184 
185 
186 /* netmap global lock.
187  * normally called within the user thread (upon a system call)
188  * or when a file descriptor or process is terminated
189  * (last close or last munmap)
190  */
191 
192 #define NMG_LOCK_T		struct mtx
193 #define NMG_LOCK_INIT()		mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF)
194 #define NMG_LOCK_DESTROY()	mtx_destroy(&netmap_global_lock)
195 #define NMG_LOCK()		mtx_lock(&netmap_global_lock)
196 #define NMG_UNLOCK()		mtx_unlock(&netmap_global_lock)
197 #define NMG_LOCK_ASSERT()	mtx_assert(&netmap_global_lock, MA_OWNED)
198 
199 
200 /* atomic operations */
201 #include <machine/atomic.h>
202 #define NM_ATOMIC_TEST_AND_SET(p)	(!atomic_cmpset_acq_int((p), 0, 1))
203 #define NM_ATOMIC_CLEAR(p)		atomic_store_rel_int((p), 0)
204 
205 
206 #elif defined(linux)
207 
208 #include "bsd_glue.h"
209 
210 static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *);
211 
212 static struct device_driver*
213 linux_netmap_find_driver(struct device *dev)
214 {
215 	struct device_driver *dd;
216 
217 	while ( (dd = dev->driver) == NULL ) {
218 		if ( (dev = dev->parent) == NULL )
219 			return NULL;
220 	}
221 	return dd;
222 }
223 
224 static struct net_device*
225 ifunit_ref(const char *name)
226 {
227 	struct net_device *ifp = dev_get_by_name(&init_net, name);
228 	struct device_driver *dd;
229 
230 	if (ifp == NULL)
231 		return NULL;
232 
233 	if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL )
234 		goto error;
235 
236 	if (!try_module_get(dd->owner))
237 		goto error;
238 
239 	return ifp;
240 error:
241 	dev_put(ifp);
242 	return NULL;
243 }
244 
245 static void
246 if_rele(struct net_device *ifp)
247 {
248 	struct device_driver *dd;
249 	dd = linux_netmap_find_driver(&ifp->dev);
250 	dev_put(ifp);
251 	if (dd)
252 		module_put(dd->owner);
253 }
254 
255 // XXX a mtx would suffice here too 20130404 gl
256 #define NMG_LOCK_T		struct semaphore
257 #define NMG_LOCK_INIT()		sema_init(&netmap_global_lock, 1)
258 #define NMG_LOCK_DESTROY()
259 #define NMG_LOCK()		down(&netmap_global_lock)
260 #define NMG_UNLOCK()		up(&netmap_global_lock)
261 #define NMG_LOCK_ASSERT()	//	XXX to be completed
262 
263 
264 #elif defined(__APPLE__)
265 
266 #warning OSX support is only partial
267 #include "osx_glue.h"
268 
269 #else
270 
271 #error	Unsupported platform
272 
273 #endif /* unsupported */
274 
275 /*
276  * common headers
277  */
278 #include <net/netmap.h>
279 #include <dev/netmap/netmap_kern.h>
280 #include <dev/netmap/netmap_mem2.h>
281 
282 
283 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
284 
285 /*
286  * The following variables are used by the drivers and replicate
287  * fields in the global memory pool. They only refer to buffers
288  * used by physical interfaces.
289  */
290 u_int netmap_total_buffers;
291 u_int netmap_buf_size;
292 char *netmap_buffer_base;	/* also address of an invalid buffer */
293 
294 /* user-controlled variables */
295 int netmap_verbose;
296 
297 static int netmap_no_timestamp; /* don't timestamp on rxsync */
298 
299 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
300 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
301     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
302 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
303     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
304 int netmap_mitigate = 1;
305 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
306 int netmap_no_pendintr = 1;
307 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
308     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
309 int netmap_txsync_retry = 2;
310 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
311     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
312 
313 int netmap_drop = 0;	/* debugging */
314 int netmap_flags = 0;	/* debug flags */
315 int netmap_fwd = 0;	/* force transparent mode */
316 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
317 
318 SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , "");
319 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
320 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
321 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
322 
323 NMG_LOCK_T	netmap_global_lock;
324 
325 /*
326  * protect against multiple threads using the same ring.
327  * also check that the ring has not been stopped.
328  */
329 #define NM_KR_BUSY	1
330 #define NM_KR_STOPPED	2
331 static void nm_kr_put(struct netmap_kring *kr);
332 static __inline int nm_kr_tryget(struct netmap_kring *kr)
333 {
334 	/* check a first time without taking the lock
335 	 * to avoid starvation for nm_kr_get()
336 	 */
337 	if (unlikely(kr->nkr_stopped)) {
338 		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
339 		return NM_KR_STOPPED;
340 	}
341 	if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)))
342 		return NM_KR_BUSY;
343 	/* check a second time with lock held */
344 	if (unlikely(kr->nkr_stopped)) {
345 		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
346 		nm_kr_put(kr);
347 		return NM_KR_STOPPED;
348 	}
349 	return 0;
350 }
351 
352 static __inline void nm_kr_put(struct netmap_kring *kr)
353 {
354 	NM_ATOMIC_CLEAR(&kr->nr_busy);
355 }
356 
357 static void nm_kr_get(struct netmap_kring *kr)
358 {
359 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
360 		tsleep(kr, 0, "NM_KR_GET", 4);
361 }
362 
363 static void nm_disable_ring(struct netmap_kring *kr)
364 {
365 	kr->nkr_stopped = 1;
366 	nm_kr_get(kr);
367 	mtx_lock(&kr->q_lock);
368 	mtx_unlock(&kr->q_lock);
369 	nm_kr_put(kr);
370 }
371 
372 void netmap_disable_all_rings(struct ifnet *ifp)
373 {
374 	struct netmap_adapter *na;
375 	int i;
376 
377 	if (!(ifp->if_capenable & IFCAP_NETMAP))
378 		return;
379 
380 	na = NA(ifp);
381 
382 	for (i = 0; i < na->num_tx_rings + 1; i++) {
383 		nm_disable_ring(na->tx_rings + i);
384 		selwakeuppri(&na->tx_rings[i].si, PI_NET);
385 	}
386 	for (i = 0; i < na->num_rx_rings + 1; i++) {
387 		nm_disable_ring(na->rx_rings + i);
388 		selwakeuppri(&na->rx_rings[i].si, PI_NET);
389 	}
390 	selwakeuppri(&na->tx_si, PI_NET);
391 	selwakeuppri(&na->rx_si, PI_NET);
392 }
393 
394 void netmap_enable_all_rings(struct ifnet *ifp)
395 {
396 	struct netmap_adapter *na;
397 	int i;
398 
399 	if (!(ifp->if_capenable & IFCAP_NETMAP))
400 		return;
401 
402 	na = NA(ifp);
403 	for (i = 0; i < na->num_tx_rings + 1; i++) {
404 		D("enabling %p", na->tx_rings + i);
405 		na->tx_rings[i].nkr_stopped = 0;
406 	}
407 	for (i = 0; i < na->num_rx_rings + 1; i++) {
408 		D("enabling %p", na->rx_rings + i);
409 		na->rx_rings[i].nkr_stopped = 0;
410 	}
411 }
412 
413 
414 /*
415  * generic bound_checking function
416  */
417 u_int
418 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
419 {
420 	u_int oldv = *v;
421 	const char *op = NULL;
422 
423 	if (dflt < lo)
424 		dflt = lo;
425 	if (dflt > hi)
426 		dflt = hi;
427 	if (oldv < lo) {
428 		*v = dflt;
429 		op = "Bump";
430 	} else if (oldv > hi) {
431 		*v = hi;
432 		op = "Clamp";
433 	}
434 	if (op && msg)
435 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
436 	return *v;
437 }
438 
439 /*
440  * packet-dump function, user-supplied or static buffer.
441  * The destination buffer must hold ~72 bytes per 16 bytes dumped, plus a header.
442  */
443 const char *
444 nm_dump_buf(char *p, int len, int lim, char *dst)
445 {
446 	static char _dst[8192];
447         int i, j, i0;
448 	static char hex[] ="0123456789abcdef";
449 	char *o;	/* output position */
450 
451 #define P_HI(x)	hex[((x) & 0xf0)>>4]
452 #define P_LO(x)	hex[((x) & 0xf)]
453 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
454 	if (!dst)
455 		dst = _dst;
456 	if (lim <= 0 || lim > len)
457 		lim = len;
458 	o = dst;
459 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
460 	o += strlen(o);
461 	/* hexdump routine */
462 	for (i = 0; i < lim; ) {
463 		sprintf(o, "%5d: ", i);
464 		o += strlen(o);
465 		memset(o, ' ', 48);
466 		i0 = i;
467 		for (j=0; j < 16 && i < lim; i++, j++) {
468 			o[j*3] = P_HI(p[i]);
469 			o[j*3+1] = P_LO(p[i]);
470 		}
471 		i = i0;
472 		for (j=0; j < 16 && i < lim; i++, j++)
473 			o[j + 48] = P_C(p[i]);
474 		o[j+48] = '\n';
475 		o += j+49;
476 	}
477 	*o = '\0';
478 #undef P_HI
479 #undef P_LO
480 #undef P_C
481 	return dst;
482 }
483 
484 /*
485  * system parameters (most of them in netmap_kern.h)
486  * NM_NAME	prefix for switch port names, default "vale"
487  * NM_BDG_MAXPORTS	number of ports
488  * NM_BRIDGES	max number of switches in the system.
489  *	XXX should become a sysctl or tunable
490  *
491  * Switch ports are named valeX:Y where X is the switch name and Y
492  * is the port. If Y matches a physical interface name, the port is
493  * connected to a physical device.
494  *
495  * Unlike physical interfaces, switch ports use their own memory region
496  * for rings and buffers.
497  * The virtual interfaces use a per-queue lock instead of the core lock.
498  * In the tx loop, we aggregate traffic in batches to make all operations
499  * faster. The batch size is bridge_batch.
500  */
501 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
502 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
503 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
504 #define NM_BDG_HASH		1024	/* forwarding table entries */
505 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
506 #define NM_MULTISEG		64	/* max size of a chain of bufs */
507 /* actual size of the tables */
508 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
509 /* NM_FT_NULL terminates a list of slots in the ft */
510 #define NM_FT_NULL		NM_BDG_BATCH_MAX
511 #define	NM_BRIDGES		8	/* number of bridges */
512 
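/*
 * For illustration only (a simplified userspace sketch, not part of
 * this module): a VALE port is created or attached with the same
 * NIOCREGIF ioctl used for NICs, simply by passing a name with the
 * NM_NAME ("vale") prefix.  The names "vale0", "v1" and "em1" below
 * are arbitrary examples, and fd is a descriptor from an earlier
 * open("/dev/netmap", O_RDWR):
 *
 *	struct nmreq req;
 *
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	// create (or look up) virtual port v1 on switch vale0 ...
 *	strlcpy(req.nr_name, "vale0:v1", sizeof(req.nr_name));
 *	// ... or use "vale0:em1" to attach the physical NIC em1 instead
 *	ioctl(fd, NIOCREGIF, &req);
 */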
513 
514 /*
515  * bridge_batch is set via sysctl to the max batch size to be
516  * used in the bridge. The actual value may be larger as the
517  * last packet in the block may overflow the size.
518  */
519 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
520 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
521 
522 
523 /*
524  * These are used to handle reference counters for bridge ports.
525  */
526 #define	ADD_BDG_REF(ifp)	refcount_acquire(&NA(ifp)->na_bdg_refcount)
527 #define	DROP_BDG_REF(ifp)	refcount_release(&NA(ifp)->na_bdg_refcount)
528 
529 /* The bridge references the buffers using the device specific look up table */
530 static inline void *
531 BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot)
532 {
533 	struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut;
534 	uint32_t i = slot->buf_idx;
535 	return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ?  lut[0].vaddr : lut[i].vaddr;
536 }
537 
538 static int bdg_netmap_attach(struct netmap_adapter *);
539 static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
540 int kern_netmap_regif(struct nmreq *nmr);
541 
542 /*
543  * Each transmit queue accumulates a batch of packets into
544  * a structure before forwarding. Packets to the same
545  * destination are put in a list using ft_next as a link field.
546  * ft_frags and ft_next are valid only on the first fragment.
547  */
548 struct nm_bdg_fwd {	/* forwarding entry for a bridge */
549 	void *ft_buf;		/* netmap or indirect buffer */
550 	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
551 	uint8_t _ft_port;	/* dst port (unused) */
552 	uint16_t ft_flags;	/* flags, e.g. indirect */
553 	uint16_t ft_len;	/* src fragment len */
554 	uint16_t ft_next;	/* next packet to same destination */
555 };
556 
557 /*
558  * For each output interface, nm_bdg_q is used to construct a list.
559  * bq_len is the number of output buffers (we can have coalescing
560  * during the copy).
561  */
562 struct nm_bdg_q {
563 	uint16_t bq_head;
564 	uint16_t bq_tail;
565 	uint32_t bq_len;	/* number of buffers */
566 };
567 
568 /* XXX revise this */
569 struct nm_hash_ent {
570 	uint64_t	mac;	/* the top 2 bytes are the epoch */
571 	uint64_t	ports;
572 };
573 
574 /*
575  * nm_bridge is a descriptor for a VALE switch.
576  * Interfaces for a bridge are all in bdg_ports[].
577  * The array has a fixed size; an empty entry does not terminate
578  * the search, but lookups only occur on attach/detach so we
579  * don't mind if they are slow.
580  *
581  * The bridge is non blocking on the transmit ports: excess
582  * packets are dropped if there is no room on the output port.
583  *
584  * bdg_lock protects accesses to the bdg_ports array.
585  * This is a rw lock (or equivalent).
586  */
587 struct nm_bridge {
588 	/* XXX what is the proper alignment/layout ? */
589 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
590 	int		bdg_namelen;
591 	uint32_t	bdg_active_ports; /* 0 means free */
592 	char		bdg_basename[IFNAMSIZ];
593 
594 	/* Indexes of active ports (up to active_ports)
595 	 * and all other remaining ports.
596 	 */
597 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
598 
599 	struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];
600 
601 
602 	/*
603 	 * The function to decide the destination port.
604 	 * It returns the index of the destination port,
605 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
606 	 * forward this packet.  ring_nr is the source ring index, and the
607 	 * function may overwrite this value to forward this packet to a
608 	 * different ring index.
609 	 * This function must be set by netmap_bdgctl().
610 	 */
611 	bdg_lookup_fn_t nm_bdg_lookup;
612 
613 	/* the forwarding table, MAC+ports.
614 	 * XXX should be changed to an argument to be passed to
615 	 * the lookup function, and allocated on attach
616 	 */
617 	struct nm_hash_ent ht[NM_BDG_HASH];
618 };
619 
620 
621 /*
622  * XXX in principle nm_bridges could be created dynamically
623  * Right now we have a static array and deletions are protected
624  * by an exclusive lock.
625  */
626 struct nm_bridge nm_bridges[NM_BRIDGES];
627 
628 
629 /*
630  * A few functions to tell which kind of port we are using.
631  * XXX should we hold a lock ?
632  *
633  * nma_is_vp()		virtual port
634  * nma_is_host()	port connected to the host stack
635  * nma_is_hw()		port connected to a NIC
636  */
637 int nma_is_vp(struct netmap_adapter *na);
638 int
639 nma_is_vp(struct netmap_adapter *na)
640 {
641 	return na->nm_register == bdg_netmap_reg;
642 }
643 
644 static __inline int
645 nma_is_host(struct netmap_adapter *na)
646 {
647 	return na->nm_register == NULL;
648 }
649 
650 static __inline int
651 nma_is_hw(struct netmap_adapter *na)
652 {
653 	/* In case of sw adapter, nm_register is NULL */
654 	return !nma_is_vp(na) && !nma_is_host(na);
655 }
656 
657 
658 /*
659  * If the NIC is owned by the kernel
660  * (i.e., bridge), neither another bridge nor a user can use it;
661  * if the NIC is owned by a user, only users can share it.
662  * Evaluation must be done under NMG_LOCK().
663  */
664 #define NETMAP_OWNED_BY_KERN(ifp)	(!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
665 #define NETMAP_OWNED_BY_ANY(ifp) \
666 	(NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))
667 
668 /*
669  * NA(ifp)->bdg_port	port index
670  */
671 
672 
673 /*
674  * this is a slightly optimized copy routine which rounds
675  * to a multiple of 64 bytes and is often faster than dealing
676  * with other odd sizes. We assume there is enough room
677  * in the source and destination buffers.
678  *
679  * XXX only for multiples of 64 bytes, non overlapped.
680  */
681 static inline void
682 pkt_copy(void *_src, void *_dst, int l)
683 {
684         uint64_t *src = _src;
685         uint64_t *dst = _dst;
686         if (unlikely(l >= 1024)) {
687                 memcpy(dst, src, l);
688                 return;
689         }
690         for (; likely(l > 0); l-=64) {
691                 *dst++ = *src++;
692                 *dst++ = *src++;
693                 *dst++ = *src++;
694                 *dst++ = *src++;
695                 *dst++ = *src++;
696                 *dst++ = *src++;
697                 *dst++ = *src++;
698                 *dst++ = *src++;
699         }
700 }
701 
702 
703 #ifdef TEST_STUFF
704 struct xxx {
705 	char *name;
706 	void (*fn)(uint32_t);
707 };
708 
709 
710 static void
711 nm_test_defmtx(uint32_t n)
712 {
713 	uint32_t i;
714 	struct mtx m;
715 	mtx_init(&m, "test", NULL, MTX_DEF);
716 	for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); }
717 	mtx_destroy(&m);
718 	return;
719 }
720 
721 static void
722 nm_test_spinmtx(uint32_t n)
723 {
724 	uint32_t i;
725 	struct mtx m;
726 	mtx_init(&m, "test", NULL, MTX_SPIN);
727 	for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); }
728 	mtx_destroy(&m);
729 	return;
730 }
731 
732 static void
733 nm_test_rlock(uint32_t n)
734 {
735 	uint32_t i;
736 	struct rwlock m;
737 	rw_init(&m, "test");
738 	for (i = 0; i < n; i++) { rw_rlock(&m); rw_runlock(&m); }
739 	rw_destroy(&m);
740 	return;
741 }
742 
743 static void
744 nm_test_wlock(uint32_t n)
745 {
746 	uint32_t i;
747 	struct rwlock m;
748 	rw_init(&m, "test");
749 	for (i = 0; i < n; i++) { rw_wlock(&m); rw_wunlock(&m); }
750 	rw_destroy(&m);
751 	return;
752 }
753 
754 static void
755 nm_test_slock(uint32_t n)
756 {
757 	uint32_t i;
758 	struct sx m;
759 	sx_init(&m, "test");
760 	for (i = 0; i < n; i++) { sx_slock(&m); sx_sunlock(&m); }
761 	sx_destroy(&m);
762 	return;
763 }
764 
765 static void
766 nm_test_xlock(uint32_t n)
767 {
768 	uint32_t i;
769 	struct sx m;
770 	sx_init(&m, "test");
771 	for (i = 0; i < n; i++) { sx_xlock(&m); sx_xunlock(&m); }
772 	sx_destroy(&m);
773 	return;
774 }
775 
776 
777 struct xxx nm_tests[] = {
778 	{ "defmtx", nm_test_defmtx },
779 	{ "spinmtx", nm_test_spinmtx },
780 	{ "rlock", nm_test_rlock },
781 	{ "wlock", nm_test_wlock },
782 	{ "slock", nm_test_slock },
783 	{ "xlock", nm_test_xlock },
784 };
785 
786 static int
787 nm_test(struct nmreq *nmr)
788 {
789 	uint32_t scale, n, test;
790 	static int old_test = -1;
791 
792 	test = nmr->nr_cmd;
793 	scale = nmr->nr_offset;
794 	n = sizeof(nm_tests) / sizeof(struct xxx) - 1;
795 	if (test > n) {
796 		D("test index too high, max %d", n);
797 		return 0;
798 	}
799 
800 	if (old_test != test) {
801 		D("test %s scale %d", nm_tests[test].name, scale);
802 		old_test = test;
803 	}
804 	nm_tests[test].fn(scale);
805 	return 0;
806 }
807 #endif /* TEST_STUFF */
808 
809 /*
810  * locate a bridge among the existing ones.
811  * MUST BE CALLED WITH NMG_LOCK()
812  *
813  * A ':' in the name terminates the bridge name (e.g. 'vale0' in 'vale0:em1');
814  * otherwise it is just NM_NAME. We assume the name has at least NM_NAME chars.
815  */
816 static struct nm_bridge *
817 nm_find_bridge(const char *name, int create)
818 {
819 	int i, l, namelen;
820 	struct nm_bridge *b = NULL;
821 
822 	NMG_LOCK_ASSERT();
823 
824 	namelen = strlen(NM_NAME);	/* base length */
825 	l = name ? strlen(name) : 0;		/* actual length */
826 	if (l < namelen) {
827 		D("invalid bridge name %s", name ? name : "(null)");
828 		return NULL;
829 	}
830 	for (i = namelen + 1; i < l; i++) {
831 		if (name[i] == ':') {
832 			namelen = i;
833 			break;
834 		}
835 	}
836 	if (namelen >= IFNAMSIZ)
837 		namelen = IFNAMSIZ;
838 	ND("--- prefix is '%.*s' ---", namelen, name);
839 
840 	/* lookup the name, remember empty slot if there is one */
841 	for (i = 0; i < NM_BRIDGES; i++) {
842 		struct nm_bridge *x = nm_bridges + i;
843 
844 		if (x->bdg_active_ports == 0) {
845 			if (create && b == NULL)
846 				b = x;	/* record empty slot */
847 		} else if (x->bdg_namelen != namelen) {
848 			continue;
849 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
850 			ND("found '%.*s' at %d", namelen, name, i);
851 			b = x;
852 			break;
853 		}
854 	}
855 	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
856 		/* initialize the bridge */
857 		strncpy(b->bdg_basename, name, namelen);
858 		ND("create new bridge %s with ports %d", b->bdg_basename,
859 			b->bdg_active_ports);
860 		b->bdg_namelen = namelen;
861 		b->bdg_active_ports = 0;
862 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
863 			b->bdg_port_index[i] = i;
864 		/* set the default function */
865 		b->nm_bdg_lookup = netmap_bdg_learning;
866 		/* reset the MAC address table */
867 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
868 	}
869 	return b;
870 }
871 
872 
873 /*
874  * Free the forwarding tables for rings attached to switch ports.
875  */
876 static void
877 nm_free_bdgfwd(struct netmap_adapter *na)
878 {
879 	int nrings, i;
880 	struct netmap_kring *kring;
881 
882 	NMG_LOCK_ASSERT();
883 	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
884 	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
885 	for (i = 0; i < nrings; i++) {
886 		if (kring[i].nkr_ft) {
887 			free(kring[i].nkr_ft, M_DEVBUF);
888 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
889 		}
890 	}
891 	if (nma_is_hw(na))
892 		nm_free_bdgfwd(SWNA(na->ifp));
893 }
894 
895 
896 /*
897  * Allocate the forwarding tables for the rings attached to the bridge ports.
898  */
899 static int
900 nm_alloc_bdgfwd(struct netmap_adapter *na)
901 {
902 	int nrings, l, i, num_dstq;
903 	struct netmap_kring *kring;
904 
905 	NMG_LOCK_ASSERT();
906 	/* all port:rings + broadcast */
907 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
908 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
909 	l += sizeof(struct nm_bdg_q) * num_dstq;
910 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
911 
912 	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
913 	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
914 	for (i = 0; i < nrings; i++) {
915 		struct nm_bdg_fwd *ft;
916 		struct nm_bdg_q *dstq;
917 		int j;
918 
919 		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
920 		if (!ft) {
921 			nm_free_bdgfwd(na);
922 			return ENOMEM;
923 		}
924 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
925 		for (j = 0; j < num_dstq; j++) {
926 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
927 			dstq[j].bq_len = 0;
928 		}
929 		kring[i].nkr_ft = ft;
930 	}
931 	if (nma_is_hw(na))
932 		nm_alloc_bdgfwd(SWNA(na->ifp));
933 	return 0;
934 }
935 
936 
937 /*
938  * Fetch configuration from the device, to cope with dynamic
939  * reconfigurations after loading the module.
940  */
941 static int
942 netmap_update_config(struct netmap_adapter *na)
943 {
944 	struct ifnet *ifp = na->ifp;
945 	u_int txr, txd, rxr, rxd;
946 
947 	txr = txd = rxr = rxd = 0;
948 	if (na->nm_config) {
949 		na->nm_config(ifp, &txr, &txd, &rxr, &rxd);
950 	} else {
951 		/* take whatever we had at init time */
952 		txr = na->num_tx_rings;
953 		txd = na->num_tx_desc;
954 		rxr = na->num_rx_rings;
955 		rxd = na->num_rx_desc;
956 	}
957 
958 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
959 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
960 		return 0; /* nothing changed */
961 	if (netmap_verbose || na->refcount > 0) {
962 		D("stored config %s: txring %d x %d, rxring %d x %d",
963 			ifp->if_xname,
964 			na->num_tx_rings, na->num_tx_desc,
965 			na->num_rx_rings, na->num_rx_desc);
966 		D("new config %s: txring %d x %d, rxring %d x %d",
967 			ifp->if_xname, txr, txd, rxr, rxd);
968 	}
969 	if (na->refcount == 0) {
970 		D("configuration changed (but fine)");
971 		na->num_tx_rings = txr;
972 		na->num_tx_desc = txd;
973 		na->num_rx_rings = rxr;
974 		na->num_rx_desc = rxd;
975 		return 0;
976 	}
977 	D("configuration changed while active, this is bad...");
978 	return 1;
979 }
980 
981 static struct netmap_if *
982 netmap_if_new(const char *ifname, struct netmap_adapter *na)
983 {
984 	if (netmap_update_config(na)) {
985 		/* configuration mismatch, report and fail */
986 		return NULL;
987 	}
988 	return netmap_mem_if_new(ifname, na);
989 }
990 
991 
992 /* Structure associated to each thread which registered an interface.
993  *
994  * The first 4 fields of this structure are written by NIOCREGIF and
995  * read by poll() and NIOC?XSYNC.
996  * There is low contention among writers (actually, a correct user program
997  * should have no contention among writers) and among writers and readers,
998  * so we use a single global lock to protect the structure initialization.
999  * Since initialization involves the allocation of memory, we reuse the memory
1000  * allocator lock.
1001  * Read access to the structure is lock free. Readers must check that
1002  * np_nifp is not NULL before using the other fields.
1003  * If np_nifp is NULL initialization has not been performed, so they should
1004  * return an error to userlevel.
1005  *
1006  * The ref_done field is used to regulate access to the refcount in the
1007  * memory allocator. The refcount must be incremented at most once for
1008  * each open("/dev/netmap"). The increment is performed by the first
1009  * function that calls netmap_get_memory() (currently called by
1010  * mmap(), NIOCGINFO and NIOCREGIF).
1011  * If the refcount is incremented, it is then decremented when the
1012  * private structure is destroyed.
1013  */
1014 struct netmap_priv_d {
1015 	struct netmap_if * volatile np_nifp;	/* netmap if descriptor. */
1016 
1017 	struct ifnet	*np_ifp;	/* device for which we hold a ref. */
1018 	int		np_ringid;	/* from the ioctl */
1019 	u_int		np_qfirst, np_qlast;	/* range of rings to scan */
1020 	uint16_t	np_txpoll;
1021 
1022 	struct netmap_mem_d *np_mref;	/* use with NMG_LOCK held */
1023 #ifdef __FreeBSD__
1024 	int		np_refcount;	/* use with NMG_LOCK held */
1025 #endif /* __FreeBSD__ */
1026 };
1027 
1028 /* grab a reference to the memory allocator, if we don't have one already.  The
1029  * reference is taken from the netmap_adapter registered with the priv.
1030  *
1031  */
1032 static int
1033 netmap_get_memory_locked(struct netmap_priv_d* p)
1034 {
1035 	struct netmap_mem_d *nmd;
1036 	int error = 0;
1037 
1038 	if (p->np_ifp == NULL) {
1039 		if (!netmap_mmap_unreg)
1040 			return ENODEV;
1041 		/* for compatibility with older versions of the API
1042  		 * we use the global allocator when no interface has been
1043  		 * registered
1044  		 */
1045 		nmd = &nm_mem;
1046 	} else {
1047 		nmd = NA(p->np_ifp)->nm_mem;
1048 	}
1049 	if (p->np_mref == NULL) {
1050 		error = netmap_mem_finalize(nmd);
1051 		if (!error)
1052 			p->np_mref = nmd;
1053 	} else if (p->np_mref != nmd) {
1054 		/* a virtual port has been registered, but previous
1055  		 * syscalls already used the global allocator.
1056  		 * We cannot continue
1057  		 */
1058 		error = ENODEV;
1059 	}
1060 	return error;
1061 }
1062 
1063 static int
1064 netmap_get_memory(struct netmap_priv_d* p)
1065 {
1066 	int error;
1067 	NMG_LOCK();
1068 	error = netmap_get_memory_locked(p);
1069 	NMG_UNLOCK();
1070 	return error;
1071 }
1072 
1073 static int
1074 netmap_have_memory_locked(struct netmap_priv_d* p)
1075 {
1076 	return p->np_mref != NULL;
1077 }
1078 
1079 static void
1080 netmap_drop_memory_locked(struct netmap_priv_d* p)
1081 {
1082 	if (p->np_mref) {
1083 		netmap_mem_deref(p->np_mref);
1084 		p->np_mref = NULL;
1085 	}
1086 }
1087 
1088 /*
1089  * File descriptor's private data destructor.
1090  *
1091  * Call nm_register(ifp,0) to stop netmap mode on the interface and
1092  * revert to normal operation. We expect that np_ifp has not gone away.
1093  * The second argument is the nifp to work on. In some cases it is
1094  * not attached yet to the netmap_priv_d so we need to pass it as
1095  * a separate argument.
1096  */
1097 /* call with NMG_LOCK held */
1098 static void
1099 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
1100 {
1101 	struct ifnet *ifp = priv->np_ifp;
1102 	struct netmap_adapter *na = NA(ifp);
1103 
1104 	NMG_LOCK_ASSERT();
1105 	na->refcount--;
1106 	if (na->refcount <= 0) {	/* last instance */
1107 		u_int i;
1108 
1109 		if (netmap_verbose)
1110 			D("deleting last instance for %s", ifp->if_xname);
1111 		/*
1112 		 * (TO CHECK) This function is only called
1113 		 * when the last reference to this file descriptor goes
1114 		 * away. This means we cannot have any pending poll()
1115 		 * or interrupt routine operating on the structure.
1116 		 * XXX The file may be closed in a thread while
1117 		 * another thread is using it.
1118 		 * Linux keeps the file opened until the last reference
1119 		 * by any outstanding ioctl/poll or mmap is gone.
1120 		 * FreeBSD does not track mmap()s (but we do) and
1121 		 * wakes up any sleeping poll(). Need to check what
1122 		 * happens if the close() occurs while a concurrent
1123 		 * syscall is running.
1124 		 */
1125 		na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
1126 		/* Wake up any sleeping threads. netmap_poll will
1127 		 * then return POLLERR
1128 		 * XXX The wake up now must happen during *_down(), when
1129 		 * we order all activities to stop. -gl
1130 		 */
1131 		nm_free_bdgfwd(na);
1132 		for (i = 0; i < na->num_tx_rings + 1; i++) {
1133 			mtx_destroy(&na->tx_rings[i].q_lock);
1134 		}
1135 		for (i = 0; i < na->num_rx_rings + 1; i++) {
1136 			mtx_destroy(&na->rx_rings[i].q_lock);
1137 		}
1138 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
1139 		/* knlist_destroy(&na->tx_si.si_note); */
1140 		/* knlist_destroy(&na->rx_si.si_note); */
1141 		if (nma_is_hw(na))
1142 			SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL;
1143 	}
1144 	/*
1145 	 * netmap_mem_if_delete() deletes the nifp, and if this is
1146 	 * the last instance also buffers, rings and krings.
1147 	 */
1148 	netmap_mem_if_delete(na, nifp);
1149 }
1150 
1151 
1152 /* we assume netmap adapter exists
1153  * Called with NMG_LOCK held
1154  */
1155 static void
1156 nm_if_rele(struct ifnet *ifp)
1157 {
1158 	int i, is_hw, hw, sw, lim;
1159 	struct nm_bridge *b;
1160 	struct netmap_adapter *na;
1161 	uint8_t tmp[NM_BDG_MAXPORTS];
1162 
1163 	NMG_LOCK_ASSERT();
1164 	/* This can be called not only for get_ifp()-ed references where netmap's
1165 	 * capability is guaranteed, but also for non-netmap-capable NICs.
1166 	 */
1167 	if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) {
1168 		if_rele(ifp);
1169 		return;
1170 	}
1171 	na = NA(ifp);
1172 	b = na->na_bdg;
1173 	is_hw = nma_is_hw(na);
1174 
1175 	ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount);
1176 
1177 	if (!DROP_BDG_REF(ifp))
1178 		return;
1179 
1180 	/*
1181 	New algorithm:
1182 	make a copy of bdg_port_index;
1183 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
1184 	in the array of bdg_port_index, replacing them with
1185 	entries from the bottom of the array;
1186 	decrement bdg_active_ports;
1187 	acquire BDG_WLOCK() and copy back the array.
1188 	 */
1189 
1190 	hw = NA(ifp)->bdg_port;
1191 	sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1;
1192 	lim = b->bdg_active_ports;
1193 
1194 	ND("detach %d and %d (lim %d)", hw, sw, lim);
1195 	/* make a copy of the list of active ports, update it,
1196 	 * and then copy back within BDG_WLOCK().
1197 	 */
1198 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
1199 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
1200 		if (hw >= 0 && tmp[i] == hw) {
1201 			ND("detach hw %d at %d", hw, i);
1202 			lim--; /* point to last active port */
1203 			tmp[i] = tmp[lim]; /* swap with i */
1204 			tmp[lim] = hw;	/* now this is inactive */
1205 			hw = -1;
1206 		} else if (sw >= 0 && tmp[i] == sw) {
1207 			ND("detach sw %d at %d", sw, i);
1208 			lim--;
1209 			tmp[i] = tmp[lim];
1210 			tmp[lim] = sw;
1211 			sw = -1;
1212 		} else {
1213 			i++;
1214 		}
1215 	}
1216 	if (hw >= 0 || sw >= 0) {
1217 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
1218 	}
1219 	hw = NA(ifp)->bdg_port;
1220 	sw = (is_hw && SWNA(ifp)->na_bdg) ?  SWNA(ifp)->bdg_port : -1;
1221 
1222 	BDG_WLOCK(b);
1223 	b->bdg_ports[hw] = NULL;
1224 	na->na_bdg = NULL;
1225 	if (sw >= 0) {
1226 		b->bdg_ports[sw] = NULL;
1227 		SWNA(ifp)->na_bdg = NULL;
1228 	}
1229 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
1230 	b->bdg_active_ports = lim;
1231 	BDG_WUNLOCK(b);
1232 
1233 	ND("now %d active ports", lim);
1234 	if (lim == 0) {
1235 		ND("marking bridge %s as free", b->bdg_basename);
1236 		b->nm_bdg_lookup = NULL;
1237 	}
1238 
1239 	if (is_hw) {
1240 		if_rele(ifp);
1241 	} else {
1242 		if (na->na_flags & NAF_MEM_OWNER)
1243 			netmap_mem_private_delete(na->nm_mem);
1244 		bzero(na, sizeof(*na));
1245 		free(na, M_DEVBUF);
1246 		bzero(ifp, sizeof(*ifp));
1247 		free(ifp, M_DEVBUF);
1248 	}
1249 }
1250 
1251 
1252 /*
1253  * returns 1 if this is the last instance and we can free priv
1254  */
1255 static int
1256 netmap_dtor_locked(struct netmap_priv_d *priv)
1257 {
1258 	struct ifnet *ifp = priv->np_ifp;
1259 
1260 #ifdef __FreeBSD__
1261 	/*
1262 	 * np_refcount is the number of active mmaps on
1263 	 * this file descriptor
1264 	 */
1265 	if (--priv->np_refcount > 0) {
1266 		return 0;
1267 	}
1268 #endif /* __FreeBSD__ */
1269 	if (ifp) {
1270 		netmap_do_unregif(priv, priv->np_nifp);
1271 	}
1272 	netmap_drop_memory_locked(priv);
1273 	if (ifp) {
1274 		nm_if_rele(ifp); /* might also destroy *na */
1275 	}
1276 	return 1;
1277 }
1278 
1279 static void
1280 netmap_dtor(void *data)
1281 {
1282 	struct netmap_priv_d *priv = data;
1283 	int last_instance;
1284 
1285 	NMG_LOCK();
1286 	last_instance = netmap_dtor_locked(priv);
1287 	NMG_UNLOCK();
1288 	if (last_instance) {
1289 		bzero(priv, sizeof(*priv));	/* for safety */
1290 		free(priv, M_DEVBUF);
1291 	}
1292 }
1293 
1294 
1295 #ifdef __FreeBSD__
1296 
1297 /*
1298  * In order to track whether pages are still mapped, we hook into
1299  * the standard cdev_pager and intercept the constructor and
1300  * destructor.
1301  */
1302 
1303 struct netmap_vm_handle_t {
1304 	struct cdev 		*dev;
1305 	struct netmap_priv_d	*priv;
1306 };
1307 
1308 static int
1309 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
1310     vm_ooffset_t foff, struct ucred *cred, u_short *color)
1311 {
1312 	struct netmap_vm_handle_t *vmh = handle;
1313 	D("handle %p size %jd prot %d foff %jd",
1314 		handle, (intmax_t)size, prot, (intmax_t)foff);
1315 	dev_ref(vmh->dev);
1316 	return 0;
1317 }
1318 
1319 
1320 static void
1321 netmap_dev_pager_dtor(void *handle)
1322 {
1323 	struct netmap_vm_handle_t *vmh = handle;
1324 	struct cdev *dev = vmh->dev;
1325 	struct netmap_priv_d *priv = vmh->priv;
1326 	D("handle %p", handle);
1327 	netmap_dtor(priv);
1328 	free(vmh, M_DEVBUF);
1329 	dev_rel(dev);
1330 }
1331 
1332 static int
1333 netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
1334 	int prot, vm_page_t *mres)
1335 {
1336 	struct netmap_vm_handle_t *vmh = object->handle;
1337 	struct netmap_priv_d *priv = vmh->priv;
1338 	vm_paddr_t paddr;
1339 	vm_page_t page;
1340 	vm_memattr_t memattr;
1341 	vm_pindex_t pidx;
1342 
1343 	ND("object %p offset %jd prot %d mres %p",
1344 			object, (intmax_t)offset, prot, mres);
1345 	memattr = object->memattr;
1346 	pidx = OFF_TO_IDX(offset);
1347 	paddr = netmap_mem_ofstophys(priv->np_mref, offset);
1348 	if (paddr == 0)
1349 		return VM_PAGER_FAIL;
1350 
1351 	if (((*mres)->flags & PG_FICTITIOUS) != 0) {
1352 		/*
1353 		 * If the passed in result page is a fake page, update it with
1354 		 * the new physical address.
1355 		 */
1356 		page = *mres;
1357 		vm_page_updatefake(page, paddr, memattr);
1358 	} else {
1359 		/*
1360 		 * Replace the passed in reqpage page with our own fake page and
1361 		 * free up all of the original pages.
1362 		 */
1363 #ifndef VM_OBJECT_WUNLOCK	/* FreeBSD < 10.x */
1364 #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK
1365 #define VM_OBJECT_WLOCK	VM_OBJECT_LOCK
1366 #endif /* VM_OBJECT_WUNLOCK */
1367 
1368 		VM_OBJECT_WUNLOCK(object);
1369 		page = vm_page_getfake(paddr, memattr);
1370 		VM_OBJECT_WLOCK(object);
1371 		vm_page_lock(*mres);
1372 		vm_page_free(*mres);
1373 		vm_page_unlock(*mres);
1374 		*mres = page;
1375 		vm_page_insert(page, object, pidx);
1376 	}
1377 	page->valid = VM_PAGE_BITS_ALL;
1378 	return (VM_PAGER_OK);
1379 }
1380 
1381 
1382 static struct cdev_pager_ops netmap_cdev_pager_ops = {
1383         .cdev_pg_ctor = netmap_dev_pager_ctor,
1384         .cdev_pg_dtor = netmap_dev_pager_dtor,
1385         .cdev_pg_fault = netmap_dev_pager_fault,
1386 };
1387 
1388 
1389 static int
1390 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
1391 	vm_size_t objsize,  vm_object_t *objp, int prot)
1392 {
1393 	int error;
1394 	struct netmap_vm_handle_t *vmh;
1395 	struct netmap_priv_d *priv;
1396 	vm_object_t obj;
1397 
1398 	D("cdev %p foff %jd size %jd objp %p prot %d", cdev,
1399 	    (intmax_t )*foff, (intmax_t )objsize, objp, prot);
1400 
1401 	vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF,
1402 			      M_NOWAIT | M_ZERO);
1403 	if (vmh == NULL)
1404 		return ENOMEM;
1405 	vmh->dev = cdev;
1406 
1407 	NMG_LOCK();
1408 	error = devfs_get_cdevpriv((void**)&priv);
1409 	if (error)
1410 		goto err_unlock;
1411 	vmh->priv = priv;
1412 	priv->np_refcount++;
1413 	NMG_UNLOCK();
1414 
1415 	error = netmap_get_memory(priv);
1416 	if (error)
1417 		goto err_deref;
1418 
1419 	obj = cdev_pager_allocate(vmh, OBJT_DEVICE,
1420 		&netmap_cdev_pager_ops, objsize, prot,
1421 		*foff, NULL);
1422 	if (obj == NULL) {
1423 		D("cdev_pager_allocate failed");
1424 		error = EINVAL;
1425 		goto err_deref;
1426 	}
1427 
1428 	*objp = obj;
1429 	return 0;
1430 
1431 err_deref:
1432 	NMG_LOCK();
1433 	priv->np_refcount--;
1434 err_unlock:
1435 	NMG_UNLOCK();
1436 // err:
1437 	free(vmh, M_DEVBUF);
1438 	return error;
1439 }
1440 
1441 
1442 // XXX can we remove this ?
1443 static int
1444 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
1445 {
1446 	if (netmap_verbose)
1447 		D("dev %p fflag 0x%x devtype %d td %p",
1448 			dev, fflag, devtype, td);
1449 	return 0;
1450 }
1451 
1452 
1453 static int
1454 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
1455 {
1456 	struct netmap_priv_d *priv;
1457 	int error;
1458 
1459 	(void)dev;
1460 	(void)oflags;
1461 	(void)devtype;
1462 	(void)td;
1463 
1464 	// XXX wait or nowait ?
1465 	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
1466 			      M_NOWAIT | M_ZERO);
1467 	if (priv == NULL)
1468 		return ENOMEM;
1469 
1470 	error = devfs_set_cdevpriv(priv, netmap_dtor);
1471 	if (error)
1472 	        return error;
1473 
1474 	priv->np_refcount = 1;
1475 
1476 	return 0;
1477 }
1478 #endif /* __FreeBSD__ */
1479 
1480 
1481 /*
1482  * Handlers for synchronization of the queues from/to the host.
1483  * Netmap has two operating modes:
1484  * - in the default mode, the rings connected to the host stack are
1485  *   just another ring pair managed by userspace;
1486  * - in transparent mode (XXX to be defined) incoming packets
1487  *   (from the host or the NIC) are marked as NS_FORWARD upon
1488  *   arrival, and the user application has a chance to reset the
1489  *   flag for packets that should be dropped.
1490  *   On the RXSYNC or poll(), packets in RX rings between
1491  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1492  *   to the other side.
1493  * The transfer NIC --> host is relatively easy, just encapsulate
1494  * into mbufs and we are done. The host --> NIC side is slightly
1495  * harder because there might not be room in the tx ring so it
1496  * might take a while before releasing the buffer.
1497  */
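
/*
 * A simplified user-side sketch of the transparent mode handling
 * described above (illustrative only: "host_ring" and want_to_forward()
 * are placeholder names, fd/nifp are as in the sketch in the header
 * comment, and the host rings must be within the fd's ring range):
 *
 *	struct netmap_ring *r = NETMAP_RXRING(nifp, host_ring);
 *	uint32_t cur = r->cur;
 *
 *	while (r->avail > 0) {
 *		struct netmap_slot *slot = &r->slot[cur];
 *
 *		if (!want_to_forward(slot))	// user policy: drop this one
 *			slot->flags &= ~NS_FORWARD;
 *		cur = NETMAP_RING_NEXT(r, cur);
 *		r->avail--;
 *	}
 *	r->cur = cur;
 *	// on the next NIOCRXSYNC/poll(), released slots still flagged
 *	// NS_FORWARD are passed to the other side (netmap_grab_packets())
 *	ioctl(fd, NIOCRXSYNC, NULL);
 */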
1498 
1499 
1500 /*
1501  * pass a chain of buffers to the host stack as coming from 'dst'
1502  */
1503 static void
1504 netmap_send_up(struct ifnet *dst, struct mbuf *head)
1505 {
1506 	struct mbuf *m;
1507 
1508 	/* send packets up, outside the lock */
1509 	while ((m = head) != NULL) {
1510 		head = head->m_nextpkt;
1511 		m->m_nextpkt = NULL;
1512 		if (netmap_verbose & NM_VERB_HOST)
1513 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1514 		NM_SEND_UP(dst, m);
1515 	}
1516 }
1517 
1518 struct mbq {
1519 	struct mbuf *head;
1520 	struct mbuf *tail;
1521 	int count;
1522 };
1523 
1524 
1525 /*
1526  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1527  * Run from hwcur to cur - reserved
1528  */
1529 static void
1530 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1531 {
1532 	/* Take packets from hwcur to cur-reserved and pass them up.
1533 	 * In case of no buffers we give up. At the end of the loop,
1534 	 * the queue is drained in all cases.
1535 	 * XXX handle reserved
1536 	 */
1537 	u_int lim = kring->nkr_num_slots - 1;
1538 	struct mbuf *m, *tail = q->tail;
1539 	u_int k = kring->ring->cur, n = kring->ring->reserved;
1540 	struct netmap_mem_d *nmd = kring->na->nm_mem;
1541 
1542 	/* compute the final position, ring->cur - ring->reserved */
1543 	if (n > 0) {
1544 		if (k < n)
1545 			k += kring->nkr_num_slots;
1546 		k += n;
1547 	}
1548 	for (n = kring->nr_hwcur; n != k;) {
1549 		struct netmap_slot *slot = &kring->ring->slot[n];
1550 
1551 		n = nm_next(n, lim);
1552 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1553 			continue;
1554 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) {
1555 			D("bad pkt at %d len %d", n, slot->len);
1556 			continue;
1557 		}
1558 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1559 		/* XXX adapt to the case of a multisegment packet */
1560 		m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL);
1561 
1562 		if (m == NULL)
1563 			break;
1564 		if (tail)
1565 			tail->m_nextpkt = m;
1566 		else
1567 			q->head = m;
1568 		tail = m;
1569 		q->count++;
1570 		m->m_nextpkt = NULL;
1571 	}
1572 	q->tail = tail;
1573 }
1574 
1575 
1576 /*
1577  * The host ring has packets from nr_hwcur to (cur - reserved)
1578  * to be sent down to the NIC.
1579  * We need to use the queue lock on the source (host RX ring)
1580  * to protect against netmap_transmit.
1581  * If the user is well behaved we do not need to acquire locks
1582  * on the destination(s),
1583  * so we only need to make sure that there are no panics because
1584  * of user errors.
1585  * XXX verify
1586  *
1587  * We scan the tx rings, which have just been
1588  * flushed so nr_hwcur == cur. Pushing packets down means
1589  * increment cur and decrement avail.
1590  * XXX to be verified
1591  */
1592 static void
1593 netmap_sw_to_nic(struct netmap_adapter *na)
1594 {
1595 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1596 	struct netmap_kring *k1 = &na->tx_rings[0];
1597 	u_int i, howmany, src_lim, dst_lim;
1598 
1599 	/* XXX we should also check that the carrier is on */
1600 	if (kring->nkr_stopped)
1601 		return;
1602 
1603 	mtx_lock(&kring->q_lock);
1604 
1605 	if (kring->nkr_stopped)
1606 		goto out;
1607 
1608 	howmany = kring->nr_hwavail;	/* XXX otherwise cur - reserved - nr_hwcur */
1609 
1610 	src_lim = kring->nkr_num_slots - 1;
1611 	for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
1612 		ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
1613 		dst_lim = k1->nkr_num_slots - 1;
1614 		while (howmany > 0 && k1->ring->avail > 0) {
1615 			struct netmap_slot *src, *dst, tmp;
1616 			src = &kring->ring->slot[kring->nr_hwcur];
1617 			dst = &k1->ring->slot[k1->ring->cur];
1618 			tmp = *src;
1619 			src->buf_idx = dst->buf_idx;
1620 			src->flags = NS_BUF_CHANGED;
1621 
1622 			dst->buf_idx = tmp.buf_idx;
1623 			dst->len = tmp.len;
1624 			dst->flags = NS_BUF_CHANGED;
1625 			ND("out len %d buf %d from %d to %d",
1626 				dst->len, dst->buf_idx,
1627 				kring->nr_hwcur, k1->ring->cur);
1628 
1629 			kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
1630 			howmany--;
1631 			kring->nr_hwavail--;
1632 			k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
1633 			k1->ring->avail--;
1634 		}
1635 		kring->ring->cur = kring->nr_hwcur; // XXX
1636 		k1++; // XXX why?
1637 	}
1638 out:
1639 	mtx_unlock(&kring->q_lock);
1640 }
1641 
1642 
1643 /*
1644  * netmap_txsync_to_host() passes packets up. We are called from a
1645  * system call in user process context, and the only contention
1646  * can be among multiple user threads erroneously calling
1647  * this routine concurrently.
1648  */
1649 static void
1650 netmap_txsync_to_host(struct netmap_adapter *na)
1651 {
1652 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1653 	struct netmap_ring *ring = kring->ring;
1654 	u_int k, lim = kring->nkr_num_slots - 1;
1655 	struct mbq q = { NULL, NULL, 0 };
1656 
1657 	if (nm_kr_tryget(kring)) {
1658 		D("ring %p busy (user error)", kring);
1659 		return;
1660 	}
1661 	k = ring->cur;
1662 	if (k > lim) {
1663 		D("invalid ring index in stack TX kring %p", kring);
1664 		netmap_ring_reinit(kring);
1665 		nm_kr_put(kring);
1666 		return;
1667 	}
1668 
1669 	/* Take packets from hwcur to cur and pass them up.
1670 	 * In case of no buffers we give up. At the end of the loop,
1671 	 * the queue is drained in all cases.
1672 	 */
1673 	netmap_grab_packets(kring, &q, 1);
1674 	kring->nr_hwcur = k;
1675 	kring->nr_hwavail = ring->avail = lim;
1676 
1677 	nm_kr_put(kring);
1678 	netmap_send_up(na->ifp, q.head);
1679 }
1680 
1681 
1682 /*
1683  * This is the 'txsync' handler to send from a software ring to the
1684  * host stack.
1685  */
1686 /* SWNA(ifp)->tx_rings[0] is always NA(ifp)->tx_rings[NA(ifp)->num_tx_rings] */
1687 static int
1688 netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags)
1689 {
1690 	(void)ring_nr;
1691 	(void)flags;
1692 	if (netmap_verbose > 255)
1693 		RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr);
1694 	netmap_txsync_to_host(NA(ifp));
1695 	return 0;
1696 }
1697 
1698 
1699 /*
1700  * rxsync backend for packets coming from the host stack.
1701  * They have been put in the queue by netmap_transmit() so we
1702  * need to protect access to the kring using a lock.
1703  *
1704  * This routine also does the selrecord if called from the poll handler
1705  * (we know because td != NULL).
1706  *
1707  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1708  *     as an additional hidden argument.
1709  */
1710 static void
1711 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1712 {
1713 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1714 	struct netmap_ring *ring = kring->ring;
1715 	u_int j, n, lim = kring->nkr_num_slots;
1716 	u_int k = ring->cur, resvd = ring->reserved;
1717 
1718 	(void)pwait;	/* disable unused warnings */
1719 
1720 	if (kring->nkr_stopped) /* check a first time without lock */
1721 		return;
1722 
1723 	/* XXX as an optimization we could reuse na->core_lock */
1724 	mtx_lock(&kring->q_lock);
1725 
1726 	if (kring->nkr_stopped)  /* check again with lock held */
1727 		goto unlock_out;
1728 
1729 	if (k >= lim) {
1730 		netmap_ring_reinit(kring);
1731 		goto unlock_out;
1732 	}
1733 	/* new packets are already set in nr_hwavail */
1734 	/* skip past packets that userspace has released */
1735 	j = kring->nr_hwcur;
1736 	if (resvd > 0) {
1737 		if (resvd + ring->avail >= lim + 1) {
1738 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
1739 			ring->reserved = resvd = 0; // XXX panic...
1740 		}
1741 		k = (k >= resvd) ? k - resvd : k + lim - resvd;
1742         }
1743 	if (j != k) {
1744 		n = k >= j ? k - j : k + lim - j;
1745 		kring->nr_hwavail -= n;
1746 		kring->nr_hwcur = k;
1747 	}
1748 	k = ring->avail = kring->nr_hwavail - resvd;
1749 	if (k == 0 && td)
1750 		selrecord(td, &kring->si);
1751 	if (k && (netmap_verbose & NM_VERB_HOST))
1752 		D("%d pkts from stack", k);
1753 unlock_out:
1754 
1755 	mtx_unlock(&kring->q_lock);
1756 }
1757 
1758 
1759 /*
1760  * MUST BE CALLED UNDER NMG_LOCK()
1761  *
1762  * get a refcounted reference to an interface.
1763  * This is always called in the execution of an ioctl().
1764  *
1765  * Return ENXIO if the interface does not exist, EINVAL if netmap
1766  * is not supported by the interface.
1767  * If successful, hold a reference.
1768  *
1769  * When the NIC is attached to a bridge, reference is managed
1770  * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
1771  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1772  * is detached from the bridge, then ifp's refcount is dropped (this
1773  * is equivalent to destroying the ifp, in the case of virtual ports).
1774  *
1775  * This function uses if_rele() when we want to prevent the NIC from
1776  * being detached from the bridge in error handling.  But once refcount
1777  * is acquired by this function, it must be released using nm_if_rele().
1778  */
1779 static int
1780 get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create)
1781 {
1782 	const char *name = nmr->nr_name;
1783 	int namelen = strlen(name);
1784 	struct ifnet *iter = NULL;
1785 	int no_prefix = 0;
1786 
1787 	/* first try to see if this is a bridge port. */
1788 	struct nm_bridge *b;
1789 	struct netmap_adapter *na;
1790 	int i, j, cand = -1, cand2 = -1;
1791 	int needed;
1792 
1793 	NMG_LOCK_ASSERT();
1794 	*ifp = NULL;	/* default */
1795 	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
1796 		no_prefix = 1;	/* no VALE prefix */
1797 		goto no_bridge_port;
1798 	}
1799 
1800 	b = nm_find_bridge(name, create);
1801 	if (b == NULL) {
1802 		D("no bridges available for '%s'", name);
1803 		return (ENXIO);
1804 	}
1805 
1806 	/* Now that we are sure the name starts with the bridge's name,
1807 	 * look up the port in the bridge. We need to scan the entire
1808 	 * list. It is not important to hold a WLOCK on the bridge
1809 	 * during the search because NMG_LOCK already guarantees
1810 	 * that there are no other possible writers.
1811 	 */
1812 
1813 	/* lookup in the local list of ports */
1814 	for (j = 0; j < b->bdg_active_ports; j++) {
1815 		i = b->bdg_port_index[j];
1816 		na = b->bdg_ports[i];
1817 		// KASSERT(na != NULL);
1818 		iter = na->ifp;
1819 		/* XXX make sure the name only contains one : */
1820 		if (!strcmp(iter->if_xname, name) /* virtual port */ ||
1821 		    (namelen > b->bdg_namelen && !strcmp(iter->if_xname,
1822 		    name + b->bdg_namelen + 1)) /* NIC */) {
1823 			ADD_BDG_REF(iter);
1824 			ND("found existing if %s refs %d", name,
1825 				NA(iter)->na_bdg_refcount);
1826 			*ifp = iter;
1827 			/* we are done, this is surely netmap capable */
1828 			return 0;
1829 		}
1830 	}
1831 	/* not found, should we create it? */
1832 	if (!create)
1833 		return ENXIO;
1834 	/* yes we should, see if we have space to attach entries */
1835 	needed = 2; /* in some cases we only need 1 */
1836 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
1837 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
1838 		return EINVAL;
1839 	}
1840 	/* record the next two ports available, but do not allocate yet */
1841 	cand = b->bdg_port_index[b->bdg_active_ports];
1842 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
1843 	ND("+++ bridge %s port %s used %d avail %d %d",
1844 		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
1845 
1846 	/*
1847 	 * try to see if there is a matching NIC with this name
1848 	 * (after the bridge's name)
1849 	 */
1850 	iter = ifunit_ref(name + b->bdg_namelen + 1);
1851 	if (!iter) { /* this is a virtual port */
1852 		/* Create a temporary NA with arguments, then
1853 		 * bdg_netmap_attach() will allocate the real one
1854 		 * and attach it to the ifp
1855 		 */
1856 		struct netmap_adapter tmp_na;
1857 		int error;
1858 
1859 		if (nmr->nr_cmd) {
1860 			/* nr_cmd must be 0 for a virtual port */
1861 			return EINVAL;
1862 		}
1863 		bzero(&tmp_na, sizeof(tmp_na));
1864 		/* bound checking */
1865 		tmp_na.num_tx_rings = nmr->nr_tx_rings;
1866 		nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1867 		nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back
1868 		tmp_na.num_rx_rings = nmr->nr_rx_rings;
1869 		nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1870 		nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back
1871 		nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1872 				1, NM_BDG_MAXSLOTS, NULL);
1873 		tmp_na.num_tx_desc = nmr->nr_tx_slots;
1874 		nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1875 				1, NM_BDG_MAXSLOTS, NULL);
1876 		tmp_na.num_rx_desc = nmr->nr_rx_slots;
1877 
1878 		/* create a struct ifnet for the new port.
1879 		 * need M_NOWAIT as we are under NMG_LOCK
1880 		 */
1881 		iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO);
1882 		if (!iter)
1883 			return ENOMEM;
1884 
1885 		strcpy(iter->if_xname, name);
1886 		tmp_na.ifp = iter;
1887 		/* bdg_netmap_attach creates a struct netmap_adapter */
1888 		error = bdg_netmap_attach(&tmp_na);
1889 		if (error) {
1890 			D("error %d", error);
1891 			free(iter, M_DEVBUF);
1892 			return error;
1893 		}
1894 		cand2 = -1;	/* only need one port */
1895 	} else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */
1896 		/* make sure the NIC is not already in use */
1897 		if (NETMAP_OWNED_BY_ANY(iter)) {
1898 			D("NIC %s busy, cannot attach to bridge",
1899 				iter->if_xname);
1900 			if_rele(iter); /* don't detach from bridge */
1901 			return EINVAL;
1902 		}
1903 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
1904 			cand2 = -1; /* only need one port */
1905 	} else { /* not a netmap-capable NIC */
1906 		if_rele(iter); /* don't detach from bridge */
1907 		return EINVAL;
1908 	}
1909 	na = NA(iter);
1910 
1911 	BDG_WLOCK(b);
1912 	na->bdg_port = cand;
1913 	ND("NIC  %p to bridge port %d", NA(iter), cand);
1914 	/* bind the port to the bridge (virtual ports are not active) */
1915 	b->bdg_ports[cand] = na;
1916 	na->na_bdg = b;
1917 	b->bdg_active_ports++;
1918 	if (cand2 >= 0) {
1919 		/* also bind the host stack to the bridge */
1920 		b->bdg_ports[cand2] = SWNA(iter);
1921 		SWNA(iter)->bdg_port = cand2;
1922 		SWNA(iter)->na_bdg = b;
1923 		b->bdg_active_ports++;
1924 		ND("host %p to bridge port %d", SWNA(iter), cand2);
1925 	}
1926 	ADD_BDG_REF(iter);	// XXX one or two ?
1927 	ND("if %s refs %d", name, NA(iter)->na_bdg_refcount);
1928 	BDG_WUNLOCK(b);
1929 	*ifp = iter;
1930 	return 0;
1931 
1932 no_bridge_port:
1933 	*ifp = iter;
1934 	if (! *ifp)
1935 		*ifp = ifunit_ref(name);
1936 	if (*ifp == NULL)
1937 		return (ENXIO);
1938 
1939 	if (NETMAP_CAPABLE(*ifp)) {
1940 		/* Users cannot use the NIC attached to a bridge directly */
1941 		if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) {
1942 			if_rele(*ifp); /* don't detach from bridge */
1943 			return EINVAL;
1944 		} else
1945 			return 0;	/* valid pointer, we hold the refcount */
1946 	}
1947 	nm_if_rele(*ifp);
1948 	return EINVAL;	// not NETMAP capable
1949 }
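
/*
 * Example (sketch only, not compiled with this module): how a request
 * might name the ports looked up above, assuming the usual "vale"
 * prefix (NM_NAME) described in netmap(4).  The bridge and port names
 * are made up for illustration.
 */
#if 0	/* illustrative sketch */
	struct nmreq nmr;

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	/* a virtual port: bridge name, ':', port name */
	strncpy(nmr.nr_name, "vale0:vport0", sizeof(nmr.nr_name) - 1);
	/* or a NIC to be attached to the bridge, same syntax: */
	/* strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name) - 1); */
#endif	/* illustrative sketch */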
1950 
1951 
1952 /*
1953  * Error routine called when txsync/rxsync detects an error.
1954  * Can't do much more than resetting cur = hwcur, avail = hwavail.
1955  * Return 1 on reinit.
1956  *
1957  * This routine is only called by the upper half of the kernel.
1958  * It only reads hwcur (which is changed only by the upper half, too)
1959  * and hwavail (which may be changed by the lower half, but only on
1960  * a tx ring and only to increase it, so any error will be recovered
1961  * on the next call). For the above, we don't strictly need to call
1962  * on the next call). For the above reasons, we don't strictly need
1963  * to call it under lock.
1964 int
1965 netmap_ring_reinit(struct netmap_kring *kring)
1966 {
1967 	struct netmap_ring *ring = kring->ring;
1968 	u_int i, lim = kring->nkr_num_slots - 1;
1969 	int errors = 0;
1970 
1971 	// XXX KASSERT nm_kr_tryget
1972 	RD(10, "called for %s", kring->na->ifp->if_xname);
1973 	if (ring->cur > lim)
1974 		errors++;
1975 	for (i = 0; i <= lim; i++) {
1976 		u_int idx = ring->slot[i].buf_idx;
1977 		u_int len = ring->slot[i].len;
1978 		if (idx < 2 || idx >= netmap_total_buffers) {
1979 			if (!errors++)
1980 				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
1981 			ring->slot[i].buf_idx = 0;
1982 			ring->slot[i].len = 0;
1983 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1984 			ring->slot[i].len = 0;
1985 			if (!errors++)
1986 				D("bad len %d at slot %d idx %d",
1987 					len, i, idx);
1988 		}
1989 	}
1990 	if (errors) {
1991 		int pos = kring - kring->na->tx_rings;
1992 		int n = kring->na->num_tx_rings + 1;
1993 
1994 		RD(10, "total %d errors", errors);
1995 		errors++;
1996 		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1997 			kring->na->ifp->if_xname,
1998 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
1999 			ring->cur, kring->nr_hwcur,
2000 			ring->avail, kring->nr_hwavail);
2001 		ring->cur = kring->nr_hwcur;
2002 		ring->avail = kring->nr_hwavail;
2003 	}
2004 	return (errors ? 1 : 0);
2005 }
2006 
2007 
2008 /*
2009  * Set the ring ID. For devices with a single queue, a request
2010  * for all rings is the same as a single ring.
2011  */
2012 static int
2013 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
2014 {
2015 	struct ifnet *ifp = priv->np_ifp;
2016 	struct netmap_adapter *na = NA(ifp);
2017 	u_int i = ringid & NETMAP_RING_MASK;
2018 	/* initially (np_qfirst == np_qlast) we don't want to lock */
2019 	u_int lim = na->num_rx_rings;
2020 
2021 	if (na->num_tx_rings > lim)
2022 		lim = na->num_tx_rings;
2023 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
2024 		D("invalid ring id %d", i);
2025 		return (EINVAL);
2026 	}
2027 	priv->np_ringid = ringid;
2028 	if (ringid & NETMAP_SW_RING) {
2029 		priv->np_qfirst = NETMAP_SW_RING;
2030 		priv->np_qlast = 0;
2031 	} else if (ringid & NETMAP_HW_RING) {
2032 		priv->np_qfirst = i;
2033 		priv->np_qlast = i + 1;
2034 	} else {
2035 		priv->np_qfirst = 0;
2036 		priv->np_qlast = NETMAP_HW_RING;
2037 	}
2038 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
2039     if (netmap_verbose) {
2040 	if (ringid & NETMAP_SW_RING)
2041 		D("ringid %s set to SW RING", ifp->if_xname);
2042 	else if (ringid & NETMAP_HW_RING)
2043 		D("ringid %s set to HW RING %d", ifp->if_xname,
2044 			priv->np_qfirst);
2045 	else
2046 		D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim);
2047     }
2048 	return 0;
2049 }
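
/*
 * Example (sketch only, not compiled with this module): the ring id
 * values a userspace client could pass in nmr.nr_ringid before
 * NIOCREGIF, matching the cases handled above.
 */
#if 0	/* illustrative sketch */
	uint16_t ringid;

	ringid = 0;				/* all hardware rings */
	ringid = NETMAP_HW_RING | 2;		/* hardware ring 2 only */
	ringid = NETMAP_SW_RING;		/* host (software) ring only */
	ringid |= NETMAP_NO_TX_POLL;		/* do not txsync on poll() */
#endif	/* illustrative sketch */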
2050 
2051 
2052 /*
2053  * possibly move the interface to netmap mode.
2054  * If successful it returns a pointer to netmap_if, otherwise NULL.
2055  * This must be called with NMG_LOCK held.
2056  */
2057 static struct netmap_if *
2058 netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp,
2059 	uint16_t ringid, int *err)
2060 {
2061 	struct netmap_adapter *na = NA(ifp);
2062 	struct netmap_if *nifp = NULL;
2063 	int error, need_mem;
2064 
2065 	NMG_LOCK_ASSERT();
2066 	/* ring configuration may have changed, fetch from the card */
2067 	netmap_update_config(na);
2068 	priv->np_ifp = ifp;     /* store the reference */
2069 	error = netmap_set_ringid(priv, ringid);
2070 	if (error)
2071 		goto out;
2072 	/* ensure allocators are ready */
2073 	need_mem = !netmap_have_memory_locked(priv);
2074 	if (need_mem) {
2075 		error = netmap_get_memory_locked(priv);
2076 		ND("get_memory returned %d", error);
2077 		if (error)
2078 			goto out;
2079 	}
2080 	nifp = netmap_if_new(ifp->if_xname, na);
2081 	if (nifp == NULL) { /* allocation failed */
2082 		/* we should drop the allocator, but only
2083 		 * if we were the ones who grabbed it
2084 		 */
2085 		if (need_mem)
2086 			netmap_drop_memory_locked(priv);
2087 		error = ENOMEM;
2088 		goto out;
2089 	}
2090 	na->refcount++;
2091 	if (ifp->if_capenable & IFCAP_NETMAP) {
2092 		/* was already set */
2093 	} else {
2094 		u_int i;
2095 		/* Otherwise set the card in netmap mode
2096 		 * and make it use the shared buffers.
2097 		 *
2098 		 * If the interface is attached to a bridge, lock it.
2099 		 */
2100 		if (NETMAP_OWNED_BY_KERN(ifp))
2101 			BDG_WLOCK(NA(ifp)->na_bdg);
2102 		for (i = 0 ; i < na->num_tx_rings + 1; i++)
2103 			mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock",
2104 			    NULL, MTX_DEF);
2105 		for (i = 0 ; i < na->num_rx_rings + 1; i++) {
2106 			mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock",
2107 			    NULL, MTX_DEF);
2108 		}
2109 		if (nma_is_hw(na)) {
2110 			SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings];
2111 			SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings];
2112 		}
2113 		/*
2114 		 * do not take the core lock because the race is harmless here:
2115 		 * there cannot be any traffic to netmap_transmit()
2116 		 */
2117 		error = na->nm_register(ifp, 1); /* mode on */
2118 		// XXX do we need to nm_alloc_bdgfwd() in all cases ?
2119 		if (!error)
2120 			error = nm_alloc_bdgfwd(na);
2121 		if (error) {
2122 			netmap_do_unregif(priv, nifp);
2123 			nifp = NULL;
2124 		}
2125 		if (NETMAP_OWNED_BY_KERN(ifp))
2126 			BDG_WUNLOCK(NA(ifp)->na_bdg);
2127 
2128 	}
2129 out:
2130 	*err = error;
2131 	if (nifp != NULL) {
2132 		/*
2133 		 * advertise that the interface is ready by setting np_nifp.
2134 		 * The barrier is needed because readers (poll and *SYNC)
2135 		 * check for priv->np_nifp != NULL without locking
2136 		 */
2137 		wmb(); /* make sure previous writes are visible to all CPUs */
2138 		priv->np_nifp = nifp;
2139 	}
2140 	return nifp;
2141 }
2142 
2143 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
2144 static int
2145 nm_bdg_attach(struct nmreq *nmr)
2146 {
2147 	struct ifnet *ifp;
2148 	struct netmap_if *nifp;
2149 	struct netmap_priv_d *npriv;
2150 	int error;
2151 
2152 	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
2153 	if (npriv == NULL)
2154 		return ENOMEM;
2155 	NMG_LOCK();
2156 	error = get_ifp(nmr, &ifp, 1 /* create if not exists */);
2157 	if (error) /* no device, or another bridge or user owns the device */
2158 		goto unlock_exit;
2159 	/* get_ifp() sets na_bdg if this is a physical interface
2160 	 * that we can attach to a switch.
2161 	 */
2162 	if (!NETMAP_OWNED_BY_KERN(ifp)) {
2163 		/* we got a reference to a virtual port or direct access to a NIC;
2164 		 * perhaps no bridge prefix or a wrong NIC name was specified
2165 		 */
2166 		error = EINVAL;
2167 		goto unref_exit;
2168 	}
2169 
2170 	if (NA(ifp)->refcount > 0) { /* already registered */
2171 		error = EBUSY;
2172 		DROP_BDG_REF(ifp);
2173 		goto unlock_exit;
2174 	}
2175 
2176 	nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
2177 	if (!nifp) {
2178 		goto unref_exit;
2179 	}
2180 
2181 	NA(ifp)->na_kpriv = npriv;
2182 	NMG_UNLOCK();
2183 	ND("registered %s to netmap-mode", ifp->if_xname);
2184 	return 0;
2185 
2186 unref_exit:
2187 	nm_if_rele(ifp);
2188 unlock_exit:
2189 	NMG_UNLOCK();
2190 	bzero(npriv, sizeof(*npriv));
2191 	free(npriv, M_DEVBUF);
2192 	return error;
2193 }
2194 
2195 static int
2196 nm_bdg_detach(struct nmreq *nmr)
2197 {
2198 	struct ifnet *ifp;
2199 	int error;
2200 	int last_instance;
2201 
2202 	NMG_LOCK();
2203 	error = get_ifp(nmr, &ifp, 0 /* don't create */);
2204 	if (error) { /* no device, or another bridge or user owns the device */
2205 		goto unlock_exit;
2206 	}
2207 	/* XXX do we need to check this ? */
2208 	if (!NETMAP_OWNED_BY_KERN(ifp)) {
2209 		/* we got a reference to a virtual port or direct access to a NIC;
2210 		 * perhaps no bridge prefix or a wrong NIC name was specified
2211 		 */
2212 		error = EINVAL;
2213 		goto unref_exit;
2214 	}
2215 
2216 	if (NA(ifp)->refcount == 0) { /* not registered */
2217 		error = EINVAL;
2218 		goto unref_exit;
2219 	}
2220 
2221 	DROP_BDG_REF(ifp); /* the one from get_ifp */
2222 	last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */
2223 	NMG_UNLOCK();
2224 	if (!last_instance) {
2225 		D("--- error, trying to detach an entry with active mmaps");
2226 		error = EINVAL;
2227 	} else {
2228 		struct netmap_priv_d *npriv = NA(ifp)->na_kpriv;
2229 		NA(ifp)->na_kpriv = NULL;
2230 
2231 		bzero(npriv, sizeof(*npriv));
2232 		free(npriv, M_DEVBUF);
2233 	}
2234 	return error;
2235 
2236 unref_exit:
2237 	nm_if_rele(ifp);
2238 unlock_exit:
2239 	NMG_UNLOCK();
2240 	return error;
2241 }
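
/*
 * Example (sketch only, not compiled with this module): a userspace
 * fragment driving the two routines above through NIOCREGIF with
 * nr_cmd set.  It assumes the usual userspace netmap headers; error
 * handling is omitted and the names are made up.
 */
#if 0	/* illustrative sketch */
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name) - 1);
	nmr.nr_cmd = NETMAP_BDG_ATTACH;		/* attach em1 to bridge vale0 */
	ioctl(fd, NIOCREGIF, &nmr);

	nmr.nr_cmd = NETMAP_BDG_DETACH;		/* later, detach it again */
	ioctl(fd, NIOCREGIF, &nmr);
	close(fd);
#endif	/* illustrative sketch */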
2242 
2243 
2244 /* Initialize the necessary fields of the sw adapter, located right after
2245  * the hw one.  The sw adapter attaches a pair of sw rings to the
2246  * netmap-mode NIC.  It is always activated and deactivated at the same
2247  * time as the hw one, so we don't need refcounting on the sw adapter.
2248  * Regardless of the NIC's features we use a separate lock, so that anybody
2249  * can lock it independently from the hw adapter.
2250  * Keep nm_register NULL so that nma_is_hw() evaluates to FALSE for this adapter.
2251  */
2252 static void
2253 netmap_attach_sw(struct ifnet *ifp)
2254 {
2255 	struct netmap_adapter *hw_na = NA(ifp);
2256 	struct netmap_adapter *na = SWNA(ifp);
2257 
2258 	na->ifp = ifp;
2259 	na->num_rx_rings = na->num_tx_rings = 1;
2260 	na->num_tx_desc = hw_na->num_tx_desc;
2261 	na->num_rx_desc = hw_na->num_rx_desc;
2262 	na->nm_txsync = netmap_bdg_to_host;
2263 	/* we use the same memory allocator as the
2264 	 * the hw adapter */
2265 	na->nm_mem = hw_na->nm_mem;
2266 }
2267 
2268 
2269 /* exported to kernel callers, e.g. OVS ?
2270  * Entry point.
2271  * Called without NMG_LOCK.
2272  */
2273 int
2274 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
2275 {
2276 	struct nm_bridge *b;
2277 	struct netmap_adapter *na;
2278 	struct ifnet *iter;
2279 	char *name = nmr->nr_name;
2280 	int cmd = nmr->nr_cmd, namelen = strlen(name);
2281 	int error = 0, i, j;
2282 
2283 	switch (cmd) {
2284 	case NETMAP_BDG_ATTACH:
2285 		error = nm_bdg_attach(nmr);
2286 		break;
2287 
2288 	case NETMAP_BDG_DETACH:
2289 		error = nm_bdg_detach(nmr);
2290 		break;
2291 
2292 	case NETMAP_BDG_LIST:
2293 		/* this is used to enumerate bridges and ports */
2294 		if (namelen) { /* look up indexes of bridge and port */
2295 			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
2296 				error = EINVAL;
2297 				break;
2298 			}
2299 			NMG_LOCK();
2300 			b = nm_find_bridge(name, 0 /* don't create */);
2301 			if (!b) {
2302 				error = ENOENT;
2303 				NMG_UNLOCK();
2304 				break;
2305 			}
2306 
2307 			error = ENOENT;
2308 			for (j = 0; j < b->bdg_active_ports; j++) {
2309 				i = b->bdg_port_index[j];
2310 				na = b->bdg_ports[i];
2311 				if (na == NULL) {
2312 					D("---AAAAAAAAARGH-------");
2313 					continue;
2314 				}
2315 				iter = na->ifp;
2316 				/* the former and the latter identify a
2317 				 * virtual port and a NIC, respectively
2318 				 */
2319 				if (!strcmp(iter->if_xname, name) ||
2320 				    (namelen > b->bdg_namelen &&
2321 				    !strcmp(iter->if_xname,
2322 				    name + b->bdg_namelen + 1))) {
2323 					/* bridge index */
2324 					nmr->nr_arg1 = b - nm_bridges;
2325 					nmr->nr_arg2 = i; /* port index */
2326 					error = 0;
2327 					break;
2328 				}
2329 			}
2330 			NMG_UNLOCK();
2331 		} else {
2332 			/* return the first non-empty entry starting from
2333 			 * bridge nr_arg1 and port nr_arg2.
2334 			 *
2335 			 * Users can detect the end of the same bridge by
2336 			 * comparing the new and old values of nr_arg1, and can
2337 			 * detect the end of all the bridges by error != 0
2338 			 */
2339 			i = nmr->nr_arg1;
2340 			j = nmr->nr_arg2;
2341 
2342 			NMG_LOCK();
2343 			for (error = ENOENT; i < NM_BRIDGES; i++) {
2344 				b = nm_bridges + i;
2345 				if (j >= b->bdg_active_ports) {
2346 					j = 0; /* following bridges scan from 0 */
2347 					continue;
2348 				}
2349 				nmr->nr_arg1 = i;
2350 				nmr->nr_arg2 = j;
2351 				j = b->bdg_port_index[j];
2352 				na = b->bdg_ports[j];
2353 				iter = na->ifp;
2354 				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
2355 				error = 0;
2356 				break;
2357 			}
2358 			NMG_UNLOCK();
2359 		}
2360 		break;
2361 
2362 	case NETMAP_BDG_LOOKUP_REG:
2363 		/* register a lookup function for the given bridge.
2364 		 * nmr->nr_name may be just the bridge's name (including ':'
2365 		 * if it is not just NM_NAME).
2366 		 */
2367 		if (!func) {
2368 			error = EINVAL;
2369 			break;
2370 		}
2371 		NMG_LOCK();
2372 		b = nm_find_bridge(name, 0 /* don't create */);
2373 		if (!b) {
2374 			error = EINVAL;
2375 		} else {
2376 			b->nm_bdg_lookup = func;
2377 		}
2378 		NMG_UNLOCK();
2379 		break;
2380 
2381 	default:
2382 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
2383 		error = EINVAL;
2384 		break;
2385 	}
2386 	return error;
2387 }
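
/*
 * Example (sketch only, not compiled with this module): a userspace
 * fragment using NETMAP_BDG_LIST with an empty nr_name to enumerate
 * bridges and ports, following the nr_arg1/nr_arg2 cursor protocol
 * described above.  Error handling is minimal.
 */
#if 0	/* illustrative sketch */
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	nmr.nr_cmd = NETMAP_BDG_LIST;
	while (ioctl(fd, NIOCGINFO, &nmr) == 0) {
		printf("bridge %d port %d: %s\n",
			nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
		nmr.nr_arg2++;		/* next port of the same bridge */
		nmr.nr_name[0] = '\0';	/* keep enumerating, not looking up */
	}
	close(fd);
#endif	/* illustrative sketch */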
2388 
2389 
2390 /*
2391  * ioctl(2) support for the "netmap" device.
2392  *
2393  * Following a list of accepted commands:
2394  * - NIOCGINFO
2395  * - SIOCGIFADDR	just for convenience
2396  * - NIOCREGIF
2397  * - NIOCUNREGIF
2398  * - NIOCTXSYNC
2399  * - NIOCRXSYNC
2400  *
2401  * Return 0 on success, errno otherwise.
2402  */
2403 static int
2404 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2405 	int fflag, struct thread *td)
2406 {
2407 	struct netmap_priv_d *priv = NULL;
2408 	struct ifnet *ifp = NULL;
2409 	struct nmreq *nmr = (struct nmreq *) data;
2410 	struct netmap_adapter *na = NULL;
2411 	int error;
2412 	u_int i, lim;
2413 	struct netmap_if *nifp;
2414 	struct netmap_kring *krings;
2415 
2416 	(void)dev;	/* UNUSED */
2417 	(void)fflag;	/* UNUSED */
2418 #ifdef linux
2419 #define devfs_get_cdevpriv(pp)				\
2420 	({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; 	\
2421 		(*pp ? 0 : ENOENT); })
2422 
2423 /* devfs_set_cdevpriv cannot fail on linux */
2424 #define devfs_set_cdevpriv(p, fn)				\
2425 	({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
2426 
2427 
2428 #define devfs_clear_cdevpriv()	do {				\
2429 		netmap_dtor(priv); ((struct file *)td)->private_data = 0;	\
2430 	} while (0)
2431 #endif /* linux */
2432 
2433 	CURVNET_SET(TD_TO_VNET(td));
2434 
2435 	error = devfs_get_cdevpriv((void **)&priv);
2436 	if (error) {
2437 		CURVNET_RESTORE();
2438 		/* XXX ENOENT should be impossible, since the priv
2439 		 * is now created in the open */
2440 		return (error == ENOENT ? ENXIO : error);
2441 	}
2442 
2443 	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
2444 	switch (cmd) {
2445 	case NIOCGINFO:		/* return capabilities etc */
2446 		if (nmr->nr_version != NETMAP_API) {
2447 #ifdef TEST_STUFF
2448 			/* some test code for locks etc */
2449 			if (nmr->nr_version == 666) {
2450 				error = nm_test(nmr);
2451 				break;
2452 			}
2453 #endif /* TEST_STUFF */
2454 			D("API mismatch got %d have %d",
2455 				nmr->nr_version, NETMAP_API);
2456 			nmr->nr_version = NETMAP_API;
2457 			error = EINVAL;
2458 			break;
2459 		}
2460 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2461 			error = netmap_bdg_ctl(nmr, NULL);
2462 			break;
2463 		}
2464 
2465 		NMG_LOCK();
2466 		do {
2467 			/* memsize is always valid */
2468 			struct netmap_mem_d *nmd = &nm_mem;
2469 			u_int memflags;
2470 
2471 			if (nmr->nr_name[0] != '\0') {
2472 				/* get a refcount */
2473 				error = get_ifp(nmr, &ifp, 1 /* create */);
2474 				if (error)
2475 					break;
2476 				na = NA(ifp);  /* retrieve the netmap adapter */
2477 				nmd = na->nm_mem; /* and its memory allocator */
2478 			}
2479 
2480 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
2481 			if (error)
2482 				break;
2483 			if (na == NULL) /* only memory info */
2484 				break;
2485 			nmr->nr_offset = 0;
2486 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2487 			netmap_update_config(na);
2488 			nmr->nr_rx_rings = na->num_rx_rings;
2489 			nmr->nr_tx_rings = na->num_tx_rings;
2490 			nmr->nr_rx_slots = na->num_rx_desc;
2491 			nmr->nr_tx_slots = na->num_tx_desc;
2492 			if (memflags & NETMAP_MEM_PRIVATE)
2493 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
2494 		} while (0);
2495 		if (ifp)
2496 			nm_if_rele(ifp);	/* return the refcount */
2497 		NMG_UNLOCK();
2498 		break;
2499 
2500 	case NIOCREGIF:
2501 		if (nmr->nr_version != NETMAP_API) {
2502 			nmr->nr_version = NETMAP_API;
2503 			error = EINVAL;
2504 			break;
2505 		}
2506 		/* possibly attach/detach NIC and VALE switch */
2507 		i = nmr->nr_cmd;
2508 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
2509 			error = netmap_bdg_ctl(nmr, NULL);
2510 			break;
2511 		} else if (i != 0) {
2512 			D("nr_cmd must be 0 not %d", i);
2513 			error = EINVAL;
2514 			break;
2515 		}
2516 
2517 		/* protect access to priv from concurrent NIOCREGIF */
2518 		NMG_LOCK();
2519 		do {
2520 			u_int memflags;
2521 
2522 			if (priv->np_ifp != NULL) {	/* thread already registered */
2523 				error = netmap_set_ringid(priv, nmr->nr_ringid);
2524 				break;
2525 			}
2526 			/* find the interface and a reference */
2527 			error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */
2528 			if (error)
2529 				break;
2530 			if (NETMAP_OWNED_BY_KERN(ifp)) {
2531 				nm_if_rele(ifp);
2532 				error = EBUSY;
2533 				break;
2534 			}
2535 			nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
2536 			if (!nifp) {    /* reg. failed, release priv and ref */
2537 				nm_if_rele(ifp);        /* return the refcount */
2538 				priv->np_ifp = NULL;
2539 				priv->np_nifp = NULL;
2540 				break;
2541 			}
2542 
2543 			/* return the offset of the netmap_if object */
2544 			na = NA(ifp); /* retrieve netmap adapter */
2545 			nmr->nr_rx_rings = na->num_rx_rings;
2546 			nmr->nr_tx_rings = na->num_tx_rings;
2547 			nmr->nr_rx_slots = na->num_rx_desc;
2548 			nmr->nr_tx_slots = na->num_tx_desc;
2549 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
2550 			if (error) {
2551 				nm_if_rele(ifp);
2552 				break;
2553 			}
2554 			if (memflags & NETMAP_MEM_PRIVATE) {
2555 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
2556 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2557 			}
2558 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2559 		} while (0);
2560 		NMG_UNLOCK();
2561 		break;
2562 
2563 	case NIOCUNREGIF:
2564 		// XXX we have no data here ?
2565 		D("deprecated, data is %p", nmr);
2566 		error = EINVAL;
2567 		break;
2568 
2569 	case NIOCTXSYNC:
2570 	case NIOCRXSYNC:
2571 		nifp = priv->np_nifp;
2572 
2573 		if (nifp == NULL) {
2574 			error = ENXIO;
2575 			break;
2576 		}
2577 		rmb(); /* make sure following reads are not from cache */
2578 
2579 		ifp = priv->np_ifp;	/* we have a reference */
2580 
2581 		if (ifp == NULL) {
2582 			D("Internal error: nifp != NULL && ifp == NULL");
2583 			error = ENXIO;
2584 			break;
2585 		}
2586 
2587 		na = NA(ifp); /* retrieve netmap adapter */
2588 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
2589 			if (cmd == NIOCTXSYNC)
2590 				netmap_txsync_to_host(na);
2591 			else
2592 				netmap_rxsync_from_host(na, NULL, NULL);
2593 			break;
2594 		}
2595 		/* find the last ring to scan */
2596 		lim = priv->np_qlast;
2597 		if (lim == NETMAP_HW_RING)
2598 			lim = (cmd == NIOCTXSYNC) ?
2599 			    na->num_tx_rings : na->num_rx_rings;
2600 
2601 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
2602 		for (i = priv->np_qfirst; i < lim; i++) {
2603 			struct netmap_kring *kring = krings + i;
2604 			if (nm_kr_tryget(kring)) {
2605 				error = EBUSY;
2606 				goto out;
2607 			}
2608 			if (cmd == NIOCTXSYNC) {
2609 				if (netmap_verbose & NM_VERB_TXSYNC)
2610 					D("pre txsync ring %d cur %d hwcur %d",
2611 					    i, kring->ring->cur,
2612 					    kring->nr_hwcur);
2613 				na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM);
2614 				if (netmap_verbose & NM_VERB_TXSYNC)
2615 					D("post txsync ring %d cur %d hwcur %d",
2616 					    i, kring->ring->cur,
2617 					    kring->nr_hwcur);
2618 			} else {
2619 				na->nm_rxsync(ifp, i, NAF_FORCE_READ);
2620 				microtime(&na->rx_rings[i].ring->ts);
2621 			}
2622 			nm_kr_put(kring);
2623 		}
2624 
2625 		break;
2626 
2627 #ifdef __FreeBSD__
2628 	case BIOCIMMEDIATE:
2629 	case BIOCGHDRCMPLT:
2630 	case BIOCSHDRCMPLT:
2631 	case BIOCSSEESENT:
2632 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2633 		break;
2634 
2635 	default:	/* allow device-specific ioctls */
2636 	    {
2637 		struct socket so;
2638 
2639 		bzero(&so, sizeof(so));
2640 		NMG_LOCK();
2641 		error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */
2642 		if (error) {
2643 			NMG_UNLOCK();
2644 			break;
2645 		}
2646 		so.so_vnet = ifp->if_vnet;
2647 		// so->so_proto not null.
2648 		error = ifioctl(&so, cmd, data, td);
2649 		nm_if_rele(ifp);
2650 		NMG_UNLOCK();
2651 		break;
2652 	    }
2653 
2654 #else /* linux */
2655 	default:
2656 		error = EOPNOTSUPP;
2657 #endif /* linux */
2658 	}
2659 out:
2660 
2661 	CURVNET_RESTORE();
2662 	return (error);
2663 }
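
/*
 * Example (sketch only, not compiled with this module): the typical
 * userspace sequence served by the ioctl handler above, i.e.
 * NIOCGINFO, NIOCREGIF, mmap() of the shared region, then
 * NIOCTXSYNC/NIOCRXSYNC.  It assumes <net/netmap.h> and
 * <net/netmap_user.h>; "em0" is just an example name and error
 * handling is omitted.
 */
#if 0	/* illustrative sketch */
	struct nmreq nmr;
	struct netmap_if *nifp;
	void *mem;
	int fd = open("/dev/netmap", O_RDWR);

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name) - 1);
	ioctl(fd, NIOCGINFO, &nmr);		/* memsize and ring counts */
	ioctl(fd, NIOCREGIF, &nmr);		/* put em0 in netmap mode */
	mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
		MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, nmr.nr_offset);	/* our netmap_if */
	/* ... fill slots in NETMAP_TXRING(nifp, 0) ... */
	ioctl(fd, NIOCTXSYNC, NULL);		/* tell the kernel to send */
	ioctl(fd, NIOCRXSYNC, NULL);		/* collect received packets */
#endif	/* illustrative sketch */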
2664 
2665 
2666 /*
2667  * select(2) and poll(2) handlers for the "netmap" device.
2668  *
2669  * Can be called for one or more queues.
2670  * Return the event mask corresponding to ready events.
2671  * If there are no ready events, do a selrecord on either individual
2672  * selinfo or on the global one.
2673  * Device-dependent parts (locking and sync of tx/rx rings)
2674  * are done through callbacks.
2675  *
2676  * On linux, the arguments are really pwait (the poll table) and 'td' is a
2677  * struct file *.  The first one is remapped to pwait because selrecord()
2678  * uses the name as a hidden argument.
2679  */
2680 static int
2681 netmap_poll(struct cdev *dev, int events, struct thread *td)
2682 {
2683 	struct netmap_priv_d *priv = NULL;
2684 	struct netmap_adapter *na;
2685 	struct ifnet *ifp;
2686 	struct netmap_kring *kring;
2687 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2688 	u_int lim_tx, lim_rx, host_forwarded = 0;
2689 	struct mbq q = { NULL, NULL, 0 };
2690 	void *pwait = dev;	/* linux compatibility */
2691 
2692 	int retry_tx = 1;
2693 
2694 	(void)pwait;
2695 
2696 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
2697 		return POLLERR;
2698 
2699 	if (priv->np_nifp == NULL) {
2700 		D("No if registered");
2701 		return POLLERR;
2702 	}
2703 	rmb(); /* make sure following reads are not from cache */
2704 
2705 	ifp = priv->np_ifp;
2706 	// XXX check for deleting() ?
2707 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
2708 		return POLLERR;
2709 
2710 	if (netmap_verbose & 0x8000)
2711 		D("device %s events 0x%x", ifp->if_xname, events);
2712 	want_tx = events & (POLLOUT | POLLWRNORM);
2713 	want_rx = events & (POLLIN | POLLRDNORM);
2714 
2715 	na = NA(ifp); /* retrieve netmap adapter */
2716 
2717 	lim_tx = na->num_tx_rings;
2718 	lim_rx = na->num_rx_rings;
2719 
2720 	if (priv->np_qfirst == NETMAP_SW_RING) {
2721 		/* handle the host stack ring */
2722 		if (priv->np_txpoll || want_tx) {
2723 			/* push any packets up, then we are always ready */
2724 			netmap_txsync_to_host(na);
2725 			revents |= want_tx;
2726 		}
2727 		if (want_rx) {
2728 			kring = &na->rx_rings[lim_rx];
2729 			if (kring->ring->avail == 0)
2730 				netmap_rxsync_from_host(na, td, dev);
2731 			if (kring->ring->avail > 0) {
2732 				revents |= want_rx;
2733 			}
2734 		}
2735 		return (revents);
2736 	}
2737 
2738 	/* if we are in transparent mode, check also the host rx ring */
2739 	kring = &na->rx_rings[lim_rx];
2740 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
2741 			&& want_rx
2742 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
2743 		if (kring->ring->avail == 0)
2744 			netmap_rxsync_from_host(na, td, dev);
2745 		if (kring->ring->avail > 0)
2746 			revents |= want_rx;
2747 	}
2748 
2749 	/*
2750 	 * check_all is set if the card has more than one queue AND
2751 	 * the client is polling all of them. If true, we sleep on
2752 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2753 	 * (FreeBSD only allows two selinfo's per file descriptor).
2754 	 * The interrupt routine in the driver wakes one or the other
2755 	 * (or both) depending on which clients are active.
2756 	 *
2757 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2758 	 * txsync() is called if we run out of buffers on POLLOUT, or
2759 	 * there are pending packets to send. The latter can be disabled
2760 	 * passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2761 	 */
2762 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
2763 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
2764 
2765 	if (priv->np_qlast != NETMAP_HW_RING) {
2766 		lim_tx = lim_rx = priv->np_qlast;
2767 	}
2768 
2769 	/*
2770 	 * We start with a lock free round which is good if we have
2771 	 * data available. If this fails, then lock and call the sync
2772 	 * routines.
2773 	 */
2774 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
2775 		kring = &na->rx_rings[i];
2776 		if (kring->ring->avail > 0) {
2777 			revents |= want_rx;
2778 			want_rx = 0;	/* also breaks the loop */
2779 		}
2780 	}
2781 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
2782 		kring = &na->tx_rings[i];
2783 		if (kring->ring->avail > 0) {
2784 			revents |= want_tx;
2785 			want_tx = 0;	/* also breaks the loop */
2786 		}
2787 	}
2788 
2789 	/*
2790 	 * If we want to push packets out (priv->np_txpoll) or want_tx is
2791 	 * still set, we do need to run the txsync calls (on all rings,
2792 	 * so that the tx rings do not stall).
2793 	 */
2794 	if (priv->np_txpoll || want_tx) {
2795 		/* If we really want to be woken up (want_tx),
2796 		 * do a selrecord, either on the global or on
2797 		 * the private structure.  Then issue the txsync
2798 		 * so there is no race in the selrecord/selwait
2799 		 */
2800 flush_tx:
2801 		for (i = priv->np_qfirst; i < lim_tx; i++) {
2802 			kring = &na->tx_rings[i];
2803 			/*
2804 			 * Skip this ring if want_tx == 0
2805 			 * (we have already done a successful sync on
2806 			 * a previous ring) AND kring->cur == kring->hwcur
2807 			 * (there are no pending transmissions for this ring).
2808 			 */
2809 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2810 				continue;
2811 			/* make sure only one user thread is doing this */
2812 			if (nm_kr_tryget(kring)) {
2813 				ND("ring %p busy is %d", kring, (int)kring->nr_busy);
2814 				revents |= POLLERR;
2815 				goto out;
2816 			}
2817 
2818 			if (netmap_verbose & NM_VERB_TXSYNC)
2819 				D("send %d on %s %d",
2820 					kring->ring->cur, ifp->if_xname, i);
2821 			if (na->nm_txsync(ifp, i, 0))
2822 				revents |= POLLERR;
2823 
2824 			/* Check avail/call selrecord only if called with POLLOUT */
2825 			if (want_tx) {
2826 				if (kring->ring->avail > 0) {
2827 					/* stop at the first ring. We don't risk
2828 					 * starvation.
2829 					 */
2830 					revents |= want_tx;
2831 					want_tx = 0;
2832 				}
2833 			}
2834 			nm_kr_put(kring);
2835 		}
2836 		if (want_tx && retry_tx) {
2837 			selrecord(td, check_all_tx ?
2838 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
2839 			retry_tx = 0;
2840 			goto flush_tx;
2841 		}
2842 	}
2843 
2844 	/*
2845 	 * now if want_rx is still set we need to lock and rxsync.
2846 	 * Do it on all rings because otherwise we starve.
2847 	 */
2848 	if (want_rx) {
2849 		int retry_rx = 1;
2850 do_retry_rx:
2851 		for (i = priv->np_qfirst; i < lim_rx; i++) {
2852 			kring = &na->rx_rings[i];
2853 
2854 			if (nm_kr_tryget(kring)) {
2855 				revents |= POLLERR;
2856 				goto out;
2857 			}
2858 
2859 			/* XXX NR_FORWARD should only be read on
2860 			 * physical or NIC ports
2861 			 */
2862 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2863 				ND(10, "forwarding some buffers up %d to %d",
2864 				    kring->nr_hwcur, kring->ring->cur);
2865 				netmap_grab_packets(kring, &q, netmap_fwd);
2866 			}
2867 
2868 			if (na->nm_rxsync(ifp, i, 0))
2869 				revents |= POLLERR;
2870 			if (netmap_no_timestamp == 0 ||
2871 					kring->ring->flags & NR_TIMESTAMP) {
2872 				microtime(&kring->ring->ts);
2873 			}
2874 
2875 			if (kring->ring->avail > 0) {
2876 				revents |= want_rx;
2877 				retry_rx = 0;
2878 			}
2879 			nm_kr_put(kring);
2880 		}
2881 		if (retry_rx) {
2882 			retry_rx = 0;
2883 			selrecord(td, check_all_rx ?
2884 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
2885 			goto do_retry_rx;
2886 		}
2887 	}
2888 
2889 	/* forward host to the netmap ring.
2890 	 * I am accessing nr_hwavail without lock, but netmap_transmit
2891 	 * can only increment it, so the operation is safe.
2892 	 */
2893 	kring = &na->rx_rings[lim_rx];
2894 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
2895 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD)
2896 			 && kring->nr_hwavail > 0 && !host_forwarded) {
2897 		netmap_sw_to_nic(na);
2898 		host_forwarded = 1; /* prevent another pass */
2899 		want_rx = 0;
2900 		goto flush_tx;
2901 	}
2902 
2903 	if (q.head)
2904 		netmap_send_up(na->ifp, q.head);
2905 
2906 out:
2907 
2908 	return (revents);
2909 }
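
/*
 * Example (sketch only, not compiled with this module): the userspace
 * side of the poll handler above, waiting for POLLIN and then walking
 * one rx ring through cur/avail.  It assumes a descriptor already
 * registered and mmapped as in the NIOCREGIF sketch above.
 */
#if 0	/* illustrative sketch */
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);

	for (;;) {
		poll(&pfd, 1, 1000 /* ms */);
		while (rxring->avail > 0) {
			u_int i = rxring->cur;
			char *buf = NETMAP_BUF(rxring, rxring->slot[i].buf_idx);

			/* ... consume rxring->slot[i].len bytes at buf ... */
			rxring->cur = NETMAP_RING_NEXT(rxring, i);
			rxring->avail--;
		}
	}
#endif	/* illustrative sketch */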
2910 
2911 /*------- driver support routines ------*/
2912 
2913 
2914 /*
2915  * Initialize a ``netmap_adapter`` object created by driver on attach.
2916  * We allocate a block of memory with room for a struct netmap_adapter
2917  * plus two sets of N+2 struct netmap_kring (where N is the number
2918  * of hardware rings):
2919  * krings	0..N-1	are for the hardware queues.
2920  * kring	N	is for the host stack queue
2921  * kring	N+1	is only used for the selinfo for all queues.
2922  * Return 0 on success, ENOMEM otherwise.
2923  *
2924  * By default the receive and transmit adapter ring counts are both initialized
2925  * to num_queues.  na->num_tx_rings can be set for cards with different tx/rx
2926  * setups.
2927  */
2928 int
2929 netmap_attach(struct netmap_adapter *arg, u_int num_queues)
2930 {
2931 	struct netmap_adapter *na = NULL;
2932 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2933 	size_t len;
2934 
2935 	if (arg == NULL || ifp == NULL)
2936 		goto fail;
2937 	/* a VALE port uses two endpoints */
2938 	len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2;
2939 	na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
2940 	if (na == NULL)
2941 		goto fail;
2942 	WNA(ifp) = na;
2943 	*na = *arg; /* copy everything, trust the driver to not pass junk */
2944 	NETMAP_SET_CAPABLE(ifp);
2945 	if (na->num_tx_rings == 0)
2946 		na->num_tx_rings = num_queues;
2947 	na->num_rx_rings = num_queues;
2948 	na->refcount = na->na_single = na->na_multi = 0;
2949 	/* Core lock initialized here, others after netmap_if_new. */
2950 	mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF);
2951 #ifdef linux
2952 	if (ifp->netdev_ops) {
2953 		ND("netdev_ops %p", ifp->netdev_ops);
2954 		/* prepare a clone of the netdev ops */
2955 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2956 		na->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2957 #else
2958 		na->nm_ndo = *ifp->netdev_ops;
2959 #endif
2960 	}
2961 	na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2962 #endif /* linux */
2963 	na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem;
2964 	if (!nma_is_vp(arg))
2965 		netmap_attach_sw(ifp);
2966 	D("success for %s", ifp->if_xname);
2967 	return 0;
2968 
2969 fail:
2970 	D("fail, arg %p ifp %p na %p", arg, ifp, na);
2971 	netmap_detach(ifp);
2972 	return (na ? EINVAL : ENOMEM);
2973 }
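
/*
 * Example (sketch only, not compiled with this module): how a
 * hypothetical driver would call netmap_attach() from its attach
 * routine, filling only the fields copied above.  The "foo" structure
 * and callbacks are invented for illustration.
 */
#if 0	/* illustrative sketch */
static void
foo_netmap_attach(struct foo_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;
	na.num_rx_desc = sc->num_rx_desc;
	na.nm_register = foo_netmap_reg;	/* netmap mode on/off */
	na.nm_txsync = foo_netmap_txsync;
	na.nm_rxsync = foo_netmap_rxsync;
	netmap_attach(&na, sc->num_queues);	/* one ring pair per queue */
}
#endif	/* illustrative sketch */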
2974 
2975 
2976 /*
2977  * Free the allocated memory linked to the given ``netmap_adapter``
2978  * object.
2979  */
2980 void
2981 netmap_detach(struct ifnet *ifp)
2982 {
2983 	struct netmap_adapter *na = NA(ifp);
2984 
2985 	if (!na)
2986 		return;
2987 
2988 	mtx_destroy(&na->core_lock);
2989 
2990 	if (na->tx_rings) { /* XXX should not happen */
2991 		D("freeing leftover tx_rings");
2992 		free(na->tx_rings, M_DEVBUF);
2993 	}
2994 	if (na->na_flags & NAF_MEM_OWNER)
2995 		netmap_mem_private_delete(na->nm_mem);
2996 	bzero(na, sizeof(*na));
2997 	WNA(ifp) = NULL;
2998 	free(na, M_DEVBUF);
2999 }
3000 
3001 
3002 int
3003 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
3004 	struct netmap_adapter *na, u_int ring_nr);
3005 
3006 
3007 /*
3008  * Intercept packets from the network stack and pass them
3009  * to netmap as incoming packets on the 'software' ring.
3010  * We rely on the OS to make sure that the ifp and na do not go
3011  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3012  * In nm_register() or whenever there is a reinitialization,
3013  * we make sure to access the core lock and per-ring locks
3014  * so that IFCAP_NETMAP is visible here.
3015  */
3016 int
3017 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3018 {
3019 	struct netmap_adapter *na = NA(ifp);
3020 	struct netmap_kring *kring;
3021 	u_int i, len = MBUF_LEN(m);
3022 	u_int error = EBUSY, lim;
3023 	struct netmap_slot *slot;
3024 
3025 	// XXX [Linux] we do not need this lock
3026 	// if we follow the down/configure/up protocol -gl
3027 	// mtx_lock(&na->core_lock);
3028 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
3029 		/* interface not in netmap mode anymore */
3030 		error = ENXIO;
3031 		goto done;
3032 	}
3033 
3034 	kring = &na->rx_rings[na->num_rx_rings];
3035 	lim = kring->nkr_num_slots - 1;
3036 	if (netmap_verbose & NM_VERB_HOST)
3037 		D("%s packet %d len %d from the stack", ifp->if_xname,
3038 			kring->nr_hwcur + kring->nr_hwavail, len);
3039 	// XXX reconsider long packets if we handle fragments
3040 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
3041 		D("%s from_host, drop packet size %d > %d", ifp->if_xname,
3042 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
3043 		goto done;
3044 	}
3045 	if (SWNA(ifp)->na_bdg) {
3046 		struct nm_bdg_fwd *ft;
3047 		char *dst;
3048 
3049 		na = SWNA(ifp); /* we operate on the host port */
3050 		ft = na->rx_rings[0].nkr_ft;
3051 		dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]);
3052 
3053 		/* use slot 0 in the ft, there is nothing queued here */
3054 		/* XXX we can save the copy calling m_copydata in nm_bdg_flush,
3055 		 * need a special flag for this.
3056 		 */
3057 		m_copydata(m, 0, (int)len, dst);
3058 		ft->ft_flags = 0;
3059 		ft->ft_len = len;
3060 		ft->ft_buf = dst;
3061 		ft->ft_next = NM_FT_NULL;
3062 		ft->ft_frags = 1;
3063 		if (netmap_verbose & NM_VERB_HOST)
3064 			RD(5, "pkt %p size %d to bridge port %d",
3065 				dst, len, na->bdg_port);
3066 		nm_bdg_flush(ft, 1, na, 0);
3067 		na = NA(ifp);	/* back to the regular object/lock */
3068 		error = 0;
3069 		goto done;
3070 	}
3071 
3072 	/* protect against other instances of netmap_transmit,
3073 	 * and userspace invocations of rxsync().
3074 	 * XXX could reuse core_lock
3075 	 */
3076 	// XXX [Linux] there can be no other instances of netmap_transmit
3077 	// on this same ring, but we still need this lock to protect
3078 	// concurrent access from netmap_sw_to_nic() -gl
3079 	mtx_lock(&kring->q_lock);
3080 	if (kring->nr_hwavail >= lim) {
3081 		if (netmap_verbose)
3082 			D("stack ring %s full\n", ifp->if_xname);
3083 	} else {
3084 		/* compute the insert position */
3085 		i = nm_kr_rxpos(kring);
3086 		slot = &kring->ring->slot[i];
3087 		m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot));
3088 		slot->len = len;
3089 		slot->flags = kring->nkr_slot_flags;
3090 		kring->nr_hwavail++;
3091 		if (netmap_verbose  & NM_VERB_HOST)
3092 			D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings);
3093 		selwakeuppri(&kring->si, PI_NET);
3094 		error = 0;
3095 	}
3096 	mtx_unlock(&kring->q_lock);
3097 
3098 done:
3099 	// mtx_unlock(&na->core_lock);
3100 
3101 	/* release the mbuf in either case (success or failure). As an
3102 	 * alternative, put the mbuf in a free list and free the list
3103 	 * only when really necessary.
3104 	 */
3105 	m_freem(m);
3106 
3107 	return (error);
3108 }
3109 
3110 
3111 /*
3112  * netmap_reset() is called by the driver routines when reinitializing
3113  * a ring. The driver is in charge of locking to protect the kring.
3114  * If netmap mode is not set just return NULL.
3115  */
3116 struct netmap_slot *
3117 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3118 	u_int new_cur)
3119 {
3120 	struct netmap_kring *kring;
3121 	int new_hwofs, lim;
3122 
3123 	if (na == NULL) {
3124 		D("NULL na, should not happen");
3125 		return NULL;	/* no netmap support here */
3126 	}
3127 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
3128 		D("interface not in netmap mode");
3129 		return NULL;	/* nothing to reinitialize */
3130 	}
3131 
3132 	/* XXX note- in the new scheme, we are not guaranteed to be
3133 	 * under lock (e.g. when called on a device reset).
3134 	 * In this case, we should set a flag and not trust the
3135 	 * values too much. In practice: TODO
3136 	 * - set a RESET flag somewhere in the kring
3137 	 * - do the processing in a conservative way
3138 	 * - let the *sync() fixup at the end.
3139 	 */
3140 	if (tx == NR_TX) {
3141 		if (n >= na->num_tx_rings)
3142 			return NULL;
3143 		kring = na->tx_rings + n;
3144 		new_hwofs = kring->nr_hwcur - new_cur;
3145 	} else {
3146 		if (n >= na->num_rx_rings)
3147 			return NULL;
3148 		kring = na->rx_rings + n;
3149 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
3150 	}
3151 	lim = kring->nkr_num_slots - 1;
3152 	if (new_hwofs > lim)
3153 		new_hwofs -= lim + 1;
3154 
3155 	/* Always set the new offset value and realign the ring. */
3156 	D("%s hwofs %d -> %d, hwavail %d -> %d",
3157 		tx == NR_TX ? "TX" : "RX",
3158 		kring->nkr_hwofs, new_hwofs,
3159 		kring->nr_hwavail,
3160 		tx == NR_TX ? lim : kring->nr_hwavail);
3161 	kring->nkr_hwofs = new_hwofs;
3162 	if (tx == NR_TX)
3163 		kring->nr_hwavail = lim;
3164 
3165 #if 0 // def linux
3166 	/* XXX check that the mappings are correct */
3167 	/* need ring_nr, adapter->pdev, direction */
3168 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3169 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3170 		D("error mapping rx netmap buffer %d", i);
3171 		// XXX fix error handling
3172 	}
3173 
3174 #endif /* linux */
3175 	/*
3176 	 * Wakeup on the individual and global selwait
3177 	 * We do the wakeup here, but the ring is not yet reconfigured.
3178 	 * However, we are under lock so there are no races.
3179 	 */
3180 	selwakeuppri(&kring->si, PI_NET);
3181 	selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET);
3182 	return kring->ring->slot;
3183 }
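
/*
 * Example (sketch only, not compiled with this module): how a
 * hypothetical driver's rx ring init path could use netmap_reset()
 * to pick up the netmap buffers.  The PNMB() macro and the "sc"
 * names are assumptions modelled on existing netmap-aware drivers.
 */
#if 0	/* illustrative sketch */
	struct netmap_adapter *na = NA(sc->ifp);
	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
	u_int i;

	if (slot != NULL) {	/* interface is in netmap mode */
		for (i = 0; i < na->num_rx_desc; i++) {
			uint64_t paddr;
			void *addr = PNMB(slot + i, &paddr);

			/* program rx descriptor i with paddr/addr */
		}
	}
#endif	/* illustrative sketch */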
3184 
3185 
3186 /*
3187  * Grab packets from a kring, move them into the ft structure
3188  * associated with the tx (input) port. Max one instance per port,
3189  * filtered on input (ioctl, poll or XXX).
3190  * Returns the next position in the ring.
3191  */
3192 static int
3193 nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr,
3194 	struct netmap_kring *kring, u_int end)
3195 {
3196 	struct netmap_ring *ring = kring->ring;
3197 	struct nm_bdg_fwd *ft;
3198 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
3199 	u_int ft_i = 0;	/* start from 0 */
3200 	u_int frags = 1; /* how many frags ? */
3201 	struct nm_bridge *b = na->na_bdg;
3202 
3203 	/* To protect against modifications to the bridge we acquire a
3204 	 * shared lock, waiting if we can sleep (if the source port is
3205 	 * attached to a user process) or with a trylock otherwise (NICs).
3206 	 */
3207 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
3208 	if (na->na_flags & NAF_BDG_MAYSLEEP)
3209 		BDG_RLOCK(b);
3210 	else if (!BDG_RTRYLOCK(b))
3211 		return 0;
3212 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
3213 	ft = kring->nkr_ft;
3214 
3215 	for (; likely(j != end); j = nm_next(j, lim)) {
3216 		struct netmap_slot *slot = &ring->slot[j];
3217 		char *buf;
3218 
3219 		ft[ft_i].ft_len = slot->len;
3220 		ft[ft_i].ft_flags = slot->flags;
3221 
3222 		ND("flags is 0x%x", slot->flags);
3223 		/* this slot goes into a list so initialize the link field */
3224 		ft[ft_i].ft_next = NM_FT_NULL;
3225 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
3226 			(void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot);
3227 		prefetch(buf);
3228 		++ft_i;
3229 		if (slot->flags & NS_MOREFRAG) {
3230 			frags++;
3231 			continue;
3232 		}
3233 		if (unlikely(netmap_verbose && frags > 1))
3234 			RD(5, "%d frags at %d", frags, ft_i - frags);
3235 		ft[ft_i - frags].ft_frags = frags;
3236 		frags = 1;
3237 		if (unlikely((int)ft_i >= bridge_batch))
3238 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
3239 	}
3240 	if (frags > 1) {
3241 		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
3242 		// ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
3243 		ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
3244 		ft[ft_i - frags].ft_frags = frags - 1;
3245 	}
3246 	if (ft_i)
3247 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
3248 	BDG_RUNLOCK(b);
3249 	return j;
3250 }
3251 
3252 
3253 /*
3254  * Pass packets from nic to the bridge.
3255  * XXX TODO check locking: this is called from the interrupt
3256  * handler so we should make sure that the interface is not
3257  * disconnected while passing down an interrupt.
3258  *
3259  * Note, no user process can access this NIC so we can ignore
3260  * the info in the 'ring'.
3261  */
3262 static void
3263 netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr)
3264 {
3265 	struct netmap_adapter *na = NA(ifp);
3266 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
3267 	struct netmap_ring *ring = kring->ring;
3268 	u_int j, k;
3269 
3270 	/* make sure that only one thread is ever in here,
3271 	 * after which we can unlock. Probably unnecessary XXX.
3272 	 */
3273 	if (nm_kr_tryget(kring))
3274 		return;
3275 	/* fetch packets that have arrived.
3276 	 * XXX maybe do this in a loop ?
3277 	 */
3278 	if (na->nm_rxsync(ifp, ring_nr, 0))
3279 		goto put_out;
3280 	if (kring->nr_hwavail == 0 && netmap_verbose) {
3281 		D("how strange, interrupt with no packets on %s",
3282 			ifp->if_xname);
3283 		goto put_out;
3284 	}
3285 	k = nm_kr_rxpos(kring);
3286 
3287 	j = nm_bdg_preflush(na, ring_nr, kring, k);
3288 
3289 	/* we consume everything, but we cannot update kring directly
3290 	 * because the nic may have destroyed the info in the NIC ring.
3291 	 * So we need to call rxsync again to restore it.
3292 	 */
3293 	ring->cur = j;
3294 	ring->avail = 0;
3295 	na->nm_rxsync(ifp, ring_nr, 0);
3296 
3297 put_out:
3298 	nm_kr_put(kring);
3299 	return;
3300 }
3301 
3302 
3303 /*
3304  * Default functions to handle rx/tx interrupts from a physical device.
3305  * "work_done" is non-null on the RX path, NULL for the TX path.
3306  * We rely on the OS to make sure that there is only one active
3307  * instance per queue, and that there is appropriate locking.
3308  *
3309  * If the card is not in netmap mode, simply return 0,
3310  * so that the caller proceeds with regular processing.
3311  *
3312  * If the card is connected to a netmap file descriptor,
3313  * do a selwakeup on the individual queue, plus one on the global one
3314  * if needed (multiqueue card _and_ there are multiqueue listeners),
3315  * and return 1.
3316  *
3317  * Finally, if called on rx from an interface connected to a switch,
3318  * call the proper forwarding routine, and return 1.
3319  */
3320 int
3321 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3322 {
3323 	struct netmap_adapter *na;
3324 	struct netmap_kring *kring;
3325 
3326 	if (!(ifp->if_capenable & IFCAP_NETMAP))
3327 		return 0;
3328 
3329 	q &= NETMAP_RING_MASK;
3330 
3331 	if (netmap_verbose)
3332 		RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3333 	na = NA(ifp);
3334 	if (na->na_flags & NAF_SKIP_INTR) {
3335 		ND("use regular interrupt");
3336 		return 0;
3337 	}
3338 
3339 	if (work_done) { /* RX path */
3340 		if (q >= na->num_rx_rings)
3341 			return 0;	// not a physical queue
3342 		kring = na->rx_rings + q;
3343 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3344 		if (na->na_bdg != NULL) {
3345 			netmap_nic_to_bdg(ifp, q);
3346 		} else {
3347 			selwakeuppri(&kring->si, PI_NET);
3348 			if (na->num_rx_rings > 1 /* or multiple listeners */ )
3349 				selwakeuppri(&na->rx_si, PI_NET);
3350 		}
3351 		*work_done = 1; /* do not fire napi again */
3352 	} else { /* TX path */
3353 		if (q >= na->num_tx_rings)
3354 			return 0;	// not a physical queue
3355 		kring = na->tx_rings + q;
3356 		selwakeuppri(&kring->si, PI_NET);
3357 		if (na->num_tx_rings > 1 /* or multiple listeners */ )
3358 			selwakeuppri(&na->tx_si, PI_NET);
3359 	}
3360 	return 1;
3361 }
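
/*
 * Example (sketch only, not compiled with this module): driver-side
 * use of netmap_rx_irq() in a hypothetical rx interrupt handler; a
 * non-zero return means netmap consumed the event and the regular
 * mbuf-based rx processing must be skipped.
 */
#if 0	/* illustrative sketch */
static void
foo_rxeof(struct foo_queue *q)
{
	u_int work_done = 0;

	if (netmap_rx_irq(q->sc->ifp, q->ring_nr, &work_done))
		return;		/* netmap handled the interrupt */
	/* ... regular mbuf-based rx processing ... */
}
#endif	/* illustrative sketch */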
3362 
3363 
3364 #ifdef linux	/* linux-specific routines */
3365 
3366 
3367 /*
3368  * Remap linux arguments into the FreeBSD call.
3369  * - pwait is the poll table, passed as 'dev';
3370  *   If pwait == NULL someone else already woke up before. We can report
3371  *   events but they are filtered upstream.
3372  *   If pwait != NULL, then pwait->key contains the list of events.
3373  * - events is computed from pwait as above.
3374  * - file is passed as 'td';
3375  */
3376 static u_int
3377 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait)
3378 {
3379 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
3380 	int events = POLLIN | POLLOUT; /* XXX maybe... */
3381 #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
3382 	int events = pwait ? pwait->key : POLLIN | POLLOUT;
3383 #else /* in 3.4.0 field 'key' was renamed to '_key' */
3384 	int events = pwait ? pwait->_key : POLLIN | POLLOUT;
3385 #endif
3386 	return netmap_poll((void *)pwait, events, (void *)file);
3387 }
3388 
3389 
3390 static int
3391 linux_netmap_mmap(struct file *f, struct vm_area_struct *vma)
3392 {
3393 	int error = 0;
3394 	unsigned long off, va;
3395 	vm_ooffset_t pa;
3396 	struct netmap_priv_d *priv = f->private_data;
3397 	/*
3398 	 * vma->vm_start: start of mapping user address space
3399 	 * vma->vm_end: end of the mapping user address space
3400 	 * vma->vm_pgoff: offset of the first page in the device
3401 	 */
3402 
3403 	// XXX security checks
3404 
3405 	error = netmap_get_memory(priv);
3406 	ND("get_memory returned %d", error);
3407 	if (error)
3408 	    return -error;
3409 
3410 	if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) {
3411 		ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end);
3412 		return -EINVAL;
3413 	}
3414 
3415 	for (va = vma->vm_start, off = vma->vm_pgoff;
3416 	     va < vma->vm_end;
3417 	     va += PAGE_SIZE, off++)
3418 	{
3419 		pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT);
3420 		if (pa == 0)
3421 			return -EINVAL;
3422 
3423 		ND("va %lx pa %p", va, pa);
3424 		error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot);
3425 		if (error)
3426 			return error;
3427 	}
3428 	return 0;
3429 }
3430 
3431 
3432 /*
3433  * This one is probably already protected by the netif lock XXX
3434  */
3435 static netdev_tx_t
3436 linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev)
3437 {
3438 	netmap_transmit(dev, skb);
3439 	return (NETDEV_TX_OK);
3440 }
3441 
3442 
3443 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)	// XXX was 37
3444 #define LIN_IOCTL_NAME	.ioctl
3445 int
3446 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */)
3447 #else
3448 #define LIN_IOCTL_NAME	.unlocked_ioctl
3449 long
3450 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */)
3451 #endif
3452 {
3453 	int ret;
3454 	struct nmreq nmr;
3455 	bzero(&nmr, sizeof(nmr));
3456 
3457 	if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) {
3458 		data = 0;	/* no argument required here */
3459 	}
3460 	if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0)
3461 		return -EFAULT;
3462 	ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file);
3463 	if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0)
3464 		return -EFAULT;
3465 	return -ret;
3466 }
3467 
3468 
3469 static int
3470 netmap_release(struct inode *inode, struct file *file)
3471 {
3472 	(void)inode;	/* UNUSED */
3473 	if (file->private_data)
3474 		netmap_dtor(file->private_data);
3475 	return (0);
3476 }
3477 
3478 
3479 static int
3480 linux_netmap_open(struct inode *inode, struct file *file)
3481 {
3482 	struct netmap_priv_d *priv;
3483 	(void)inode;	/* UNUSED */
3484 
3485 	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
3486 			      M_NOWAIT | M_ZERO);
3487 	if (priv == NULL)
3488 		return -ENOMEM;
3489 
3490 	file->private_data = priv;
3491 
3492 	return (0);
3493 }
3494 
3495 
3496 static struct file_operations netmap_fops = {
3497     .owner = THIS_MODULE,
3498     .open = linux_netmap_open,
3499     .mmap = linux_netmap_mmap,
3500     LIN_IOCTL_NAME = linux_netmap_ioctl,
3501     .poll = linux_netmap_poll,
3502     .release = netmap_release,
3503 };
3504 
3505 
3506 static struct miscdevice netmap_cdevsw = {	/* same name as FreeBSD */
3507 	MISC_DYNAMIC_MINOR,
3508 	"netmap",
3509 	&netmap_fops,
3510 };
3511 
3512 static int netmap_init(void);
3513 static void netmap_fini(void);
3514 
3515 
3516 /* Errors have negative values on linux */
3517 static int linux_netmap_init(void)
3518 {
3519 	return -netmap_init();
3520 }
3521 
3522 module_init(linux_netmap_init);
3523 module_exit(netmap_fini);
3524 /* export certain symbols to other modules */
3525 EXPORT_SYMBOL(netmap_attach);		// driver attach routines
3526 EXPORT_SYMBOL(netmap_detach);		// driver detach routines
3527 EXPORT_SYMBOL(netmap_ring_reinit);	// ring init on error
3528 EXPORT_SYMBOL(netmap_buffer_lut);
3529 EXPORT_SYMBOL(netmap_total_buffers);	// index check
3530 EXPORT_SYMBOL(netmap_buffer_base);
3531 EXPORT_SYMBOL(netmap_reset);		// ring init routines
3532 EXPORT_SYMBOL(netmap_buf_size);
3533 EXPORT_SYMBOL(netmap_rx_irq);		// default irq handler
3534 EXPORT_SYMBOL(netmap_no_pendintr);	// XXX mitigation - should go away
3535 EXPORT_SYMBOL(netmap_bdg_ctl);		// bridge configuration routine
3536 EXPORT_SYMBOL(netmap_bdg_learning);	// the default lookup function
3537 EXPORT_SYMBOL(netmap_disable_all_rings);
3538 EXPORT_SYMBOL(netmap_enable_all_rings);
3539 
3540 
3541 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/");
3542 MODULE_DESCRIPTION("The netmap packet I/O framework");
3543 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
3544 
3545 #else /* __FreeBSD__ */
3546 
3547 
3548 static struct cdevsw netmap_cdevsw = {
3549 	.d_version = D_VERSION,
3550 	.d_name = "netmap",
3551 	.d_open = netmap_open,
3552 	.d_mmap_single = netmap_mmap_single,
3553 	.d_ioctl = netmap_ioctl,
3554 	.d_poll = netmap_poll,
3555 	.d_close = netmap_close,
3556 };
3557 #endif /* __FreeBSD__ */
3558 
3559 /*
3560  *---- support for virtual bridge -----
3561  */
3562 
3563 /* ----- FreeBSD if_bridge hash function ------- */
3564 
3565 /*
3566  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
3567  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
3568  *
3569  * http://www.burtleburtle.net/bob/hash/spooky.html
3570  */
3571 #define mix(a, b, c)                                                    \
3572 do {                                                                    \
3573         a -= b; a -= c; a ^= (c >> 13);                                 \
3574         b -= c; b -= a; b ^= (a << 8);                                  \
3575         c -= a; c -= b; c ^= (b >> 13);                                 \
3576         a -= b; a -= c; a ^= (c >> 12);                                 \
3577         b -= c; b -= a; b ^= (a << 16);                                 \
3578         c -= a; c -= b; c ^= (b >> 5);                                  \
3579         a -= b; a -= c; a ^= (c >> 3);                                  \
3580         b -= c; b -= a; b ^= (a << 10);                                 \
3581         c -= a; c -= b; c ^= (b >> 15);                                 \
3582 } while (/*CONSTCOND*/0)
3583 
3584 static __inline uint32_t
3585 nm_bridge_rthash(const uint8_t *addr)
3586 {
3587         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
3588 
3589         b += addr[5] << 8;
3590         b += addr[4];
3591         a += addr[3] << 24;
3592         a += addr[2] << 16;
3593         a += addr[1] << 8;
3594         a += addr[0];
3595 
3596         mix(a, b, c);
3597 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
3598         return (c & BRIDGE_RTHASH_MASK);
3599 }
3600 
3601 #undef mix
3602 
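/*
 * Illustrative sketch (example only, never compiled): how the bridge uses
 * the hash above to index its forwarding table. The nm_hash_ent fields
 * (mac, ports) are the ones used by netmap_bdg_learning() below; the
 * helper itself is hypothetical.
 */
#if 0
static u_int
example_lookup_port(struct nm_hash_ent *ht, const uint8_t *mac48,
	uint64_t mac_as_u64)
{
	uint32_t h = nm_bridge_rthash(mac48);	/* bucket in [0, NM_BDG_HASH) */

	/* unknown destinations fall back to flooding */
	return (ht[h].mac == mac_as_u64) ? ht[h].ports : NM_BDG_BROADCAST;
}
#endif /* example */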
3603 
3604 static int
3605 bdg_netmap_reg(struct ifnet *ifp, int onoff)
3606 {
3607 	/* the interface is already attached to the bridge,
3608 	 * so we only need to toggle IFCAP_NETMAP.
3609 	 */
3610 	if (onoff) {
3611 		ifp->if_capenable |= IFCAP_NETMAP;
3612 	} else {
3613 		ifp->if_capenable &= ~IFCAP_NETMAP;
3614 	}
3615 	return 0;
3616 }
3617 
3618 
3619 /*
3620  * Lookup function for a learning bridge.
3621  * Update the hash table with the source address, then return
3622  * the destination port index and the destination ring in
3623  * *dst_ring (at the moment, ring 0 is always used).
3624  */
3625 u_int
3626 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
3627 		struct netmap_adapter *na)
3628 {
3629 	struct nm_hash_ent *ht = na->na_bdg->ht;
3630 	uint32_t sh, dh;
3631 	u_int dst, mysrc = na->bdg_port;
3632 	uint64_t smac, dmac;
3633 
3634 	if (buf_len < 14) {
3635 		D("invalid buf length %d", buf_len);
3636 		return NM_BDG_NOPORT;
3637 	}
3638 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
3639 	smac = le64toh(*(uint64_t *)(buf + 4));
3640 	smac >>= 16;
3641 
3642 	/*
3643 	 * The hash is somewhat expensive; there might be some
3644 	 * worthwhile optimizations here.
3645 	 */
3646 	if ((buf[6] & 1) == 0) { /* valid src */
3647 		uint8_t *s = buf+6;
3648 		sh = nm_bridge_rthash(s); // XXX hash of source
3649 		/* update source port forwarding entry */
3650 		ht[sh].mac = smac;	/* XXX expire ? */
3651 		ht[sh].ports = mysrc;
3652 		if (netmap_verbose)
3653 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
3654 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
3655 	}
3656 	dst = NM_BDG_BROADCAST;
3657 	if ((buf[0] & 1) == 0) { /* unicast */
3658 		dh = nm_bridge_rthash(buf); // XXX hash of dst
3659 		if (ht[dh].mac == dmac) {	/* found dst */
3660 			dst = ht[dh].ports;
3661 		}
3662 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
3663 	}
3664 	*dst_ring = 0;
3665 	return dst;
3666 }
3667 
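/*
 * Example of the learning behaviour above (illustrative): the first frame
 * sourced from 00:11:22:33:44:55 arriving on bridge port 3 stores that
 * (mac, port) pair in ht[]; a later unicast frame destined to the same
 * address hashes to the same bucket and is forwarded to port 3 only,
 * while frames to unknown, multicast or broadcast addresses return
 * NM_BDG_BROADCAST and are flooded by nm_bdg_flush() below.
 */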
3668 
3669 /*
3670  * This flush routine supports only unicast and broadcast but a large
3671  * number of ports, and lets us replace the learn and dispatch functions.
3672  */
3673 int
3674 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na,
3675 		u_int ring_nr)
3676 {
3677 	struct nm_bdg_q *dst_ents, *brddst;
3678 	uint16_t num_dsts = 0, *dsts;
3679 	struct nm_bridge *b = na->na_bdg;
3680 	u_int i, j, me = na->bdg_port;
3681 
3682 	/*
3683 	 * The work area (pointed to by ft) is followed by an array of
3684 	 * destination queues, dst_ents; there are NM_BDG_MAXRINGS
3685 	 * queues per port plus one for the broadcast traffic.
3686 	 * Then we have an array of destination indexes.
3687 	 */
3688 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
3689 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
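	/*
	 * Illustrative layout of the scratch area set up above:
	 *
	 *   ft[0 .. NM_BDG_BATCH_MAX-1]   per-packet forwarding entries
	 *   dst_ents[]                    NM_BDG_MAXPORTS * NM_BDG_MAXRINGS
	 *                                 destination queues, plus one more
	 *                                 for broadcast traffic
	 *   dsts[]                        compact list of the destination
	 *                                 indexes touched by this batch
	 */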
3690 
3691 	/* first pass: find a destination for each packet in the batch */
3692 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
3693 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
3694 		uint16_t dst_port, d_i;
3695 		struct nm_bdg_q *d;
3696 
3697 		ND("slot %d frags %d", i, ft[i].ft_frags);
3698 		dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len,
3699 			&dst_ring, na);
3700 		if (netmap_verbose > 255)
3701 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
3702 		if (dst_port == NM_BDG_NOPORT)
3703 			continue; /* this packet is to be dropped */
3704 		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
3705 			continue;
3706 		else if (dst_port == NM_BDG_BROADCAST)
3707 			dst_ring = 0; /* broadcasts always go to ring 0 */
3708 		else if (unlikely(dst_port == me ||
3709 		    !b->bdg_ports[dst_port]))
3710 			continue;
3711 
3712 		/* get a position in the scratch pad */
3713 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
3714 		d = dst_ents + d_i;
3715 
3716 		/* append the first fragment to the list */
3717 		if (d->bq_head == NM_FT_NULL) { /* new destination */
3718 			d->bq_head = d->bq_tail = i;
3719 			/* remember this position to be scanned later */
3720 			if (dst_port != NM_BDG_BROADCAST)
3721 				dsts[num_dsts++] = d_i;
3722 		} else {
3723 			ft[d->bq_tail].ft_next = i;
3724 			d->bq_tail = i;
3725 		}
3726 		d->bq_len += ft[i].ft_frags;
3727 	}
3728 
3729 	/*
3730 	 * Broadcast traffic goes to ring 0 on all destinations.
3731 	 * So we need to add these rings to the list of ports to scan.
3732 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
3733 	 * expensive. We should keep a compact list of active destinations
3734 	 * so we could shorten this loop.
3735 	 */
3736 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
3737 	if (brddst->bq_head != NM_FT_NULL) {
3738 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
3739 			uint16_t d_i;
3740 			i = b->bdg_port_index[j];
3741 			if (unlikely(i == me))
3742 				continue;
3743 			d_i = i * NM_BDG_MAXRINGS;
3744 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
3745 				dsts[num_dsts++] = d_i;
3746 		}
3747 	}
3748 
3749 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
3750 	/* second pass: scan destinations (XXX will be modular somehow) */
3751 	for (i = 0; i < num_dsts; i++) {
3752 		struct ifnet *dst_ifp;
3753 		struct netmap_adapter *dst_na;
3754 		struct netmap_kring *kring;
3755 		struct netmap_ring *ring;
3756 		u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next;
3757 		u_int needed, howmany;
3758 		int retry = netmap_txsync_retry;
3759 		struct nm_bdg_q *d;
3760 		uint32_t my_start = 0, lease_idx = 0;
3761 		int nrings;
3762 
3763 		d_i = dsts[i];
3764 		ND("second pass %d port %d", i, d_i);
3765 		d = dst_ents + d_i;
3766 		// XXX fix the division
3767 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
3768 		/* protect from the lookup function returning an inactive
3769 		 * destination port
3770 		 */
3771 		if (unlikely(dst_na == NULL))
3772 			goto cleanup;
3773 		if (dst_na->na_flags & NAF_SW_ONLY)
3774 			goto cleanup;
3775 		dst_ifp = dst_na->ifp;
3776 		/*
3777 		 * The interface may be in !netmap mode in two cases:
3778 		 * - when na is attached but not activated yet;
3779 		 * - when na is being deactivated but is still attached.
3780 		 */
3781 		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
3782 			ND("not in netmap mode!");
3783 			goto cleanup;
3784 		}
3785 
3786 		/* there is at least one either unicast or broadcast packet */
3787 		brd_next = brddst->bq_head;
3788 		next = d->bq_head;
3789 		/* we need to reserve this many slots. If fewer are
3790 		 * available, some packets will be dropped.
3791 		 * Packets may have multiple fragments, so there is a
3792 		 * chance that we will not use all of the slots we have
3793 		 * claimed, and we will need to handle the leftover ones
3794 		 * when we regain the lock.
3795 		 */
3796 		needed = d->bq_len + brddst->bq_len;
3797 
3798 		is_vp = nma_is_vp(dst_na);
3799 		ND(5, "pass 2 dst %d is %x %s",
3800 			i, d_i, is_vp ? "virtual" : "nic/host");
3801 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
3802 		if (is_vp) { /* virtual port */
3803 			nrings = dst_na->num_rx_rings;
3804 		} else {
3805 			nrings = dst_na->num_tx_rings;
3806 		}
3807 		if (dst_nr >= nrings)
3808 			dst_nr = dst_nr % nrings;
3809 		kring = is_vp ?  &dst_na->rx_rings[dst_nr] :
3810 				&dst_na->tx_rings[dst_nr];
3811 		ring = kring->ring;
3812 		lim = kring->nkr_num_slots - 1;
3813 
3814 retry:
3815 
3816 		/* reserve the buffers in the queue and an entry
3817 		 * to report completion, and drop lock.
3818 		 * XXX this might become a helper function.
3819 		 */
3820 		mtx_lock(&kring->q_lock);
3821 		if (kring->nkr_stopped) {
3822 			mtx_unlock(&kring->q_lock);
3823 			goto cleanup;
3824 		}
3825 		/* on physical interfaces, do a txsync to recover
3826 		 * slots for packets already transmitted.
3827 		 * XXX maybe we could be optimistic and rely on a retry
3828 		 * in case of failure.
3829 		 */
3830 		if (nma_is_hw(dst_na)) {
3831 			dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3832 		}
3833 		my_start = j = kring->nkr_hwlease;
3834 		howmany = nm_kr_space(kring, is_vp);
3835 		if (needed < howmany)
3836 			howmany = needed;
3837 		lease_idx = nm_kr_lease(kring, howmany, is_vp);
3838 		mtx_unlock(&kring->q_lock);
3839 
3840 		/* only retry if we need more than available slots */
3841 		if (retry && needed <= howmany)
3842 			retry = 0;
3843 
3844 		/* copy to the destination queue */
3845 		while (howmany > 0) {
3846 			struct netmap_slot *slot;
3847 			struct nm_bdg_fwd *ft_p, *ft_end;
3848 			u_int cnt;
3849 
3850 			/* find the queue from which we pick the next packet.
3851 			 * NM_FT_NULL is always higher than valid indexes
3852 			 * so we never dereference it if the other list
3853 			 * has packets (and if both are empty we never
3854 			 * get here).
3855 			 */
3856 			if (next < brd_next) {
3857 				ft_p = ft + next;
3858 				next = ft_p->ft_next;
3859 			} else { /* insert broadcast */
3860 				ft_p = ft + brd_next;
3861 				brd_next = ft_p->ft_next;
3862 			}
3863 			cnt = ft_p->ft_frags; // cnt > 0
3864 			if (unlikely(cnt > howmany))
3865 			    break; /* no more space */
3866 			howmany -= cnt;
3867 			if (netmap_verbose && cnt > 1)
3868 				RD(5, "rx %d frags to %d", cnt, j);
3869 			ft_end = ft_p + cnt;
3870 			do {
3871 			    void *dst, *src = ft_p->ft_buf;
3872 			    /* copy length rounded up to a multiple of 64 */
3873 			    size_t len = (ft_p->ft_len + 63) & ~63;
3874 
3875 			    slot = &ring->slot[j];
3876 			    dst = BDG_NMB(dst_na->nm_mem, slot);
3877 
3878 			    ND("send %d %d bytes at %s:%d",
3879 				i, ft_p->ft_len, dst_ifp->if_xname, j);
3880 			    if (ft_p->ft_flags & NS_INDIRECT) {
3881 				if (copyin(src, dst, len)) {
3882 					// invalid user pointer, pretend len is 0
3883 					ft_p->ft_len = 0;
3884 				}
3885 			    } else {
3886 				//memcpy(dst, src, len);
3887 				pkt_copy(src, dst, (int)len);
3888 			    }
3889 			    slot->len = ft_p->ft_len;
3890 			    slot->flags = (cnt << 8) | NS_MOREFRAG;
3891 			    j = nm_next(j, lim);
3892 			    ft_p++;
3893 			    sent++;
3894 			} while (ft_p != ft_end);
3895 			slot->flags = (cnt << 8); /* clear flag on last entry */
3896 			/* are we done ? */
3897 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
3898 				break;
3899 		}
3900 		{
3901 		    /* current position */
3902 		    uint32_t *p = kring->nkr_leases; /* shorthand */
3903 		    uint32_t update_pos;
3904 		    int still_locked = 1;
3905 
3906 		    mtx_lock(&kring->q_lock);
3907 		    if (unlikely(howmany > 0)) {
3908 			/* we did not use all the buffers. If i am the
3909 			 * last one i can recover the slots, otherwise i
3910 			 * must fill them with 0 to mark empty packets.
3911 			 */
3912 			ND("leftover %d bufs", howmany);
3913 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
3914 			    /* yes i am the last one */
3915 			    ND("roll back nkr_hwlease to %d", j);
3916 			    kring->nkr_hwlease = j;
3917 			} else {
3918 			    while (howmany-- > 0) {
3919 				ring->slot[j].len = 0;
3920 				ring->slot[j].flags = 0;
3921 				j = nm_next(j, lim);
3922 			    }
3923 			}
3924 		    }
3925 		    p[lease_idx] = j; /* report I am done */
3926 
3927 		    update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur;
3928 
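		    /*
		     * Illustrative example of the check below: suppose three
		     * senders leased slot ranges [0..9], [10..19] and [20..29]
		     * in that order. If the second and third finish first they
		     * only record their end positions in p[]; when the first
		     * sender (whose my_start equals the current update
		     * position) finishes, it walks p[] and publishes all
		     * completed ranges at once, so slots are never exposed
		     * out of order.
		     */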
3929 		    if (my_start == update_pos) {
3930 			/* all slots before my_start have been reported,
3931 			 * so scan subsequent leases to see if other ranges
3932 			 * have been completed, and do a selwakeup or txsync.
3933 			 */
3934 			while (lease_idx != kring->nkr_lease_idx &&
3935 				p[lease_idx] != NR_NOSLOT) {
3936 			    j = p[lease_idx];
3937 			    p[lease_idx] = NR_NOSLOT;
3938 			    lease_idx = nm_next(lease_idx, lim);
3939 			}
3940 			/* j is the new 'write' position. j != my_start
3941 			 * means there are new buffers to report
3942 			 */
3943 			if (likely(j != my_start)) {
3944 			    if (is_vp) {
3945 				uint32_t old_avail = kring->nr_hwavail;
3946 
3947 				kring->nr_hwavail = (j >= kring->nr_hwcur) ?
3948 					j - kring->nr_hwcur :
3949 					j + lim + 1 - kring->nr_hwcur;
3950 				if (kring->nr_hwavail < old_avail) {
3951 					D("avail shrink %d -> %d",
3952 						old_avail, kring->nr_hwavail);
3953 				}
3954 				still_locked = 0;
3955 				mtx_unlock(&kring->q_lock);
3956 				selwakeuppri(&kring->si, PI_NET);
3957 			    } else {
3958 				ring->cur = j;
3959 				/* XXX update avail ? */
3960 				still_locked = 0;
3961 				dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3962 				mtx_unlock(&kring->q_lock);
3963 
3964 				/* retry to send more packets */
3965 				if (nma_is_hw(dst_na) && retry--)
3966 					goto retry;
3967 			    }
3968 			}
3969 		    }
3970 		    if (still_locked)
3971 			mtx_unlock(&kring->q_lock);
3972 		}
3973 cleanup:
3974 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
3975 		d->bq_len = 0;
3976 	}
3977 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
3978 	brddst->bq_len = 0;
3979 	return 0;
3980 }
3981 
3982 
3983 /*
3984  * main dispatch routine for the bridge.
3985  * We already know that only one thread is running this.
3986  * We must run nm_bdg_preflush without holding a lock.
3987  */
3988 static int
3989 bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
3990 {
3991 	struct netmap_adapter *na = NA(ifp);
3992 	struct netmap_kring *kring = &na->tx_rings[ring_nr];
3993 	struct netmap_ring *ring = kring->ring;
3994 	u_int j, k, lim = kring->nkr_num_slots - 1;
3995 
3996 	k = ring->cur;
3997 	if (k > lim)
3998 		return netmap_ring_reinit(kring);
3999 
4000 	if (bridge_batch <= 0) { /* testing only */
4001 		j = k; // used all
4002 		goto done;
4003 	}
4004 	if (bridge_batch > NM_BDG_BATCH)
4005 		bridge_batch = NM_BDG_BATCH;
4006 
4007 	j = nm_bdg_preflush(na, ring_nr, kring, k);
4008 	if (j != k)
4009 		D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
4010 	/* k-j modulo ring size is the number of slots processed */
4011 	if (k < j)
4012 		k += kring->nkr_num_slots;
4013 	kring->nr_hwavail = lim - (k - j);
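	/*
	 * Worked example (illustrative): with 1024 slots (lim == 1023),
	 * j == 1000 and k == 10, the wrap adjustment above gives k == 1034,
	 * so k - j == 34 and nr_hwavail becomes 1023 - 34 == 989.
	 */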
4014 
4015 done:
4016 	kring->nr_hwcur = j;
4017 	ring->avail = kring->nr_hwavail;
4018 	if (netmap_verbose)
4019 		D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags);
4020 	return 0;
4021 }
4022 
4023 
4024 /*
4025  * user process reading from a VALE switch.
4026  * Already protected against concurrent calls from userspace,
4027  * but we must acquire the queue's lock to protect against
4028  * writers on the same queue.
4029  */
4030 static int
4031 bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
4032 {
4033 	struct netmap_adapter *na = NA(ifp);
4034 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
4035 	struct netmap_ring *ring = kring->ring;
4036 	u_int j, lim = kring->nkr_num_slots - 1;
4037 	u_int k = ring->cur, resvd = ring->reserved;
4038 	int n;
4039 
4040 	mtx_lock(&kring->q_lock);
4041 	if (k > lim) {
4042 		D("ouch dangerous reset!!!");
4043 		n = netmap_ring_reinit(kring);
4044 		goto done;
4045 	}
4046 
4047 	/* skip past packets that userspace has released */
4048 	j = kring->nr_hwcur;    /* netmap ring index */
4049 	if (resvd > 0) {
4050 		if (resvd + ring->avail >= lim + 1) {
4051 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
4052 			ring->reserved = resvd = 0; // XXX panic...
4053 		}
4054 		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
4055 	}
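	/*
	 * Example (illustrative): with lim == 1023, k == 2 and resvd == 5,
	 * the rewind wraps to k == 2 + 1024 - 5 == 1021, i.e. the 5 slots
	 * immediately before cur are still held by userspace and must not
	 * be released here.
	 */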
4056 
4057 	if (j != k) { /* userspace has released some packets. */
4058 		n = k - j;
4059 		if (n < 0)
4060 			n += kring->nkr_num_slots;
4061 		ND("userspace releases %d packets", n);
4062 		for (n = 0; likely(j != k); n++) {
4063 			struct netmap_slot *slot = &ring->slot[j];
4064 			void *addr = BDG_NMB(na->nm_mem, slot);
4065 
4066 			if (addr == netmap_buffer_base) { /* bad buf */
4067 				D("bad buffer index %d, ignore ?",
4068 					slot->buf_idx);
4069 			}
4070 			slot->flags &= ~NS_BUF_CHANGED;
4071 			j = nm_next(j, lim);
4072 		}
4073 		kring->nr_hwavail -= n;
4074 		kring->nr_hwcur = k;
4075 	}
4076 	/* tell userspace that there are new packets */
4077 	ring->avail = kring->nr_hwavail - resvd;
4078 	n = 0;
4079 done:
4080 	mtx_unlock(&kring->q_lock);
4081 	return n;
4082 }
4083 
4084 
4085 static int
4086 bdg_netmap_attach(struct netmap_adapter *arg)
4087 {
4088 	struct netmap_adapter na;
4089 
4090 	ND("attaching virtual bridge");
4091 	bzero(&na, sizeof(na));
4092 
4093 	na.ifp = arg->ifp;
4094 	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
4095 	na.num_tx_rings = arg->num_tx_rings;
4096 	na.num_rx_rings = arg->num_rx_rings;
4097 	na.num_tx_desc = arg->num_tx_desc;
4098 	na.num_rx_desc = arg->num_rx_desc;
4099 	na.nm_txsync = bdg_netmap_txsync;
4100 	na.nm_rxsync = bdg_netmap_rxsync;
4101 	na.nm_register = bdg_netmap_reg;
4102 	na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname,
4103 			na.num_tx_rings, na.num_tx_desc,
4104 			na.num_rx_rings, na.num_rx_desc);
4105 	return netmap_attach(&na, na.num_tx_rings);
4106 }
4107 
4108 
4109 static struct cdev *netmap_dev; /* /dev/netmap character device. */
4110 
4111 
4112 /*
4113  * Module loader.
4114  *
4115  * Create the /dev/netmap device and initialize all global
4116  * variables.
4117  *
4118  * Return 0 on success, errno on failure.
4119  */
4120 static int
4121 netmap_init(void)
4122 {
4123 	int i, error;
4124 
4125 	NMG_LOCK_INIT();
4126 
4127 	error = netmap_mem_init();
4128 	if (error != 0) {
4129 		printf("netmap: unable to initialize the memory allocator.\n");
4130 		return (error);
4131 	}
4132 	printf("netmap: loaded module\n");
4133 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
4134 			      "netmap");
4135 
4136 	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
4137 	for (i = 0; i < NM_BRIDGES; i++)
4138 		BDG_RWINIT(&nm_bridges[i]);
4139 	return (error);
4140 }
4141 
4142 
4143 /*
4144  * Module unloader.
4145  *
4146  * Free all the memory, and destroy the ``/dev/netmap`` device.
4147  */
4148 static void
4149 netmap_fini(void)
4150 {
4151 	destroy_dev(netmap_dev);
4152 	netmap_mem_fini();
4153 	NMG_LOCK_DESTROY();
4154 	printf("netmap: unloaded module.\n");
4155 }
4156 
4157 
4158 #ifdef __FreeBSD__
4159 /*
4160  * Kernel entry point.
4161  *
4162  * Initialize/finalize the module and return.
4163  *
4164  * Return 0 on success, errno on failure.
4165  */
4166 static int
4167 netmap_loader(__unused struct module *module, int event, __unused void *arg)
4168 {
4169 	int error = 0;
4170 
4171 	switch (event) {
4172 	case MOD_LOAD:
4173 		error = netmap_init();
4174 		break;
4175 
4176 	case MOD_UNLOAD:
4177 		netmap_fini();
4178 		break;
4179 
4180 	default:
4181 		error = EOPNOTSUPP;
4182 		break;
4183 	}
4184 
4185 	return (error);
4186 }
4187 
4188 
4189 DEV_MODULE(netmap, netmap_loader, NULL);
4190 #endif /* __FreeBSD__ */
4191