xref: /freebsd-14.2/sys/dev/netmap/netmap.c (revision 6f62d278)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 #ifdef __FreeBSD__
28 #define TEST_STUFF	// test code, does not compile yet on linux
29 #endif /* __FreeBSD__ */
30 
31 /*
32  * This module supports memory mapped access to network devices,
33  * see netmap(4).
34  *
35  * The module uses a large memory pool allocated by the kernel
36  * and accessible as mmapped memory by multiple userspace threads/processes.
37  * The memory pool contains packet buffers and "netmap rings",
38  * i.e. user-accessible copies of the interface's queues.
39  *
40  * Access to the network card works like this:
41  * 1. a process/thread issues one or more open() on /dev/netmap, to create
42  *    select()able file descriptors on which events are reported.
43  * 2. on each descriptor, the process issues an ioctl() to identify
44  *    the interface that should report events to the file descriptor.
45  * 3. on each descriptor, the process issues an mmap() request to
46  *    map the shared memory region within the process' address space.
47  *    The list of interesting queues is indicated by a location in
48  *    the shared memory region.
49  * 4. using the functions in the netmap(4) userspace API, a process
50  *    can look up the occupation state of a queue, access memory buffers,
51  *    and retrieve received packets or enqueue packets to transmit.
52  * 5. using some ioctl()s the process can synchronize the userspace view
53  *    of the queue with the actual status in the kernel. This includes both
54  *    receiving the notification of new packets, and transmitting new
55  *    packets on the output interface.
56  * 6. select() or poll() can be used to wait for events on individual
57  *    transmit or receive queues (or all queues for a given interface).
58  *
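 * As a rough illustration only (not part of this module), a userspace
 * client following steps 1-6 above might look like the sketch below,
 * using the netmap(4) API from <net/netmap_user.h>; error handling is
 * omitted and "em0" is just a placeholder interface name:
 *
 *	int fd = open("/dev/netmap", O_RDWR);
 *	struct nmreq req;
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);		// step 2: bind fd to em0
 *	char *mem = mmap(NULL, req.nr_memsize,	// step 3: map shared region
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);			// step 6: wait for tx space
 *	// steps 4-5: fill txr->slot[], advance cur and avail, then sync
 *	ioctl(fd, NIOCTXSYNC, NULL);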
59 
60 		SYNCHRONIZATION (USER)
61 
62 The netmap rings and data structures may be shared among multiple
63 user threads or even independent processes.
64 Any synchronization among those threads/processes is delegated
65 to the threads themselves. Only one thread at a time can be in
66 a system call on the same netmap ring. The OS does not enforce
67 this and only guarantees against system crashes in case of
68 invalid usage.
69 
70 		LOCKING (INTERNAL)
71 
72 Within the kernel, access to the netmap rings is protected as follows:
73 
74 - a spinlock on each ring, to handle producer/consumer races on
75   RX rings attached to the host stack (against multiple host
76   threads writing from the host stack to the same ring),
77   and on 'destination' rings attached to a VALE switch
78  *   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
79  *   protecting multiple active senders for the same destination.
80 
81 - an atomic variable to guarantee that there is at most one
82   instance of *_*xsync() on the ring at any time.
83   For rings connected to user file
84   descriptors, an atomic_test_and_set() protects this, and the
85   lock on the ring is not actually used.
86   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
87   is also used to prevent multiple executions (the driver might indeed
88   already guarantee this).
89   For NIC TX rings connected to a VALE switch, the lock arbitrates
90   access to the queue (both when allocating buffers and when pushing
91   them out).
92 
93 - *xsync() should be protected against initializations of the card.
94   On FreeBSD most devices have the reset routine protected by
95   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
96  *   the RING protection on rx_reset(); this should be added.
97 
98   On linux there is an external lock on the tx path, which probably
99   also arbitrates access to the reset routine. XXX to be revised
100 
101 - a per-interface core_lock protecting access from the host stack
102   while interfaces may be detached from netmap mode.
103   XXX there should be no need for this lock if we detach the interfaces
104   only while they are down.
105 
106 
107 --- VALE SWITCH ---
108 
109 NMG_LOCK() serializes all modifications to switches and ports.
110 A switch cannot be deleted until all ports are gone.
111 
112 For each switch, an SX lock (RWlock on linux) protects
113 deletion of ports. When configuring or deleting a port, the
114 lock is acquired in exclusive mode (after holding NMG_LOCK).
115 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
116 The lock is held throughout the entire forwarding cycle,
117 during which the thread may incur a page fault.
118 Hence it is important that sleepable shared locks are used.
119 
120 On the rx ring, the per-port lock is grabbed initially to reserve
121 a number of slots in the ring, then the lock is released,
122 packets are copied from source to destination, and then
123 the lock is acquired again and the receive ring is updated.
124 (A similar thing is done on the tx ring for NIC and host stack
125 ports attached to the switch.)
126 
127  */
128 
129 /*
130  * OS-specific code that is used only within this file.
131  * Other OS-specific code that must be accessed by drivers
132  * is present in netmap_kern.h
133  */
134 
135 #if defined(__FreeBSD__)
136 #include <sys/cdefs.h> /* prerequisite */
137 __FBSDID("$FreeBSD$");
138 
139 #include <sys/types.h>
140 #include <sys/module.h>
141 #include <sys/errno.h>
142 #include <sys/param.h>	/* defines used in kernel.h */
143 #include <sys/jail.h>
144 #include <sys/kernel.h>	/* types used in module initialization */
145 #include <sys/conf.h>	/* cdevsw struct */
146 #include <sys/uio.h>	/* uio struct */
147 #include <sys/sockio.h>
148 #include <sys/socketvar.h>	/* struct socket */
149 #include <sys/malloc.h>
150 #include <sys/mman.h>	/* PROT_EXEC */
151 #include <sys/poll.h>
152 #include <sys/proc.h>
153 #include <sys/rwlock.h>
154 #include <vm/vm.h>	/* vtophys */
155 #include <vm/pmap.h>	/* vtophys */
156 #include <vm/vm_param.h>
157 #include <vm/vm_object.h>
158 #include <vm/vm_page.h>
159 #include <vm/vm_pager.h>
160 #include <vm/uma.h>
161 #include <sys/socket.h> /* sockaddrs */
162 #include <sys/selinfo.h>
163 #include <sys/sysctl.h>
164 #include <net/if.h>
165 #include <net/if_var.h>
166 #include <net/bpf.h>		/* BIOCIMMEDIATE */
167 #include <net/vnet.h>
168 #include <machine/bus.h>	/* bus_dmamap_* */
169 #include <sys/endian.h>
170 #include <sys/refcount.h>
171 
172 #define prefetch(x)	__builtin_prefetch(x)
173 
174 #define BDG_RWLOCK_T		struct rwlock
175 
176 #define	BDG_RWINIT(b)		\
177 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
178 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
179 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
180 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
181 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
182 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
183 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
184 
185 
186 /* netmap global lock.
187  * normally acquired within the user thread (upon a system call)
188  * or when a file descriptor or process is terminated
189  * (last close or last munmap)
190  */
191 
192 #define NMG_LOCK_T		struct mtx
193 #define NMG_LOCK_INIT()		mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF)
194 #define NMG_LOCK_DESTROY()	mtx_destroy(&netmap_global_lock)
195 #define NMG_LOCK()		mtx_lock(&netmap_global_lock)
196 #define NMG_UNLOCK()		mtx_unlock(&netmap_global_lock)
197 #define NMG_LOCK_ASSERT()	mtx_assert(&netmap_global_lock, MA_OWNED)
198 
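/*
 * Usage sketch (illustrative; netmap_get_memory() below is a real
 * instance of this pattern):
 *
 *	NMG_LOCK();
 *	... modify global netmap state (bridges, registrations, ...) ...
 *	NMG_UNLOCK();
 */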
199 
200 /* atomic operations */
201 #include <machine/atomic.h>
202 #define NM_ATOMIC_TEST_AND_SET(p)	(!atomic_cmpset_acq_int((p), 0, 1))
203 #define NM_ATOMIC_CLEAR(p)		atomic_store_rel_int((p), 0)
204 
205 
206 #elif defined(linux)
207 
208 #include "bsd_glue.h"
209 
210 static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *);
211 
212 static struct device_driver*
213 linux_netmap_find_driver(struct device *dev)
214 {
215 	struct device_driver *dd;
216 
217 	while ( (dd = dev->driver) == NULL ) {
218 		if ( (dev = dev->parent) == NULL )
219 			return NULL;
220 	}
221 	return dd;
222 }
223 
224 static struct net_device*
225 ifunit_ref(const char *name)
226 {
227 	struct net_device *ifp = dev_get_by_name(&init_net, name);
228 	struct device_driver *dd;
229 
230 	if (ifp == NULL)
231 		return NULL;
232 
233 	if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL )
234 		goto error;
235 
236 	if (!try_module_get(dd->owner))
237 		goto error;
238 
239 	return ifp;
240 error:
241 	dev_put(ifp);
242 	return NULL;
243 }
244 
245 static void
246 if_rele(struct net_device *ifp)
247 {
248 	struct device_driver *dd;
249 	dd = linux_netmap_find_driver(&ifp->dev);
250 	dev_put(ifp);
251 	if (dd)
252 		module_put(dd->owner);
253 }
254 
255 // XXX a mtx would suffice here too 20130404 gl
256 #define NMG_LOCK_T		struct semaphore
257 #define NMG_LOCK_INIT()		sema_init(&netmap_global_lock, 1)
258 #define NMG_LOCK_DESTROY()
259 #define NMG_LOCK()		down(&netmap_global_lock)
260 #define NMG_UNLOCK()		up(&netmap_global_lock)
261 #define NMG_LOCK_ASSERT()	//	XXX to be completed
262 
263 
264 #elif defined(__APPLE__)
265 
266 #warning OSX support is only partial
267 #include "osx_glue.h"
268 
269 #else
270 
271 #error	Unsupported platform
272 
273 #endif /* unsupported */
274 
275 /*
276  * common headers
277  */
278 #include <net/netmap.h>
279 #include <dev/netmap/netmap_kern.h>
280 #include <dev/netmap/netmap_mem2.h>
281 
282 
283 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
284 
285 /*
286  * The following variables are used by the drivers and replicate
287  * fields in the global memory pool. They only refer to buffers
288  * used by physical interfaces.
289  */
290 u_int netmap_total_buffers;
291 u_int netmap_buf_size;
292 char *netmap_buffer_base;	/* also address of an invalid buffer */
293 
294 /* user-controlled variables */
295 int netmap_verbose;
296 
297 static int netmap_no_timestamp; /* don't timestamp on rxsync */
298 
299 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
300 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
301     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
302 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
303     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
304 int netmap_mitigate = 1;
305 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
306 int netmap_no_pendintr = 1;
307 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
308     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
309 int netmap_txsync_retry = 2;
310 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
311     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
312 
313 int netmap_drop = 0;	/* debugging */
314 int netmap_flags = 0;	/* debug flags */
315 int netmap_fwd = 0;	/* force transparent mode */
316 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
317 
318 SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , "");
319 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
320 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
321 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
322 
323 NMG_LOCK_T	netmap_global_lock;
324 
325 /*
326  * protect against multiple threads using the same ring.
327  * also check that the ring has not been stopped.
328  */
329 #define NM_KR_BUSY	1
330 #define NM_KR_STOPPED	2
331 static void nm_kr_put(struct netmap_kring *kr);
332 static __inline int nm_kr_tryget(struct netmap_kring *kr)
333 {
334 	/* check a first time without taking the lock
335 	 * to avoid starvation for nm_kr_get()
336 	 */
337 	if (unlikely(kr->nkr_stopped)) {
338 		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
339 		return NM_KR_STOPPED;
340 	}
341 	if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)))
342 		return NM_KR_BUSY;
343 	/* check a second time with lock held */
344 	if (unlikely(kr->nkr_stopped)) {
345 		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
346 		nm_kr_put(kr);
347 		return NM_KR_STOPPED;
348 	}
349 	return 0;
350 }
351 
352 static __inline void nm_kr_put(struct netmap_kring *kr)
353 {
354 	NM_ATOMIC_CLEAR(&kr->nr_busy);
355 }
356 
357 static void nm_kr_get(struct netmap_kring *kr)
358 {
359 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
360 		tsleep(kr, 0, "NM_KR_GET", 4);
361 }
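
/*
 * Illustrative usage of the helpers above (the txsync/rxsync and poll
 * paths later in this file follow the same pattern):
 *
 *	if (nm_kr_tryget(kring))	// NM_KR_BUSY or NM_KR_STOPPED
 *		return;			// give up, someone else owns it
 *	... exclusive access to the kring ...
 *	nm_kr_put(kring);
 */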
362 
363 static void nm_disable_ring(struct netmap_kring *kr)
364 {
365 	kr->nkr_stopped = 1;
366 	nm_kr_get(kr);
367 	mtx_lock(&kr->q_lock);
368 	mtx_unlock(&kr->q_lock);
369 	nm_kr_put(kr);
370 }
371 
372 void netmap_disable_all_rings(struct ifnet *ifp)
373 {
374 	struct netmap_adapter *na;
375 	int i;
376 
377 	if (!(ifp->if_capenable & IFCAP_NETMAP))
378 		return;
379 
380 	na = NA(ifp);
381 
382 	for (i = 0; i < na->num_tx_rings + 1; i++) {
383 		nm_disable_ring(na->tx_rings + i);
384 		selwakeuppri(&na->tx_rings[i].si, PI_NET);
385 	}
386 	for (i = 0; i < na->num_rx_rings + 1; i++) {
387 		nm_disable_ring(na->rx_rings + i);
388 		selwakeuppri(&na->rx_rings[i].si, PI_NET);
389 	}
390 	selwakeuppri(&na->tx_si, PI_NET);
391 	selwakeuppri(&na->rx_si, PI_NET);
392 }
393 
394 void netmap_enable_all_rings(struct ifnet *ifp)
395 {
396 	struct netmap_adapter *na;
397 	int i;
398 
399 	if (!(ifp->if_capenable & IFCAP_NETMAP))
400 		return;
401 
402 	na = NA(ifp);
403 	for (i = 0; i < na->num_tx_rings + 1; i++) {
404 		D("enabling %p", na->tx_rings + i);
405 		na->tx_rings[i].nkr_stopped = 0;
406 	}
407 	for (i = 0; i < na->num_rx_rings + 1; i++) {
408 		D("enabling %p", na->rx_rings + i);
409 		na->rx_rings[i].nkr_stopped = 0;
410 	}
411 }
412 
413 
414 /*
415  * generic bounds-checking function
416  */
417 u_int
418 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
419 {
420 	u_int oldv = *v;
421 	const char *op = NULL;
422 
423 	if (dflt < lo)
424 		dflt = lo;
425 	if (dflt > hi)
426 		dflt = hi;
427 	if (oldv < lo) {
428 		*v = dflt;
429 		op = "Bump";
430 	} else if (oldv > hi) {
431 		*v = hi;
432 		op = "Clamp";
433 	}
434 	if (op && msg)
435 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
436 	return *v;
437 }
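
/*
 * Usage sketch (illustrative; get_ifp() below applies the same pattern
 * to the ring and slot counts requested by userspace):
 *
 *	u_int slots = nmr->nr_tx_slots;
 *	nm_bound_var(&slots, NM_BRIDGE_RINGSIZE, 1, NM_BDG_MAXSLOTS, "tx slots");
 *	nmr->nr_tx_slots = slots;
 */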
438 
439 /*
440  * packet-dump function, user-supplied or static buffer.
441  * The destination buffer must be at least 30+4*len
442  */
443 const char *
444 nm_dump_buf(char *p, int len, int lim, char *dst)
445 {
446 	static char _dst[8192];
447 	int i, j, i0;
448 	static char hex[] ="0123456789abcdef";
449 	char *o;	/* output position */
450 
451 #define P_HI(x)	hex[((x) & 0xf0)>>4]
452 #define P_LO(x)	hex[((x) & 0xf)]
453 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
454 	if (!dst)
455 		dst = _dst;
456 	if (lim <= 0 || lim > len)
457 		lim = len;
458 	o = dst;
459 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
460 	o += strlen(o);
461 	/* hexdump routine */
462 	for (i = 0; i < lim; ) {
463 		sprintf(o, "%5d: ", i);
464 		o += strlen(o);
465 		memset(o, ' ', 48);
466 		i0 = i;
467 		for (j=0; j < 16 && i < lim; i++, j++) {
468 			o[j*3] = P_HI(p[i]);
469 			o[j*3+1] = P_LO(p[i]);
470 		}
471 		i = i0;
472 		for (j=0; j < 16 && i < lim; i++, j++)
473 			o[j + 48] = P_C(p[i]);
474 		o[j+48] = '\n';
475 		o += j+49;
476 	}
477 	*o = '\0';
478 #undef P_HI
479 #undef P_LO
480 #undef P_C
481 	return dst;
482 }
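
/*
 * Usage sketch (illustrative): dump the first 64 bytes of a packet
 * buffer from a debugging path, letting the routine use its internal
 * static buffer:
 *
 *	D("%s", nm_dump_buf(buf, len, 64, NULL));
 */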
483 
484 /*
485  * system parameters (most of them in netmap_kern.h)
486  * NM_NAME	prefix for switch port names, default "vale"
487  * NM_BDG_MAXPORTS	number of ports
488  * NM_BRIDGES	max number of switches in the system.
489  *	XXX should become a sysctl or tunable
490  *
491  * Switch ports are named valeX:Y where X is the switch name and Y
492  * is the port. If Y matches a physical interface name, the port is
493  * connected to a physical device.
494  *
495  * Unlike physical interfaces, switch ports use their own memory region
496  * for rings and buffers.
497  * The virtual interfaces use per-queue lock instead of core lock.
498  * In the tx loop, we aggregate traffic in batches to make all operations
499  * faster. The batch size is bridge_batch.
500  */
501 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
502 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
503 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
504 #define NM_BDG_HASH		1024	/* forwarding table entries */
505 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
506 #define NM_MULTISEG		64	/* max size of a chain of bufs */
507 /* actual size of the tables */
508 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
509 /* NM_FT_NULL terminates a list of slots in the ft */
510 #define NM_FT_NULL		NM_BDG_BATCH_MAX
511 #define	NM_BRIDGES		8	/* number of bridges */
512 
513 
514 /*
515  * bridge_batch is set via sysctl to the max batch size to be
516  * used in the bridge. The actual value may be larger as the
517  * last packet in the block may overflow the size.
518  */
519 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
520 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
521 
522 
523 /*
524  * These are used to handle reference counters for bridge ports.
525  */
526 #define	ADD_BDG_REF(ifp)	refcount_acquire(&NA(ifp)->na_bdg_refcount)
527 #define	DROP_BDG_REF(ifp)	refcount_release(&NA(ifp)->na_bdg_refcount)
528 
529 /* The bridge references the buffers using the device-specific lookup table */
530 static inline void *
531 BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot)
532 {
533 	struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut;
534 	uint32_t i = slot->buf_idx;
535 	return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ?  lut[0].vaddr : lut[i].vaddr;
536 }
537 
538 static void bdg_netmap_attach(struct netmap_adapter *);
539 static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
540 int kern_netmap_regif(struct nmreq *nmr);
541 
542 /*
543  * Each transmit queue accumulates a batch of packets into
544  * a structure before forwarding. Packets to the same
545  * destination are put in a list using ft_next as a link field.
546  * ft_frags and ft_next are valid only on the first fragment.
547  */
548 struct nm_bdg_fwd {	/* forwarding entry for a bridge */
549 	void *ft_buf;		/* netmap or indirect buffer */
550 	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
551 	uint8_t _ft_port;	/* dst port (unused) */
552 	uint16_t ft_flags;	/* flags, e.g. indirect */
553 	uint16_t ft_len;	/* src fragment len */
554 	uint16_t ft_next;	/* next packet to same destination */
555 };
556 
557 /*
558  * For each output interface, nm_bdg_q is used to construct a list.
559  * bq_len is the number of output buffers (we can have coalescing
560  * during the copy).
561  */
562 struct nm_bdg_q {
563 	uint16_t bq_head;
564 	uint16_t bq_tail;
565 	uint32_t bq_len;	/* number of buffers */
566 };
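
/*
 * Illustrative traversal (a sketch, not code used verbatim below):
 * walk the packets queued for one destination, following ft_next
 * until the NM_FT_NULL terminator:
 *
 *	for (i = d->bq_head; i != NM_FT_NULL; i = ft[i].ft_next)
 *		... forward ft[i].ft_buf, ft[i].ft_len ...
 */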
567 
568 /* XXX revise this */
569 struct nm_hash_ent {
570 	uint64_t	mac;	/* the top 2 bytes are the epoch */
571 	uint64_t	ports;
572 };
573 
574 /*
575  * nm_bridge is a descriptor for a VALE switch.
576  * Interfaces for a bridge are all in bdg_ports[].
577  * The array has a fixed size; an empty entry does not terminate
578  * the search, but lookups only occur on attach/detach so we
579  * don't mind if they are slow.
580  *
581  * The bridge is non blocking on the transmit ports: excess
582  * packets are dropped if there is no room on the output port.
583  *
584  * bdg_lock protects accesses to the bdg_ports array.
585  * This is a rw lock (or equivalent).
586  */
587 struct nm_bridge {
588 	/* XXX what is the proper alignment/layout ? */
589 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
590 	int		bdg_namelen;
591 	uint32_t	bdg_active_ports; /* 0 means free */
592 	char		bdg_basename[IFNAMSIZ];
593 
594 	/* Indexes of active ports (up to active_ports)
595 	 * and all other remaining ports.
596 	 */
597 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
598 
599 	struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];
600 
601 
602 	/*
603 	 * The function to decide the destination port.
604 	 * It returns either an index of the destination port,
605 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
606 	 * forward this packet.  ring_nr is the source ring index, and the
607 	 * function may overwrite this value to forward this packet to a
608 	 * different ring index.
609 	 * This function must be set by netmap_bdgctl().
610 	 */
611 	bdg_lookup_fn_t nm_bdg_lookup;
612 
613 	/* the forwarding table, MAC+ports.
614 	 * XXX should be changed to an argument to be passed to
615 	 * the lookup function, and allocated on attach
616 	 */
617 	struct nm_hash_ent ht[NM_BDG_HASH];
618 };
619 
620 
621 /*
622  * XXX in principle nm_bridges could be created dynamically
623  * Right now we have a static array and deletions are protected
624  * by an exclusive lock.
625  */
626 struct nm_bridge nm_bridges[NM_BRIDGES];
627 
628 
629 /*
630  * A few functions to tell which kind of port we are using.
631  * XXX should we hold a lock ?
632  *
633  * nma_is_vp()		virtual port
634  * nma_is_host()	port connected to the host stack
635  * nma_is_hw()		port connected to a NIC
636  */
637 int nma_is_vp(struct netmap_adapter *na);
638 int
639 nma_is_vp(struct netmap_adapter *na)
640 {
641 	return na->nm_register == bdg_netmap_reg;
642 }
643 
644 static __inline int
645 nma_is_host(struct netmap_adapter *na)
646 {
647 	return na->nm_register == NULL;
648 }
649 
650 static __inline int
651 nma_is_hw(struct netmap_adapter *na)
652 {
653 	/* In case of sw adapter, nm_register is NULL */
654 	return !nma_is_vp(na) && !nma_is_host(na);
655 }
656 
657 
658 /*
659  * If the NIC is owned by the kernel
660  * (i.e., a bridge), neither another bridge nor a user can use it;
661  * if the NIC is owned by a user, only users can share it.
662  * Evaluation must be done under NMG_LOCK().
663  */
664 #define NETMAP_OWNED_BY_KERN(ifp)	(!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
665 #define NETMAP_OWNED_BY_ANY(ifp) \
666 	(NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))
667 
668 /*
669  * NA(ifp)->bdg_port	port index
670  */
671 
672 
673 /*
674  * this is a slightly optimized copy routine which rounds
675  * to multiples of 64 bytes and is often faster than dealing
676  * with other odd sizes. We assume there is enough room
677  * in the source and destination buffers.
678  *
679  * XXX only for multiples of 64 bytes, non overlapped.
680  */
681 static inline void
682 pkt_copy(void *_src, void *_dst, int l)
683 {
684         uint64_t *src = _src;
685         uint64_t *dst = _dst;
686         if (unlikely(l >= 1024)) {
687                 memcpy(dst, src, l);
688                 return;
689         }
690         for (; likely(l > 0); l-=64) {
691                 *dst++ = *src++;
692                 *dst++ = *src++;
693                 *dst++ = *src++;
694                 *dst++ = *src++;
695                 *dst++ = *src++;
696                 *dst++ = *src++;
697                 *dst++ = *src++;
698                 *dst++ = *src++;
699         }
700 }
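
/*
 * Usage sketch: both buffers must have room for the length rounded up
 * to the next multiple of 64, since the loop copies whole 64-byte
 * chunks (note the argument order: source first, then destination):
 *
 *	pkt_copy(src_buf, dst_buf, slot->len);
 */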
701 
702 
703 #ifdef TEST_STUFF
704 struct xxx {
705 	char *name;
706 	void (*fn)(uint32_t);
707 };
708 
709 
710 static void
711 nm_test_defmtx(uint32_t n)
712 {
713 	uint32_t i;
714 	struct mtx m;
715 	mtx_init(&m, "test", NULL, MTX_DEF);
716 	for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); }
717 	mtx_destroy(&m);
718 	return;
719 }
720 
721 static void
722 nm_test_spinmtx(uint32_t n)
723 {
724 	uint32_t i;
725 	struct mtx m;
726 	mtx_init(&m, "test", NULL, MTX_SPIN);
727 	for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); }
728 	mtx_destroy(&m);
729 	return;
730 }
731 
732 static void
733 nm_test_rlock(uint32_t n)
734 {
735 	uint32_t i;
736 	struct rwlock m;
737 	rw_init(&m, "test");
738 	for (i = 0; i < n; i++) { rw_rlock(&m); rw_runlock(&m); }
739 	rw_destroy(&m);
740 	return;
741 }
742 
743 static void
744 nm_test_wlock(uint32_t n)
745 {
746 	uint32_t i;
747 	struct rwlock m;
748 	rw_init(&m, "test");
749 	for (i = 0; i < n; i++) { rw_wlock(&m); rw_wunlock(&m); }
750 	rw_destroy(&m);
751 	return;
752 }
753 
754 static void
755 nm_test_slock(uint32_t n)
756 {
757 	uint32_t i;
758 	struct sx m;
759 	sx_init(&m, "test");
760 	for (i = 0; i < n; i++) { sx_slock(&m); sx_sunlock(&m); }
761 	sx_destroy(&m);
762 	return;
763 }
764 
765 static void
766 nm_test_xlock(uint32_t n)
767 {
768 	uint32_t i;
769 	struct sx m;
770 	sx_init(&m, "test");
771 	for (i = 0; i < n; i++) { sx_xlock(&m); sx_xunlock(&m); }
772 	sx_destroy(&m);
773 	return;
774 }
775 
776 
777 struct xxx nm_tests[] = {
778 	{ "defmtx", nm_test_defmtx },
779 	{ "spinmtx", nm_test_spinmtx },
780 	{ "rlock", nm_test_rlock },
781 	{ "wlock", nm_test_wlock },
782 	{ "slock", nm_test_slock },
783 	{ "xlock", nm_test_xlock },
784 };
785 
786 static int
787 nm_test(struct nmreq *nmr)
788 {
789 	uint32_t scale, n, test;
790 	static int old_test = -1;
791 
792 	test = nmr->nr_cmd;
793 	scale = nmr->nr_offset;
794 	n = sizeof(nm_tests) / sizeof(struct xxx) - 1;
795 	if (test > n) {
796 		D("test index too high, max %d", n);
797 		return 0;
798 	}
799 
800 	if (old_test != test) {
801 		D("test %s scale %d", nm_tests[test].name, scale);
802 		old_test = test;
803 	}
804 	nm_tests[test].fn(scale);
805 	return 0;
806 }
807 #endif /* TEST_STUFF */
808 
809 /*
810  * locate a bridge among the existing ones.
811  * MUST BE CALLED WITH NMG_LOCK()
812  *
813  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
814  * We assume that this is called with a name of at least NM_NAME chars.
815  */
816 static struct nm_bridge *
817 nm_find_bridge(const char *name, int create)
818 {
819 	int i, l, namelen;
820 	struct nm_bridge *b = NULL;
821 
822 	NMG_LOCK_ASSERT();
823 
824 	namelen = strlen(NM_NAME);	/* base length */
825 	l = name ? strlen(name) : 0;		/* actual length */
826 	if (l < namelen) {
827 		D("invalid bridge name %s", name ? name : NULL);
828 		return NULL;
829 	}
830 	for (i = namelen + 1; i < l; i++) {
831 		if (name[i] == ':') {
832 			namelen = i;
833 			break;
834 		}
835 	}
836 	if (namelen >= IFNAMSIZ)
837 		namelen = IFNAMSIZ;
838 	ND("--- prefix is '%.*s' ---", namelen, name);
839 
840 	/* lookup the name, remember empty slot if there is one */
841 	for (i = 0; i < NM_BRIDGES; i++) {
842 		struct nm_bridge *x = nm_bridges + i;
843 
844 		if (x->bdg_active_ports == 0) {
845 			if (create && b == NULL)
846 				b = x;	/* record empty slot */
847 		} else if (x->bdg_namelen != namelen) {
848 			continue;
849 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
850 			ND("found '%.*s' at %d", namelen, name, i);
851 			b = x;
852 			break;
853 		}
854 	}
855 	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
856 		/* initialize the bridge */
857 		strncpy(b->bdg_basename, name, namelen);
858 		ND("create new bridge %s with ports %d", b->bdg_basename,
859 			b->bdg_active_ports);
860 		b->bdg_namelen = namelen;
861 		b->bdg_active_ports = 0;
862 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
863 			b->bdg_port_index[i] = i;
864 		/* set the default function */
865 		b->nm_bdg_lookup = netmap_bdg_learning;
866 		/* reset the MAC address table */
867 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
868 	}
869 	return b;
870 }
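
/*
 * Usage sketch (illustrative): the caller must hold NMG_LOCK(), as
 * get_ifp() below does when resolving a "vale" port name:
 *
 *	NMG_LOCK();
 *	b = nm_find_bridge("vale0:em0", 1);	// create if missing
 *	if (b == NULL)
 *		error = ENXIO;
 *	NMG_UNLOCK();
 */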
871 
872 
873 /*
874  * Free the forwarding tables for rings attached to switch ports.
875  */
876 static void
877 nm_free_bdgfwd(struct netmap_adapter *na)
878 {
879 	int nrings, i;
880 	struct netmap_kring *kring;
881 
882 	NMG_LOCK_ASSERT();
883 	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
884 	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
885 	for (i = 0; i < nrings; i++) {
886 		if (kring[i].nkr_ft) {
887 			free(kring[i].nkr_ft, M_DEVBUF);
888 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
889 		}
890 	}
891 	if (nma_is_hw(na))
892 		nm_free_bdgfwd(SWNA(na->ifp));
893 }
894 
895 
896 /*
897  * Allocate the forwarding tables for the rings attached to the bridge ports.
898  */
899 static int
900 nm_alloc_bdgfwd(struct netmap_adapter *na)
901 {
902 	int nrings, l, i, num_dstq;
903 	struct netmap_kring *kring;
904 
905 	NMG_LOCK_ASSERT();
906 	/* all port:rings + broadcast */
907 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
908 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
909 	l += sizeof(struct nm_bdg_q) * num_dstq;
910 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
911 
912 	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
913 	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
914 	for (i = 0; i < nrings; i++) {
915 		struct nm_bdg_fwd *ft;
916 		struct nm_bdg_q *dstq;
917 		int j;
918 
919 		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
920 		if (!ft) {
921 			nm_free_bdgfwd(na);
922 			return ENOMEM;
923 		}
924 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
925 		for (j = 0; j < num_dstq; j++) {
926 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
927 			dstq[j].bq_len = 0;
928 		}
929 		kring[i].nkr_ft = ft;
930 	}
931 	if (nma_is_hw(na))
932 		nm_alloc_bdgfwd(SWNA(na->ifp));
933 	return 0;
934 }
935 
936 
937 /*
938  * Fetch configuration from the device, to cope with dynamic
939  * reconfigurations after loading the module.
940  */
941 static int
942 netmap_update_config(struct netmap_adapter *na)
943 {
944 	struct ifnet *ifp = na->ifp;
945 	u_int txr, txd, rxr, rxd;
946 
947 	txr = txd = rxr = rxd = 0;
948 	if (na->nm_config) {
949 		na->nm_config(ifp, &txr, &txd, &rxr, &rxd);
950 	} else {
951 		/* take whatever we had at init time */
952 		txr = na->num_tx_rings;
953 		txd = na->num_tx_desc;
954 		rxr = na->num_rx_rings;
955 		rxd = na->num_rx_desc;
956 	}
957 
958 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
959 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
960 		return 0; /* nothing changed */
961 	if (netmap_verbose || na->refcount > 0) {
962 		D("stored config %s: txring %d x %d, rxring %d x %d",
963 			ifp->if_xname,
964 			na->num_tx_rings, na->num_tx_desc,
965 			na->num_rx_rings, na->num_rx_desc);
966 		D("new config %s: txring %d x %d, rxring %d x %d",
967 			ifp->if_xname, txr, txd, rxr, rxd);
968 	}
969 	if (na->refcount == 0) {
970 		D("configuration changed (but fine)");
971 		na->num_tx_rings = txr;
972 		na->num_tx_desc = txd;
973 		na->num_rx_rings = rxr;
974 		na->num_rx_desc = rxd;
975 		return 0;
976 	}
977 	D("configuration changed while active, this is bad...");
978 	return 1;
979 }
980 
981 static struct netmap_if *
982 netmap_if_new(const char *ifname, struct netmap_adapter *na)
983 {
984 	if (netmap_update_config(na)) {
985 		/* configuration mismatch, report and fail */
986 		return NULL;
987 	}
988 	return netmap_mem_if_new(ifname, na);
989 }
990 
991 
992 /* Structure associated with each thread that registered an interface.
993  *
994  * The first 4 fields of this structure are written by NIOCREGIF and
995  * read by poll() and NIOC?XSYNC.
996  * There is low contention among writers (actually, a correct user program
997  * should have no contention among writers) and among writers and readers,
998  * so we use a single global lock to protect the structure initialization.
999  * Since initialization involves the allocation of memory, we reuse the memory
1000  * allocator lock.
1001  * Read access to the structure is lock free. Readers must check that
1002  * np_nifp is not NULL before using the other fields.
1003  * If np_nifp is NULL initialization has not been performed, so they should
1004  * return an error to userlevel.
1005  *
1006  * The ref_done field is used to regulate access to the refcount in the
1007  * memory allocator. The refcount must be incremented at most once for
1008  * each open("/dev/netmap"). The increment is performed by the first
1009  * function that calls netmap_get_memory() (currently called by
1010  * mmap(), NIOCGINFO and NIOCREGIF).
1011  * If the refcount is incremented, it is then decremented when the
1012  * private structure is destroyed.
1013  */
1014 struct netmap_priv_d {
1015 	struct netmap_if * volatile np_nifp;	/* netmap if descriptor. */
1016 
1017 	struct ifnet	*np_ifp;	/* device for which we hold a ref. */
1018 	int		np_ringid;	/* from the ioctl */
1019 	u_int		np_qfirst, np_qlast;	/* range of rings to scan */
1020 	uint16_t	np_txpoll;
1021 
1022 	struct netmap_mem_d *np_mref;	/* use with NMG_LOCK held */
1023 #ifdef __FreeBSD__
1024 	int		np_refcount;	/* use with NMG_LOCK held */
1025 #endif /* __FreeBSD__ */
1026 };
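
/*
 * Reader-side sketch of the protocol described above (the poll and
 * ioctl paths elsewhere in this file follow this pattern):
 *
 *	if (priv->np_nifp == NULL)	// registration not complete yet
 *		return ENXIO;
 *	rmb();				// paired with the write in NIOCREGIF
 *	... np_ifp, np_qfirst, np_qlast are now safe to read ...
 */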
1027 
1028 /* grab a reference to the memory allocator, if we don't have one already.  The
1029  * reference is taken from the netmap_adapter registered with the priv.
1030  *
1031  */
1032 static int
1033 netmap_get_memory_locked(struct netmap_priv_d* p)
1034 {
1035 	struct netmap_mem_d *nmd;
1036 	int error = 0;
1037 
1038 	if (p->np_ifp == NULL) {
1039 		if (!netmap_mmap_unreg)
1040 			return ENODEV;
1041 		/* for compatibility with older versions of the API
1042  		 * we use the global allocator when no interface has been
1043  		 * registered
1044  		 */
1045 		nmd = &nm_mem;
1046 	} else {
1047 		nmd = NA(p->np_ifp)->nm_mem;
1048 	}
1049 	if (p->np_mref == NULL) {
1050 		error = netmap_mem_finalize(nmd);
1051 		if (!error)
1052 			p->np_mref = nmd;
1053 	} else if (p->np_mref != nmd) {
1054 		/* a virtual port has been registered, but previous
1055  		 * syscalls already used the global allocator.
1056  		 * We cannot continue
1057  		 */
1058 		error = ENODEV;
1059 	}
1060 	return error;
1061 }
1062 
1063 static int
1064 netmap_get_memory(struct netmap_priv_d* p)
1065 {
1066 	int error;
1067 	NMG_LOCK();
1068 	error = netmap_get_memory_locked(p);
1069 	NMG_UNLOCK();
1070 	return error;
1071 }
1072 
1073 static int
1074 netmap_have_memory_locked(struct netmap_priv_d* p)
1075 {
1076 	return p->np_mref != NULL;
1077 }
1078 
1079 static void
1080 netmap_drop_memory_locked(struct netmap_priv_d* p)
1081 {
1082 	if (p->np_mref) {
1083 		netmap_mem_deref(p->np_mref);
1084 		p->np_mref = NULL;
1085 	}
1086 }
1087 
1088 /*
1089  * File descriptor's private data destructor.
1090  *
1091  * Call nm_register(ifp,0) to stop netmap mode on the interface and
1092  * revert to normal operation. We expect that np_ifp has not gone away.
1093  * The second argument is the nifp to work on. In some cases it is
1094  * not attached yet to the netmap_priv_d so we need to pass it as
1095  * a separate argument.
1096  */
1097 /* call with NMG_LOCK held */
1098 static void
1099 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
1100 {
1101 	struct ifnet *ifp = priv->np_ifp;
1102 	struct netmap_adapter *na = NA(ifp);
1103 
1104 	NMG_LOCK_ASSERT();
1105 	na->refcount--;
1106 	if (na->refcount <= 0) {	/* last instance */
1107 		u_int i;
1108 
1109 		if (netmap_verbose)
1110 			D("deleting last instance for %s", ifp->if_xname);
1111 		/*
1112 		 * (TO CHECK) This function is only called
1113 		 * when the last reference to this file descriptor goes
1114 		 * away. This means we cannot have any pending poll()
1115 		 * or interrupt routine operating on the structure.
1116 		 * XXX The file may be closed in a thread while
1117 		 * another thread is using it.
1118 		 * Linux keeps the file opened until the last reference
1119 		 * by any outstanding ioctl/poll or mmap is gone.
1120 		 * FreeBSD does not track mmap()s (but we do) and
1121 		 * wakes up any sleeping poll(). Need to check what
1122 		 * happens if the close() occurs while a concurrent
1123 		 * syscall is running.
1124 		 */
1125 		na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
1126 		/* Wake up any sleeping threads. netmap_poll will
1127 		 * then return POLLERR
1128 		 * XXX The wake up now must happen during *_down(), when
1129 		 * we order all activities to stop. -gl
1130 		 */
1131 		nm_free_bdgfwd(na);
1132 		for (i = 0; i < na->num_tx_rings + 1; i++) {
1133 			mtx_destroy(&na->tx_rings[i].q_lock);
1134 		}
1135 		for (i = 0; i < na->num_rx_rings + 1; i++) {
1136 			mtx_destroy(&na->rx_rings[i].q_lock);
1137 		}
1138 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
1139 		/* knlist_destroy(&na->tx_si.si_note); */
1140 		/* knlist_destroy(&na->rx_si.si_note); */
1141 		if (nma_is_hw(na))
1142 			SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL;
1143 	}
1144 	/*
1145 	 * netmap_mem_if_delete() deletes the nifp, and if this is
1146 	 * the last instance also buffers, rings and krings.
1147 	 */
1148 	netmap_mem_if_delete(na, nifp);
1149 }
1150 
1151 
1152 /* we assume the netmap adapter exists
1153  * Called with NMG_LOCK held
1154  */
1155 static void
1156 nm_if_rele(struct ifnet *ifp)
1157 {
1158 	int i, is_hw, hw, sw, lim;
1159 	struct nm_bridge *b;
1160 	struct netmap_adapter *na;
1161 	uint8_t tmp[NM_BDG_MAXPORTS];
1162 
1163 	NMG_LOCK_ASSERT();
1164 	/* I can be called not only for get_ifp()-ed references where netmap's
1165 	/* This can be called not only for get_ifp()-ed references where netmap's
1166 	 */
1167 	if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) {
1168 		if_rele(ifp);
1169 		return;
1170 	}
1171 	na = NA(ifp);
1172 	b = na->na_bdg;
1173 	is_hw = nma_is_hw(na);
1174 
1175 	ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount);
1176 
1177 	if (!DROP_BDG_REF(ifp))
1178 		return;
1179 
1180 	/*
1181 	New algorithm:
1182 	make a copy of bdg_port_index;
1183 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
1184 	in the array of bdg_port_index, replacing them with
1185 	entries from the bottom of the array;
1186 	decrement bdg_active_ports;
1187 	acquire BDG_WLOCK() and copy back the array.
1188 	 */
1189 
1190 	hw = NA(ifp)->bdg_port;
1191 	sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1;
1192 	lim = b->bdg_active_ports;
1193 
1194 	ND("detach %d and %d (lim %d)", hw, sw, lim);
1195 	/* make a copy of the list of active ports, update it,
1196 	 * and then copy back within BDG_WLOCK().
1197 	 */
1198 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
1199 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
1200 		if (hw >= 0 && tmp[i] == hw) {
1201 			ND("detach hw %d at %d", hw, i);
1202 			lim--; /* point to last active port */
1203 			tmp[i] = tmp[lim]; /* swap with i */
1204 			tmp[lim] = hw;	/* now this is inactive */
1205 			hw = -1;
1206 		} else if (sw >= 0 && tmp[i] == sw) {
1207 			ND("detach sw %d at %d", sw, i);
1208 			lim--;
1209 			tmp[i] = tmp[lim];
1210 			tmp[lim] = sw;
1211 			sw = -1;
1212 		} else {
1213 			i++;
1214 		}
1215 	}
1216 	if (hw >= 0 || sw >= 0) {
1217 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
1218 	}
1219 	hw = NA(ifp)->bdg_port;
1220 	sw = (is_hw && SWNA(ifp)->na_bdg) ?  SWNA(ifp)->bdg_port : -1;
1221 
1222 	BDG_WLOCK(b);
1223 	b->bdg_ports[hw] = NULL;
1224 	na->na_bdg = NULL;
1225 	if (sw >= 0) {
1226 		b->bdg_ports[sw] = NULL;
1227 		SWNA(ifp)->na_bdg = NULL;
1228 	}
1229 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
1230 	b->bdg_active_ports = lim;
1231 	BDG_WUNLOCK(b);
1232 
1233 	ND("now %d active ports", lim);
1234 	if (lim == 0) {
1235 		ND("marking bridge %s as free", b->bdg_basename);
1236 		b->nm_bdg_lookup = NULL;
1237 	}
1238 
1239 	if (is_hw) {
1240 		if_rele(ifp);
1241 	} else {
1242 		if (na->na_flags & NAF_MEM_OWNER)
1243 			netmap_mem_private_delete(na->nm_mem);
1244 		bzero(na, sizeof(*na));
1245 		free(na, M_DEVBUF);
1246 		bzero(ifp, sizeof(*ifp));
1247 		free(ifp, M_DEVBUF);
1248 	}
1249 }
1250 
1251 
1252 /*
1253  * returns 1 if this is the last instance and we can free priv
1254  */
1255 static int
1256 netmap_dtor_locked(struct netmap_priv_d *priv)
1257 {
1258 	struct ifnet *ifp = priv->np_ifp;
1259 
1260 #ifdef __FreeBSD__
1261 	/*
1262 	 * np_refcount is the number of active mmaps on
1263 	 * this file descriptor
1264 	 */
1265 	if (--priv->np_refcount > 0) {
1266 		return 0;
1267 	}
1268 #endif /* __FreeBSD__ */
1269 	if (ifp) {
1270 		netmap_do_unregif(priv, priv->np_nifp);
1271 	}
1272 	netmap_drop_memory_locked(priv);
1273 	if (ifp) {
1274 		nm_if_rele(ifp); /* might also destroy *na */
1275 	}
1276 	return 1;
1277 }
1278 
1279 static void
1280 netmap_dtor(void *data)
1281 {
1282 	struct netmap_priv_d *priv = data;
1283 	int last_instance;
1284 
1285 	NMG_LOCK();
1286 	last_instance = netmap_dtor_locked(priv);
1287 	NMG_UNLOCK();
1288 	if (last_instance) {
1289 		bzero(priv, sizeof(*priv));	/* for safety */
1290 		free(priv, M_DEVBUF);
1291 	}
1292 }
1293 
1294 
1295 #ifdef __FreeBSD__
1296 
1297 /*
1298  * In order to track whether pages are still mapped, we hook into
1299  * the standard cdev_pager and intercept the constructor and
1300  * destructor.
1301  */
1302 
1303 struct netmap_vm_handle_t {
1304 	struct cdev 		*dev;
1305 	struct netmap_priv_d	*priv;
1306 };
1307 
1308 static int
1309 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
1310     vm_ooffset_t foff, struct ucred *cred, u_short *color)
1311 {
1312 	struct netmap_vm_handle_t *vmh = handle;
1313 	D("handle %p size %jd prot %d foff %jd",
1314 		handle, (intmax_t)size, prot, (intmax_t)foff);
1315 	dev_ref(vmh->dev);
1316 	return 0;
1317 }
1318 
1319 
1320 static void
1321 netmap_dev_pager_dtor(void *handle)
1322 {
1323 	struct netmap_vm_handle_t *vmh = handle;
1324 	struct cdev *dev = vmh->dev;
1325 	struct netmap_priv_d *priv = vmh->priv;
1326 	D("handle %p", handle);
1327 	netmap_dtor(priv);
1328 	free(vmh, M_DEVBUF);
1329 	dev_rel(dev);
1330 }
1331 
1332 static int
1333 netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
1334 	int prot, vm_page_t *mres)
1335 {
1336 	struct netmap_vm_handle_t *vmh = object->handle;
1337 	struct netmap_priv_d *priv = vmh->priv;
1338 	vm_paddr_t paddr;
1339 	vm_page_t page;
1340 	vm_memattr_t memattr;
1341 	vm_pindex_t pidx;
1342 
1343 	ND("object %p offset %jd prot %d mres %p",
1344 			object, (intmax_t)offset, prot, mres);
1345 	memattr = object->memattr;
1346 	pidx = OFF_TO_IDX(offset);
1347 	paddr = netmap_mem_ofstophys(priv->np_mref, offset);
1348 	if (paddr == 0)
1349 		return VM_PAGER_FAIL;
1350 
1351 	if (((*mres)->flags & PG_FICTITIOUS) != 0) {
1352 		/*
1353 		 * If the passed in result page is a fake page, update it with
1354 		 * the new physical address.
1355 		 */
1356 		page = *mres;
1357 		vm_page_updatefake(page, paddr, memattr);
1358 	} else {
1359 		/*
1360 		 * Replace the passed in reqpage page with our own fake page and
1361 		 * free up all of the original pages.
1362 		 */
1363 #ifndef VM_OBJECT_WUNLOCK	/* FreeBSD < 10.x */
1364 #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK
1365 #define VM_OBJECT_WLOCK	VM_OBJECT_LOCK
1366 #endif /* VM_OBJECT_WUNLOCK */
1367 
1368 		VM_OBJECT_WUNLOCK(object);
1369 		page = vm_page_getfake(paddr, memattr);
1370 		VM_OBJECT_WLOCK(object);
1371 		vm_page_lock(*mres);
1372 		vm_page_free(*mres);
1373 		vm_page_unlock(*mres);
1374 		*mres = page;
1375 		vm_page_insert(page, object, pidx);
1376 	}
1377 	page->valid = VM_PAGE_BITS_ALL;
1378 	return (VM_PAGER_OK);
1379 }
1380 
1381 
1382 static struct cdev_pager_ops netmap_cdev_pager_ops = {
1383         .cdev_pg_ctor = netmap_dev_pager_ctor,
1384         .cdev_pg_dtor = netmap_dev_pager_dtor,
1385         .cdev_pg_fault = netmap_dev_pager_fault,
1386 };
1387 
1388 
1389 static int
1390 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
1391 	vm_size_t objsize,  vm_object_t *objp, int prot)
1392 {
1393 	int error;
1394 	struct netmap_vm_handle_t *vmh;
1395 	struct netmap_priv_d *priv;
1396 	vm_object_t obj;
1397 
1398 	D("cdev %p foff %jd size %jd objp %p prot %d", cdev,
1399 	    (intmax_t )*foff, (intmax_t )objsize, objp, prot);
1400 
1401 	vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF,
1402 			      M_NOWAIT | M_ZERO);
1403 	if (vmh == NULL)
1404 		return ENOMEM;
1405 	vmh->dev = cdev;
1406 
1407 	NMG_LOCK();
1408 	error = devfs_get_cdevpriv((void**)&priv);
1409 	if (error)
1410 		goto err_unlock;
1411 	vmh->priv = priv;
1412 	priv->np_refcount++;
1413 	NMG_UNLOCK();
1414 
1415 	error = netmap_get_memory(priv);
1416 	if (error)
1417 		goto err_deref;
1418 
1419 	obj = cdev_pager_allocate(vmh, OBJT_DEVICE,
1420 		&netmap_cdev_pager_ops, objsize, prot,
1421 		*foff, NULL);
1422 	if (obj == NULL) {
1423 		D("cdev_pager_allocate failed");
1424 		error = EINVAL;
1425 		goto err_deref;
1426 	}
1427 
1428 	*objp = obj;
1429 	return 0;
1430 
1431 err_deref:
1432 	NMG_LOCK();
1433 	priv->np_refcount--;
1434 err_unlock:
1435 	NMG_UNLOCK();
1436 // err:
1437 	free(vmh, M_DEVBUF);
1438 	return error;
1439 }
1440 
1441 
1442 // XXX can we remove this ?
1443 static int
1444 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
1445 {
1446 	if (netmap_verbose)
1447 		D("dev %p fflag 0x%x devtype %d td %p",
1448 			dev, fflag, devtype, td);
1449 	return 0;
1450 }
1451 
1452 
1453 static int
1454 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
1455 {
1456 	struct netmap_priv_d *priv;
1457 	int error;
1458 
1459 	(void)dev;
1460 	(void)oflags;
1461 	(void)devtype;
1462 	(void)td;
1463 
1464 	// XXX wait or nowait ?
1465 	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
1466 			      M_NOWAIT | M_ZERO);
1467 	if (priv == NULL)
1468 		return ENOMEM;
1469 
1470 	error = devfs_set_cdevpriv(priv, netmap_dtor);
1471 	if (error)
1472 	        return error;
1473 
1474 	priv->np_refcount = 1;
1475 
1476 	return 0;
1477 }
1478 #endif /* __FreeBSD__ */
1479 
1480 
1481 /*
1482  * Handlers for synchronization of the queues from/to the host.
1483  * Netmap has two operating modes:
1484  * - in the default mode, the rings connected to the host stack are
1485  *   just another ring pair managed by userspace;
1486  * - in transparent mode (XXX to be defined) incoming packets
1487  *   (from the host or the NIC) are marked as NS_FORWARD upon
1488  *   arrival, and the user application has a chance to reset the
1489  *   flag for packets that should be dropped.
1490  *   On the RXSYNC or poll(), packets in RX rings between
1491  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1492  *   to the other side.
1493  * The transfer NIC --> host is relatively easy, just encapsulate
1494  * into mbufs and we are done. The host --> NIC side is slightly
1495  * harder because there might not be room in the tx ring so it
1496  * might take a while before releasing the buffer.
1497  */
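
/*
 * Userspace-side sketch of transparent mode (illustrative only): before
 * the next rxsync on the host ring, the application clears NS_FORWARD
 * on the slots it wants dropped; slots still carrying the flag are then
 * pushed to the other side:
 *
 *	struct netmap_slot *slot = &ring->slot[i];
 *	if (!wanted_by_the_other_side(slot))	// hypothetical predicate
 *		slot->flags &= ~NS_FORWARD;
 *	...
 *	ioctl(fd, NIOCRXSYNC, NULL);
 */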
1498 
1499 
1500 /*
1501  * pass a chain of buffers to the host stack as coming from 'dst'
1502  */
1503 static void
1504 netmap_send_up(struct ifnet *dst, struct mbuf *head)
1505 {
1506 	struct mbuf *m;
1507 
1508 	/* send packets up, outside the lock */
1509 	while ((m = head) != NULL) {
1510 		head = head->m_nextpkt;
1511 		m->m_nextpkt = NULL;
1512 		if (netmap_verbose & NM_VERB_HOST)
1513 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1514 		NM_SEND_UP(dst, m);
1515 	}
1516 }
1517 
1518 struct mbq {
1519 	struct mbuf *head;
1520 	struct mbuf *tail;
1521 	int count;
1522 };
1523 
1524 
1525 /*
1526  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1527  * Run from hwcur to cur - reserved
1528  */
1529 static void
1530 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1531 {
1532 	/* Take packets from hwcur to cur-reserved and pass them up.
1533 	 * In case of no buffers we give up. At the end of the loop,
1534 	 * the queue is drained in all cases.
1535 	 * XXX handle reserved
1536 	 */
1537 	u_int lim = kring->nkr_num_slots - 1;
1538 	struct mbuf *m, *tail = q->tail;
1539 	u_int k = kring->ring->cur, n = kring->ring->reserved;
1540 	struct netmap_mem_d *nmd = kring->na->nm_mem;
1541 
1542 	/* compute the final position, ring->cur - ring->reserved */
1543 	if (n > 0) {
1544 		if (k < n)
1545 			k += kring->nkr_num_slots;
1546 		k += n;
1547 	}
1548 	for (n = kring->nr_hwcur; n != k;) {
1549 		struct netmap_slot *slot = &kring->ring->slot[n];
1550 
1551 		n = nm_next(n, lim);
1552 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1553 			continue;
1554 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) {
1555 			D("bad pkt at %d len %d", n, slot->len);
1556 			continue;
1557 		}
1558 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1559 		/* XXX adapt to the case of a multisegment packet */
1560 		m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL);
1561 
1562 		if (m == NULL)
1563 			break;
1564 		if (tail)
1565 			tail->m_nextpkt = m;
1566 		else
1567 			q->head = m;
1568 		tail = m;
1569 		q->count++;
1570 		m->m_nextpkt = NULL;
1571 	}
1572 	q->tail = tail;
1573 }
1574 
1575 
1576 /*
1577  * The host ring has packets from nr_hwcur to (cur - reserved)
1578  * to be sent down to the NIC.
1579  * We need to use the queue lock on the source (host RX ring)
1580  * to protect against netmap_transmit.
1581  * If the user is well behaved we do not need to acquire locks
1582  * on the destination(s),
1583  * so we only need to make sure that there are no panics because
1584  * of user errors.
1585  * XXX verify
1586  *
1587  * We scan the tx rings, which have just been
1588  * flushed so nr_hwcur == cur. Pushing packets down means
1589  * increment cur and decrement avail.
1590  * XXX to be verified
1591  */
1592 static void
1593 netmap_sw_to_nic(struct netmap_adapter *na)
1594 {
1595 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1596 	struct netmap_kring *k1 = &na->tx_rings[0];
1597 	u_int i, howmany, src_lim, dst_lim;
1598 
1599 	/* XXX we should also check that the carrier is on */
1600 	if (kring->nkr_stopped)
1601 		return;
1602 
1603 	mtx_lock(&kring->q_lock);
1604 
1605 	if (kring->nkr_stopped)
1606 		goto out;
1607 
1608 	howmany = kring->nr_hwavail;	/* XXX otherwise cur - reserved - nr_hwcur */
1609 
1610 	src_lim = kring->nkr_num_slots - 1;
1611 	for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
1612 		ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
1613 		dst_lim = k1->nkr_num_slots - 1;
1614 		while (howmany > 0 && k1->ring->avail > 0) {
1615 			struct netmap_slot *src, *dst, tmp;
1616 			src = &kring->ring->slot[kring->nr_hwcur];
1617 			dst = &k1->ring->slot[k1->ring->cur];
1618 			tmp = *src;
1619 			src->buf_idx = dst->buf_idx;
1620 			src->flags = NS_BUF_CHANGED;
1621 
1622 			dst->buf_idx = tmp.buf_idx;
1623 			dst->len = tmp.len;
1624 			dst->flags = NS_BUF_CHANGED;
1625 			ND("out len %d buf %d from %d to %d",
1626 				dst->len, dst->buf_idx,
1627 				kring->nr_hwcur, k1->ring->cur);
1628 
1629 			kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
1630 			howmany--;
1631 			kring->nr_hwavail--;
1632 			k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
1633 			k1->ring->avail--;
1634 		}
1635 		kring->ring->cur = kring->nr_hwcur; // XXX
1636 		k1++; // XXX why?
1637 	}
1638 out:
1639 	mtx_unlock(&kring->q_lock);
1640 }
1641 
1642 
1643 /*
1644  * netmap_txsync_to_host() passes packets up. We are called from a
1645  * system call in user process context, and the only contention
1646  * can be among multiple user threads erroneously calling
1647  * this routine concurrently.
1648  */
1649 static void
1650 netmap_txsync_to_host(struct netmap_adapter *na)
1651 {
1652 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1653 	struct netmap_ring *ring = kring->ring;
1654 	u_int k, lim = kring->nkr_num_slots - 1;
1655 	struct mbq q = { NULL, NULL, 0 };
1656 
1657 	if (nm_kr_tryget(kring)) {
1658 		D("ring %p busy (user error)", kring);
1659 		return;
1660 	}
1661 	k = ring->cur;
1662 	if (k > lim) {
1663 		D("invalid ring index in stack TX kring %p", kring);
1664 		netmap_ring_reinit(kring);
1665 		nm_kr_put(kring);
1666 		return;
1667 	}
1668 
1669 	/* Take packets from hwcur to cur and pass them up.
1670 	 * In case of no buffers we give up. At the end of the loop,
1671 	 * the queue is drained in all cases.
1672 	 */
1673 	netmap_grab_packets(kring, &q, 1);
1674 	kring->nr_hwcur = k;
1675 	kring->nr_hwavail = ring->avail = lim;
1676 
1677 	nm_kr_put(kring);
1678 	netmap_send_up(na->ifp, q.head);
1679 }
1680 
1681 
1682 /*
1683  * This is the 'txsync' handler to send from a software ring to the
1684  * host stack.
1685  */
1686 /* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */
1687 static int
1688 netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags)
1689 {
1690 	(void)ring_nr;
1691 	(void)flags;
1692 	if (netmap_verbose > 255)
1693 		RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr);
1694 	netmap_txsync_to_host(NA(ifp));
1695 	return 0;
1696 }
1697 
1698 
1699 /*
1700  * rxsync backend for packets coming from the host stack.
1701  * They have been put in the queue by netmap_transmit() so we
1702  * need to protect access to the kring using a lock.
1703  *
1704  * This routine also does the selrecord if called from the poll handler
1705  * (we know because td != NULL).
1706  *
1707  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1708  *     as an additional hidden argument.
1709  */
1710 static void
1711 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1712 {
1713 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1714 	struct netmap_ring *ring = kring->ring;
1715 	u_int j, n, lim = kring->nkr_num_slots;
1716 	u_int k = ring->cur, resvd = ring->reserved;
1717 
1718 	(void)pwait;	/* disable unused warnings */
1719 
1720 	if (kring->nkr_stopped) /* check a first time without lock */
1721 		return;
1722 
1723 	/* XXX as an optimization we could reuse na->core_lock */
1724 	mtx_lock(&kring->q_lock);
1725 
1726 	if (kring->nkr_stopped)  /* check again with lock held */
1727 		goto unlock_out;
1728 
1729 	if (k >= lim) {
1730 		netmap_ring_reinit(kring);
1731 		goto unlock_out;
1732 	}
1733 	/* new packets are already set in nr_hwavail */
1734 	/* skip past packets that userspace has released */
1735 	j = kring->nr_hwcur;
1736 	if (resvd > 0) {
1737 		if (resvd + ring->avail >= lim + 1) {
1738 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
1739 			ring->reserved = resvd = 0; // XXX panic...
1740 		}
1741 		k = (k >= resvd) ? k - resvd : k + lim - resvd;
1742         }
1743 	if (j != k) {
1744 		n = k >= j ? k - j : k + lim - j;
1745 		kring->nr_hwavail -= n;
1746 		kring->nr_hwcur = k;
1747 	}
1748 	k = ring->avail = kring->nr_hwavail - resvd;
1749 	if (k == 0 && td)
1750 		selrecord(td, &kring->si);
1751 	if (k && (netmap_verbose & NM_VERB_HOST))
1752 		D("%d pkts from stack", k);
1753 unlock_out:
1754 
1755 	mtx_unlock(&kring->q_lock);
1756 }
1757 
1758 
1759 /*
1760  * MUST BE CALLED UNDER NMG_LOCK()
1761  *
1762  * get a refcounted reference to an interface.
1763  * This is always called in the execution of an ioctl().
1764  *
1765  * Return ENXIO if the interface does not exist, EINVAL if netmap
1766  * is not supported by the interface.
1767  * If successful, hold a reference.
1768  *
1769  * When the NIC is attached to a bridge, the reference is managed
1770  * at na->na_bdg_refcount using ADD/DROP_BDG_REF(), as is done for
1771  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1772  * is detached from the bridge and then ifp's refcount is dropped (this
1773  * is equivalent to destroying ifp in the case of virtual ports).
1774  *
1775  * This function uses if_rele() when we want to prevent the NIC from
1776  * being detached from the bridge in error handling.  But once refcount
1777  * is acquired by this function, it must be released using nm_if_rele().
1778  */
1779 static int
1780 get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create)
1781 {
1782 	const char *name = nmr->nr_name;
1783 	int namelen = strlen(name);
1784 	struct ifnet *iter = NULL;
1785 	int no_prefix = 0;
1786 
1787 	/* first try to see if this is a bridge port. */
1788 	struct nm_bridge *b;
1789 	struct netmap_adapter *na;
1790 	int i, j, cand = -1, cand2 = -1;
1791 	int needed;
1792 
1793 	NMG_LOCK_ASSERT();
1794 	*ifp = NULL;	/* default */
1795 	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
1796 		no_prefix = 1;	/* no VALE prefix */
1797 		goto no_bridge_port;
1798 	}
1799 
1800 	b = nm_find_bridge(name, create);
1801 	if (b == NULL) {
1802 		D("no bridges available for '%s'", name);
1803 		return (ENXIO);
1804 	}
1805 
1806 	/* Now we are sure that name starts with the bridge's name,
1807 	 * look up the port in the bridge. We need to scan the entire
1808 	 * list. It is not important to hold a WLOCK on the bridge
1809 	 * during the search because NMG_LOCK already guarantees
1810 	 * that there are no other possible writers.
1811 	 */
1812 
1813 	/* lookup in the local list of ports */
1814 	for (j = 0; j < b->bdg_active_ports; j++) {
1815 		i = b->bdg_port_index[j];
1816 		na = b->bdg_ports[i];
1817 		// KASSERT(na != NULL);
1818 		iter = na->ifp;
1819 		/* XXX make sure the name only contains one : */
1820 		if (!strcmp(iter->if_xname, name) /* virtual port */ ||
1821 		    (namelen > b->bdg_namelen && !strcmp(iter->if_xname,
1822 		    name + b->bdg_namelen + 1)) /* NIC */) {
1823 			ADD_BDG_REF(iter);
1824 			ND("found existing if %s refs %d", name,
1825 				NA(iter)->na_bdg_refcount);
1826 			*ifp = iter;
1827 			/* we are done, this is surely netmap capable */
1828 			return 0;
1829 		}
1830 	}
1831 	/* not found, should we create it? */
1832 	if (!create)
1833 		return ENXIO;
1834 	/* yes we should, see if we have space to attach entries */
1835 	needed = 2; /* in some cases we only need 1 */
1836 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
1837 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
1838 		return EINVAL;
1839 	}
1840 	/* record the next two ports available, but do not allocate yet */
1841 	cand = b->bdg_port_index[b->bdg_active_ports];
1842 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
1843 	ND("+++ bridge %s port %s used %d avail %d %d",
1844 		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
1845 
1846 	/*
1847 	 * try to see if there is a matching NIC with this name
1848 	 * (after the bridge's name)
1849 	 */
1850 	iter = ifunit_ref(name + b->bdg_namelen + 1);
1851 	if (!iter) { /* this is a virtual port */
1852 		/* Create a temporary NA with arguments, then
1853 		 * bdg_netmap_attach() will allocate the real one
1854 		 * and attach it to the ifp
1855 		 */
1856 		struct netmap_adapter tmp_na;
1857 
1858 		if (nmr->nr_cmd) {
1859 			/* nr_cmd must be 0 for a virtual port */
1860 			return EINVAL;
1861 		}
1862 		bzero(&tmp_na, sizeof(tmp_na));
1863 		/* bound checking */
1864 		tmp_na.num_tx_rings = nmr->nr_tx_rings;
1865 		nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1866 		nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back
1867 		tmp_na.num_rx_rings = nmr->nr_rx_rings;
1868 		nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1869 		nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back
1870 		nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1871 				1, NM_BDG_MAXSLOTS, NULL);
1872 		tmp_na.num_tx_desc = nmr->nr_tx_slots;
1873 		nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1874 				1, NM_BDG_MAXSLOTS, NULL);
1875 		tmp_na.num_rx_desc = nmr->nr_rx_slots;
1876 
1877 		/* create a struct ifnet for the new port.
1878 		 * need M_NOWAIT as we are under NMG_LOCK
1879 		 */
1880 		iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO);
1881 		if (!iter)
1882 			return ENOMEM;
1883 
1884 		strcpy(iter->if_xname, name);
1885 		tmp_na.ifp = iter;
1886 		/* bdg_netmap_attach creates a struct netmap_adapter */
1887 		bdg_netmap_attach(&tmp_na);
1888 		cand2 = -1;	/* only need one port */
1889 	} else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */
1890 		/* make sure the NIC is not already in use */
1891 		if (NETMAP_OWNED_BY_ANY(iter)) {
1892 			D("NIC %s busy, cannot attach to bridge",
1893 				iter->if_xname);
1894 			if_rele(iter); /* don't detach from bridge */
1895 			return EINVAL;
1896 		}
1897 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
1898 			cand2 = -1; /* only need one port */
1899 	} else { /* not a netmap-capable NIC */
1900 		if_rele(iter); /* don't detach from bridge */
1901 		return EINVAL;
1902 	}
1903 	na = NA(iter);
1904 
1905 	BDG_WLOCK(b);
1906 	na->bdg_port = cand;
1907 	ND("NIC  %p to bridge port %d", NA(iter), cand);
1908 	/* bind the port to the bridge (virtual ports are not active) */
1909 	b->bdg_ports[cand] = na;
1910 	na->na_bdg = b;
1911 	b->bdg_active_ports++;
1912 	if (cand2 >= 0) {
1913 		/* also bind the host stack to the bridge */
1914 		b->bdg_ports[cand2] = SWNA(iter);
1915 		SWNA(iter)->bdg_port = cand2;
1916 		SWNA(iter)->na_bdg = b;
1917 		b->bdg_active_ports++;
1918 		ND("host %p to bridge port %d", SWNA(iter), cand2);
1919 	}
1920 	ADD_BDG_REF(iter);	// XXX one or two ?
1921 	ND("if %s refs %d", name, NA(iter)->na_bdg_refcount);
1922 	BDG_WUNLOCK(b);
1923 	*ifp = iter;
1924 	return 0;
1925 
1926 no_bridge_port:
1927 	*ifp = iter;
1928 	if (! *ifp)
1929 		*ifp = ifunit_ref(name);
1930 	if (*ifp == NULL)
1931 		return (ENXIO);
1932 
1933 	if (NETMAP_CAPABLE(*ifp)) {
1934 		/* Users cannot use the NIC attached to a bridge directly */
1935 		if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) {
1936 			if_rele(*ifp); /* don't detach from bridge */
1937 			return EINVAL;
1938 		} else
1939 			return 0;	/* valid pointer, we hold the refcount */
1940 	}
1941 	nm_if_rele(*ifp);
1942 	return EINVAL;	// not NETMAP capable
1943 }
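
/*
 * A minimal sketch (illustrative only) of the typical caller pattern
 * for get_ifp(), mirroring what the ioctl handlers below do.  The
 * reference must be returned with nm_if_rele() and the whole sequence
 * must run under NMG_LOCK():
 *
 *	struct ifnet *ifp;
 *	int error;
 *
 *	NMG_LOCK();
 *	error = get_ifp(nmr, &ifp, 0);	// 0: do not create the port
 *	if (error == 0) {
 *		// ... use NA(ifp) ...
 *		nm_if_rele(ifp);	// return the refcount
 *	}
 *	NMG_UNLOCK();
 */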
1944 
1945 
1946 /*
1947  * Error routine called when txsync/rxsync detects an error.
1948  * Can't do much more than resetting cur = hwcur, avail = hwavail.
1949  * Return 1 on reinit.
1950  *
1951  * This routine is only called by the upper half of the kernel.
1952  * It only reads hwcur (which is changed only by the upper half, too)
1953  * and hwavail (which may be changed by the lower half, but only on
1954  * a tx ring and only to increase it, so any error will be recovered
1955  * on the next call). For these reasons, we don't strictly need to
1956  * call it under lock.
1957  */
1958 int
1959 netmap_ring_reinit(struct netmap_kring *kring)
1960 {
1961 	struct netmap_ring *ring = kring->ring;
1962 	u_int i, lim = kring->nkr_num_slots - 1;
1963 	int errors = 0;
1964 
1965 	// XXX KASSERT nm_kr_tryget
1966 	RD(10, "called for %s", kring->na->ifp->if_xname);
1967 	if (ring->cur > lim)
1968 		errors++;
1969 	for (i = 0; i <= lim; i++) {
1970 		u_int idx = ring->slot[i].buf_idx;
1971 		u_int len = ring->slot[i].len;
1972 		if (idx < 2 || idx >= netmap_total_buffers) {
1973 			if (!errors++)
1974 				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
1975 			ring->slot[i].buf_idx = 0;
1976 			ring->slot[i].len = 0;
1977 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1978 			ring->slot[i].len = 0;
1979 			if (!errors++)
1980 				D("bad len %d at slot %d idx %d",
1981 					len, i, idx);
1982 		}
1983 	}
1984 	if (errors) {
1985 		int pos = kring - kring->na->tx_rings;
1986 		int n = kring->na->num_tx_rings + 1;
1987 
1988 		RD(10, "total %d errors", errors);
1989 		errors++;
1990 		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1991 			kring->na->ifp->if_xname,
1992 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
1993 			ring->cur, kring->nr_hwcur,
1994 			ring->avail, kring->nr_hwavail);
1995 		ring->cur = kring->nr_hwcur;
1996 		ring->avail = kring->nr_hwavail;
1997 	}
1998 	return (errors ? 1 : 0);
1999 }
2000 
2001 
2002 /*
2003  * Set the ring ID. For devices with a single queue, a request
2004  * for all rings is the same as a single ring.
2005  */
2006 static int
2007 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
2008 {
2009 	struct ifnet *ifp = priv->np_ifp;
2010 	struct netmap_adapter *na = NA(ifp);
2011 	u_int i = ringid & NETMAP_RING_MASK;
2012 	/* initially (np_qfirst == np_qlast) we don't want to lock */
2013 	u_int lim = na->num_rx_rings;
2014 
2015 	if (na->num_tx_rings > lim)
2016 		lim = na->num_tx_rings;
2017 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
2018 		D("invalid ring id %d", i);
2019 		return (EINVAL);
2020 	}
2021 	priv->np_ringid = ringid;
2022 	if (ringid & NETMAP_SW_RING) {
2023 		priv->np_qfirst = NETMAP_SW_RING;
2024 		priv->np_qlast = 0;
2025 	} else if (ringid & NETMAP_HW_RING) {
2026 		priv->np_qfirst = i;
2027 		priv->np_qlast = i + 1;
2028 	} else {
2029 		priv->np_qfirst = 0;
2030 		priv->np_qlast = NETMAP_HW_RING;
2031 	}
2032 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
2033     if (netmap_verbose) {
2034 	if (ringid & NETMAP_SW_RING)
2035 		D("ringid %s set to SW RING", ifp->if_xname);
2036 	else if (ringid & NETMAP_HW_RING)
2037 		D("ringid %s set to HW RING %d", ifp->if_xname,
2038 			priv->np_qfirst);
2039 	else
2040 		D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim);
2041     }
2042 	return 0;
2043 }
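
/*
 * Illustrative examples (not compiled here) of the nr_ringid encodings
 * accepted above, as a userspace program would set them in struct nmreq
 * before NIOCREGIF:
 *
 *	nmr.nr_ringid = 0;			// bind all hw rings
 *	nmr.nr_ringid = NETMAP_HW_RING | 2;	// bind only hw ring 2
 *	nmr.nr_ringid = NETMAP_SW_RING;		// bind the host (sw) ring
 *	nmr.nr_ringid |= NETMAP_NO_TX_POLL;	// do not txsync on poll()
 */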
2044 
2045 
2046 /*
2047  * possibly move the interface to netmap-mode.
2048  * On success it returns a pointer to the netmap_if, otherwise NULL.
2049  * This must be called with NMG_LOCK held.
2050  */
2051 static struct netmap_if *
2052 netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp,
2053 	uint16_t ringid, int *err)
2054 {
2055 	struct netmap_adapter *na = NA(ifp);
2056 	struct netmap_if *nifp = NULL;
2057 	int error, need_mem;
2058 
2059 	NMG_LOCK_ASSERT();
2060 	/* ring configuration may have changed, fetch from the card */
2061 	netmap_update_config(na);
2062 	priv->np_ifp = ifp;     /* store the reference */
2063 	error = netmap_set_ringid(priv, ringid);
2064 	if (error)
2065 		goto out;
2066 	/* ensure allocators are ready */
2067 	need_mem = !netmap_have_memory_locked(priv);
2068 	if (need_mem) {
2069 		error = netmap_get_memory_locked(priv);
2070 		ND("get_memory returned %d", error);
2071 		if (error)
2072 			goto out;
2073 	}
2074 	nifp = netmap_if_new(ifp->if_xname, na);
2075 	if (nifp == NULL) { /* allocation failed */
2076 		/* we should drop the allocator, but only
2077 		 * if we were the ones who grabbed it
2078 		 */
2079 		if (need_mem)
2080 			netmap_drop_memory_locked(priv);
2081 		error = ENOMEM;
2082 		goto out;
2083 	}
2084 	na->refcount++;
2085 	if (ifp->if_capenable & IFCAP_NETMAP) {
2086 		/* was already set */
2087 	} else {
2088 		u_int i;
2089 		/* Otherwise set the card in netmap mode
2090 		 * and make it use the shared buffers.
2091 		 *
2092 		 * If the interface is attached to a bridge, lock it.
2093 		 */
2094 		if (NETMAP_OWNED_BY_KERN(ifp))
2095 			BDG_WLOCK(NA(ifp)->na_bdg);
2096 		for (i = 0 ; i < na->num_tx_rings + 1; i++)
2097 			mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock",
2098 			    NULL, MTX_DEF);
2099 		for (i = 0 ; i < na->num_rx_rings + 1; i++) {
2100 			mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock",
2101 			    NULL, MTX_DEF);
2102 		}
2103 		if (nma_is_hw(na)) {
2104 			SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings];
2105 			SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings];
2106 		}
2107 		/*
2108 		 * do not core lock because the race is harmless here,
2109 		 * there cannot be any traffic to netmap_transmit()
2110 		 */
2111 		error = na->nm_register(ifp, 1); /* mode on */
2112 		// XXX do we need to nm_alloc_bdgfwd() in all cases ?
2113 		if (!error)
2114 			error = nm_alloc_bdgfwd(na);
2115 		if (error) {
2116 			netmap_do_unregif(priv, nifp);
2117 			nifp = NULL;
2118 		}
2119 		if (NETMAP_OWNED_BY_KERN(ifp))
2120 			BDG_WUNLOCK(NA(ifp)->na_bdg);
2121 
2122 	}
2123 out:
2124 	*err = error;
2125 	if (nifp != NULL) {
2126 		/*
2127 		 * advertise that the interface is ready by setting np_nifp.
2128 		 * The barrier is needed because readers (poll and *SYNC)
2129 		 * check for priv->np_nifp != NULL without locking
2130 		 */
2131 		wmb(); /* make sure previous writes are visible to all CPUs */
2132 		priv->np_nifp = nifp;
2133 	}
2134 	return nifp;
2135 }
2136 
2137 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
2138 static int
2139 nm_bdg_attach(struct nmreq *nmr)
2140 {
2141 	struct ifnet *ifp;
2142 	struct netmap_if *nifp;
2143 	struct netmap_priv_d *npriv;
2144 	int error;
2145 
2146 	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
2147 	if (npriv == NULL)
2148 		return ENOMEM;
2149 	NMG_LOCK();
2150 	error = get_ifp(nmr, &ifp, 1 /* create if not exists */);
2151 	if (error) /* no device, or another bridge or user owns the device */
2152 		goto unlock_exit;
2153 	/* get_ifp() sets na_bdg if this is a physical interface
2154 	 * that we can attach to a switch.
2155 	 */
2156 	if (!NETMAP_OWNED_BY_KERN(ifp)) {
2157 		/* got reference to a virtual port or direct access to a NIC.
2158 		 * perhaps the user specified no bridge prefix or a wrong NIC name
2159 		 */
2160 		error = EINVAL;
2161 		goto unref_exit;
2162 	}
2163 
2164 	if (NA(ifp)->refcount > 0) { /* already registered */
2165 		error = EBUSY;
2166 		DROP_BDG_REF(ifp);
2167 		goto unlock_exit;
2168 	}
2169 
2170 	nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
2171 	if (!nifp) {
2172 		goto unref_exit;
2173 	}
2174 
2175 	NA(ifp)->na_kpriv = npriv;
2176 	NMG_UNLOCK();
2177 	ND("registered %s to netmap-mode", ifp->if_xname);
2178 	return 0;
2179 
2180 unref_exit:
2181 	nm_if_rele(ifp);
2182 unlock_exit:
2183 	NMG_UNLOCK();
2184 	bzero(npriv, sizeof(*npriv));
2185 	free(npriv, M_DEVBUF);
2186 	return error;
2187 }
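
/*
 * A minimal userspace sketch (illustrative only) of how nm_bdg_attach()
 * above is reached.  The port name is just an example and assumes the
 * usual NM_NAME ("vale") prefix; fd is an open /dev/netmap descriptor:
 *
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	nmr.nr_arg1 = NETMAP_BDG_HOST;	// optional: also attach the host rings
 *	if (ioctl(fd, NIOCREGIF, &nmr) < 0)
 *		perror("NETMAP_BDG_ATTACH");
 */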
2188 
2189 static int
2190 nm_bdg_detach(struct nmreq *nmr)
2191 {
2192 	struct ifnet *ifp;
2193 	int error;
2194 	int last_instance;
2195 
2196 	NMG_LOCK();
2197 	error = get_ifp(nmr, &ifp, 0 /* don't create */);
2198 	if (error) { /* no device, or another bridge or user owns the device */
2199 		goto unlock_exit;
2200 	}
2201 	/* XXX do we need to check this ? */
2202 	if (!NETMAP_OWNED_BY_KERN(ifp)) {
2203 		/* got reference to a virtual port or direct access to a NIC.
2204 		 * perhaps the user specified no bridge prefix or a wrong NIC name
2205 		 */
2206 		error = EINVAL;
2207 		goto unref_exit;
2208 	}
2209 
2210 	if (NA(ifp)->refcount == 0) { /* not registered */
2211 		error = EINVAL;
2212 		goto unref_exit;
2213 	}
2214 
2215 	DROP_BDG_REF(ifp); /* the one from get_ifp */
2216 	last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */
2217 	NMG_UNLOCK();
2218 	if (!last_instance) {
2219 		D("--- error, trying to detach an entry with active mmaps");
2220 		error = EINVAL;
2221 	} else {
2222 		struct netmap_priv_d *npriv = NA(ifp)->na_kpriv;
2223 		NA(ifp)->na_kpriv = NULL;
2224 
2225 		bzero(npriv, sizeof(*npriv));
2226 		free(npriv, M_DEVBUF);
2227 	}
2228 	return error;
2229 
2230 unref_exit:
2231 	nm_if_rele(ifp);
2232 unlock_exit:
2233 	NMG_UNLOCK();
2234 	return error;
2235 }
2236 
2237 
2238 /* Initialize the necessary fields of the sw adapter, which is located
2239  * right after the hw one.  The sw adapter attaches the pair of sw rings
2240  * of the netmap-mode NIC.  It is always activated and deactivated
2241  * together with the hw adapter, so we do not need refcounting on it.
2242  * Regardless of the NIC's features we use a separate lock so that it
2243  * can be locked independently from the hw adapter.
2244  * Keep nm_register NULL so that nma_is_hw() evaluates to FALSE here.
2245  */
2246 static void
2247 netmap_attach_sw(struct ifnet *ifp)
2248 {
2249 	struct netmap_adapter *hw_na = NA(ifp);
2250 	struct netmap_adapter *na = SWNA(ifp);
2251 
2252 	na->ifp = ifp;
2253 	na->num_rx_rings = na->num_tx_rings = 1;
2254 	na->num_tx_desc = hw_na->num_tx_desc;
2255 	na->num_rx_desc = hw_na->num_rx_desc;
2256 	na->nm_txsync = netmap_bdg_to_host;
2257 	/* we use the same memory allocator as the
2258 	 * hw adapter */
2259 	na->nm_mem = hw_na->nm_mem;
2260 }
2261 
2262 
2263 /* exported to kernel callers, e.g. OVS ?
2264  * Entry point.
2265  * Called without NMG_LOCK.
2266  */
2267 int
2268 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
2269 {
2270 	struct nm_bridge *b;
2271 	struct netmap_adapter *na;
2272 	struct ifnet *iter;
2273 	char *name = nmr->nr_name;
2274 	int cmd = nmr->nr_cmd, namelen = strlen(name);
2275 	int error = 0, i, j;
2276 
2277 	switch (cmd) {
2278 	case NETMAP_BDG_ATTACH:
2279 		error = nm_bdg_attach(nmr);
2280 		break;
2281 
2282 	case NETMAP_BDG_DETACH:
2283 		error = nm_bdg_detach(nmr);
2284 		break;
2285 
2286 	case NETMAP_BDG_LIST:
2287 		/* this is used to enumerate bridges and ports */
2288 		if (namelen) { /* look up indexes of bridge and port */
2289 			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
2290 				error = EINVAL;
2291 				break;
2292 			}
2293 			NMG_LOCK();
2294 			b = nm_find_bridge(name, 0 /* don't create */);
2295 			if (!b) {
2296 				error = ENOENT;
2297 				NMG_UNLOCK();
2298 				break;
2299 			}
2300 
2301 			error = ENOENT;
2302 			for (j = 0; j < b->bdg_active_ports; j++) {
2303 				i = b->bdg_port_index[j];
2304 				na = b->bdg_ports[i];
2305 				if (na == NULL) {
2306 					D("---AAAAAAAAARGH-------");
2307 					continue;
2308 				}
2309 				iter = na->ifp;
2310 				/* the former and the latter identify a
2311 				 * virtual port and a NIC, respectively
2312 				 */
2313 				if (!strcmp(iter->if_xname, name) ||
2314 				    (namelen > b->bdg_namelen &&
2315 				    !strcmp(iter->if_xname,
2316 				    name + b->bdg_namelen + 1))) {
2317 					/* bridge index */
2318 					nmr->nr_arg1 = b - nm_bridges;
2319 					nmr->nr_arg2 = i; /* port index */
2320 					error = 0;
2321 					break;
2322 				}
2323 			}
2324 			NMG_UNLOCK();
2325 		} else {
2326 			/* return the first non-empty entry starting from
2327 			 * bridge nr_arg1 and port nr_arg2.
2328 			 *
2329 			 * Users can detect the end of the same bridge by
2330 			 * comparing the new and old values of nr_arg1, and can
2331 			 * detect the end of all the bridges by error != 0
2332 			 */
2333 			i = nmr->nr_arg1;
2334 			j = nmr->nr_arg2;
2335 
2336 			NMG_LOCK();
2337 			for (error = ENOENT; i < NM_BRIDGES; i++) {
2338 				b = nm_bridges + i;
2339 				if (j >= b->bdg_active_ports) {
2340 					j = 0; /* following bridges scan from 0 */
2341 					continue;
2342 				}
2343 				nmr->nr_arg1 = i;
2344 				nmr->nr_arg2 = j;
2345 				j = b->bdg_port_index[j];
2346 				na = b->bdg_ports[j];
2347 				iter = na->ifp;
2348 				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
2349 				error = 0;
2350 				break;
2351 			}
2352 			NMG_UNLOCK();
2353 		}
2354 		break;
2355 
2356 	case NETMAP_BDG_LOOKUP_REG:
2357 		/* register a lookup function to the given bridge.
2358 		 * nmr->nr_name may be just the bridge's name (including ':'
2359 		 * if it is not just NM_NAME).
2360 		 */
2361 		if (!func) {
2362 			error = EINVAL;
2363 			break;
2364 		}
2365 		NMG_LOCK();
2366 		b = nm_find_bridge(name, 0 /* don't create */);
2367 		if (!b) {
2368 			error = EINVAL;
2369 		} else {
2370 			b->nm_bdg_lookup = func;
2371 		}
2372 		NMG_UNLOCK();
2373 		break;
2374 
2375 	default:
2376 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
2377 		error = EINVAL;
2378 		break;
2379 	}
2380 	return error;
2381 }
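
/*
 * A minimal userspace sketch (illustrative only) of the NETMAP_BDG_LIST
 * enumeration protocol handled above: start with an empty name and
 * nr_arg1 = nr_arg2 = 0, then advance nr_arg2 after each hit; the scan
 * ends when the ioctl fails.  fd is an open /dev/netmap descriptor:
 *
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	nmr.nr_cmd = NETMAP_BDG_LIST;
 *	while (ioctl(fd, NIOCGINFO, &nmr) == 0) {
 *		printf("bridge %d port %d name %s\n",
 *		    nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
 *		nmr.nr_arg2++;		// next port (kernel skips to next bridge)
 *		nmr.nr_name[0] = '\0';	// keep enumerating, do not look up by name
 *	}
 */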
2382 
2383 
2384 /*
2385  * ioctl(2) support for the "netmap" device.
2386  *
2387  * Following a list of accepted commands:
2388  * - NIOCGINFO
2389  * - SIOCGIFADDR	just for convenience
2390  * - NIOCREGIF
2391  * - NIOCUNREGIF
2392  * - NIOCTXSYNC
2393  * - NIOCRXSYNC
2394  *
2395  * Return 0 on success, errno otherwise.
2396  */
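
/*
 * A minimal userspace sketch (illustrative only) of the command flow
 * above.  The interface name is an example, and the NETMAP_IF() and
 * NETMAP_TXRING()/NETMAP_RXRING() helpers are assumed to come from
 * netmap_user.h:
 *
 *	struct nmreq nmr;
 *	struct netmap_if *nifp;
 *	void *mem;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "em1", sizeof(nmr.nr_name) - 1);
 *	ioctl(fd, NIOCGINFO, &nmr);	// query rings, slots and memsize
 *	ioctl(fd, NIOCREGIF, &nmr);	// switch the NIC to netmap mode
 *	mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	// ... access NETMAP_TXRING(nifp, i) / NETMAP_RXRING(nifp, i),
 *	// then use NIOCTXSYNC/NIOCRXSYNC or poll() to sync with the kernel.
 */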
2397 static int
2398 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2399 	int fflag, struct thread *td)
2400 {
2401 	struct netmap_priv_d *priv = NULL;
2402 	struct ifnet *ifp = NULL;
2403 	struct nmreq *nmr = (struct nmreq *) data;
2404 	struct netmap_adapter *na = NULL;
2405 	int error;
2406 	u_int i, lim;
2407 	struct netmap_if *nifp;
2408 	struct netmap_kring *krings;
2409 
2410 	(void)dev;	/* UNUSED */
2411 	(void)fflag;	/* UNUSED */
2412 #ifdef linux
2413 #define devfs_get_cdevpriv(pp)				\
2414 	({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; 	\
2415 		(*pp ? 0 : ENOENT); })
2416 
2417 /* devfs_set_cdevpriv cannot fail on linux */
2418 #define devfs_set_cdevpriv(p, fn)				\
2419 	({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
2420 
2421 
2422 #define devfs_clear_cdevpriv()	do {				\
2423 		netmap_dtor(priv); ((struct file *)td)->private_data = 0;	\
2424 	} while (0)
2425 #endif /* linux */
2426 
2427 	CURVNET_SET(TD_TO_VNET(td));
2428 
2429 	error = devfs_get_cdevpriv((void **)&priv);
2430 	if (error) {
2431 		CURVNET_RESTORE();
2432 		/* XXX ENOENT should be impossible, since the priv
2433 		 * is now created in the open */
2434 		return (error == ENOENT ? ENXIO : error);
2435 	}
2436 
2437 	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
2438 	switch (cmd) {
2439 	case NIOCGINFO:		/* return capabilities etc */
2440 		if (nmr->nr_version != NETMAP_API) {
2441 #ifdef TEST_STUFF
2442 			/* some test code for locks etc */
2443 			if (nmr->nr_version == 666) {
2444 				error = nm_test(nmr);
2445 				break;
2446 			}
2447 #endif /* TEST_STUFF */
2448 			D("API mismatch got %d have %d",
2449 				nmr->nr_version, NETMAP_API);
2450 			nmr->nr_version = NETMAP_API;
2451 			error = EINVAL;
2452 			break;
2453 		}
2454 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2455 			error = netmap_bdg_ctl(nmr, NULL);
2456 			break;
2457 		}
2458 
2459 		NMG_LOCK();
2460 		do {
2461 			/* memsize is always valid */
2462 			struct netmap_mem_d *nmd = &nm_mem;
2463 			u_int memflags;
2464 
2465 			if (nmr->nr_name[0] != '\0') {
2466 				/* get a refcount */
2467 				error = get_ifp(nmr, &ifp, 1 /* create */);
2468 				if (error)
2469 					break;
2470 				na = NA(ifp);  /* retrieve the netmap adapter */
2471 				nmd = na->nm_mem; /* and its memory allocator */
2472 			}
2473 
2474 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
2475 			if (error)
2476 				break;
2477 			if (na == NULL) /* only memory info */
2478 				break;
2479 			nmr->nr_offset = 0;
2480 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2481 			netmap_update_config(na);
2482 			nmr->nr_rx_rings = na->num_rx_rings;
2483 			nmr->nr_tx_rings = na->num_tx_rings;
2484 			nmr->nr_rx_slots = na->num_rx_desc;
2485 			nmr->nr_tx_slots = na->num_tx_desc;
2486 			if (memflags & NETMAP_MEM_PRIVATE)
2487 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
2488 		} while (0);
2489 		if (ifp)
2490 			nm_if_rele(ifp);	/* return the refcount */
2491 		NMG_UNLOCK();
2492 		break;
2493 
2494 	case NIOCREGIF:
2495 		if (nmr->nr_version != NETMAP_API) {
2496 			nmr->nr_version = NETMAP_API;
2497 			error = EINVAL;
2498 			break;
2499 		}
2500 		/* possibly attach/detach NIC and VALE switch */
2501 		i = nmr->nr_cmd;
2502 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
2503 			error = netmap_bdg_ctl(nmr, NULL);
2504 			break;
2505 		} else if (i != 0) {
2506 			D("nr_cmd must be 0 not %d", i);
2507 			error = EINVAL;
2508 			break;
2509 		}
2510 
2511 		/* protect access to priv from concurrent NIOCREGIF */
2512 		NMG_LOCK();
2513 		do {
2514 			u_int memflags;
2515 
2516 			if (priv->np_ifp != NULL) {	/* thread already registered */
2517 				error = netmap_set_ringid(priv, nmr->nr_ringid);
2518 				break;
2519 			}
2520 			/* find the interface and a reference */
2521 			error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */
2522 			if (error)
2523 				break;
2524 			if (NETMAP_OWNED_BY_KERN(ifp)) {
2525 				nm_if_rele(ifp);
2526 				error = EBUSY;
2527 				break;
2528 			}
2529 			nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
2530 			if (!nifp) {    /* reg. failed, release priv and ref */
2531 				nm_if_rele(ifp);        /* return the refcount */
2532 				priv->np_ifp = NULL;
2533 				priv->np_nifp = NULL;
2534 				break;
2535 			}
2536 
2537 			/* return the offset of the netmap_if object */
2538 			na = NA(ifp); /* retrieve netmap adapter */
2539 			nmr->nr_rx_rings = na->num_rx_rings;
2540 			nmr->nr_tx_rings = na->num_tx_rings;
2541 			nmr->nr_rx_slots = na->num_rx_desc;
2542 			nmr->nr_tx_slots = na->num_tx_desc;
2543 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
2544 			if (error) {
2545 				nm_if_rele(ifp);
2546 				break;
2547 			}
2548 			if (memflags & NETMAP_MEM_PRIVATE) {
2549 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
2550 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2551 			}
2552 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2553 		} while (0);
2554 		NMG_UNLOCK();
2555 		break;
2556 
2557 	case NIOCUNREGIF:
2558 		// XXX we have no data here ?
2559 		D("deprecated, data is %p", nmr);
2560 		error = EINVAL;
2561 		break;
2562 
2563 	case NIOCTXSYNC:
2564 	case NIOCRXSYNC:
2565 		nifp = priv->np_nifp;
2566 
2567 		if (nifp == NULL) {
2568 			error = ENXIO;
2569 			break;
2570 		}
2571 		rmb(); /* make sure following reads are not from cache */
2572 
2573 		ifp = priv->np_ifp;	/* we have a reference */
2574 
2575 		if (ifp == NULL) {
2576 			D("Internal error: nifp != NULL && ifp == NULL");
2577 			error = ENXIO;
2578 			break;
2579 		}
2580 
2581 		na = NA(ifp); /* retrieve netmap adapter */
2582 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
2583 			if (cmd == NIOCTXSYNC)
2584 				netmap_txsync_to_host(na);
2585 			else
2586 				netmap_rxsync_from_host(na, NULL, NULL);
2587 			break;
2588 		}
2589 		/* find the last ring to scan */
2590 		lim = priv->np_qlast;
2591 		if (lim == NETMAP_HW_RING)
2592 			lim = (cmd == NIOCTXSYNC) ?
2593 			    na->num_tx_rings : na->num_rx_rings;
2594 
2595 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
2596 		for (i = priv->np_qfirst; i < lim; i++) {
2597 			struct netmap_kring *kring = krings + i;
2598 			if (nm_kr_tryget(kring)) {
2599 				error = EBUSY;
2600 				goto out;
2601 			}
2602 			if (cmd == NIOCTXSYNC) {
2603 				if (netmap_verbose & NM_VERB_TXSYNC)
2604 					D("pre txsync ring %d cur %d hwcur %d",
2605 					    i, kring->ring->cur,
2606 					    kring->nr_hwcur);
2607 				na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM);
2608 				if (netmap_verbose & NM_VERB_TXSYNC)
2609 					D("post txsync ring %d cur %d hwcur %d",
2610 					    i, kring->ring->cur,
2611 					    kring->nr_hwcur);
2612 			} else {
2613 				na->nm_rxsync(ifp, i, NAF_FORCE_READ);
2614 				microtime(&na->rx_rings[i].ring->ts);
2615 			}
2616 			nm_kr_put(kring);
2617 		}
2618 
2619 		break;
2620 
2621 #ifdef __FreeBSD__
2622 	case BIOCIMMEDIATE:
2623 	case BIOCGHDRCMPLT:
2624 	case BIOCSHDRCMPLT:
2625 	case BIOCSSEESENT:
2626 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2627 		break;
2628 
2629 	default:	/* allow device-specific ioctls */
2630 	    {
2631 		struct socket so;
2632 
2633 		bzero(&so, sizeof(so));
2634 		NMG_LOCK();
2635 		error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */
2636 		if (error) {
2637 			NMG_UNLOCK();
2638 			break;
2639 		}
2640 		so.so_vnet = ifp->if_vnet;
2641 		// so->so_proto not null.
2642 		error = ifioctl(&so, cmd, data, td);
2643 		nm_if_rele(ifp);
2644 		NMG_UNLOCK();
2645 		break;
2646 	    }
2647 
2648 #else /* linux */
2649 	default:
2650 		error = EOPNOTSUPP;
2651 #endif /* linux */
2652 	}
2653 out:
2654 
2655 	CURVNET_RESTORE();
2656 	return (error);
2657 }
2658 
2659 
2660 /*
2661  * select(2) and poll(2) handlers for the "netmap" device.
2662  *
2663  * Can be called for one or more queues.
2664  * Return the event mask corresponding to ready events.
2665  * If there are no ready events, do a selrecord on either individual
2666  * selinfo or on the global one.
2667  * Device-dependent parts (locking and sync of tx/rx rings)
2668  * are done through callbacks.
2669  *
2670  * On linux, the arguments are really pwait, the poll table, and 'td' is
2671  * a struct file *. The first one is remapped to pwait as selrecord()
2672  * uses the name as a hidden argument.
2673  */
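
/*
 * A minimal userspace sketch (illustrative only) of how a program
 * typically waits on a registered netmap descriptor (fd is the same
 * descriptor used for NIOCREGIF):
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = fd;
 *	pfd.events = POLLIN;	// or POLLOUT to wait for tx space
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
 *		// rx rings have been synced, consume up to ring->avail slots
 *	}
 */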
2674 static int
2675 netmap_poll(struct cdev *dev, int events, struct thread *td)
2676 {
2677 	struct netmap_priv_d *priv = NULL;
2678 	struct netmap_adapter *na;
2679 	struct ifnet *ifp;
2680 	struct netmap_kring *kring;
2681 	u_int i, check_all, want_tx, want_rx, revents = 0;
2682 	u_int lim_tx, lim_rx, host_forwarded = 0;
2683 	struct mbq q = { NULL, NULL, 0 };
2684 	void *pwait = dev;	/* linux compatibility */
2685 
2686 	int retry_tx = 1;
2687 
2688 	(void)pwait;
2689 
2690 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
2691 		return POLLERR;
2692 
2693 	if (priv->np_nifp == NULL) {
2694 		D("No if registered");
2695 		return POLLERR;
2696 	}
2697 	rmb(); /* make sure following reads are not from cache */
2698 
2699 	ifp = priv->np_ifp;
2700 	// XXX check for deleting() ?
2701 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
2702 		return POLLERR;
2703 
2704 	if (netmap_verbose & 0x8000)
2705 		D("device %s events 0x%x", ifp->if_xname, events);
2706 	want_tx = events & (POLLOUT | POLLWRNORM);
2707 	want_rx = events & (POLLIN | POLLRDNORM);
2708 
2709 	na = NA(ifp); /* retrieve netmap adapter */
2710 
2711 	lim_tx = na->num_tx_rings;
2712 	lim_rx = na->num_rx_rings;
2713 
2714 	if (priv->np_qfirst == NETMAP_SW_RING) {
2715 		/* handle the host stack ring */
2716 		if (priv->np_txpoll || want_tx) {
2717 			/* push any packets up, then we are always ready */
2718 			netmap_txsync_to_host(na);
2719 			revents |= want_tx;
2720 		}
2721 		if (want_rx) {
2722 			kring = &na->rx_rings[lim_rx];
2723 			if (kring->ring->avail == 0)
2724 				netmap_rxsync_from_host(na, td, dev);
2725 			if (kring->ring->avail > 0) {
2726 				revents |= want_rx;
2727 			}
2728 		}
2729 		return (revents);
2730 	}
2731 
2732 	/* if we are in transparent mode, check also the host rx ring */
2733 	kring = &na->rx_rings[lim_rx];
2734 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
2735 			&& want_rx
2736 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
2737 		if (kring->ring->avail == 0)
2738 			netmap_rxsync_from_host(na, td, dev);
2739 		if (kring->ring->avail > 0)
2740 			revents |= want_rx;
2741 	}
2742 
2743 	/*
2744 	 * check_all is set if the card has more than one queue AND
2745 	 * the client is polling all of them. If true, we sleep on
2746 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2747 	 * (FreeBSD only allows two selinfo's per file descriptor).
2748 	 * The interrupt routine in the driver wakes one or the other
2749 	 * (or both) depending on which clients are active.
2750 	 *
2751 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2752 	 * txsync() is called if we run out of buffers on POLLOUT, or
2753 	 * there are pending packets to send. The latter can be disabled
2754 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2755 	 */
2756 	check_all = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1 || lim_rx > 1);
2757 
2758 	if (priv->np_qlast != NETMAP_HW_RING) {
2759 		lim_tx = lim_rx = priv->np_qlast;
2760 	}
2761 
2762 	/*
2763 	 * We start with a lock-free round, which is good if we have
2764 	 * data available. If this fails, then lock and call the sync
2765 	 * routines.
2766 	 */
2767 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
2768 		kring = &na->rx_rings[i];
2769 		if (kring->ring->avail > 0) {
2770 			revents |= want_rx;
2771 			want_rx = 0;	/* also breaks the loop */
2772 		}
2773 	}
2774 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
2775 		kring = &na->tx_rings[i];
2776 		if (kring->ring->avail > 0) {
2777 			revents |= want_tx;
2778 			want_tx = 0;	/* also breaks the loop */
2779 		}
2780 	}
2781 
2782 	/*
2783 	 * If we need to push packets out (priv->np_txpoll) or want_tx is
2784 	 * still set, we do need to run the txsync calls (on all rings,
2785 	 * to avoid that the tx rings stall).
2786 	 */
2787 	if (priv->np_txpoll || want_tx) {
2788 		/* If we really want to be woken up (want_tx),
2789 		 * do a selrecord, either on the global or on
2790 		 * the private structure.  Then issue the txsync
2791 		 * so there is no race in the selrecord/selwait
2792 		 */
2793 flush_tx:
2794 		for (i = priv->np_qfirst; i < lim_tx; i++) {
2795 			kring = &na->tx_rings[i];
2796 			/*
2797 			 * Skip this ring if want_tx == 0
2798 			 * (we have already done a successful sync on
2799 			 * a previous ring) AND kring->cur == kring->hwcur
2800 			 * (there are no pending transmissions for this ring).
2801 			 */
2802 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2803 				continue;
2804 			/* make sure only one user thread is doing this */
2805 			if (nm_kr_tryget(kring)) {
2806 				ND("ring %p busy is %d", kring, (int)kring->nr_busy);
2807 				revents |= POLLERR;
2808 				goto out;
2809 			}
2810 
2811 			if (netmap_verbose & NM_VERB_TXSYNC)
2812 				D("send %d on %s %d",
2813 					kring->ring->cur, ifp->if_xname, i);
2814 			if (na->nm_txsync(ifp, i, 0))
2815 				revents |= POLLERR;
2816 
2817 			/* Check avail/call selrecord only if called with POLLOUT */
2818 			if (want_tx) {
2819 				if (kring->ring->avail > 0) {
2820 					/* stop at the first ring. We don't risk
2821 					 * starvation.
2822 					 */
2823 					revents |= want_tx;
2824 					want_tx = 0;
2825 				}
2826 			}
2827 			nm_kr_put(kring);
2828 		}
2829 		if (want_tx && retry_tx) {
2830 			selrecord(td, check_all ?
2831 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
2832 			retry_tx = 0;
2833 			goto flush_tx;
2834 		}
2835 	}
2836 
2837 	/*
2838 	 * now if want_rx is still set we need to lock and rxsync.
2839 	 * Do it on all rings because otherwise we starve.
2840 	 */
2841 	if (want_rx) {
2842 		int retry_rx = 1;
2843 do_retry_rx:
2844 		for (i = priv->np_qfirst; i < lim_rx; i++) {
2845 			kring = &na->rx_rings[i];
2846 
2847 			if (nm_kr_tryget(kring)) {
2848 				revents |= POLLERR;
2849 				goto out;
2850 			}
2851 
2852 			/* XXX NR_FORWARD should only be read on
2853 			 * physical or NIC ports
2854 			 */
2855 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2856 				ND(10, "forwarding some buffers up %d to %d",
2857 				    kring->nr_hwcur, kring->ring->cur);
2858 				netmap_grab_packets(kring, &q, netmap_fwd);
2859 			}
2860 
2861 			if (na->nm_rxsync(ifp, i, 0))
2862 				revents |= POLLERR;
2863 			if (netmap_no_timestamp == 0 ||
2864 					kring->ring->flags & NR_TIMESTAMP) {
2865 				microtime(&kring->ring->ts);
2866 			}
2867 
2868 			if (kring->ring->avail > 0) {
2869 				revents |= want_rx;
2870 				retry_rx = 0;
2871 			}
2872 			nm_kr_put(kring);
2873 		}
2874 		if (retry_rx) {
2875 			retry_rx = 0;
2876 			selrecord(td, check_all ?
2877 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
2878 			goto do_retry_rx;
2879 		}
2880 	}
2881 
2882 	/* forward packets from the host ring to the NIC rings.
2883 	 * I am accessing nr_hwavail without lock, but netmap_transmit
2884 	 * can only increment it, so the operation is safe.
2885 	 */
2886 	kring = &na->rx_rings[lim_rx];
2887 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
2888 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD)
2889 			 && kring->nr_hwavail > 0 && !host_forwarded) {
2890 		netmap_sw_to_nic(na);
2891 		host_forwarded = 1; /* prevent another pass */
2892 		want_rx = 0;
2893 		goto flush_tx;
2894 	}
2895 
2896 	if (q.head)
2897 		netmap_send_up(na->ifp, q.head);
2898 
2899 out:
2900 
2901 	return (revents);
2902 }
2903 
2904 /*------- driver support routines ------*/
2905 
2906 
2907 /*
2908  * Initialize a ``netmap_adapter`` object created by driver on attach.
2909  * We allocate a block of memory with room for a struct netmap_adapter
2910  * plus two sets of N+2 struct netmap_kring (where N is the number
2911  * of hardware rings):
2912  * krings	0..N-1	are for the hardware queues.
2913  * kring	N	is for the host stack queue
2914  * kring	N+1	is only used for the selinfo for all queues.
2915  * Return 0 on success, ENOMEM otherwise.
2916  *
2917  * By default the receive and transmit adapter ring counts are both initialized
2918  * to num_queues.  na->num_tx_rings can be set for cards with different tx/rx
2919  * setups.
2920  */
2921 int
2922 netmap_attach(struct netmap_adapter *arg, u_int num_queues)
2923 {
2924 	struct netmap_adapter *na = NULL;
2925 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2926 	size_t len;
2927 
2928 	if (arg == NULL || ifp == NULL)
2929 		goto fail;
2930 	/* a VALE port uses two endpoints */
2931 	len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2;
2932 	na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
2933 	if (na == NULL)
2934 		goto fail;
2935 	WNA(ifp) = na;
2936 	*na = *arg; /* copy everything, trust the driver to not pass junk */
2937 	NETMAP_SET_CAPABLE(ifp);
2938 	if (na->num_tx_rings == 0)
2939 		na->num_tx_rings = num_queues;
2940 	na->num_rx_rings = num_queues;
2941 	na->refcount = na->na_single = na->na_multi = 0;
2942 	/* Core lock initialized here, others after netmap_if_new. */
2943 	mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF);
2944 #ifdef linux
2945 	if (ifp->netdev_ops) {
2946 		ND("netdev_ops %p", ifp->netdev_ops);
2947 		/* prepare a clone of the netdev ops */
2948 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2949 		na->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2950 #else
2951 		na->nm_ndo = *ifp->netdev_ops;
2952 #endif
2953 	}
2954 	na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2955 #endif /* linux */
2956 	na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem;
2957 	if (!nma_is_vp(arg))
2958 		netmap_attach_sw(ifp);
2959 	D("success for %s", ifp->if_xname);
2960 	return 0;
2961 
2962 fail:
2963 	D("fail, arg %p ifp %p na %p", arg, ifp, na);
2964 	netmap_detach(ifp);
2965 	return (na ? EINVAL : ENOMEM);
2966 }
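
/*
 * A minimal sketch (illustrative only) of the fields a physical device
 * driver typically fills before calling netmap_attach() from its own
 * attach routine.  'adapter' and the foo_netmap_* callbacks are
 * placeholders, not real symbols:
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = adapter->ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na, adapter->num_queues);
 */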
2967 
2968 
2969 /*
2970  * Free the allocated memory linked to the given ``netmap_adapter``
2971  * object.
2972  */
2973 void
2974 netmap_detach(struct ifnet *ifp)
2975 {
2976 	struct netmap_adapter *na = NA(ifp);
2977 
2978 	if (!na)
2979 		return;
2980 
2981 	mtx_destroy(&na->core_lock);
2982 
2983 	if (na->tx_rings) { /* XXX should not happen */
2984 		D("freeing leftover tx_rings");
2985 		free(na->tx_rings, M_DEVBUF);
2986 	}
2987 	if (na->na_flags & NAF_MEM_OWNER)
2988 		netmap_mem_private_delete(na->nm_mem);
2989 	bzero(na, sizeof(*na));
2990 	WNA(ifp) = NULL;
2991 	free(na, M_DEVBUF);
2992 }
2993 
2994 
2995 int
2996 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
2997 	struct netmap_adapter *na, u_int ring_nr);
2998 
2999 
3000 /*
3001  * Intercept packets from the network stack and pass them
3002  * to netmap as incoming packets on the 'software' ring.
3003  * We rely on the OS to make sure that the ifp and na do not go
3004  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3005  * In nm_register() or whenever there is a reinitialization,
3006  * we make sure to access the core lock and per-ring locks
3007  * so that IFCAP_NETMAP is visible here.
3008  */
3009 int
3010 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3011 {
3012 	struct netmap_adapter *na = NA(ifp);
3013 	struct netmap_kring *kring;
3014 	u_int i, len = MBUF_LEN(m);
3015 	u_int error = EBUSY, lim;
3016 	struct netmap_slot *slot;
3017 
3018 	// XXX [Linux] we do not need this lock
3019 	// if we follow the down/configure/up protocol -gl
3020 	// mtx_lock(&na->core_lock);
3021 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
3022 		/* interface not in netmap mode anymore */
3023 		error = ENXIO;
3024 		goto done;
3025 	}
3026 
3027 	kring = &na->rx_rings[na->num_rx_rings];
3028 	lim = kring->nkr_num_slots - 1;
3029 	if (netmap_verbose & NM_VERB_HOST)
3030 		D("%s packet %d len %d from the stack", ifp->if_xname,
3031 			kring->nr_hwcur + kring->nr_hwavail, len);
3032 	// XXX reconsider long packets if we handle fragments
3033 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
3034 		D("%s from_host, drop packet size %d > %d", ifp->if_xname,
3035 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
3036 		goto done;
3037 	}
3038 	if (SWNA(ifp)->na_bdg) {
3039 		struct nm_bdg_fwd *ft;
3040 		char *dst;
3041 
3042 		na = SWNA(ifp); /* we operate on the host port */
3043 		ft = na->rx_rings[0].nkr_ft;
3044 		dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]);
3045 
3046 		/* use slot 0 in the ft, there is nothing queued here */
3047 		/* XXX we can save the copy calling m_copydata in nm_bdg_flush,
3048 		 * need a special flag for this.
3049 		 */
3050 		m_copydata(m, 0, (int)len, dst);
3051 		ft->ft_flags = 0;
3052 		ft->ft_len = len;
3053 		ft->ft_buf = dst;
3054 		ft->ft_next = NM_FT_NULL;
3055 		ft->ft_frags = 1;
3056 		if (netmap_verbose & NM_VERB_HOST)
3057 			RD(5, "pkt %p size %d to bridge port %d",
3058 				dst, len, na->bdg_port);
3059 		nm_bdg_flush(ft, 1, na, 0);
3060 		na = NA(ifp);	/* back to the regular object/lock */
3061 		error = 0;
3062 		goto done;
3063 	}
3064 
3065 	/* protect against other instances of netmap_transmit,
3066 	 * and userspace invocations of rxsync().
3067 	 * XXX could reuse core_lock
3068 	 */
3069 	// XXX [Linux] there can be no other instances of netmap_transmit
3070 	// on this same ring, but we still need this lock to protect
3071 	// concurrent access from netmap_sw_to_nic() -gl
3072 	mtx_lock(&kring->q_lock);
3073 	if (kring->nr_hwavail >= lim) {
3074 		if (netmap_verbose)
3075 			D("stack ring %s full\n", ifp->if_xname);
3076 	} else {
3077 		/* compute the insert position */
3078 		i = nm_kr_rxpos(kring);
3079 		slot = &kring->ring->slot[i];
3080 		m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot));
3081 		slot->len = len;
3082 		slot->flags = kring->nkr_slot_flags;
3083 		kring->nr_hwavail++;
3084 		if (netmap_verbose  & NM_VERB_HOST)
3085 			D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings);
3086 		selwakeuppri(&kring->si, PI_NET);
3087 		error = 0;
3088 	}
3089 	mtx_unlock(&kring->q_lock);
3090 
3091 done:
3092 	// mtx_unlock(&na->core_lock);
3093 
3094 	/* release the mbuf in either case, success or failure. As an
3095 	 * alternative, put the mbuf in a free list and free the list
3096 	 * only when really necessary.
3097 	 */
3098 	m_freem(m);
3099 
3100 	return (error);
3101 }
3102 
3103 
3104 /*
3105  * netmap_reset() is called by the driver routines when reinitializing
3106  * a ring. The driver is in charge of locking to protect the kring.
3107  * If netmap mode is not set just return NULL.
3108  */
3109 struct netmap_slot *
3110 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3111 	u_int new_cur)
3112 {
3113 	struct netmap_kring *kring;
3114 	int new_hwofs, lim;
3115 
3116 	if (na == NULL) {
3117 		D("NULL na, should not happen");
3118 		return NULL;	/* no netmap support here */
3119 	}
3120 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
3121 		D("interface not in netmap mode");
3122 		return NULL;	/* nothing to reinitialize */
3123 	}
3124 
3125 	/* XXX note- in the new scheme, we are not guaranteed to be
3126 	 * under lock (e.g. when called on a device reset).
3127 	 * In this case, we should set a flag and not trust the
3128 	 * values too much. In practice: TODO
3129 	 * - set a RESET flag somewhere in the kring
3130 	 * - do the processing in a conservative way
3131 	 * - let the *sync() fixup at the end.
3132 	 */
3133 	if (tx == NR_TX) {
3134 		if (n >= na->num_tx_rings)
3135 			return NULL;
3136 		kring = na->tx_rings + n;
3137 		new_hwofs = kring->nr_hwcur - new_cur;
3138 	} else {
3139 		if (n >= na->num_rx_rings)
3140 			return NULL;
3141 		kring = na->rx_rings + n;
3142 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
3143 	}
3144 	lim = kring->nkr_num_slots - 1;
3145 	if (new_hwofs > lim)
3146 		new_hwofs -= lim + 1;
3147 
3148 	/* Always set the new offset value and realign the ring. */
3149 	D("%s hwofs %d -> %d, hwavail %d -> %d",
3150 		tx == NR_TX ? "TX" : "RX",
3151 		kring->nkr_hwofs, new_hwofs,
3152 		kring->nr_hwavail,
3153 		tx == NR_TX ? lim : kring->nr_hwavail);
3154 	kring->nkr_hwofs = new_hwofs;
3155 	if (tx == NR_TX)
3156 		kring->nr_hwavail = lim;
3157 
3158 #if 0 // def linux
3159 	/* XXX check that the mappings are correct */
3160 	/* need ring_nr, adapter->pdev, direction */
3161 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3162 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3163 		D("error mapping rx netmap buffer %d", i);
3164 		// XXX fix error handling
3165 	}
3166 
3167 #endif /* linux */
3168 	/*
3169 	 * Wakeup on the individual and global selwait
3170 	 * We do the wakeup here, but the ring is not yet reconfigured.
3171 	 * However, we are under lock so there are no races.
3172 	 */
3173 	selwakeuppri(&kring->si, PI_NET);
3174 	selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET);
3175 	return kring->ring->slot;
3176 }
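
/*
 * A minimal sketch (illustrative only) of how a NIC driver typically
 * uses netmap_reset() while (re)initializing an RX ring.  The buffer
 * address helper mentioned in the comment (e.g. PNMB()) is an
 * assumption, not dictated by this file:
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {	// the interface is in netmap mode
 *		for (l = 0; l < na->num_rx_desc; l++) {
 *			// program NIC descriptor l with the address of the
 *			// netmap buffer attached to slot[l] (e.g. via PNMB())
 *		}
 *	}
 */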
3177 
3178 
3179 /*
3180  * Grab packets from a kring, move them into the ft structure
3181  * associated with the tx (input) port. Max one instance per port,
3182  * filtered on input (ioctl, poll or XXX).
3183  * Returns the next position in the ring.
3184  */
3185 static int
3186 nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr,
3187 	struct netmap_kring *kring, u_int end)
3188 {
3189 	struct netmap_ring *ring = kring->ring;
3190 	struct nm_bdg_fwd *ft;
3191 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
3192 	u_int ft_i = 0;	/* start from 0 */
3193 	u_int frags = 1; /* how many frags ? */
3194 	struct nm_bridge *b = na->na_bdg;
3195 
3196 	/* To protect against modifications to the bridge we acquire a
3197 	 * shared lock, waiting if we can sleep (if the source port is
3198 	 * attached to a user process) or with a trylock otherwise (NICs).
3199 	 */
3200 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
3201 	if (na->na_flags & NAF_BDG_MAYSLEEP)
3202 		BDG_RLOCK(b);
3203 	else if (!BDG_RTRYLOCK(b))
3204 		return 0;
3205 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
3206 	ft = kring->nkr_ft;
3207 
3208 	for (; likely(j != end); j = nm_next(j, lim)) {
3209 		struct netmap_slot *slot = &ring->slot[j];
3210 		char *buf;
3211 
3212 		ft[ft_i].ft_len = slot->len;
3213 		ft[ft_i].ft_flags = slot->flags;
3214 
3215 		ND("flags is 0x%x", slot->flags);
3216 		/* this slot goes into a list so initialize the link field */
3217 		ft[ft_i].ft_next = NM_FT_NULL;
3218 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
3219 			(void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot);
3220 		prefetch(buf);
3221 		++ft_i;
3222 		if (slot->flags & NS_MOREFRAG) {
3223 			frags++;
3224 			continue;
3225 		}
3226 		if (unlikely(netmap_verbose && frags > 1))
3227 			RD(5, "%d frags at %d", frags, ft_i - frags);
3228 		ft[ft_i - frags].ft_frags = frags;
3229 		frags = 1;
3230 		if (unlikely((int)ft_i >= bridge_batch))
3231 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
3232 	}
3233 	if (frags > 1) {
3234 		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
3235 		// ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
3236 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
3237 		ft[ft_i - frags].ft_frags = frags - 1;
3238 	}
3239 	if (ft_i)
3240 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
3241 	BDG_RUNLOCK(b);
3242 	return j;
3243 }
3244 
3245 
3246 /*
3247  * Pass packets from nic to the bridge.
3248  * XXX TODO check locking: this is called from the interrupt
3249  * handler so we should make sure that the interface is not
3250  * disconnected while passing down an interrupt.
3251  *
3252  * Note, no user process can access this NIC so we can ignore
3253  * the info in the 'ring'.
3254  */
3255 static void
3256 netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr)
3257 {
3258 	struct netmap_adapter *na = NA(ifp);
3259 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
3260 	struct netmap_ring *ring = kring->ring;
3261 	u_int j, k;
3262 
3263 	/* make sure that only one thread is ever in here,
3264 	 * after which we can unlock. Probably unnecessary XXX.
3265 	 */
3266 	if (nm_kr_tryget(kring))
3267 		return;
3268 	/* fetch packets that have arrived.
3269 	 * XXX maybe do this in a loop ?
3270 	 */
3271 	if (na->nm_rxsync(ifp, ring_nr, 0))
3272 		goto put_out;
3273 	if (kring->nr_hwavail == 0 && netmap_verbose) {
3274 		D("how strange, interrupt with no packets on %s",
3275 			ifp->if_xname);
3276 		goto put_out;
3277 	}
3278 	k = nm_kr_rxpos(kring);
3279 
3280 	j = nm_bdg_preflush(na, ring_nr, kring, k);
3281 
3282 	/* we consume everything, but we cannot update kring directly
3283 	 * because the nic may have destroyed the info in the NIC ring.
3284 	 * So we need to call rxsync again to restore it.
3285 	 */
3286 	ring->cur = j;
3287 	ring->avail = 0;
3288 	na->nm_rxsync(ifp, ring_nr, 0);
3289 
3290 put_out:
3291 	nm_kr_put(kring);
3292 	return;
3293 }
3294 
3295 
3296 /*
3297  * Default functions to handle rx/tx interrupts from a physical device.
3298  * "work_done" is non-null on the RX path, NULL for the TX path.
3299  * We rely on the OS to make sure that there is only one active
3300  * instance per queue, and that there is appropriate locking.
3301  *
3302  * If the card is not in netmap mode, simply return 0,
3303  * so that the caller proceeds with regular processing.
3304  *
3305  * If the card is connected to a netmap file descriptor,
3306  * do a selwakeup on the individual queue, plus one on the global one
3307  * if needed (multiqueue card _and_ there are multiqueue listeners),
3308  * and return 1.
3309  *
3310  * Finally, if called on rx from an interface connected to a switch,
3311  * calls the proper forwarding routine, and return 1.
3312  */
3313 int
3314 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3315 {
3316 	struct netmap_adapter *na;
3317 	struct netmap_kring *kring;
3318 
3319 	if (!(ifp->if_capenable & IFCAP_NETMAP))
3320 		return 0;
3321 
3322 	q &= NETMAP_RING_MASK;
3323 
3324 	if (netmap_verbose)
3325 		RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3326 	na = NA(ifp);
3327 	if (na->na_flags & NAF_SKIP_INTR) {
3328 		ND("use regular interrupt");
3329 		return 0;
3330 	}
3331 
3332 	if (work_done) { /* RX path */
3333 		if (q >= na->num_rx_rings)
3334 			return 0;	// not a physical queue
3335 		kring = na->rx_rings + q;
3336 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3337 		if (na->na_bdg != NULL) {
3338 			netmap_nic_to_bdg(ifp, q);
3339 		} else {
3340 			selwakeuppri(&kring->si, PI_NET);
3341 			if (na->num_rx_rings > 1 /* or multiple listeners */ )
3342 				selwakeuppri(&na->rx_si, PI_NET);
3343 		}
3344 		*work_done = 1; /* do not fire napi again */
3345 	} else { /* TX path */
3346 		if (q >= na->num_tx_rings)
3347 			return 0;	// not a physical queue
3348 		kring = na->tx_rings + q;
3349 		selwakeuppri(&kring->si, PI_NET);
3350 		if (na->num_tx_rings > 1 /* or multiple listeners */ )
3351 			selwakeuppri(&na->tx_si, PI_NET);
3352 	}
3353 	return 1;
3354 }
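
/*
 * A minimal sketch (illustrative only) of how a driver RX interrupt or
 * NAPI handler gives netmap the first chance to handle the event.
 * 'adapter' and 'rxr' are placeholders for driver-private state:
 *
 *	int work_done = 0;
 *
 *	if (netmap_rx_irq(adapter->ifp, rxr->me, &work_done))
 *		return;	// netmap listeners have been woken up,
 *			// skip the regular stack processing
 */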
3355 
3356 
3357 #ifdef linux	/* linux-specific routines */
3358 
3359 
3360 /*
3361  * Remap linux arguments into the FreeBSD call.
3362  * - pwait is the poll table, passed as 'dev';
3363  *   If pwait == NULL someone else already woke up before. We can report
3364  *   events but they are filtered upstream.
3365  *   If pwait != NULL, then pwait->key contains the list of events.
3366  * - events is computed from pwait as above.
3367  * - file is passed as 'td';
3368  */
3369 static u_int
3370 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait)
3371 {
3372 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
3373 	int events = POLLIN | POLLOUT; /* XXX maybe... */
3374 #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
3375 	int events = pwait ? pwait->key : POLLIN | POLLOUT;
3376 #else /* in 3.4.0 field 'key' was renamed to '_key' */
3377 	int events = pwait ? pwait->_key : POLLIN | POLLOUT;
3378 #endif
3379 	return netmap_poll((void *)pwait, events, (void *)file);
3380 }
3381 
3382 
3383 static int
3384 linux_netmap_mmap(struct file *f, struct vm_area_struct *vma)
3385 {
3386 	int error = 0;
3387 	unsigned long off, va;
3388 	vm_ooffset_t pa;
3389 	struct netmap_priv_d *priv = f->private_data;
3390 	/*
3391 	 * vma->vm_start: start of mapping user address space
3392 	 * vma->vm_end: end of the mapping user address space
3393 	 * vma->vm_pgoff: offset of the first page in the device
3394 	 */
3395 
3396 	// XXX security checks
3397 
3398 	error = netmap_get_memory(priv);
3399 	ND("get_memory returned %d", error);
3400 	if (error)
3401 	    return -error;
3402 
3403 	if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) {
3404 		ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end);
3405 		return -EINVAL;
3406 	}
3407 
3408 	for (va = vma->vm_start, off = vma->vm_pgoff;
3409 	     va < vma->vm_end;
3410 	     va += PAGE_SIZE, off++)
3411 	{
3412 		pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT);
3413 		if (pa == 0)
3414 			return -EINVAL;
3415 
3416 		ND("va %lx pa %p", va, pa);
3417 		error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot);
3418 		if (error)
3419 			return error;
3420 	}
3421 	return 0;
3422 }
3423 
3424 
3425 /*
3426  * This one is probably already protected by the netif lock XXX
3427  */
3428 static netdev_tx_t
3429 linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev)
3430 {
3431 	netmap_transmit(dev, skb);
3432 	return (NETDEV_TX_OK);
3433 }
3434 
3435 
3436 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)	// XXX was 37
3437 #define LIN_IOCTL_NAME	.ioctl
3438 int
3439 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */)
3440 #else
3441 #define LIN_IOCTL_NAME	.unlocked_ioctl
3442 long
3443 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */)
3444 #endif
3445 {
3446 	int ret;
3447 	struct nmreq nmr;
3448 	bzero(&nmr, sizeof(nmr));
3449 
3450 	if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) {
3451 		data = 0;	/* no argument required here */
3452 	}
3453 	if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0)
3454 		return -EFAULT;
3455 	ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file);
3456 	if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0)
3457 		return -EFAULT;
3458 	return -ret;
3459 }
3460 
3461 
3462 static int
3463 netmap_release(struct inode *inode, struct file *file)
3464 {
3465 	(void)inode;	/* UNUSED */
3466 	if (file->private_data)
3467 		netmap_dtor(file->private_data);
3468 	return (0);
3469 }
3470 
3471 
3472 static int
3473 linux_netmap_open(struct inode *inode, struct file *file)
3474 {
3475 	struct netmap_priv_d *priv;
3476 	(void)inode;	/* UNUSED */
3477 
3478 	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
3479 			      M_NOWAIT | M_ZERO);
3480 	if (priv == NULL)
3481 		return -ENOMEM;
3482 
3483 	file->private_data = priv;
3484 
3485 	return (0);
3486 }
3487 
3488 
3489 static struct file_operations netmap_fops = {
3490     .owner = THIS_MODULE,
3491     .open = linux_netmap_open,
3492     .mmap = linux_netmap_mmap,
3493     LIN_IOCTL_NAME = linux_netmap_ioctl,
3494     .poll = linux_netmap_poll,
3495     .release = netmap_release,
3496 };
3497 
3498 
3499 static struct miscdevice netmap_cdevsw = {	/* same name as FreeBSD */
3500 	MISC_DYNAMIC_MINOR,
3501 	"netmap",
3502 	&netmap_fops,
3503 };
3504 
3505 static int netmap_init(void);
3506 static void netmap_fini(void);
3507 
3508 
3509 /* Errors have negative values on linux */
3510 static int linux_netmap_init(void)
3511 {
3512 	return -netmap_init();
3513 }
3514 
3515 module_init(linux_netmap_init);
3516 module_exit(netmap_fini);
3517 /* export certain symbols to other modules */
3518 EXPORT_SYMBOL(netmap_attach);		// driver attach routines
3519 EXPORT_SYMBOL(netmap_detach);		// driver detach routines
3520 EXPORT_SYMBOL(netmap_ring_reinit);	// ring init on error
3521 EXPORT_SYMBOL(netmap_buffer_lut);
3522 EXPORT_SYMBOL(netmap_total_buffers);	// index check
3523 EXPORT_SYMBOL(netmap_buffer_base);
3524 EXPORT_SYMBOL(netmap_reset);		// ring init routines
3525 EXPORT_SYMBOL(netmap_buf_size);
3526 EXPORT_SYMBOL(netmap_rx_irq);		// default irq handler
3527 EXPORT_SYMBOL(netmap_no_pendintr);	// XXX mitigation - should go away
3528 EXPORT_SYMBOL(netmap_bdg_ctl);		// bridge configuration routine
3529 EXPORT_SYMBOL(netmap_bdg_learning);	// the default lookup function
3530 EXPORT_SYMBOL(netmap_disable_all_rings);
3531 EXPORT_SYMBOL(netmap_enable_all_rings);
3532 
3533 
3534 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/");
3535 MODULE_DESCRIPTION("The netmap packet I/O framework");
3536 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
3537 
3538 #else /* __FreeBSD__ */
3539 
3540 
3541 static struct cdevsw netmap_cdevsw = {
3542 	.d_version = D_VERSION,
3543 	.d_name = "netmap",
3544 	.d_open = netmap_open,
3545 	.d_mmap_single = netmap_mmap_single,
3546 	.d_ioctl = netmap_ioctl,
3547 	.d_poll = netmap_poll,
3548 	.d_close = netmap_close,
3549 };
3550 #endif /* __FreeBSD__ */
3551 
3552 /*
3553  *---- support for virtual bridge -----
3554  */
3555 
3556 /* ----- FreeBSD if_bridge hash function ------- */
3557 
3558 /*
3559  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
3560  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
3561  *
3562  * http://www.burtleburtle.net/bob/hash/spooky.html
3563  */
3564 #define mix(a, b, c)                                                    \
3565 do {                                                                    \
3566         a -= b; a -= c; a ^= (c >> 13);                                 \
3567         b -= c; b -= a; b ^= (a << 8);                                  \
3568         c -= a; c -= b; c ^= (b >> 13);                                 \
3569         a -= b; a -= c; a ^= (c >> 12);                                 \
3570         b -= c; b -= a; b ^= (a << 16);                                 \
3571         c -= a; c -= b; c ^= (b >> 5);                                  \
3572         a -= b; a -= c; a ^= (c >> 3);                                  \
3573         b -= c; b -= a; b ^= (a << 10);                                 \
3574         c -= a; c -= b; c ^= (b >> 15);                                 \
3575 } while (/*CONSTCOND*/0)
3576 
3577 static __inline uint32_t
3578 nm_bridge_rthash(const uint8_t *addr)
3579 {
3580         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
3581 
3582         b += addr[5] << 8;
3583         b += addr[4];
3584         a += addr[3] << 24;
3585         a += addr[2] << 16;
3586         a += addr[1] << 8;
3587         a += addr[0];
3588 
3589         mix(a, b, c);
3590 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
3591         return (c & BRIDGE_RTHASH_MASK);
3592 }
3593 
3594 #undef mix
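/*
 * Example (illustrative): hashing a 6-byte MAC address into a bucket of
 * the forwarding table, as done by netmap_bdg_learning() below:
 *
 *	uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0xaa, 0xbb, 0xcc };
 *	uint32_t bucket = nm_bridge_rthash(mac);
 *	// 0 <= bucket <= NM_BDG_HASH - 1, thanks to BRIDGE_RTHASH_MASK
 */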
3595 
3596 
3597 static int
3598 bdg_netmap_reg(struct ifnet *ifp, int onoff)
3599 {
3600 	/* the interface is already attached to the bridge,
3601 	 * so we only need to toggle IFCAP_NETMAP.
3602 	 */
3603 	if (onoff) {
3604 		ifp->if_capenable |= IFCAP_NETMAP;
3605 	} else {
3606 		ifp->if_capenable &= ~IFCAP_NETMAP;
3607 	}
3608 	return 0;
3609 }
3610 
3611 
3612 /*
3613  * Lookup function for a learning bridge.
3614  * Update the hash table with the source address, then return
3615  * the destination port index and the ring in *dst_ring
3616  * (at the moment, always ring 0).
3617  */
3618 u_int
3619 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
3620 		struct netmap_adapter *na)
3621 {
3622 	struct nm_hash_ent *ht = na->na_bdg->ht;
3623 	uint32_t sh, dh;
3624 	u_int dst, mysrc = na->bdg_port;
3625 	uint64_t smac, dmac;
3626 
3627 	if (buf_len < 14) {
3628 		D("invalid buf length %d", buf_len);
3629 		return NM_BDG_NOPORT;
3630 	}
3631 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
3632 	smac = le64toh(*(uint64_t *)(buf + 4));
3633 	smac >>= 16;
3634 
3635 	/*
3636 	 * The hash is somewhat expensive, there might be some
3637 	 * worthwhile optimizations here.
3638 	 */
3639 	if ((buf[6] & 1) == 0) { /* valid src */
3640 		uint8_t *s = buf+6;
3641 		sh = nm_bridge_rthash(s); // XXX hash of source
3642 		/* update source port forwarding entry */
3643 		ht[sh].mac = smac;	/* XXX expire ? */
3644 		ht[sh].ports = mysrc;
3645 		if (netmap_verbose)
3646 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
3647 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
3648 	}
3649 	dst = NM_BDG_BROADCAST;
3650 	if ((buf[0] & 1) == 0) { /* unicast */
3651 		dh = nm_bridge_rthash(buf); // XXX hash of dst
3652 		if (ht[dh].mac == dmac) {	/* found dst */
3653 			dst = ht[dh].ports;
3654 		}
3655 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
3656 	}
3657 	*dst_ring = 0;
3658 	return dst;
3659 }
3660 
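/*
 * Illustrative sketch: an alternative lookup function with the same
 * signature can be plugged into a bridge (this is the nm_bdg_lookup
 * hook used by nm_bdg_flush() below, typically set up through
 * netmap_bdg_ctl()). A trivial "flood everything" policy would look
 * like this (my_bdg_flood is a hypothetical name):
 *
 *	u_int
 *	my_bdg_flood(char *buf, u_int buf_len, uint8_t *dst_ring,
 *		struct netmap_adapter *na)
 *	{
 *		(void)buf; (void)buf_len; (void)na;
 *		*dst_ring = 0;
 *		return NM_BDG_BROADCAST; // deliver to all ports but the source
 *	}
 */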
3661 
3662 /*
3663  * This flush routine supports only unicast and broadcast traffic, but a
3664  * large number of ports, and it lets us replace the learn and dispatch functions.
3665  */
3666 int
3667 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na,
3668 		u_int ring_nr)
3669 {
3670 	struct nm_bdg_q *dst_ents, *brddst;
3671 	uint16_t num_dsts = 0, *dsts;
3672 	struct nm_bridge *b = na->na_bdg;
3673 	u_int i, j, me = na->bdg_port;
3674 
3675 	/*
3676 	 * The work area (pointed by ft) is followed by an array of
3677 	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
3678 	 * queues per port plus one for the broadcast traffic.
3679 	 * Then we have an array of destination indexes.
3680 	 */
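	/*
	 * Example of the indexing used below: a packet destined to port P,
	 * ring R is queued at dst_ents[P * NM_BDG_MAXRINGS + R], while
	 * broadcast traffic uses the entry at
	 * NM_BDG_BROADCAST * NM_BDG_MAXRINGS (ring 0 only).
	 */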
3681 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
3682 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
3683 
3684 	/* first pass: find a destination for each packet in the batch */
3685 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
3686 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
3687 		uint16_t dst_port, d_i;
3688 		struct nm_bdg_q *d;
3689 
3690 		ND("slot %d frags %d", i, ft[i].ft_frags);
3691 		dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len,
3692 			&dst_ring, na);
3693 		if (netmap_verbose > 255)
3694 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
3695 		if (dst_port == NM_BDG_NOPORT)
3696 			continue; /* this packet is to be dropped */
3697 		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
3698 			continue;
3699 		else if (dst_port == NM_BDG_BROADCAST)
3700 			dst_ring = 0; /* broadcasts always go to ring 0 */
3701 		else if (unlikely(dst_port == me ||
3702 		    !b->bdg_ports[dst_port]))
3703 			continue;
3704 
3705 		/* get a position in the scratch pad */
3706 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
3707 		d = dst_ents + d_i;
3708 
3709 		/* append the first fragment to the list */
3710 		if (d->bq_head == NM_FT_NULL) { /* new destination */
3711 			d->bq_head = d->bq_tail = i;
3712 			/* remember this position to be scanned later */
3713 			if (dst_port != NM_BDG_BROADCAST)
3714 				dsts[num_dsts++] = d_i;
3715 		} else {
3716 			ft[d->bq_tail].ft_next = i;
3717 			d->bq_tail = i;
3718 		}
3719 		d->bq_len += ft[i].ft_frags;
3720 	}
3721 
3722 	/*
3723 	 * Broadcast traffic goes to ring 0 on all destinations.
3724 	 * So we need to add these rings to the list of ports to scan.
3725 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
3726 	 * expensive. We should keep a compact list of active destinations
3727 	 * so we could shorten this loop.
3728 	 */
3729 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
3730 	if (brddst->bq_head != NM_FT_NULL) {
3731 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
3732 			uint16_t d_i;
3733 			i = b->bdg_port_index[j];
3734 			if (unlikely(i == me))
3735 				continue;
3736 			d_i = i * NM_BDG_MAXRINGS;
3737 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
3738 				dsts[num_dsts++] = d_i;
3739 		}
3740 	}
3741 
3742 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
3743 	/* second pass: scan destinations (XXX will be modular somehow) */
3744 	for (i = 0; i < num_dsts; i++) {
3745 		struct ifnet *dst_ifp;
3746 		struct netmap_adapter *dst_na;
3747 		struct netmap_kring *kring;
3748 		struct netmap_ring *ring;
3749 		u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next;
3750 		u_int needed, howmany;
3751 		int retry = netmap_txsync_retry;
3752 		struct nm_bdg_q *d;
3753 		uint32_t my_start = 0, lease_idx = 0;
3754 		int nrings;
3755 
3756 		d_i = dsts[i];
3757 		ND("second pass %d port %d", i, d_i);
3758 		d = dst_ents + d_i;
3759 		// XXX fix the division
3760 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
3761 		/* protect from the lookup function returning an inactive
3762 		 * destination port
3763 		 */
3764 		if (unlikely(dst_na == NULL))
3765 			goto cleanup;
3766 		if (dst_na->na_flags & NAF_SW_ONLY)
3767 			goto cleanup;
3768 		dst_ifp = dst_na->ifp;
3769 		/*
3770 		 * The interface may be in !netmap mode in two cases:
3771 		 * - when na is attached but not activated yet;
3772 		 * - when na is being deactivated but is still attached.
3773 		 */
3774 		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
3775 			ND("not in netmap mode!");
3776 			goto cleanup;
3777 		}
3778 
3779 		/* there is at least one either unicast or broadcast packet */
3780 		brd_next = brddst->bq_head;
3781 		next = d->bq_head;
3782 		/* we need to reserve this many slots. If fewer are
3783 		 * available, some packets will be dropped.
3784 		 * Packets may have multiple fragments, so there is a
3785 		 * chance that we may not use all of the slots
3786 		 * we have claimed, so we will need to handle the leftover
3787 		 * ones when we regain the lock.
3788 		 */
3789 		needed = d->bq_len + brddst->bq_len;
3790 
3791 		is_vp = nma_is_vp(dst_na);
3792 		ND(5, "pass 2 dst %d is %x %s",
3793 			i, d_i, is_vp ? "virtual" : "nic/host");
3794 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
3795 		if (is_vp) { /* virtual port */
3796 			nrings = dst_na->num_rx_rings;
3797 		} else {
3798 			nrings = dst_na->num_tx_rings;
3799 		}
3800 		if (dst_nr >= nrings)
3801 			dst_nr = dst_nr % nrings;
3802 		kring = is_vp ?  &dst_na->rx_rings[dst_nr] :
3803 				&dst_na->tx_rings[dst_nr];
3804 		ring = kring->ring;
3805 		lim = kring->nkr_num_slots - 1;
3806 
3807 retry:
3808 
3809 		/* reserve the buffers in the queue and an entry
3810 		 * to report completion, and drop lock.
3811 		 * XXX this might become a helper function.
3812 		 */
3813 		mtx_lock(&kring->q_lock);
3814 		if (kring->nkr_stopped) {
3815 			mtx_unlock(&kring->q_lock);
3816 			goto cleanup;
3817 		}
3818 		/* on physical interfaces, do a txsync to recover
3819 		 * slots for packets already transmitted.
3820 		 * XXX maybe we could be optimistic and rely on a retry
3821 		 * in case of failure.
3822 		 */
3823 		if (nma_is_hw(dst_na)) {
3824 			dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3825 		}
3826 		my_start = j = kring->nkr_hwlease;
3827 		howmany = nm_kr_space(kring, is_vp);
3828 		if (needed < howmany)
3829 			howmany = needed;
3830 		lease_idx = nm_kr_lease(kring, howmany, is_vp);
3831 		mtx_unlock(&kring->q_lock);
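		/*
		 * From here on, [my_start, my_start + howmany) is a private
		 * lease on the destination ring: other senders may grab the
		 * slots after ours and complete in any order, and the lease
		 * table consulted below is used to publish completions in
		 * ring order.
		 */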
3832 
3833 		/* only retry if we need more than available slots */
3834 		if (retry && needed <= howmany)
3835 			retry = 0;
3836 
3837 		/* copy to the destination queue */
3838 		while (howmany > 0) {
3839 			struct netmap_slot *slot;
3840 			struct nm_bdg_fwd *ft_p, *ft_end;
3841 			u_int cnt;
3842 
3843 			/* find the queue from which we pick the next packet.
3844 			 * NM_FT_NULL is always higher than valid indexes
3845 			 * so we never dereference it if the other list
3846 			 * has packets (and if both are empty we never
3847 			 * get here).
3848 			 */
3849 			if (next < brd_next) {
3850 				ft_p = ft + next;
3851 				next = ft_p->ft_next;
3852 			} else { /* insert broadcast */
3853 				ft_p = ft + brd_next;
3854 				brd_next = ft_p->ft_next;
3855 			}
3856 			cnt = ft_p->ft_frags; // cnt > 0
3857 			if (unlikely(cnt > howmany))
3858 			    break; /* no more space */
3859 			howmany -= cnt;
3860 			if (netmap_verbose && cnt > 1)
3861 				RD(5, "rx %d frags to %d", cnt, j);
3862 			ft_end = ft_p + cnt;
3863 			do {
3864 			    void *dst, *src = ft_p->ft_buf;
3865 			    size_t len = (ft_p->ft_len + 63) & ~63;	/* round to a multiple of 64 */
3866 
3867 			    slot = &ring->slot[j];
3868 			    dst = BDG_NMB(dst_na->nm_mem, slot);
3869 
3870 
3871 			    ND("send %d %d bytes at %s:%d",
3872 				i, ft_p->ft_len, dst_ifp->if_xname, j);
3873 			    if (ft_p->ft_flags & NS_INDIRECT) {
3874 				if (copyin(src, dst, len)) {
3875 					// invalid user pointer, pretend len is 0
3876 					ft_p->ft_len = 0;
3877 				}
3878 			    } else {
3879 				//memcpy(dst, src, len);
3880 				pkt_copy(src, dst, (int)len);
3881 			    }
3882 			    slot->len = ft_p->ft_len;
3883 			    slot->flags = (cnt << 8) | NS_MOREFRAG;
3884 			    j = nm_next(j, lim);
3885 			    ft_p++;
3886 			    sent++;
3887 			} while (ft_p != ft_end);
3888 			slot->flags = (cnt << 8); /* clear flag on last entry */
3889 			/* are we done ? */
3890 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
3891 				break;
3892 		}
3893 		{
3894 		    /* current position */
3895 		    uint32_t *p = kring->nkr_leases; /* shorthand */
3896 		    uint32_t update_pos;
3897 		    int still_locked = 1;
3898 
3899 		    mtx_lock(&kring->q_lock);
3900 		    if (unlikely(howmany > 0)) {
3901 			/* we have not used all the buffers. If I am the
3902 			 * last one, I can recover the slots; otherwise I
3903 			 * must fill them with 0 to mark empty packets.
3904 			 */
3905 			ND("leftover %d bufs", howmany);
3906 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
3907 			    /* yes, I am the last one */
3908 			    ND("roll back nkr_hwlease to %d", j);
3909 			    kring->nkr_hwlease = j;
3910 			} else {
3911 			    while (howmany-- > 0) {
3912 				ring->slot[j].len = 0;
3913 				ring->slot[j].flags = 0;
3914 				j = nm_next(j, lim);
3915 			    }
3916 			}
3917 		    }
3918 		    p[lease_idx] = j; /* report I am done */
3919 
3920 		    update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur;
3921 
3922 		    if (my_start == update_pos) {
3923 			/* all slots before my_start have been reported,
3924 			 * so scan subsequent leases to see if other ranges
3925 			 * have been completed, and do a selwakeup or txsync.
3926 			 */
3927 			while (lease_idx != kring->nkr_lease_idx &&
3928 				p[lease_idx] != NR_NOSLOT) {
3929 			    j = p[lease_idx];
3930 			    p[lease_idx] = NR_NOSLOT;
3931 			    lease_idx = nm_next(lease_idx, lim);
3932 			}
3933 			/* j is the new 'write' position. j != my_start
3934 			 * means there are new buffers to report
3935 			 */
3936 			if (likely(j != my_start)) {
3937 			    if (is_vp) {
3938 				uint32_t old_avail = kring->nr_hwavail;
3939 
3940 				kring->nr_hwavail = (j >= kring->nr_hwcur) ?
3941 					j - kring->nr_hwcur :
3942 					j + lim + 1 - kring->nr_hwcur;
3943 				if (kring->nr_hwavail < old_avail) {
3944 					D("avail shrink %d -> %d",
3945 						old_avail, kring->nr_hwavail);
3946 				}
3947 				still_locked = 0;
3948 				mtx_unlock(&kring->q_lock);
3949 				selwakeuppri(&kring->si, PI_NET);
3950 			    } else {
3951 				ring->cur = j;
3952 				/* XXX update avail ? */
3953 				still_locked = 0;
3954 				dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3955 				mtx_unlock(&kring->q_lock);
3956 
3957 				/* retry to send more packets */
3958 				if (nma_is_hw(dst_na) && retry--)
3959 					goto retry;
3960 			    }
3961 			}
3962 		    }
3963 		    if (still_locked)
3964 			mtx_unlock(&kring->q_lock);
3965 		}
3966 cleanup:
3967 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
3968 		d->bq_len = 0;
3969 	}
3970 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
3971 	brddst->bq_len = 0;
3972 	return 0;
3973 }
3974 
3975 
3976 /*
3977  * Main dispatch routine for the bridge.
3978  * We already know that only one thread is running this.
3979  * We must run nm_bdg_preflush without holding the lock.
3980  */
3981 static int
3982 bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
3983 {
3984 	struct netmap_adapter *na = NA(ifp);
3985 	struct netmap_kring *kring = &na->tx_rings[ring_nr];
3986 	struct netmap_ring *ring = kring->ring;
3987 	u_int j, k, lim = kring->nkr_num_slots - 1;
3988 
3989 	k = ring->cur;
3990 	if (k > lim)
3991 		return netmap_ring_reinit(kring);
3992 
3993 	if (bridge_batch <= 0) { /* testing only */
3994 		j = k; // used all
3995 		goto done;
3996 	}
3997 	if (bridge_batch > NM_BDG_BATCH)
3998 		bridge_batch = NM_BDG_BATCH;
3999 
4000 	j = nm_bdg_preflush(na, ring_nr, kring, k);
4001 	if (j != k)
4002 		D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
4003 	/* k-j modulo ring size is the number of slots not yet forwarded */
4004 	if (k < j)
4005 		k += kring->nkr_num_slots;
4006 	kring->nr_hwavail = lim - (k - j);
4007 
4008 done:
4009 	kring->nr_hwcur = j;
4010 	ring->avail = kring->nr_hwavail;
4011 	if (netmap_verbose)
4012 		D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags);
4013 	return 0;
4014 }
4015 
4016 
4017 /*
4018  * user process reading from a VALE switch.
4019  * Already protected against concurrent calls from userspace,
4020  * but we must acquire the queue's lock to protect against
4021  * writers on the same queue.
4022  */
4023 static int
4024 bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
4025 {
4026 	struct netmap_adapter *na = NA(ifp);
4027 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
4028 	struct netmap_ring *ring = kring->ring;
4029 	u_int j, lim = kring->nkr_num_slots - 1;
4030 	u_int k = ring->cur, resvd = ring->reserved;
4031 	int n;
4032 
4033 	mtx_lock(&kring->q_lock);
4034 	if (k > lim) {
4035 		D("ouch dangerous reset!!!");
4036 		n = netmap_ring_reinit(kring);
4037 		goto done;
4038 	}
4039 
4040 	/* skip past packets that userspace has released */
4041 	j = kring->nr_hwcur;    /* netmap ring index */
4042 	if (resvd > 0) {
4043 		if (resvd + ring->avail >= lim + 1) {
4044 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
4045 			ring->reserved = resvd = 0; // XXX panic...
4046 		}
4047 		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
4048 	}
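	/*
	 * Worked example: with lim = 255, cur = 5 and reserved = 3, the
	 * wrap-around branch is not taken and k becomes 2, i.e. the three
	 * slots just before cur are still held by userspace and are not
	 * recycled yet.
	 */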
4049 
4050 	if (j != k) { /* userspace has released some packets. */
4051 		n = k - j;
4052 		if (n < 0)
4053 			n += kring->nkr_num_slots;
4054 		ND("userspace releases %d packets", n);
4055 		for (n = 0; likely(j != k); n++) {
4056 			struct netmap_slot *slot = &ring->slot[j];
4057 			void *addr = BDG_NMB(na->nm_mem, slot);
4058 
4059 			if (addr == netmap_buffer_base) { /* bad buf */
4060 				D("bad buffer index %d, ignore ?",
4061 					slot->buf_idx);
4062 			}
4063 			slot->flags &= ~NS_BUF_CHANGED;
4064 			j = nm_next(j, lim);
4065 		}
4066 		kring->nr_hwavail -= n;
4067 		kring->nr_hwcur = k;
4068 	}
4069 	/* tell userspace that there are new packets */
4070 	ring->avail = kring->nr_hwavail - resvd;
4071 	n = 0;
4072 done:
4073 	mtx_unlock(&kring->q_lock);
4074 	return n;
4075 }
4076 
4077 
4078 static void
4079 bdg_netmap_attach(struct netmap_adapter *arg)
4080 {
4081 	struct netmap_adapter na;
4082 
4083 	ND("attaching virtual bridge");
4084 	bzero(&na, sizeof(na));
4085 
4086 	na.ifp = arg->ifp;
4087 	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
4088 	na.num_tx_rings = arg->num_tx_rings;
4089 	na.num_rx_rings = arg->num_rx_rings;
4090 	na.num_tx_desc = arg->num_tx_desc;
4091 	na.num_rx_desc = arg->num_rx_desc;
4092 	na.nm_txsync = bdg_netmap_txsync;
4093 	na.nm_rxsync = bdg_netmap_rxsync;
4094 	na.nm_register = bdg_netmap_reg;
4095 	na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname,
4096 			na.num_tx_rings, na.num_tx_desc,
4097 			na.num_rx_rings, na.num_rx_desc);
4098 	netmap_attach(&na, na.num_tx_rings);
4099 }
4100 
4101 
4102 static struct cdev *netmap_dev; /* /dev/netmap character device. */
4103 
4104 
4105 /*
4106  * Module loader.
4107  *
4108  * Create the /dev/netmap device and initialize all global
4109  * variables.
4110  *
4111  * Return 0 on success, errno on failure.
4112  */
4113 static int
4114 netmap_init(void)
4115 {
4116 	int i, error;
4117 
4118 	NMG_LOCK_INIT();
4119 
4120 	error = netmap_mem_init();
4121 	if (error != 0) {
4122 		printf("netmap: unable to initialize the memory allocator.\n");
4123 		return (error);
4124 	}
4125 	printf("netmap: loaded module\n");
4126 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
4127 			      "netmap");
4128 
4129 	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
4130 	for (i = 0; i < NM_BRIDGES; i++)
4131 		BDG_RWINIT(&nm_bridges[i]);
4132 	return (error);
4133 }
4134 
4135 
4136 /*
4137  * Module unloader.
4138  *
4139  * Free all the memory, and destroy the ``/dev/netmap`` device.
4140  */
4141 static void
4142 netmap_fini(void)
4143 {
4144 	destroy_dev(netmap_dev);
4145 	netmap_mem_fini();
4146 	NMG_LOCK_DESTROY();
4147 	printf("netmap: unloaded module.\n");
4148 }
4149 
4150 
4151 #ifdef __FreeBSD__
4152 /*
4153  * Kernel entry point.
4154  *
4155  * Initialize/finalize the module and return.
4156  *
4157  * Return 0 on success, errno on failure.
4158  */
4159 static int
4160 netmap_loader(__unused struct module *module, int event, __unused void *arg)
4161 {
4162 	int error = 0;
4163 
4164 	switch (event) {
4165 	case MOD_LOAD:
4166 		error = netmap_init();
4167 		break;
4168 
4169 	case MOD_UNLOAD:
4170 		netmap_fini();
4171 		break;
4172 
4173 	default:
4174 		error = EOPNOTSUPP;
4175 		break;
4176 	}
4177 
4178 	return (error);
4179 }
4180 
4181 
4182 DEV_MODULE(netmap, netmap_loader, NULL);
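
/*
 * Illustrative usage (an assumption, not part of the source): when the
 * module is built as netmap.ko, loading and checking it on FreeBSD
 * usually amounts to
 *
 *	kldload netmap
 *	ls -l /dev/netmap
 *
 * which runs netmap_loader()/netmap_init() above and creates the
 * character device described by netmap_cdevsw.
 */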
4183 #endif /* __FreeBSD__ */
4184