xref: /freebsd-14.2/sys/dev/netmap/netmap_vale.c (revision 4f80b14c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2013-2016 Universita` di Pisa
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *   2. Redistributions in binary form must reproduce the above copyright
13  *      notice, this list of conditions and the following disclaimer in the
14  *      documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 
30 /*
31  * This module implements the VALE switch for netmap
32 
33 --- VALE SWITCH ---
34 
35 NMG_LOCK() serializes all modifications to switches and ports.
36 A switch cannot be deleted until all ports are gone.
37 
38 For each switch, an SX lock (RWlock on linux) protects
39 deletion of ports. When configuring or deleting a port, the
40 lock is acquired in exclusive mode (after holding NMG_LOCK).
41 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
42 The lock is held throughout the entire forwarding cycle,
43 during which the thread may incur a page fault.
44 Hence it is important that sleepable shared locks are used.
45 
46 On the rx ring, the per-port lock is grabbed initially to reserve
47 a number of slots in the ring, then the lock is released,
48 packets are copied from source to destination, and then
49 the lock is acquired again and the receive ring is updated.
50 (A similar thing is done on the tx ring for NIC and host stack
51 ports attached to the switch)
52 
53  */
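/*
 * Editor's sketch (not part of the driver): the locking discipline
 * described above, mirrored with POSIX primitives for illustration
 * only. NMG_LOCK() is modeled as a mutex and the per-switch SX lock
 * as a rwlock; all names here are hypothetical.
 */
#if 0	/* illustrative only, never compiled */
#include <pthread.h>

static pthread_mutex_t nmg_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t bdg_lock = PTHREAD_RWLOCK_INITIALIZER;

static void
port_add_or_delete(void)
{
	pthread_mutex_lock(&nmg_lock);    /* serialize all config changes */
	pthread_rwlock_wrlock(&bdg_lock); /* exclusive while ports change */
	/* ... modify the port array ... */
	pthread_rwlock_unlock(&bdg_lock);
	pthread_mutex_unlock(&nmg_lock);
}

static void
forward_cycle(void)
{
	pthread_rwlock_rdlock(&bdg_lock); /* shared; may sleep on a page fault */
	/* ... lookup destinations and copy packets ... */
	pthread_rwlock_unlock(&bdg_lock);
}
#endif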
54 
55 /*
56  * OS-specific code that is used only within this file.
57  * Other OS-specific code that must be accessed by drivers
58  * is present in netmap_kern.h
59  */
60 
61 #if defined(__FreeBSD__)
62 #include <sys/cdefs.h> /* prerequisite */
63 __FBSDID("$FreeBSD$");
64 
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h>	/* defines used in kernel.h */
68 #include <sys/kernel.h>	/* types used in module initialization */
69 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
70 #include <sys/sockio.h>
71 #include <sys/socketvar.h>	/* struct socket */
72 #include <sys/malloc.h>
73 #include <sys/poll.h>
74 #include <sys/rwlock.h>
75 #include <sys/socket.h> /* sockaddrs */
76 #include <sys/selinfo.h>
77 #include <sys/sysctl.h>
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/bpf.h>		/* BIOCIMMEDIATE */
81 #include <machine/bus.h>	/* bus_dmamap_* */
82 #include <sys/endian.h>
83 #include <sys/refcount.h>
84 
85 
86 #define BDG_RWLOCK_T		struct rwlock
87 
88 #define	BDG_RWINIT(b)		\
89 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
90 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
91 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
92 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
93 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
94 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
95 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
96 
97 
98 #elif defined(linux)
99 
100 #include "bsd_glue.h"
101 
102 #elif defined(__APPLE__)
103 
104 #warning OSX support is only partial
105 #include "osx_glue.h"
106 
107 #elif defined(_WIN32)
108 #include "win_glue.h"
109 
110 #else
111 
112 #error	Unsupported platform
113 
114 #endif /* unsupported */
115 
116 /*
117  * common headers
118  */
119 
120 #include <net/netmap.h>
121 #include <dev/netmap/netmap_kern.h>
122 #include <dev/netmap/netmap_mem2.h>
123 
124 #ifdef WITH_VALE
125 
126 /*
127  * system parameters (most of them in netmap_kern.h)
128  * NM_BDG_NAME	prefix for switch port names, default "vale"
129  * NM_BDG_MAXPORTS	number of ports
130  * NM_BRIDGES	max number of switches in the system.
131  *	XXX should become a sysctl or tunable
132  *
133  * Switch ports are named valeX:Y where X is the switch name and Y
134  * is the port. If Y matches a physical interface name, the port is
135  * connected to a physical device.
136  *
137  * Unlike physical interfaces, switch ports use their own memory region
138  * for rings and buffers.
139  * The virtual interfaces use per-queue locks instead of the core lock.
140  * In the tx loop, we aggregate traffic in batches to make all operations
141  * faster. The batch size is bridge_batch.
142  */
143 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
144 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
145 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
146 #define NM_BDG_HASH		1024	/* forwarding table entries */
147 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
148 #define NM_MULTISEG		64	/* max size of a chain of bufs */
149 /* actual size of the tables */
150 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
151 /* NM_FT_NULL terminates a list of slots in the ft */
152 #define NM_FT_NULL		NM_BDG_BATCH_MAX
153 /* Default size for the Maximum Frame Size. */
154 #define NM_BDG_MFS_DEFAULT	1514
155 
156 
157 /*
158  * bridge_batch is set via sysctl to the max batch size to be
159  * used in the bridge. The actual value may be larger as the
160  * last packet in the block may overflow the size.
161  */
162 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
163 SYSBEGIN(vars_vale);
164 SYSCTL_DECL(_dev_netmap);
165 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
166     "Max batch size to be used in the bridge");
167 SYSEND;
168 
169 static int netmap_vp_create(struct nmreq *, struct ifnet *,
170 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
171 static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
172 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
173 
174 /*
175  * For each output interface, nm_bdg_q is used to construct a list.
176  * bq_len is the number of output buffers (we can have coalescing
177  * during the copy).
178  */
179 struct nm_bdg_q {
180 	uint16_t bq_head;
181 	uint16_t bq_tail;
182 	uint32_t bq_len;	/* number of buffers */
183 };
184 
185 /* XXX revise this */
186 struct nm_hash_ent {
187 	uint64_t	mac;	/* the top 2 bytes are the epoch */
188 	uint64_t	ports;
189 };
190 
191 /*
192  * nm_bridge is a descriptor for a VALE switch.
193  * Interfaces for a bridge are all in bdg_ports[].
194  * The array has fixed size, an empty entry does not terminate
195  * the search, but lookups only occur on attach/detach so we
196  * don't mind if they are slow.
197  *
198  * The bridge is non-blocking on the transmit ports: excess
199  * packets are dropped if there is no room on the output port.
200  *
201  * bdg_lock protects accesses to the bdg_ports array.
202  * This is a rw lock (or equivalent).
203  */
204 struct nm_bridge {
205 	/* XXX what is the proper alignment/layout ? */
206 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
207 	int		bdg_namelen;
208 	uint32_t	bdg_active_ports; /* 0 means free */
209 	char		bdg_basename[IFNAMSIZ];
210 
211 	/* Indexes of active ports (up to active_ports)
212 	 * and all other remaining ports.
213 	 */
214 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
215 
216 	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
217 
218 
219 	/*
220 	 * The function to decide the destination port.
221 	 * It returns either the index of the destination port,
222 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT to
223 	 * drop this packet.  ring_nr is the source ring index, and the
224 	 * function may overwrite this value to forward this packet to a
225 	 * different ring index.
226 	 * This function must be set by netmap_bdg_ctl().
227 	 */
228 	struct netmap_bdg_ops bdg_ops;
229 
230 	/* the forwarding table, MAC+ports.
231 	 * XXX should be changed to an argument to be passed to
232 	 * the lookup function
233 	 */
234 	struct nm_hash_ent *ht; // allocated on attach
235 
236 #ifdef CONFIG_NET_NS
237 	struct net *ns;
238 #endif /* CONFIG_NET_NS */
239 };
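/*
 * Editor's sketch: a minimal lookup function honoring the contract
 * described above (hypothetical name, illustrative only). It steers
 * every packet to bridge port 1, ring 0, instead of learning MAC
 * addresses.
 */
#if 0	/* illustrative only */
static u_int
my_fixed_lookup(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
	struct netmap_vp_adapter *na)
{
	(void)ft;
	(void)na;
	*dst_ring = 0;	/* we may overwrite the source ring index */
	return 1;	/* or NM_BDG_BROADCAST, or NM_BDG_NOPORT to drop */
}
#endif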
240 
241 const char*
242 netmap_bdg_name(struct netmap_vp_adapter *vp)
243 {
244 	struct nm_bridge *b = vp->na_bdg;
245 	if (b == NULL)
246 		return NULL;
247 	return b->bdg_basename;
248 }
249 
250 
251 #ifndef CONFIG_NET_NS
252 /*
253  * XXX in principle nm_bridges could be created dynamically
254  * Right now we have a static array and deletions are protected
255  * by an exclusive lock.
256  */
257 static struct nm_bridge *nm_bridges;
258 #endif /* !CONFIG_NET_NS */
259 
260 
261 /*
262  * this is a slightly optimized copy routine which rounds
263  * to multiple of 64 bytes and is often faster than dealing
264  * with other odd sizes. We assume there is enough room
265  * in the source and destination buffers.
266  *
267  * XXX only for multiples of 64 bytes, non overlapped.
268  */
269 static inline void
270 pkt_copy(void *_src, void *_dst, int l)
271 {
272         uint64_t *src = _src;
273         uint64_t *dst = _dst;
274         if (unlikely(l >= 1024)) {
275                 memcpy(dst, src, l);
276                 return;
277         }
278         for (; likely(l > 0); l-=64) {
279                 *dst++ = *src++;
280                 *dst++ = *src++;
281                 *dst++ = *src++;
282                 *dst++ = *src++;
283                 *dst++ = *src++;
284                 *dst++ = *src++;
285                 *dst++ = *src++;
286                 *dst++ = *src++;
287         }
288 }
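/*
 * Editor's note: pkt_copy() transfers whole 64-byte chunks, so both
 * buffers must be sized to a multiple of 64 bytes even when l is
 * smaller. A hypothetical use:
 */
#if 0	/* illustrative only */
	char src[1536], dst[1536];	/* rounded up to 64-byte multiples */

	pkt_copy(src, dst, 60);		/* one full 64-byte chunk is copied */
	pkt_copy(src, dst, 1500);	/* l >= 1024 falls back to memcpy() */
#endif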
289 
290 
291 static int
292 nm_is_id_char(const char c)
293 {
294 	return (c >= 'a' && c <= 'z') ||
295 	       (c >= 'A' && c <= 'Z') ||
296 	       (c >= '0' && c <= '9') ||
297 	       (c == '_');
298 }
299 
300 /* Validate the name of a VALE bridge port and return the
301  * position of the ":" character. */
302 static int
303 nm_vale_name_validate(const char *name)
304 {
305 	int colon_pos = -1;
306 	int i;
307 
308 	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
309 		return -1;
310 	}
311 
312 	for (i = 0; name[i]; i++) {
313 		if (name[i] == ':') {
314 			if (colon_pos != -1) {
315 				return -1;
316 			}
317 			colon_pos = i;
318 		} else if (!nm_is_id_char(name[i])) {
319 			return -1;
320 		}
321 	}
322 
323 	if (i >= IFNAMSIZ) {
324 		return -1;
325 	}
326 
327 	return colon_pos;
328 }
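/*
 * Editor's examples of the rules enforced above, assuming NM_BDG_NAME
 * is "vale" (illustrative only, as if run in a userspace harness):
 */
#if 0	/* illustrative only */
#include <assert.h>

	assert(nm_vale_name_validate("vale0:port1") == 5); /* ':' at index 5 */
	assert(nm_vale_name_validate("vale0:a:b") == -1);  /* second ':' rejected */
	assert(nm_vale_name_validate("vale0/p0") == -1);   /* '/' is not an id char */
#endif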
329 
330 /*
331  * locate a bridge among the existing ones.
332  * MUST BE CALLED WITH NMG_LOCK()
333  *
334  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
335  * We assume that this is called with a name of at least NM_NAME chars.
336  */
337 static struct nm_bridge *
338 nm_find_bridge(const char *name, int create)
339 {
340 	int i, namelen;
341 	struct nm_bridge *b = NULL, *bridges;
342 	u_int num_bridges;
343 
344 	NMG_LOCK_ASSERT();
345 
346 	netmap_bns_getbridges(&bridges, &num_bridges);
347 
348 	namelen = nm_vale_name_validate(name);
349 	if (namelen < 0) {
350 		D("invalid bridge name %s", name ? name : "(null)");
351 		return NULL;
352 	}
353 
354 	/* lookup the name, remember empty slot if there is one */
355 	for (i = 0; i < num_bridges; i++) {
356 		struct nm_bridge *x = bridges + i;
357 
358 		if (x->bdg_active_ports == 0) {
359 			if (create && b == NULL)
360 				b = x;	/* record empty slot */
361 		} else if (x->bdg_namelen != namelen) {
362 			continue;
363 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
364 			ND("found '%.*s' at %d", namelen, name, i);
365 			b = x;
366 			break;
367 		}
368 	}
369 	if (i == num_bridges && b) { /* name not found, can create entry */
370 		/* initialize the bridge */
371 		ND("create new bridge %s with ports %d", b->bdg_basename,
372 			b->bdg_active_ports);
373 		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
374 		if (b->ht == NULL) {
375 			D("failed to allocate hash table");
376 			return NULL;
377 		}
378 		strncpy(b->bdg_basename, name, namelen);
379 		b->bdg_namelen = namelen;
380 		b->bdg_active_ports = 0;
381 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
382 			b->bdg_port_index[i] = i;
383 		/* set the default function */
384 		b->bdg_ops.lookup = netmap_bdg_learning;
385 		NM_BNS_GET(b);
386 	}
387 	return b;
388 }
389 
390 
391 /*
392  * Free the forwarding tables for rings attached to switch ports.
393  */
394 static void
395 nm_free_bdgfwd(struct netmap_adapter *na)
396 {
397 	int nrings, i;
398 	struct netmap_kring *kring;
399 
400 	NMG_LOCK_ASSERT();
401 	nrings = na->num_tx_rings;
402 	kring = na->tx_rings;
403 	for (i = 0; i < nrings; i++) {
404 		if (kring[i].nkr_ft) {
405 			nm_os_free(kring[i].nkr_ft);
406 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
407 		}
408 	}
409 }
410 
411 
412 /*
413  * Allocate the forwarding tables for the rings attached to the bridge ports.
414  */
415 static int
416 nm_alloc_bdgfwd(struct netmap_adapter *na)
417 {
418 	int nrings, l, i, num_dstq;
419 	struct netmap_kring *kring;
420 
421 	NMG_LOCK_ASSERT();
422 	/* all port:rings + broadcast */
423 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
424 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
425 	l += sizeof(struct nm_bdg_q) * num_dstq;
426 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
427 
428 	nrings = netmap_real_rings(na, NR_TX);
429 	kring = na->tx_rings;
430 	for (i = 0; i < nrings; i++) {
431 		struct nm_bdg_fwd *ft;
432 		struct nm_bdg_q *dstq;
433 		int j;
434 
435 		ft = nm_os_malloc(l);
436 		if (!ft) {
437 			nm_free_bdgfwd(na);
438 			return ENOMEM;
439 		}
440 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
441 		for (j = 0; j < num_dstq; j++) {
442 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
443 			dstq[j].bq_len = 0;
444 		}
445 		kring[i].nkr_ft = ft;
446 	}
447 	return 0;
448 }
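/*
 * Editor's note on the layout allocated above (illustrative):
 *
 *	ft[0 .. NM_BDG_BATCH_MAX-1]	struct nm_bdg_fwd, the work area
 *	dstq[0 .. num_dstq-1]		struct nm_bdg_q, one queue per
 *					port/ring pair plus broadcast
 *	dsts[0 .. NM_BDG_BATCH_MAX-1]	uint16_t destination indexes
 *
 * nm_bdg_flush() recovers the second and third arrays from ft by
 * pointer arithmetic.
 */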
449 
450 
451 /* remove from bridge b the ports in slots hw and sw
452  * (sw can be -1 if not needed)
453  */
454 static void
455 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
456 {
457 	int s_hw = hw, s_sw = sw;
458 	int i, lim = b->bdg_active_ports;
459 	uint8_t tmp[NM_BDG_MAXPORTS];
460 
461 	/*
462 	New algorithm:
463 	make a copy of bdg_port_index;
464 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
465 	in the array of bdg_port_index, replacing them with
466 	entries from the bottom of the array;
467 	decrement bdg_active_ports;
468 	acquire BDG_WLOCK() and copy back the array.
469 	 */
470 
471 	if (netmap_verbose)
472 		D("detach %d and %d (lim %d)", hw, sw, lim);
473 	/* make a copy of the list of active ports, update it,
474 	 * and then copy back within BDG_WLOCK().
475 	 */
476 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
477 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
478 		if (hw >= 0 && tmp[i] == hw) {
479 			ND("detach hw %d at %d", hw, i);
480 			lim--; /* point to last active port */
481 			tmp[i] = tmp[lim]; /* swap with i */
482 			tmp[lim] = hw;	/* now this is inactive */
483 			hw = -1;
484 		} else if (sw >= 0 && tmp[i] == sw) {
485 			ND("detach sw %d at %d", sw, i);
486 			lim--;
487 			tmp[i] = tmp[lim];
488 			tmp[lim] = sw;
489 			sw = -1;
490 		} else {
491 			i++;
492 		}
493 	}
494 	if (hw >= 0 || sw >= 0) {
495 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
496 	}
497 
498 	BDG_WLOCK(b);
499 	if (b->bdg_ops.dtor)
500 		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
501 	b->bdg_ports[s_hw] = NULL;
502 	if (s_sw >= 0) {
503 		b->bdg_ports[s_sw] = NULL;
504 	}
505 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
506 	b->bdg_active_ports = lim;
507 	BDG_WUNLOCK(b);
508 
509 	ND("now %d active ports", lim);
510 	if (lim == 0) {
511 		ND("marking bridge %s as free", b->bdg_basename);
512 		nm_os_free(b->ht);
513 		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
514 		NM_BNS_PUT(b);
515 	}
516 }
517 
518 /* nm_bdg_ctl callback for VALE ports */
519 static int
520 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
521 {
522 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
523 	struct nm_bridge *b = vpna->na_bdg;
524 
525 	(void)nmr;	// XXX merge ?
526 	if (attach)
527 		return 0; /* nothing to do */
528 	if (b) {
529 		netmap_set_all_rings(na, 0 /* disable */);
530 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
531 		vpna->na_bdg = NULL;
532 		netmap_set_all_rings(na, 1 /* enable */);
533 	}
534 	/* we took a reference just for the attach */
535 	netmap_adapter_put(na);
536 	return 0;
537 }
538 
539 /* nm_dtor callback for ephemeral VALE ports */
540 static void
541 netmap_vp_dtor(struct netmap_adapter *na)
542 {
543 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
544 	struct nm_bridge *b = vpna->na_bdg;
545 
546 	ND("%s has %d references", na->name, na->na_refcount);
547 
548 	if (b) {
549 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
550 	}
551 
552 	if (na->ifp != NULL && !nm_iszombie(na)) {
553 		WNA(na->ifp) = NULL;
554 		if (vpna->autodelete) {
555 			ND("releasing %s", na->ifp->if_xname);
556 			NMG_UNLOCK();
557 			nm_os_vi_detach(na->ifp);
558 			NMG_LOCK();
559 		}
560 	}
561 }
562 
563 /* remove a persistent VALE port from the system */
564 static int
565 nm_vi_destroy(const char *name)
566 {
567 	struct ifnet *ifp;
568 	struct netmap_vp_adapter *vpna;
569 	int error;
570 
571 	ifp = ifunit_ref(name);
572 	if (!ifp)
573 		return ENXIO;
574 	NMG_LOCK();
575 	/* make sure this is actually a VALE port */
576 	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
577 		error = EINVAL;
578 		goto err;
579 	}
580 
581 	vpna = (struct netmap_vp_adapter *)NA(ifp);
582 
583 	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
584 	if (vpna->autodelete) {
585 		error = EINVAL;
586 		goto err;
587 	}
588 
589 	/* also make sure that nobody is using the interface */
590 	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
591 	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
592 		error = EBUSY;
593 		goto err;
594 	}
595 
596 	NMG_UNLOCK();
597 
598 	D("destroying a persistent vale interface %s", ifp->if_xname);
599 	/* Linux requires all the references are released
600 	 * before unregister
601 	 */
602 	netmap_detach(ifp);
603 	if_rele(ifp);
604 	nm_os_vi_detach(ifp);
605 	return 0;
606 
607 err:
608 	NMG_UNLOCK();
609 	if_rele(ifp);
610 	return error;
611 }
612 
613 static int
614 nm_update_info(struct nmreq *nmr, struct netmap_adapter *na)
615 {
616 	uint64_t memsize;
617 	int ret;
618 	nmr->nr_rx_rings = na->num_rx_rings;
619 	nmr->nr_tx_rings = na->num_tx_rings;
620 	nmr->nr_rx_slots = na->num_rx_desc;
621 	nmr->nr_tx_slots = na->num_tx_desc;
622 	ret = netmap_mem_get_info(na->nm_mem, &memsize, NULL, &nmr->nr_arg2);
623 	nmr->nr_memsize = (uint32_t)memsize;
624 	return ret;
625 }
626 
627 /*
628  * Create a virtual interface registered to the system.
629  * The interface will be attached to a bridge later.
630  */
631 int
632 netmap_vi_create(struct nmreq *nmr, int autodelete)
633 {
634 	struct ifnet *ifp;
635 	struct netmap_vp_adapter *vpna;
636 	struct netmap_mem_d *nmd = NULL;
637 	int error;
638 
639 	/* don't include VALE prefix */
640 	if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
641 		return EINVAL;
642 	ifp = ifunit_ref(nmr->nr_name);
643 	if (ifp) { /* already exists, cannot create a new one */
644 		error = EEXIST;
645 		NMG_LOCK();
646 		if (NM_NA_VALID(ifp)) {
647 			int update_err = nm_update_info(nmr, NA(ifp));
648 			if (update_err)
649 				error = update_err;
650 		}
651 		NMG_UNLOCK();
652 		if_rele(ifp);
653 		return error;
654 	}
655 	error = nm_os_vi_persist(nmr->nr_name, &ifp);
656 	if (error)
657 		return error;
658 
659 	NMG_LOCK();
660 	if (nmr->nr_arg2) {
661 		nmd = netmap_mem_find(nmr->nr_arg2);
662 		if (nmd == NULL) {
663 			error = EINVAL;
664 			goto err_1;
665 		}
666 	}
667 	/* netmap_vp_create creates a struct netmap_vp_adapter */
668 	error = netmap_vp_create(nmr, ifp, nmd, &vpna);
669 	if (error) {
670 		D("error %d", error);
671 		goto err_1;
672 	}
673 	/* persist-specific routines */
674 	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
675 	if (!autodelete) {
676 		netmap_adapter_get(&vpna->up);
677 	} else {
678 		vpna->autodelete = 1;
679 	}
680 	NM_ATTACH_NA(ifp, &vpna->up);
681 	/* return the updated info */
682 	error = nm_update_info(nmr, &vpna->up);
683 	if (error) {
684 		goto err_2;
685 	}
686 	D("returning nr_arg2 %d", nmr->nr_arg2);
687 	if (nmd)
688 		netmap_mem_put(nmd);
689 	NMG_UNLOCK();
690 	D("created %s", ifp->if_xname);
691 	return 0;
692 
693 err_2:
694 	netmap_detach(ifp);
695 err_1:
696 	if (nmd)
697 		netmap_mem_put(nmd);
698 	NMG_UNLOCK();
699 	nm_os_vi_detach(ifp);
700 
701 	return error;
702 }
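/*
 * Editor's sketch of the userspace side: a persistent VALE port is
 * created through the NIOCREGIF ioctl with nr_cmd = NETMAP_BDG_NEWIF,
 * which reaches netmap_vi_create() above. Error handling omitted;
 * illustrative only.
 */
#if 0	/* illustrative only */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/netmap_user.h>

	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	nmr.nr_cmd = NETMAP_BDG_NEWIF;
	strncpy(nmr.nr_name, "myport0", sizeof(nmr.nr_name) - 1);
	ioctl(fd, NIOCREGIF, &nmr); /* the name must not start with "vale" */
#endif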
703 
704 /* Try to get a reference to a netmap adapter attached to a VALE switch.
705  * If the adapter is found (or is created), this function returns 0, a
706  * non NULL pointer is returned into *na, and the caller holds a
707  * reference to the adapter.
708  * If an adapter is not found, then no reference is grabbed and the
709  * function returns an error code, or 0 if there is just a VALE prefix
710  * mismatch. Therefore the caller holds a reference when
711  * (*na != NULL && return == 0).
712  */
713 int
714 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na,
715 		struct netmap_mem_d *nmd, int create)
716 {
717 	char *nr_name = nmr->nr_name;
718 	const char *ifname;
719 	struct ifnet *ifp = NULL;
720 	int error = 0;
721 	struct netmap_vp_adapter *vpna, *hostna = NULL;
722 	struct nm_bridge *b;
723 	int i, j, cand = -1, cand2 = -1;
724 	int needed;
725 
726 	*na = NULL;     /* default return value */
727 
728 	/* first try to see if this is a bridge port. */
729 	NMG_LOCK_ASSERT();
730 	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
731 		return 0;  /* no error, but no VALE prefix */
732 	}
733 
734 	b = nm_find_bridge(nr_name, create);
735 	if (b == NULL) {
736 		D("no bridges available for '%s'", nr_name);
737 		return (create ? ENOMEM : ENXIO);
738 	}
739 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
740 		panic("x");
741 
742 	/* Now we are sure that the name starts with the bridge's name,
743 	 * lookup the port in the bridge. We need to scan the entire
744 	 * list. It is not important to hold a WLOCK on the bridge
745 	 * during the search because NMG_LOCK already guarantees
746 	 * that there are no other possible writers.
747 	 */
748 
749 	/* lookup in the local list of ports */
750 	for (j = 0; j < b->bdg_active_ports; j++) {
751 		i = b->bdg_port_index[j];
752 		vpna = b->bdg_ports[i];
753 		ND("checking %s", vpna->up.name);
754 		if (!strcmp(vpna->up.name, nr_name)) {
755 			netmap_adapter_get(&vpna->up);
756 			ND("found existing if %s", nr_name);
757 			*na = &vpna->up;
758 			return 0;
759 		}
760 	}
761 	/* not found, should we create it? */
762 	if (!create)
763 		return ENXIO;
764 	/* yes we should, see if we have space to attach entries */
765 	needed = 2; /* in some cases we only need 1 */
766 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
767 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
768 		return ENOMEM;
769 	}
770 	/* record the next two ports available, but do not allocate yet */
771 	cand = b->bdg_port_index[b->bdg_active_ports];
772 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
773 	ND("+++ bridge %s port %s used %d avail %d %d",
774 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
775 
776 	/*
777 	 * try to see if there is a matching NIC with this name
778 	 * (after the bridge's name)
779 	 */
780 	ifname = nr_name + b->bdg_namelen + 1;
781 	ifp = ifunit_ref(ifname);
782 	if (!ifp) {
783 		/* Create an ephemeral virtual port
784 		 * This block contains all the ephemeral-specific logics
785 		 */
786 		if (nmr->nr_cmd) {
787 			/* nr_cmd must be 0 for a virtual port */
788 			error = EINVAL;
789 			goto out;
790 		}
791 
792 		/* bdg_netmap_attach creates a struct netmap_adapter */
793 		error = netmap_vp_create(nmr, NULL, nmd, &vpna);
794 		if (error) {
795 			D("error %d", error);
796 			goto out;
797 		}
798 		/* shortcut - we can skip get_hw_na(),
799 		 * ownership check and nm_bdg_attach()
800 		 */
801 	} else {
802 		struct netmap_adapter *hw;
803 
804 		/* the vale:nic syntax is only valid for some commands */
805 		switch (nmr->nr_cmd) {
806 		case NETMAP_BDG_ATTACH:
807 		case NETMAP_BDG_DETACH:
808 		case NETMAP_BDG_POLLING_ON:
809 		case NETMAP_BDG_POLLING_OFF:
810 			break; /* ok */
811 		default:
812 			error = EINVAL;
813 			goto out;
814 		}
815 
816 		error = netmap_get_hw_na(ifp, nmd, &hw);
817 		if (error || hw == NULL)
818 			goto out;
819 
820 		/* host adapter might not be created */
821 		error = hw->nm_bdg_attach(nr_name, hw);
822 		if (error)
823 			goto out;
824 		vpna = hw->na_vp;
825 		hostna = hw->na_hostvp;
826 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
827 			hostna = NULL;
828 	}
829 
830 	BDG_WLOCK(b);
831 	vpna->bdg_port = cand;
832 	ND("NIC  %p to bridge port %d", vpna, cand);
833 	/* bind the port to the bridge (virtual ports are not active) */
834 	b->bdg_ports[cand] = vpna;
835 	vpna->na_bdg = b;
836 	b->bdg_active_ports++;
837 	if (hostna != NULL) {
838 		/* also bind the host stack to the bridge */
839 		b->bdg_ports[cand2] = hostna;
840 		hostna->bdg_port = cand2;
841 		hostna->na_bdg = b;
842 		b->bdg_active_ports++;
843 		ND("host %p to bridge port %d", hostna, cand2);
844 	}
845 	ND("if %s refs %d", ifname, vpna->up.na_refcount);
846 	BDG_WUNLOCK(b);
847 	*na = &vpna->up;
848 	netmap_adapter_get(*na);
849 
850 out:
851 	if (ifp)
852 		if_rele(ifp);
853 
854 	return error;
855 }
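/*
 * Editor's sketch of the calling convention documented above: the
 * caller owns a reference only when the function returns 0 and sets
 * *na. Assumes an nmreq 'nmr' already filled in; illustrative only.
 */
#if 0	/* illustrative only */
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, NULL, 1 /* create */);
	if (error == 0 && na != NULL) {
		/* ... use the adapter ... */
		netmap_adapter_put(na);	/* drop the reference when done */
	}
	NMG_UNLOCK();
#endif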
856 
857 
858 /* Process NETMAP_BDG_ATTACH */
859 static int
860 nm_bdg_ctl_attach(struct nmreq *nmr)
861 {
862 	struct netmap_adapter *na;
863 	struct netmap_mem_d *nmd = NULL;
864 	int error;
865 
866 	NMG_LOCK();
867 
868 	if (nmr->nr_arg2) {
869 		nmd = netmap_mem_find(nmr->nr_arg2);
870 		if (nmd == NULL) {
871 			error = EINVAL;
872 			goto unlock_exit;
873 		}
874 	}
875 
876 	/* XXX check existing one */
877 	error = netmap_get_bdg_na(nmr, &na, nmd, 0);
878 	if (!error) {
879 		error = EBUSY;
880 		goto unref_exit;
881 	}
882 	error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */);
883 	if (error) /* no device */
884 		goto unlock_exit;
885 
886 	if (na == NULL) { /* VALE prefix missing */
887 		error = EINVAL;
888 		goto unlock_exit;
889 	}
890 
891 	if (NETMAP_OWNED_BY_ANY(na)) {
892 		error = EBUSY;
893 		goto unref_exit;
894 	}
895 
896 	if (na->nm_bdg_ctl) {
897 		/* nop for VALE ports. The bwrap needs to put the hwna
898 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
899 		 */
900 		error = na->nm_bdg_ctl(na, nmr, 1);
901 		if (error)
902 			goto unref_exit;
903 		ND("registered %s to netmap-mode", na->name);
904 	}
905 	NMG_UNLOCK();
906 	return 0;
907 
908 unref_exit:
909 	netmap_adapter_put(na);
910 unlock_exit:
911 	NMG_UNLOCK();
912 	return error;
913 }
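/*
 * Editor's sketch: from userspace, NETMAP_BDG_ATTACH travels through
 * the same NIOCREGIF ioctl, optionally requesting the host stack port
 * with nr_arg1 (roughly what vale-ctl(8) does; illustrative only, fd
 * open on /dev/netmap as in the earlier sketch).
 */
#if 0	/* illustrative only */
	struct nmreq nmr;

	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	nmr.nr_cmd = NETMAP_BDG_ATTACH;
	nmr.nr_arg1 = NETMAP_BDG_HOST;	/* also bind the host stack rings */
	strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name) - 1);
	ioctl(fd, NIOCREGIF, &nmr);
#endif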
914 
915 static inline int
916 nm_is_bwrap(struct netmap_adapter *na)
917 {
918 	return na->nm_register == netmap_bwrap_reg;
919 }
920 
921 /* process NETMAP_BDG_DETACH */
922 static int
923 nm_bdg_ctl_detach(struct nmreq *nmr)
924 {
925 	struct netmap_adapter *na;
926 	int error;
927 
928 	NMG_LOCK();
929 	error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */);
930 	if (error) { /* no device, or another bridge or user owns the device */
931 		goto unlock_exit;
932 	}
933 
934 	if (na == NULL) { /* VALE prefix missing */
935 		error = EINVAL;
936 		goto unlock_exit;
937 	} else if (nm_is_bwrap(na) &&
938 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
939 		/* Don't detach a NIC with polling */
940 		error = EBUSY;
941 		netmap_adapter_put(na);
942 		goto unlock_exit;
943 	}
944 	if (na->nm_bdg_ctl) {
945 		/* remove the port from bridge. The bwrap
946 		 * also needs to put the hwna in normal mode
947 		 */
948 		error = na->nm_bdg_ctl(na, nmr, 0);
949 	}
950 
951 	netmap_adapter_put(na);
952 unlock_exit:
953 	NMG_UNLOCK();
954 	return error;
955 
956 }
957 
958 struct nm_bdg_polling_state;
959 struct
960 nm_bdg_kthread {
961 	struct nm_kctx *nmk;
962 	u_int qfirst;
963 	u_int qlast;
964 	struct nm_bdg_polling_state *bps;
965 };
966 
967 struct nm_bdg_polling_state {
968 	bool configured;
969 	bool stopped;
970 	struct netmap_bwrap_adapter *bna;
971 	u_int reg;
972 	u_int qfirst;
973 	u_int qlast;
974 	u_int cpu_from;
975 	u_int ncpus;
976 	struct nm_bdg_kthread *kthreads;
977 };
978 
979 static void
980 netmap_bwrap_polling(void *data, int is_kthread)
981 {
982 	struct nm_bdg_kthread *nbk = data;
983 	struct netmap_bwrap_adapter *bna;
984 	u_int qfirst, qlast, i;
985 	struct netmap_kring *kring0, *kring;
986 
987 	if (!nbk)
988 		return;
989 	qfirst = nbk->qfirst;
990 	qlast = nbk->qlast;
991 	bna = nbk->bps->bna;
992 	kring0 = NMR(bna->hwna, NR_RX);
993 
994 	for (i = qfirst; i < qlast; i++) {
995 		kring = kring0 + i;
996 		kring->nm_notify(kring, 0);
997 	}
998 }
999 
1000 static int
1001 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
1002 {
1003 	struct nm_kctx_cfg kcfg;
1004 	int i, j;
1005 
1006 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
1007 	if (bps->kthreads == NULL)
1008 		return ENOMEM;
1009 
1010 	bzero(&kcfg, sizeof(kcfg));
1011 	kcfg.worker_fn = netmap_bwrap_polling;
1012 	kcfg.use_kthread = 1;
1013 	for (i = 0; i < bps->ncpus; i++) {
1014 		struct nm_bdg_kthread *t = bps->kthreads + i;
1015 		int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
1016 		int affinity = bps->cpu_from + i;
1017 
1018 		t->bps = bps;
1019 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
1020 		t->qlast = all ? bps->qlast : t->qfirst + 1;
1021 		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
1022 			t->qlast);
1023 
1024 		kcfg.type = i;
1025 		kcfg.worker_private = t;
1026 		t->nmk = nm_os_kctx_create(&kcfg, 0, NULL);
1027 		if (t->nmk == NULL) {
1028 			goto cleanup;
1029 		}
1030 		nm_os_kctx_worker_setaff(t->nmk, affinity);
1031 	}
1032 	return 0;
1033 
1034 cleanup:
1035 	for (j = 0; j < i; j++) {
1036 		struct nm_bdg_kthread *t = bps->kthreads + j;
1037 		nm_os_kctx_destroy(t->nmk);
1038 	}
1039 	nm_os_free(bps->kthreads);
1040 	return EFAULT;
1041 }
1042 
1043 /* A variant of ptnetmap_start_kthreads() */
1044 static int
1045 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
1046 {
1047 	int error, i, j;
1048 
1049 	if (!bps) {
1050 		D("polling is not configured");
1051 		return EFAULT;
1052 	}
1053 	bps->stopped = false;
1054 
1055 	for (i = 0; i < bps->ncpus; i++) {
1056 		struct nm_bdg_kthread *t = bps->kthreads + i;
1057 		error = nm_os_kctx_worker_start(t->nmk);
1058 		if (error) {
1059 			D("error in nm_kthread_start()");
1060 			goto cleanup;
1061 		}
1062 	}
1063 	return 0;
1064 
1065 cleanup:
1066 	for (j = 0; j < i; j++) {
1067 		struct nm_bdg_kthread *t = bps->kthreads + j;
1068 		nm_os_kctx_worker_stop(t->nmk);
1069 	}
1070 	bps->stopped = true;
1071 	return error;
1072 }
1073 
1074 static void
1075 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
1076 {
1077 	int i;
1078 
1079 	if (!bps)
1080 		return;
1081 
1082 	for (i = 0; i < bps->ncpus; i++) {
1083 		struct nm_bdg_kthread *t = bps->kthreads + i;
1084 		nm_os_kctx_worker_stop(t->nmk);
1085 		nm_os_kctx_destroy(t->nmk);
1086 	}
1087 	bps->stopped = true;
1088 }
1089 
1090 static int
1091 get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
1092 			struct nm_bdg_polling_state *bps)
1093 {
1094 	int req_cpus, avail_cpus, core_from;
1095 	u_int reg, i, qfirst, qlast;
1096 
1097 	avail_cpus = nm_os_ncpus();
1098 	req_cpus = nmr->nr_arg1;
1099 
1100 	if (req_cpus == 0) {
1101 		D("req_cpus must be > 0");
1102 		return EINVAL;
1103 	} else if (req_cpus >= avail_cpus) {
1104 		D("for safety, we need at least one core left in the system");
1105 		return EINVAL;
1106 	}
1107 	reg = nmr->nr_flags & NR_REG_MASK;
1108 	i = nmr->nr_ringid & NETMAP_RING_MASK;
1109 	/*
1110 	 * ONE_NIC: dedicate one core to one ring. If multiple cores
1111 	 *          are specified, consecutive rings are also polled.
1112 	 *          For example, if ringid=2 and 2 cores are given,
1113 	 *          ring 2 and 3 are polled by core 2 and 3, respectively.
1114 	 * ALL_NIC: poll all the rings using a core specified by ringid.
1115 	 *          the number of cores must be 1.
1116 	 */
1117 	if (reg == NR_REG_ONE_NIC) {
1118 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1119 			D("only %d rings exist (ring %u-%u is given)",
1120 				nma_get_nrings(na, NR_RX), i, i+req_cpus);
1121 			return EINVAL;
1122 		}
1123 		qfirst = i;
1124 		qlast = qfirst + req_cpus;
1125 		core_from = qfirst;
1126 	} else if (reg == NR_REG_ALL_NIC) {
1127 		if (req_cpus != 1) {
1128 			D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
1129 			return EINVAL;
1130 		}
1131 		qfirst = 0;
1132 		qlast = nma_get_nrings(na, NR_RX);
1133 		core_from = i;
1134 	} else {
1135 		D("reg must be ALL_NIC or ONE_NIC");
1136 		return EINVAL;
1137 	}
1138 
1139 	bps->reg = reg;
1140 	bps->qfirst = qfirst;
1141 	bps->qlast = qlast;
1142 	bps->cpu_from = core_from;
1143 	bps->ncpus = req_cpus;
1144 	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1145 		reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
1146 		qfirst, qlast, core_from, req_cpus);
1147 	return 0;
1148 }
1149 
1150 static int
1151 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na)
1152 {
1153 	struct nm_bdg_polling_state *bps;
1154 	struct netmap_bwrap_adapter *bna;
1155 	int error;
1156 
1157 	bna = (struct netmap_bwrap_adapter *)na;
1158 	if (bna->na_polling_state) {
1159 		D("ERROR adapter already in polling mode");
1160 		return EFAULT;
1161 	}
1162 
1163 	bps = nm_os_malloc(sizeof(*bps));
1164 	if (!bps)
1165 		return ENOMEM;
1166 	bps->configured = false;
1167 	bps->stopped = true;
1168 
1169 	if (get_polling_cfg(nmr, na, bps)) {
1170 		nm_os_free(bps);
1171 		return EINVAL;
1172 	}
1173 
1174 	if (nm_bdg_create_kthreads(bps)) {
1175 		nm_os_free(bps);
1176 		return EFAULT;
1177 	}
1178 
1179 	bps->configured = true;
1180 	bna->na_polling_state = bps;
1181 	bps->bna = bna;
1182 
1183 	/* disable interrupts if possible */
1184 	nma_intr_enable(bna->hwna, 0);
1185 	/* start kthread now */
1186 	error = nm_bdg_polling_start_kthreads(bps);
1187 	if (error) {
1188 		D("ERROR nm_bdg_polling_start_kthread()");
1189 		nm_os_free(bps->kthreads);
1190 		nm_os_free(bps);
1191 		bna->na_polling_state = NULL;
1192 		nma_intr_enable(bna->hwna, 1);
1193 	}
1194 	return error;
1195 }
1196 
1197 static int
1198 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na)
1199 {
1200 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1201 	struct nm_bdg_polling_state *bps;
1202 
1203 	if (!bna->na_polling_state) {
1204 		D("ERROR adapter is not in polling mode");
1205 		return EFAULT;
1206 	}
1207 	bps = bna->na_polling_state;
1208 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1209 	bps->configured = false;
1210 	nm_os_free(bps);
1211 	bna->na_polling_state = NULL;
1212 	/* reenable interrupts */
1213 	nma_intr_enable(bna->hwna, 1);
1214 	return 0;
1215 }
1216 
1217 /* Called by either user's context (netmap_ioctl())
1218  * or external kernel modules (e.g., Openvswitch).
1219  * Operation is indicated in nmr->nr_cmd.
1220  * NETMAP_BDG_REGOPS, which sets the configure/lookup/dtor functions of a
1221  * bridge, requires the bdg_ops argument; the other commands ignore it.
1222  *
1223  * Called without NMG_LOCK.
1224  */
1225 int
1226 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
1227 {
1228 	struct nm_bridge *b, *bridges;
1229 	struct netmap_adapter *na;
1230 	struct netmap_vp_adapter *vpna;
1231 	char *name = nmr->nr_name;
1232 	int cmd = nmr->nr_cmd, namelen = strlen(name);
1233 	int error = 0, i, j;
1234 	u_int num_bridges;
1235 
1236 	netmap_bns_getbridges(&bridges, &num_bridges);
1237 
1238 	switch (cmd) {
1239 	case NETMAP_BDG_NEWIF:
1240 		error = netmap_vi_create(nmr, 0 /* no autodelete */);
1241 		break;
1242 
1243 	case NETMAP_BDG_DELIF:
1244 		error = nm_vi_destroy(nmr->nr_name);
1245 		break;
1246 
1247 	case NETMAP_BDG_ATTACH:
1248 		error = nm_bdg_ctl_attach(nmr);
1249 		break;
1250 
1251 	case NETMAP_BDG_DETACH:
1252 		error = nm_bdg_ctl_detach(nmr);
1253 		break;
1254 
1255 	case NETMAP_BDG_LIST:
1256 		/* this is used to enumerate bridges and ports */
1257 		if (namelen) { /* look up indexes of bridge and port */
1258 			if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
1259 				error = EINVAL;
1260 				break;
1261 			}
1262 			NMG_LOCK();
1263 			b = nm_find_bridge(name, 0 /* don't create */);
1264 			if (!b) {
1265 				error = ENOENT;
1266 				NMG_UNLOCK();
1267 				break;
1268 			}
1269 
1270 			error = 0;
1271 			nmr->nr_arg1 = b - bridges; /* bridge index */
1272 			nmr->nr_arg2 = NM_BDG_NOPORT;
1273 			for (j = 0; j < b->bdg_active_ports; j++) {
1274 				i = b->bdg_port_index[j];
1275 				vpna = b->bdg_ports[i];
1276 				if (vpna == NULL) {
1277 					D("---AAAAAAAAARGH-------");
1278 					continue;
1279 				}
1280 				/* the former and the latter identify a
1281 				 * virtual port and a NIC, respectively
1282 				 */
1283 				if (!strcmp(vpna->up.name, name)) {
1284 					nmr->nr_arg2 = i; /* port index */
1285 					break;
1286 				}
1287 			}
1288 			NMG_UNLOCK();
1289 		} else {
1290 			/* return the first non-empty entry starting from
1291 			 * bridge nr_arg1 and port nr_arg2.
1292 			 *
1293 			 * Users can detect the end of the same bridge by
1294 			 * seeing the new and old value of nr_arg1, and can
1295 			 * detect the end of all the bridges by error != 0
1296 			 */
1297 			i = nmr->nr_arg1;
1298 			j = nmr->nr_arg2;
1299 
1300 			NMG_LOCK();
1301 			for (error = ENOENT; i < NM_BRIDGES; i++) {
1302 				b = bridges + i;
1303 				for ( ; j < NM_BDG_MAXPORTS; j++) {
1304 					if (b->bdg_ports[j] == NULL)
1305 						continue;
1306 					vpna = b->bdg_ports[j];
1307 					strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
1308 					error = 0;
1309 					goto out;
1310 				}
1311 				j = 0; /* following bridges scan from 0 */
1312 			}
1313 		out:
1314 			nmr->nr_arg1 = i;
1315 			nmr->nr_arg2 = j;
1316 			NMG_UNLOCK();
1317 		}
1318 		break;
1319 
1320 	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
1321 		/* register callbacks to the given bridge.
1322 		 * nmr->nr_name may be just bridge's name (including ':'
1323 		 * if it is not just NM_NAME).
1324 		 */
1325 		if (!bdg_ops) {
1326 			error = EINVAL;
1327 			break;
1328 		}
1329 		NMG_LOCK();
1330 		b = nm_find_bridge(name, 0 /* don't create */);
1331 		if (!b) {
1332 			error = EINVAL;
1333 		} else {
1334 			b->bdg_ops = *bdg_ops;
1335 		}
1336 		NMG_UNLOCK();
1337 		break;
1338 
1339 	case NETMAP_BDG_VNET_HDR:
1340 		/* Valid lengths for the virtio-net header are 0 (no header),
1341 		   10 and 12. */
1342 		if (nmr->nr_arg1 != 0 &&
1343 			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
1344 				nmr->nr_arg1 != 12) {
1345 			error = EINVAL;
1346 			break;
1347 		}
1348 		NMG_LOCK();
1349 		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1350 		if (na && !error) {
1351 			vpna = (struct netmap_vp_adapter *)na;
1352 			na->virt_hdr_len = nmr->nr_arg1;
1353 			if (na->virt_hdr_len) {
1354 				vpna->mfs = NETMAP_BUF_SIZE(na);
1355 			}
1356 			D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
1357 			netmap_adapter_put(na);
1358 		} else if (!na) {
1359 			error = ENXIO;
1360 		}
1361 		NMG_UNLOCK();
1362 		break;
1363 
1364 	case NETMAP_BDG_POLLING_ON:
1365 	case NETMAP_BDG_POLLING_OFF:
1366 		NMG_LOCK();
1367 		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1368 		if (na && !error) {
1369 			if (!nm_is_bwrap(na)) {
1370 				error = EOPNOTSUPP;
1371 			} else if (cmd == NETMAP_BDG_POLLING_ON) {
1372 				error = nm_bdg_ctl_polling_start(nmr, na);
1373 				if (!error)
1374 					netmap_adapter_get(na);
1375 			} else {
1376 				error = nm_bdg_ctl_polling_stop(nmr, na);
1377 				if (!error)
1378 					netmap_adapter_put(na);
1379 			}
1380 			netmap_adapter_put(na);
1381 		}
1382 		NMG_UNLOCK();
1383 		break;
1384 
1385 	default:
1386 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
1387 		error = EINVAL;
1388 		break;
1389 	}
1390 	return error;
1391 }
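/*
 * Editor's sketch: how an external kernel module might install its
 * own callbacks on an existing bridge via the NETMAP_BDG_REGOPS case
 * above (illustrative only; my_fixed_lookup is the hypothetical
 * function sketched after struct nm_bridge).
 */
#if 0	/* illustrative only */
	struct netmap_bdg_ops ops;
	struct nmreq nmr;

	memset(&ops, 0, sizeof(ops));
	ops.lookup = my_fixed_lookup;
	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_cmd = NETMAP_BDG_REGOPS;
	strncpy(nmr.nr_name, "vale0:", sizeof(nmr.nr_name) - 1);
	netmap_bdg_ctl(&nmr, &ops);	/* the bridge must already exist */
#endif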
1392 
1393 int
1394 netmap_bdg_config(struct nmreq *nmr)
1395 {
1396 	struct nm_bridge *b;
1397 	int error = EINVAL;
1398 
1399 	NMG_LOCK();
1400 	b = nm_find_bridge(nmr->nr_name, 0);
1401 	if (!b) {
1402 		NMG_UNLOCK();
1403 		return error;
1404 	}
1405 	NMG_UNLOCK();
1406 	/* Don't call config() with NMG_LOCK() held */
1407 	BDG_RLOCK(b);
1408 	if (b->bdg_ops.config != NULL)
1409 		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
1410 	BDG_RUNLOCK(b);
1411 	return error;
1412 }
1413 
1414 
1415 /* nm_krings_create callback for VALE ports.
1416  * Calls the standard netmap_krings_create, then adds leases on rx
1417  * rings and bdgfwd on tx rings.
1418  */
1419 static int
1420 netmap_vp_krings_create(struct netmap_adapter *na)
1421 {
1422 	u_int tailroom;
1423 	int error, i;
1424 	uint32_t *leases;
1425 	u_int nrx = netmap_real_rings(na, NR_RX);
1426 
1427 	/*
1428 	 * Leases are attached to RX rings on vale ports
1429 	 */
1430 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
1431 
1432 	error = netmap_krings_create(na, tailroom);
1433 	if (error)
1434 		return error;
1435 
1436 	leases = na->tailroom;
1437 
1438 	for (i = 0; i < nrx; i++) { /* Receive rings */
1439 		na->rx_rings[i].nkr_leases = leases;
1440 		leases += na->num_rx_desc;
1441 	}
1442 
1443 	error = nm_alloc_bdgfwd(na);
1444 	if (error) {
1445 		netmap_krings_delete(na);
1446 		return error;
1447 	}
1448 
1449 	return 0;
1450 }
1451 
1452 
1453 /* nm_krings_delete callback for VALE ports. */
1454 static void
1455 netmap_vp_krings_delete(struct netmap_adapter *na)
1456 {
1457 	nm_free_bdgfwd(na);
1458 	netmap_krings_delete(na);
1459 }
1460 
1461 
1462 static int
1463 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1464 	struct netmap_vp_adapter *na, u_int ring_nr);
1465 
1466 
1467 /*
1468  * main dispatch routine for the bridge.
1469  * Grab packets from a kring, move them into the ft structure
1470  * associated to the tx (input) port. Max one instance per port,
1471  * filtered on input (ioctl, poll or XXX).
1472  * Returns the next position in the ring.
1473  */
1474 static int
1475 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1476 {
1477 	struct netmap_vp_adapter *na =
1478 		(struct netmap_vp_adapter*)kring->na;
1479 	struct netmap_ring *ring = kring->ring;
1480 	struct nm_bdg_fwd *ft;
1481 	u_int ring_nr = kring->ring_id;
1482 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1483 	u_int ft_i = 0;	/* start from 0 */
1484 	u_int frags = 1; /* how many frags ? */
1485 	struct nm_bridge *b = na->na_bdg;
1486 
1487 	/* To protect against modifications to the bridge we acquire a
1488 	 * shared lock, waiting if we can sleep (if the source port is
1489 	 * attached to a user process) or with a trylock otherwise (NICs).
1490 	 */
1491 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1492 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1493 		BDG_RLOCK(b);
1494 	else if (!BDG_RTRYLOCK(b))
1495 		return j;
1496 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1497 	ft = kring->nkr_ft;
1498 
1499 	for (; likely(j != end); j = nm_next(j, lim)) {
1500 		struct netmap_slot *slot = &ring->slot[j];
1501 		char *buf;
1502 
1503 		ft[ft_i].ft_len = slot->len;
1504 		ft[ft_i].ft_flags = slot->flags;
1505 
1506 		ND("flags is 0x%x", slot->flags);
1507 		/* we do not use the buf changed flag, but we still need to reset it */
1508 		slot->flags &= ~NS_BUF_CHANGED;
1509 
1510 		/* this slot goes into a list so initialize the link field */
1511 		ft[ft_i].ft_next = NM_FT_NULL;
1512 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1513 			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1514 		if (unlikely(buf == NULL)) {
1515 			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1516 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1517 				kring->name, j, ft[ft_i].ft_len);
1518 			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1519 			ft[ft_i].ft_len = 0;
1520 			ft[ft_i].ft_flags = 0;
1521 		}
1522 		__builtin_prefetch(buf);
1523 		++ft_i;
1524 		if (slot->flags & NS_MOREFRAG) {
1525 			frags++;
1526 			continue;
1527 		}
1528 		if (unlikely(netmap_verbose && frags > 1))
1529 			RD(5, "%d frags at %d", frags, ft_i - frags);
1530 		ft[ft_i - frags].ft_frags = frags;
1531 		frags = 1;
1532 		if (unlikely((int)ft_i >= bridge_batch))
1533 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1534 	}
1535 	if (frags > 1) {
1536 		/* Here ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG, and we
1537 		 * have to fix frags count. */
1538 		frags--;
1539 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1540 		ft[ft_i - frags].ft_frags = frags;
1541 		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
1542 	}
1543 	if (ft_i)
1544 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1545 	BDG_RUNLOCK(b);
1546 	return j;
1547 }
1548 
1549 
1550 /* ----- FreeBSD if_bridge hash function ------- */
1551 
1552 /*
1553  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1554  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1555  *
1556  * http://www.burtleburtle.net/bob/hash/spooky.html
1557  */
1558 #define mix(a, b, c)                                                    \
1559 do {                                                                    \
1560         a -= b; a -= c; a ^= (c >> 13);                                 \
1561         b -= c; b -= a; b ^= (a << 8);                                  \
1562         c -= a; c -= b; c ^= (b >> 13);                                 \
1563         a -= b; a -= c; a ^= (c >> 12);                                 \
1564         b -= c; b -= a; b ^= (a << 16);                                 \
1565         c -= a; c -= b; c ^= (b >> 5);                                  \
1566         a -= b; a -= c; a ^= (c >> 3);                                  \
1567         b -= c; b -= a; b ^= (a << 10);                                 \
1568         c -= a; c -= b; c ^= (b >> 15);                                 \
1569 } while (/*CONSTCOND*/0)
1570 
1571 
1572 static __inline uint32_t
1573 nm_bridge_rthash(const uint8_t *addr)
1574 {
1575         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1576 
1577         b += addr[5] << 8;
1578         b += addr[4];
1579         a += addr[3] << 24;
1580         a += addr[2] << 16;
1581         a += addr[1] << 8;
1582         a += addr[0];
1583 
1584         mix(a, b, c);
1585 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
1586         return (c & BRIDGE_RTHASH_MASK);
1587 }
1588 
1589 #undef mix
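/*
 * Editor's example: hashing a destination MAC into the NM_BDG_HASH
 * entry forwarding table (illustrative only):
 */
#if 0	/* illustrative only */
	uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0x3c, 0x4d, 0x5e };
	uint32_t h = nm_bridge_rthash(mac);	/* 0 <= h < NM_BDG_HASH */
	/* ht[h] then holds the 48-bit MAC and its destination port */
#endif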
1590 
1591 
1592 /* nm_register callback for VALE ports */
1593 static int
1594 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1595 {
1596 	struct netmap_vp_adapter *vpna =
1597 		(struct netmap_vp_adapter*)na;
1598 	enum txrx t;
1599 	int i;
1600 
1601 	/* persistent ports may be put in netmap mode
1602 	 * before being attached to a bridge
1603 	 */
1604 	if (vpna->na_bdg)
1605 		BDG_WLOCK(vpna->na_bdg);
1606 	if (onoff) {
1607 		for_rx_tx(t) {
1608 			for (i = 0; i < netmap_real_rings(na, t); i++) {
1609 				struct netmap_kring *kring = &NMR(na, t)[i];
1610 
1611 				if (nm_kring_pending_on(kring))
1612 					kring->nr_mode = NKR_NETMAP_ON;
1613 			}
1614 		}
1615 		if (na->active_fds == 0)
1616 			na->na_flags |= NAF_NETMAP_ON;
1617 		 /* XXX on FreeBSD, persistent VALE ports should also
1618 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1619 		 */
1620 	} else {
1621 		if (na->active_fds == 0)
1622 			na->na_flags &= ~NAF_NETMAP_ON;
1623 		for_rx_tx(t) {
1624 			for (i = 0; i < netmap_real_rings(na, t); i++) {
1625 				struct netmap_kring *kring = &NMR(na, t)[i];
1626 
1627 				if (nm_kring_pending_off(kring))
1628 					kring->nr_mode = NKR_NETMAP_OFF;
1629 			}
1630 		}
1631 	}
1632 	if (vpna->na_bdg)
1633 		BDG_WUNLOCK(vpna->na_bdg);
1634 	return 0;
1635 }
1636 
1637 
1638 /*
1639  * Lookup function for a learning bridge.
1640  * Update the hash table with the source address,
1641  * and then return the destination port index and the
1642  * ring in *dst_ring (at the moment, always ring 0).
1643  */
1644 u_int
1645 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1646 		struct netmap_vp_adapter *na)
1647 {
1648 	uint8_t *buf = ft->ft_buf;
1649 	u_int buf_len = ft->ft_len;
1650 	struct nm_hash_ent *ht = na->na_bdg->ht;
1651 	uint32_t sh, dh;
1652 	u_int dst, mysrc = na->bdg_port;
1653 	uint64_t smac, dmac;
1654 	uint8_t indbuf[12];
1655 
1656 	/* safety check, unfortunately we have many cases */
1657 	if (buf_len >= 14 + na->up.virt_hdr_len) {
1658 		/* virthdr + mac_hdr in the same slot */
1659 		buf += na->up.virt_hdr_len;
1660 		buf_len -= na->up.virt_hdr_len;
1661 	} else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
1662 		/* only header in first fragment */
1663 		ft++;
1664 		buf = ft->ft_buf;
1665 		buf_len = ft->ft_len;
1666 	} else {
1667 		RD(5, "invalid buf format, length %d", buf_len);
1668 		return NM_BDG_NOPORT;
1669 	}
1670 
1671 	if (ft->ft_flags & NS_INDIRECT) {
1672 		if (copyin(buf, indbuf, sizeof(indbuf))) {
1673 			return NM_BDG_NOPORT;
1674 		}
1675 		buf = indbuf;
1676 	}
1677 
1678 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1679 	smac = le64toh(*(uint64_t *)(buf + 4));
1680 	smac >>= 16;
1681 
1682 	/*
1683 	 * The hash is somewhat expensive, there might be some
1684 	 * worthwhile optimizations here.
1685 	 */
1686 	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1687 		uint8_t *s = buf+6;
1688 		sh = nm_bridge_rthash(s); /* hash of source */
1689 		/* update source port forwarding entry */
1690 		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
1691 		ht[sh].ports = mysrc;
1692 		if (netmap_verbose)
1693 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1694 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1695 	}
1696 	dst = NM_BDG_BROADCAST;
1697 	if ((buf[0] & 1) == 0) { /* unicast */
1698 		dh = nm_bridge_rthash(buf); /* hash of dst */
1699 		if (ht[dh].mac == dmac) {	/* found dst */
1700 			dst = ht[dh].ports;
1701 		}
1702 	}
1703 	return dst;
1704 }
1705 
1706 
1707 /*
1708  * Available space in the ring. Only used in VALE code
1709  * and only with is_rx = 1
1710  */
1711 static inline uint32_t
1712 nm_kr_space(struct netmap_kring *k, int is_rx)
1713 {
1714 	int space;
1715 
1716 	if (is_rx) {
1717 		int busy = k->nkr_hwlease - k->nr_hwcur;
1718 		if (busy < 0)
1719 			busy += k->nkr_num_slots;
1720 		space = k->nkr_num_slots - 1 - busy;
1721 	} else {
1722 		/* XXX never used in this branch */
1723 		space = k->nr_hwtail - k->nkr_hwlease;
1724 		if (space < 0)
1725 			space += k->nkr_num_slots;
1726 	}
1727 #if 0
1728 	// sanity check
1729 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1730 		k->nr_hwcur >= k->nkr_num_slots ||
1731 		k->nr_hwtail >= k->nkr_num_slots ||
1732 		busy < 0 ||
1733 		busy >= k->nkr_num_slots) {
1734 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1735 			k->nkr_lease_idx, k->nkr_num_slots);
1736 	}
1737 #endif
1738 	return space;
1739 }
1740 
1741 
1742 
1743 
1744 /* make a lease on the kring for N positions. return the
1745  * lease index
1746  * XXX only used in VALE code and with is_rx = 1
1747  */
1748 static inline uint32_t
1749 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1750 {
1751 	uint32_t lim = k->nkr_num_slots - 1;
1752 	uint32_t lease_idx = k->nkr_lease_idx;
1753 
1754 	k->nkr_leases[lease_idx] = NR_NOSLOT;
1755 	k->nkr_lease_idx = nm_next(lease_idx, lim);
1756 
1757 	if (n > nm_kr_space(k, is_rx)) {
1758 		D("invalid request for %d slots", n);
1759 		panic("x");
1760 	}
1761 	/* XXX verify that there are n slots */
1762 	k->nkr_hwlease += n;
1763 	if (k->nkr_hwlease > lim)
1764 		k->nkr_hwlease -= lim + 1;
1765 
1766 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1767 		k->nr_hwcur >= k->nkr_num_slots ||
1768 		k->nr_hwtail >= k->nkr_num_slots ||
1769 		k->nkr_lease_idx >= k->nkr_num_slots) {
1770 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1771 			k->na->name,
1772 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1773 			k->nkr_lease_idx, k->nkr_num_slots);
1774 	}
1775 	return lease_idx;
1776 }
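/*
 * Editor's sketch of the reserve/copy/publish pattern that
 * nm_bdg_flush() below builds on (simplified; kring, n and lim are
 * assumed to be in scope; illustrative only).
 */
#if 0	/* illustrative only */
	uint32_t j, my_start, lease_idx;

	mtx_lock(&kring->q_lock);
	my_start = j = kring->nkr_hwlease;	/* first slot we own */
	lease_idx = nm_kr_lease(kring, n, 1);	/* reserve n rx slots */
	mtx_unlock(&kring->q_lock);

	/* fill the leased slots without holding the lock */
	while (n-- > 0) {
		/* ... copy one packet into ring->slot[j] ... */
		j = nm_next(j, lim);
	}

	mtx_lock(&kring->q_lock);
	kring->nkr_leases[lease_idx] = j;	/* mark our lease complete */
	/* nr_hwtail advances once all earlier leases have completed */
	mtx_unlock(&kring->q_lock);
#endif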
1777 
1778 /*
1779  *
1780  * This flush routine supports only unicast and broadcast but a large
1781  * number of ports, and lets us replace the learn and dispatch functions.
1782  */
1783 int
1784 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1785 		u_int ring_nr)
1786 {
1787 	struct nm_bdg_q *dst_ents, *brddst;
1788 	uint16_t num_dsts = 0, *dsts;
1789 	struct nm_bridge *b = na->na_bdg;
1790 	u_int i, me = na->bdg_port;
1791 
1792 	/*
1793 	 * The work area (pointed to by ft) is followed by an array of
1794 	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
1795 	 * queues per port plus one for the broadcast traffic.
1796 	 * Then we have an array of destination indexes.
1797 	 */
1798 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1799 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1800 
1801 	/* first pass: find a destination for each packet in the batch */
1802 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1803 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1804 		uint16_t dst_port, d_i;
1805 		struct nm_bdg_q *d;
1806 
1807 		ND("slot %d frags %d", i, ft[i].ft_frags);
1808 		/* Drop the packet if the virtio-net header is not in the first
1809 		   fragment nor at the very beginning of the second. */
1810 		if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
1811 			continue;
1812 		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
1813 		if (netmap_verbose > 255)
1814 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
1815 		if (dst_port >= NM_BDG_NOPORT)
1816 			continue; /* this packet is to be dropped */
1817 		else if (dst_port == NM_BDG_BROADCAST)
1818 			dst_ring = 0; /* broadcasts always go to ring 0 */
1819 		else if (unlikely(dst_port == me ||
1820 		    !b->bdg_ports[dst_port]))
1821 			continue;
1822 
1823 		/* get a position in the scratch pad */
1824 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1825 		d = dst_ents + d_i;
1826 
1827 		/* append the first fragment to the list */
1828 		if (d->bq_head == NM_FT_NULL) { /* new destination */
1829 			d->bq_head = d->bq_tail = i;
1830 			/* remember this position to be scanned later */
1831 			if (dst_port != NM_BDG_BROADCAST)
1832 				dsts[num_dsts++] = d_i;
1833 		} else {
1834 			ft[d->bq_tail].ft_next = i;
1835 			d->bq_tail = i;
1836 		}
1837 		d->bq_len += ft[i].ft_frags;
1838 	}
1839 
1840 	/*
1841 	 * Broadcast traffic goes to ring 0 on all destinations.
1842 	 * So we need to add these rings to the list of ports to scan.
1843 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1844 	 * expensive. We should keep a compact list of active destinations
1845 	 * so we could shorten this loop.
1846 	 */
1847 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1848 	if (brddst->bq_head != NM_FT_NULL) {
1849 		u_int j;
1850 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
1851 			uint16_t d_i;
1852 			i = b->bdg_port_index[j];
1853 			if (unlikely(i == me))
1854 				continue;
1855 			d_i = i * NM_BDG_MAXRINGS;
1856 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
1857 				dsts[num_dsts++] = d_i;
1858 		}
1859 	}
1860 
1861 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1862 	/* second pass: scan destinations */
1863 	for (i = 0; i < num_dsts; i++) {
1864 		struct netmap_vp_adapter *dst_na;
1865 		struct netmap_kring *kring;
1866 		struct netmap_ring *ring;
1867 		u_int dst_nr, lim, j, d_i, next, brd_next;
1868 		u_int needed, howmany;
1869 		int retry = netmap_txsync_retry;
1870 		struct nm_bdg_q *d;
1871 		uint32_t my_start = 0, lease_idx = 0;
1872 		int nrings;
1873 		int virt_hdr_mismatch = 0;
1874 
1875 		d_i = dsts[i];
1876 		ND("second pass %d port %d", i, d_i);
1877 		d = dst_ents + d_i;
1878 		// XXX fix the division
1879 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1880 		/* protect from the lookup function returning an inactive
1881 		 * destination port
1882 		 */
1883 		if (unlikely(dst_na == NULL))
1884 			goto cleanup;
1885 		if (dst_na->up.na_flags & NAF_SW_ONLY)
1886 			goto cleanup;
1887 		/*
1888 		 * The interface may be in !netmap mode in two cases:
1889 		 * - when na is attached but not activated yet;
1890 		 * - when na is being deactivated but is still attached.
1891 		 */
1892 		if (unlikely(!nm_netmap_on(&dst_na->up))) {
1893 			ND("not in netmap mode!");
1894 			goto cleanup;
1895 		}
1896 
1897 		/* there is at least one unicast or broadcast packet */
1898 		brd_next = brddst->bq_head;
1899 		next = d->bq_head;
1900 		/* we need to reserve this many slots. If fewer are
1901 		 * available, some packets will be dropped.
1902 		 * Packets may have multiple fragments, so there is a chance
1903 		 * that we may not use all of the slots we have claimed, in
1904 		 * which case we will need to handle the leftover ones when
1905 		 * we regain the lock.
1906 		 */
1907 		needed = d->bq_len + brddst->bq_len;
1908 
1909 		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
1910 			if (netmap_verbose) {
1911 				RD(3, "virt_hdr_mismatch, src %d dst %d",
1912 				    na->up.virt_hdr_len, dst_na->up.virt_hdr_len);
1913 			}
1914 			/* There is a virtio-net header/offloadings mismatch between
1915 			 * source and destination. The slower mismatch datapath will
1916 			 * be used to cope with all the mismatches.
1917 			 */
1918 			virt_hdr_mismatch = 1;
1919 			if (dst_na->mfs < na->mfs) {
1920 				/* We may need to do segmentation offloadings, and so
1921 				 * we may need a number of destination slots greater
1922 				 * than the number of input slots ('needed').
1923 				 * We look for the smallest integer 'x' which satisfies:
1924 				 *	needed * na->mfs + x * H <= x * dst_na->mfs
1925 				 * where 'H' is the length of the longest header that may
1926 				 * be replicated in the segmentation process (e.g. for
1927 				 * TCPv4 we must account for ethernet header, IP header
1928 				 * and TCPv4 header).
1929 				 */
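				/* Rearranging the inequality above gives
				 *	x >= needed * na->mfs / (dst_na->mfs - H),
				 * which is what the expression below computes (the
				 * final +1 covers integer truncation). For instance,
				 * with hypothetical values needed = 8, na->mfs = 9000,
				 * dst_na->mfs = 1514 and H = 66, this yields
				 *	72000 / 1448 + 1 = 50 destination slots.
				 */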
1930 				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
1931 				needed = (needed * na->mfs) /
1932 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1933 				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1934 			}
1935 		}
1936 
1937 		ND(5, "pass 2 dst %d is %x",
1938 			i, d_i);
1939 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1940 		nrings = dst_na->up.num_rx_rings;
1941 		if (dst_nr >= nrings)
1942 			dst_nr = dst_nr % nrings;
1943 		kring = &dst_na->up.rx_rings[dst_nr];
1944 		ring = kring->ring;
1945 		/* the destination ring may not have been opened for RX */
1946 		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
1947 			goto cleanup;
1948 		lim = kring->nkr_num_slots - 1;
1949 
1950 retry:
1951 
1952 		if (dst_na->retry && retry) {
1953 			/* try to get some free slot from the previous run */
1954 			kring->nm_notify(kring, 0);
1955 			/* actually useful only for bwraps, since there
1956 			 * the notify will trigger a txsync on the hwna. VALE ports
1957 			 * have dst_na->retry == 0
1958 			 */
1959 		}
1960 		/* reserve the buffers in the queue and an entry
1961 		 * to report completion, and drop lock.
1962 		 * XXX this might become a helper function.
1963 		 */
1964 		mtx_lock(&kring->q_lock);
1965 		if (kring->nkr_stopped) {
1966 			mtx_unlock(&kring->q_lock);
1967 			goto cleanup;
1968 		}
1969 		my_start = j = kring->nkr_hwlease;
1970 		howmany = nm_kr_space(kring, 1);
1971 		if (needed < howmany)
1972 			howmany = needed;
1973 		lease_idx = nm_kr_lease(kring, howmany, 1);
1974 		mtx_unlock(&kring->q_lock);
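		/* One possible shape for the helper suggested by the XXX
		 * above (a sketch only, not compiled in; the name
		 * nm_kr_reserve is made up):
		 *
		 *	static int
		 *	nm_kr_reserve(struct netmap_kring *kring, u_int *howmany,
		 *		uint32_t *start, uint32_t *lease_idx)
		 *	{
		 *		mtx_lock(&kring->q_lock);
		 *		if (kring->nkr_stopped) {
		 *			mtx_unlock(&kring->q_lock);
		 *			return EBUSY;
		 *		}
		 *		*start = kring->nkr_hwlease;
		 *		if (*howmany > nm_kr_space(kring, 1))
		 *			*howmany = nm_kr_space(kring, 1);
		 *		*lease_idx = nm_kr_lease(kring, *howmany, 1);
		 *		mtx_unlock(&kring->q_lock);
		 *		return 0;
		 *	}
		 */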
1975 
1976 		/* only retry if we need more than available slots */
1977 		if (retry && needed <= howmany)
1978 			retry = 0;
1979 
1980 		/* copy to the destination queue */
1981 		while (howmany > 0) {
1982 			struct netmap_slot *slot;
1983 			struct nm_bdg_fwd *ft_p, *ft_end;
1984 			u_int cnt;
1985 
1986 			/* find the queue from which we pick next packet.
1987 			 * NM_FT_NULL is always higher than valid indexes
1988 			 * so we never dereference it if the other list
1989 			 * has packets (and if both are empty we never
1990 			 * get here).
1991 			 */
1992 			if (next < brd_next) {
1993 				ft_p = ft + next;
1994 				next = ft_p->ft_next;
1995 			} else { /* insert broadcast */
1996 				ft_p = ft + brd_next;
1997 				brd_next = ft_p->ft_next;
1998 			}
1999 			cnt = ft_p->ft_frags; // cnt > 0
2000 			if (unlikely(cnt > howmany))
2001 			    break; /* no more space */
2002 			if (netmap_verbose && cnt > 1)
2003 				RD(5, "rx %d frags to %d", cnt, j);
2004 			ft_end = ft_p + cnt;
2005 			if (unlikely(virt_hdr_mismatch)) {
2006 				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
2007 			} else {
2008 				howmany -= cnt;
2009 				do {
2010 					char *dst, *src = ft_p->ft_buf;
2011 					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
2012 
2013 					slot = &ring->slot[j];
2014 					dst = NMB(&dst_na->up, slot);
2015 
2016 					ND("send [%d] %d(%d) bytes at %s:%d",
2017 							i, (int)copy_len, (int)dst_len,
2018 							dst_na->up.name, j);
2019 					/* round to a multiple of 64 */
2020 					copy_len = (copy_len + 63) & ~63;
2021 
2022 					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
2023 						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
2024 						RD(5, "invalid len %d, down to 64", (int)copy_len);
2025 						copy_len = dst_len = 64; // XXX
2026 					}
2027 					if (ft_p->ft_flags & NS_INDIRECT) {
2028 						if (copyin(src, dst, copy_len)) {
2029 							// invalid user pointer, pretend len is 0
2030 							dst_len = 0;
2031 						}
2032 					} else {
2033 						//memcpy(dst, src, copy_len);
2034 						pkt_copy(src, dst, (int)copy_len);
2035 					}
2036 					slot->len = dst_len;
2037 					slot->flags = (cnt << 8) | NS_MOREFRAG;
2038 					j = nm_next(j, lim);
2039 					needed--;
2040 					ft_p++;
2041 				} while (ft_p != ft_end);
2042 				slot->flags = (cnt << 8); /* clear NS_MOREFRAG on last entry */
2043 			}
2044 			/* are we done ? */
2045 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
2046 				break;
2047 		}
2048 		{
2049 		    /* current position */
2050 		    uint32_t *p = kring->nkr_leases; /* shorthand */
2051 		    uint32_t update_pos;
2052 		    int still_locked = 1;
2053 
2054 		    mtx_lock(&kring->q_lock);
2055 		    if (unlikely(howmany > 0)) {
2056 			/* we have not used all the buffers. If we are the last
2057 			 * lease holder we can recover the slots, otherwise we
2058 			 * must fill them with 0 to mark empty packets.
2059 			 */
2060 			ND("leftover %d bufs", howmany);
2061 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
2062 			    /* yes, we are the last one */
2063 			    ND("roll back nkr_hwlease to %d", j);
2064 			    kring->nkr_hwlease = j;
2065 			} else {
2066 			    while (howmany-- > 0) {
2067 				ring->slot[j].len = 0;
2068 				ring->slot[j].flags = 0;
2069 				j = nm_next(j, lim);
2070 			    }
2071 			}
2072 		    }
2073 		    p[lease_idx] = j; /* report I am done */
2074 
2075 		    update_pos = kring->nr_hwtail;
2076 
2077 		    if (my_start == update_pos) {
2078 			/* all slots before my_start have been reported,
2079 			 * so scan subsequent leases to see if other ranges
2080 			 * have been completed, and do a selwakeup or txsync.
2081 			 */
2082 			while (lease_idx != kring->nkr_lease_idx &&
2083 				p[lease_idx] != NR_NOSLOT) {
2084 			    j = p[lease_idx];
2085 			    p[lease_idx] = NR_NOSLOT;
2086 			    lease_idx = nm_next(lease_idx, lim);
2087 			}
2088 			/* j is the new 'write' position. j != my_start
2089 			 * means there are new buffers to report
2090 			 */
2091 			if (likely(j != my_start)) {
2092 				kring->nr_hwtail = j;
2093 				still_locked = 0;
2094 				mtx_unlock(&kring->q_lock);
2095 				kring->nm_notify(kring, 0);
2096 				/* this is netmap_notify for VALE ports and
2097 				 * netmap_bwrap_notify for bwrap. The latter will
2098 				 * trigger a txsync on the underlying hwna
2099 				 */
2100 				if (dst_na->retry && retry--) {
2101 					/* XXX this is going to call nm_notify again.
2102 					 * Only useful for bwrap in virtual machines
2103 					 */
2104 					goto retry;
2105 				}
2106 			}
2107 		    }
2108 		    if (still_locked)
2109 			mtx_unlock(&kring->q_lock);
2110 		}
2111 cleanup:
2112 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
2113 		d->bq_len = 0;
2114 	}
2115 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
2116 	brddst->bq_len = 0;
2117 	return 0;
2118 }
2119 
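/* A minimal bdg_ops.lookup implementation, illustrating the contract
 * assumed by the first pass of nm_bdg_flush() above (a sketch only,
 * not compiled in; the name example_flood_lookup is made up).
 * Returning NM_BDG_NOPORT or higher drops the packet,
 * NM_BDG_BROADCAST floods it (ring 0 on every other port), and any
 * valid port index unicasts to that port; the lookup may also steer
 * the packet to a different ring via *dst_ring:
 *
 *	static u_int
 *	example_flood_lookup(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
 *		struct netmap_vp_adapter *vpna)
 *	{
 *		(void)ft; (void)dst_ring; (void)vpna;
 *		return NM_BDG_BROADCAST;
 *	}
 */
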
2120 /* nm_txsync callback for VALE ports */
2121 static int
2122 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2123 {
2124 	struct netmap_vp_adapter *na =
2125 		(struct netmap_vp_adapter *)kring->na;
2126 	u_int done;
2127 	u_int const lim = kring->nkr_num_slots - 1;
2128 	u_int const head = kring->rhead;
2129 
2130 	if (bridge_batch <= 0) { /* testing only */
2131 		done = head; // used all
2132 		goto done;
2133 	}
2134 	if (!na->na_bdg) {
2135 		done = head;
2136 		goto done;
2137 	}
2138 	if (bridge_batch > NM_BDG_BATCH)
2139 		bridge_batch = NM_BDG_BATCH;
2140 
2141 	done = nm_bdg_preflush(kring, head);
2142 done:
2143 	if (done != head)
2144 		D("early break at %d / %d, tail %d", done, head, kring->nr_hwtail);
2145 	/*
2146 	 * packets between 'done' and 'head' are left unsent.
2147 	 */
2148 	kring->nr_hwcur = done;
2149 	kring->nr_hwtail = nm_prev(done, lim);
2150 	if (netmap_verbose)
2151 		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2152 	return 0;
2153 }
2154 
2155 
2156 /* rxsync code used by the VALE port nm_rxsync callback and also
2157  * internally by the bwrap
2158  */
2159 static int
2160 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2161 {
2162 	struct netmap_adapter *na = kring->na;
2163 	struct netmap_ring *ring = kring->ring;
2164 	u_int nm_i, lim = kring->nkr_num_slots - 1;
2165 	u_int head = kring->rhead;
2166 	int n;
2167 
2168 	if (head > lim) {
2169 		D("ouch dangerous reset!!!");
2170 		n = netmap_ring_reinit(kring);
2171 		goto done;
2172 	}
2173 
2174 	/* First part, import newly received packets. */
2175 	/* actually nothing to do here, they are already in the kring */
2176 
2177 	/* Second part, skip past packets that userspace has released. */
2178 	nm_i = kring->nr_hwcur;
2179 	if (nm_i != head) {
2180 		/* consistency check, but nothing really important here */
2181 		for (n = 0; likely(nm_i != head); n++) {
2182 			struct netmap_slot *slot = &ring->slot[nm_i];
2183 			void *addr = NMB(na, slot);
2184 
2185 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2186 				D("bad buffer index %d, ignore?",
2187 					slot->buf_idx);
2188 			}
2189 			slot->flags &= ~NS_BUF_CHANGED;
2190 			nm_i = nm_next(nm_i, lim);
2191 		}
2192 		kring->nr_hwcur = head;
2193 	}
2194 
2195 	n = 0;
2196 done:
2197 	return n;
2198 }
2199 
2200 /*
2201  * nm_rxsync callback for VALE ports
2202  * user process reading from a VALE switch.
2203  * Already protected against concurrent calls from userspace,
2204  * but we must acquire the queue's lock to protect against
2205  * writers on the same queue.
2206  */
2207 static int
2208 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
2209 {
2210 	int n;
2211 
2212 	mtx_lock(&kring->q_lock);
2213 	n = netmap_vp_rxsync_locked(kring, flags);
2214 	mtx_unlock(&kring->q_lock);
2215 	return n;
2216 }
2217 
2218 
2219 /* nm_bdg_attach callback for VALE ports
2220  * The na_vp port is this same netmap_adapter. There is no host port.
2221  */
2222 static int
2223 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2224 {
2225 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
2226 
2227 	if (vpna->na_bdg)
2228 		return netmap_bwrap_attach(name, na);
2229 	na->na_vp = vpna;
2230 	strncpy(na->name, name, sizeof(na->name));
2231 	na->na_hostvp = NULL;
2232 	return 0;
2233 }
2234 
2235 /* create a netmap_vp_adapter that describes a VALE port.
2236  * Only persistent VALE ports have a non-null ifp.
2237  */
2238 static int
2239 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp,
2240 		struct netmap_mem_d *nmd,
2241 		struct netmap_vp_adapter **ret)
2242 {
2243 	struct netmap_vp_adapter *vpna;
2244 	struct netmap_adapter *na;
2245 	int error = 0;
2246 	u_int npipes = 0;
2247 
2248 	vpna = nm_os_malloc(sizeof(*vpna));
2249 	if (vpna == NULL)
2250 		return ENOMEM;
2251 
2252  	na = &vpna->up;
2253 
2254 	na->ifp = ifp;
2255 	strncpy(na->name, nmr->nr_name, sizeof(na->name));
2256 
2257 	/* bound checking */
2258 	na->num_tx_rings = nmr->nr_tx_rings;
2259 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2260 	nmr->nr_tx_rings = na->num_tx_rings; // write back
2261 	na->num_rx_rings = nmr->nr_rx_rings;
2262 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2263 	nmr->nr_rx_rings = na->num_rx_rings; // write back
2264 	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2265 			1, NM_BDG_MAXSLOTS, NULL);
2266 	na->num_tx_desc = nmr->nr_tx_slots;
2267 	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2268 			1, NM_BDG_MAXSLOTS, NULL);
2269 	/* validate number of pipes. We want at least 1,
2270 	 * but can probably do with some more.
2271 	 * So let's use 2 as the default (when 0 is supplied).
2272 	 */
2273 	npipes = nmr->nr_arg1;
2274 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2275 	nmr->nr_arg1 = npipes;	/* write back */
2276 	/* validate extra bufs */
2277 	nm_bound_var(&nmr->nr_arg3, 0, 0,
2278 			128*NM_BDG_MAXSLOTS, NULL);
2279 	na->num_rx_desc = nmr->nr_rx_slots;
2280 	/* Set the mfs to a default value, as it is needed on the VALE
2281 	 * mismatch datapath. XXX We should set it according to the MTU
2282 	 * known to the kernel. */
2283 	vpna->mfs = NM_BDG_MFS_DEFAULT;
2284 	vpna->last_smac = ~0llu;
2285 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
2286 		vpna->mfs = netmap_buf_size; */
2287 	if (netmap_verbose)
2288 		D("max frame size %u", vpna->mfs);
2289 
2290 	na->na_flags |= NAF_BDG_MAYSLEEP;
2291 	/* persistent VALE ports look like hw devices
2292 	 * with a native netmap adapter
2293 	 */
2294 	if (ifp)
2295 		na->na_flags |= NAF_NATIVE;
2296 	na->nm_txsync = netmap_vp_txsync;
2297 	na->nm_rxsync = netmap_vp_rxsync;
2298 	na->nm_register = netmap_vp_reg;
2299 	na->nm_krings_create = netmap_vp_krings_create;
2300 	na->nm_krings_delete = netmap_vp_krings_delete;
2301 	na->nm_dtor = netmap_vp_dtor;
2302 	D("nr_arg2 %d", nmr->nr_arg2);
2303 	na->nm_mem = nmd ?
2304 		netmap_mem_get(nmd):
2305 		netmap_mem_private_new(
2306 			na->num_tx_rings, na->num_tx_desc,
2307 			na->num_rx_rings, na->num_rx_desc,
2308 			nmr->nr_arg3, npipes, &error);
2309 	if (na->nm_mem == NULL)
2310 		goto err;
2311 	na->nm_bdg_attach = netmap_vp_bdg_attach;
2312 	/* other nmd fields are set in the common routine */
2313 	error = netmap_attach_common(na);
2314 	if (error)
2315 		goto err;
2316 	*ret = vpna;
2317 	return 0;
2318 
2319 err:
2320 	if (na->nm_mem != NULL)
2321 		netmap_mem_put(na->nm_mem);
2322 	nm_os_free(vpna);
2323 	return error;
2324 }
2325 
2326 /* Bridge wrapper code (bwrap).
2327  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2328  * VALE switch.
2329  * The main task is to swap the meaning of tx and rx rings to match the
2330  * expectations of the VALE switch code (see nm_bdg_flush).
2331  *
2332  * The bwrap works by interposing a netmap_bwrap_adapter between the
2333  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2334  * a netmap_vp_adapter to the rest of the system, but, internally, it
2335  * translates all callbacks to what the hwna expects.
2336  *
2337  * Note that we have to intercept callbacks coming from two sides:
2338  *
2339  *  - callbacks coming from the netmap module are intercepted by
2340  *    passing around the netmap_bwrap_adapter instead of the hwna
2341  *
2342  *  - callbacks coming from outside of the netmap module only know
2343  *    about the hwna. This, however, only happens in interrupt
2344  *    handlers, where only the hwna->nm_notify callback is called.
2345  *    What the bwrap does is to overwrite the hwna->nm_notify callback
2346  *    with its own netmap_bwrap_intr_notify.
2347  *    XXX This assumes that the hwna->nm_notify callback was the
2348  *    standard netmap_notify(), as is the case for NIC adapters.
2349  *    Any additional action performed by hwna->nm_notify will not be
2350  *    performed by netmap_bwrap_intr_notify.
2351  *
2352  * Additionally, the bwrap can optionally attach the host rings pair
2353  * of the wrapped adapter to a different port of the switch.
2354  */
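
/* A rough picture of the interposition described above (a sketch):
 *
 *	netmap module                        interrupt handlers
 *	      |                                     |
 *	      v                                     v
 *	netmap_bwrap_adapter  <------------  hwna->nm_notify
 *	(seen as a netmap_vp_adapter;        (overwritten with
 *	 tx/rx meanings swapped)             netmap_bwrap_intr_notify)
 *	      |
 *	      v
 *	   hwna (the wrapped hardware adapter)
 */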
2355 
2356 
2357 static void
2358 netmap_bwrap_dtor(struct netmap_adapter *na)
2359 {
2360 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2361 	struct netmap_adapter *hwna = bna->hwna;
2362 	struct nm_bridge *b = bna->up.na_bdg,
2363 		*bh = bna->host.na_bdg;
2364 
2365 	if (bna->host.up.nm_mem)
2366 		netmap_mem_put(bna->host.up.nm_mem);
2367 
2368 	if (b) {
2369 		netmap_bdg_detach_common(b, bna->up.bdg_port,
2370 			    (bh ? bna->host.bdg_port : -1));
2371 	}
2372 
2373 	ND("na %p", na);
2374 	na->ifp = NULL;
2375 	bna->host.up.ifp = NULL;
2376 	hwna->na_private = NULL;
2377 	hwna->na_vp = hwna->na_hostvp = NULL;
2378 	hwna->na_flags &= ~NAF_BUSY;
2379 	netmap_adapter_put(hwna);
2380 
2381 }
2382 
2383 
2384 /*
2385  * Intr callback for NICs connected to a bridge.
2386  * Simply ignore tx interrupts (maybe we could try to recover space?)
2387  * and pass received packets from nic to the bridge.
2388  *
2389  * XXX TODO check locking: this is called from the interrupt
2390  * handler so we should make sure that the interface is not
2391  * disconnected while passing down an interrupt.
2392  *
2393  * Note, no user process can access this NIC or the host stack.
2394  * The only significant parts of the ring are the slots,
2395  * and head/cur/tail are set from the kring as needed
2396  * (part as a receive ring, part as a transmit ring).
2397  *
2398  * callback that overwrites the hwna notify callback.
2399  * Packets come from the outside or from the host stack and are put on an
2400  * hwna rx ring.
2401  * The bridge wrapper then sends the packets through the bridge.
2402  */
2403 static int
2404 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2405 {
2406 	struct netmap_adapter *na = kring->na;
2407 	struct netmap_bwrap_adapter *bna = na->na_private;
2408 	struct netmap_kring *bkring;
2409 	struct netmap_vp_adapter *vpna = &bna->up;
2410 	u_int ring_nr = kring->ring_id;
2411 	int ret = NM_IRQ_COMPLETED;
2412 	int error;
2413 
2414 	if (netmap_verbose)
2415 	    D("%s %s 0x%x", na->name, kring->name, flags);
2416 
2417 	bkring = &vpna->up.tx_rings[ring_nr];
2418 
2419 	/* make sure the ring is not disabled */
2420 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2421 		return EIO;
2422 	}
2423 
2424 	if (netmap_verbose)
2425 	    D("%s head %d cur %d tail %d",  na->name,
2426 		kring->rhead, kring->rcur, kring->rtail);
2427 
2428 	/* simulate a user wakeup on the rx ring
2429 	 * to fetch packets that have arrived.
2430 	 */
2431 	error = kring->nm_sync(kring, 0);
2432 	if (error)
2433 		goto put_out;
2434 	if (kring->nr_hwcur == kring->nr_hwtail) {
2435 		if (netmap_verbose)
2436 			D("how strange, interrupt with no packets on %s",
2437 			    na->name);
2438 		goto put_out;
2439 	}
2440 
2441 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2442 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2443 	 * to push all packets out.
2444 	 */
2445 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
2446 
2447 	netmap_vp_txsync(bkring, flags);
2448 
2449 	/* mark all buffers as released on this ring */
2450 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2451 	/* another call to actually release the buffers */
2452 	error = kring->nm_sync(kring, 0);
2453 
2454 	/* The second rxsync may have further advanced hwtail. If this happens,
2455 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2456 	if (kring->rcur != kring->nr_hwtail) {
2457 		ret = NM_IRQ_RESCHED;
2458 	}
2459 put_out:
2460 	nm_kr_put(kring);
2461 
2462 	return error ? error : ret;
2463 }
2464 
2465 
2466 /* nm_register callback for bwrap */
2467 static int
2468 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2469 {
2470 	struct netmap_bwrap_adapter *bna =
2471 		(struct netmap_bwrap_adapter *)na;
2472 	struct netmap_adapter *hwna = bna->hwna;
2473 	struct netmap_vp_adapter *hostna = &bna->host;
2474 	int error, i;
2475 	enum txrx t;
2476 
2477 	ND("%s %s", na->name, onoff ? "on" : "off");
2478 
2479 	if (onoff) {
2480 		/* netmap_do_regif has been called on the bwrap na.
2481 		 * We need to pass the information about the
2482 		 * memory allocator down to the hwna before
2483 		 * putting it in netmap mode
2484 		 */
2485 		hwna->na_lut = na->na_lut;
2486 
2487 		if (hostna->na_bdg) {
2488 			/* if the host rings have been attached to the switch,
2489 			 * we need to copy the memory allocator information
2490 			 * into the hostna as well
2491 			 */
2492 			hostna->up.na_lut = na->na_lut;
2493 		}
2494 
2495 	}
2496 
2497 	/* pass down the pending ring state information */
2498 	for_rx_tx(t) {
2499 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2500 			NMR(hwna, t)[i].nr_pending_mode =
2501 				NMR(na, t)[i].nr_pending_mode;
2502 	}
2503 
2504 	/* forward the request to the hwna */
2505 	error = hwna->nm_register(hwna, onoff);
2506 	if (error)
2507 		return error;
2508 
2509 	/* copy up the current ring state information */
2510 	for_rx_tx(t) {
2511 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
2512 			struct netmap_kring *kring = &NMR(hwna, t)[i];
2513 			NMR(na, t)[i].nr_mode = kring->nr_mode;
2514 		}
2515 	}
2516 
2517 	/* impersonate a netmap_vp_adapter */
2518 	netmap_vp_reg(na, onoff);
2519 	if (hostna->na_bdg)
2520 		netmap_vp_reg(&hostna->up, onoff);
2521 
2522 	if (onoff) {
2523 		u_int i;
2524 		/* intercept the hwna nm_notify callback on the hw rings */
2525 		for (i = 0; i < hwna->num_rx_rings; i++) {
2526 			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2527 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2528 		}
2529 		i = hwna->num_rx_rings; /* for safety */
2530 		/* save the host ring notify unconditionally */
2531 		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2532 		if (hostna->na_bdg) {
2533 			/* also intercept the host ring notify */
2534 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2535 		}
2536 		if (na->active_fds == 0)
2537 			na->na_flags |= NAF_NETMAP_ON;
2538 	} else {
2539 		u_int i;
2540 
2541 		if (na->active_fds == 0)
2542 			na->na_flags &= ~NAF_NETMAP_ON;
2543 
2544 		/* reset all notify callbacks (including host ring) */
2545 		for (i = 0; i <= hwna->num_rx_rings; i++) {
2546 			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2547 			hwna->rx_rings[i].save_notify = NULL;
2548 		}
2549 		hwna->na_lut.lut = NULL;
2550 		hwna->na_lut.objtotal = 0;
2551 		hwna->na_lut.objsize = 0;
2552 
2553 		/* pass ownership of the netmap rings to the hwna */
2554 		for_rx_tx(t) {
2555 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
2556 				NMR(na, t)[i].ring = NULL;
2557 			}
2558 		}
2559 
2560 	}
2561 
2562 	return 0;
2563 }
2564 
2565 /* nm_config callback for bwrap */
2566 static int
2567 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2568 				    u_int *rxr, u_int *rxd)
2569 {
2570 	struct netmap_bwrap_adapter *bna =
2571 		(struct netmap_bwrap_adapter *)na;
2572 	struct netmap_adapter *hwna = bna->hwna;
2573 
2574 	/* forward the request */
2575 	netmap_update_config(hwna);
2576 	/* swap the results */
2577 	*txr = hwna->num_rx_rings;
2578 	*txd = hwna->num_rx_desc;
2579 	*rxr = hwna->num_tx_rings;
2580 	*rxd = hwna->num_tx_desc;
2581 
2582 	return 0;
2583 }
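
/* For example, a hwna with 2 tx and 4 rx rings shows up, through the
 * bwrap, as an adapter with 4 tx and 2 rx rings, matching the swapped
 * cross-linking done in netmap_bwrap_krings_create() below.
 */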
2584 
2585 
2586 /* nm_krings_create callback for bwrap */
2587 static int
2588 netmap_bwrap_krings_create(struct netmap_adapter *na)
2589 {
2590 	struct netmap_bwrap_adapter *bna =
2591 		(struct netmap_bwrap_adapter *)na;
2592 	struct netmap_adapter *hwna = bna->hwna;
2593 	struct netmap_adapter *hostna = &bna->host.up;
2594 	int i, error = 0;
2595 	enum txrx t;
2596 
2597 	ND("%s", na->name);
2598 
2599 	/* impersonate a netmap_vp_adapter */
2600 	error = netmap_vp_krings_create(na);
2601 	if (error)
2602 		return error;
2603 
2604 	/* also create the hwna krings */
2605 	error = hwna->nm_krings_create(hwna);
2606 	if (error) {
2607 		goto err_del_vp_rings;
2608 	}
2609 
2610 	/* increment the usage counter for all the hwna krings */
2611 	for_rx_tx(t) {
2612 		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
2613 			NMR(hwna, t)[i].users++;
2614 		}
2615 	}
2616 
2617 	/* now create the actual rings */
2618 	error = netmap_mem_rings_create(hwna);
2619 	if (error) {
2620 		goto err_dec_users;
2621 	}
2622 
2623 	/* cross-link the netmap rings
2624 	 * The original number of rings comes from hwna;
2625 	 * rx rings on one side correspond to tx rings on the other.
2626 	 */
2627 	for_rx_tx(t) {
2628 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2629 		for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2630 			NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
2631 			NMR(na, t)[i].ring = NMR(hwna, r)[i].ring;
2632 		}
2633 	}
2634 
2635 	if (na->na_flags & NAF_HOST_RINGS) {
2636 		/* the hostna rings are the host rings of the bwrap.
2637 		 * The corresponding krings must point back to the
2638 		 * hostna
2639 		 */
2640 		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
2641 		hostna->tx_rings[0].na = hostna;
2642 		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
2643 		hostna->rx_rings[0].na = hostna;
2644 	}
2645 
2646 	return 0;
2647 
2648 err_dec_users:
2649 	for_rx_tx(t)
2650 		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++)
2651 			NMR(hwna, t)[i].users--;
2652 	hwna->nm_krings_delete(hwna);
2653 err_del_vp_rings:
2654 	netmap_vp_krings_delete(na);
2655 
2656 	return error;
2657 }
2658 
2659 
2660 static void
2661 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2662 {
2663 	struct netmap_bwrap_adapter *bna =
2664 		(struct netmap_bwrap_adapter *)na;
2665 	struct netmap_adapter *hwna = bna->hwna;
2666 	enum txrx t;
2667 	int i;
2668 
2669 	ND("%s", na->name);
2670 
2671 	/* decrement the usage counter for all the hwna krings */
2672 	for_rx_tx(t) {
2673 		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
2674 			NMR(hwna, t)[i].users--;
2675 		}
2676 	}
2677 
2678 	/* delete any netmap rings that are no longer needed */
2679 	netmap_mem_rings_delete(hwna);
2680 	hwna->nm_krings_delete(hwna);
2681 	netmap_vp_krings_delete(na);
2682 }
2683 
2684 
2685 /* notify method for the bridge-->hwna direction */
2686 static int
2687 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2688 {
2689 	struct netmap_adapter *na = kring->na;
2690 	struct netmap_bwrap_adapter *bna = na->na_private;
2691 	struct netmap_adapter *hwna = bna->hwna;
2692 	u_int ring_n = kring->ring_id;
2693 	u_int lim = kring->nkr_num_slots - 1;
2694 	struct netmap_kring *hw_kring;
2695 	int error;
2696 
2697 	ND("%s: na %s hwna %s",
2698 			(kring ? kring->name : "NULL!"),
2699 			(na ? na->name : "NULL!"),
2700 			(hwna ? hwna->name : "NULL!"));
2701 	hw_kring = &hwna->tx_rings[ring_n];
2702 
2703 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
2704 		return ENXIO;
2705 	}
2706 
2707 	/* first step: simulate a user wakeup on the rx ring */
2708 	netmap_vp_rxsync(kring, flags);
2709 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2710 		na->name, ring_n,
2711 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2712 		kring->ring->head, kring->ring->cur, kring->ring->tail,
2713 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2714 	/* second step: the new packets are sent on the tx ring
2715 	 * (which is actually the same ring)
2716 	 */
2717 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2718 	error = hw_kring->nm_sync(hw_kring, flags);
2719 	if (error)
2720 		goto put_out;
2721 
2722 	/* third step: now we are back on the rx ring */
2723 	/* claim ownership on all hw owned bufs */
2724 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2725 
2726 	/* fourth step: the user goes to sleep again, causing another rxsync */
2727 	netmap_vp_rxsync(kring, flags);
2728 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2729 		na->name, ring_n,
2730 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2731 		kring->ring->head, kring->ring->cur, kring->ring->tail,
2732 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2733 put_out:
2734 	nm_kr_put(hw_kring);
2735 
2736 	return error ? error : NM_IRQ_COMPLETED;
2737 }
2738 
2739 
2740 /* nm_bdg_ctl callback for the bwrap.
2741  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2742  * On attach, it needs to provide a fake netmap_priv_d structure and
2743  * perform a netmap_do_regif() on the bwrap. This will put both the
2744  * bwrap and the hwna in netmap mode, with the netmap rings shared
2745  * and cross linked. Moreover, it will start intercepting interrupts
2746  * directed to hwna.
2747  */
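/* For example (assuming the stock vale-ctl tool from tools/tools/netmap):
 *
 *	vale-ctl -a vale0:em0	# attach: takes the attach branch below
 *	vale-ctl -d vale0:em0	# detach: takes the else branch
 */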
2748 static int
2749 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2750 {
2751 	struct netmap_priv_d *npriv;
2752 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2753 	int error = 0;
2754 
2755 	if (attach) {
2756 		if (NETMAP_OWNED_BY_ANY(na)) {
2757 			return EBUSY;
2758 		}
2759 		if (bna->na_kpriv) {
2760 			/* nothing to do */
2761 			return 0;
2762 		}
2763 		npriv = netmap_priv_new();
2764 		if (npriv == NULL)
2765 			return ENOMEM;
2766 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
2767 		error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags);
2768 		if (error) {
2769 			netmap_priv_delete(npriv);
2770 			return error;
2771 		}
2772 		bna->na_kpriv = npriv;
2773 		na->na_flags |= NAF_BUSY;
2774 	} else {
2775 		if (na->active_fds == 0) /* not registered */
2776 			return EINVAL;
2777 		netmap_priv_delete(bna->na_kpriv);
2778 		bna->na_kpriv = NULL;
2779 		na->na_flags &= ~NAF_BUSY;
2780 	}
2781 	return error;
2782 
2783 }
2784 
2785 /* attach a bridge wrapper to the 'real' device */
2786 int
2787 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2788 {
2789 	struct netmap_bwrap_adapter *bna;
2790 	struct netmap_adapter *na = NULL;
2791 	struct netmap_adapter *hostna = NULL;
2792 	int error = 0;
2793 	enum txrx t;
2794 
2795 	/* make sure the NIC is not already in use */
2796 	if (NETMAP_OWNED_BY_ANY(hwna)) {
2797 		D("NIC %s busy, cannot attach to bridge", hwna->name);
2798 		return EBUSY;
2799 	}
2800 
2801 	bna = nm_os_malloc(sizeof(*bna));
2802 	if (bna == NULL) {
2803 		return ENOMEM;
2804 	}
2805 
2806 	na = &bna->up.up;
2807 	/* make bwrap ifp point to the real ifp */
2808 	na->ifp = hwna->ifp;
2809 	if_ref(na->ifp);
2810 	na->na_private = bna;
2811 	strncpy(na->name, nr_name, sizeof(na->name));
2812 	/* fill the ring data for the bwrap adapter with rx/tx meanings
2813 	 * swapped. The real cross-linking will be done during register,
2814 	 * when all the krings will have been created.
2815 	 */
2816 	for_rx_tx(t) {
2817 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2818 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2819 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
2820 	}
2821 	na->nm_dtor = netmap_bwrap_dtor;
2822 	na->nm_register = netmap_bwrap_reg;
2823 	// na->nm_txsync = netmap_bwrap_txsync;
2824 	// na->nm_rxsync = netmap_bwrap_rxsync;
2825 	na->nm_config = netmap_bwrap_config;
2826 	na->nm_krings_create = netmap_bwrap_krings_create;
2827 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2828 	na->nm_notify = netmap_bwrap_notify;
2829 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2830 	na->pdev = hwna->pdev;
2831 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
2832 	na->virt_hdr_len = hwna->virt_hdr_len;
2833 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2834 	/* Set the mfs, needed on the VALE mismatch datapath. */
2835 	bna->up.mfs = NM_BDG_MFS_DEFAULT;
2836 
2837 	bna->hwna = hwna;
2838 	netmap_adapter_get(hwna);
2839 	hwna->na_private = bna; /* weak reference */
2840 	hwna->na_vp = &bna->up;
2841 
2842 	if (hwna->na_flags & NAF_HOST_RINGS) {
2843 		if (hwna->na_flags & NAF_SW_ONLY)
2844 			na->na_flags |= NAF_SW_ONLY;
2845 		na->na_flags |= NAF_HOST_RINGS;
2846 		hostna = &bna->host.up;
2847 		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2848 		hostna->ifp = hwna->ifp;
2849 		for_rx_tx(t) {
2850 			enum txrx r = nm_txrx_swap(t);
2851 			nma_set_nrings(hostna, t, 1);
2852 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2853 		}
2854 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
2855 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2856 		hostna->nm_notify = netmap_bwrap_notify;
2857 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
2858 		hostna->na_private = bna;
2859 		hostna->na_vp = &bna->up;
2860 		na->na_hostvp = hwna->na_hostvp =
2861 			hostna->na_hostvp = &bna->host;
2862 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
2863 		bna->host.mfs = NM_BDG_MFS_DEFAULT;
2864 	}
2865 
2866 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2867 		na->name, na->ifp->if_xname,
2868 		na->num_tx_rings, na->num_tx_desc,
2869 		na->num_rx_rings, na->num_rx_desc);
2870 
2871 	error = netmap_attach_common(na);
2872 	if (error) {
2873 		goto err_free;
2874 	}
2875 	hwna->na_flags |= NAF_BUSY;
2876 	return 0;
2877 
2878 err_free:
2879 	hwna->na_vp = hwna->na_hostvp = NULL;
2880 	netmap_adapter_put(hwna);
2881 	nm_os_free(bna);
2882 	return error;
2883 
2884 }
2885 
2886 struct nm_bridge *
2887 netmap_init_bridges2(u_int n)
2888 {
2889 	int i;
2890 	struct nm_bridge *b;
2891 
2892 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
2893 	if (b == NULL)
2894 		return NULL;
2895 	for (i = 0; i < n; i++)
2896 		BDG_RWINIT(&b[i]);
2897 	return b;
2898 }
2899 
2900 void
2901 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2902 {
2903 	int i;
2904 
2905 	if (b == NULL)
2906 		return;
2907 
2908 	for (i = 0; i < n; i++)
2909 		BDG_RWDESTROY(&b[i]);
2910 	nm_os_free(b);
2911 }
2912 
2913 int
2914 netmap_init_bridges(void)
2915 {
2916 #ifdef CONFIG_NET_NS
2917 	return netmap_bns_register();
2918 #else
2919 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2920 	if (nm_bridges == NULL)
2921 		return ENOMEM;
2922 	return 0;
2923 #endif
2924 }
2925 
2926 void
2927 netmap_uninit_bridges(void)
2928 {
2929 #ifdef CONFIG_NET_NS
2930 	netmap_bns_unregister();
2931 #else
2932 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2933 #endif
2934 }
2935 #endif /* WITH_VALE */
2936