/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RW lock on Linux) protects
deletion of ports. When a port is configured or deleted, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */

/*
 * OS-specific code that is used only within this file.
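 * (for instance, the BDG_RWLOCK_* lock wrappers defined below).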
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>	/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME		prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES		max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of core locks.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define NM_BRIDGES		8	/* number of bridges */


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
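 * A usage sketch (the value is illustrative, not a recommendation):
 * on FreeBSD the knob can be changed at runtime with
 *
 *	sysctl dev.netmap.bridge_batch=512
 *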
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");


static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non-blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
	 * forward this packet. ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];

#ifdef CONFIG_NET_NS
	struct net *ns;
#endif /* CONFIG_NET_NS */
};

const char *
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically.
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */


/*
 * this is a slightly optimized copy routine which rounds
 * to multiples of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non-overlapping buffers.
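 * For example, a 60-byte frame is copied as a single 64-byte chunk,
 * so the caller must guarantee room for the rounded-up length in both
 * buffers (nm_bdg_flush() below caps the rounded copy_len at the
 * netmap buffer size).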
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL, *bridges;
	u_int num_bridges;

	NMG_LOCK_ASSERT();

	netmap_bns_getbridges(&bridges, &num_bridges);

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;	/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < num_bridges; i++) {
		struct nm_bridge *x = bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == num_bridges && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops.lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		NM_BNS_GET(b);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
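 * Each tx kring gets one contiguous allocation, laid out as follows
 * (a sketch of the size computation done below):
 *
 *	struct nm_bdg_fwd ft[NM_BDG_BATCH_MAX];    work area
 *	struct nm_bdg_q   dstq[NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1];
 *	uint16_t          dsts[NM_BDG_BATCH_MAX];  destination indexes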
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	*/

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
		NM_BNS_PUT(b);
	}
}

/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	if (attach)
		return 0; /* nothing to do */
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took a reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}
}

/* nm_dtor callback for persistent VALE ports */
static void
netmap_persist_vp_dtor(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	netmap_vp_dtor(na);
	na->ifp = NULL;
	nm_vi_detach(ifp);
}

/* remove a persistent VALE port from the system */
static int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	if (NA(ifp)->na_refcount > 1) {
		error = EBUSY;
		goto err;
	}
	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires that all references are released
	 * before unregistering
	 */
	if_rele(ifp);
	netmap_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
static int
nm_vi_create(struct nmreq *nmr)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	/* don't include VALE prefix */
	if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
		return EINVAL;
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp) { /* already exists, cannot create a new one */
		if_rele(ifp);
		return EEXIST;
	}
	error = nm_vi_persist(nmr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(nmr, ifp, &vpna);
	if (error) {
		D("error %d", error);
		nm_vi_detach(ifp);
		NMG_UNLOCK();	/* do not leak the global lock on failure */
		return error;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	vpna->up.nm_dtor = netmap_persist_vp_dtor;
	netmap_adapter_get(&vpna->up);
	NMG_UNLOCK();
	D("created %s", ifp->if_xname);
	return 0;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non-NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	char *nr_name = nmr->nr_name;
	const char *ifname;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;	/* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0; /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ND("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s", nr_name);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ifname = nr_name + b->bdg_namelen + 1;
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = netmap_vp_create(nmr, NULL, &vpna);
		if (error) {
			D("error %d", error);
			return error;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */
	} else {
		struct netmap_adapter *hw;

		error = netmap_get_hw_na(ifp, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if_rele(ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			hostna = NULL;
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);
	return 0;

out:
	if_rele(ifp);

	return error;
}


/* Process NETMAP_BDG_ATTACH */
static int
nm_bdg_ctl_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(na, nmr, 1);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* process NETMAP_BDG_DETACH */
static int
nm_bdg_ctl_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->nm_bdg_ctl) {
		/* remove the port from the bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(na, nmr, 0);
	}

	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}


/* Called either from user context (netmap_ioctl())
 * or by external kernel modules (e.g., Open vSwitch).
 * Operation is indicated in nmr->nr_cmd.
 * NETMAP_BDG_REGOPS, which sets the configure/lookup/dtor functions of
 * the bridge, requires the bdg_ops argument; the other commands ignore it.
 *
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
	struct nm_bridge *b, *bridges;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	switch (cmd) {
	case NETMAP_BDG_NEWIF:
		error = nm_vi_create(nmr);
		break;

	case NETMAP_BDG_DELIF:
		error = nm_vi_destroy(nmr->nr_name);
		break;

	case NETMAP_BDG_ATTACH:
		error = nm_bdg_ctl_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_ctl_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(vpna->up.name, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
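			 * A userspace enumeration loop (a hedged sketch;
			 * see the vale-ctl tool for the canonical usage):
			 *
			 *	nmr.nr_cmd = NETMAP_BDG_LIST;
			 *	nmr.nr_arg1 = nmr.nr_arg2 = 0;
			 *	while (ioctl(fd, NIOCGINFO, &nmr) == 0) {
			 *		// use nr_arg1 (bridge), nr_arg2 (port), nr_name
			 *		nmr.nr_arg2++;	// ask for the next port
			 *		nmr.nr_name[0] = '\0';
			 *	}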
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
		/* register callbacks to the given bridge.
		 * nmr->nr_name may be just the bridge's name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!bdg_ops) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->bdg_ops = *bdg_ops;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		 * 10 and 12. */
		if (nmr->nr_arg1 != 0 &&
		    nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
		    nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			vpna->virt_hdr_len = nmr->nr_arg1;
			if (vpna->virt_hdr_len)
				vpna->mfs = NETMAP_BUF_SIZE(na);
			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}

int
netmap_bdg_config(struct nmreq *nmr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nmr->nr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated with the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		/* ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG */
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
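 * Here it hashes the 6-byte MAC addresses into the NM_BDG_HASH-entry
 * forwarding table (see nm_bridge_rthash() and BRIDGE_RTHASH_MASK below).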
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)							\
do {									\
	a -= b; a -= c; a ^= (c >> 13);					\
	b -= c; b -= a; b ^= (a << 8);					\
	c -= a; c -= b; c ^= (b >> 13);					\
	a -= b; a -= c; a ^= (c >> 12);					\
	b -= c; b -= a; b ^= (a << 16);					\
	c -= a; c -= b; c ^= (b >> 5);					\
	a -= b; a -= c; a ^= (c >> 3);					\
	b -= c; b -= a; b ^= (a << 10);					\
	c -= a; c -= b; c ^= (b >> 15);					\
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/* nm_register callback for VALE ports */
static int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		na->na_flags |= NAF_NETMAP_ON;
		/* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		na->na_flags &= ~NAF_NETMAP_ON;
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address, then return
 * the destination port index, and the ring in *dst_ring
 * (at the moment, always use ring 0)
 */
u_int
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	uint8_t *buf = ft->ft_buf;
	u_int buf_len = ft->ft_len;
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	/* safety check, unfortunately we have many cases */
	if (buf_len >= 14 + na->virt_hdr_len) {
		/* virthdr + mac_hdr in the same slot */
		buf += na->virt_hdr_len;
		buf_len -= na->virt_hdr_len;
	} else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
		/* only header in first fragment */
		ft++;
		buf = ft->ft_buf;
		buf_len = ft->ft_len;
	} else {
		RD(5, "invalid buf format, length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}

/*
 *
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
	u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
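	 * For instance, a packet for ring R of port P is queued at
	 * dst_ents[P * NM_BDG_MAXRINGS + R], while all broadcast traffic
	 * shares the single queue at index NM_BDG_BROADCAST * NM_BDG_MAXRINGS.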
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not contained
		 * in the first fragment nor at the very beginning of the second.
		 */
		if (unlikely(na->virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
			!b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
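		 * In both cases any packet directed there is simply
		 * dropped (goto cleanup below).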
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed,
		 * and we will need to handle the leftover ones when we
		 * regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
			/* There is a virtio-net header/offloading mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * dst_na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			kring->nm_notify(kring, 0);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
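			 * E.g. when the unicast list is exhausted
			 * (next == NM_FT_NULL) the test below always
			 * selects the broadcast list, and vice versa.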
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
				break; /* no more space */
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					ND("send [%d] %d(%d) bytes at %s:%d",
						i, (int)copy_len, (int)dst_len,
						NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
						RD(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
			/* current position */
			uint32_t *p = kring->nkr_leases; /* shorthand */
			uint32_t update_pos;
			int still_locked = 1;

			mtx_lock(&kring->q_lock);
			if (unlikely(howmany > 0)) {
				/* we have not used all the bufs. If I am the
				 * last one I can recover the slots, otherwise
				 * I must fill them with 0 to mark empty packets.
				 */
				ND("leftover %d bufs", howmany);
				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
					/* yes i am the last one */
					ND("roll back nkr_hwlease to %d", j);
					kring->nkr_hwlease = j;
				} else {
					while (howmany-- > 0) {
						ring->slot[j].len = 0;
						ring->slot[j].flags = 0;
						j = nm_next(j, lim);
					}
				}
			}
			p[lease_idx] = j; /* report I am done */

			update_pos = kring->nr_hwtail;

			if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other ranges
				 * have been completed, and do a selwakeup or txsync.
				 */
				while (lease_idx != kring->nkr_lease_idx &&
					p[lease_idx] != NR_NOSLOT) {
					j = p[lease_idx];
					p[lease_idx] = NR_NOSLOT;
					lease_idx = nm_next(lease_idx, lim);
				}
				/* j is the new 'write' position. j != my_start
				 * means there are new buffers to report
				 */
				if (likely(j != my_start)) {
					kring->nr_hwtail = j;
					still_locked = 0;
					mtx_unlock(&kring->q_lock);
					kring->nm_notify(kring, 0);
					/* this is netmap_notify for VALE ports and
					 * netmap_bwrap_notify for bwrap. The latter will
					 * trigger a txsync on the underlying hwna
					 */
					if (dst_na->retry && retry--) {
						/* XXX this is going to call nm_notify again.
						 * Only useful for bwrap in virtual machines
						 */
						goto retry;
					}
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports */
static int
netmap_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;

	if (bridge_batch <= 0) { /* testing only */
		done = head; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = head;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(kring, head);
done:
	if (done != head)
		D("early break at %d/%d, tail %d", done, head, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	if (netmap_verbose)
		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* rxsync code used by the VALE ports' nm_rxsync callback and also
 * internally by the bwrap
 */
static int
netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = kring->rhead;
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = NMB(na, slot);

			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	n = 0;
done:
	return n;
}

/*
 * nm_rxsync callback for VALE ports
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync_locked(kring, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


/* nm_bdg_attach callback for VALE ports
 * The na_vp port is this same netmap_adapter. There is no host port.
 */
static int
netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;

	if (vpna->na_bdg)
		return EBUSY;
	na->na_vp = vpna;
	strncpy(na->name, name, sizeof(na->name));
	na->na_hostvp = NULL;
	return 0;
}

/* create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
 */
static int
netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;
	u_int npipes = 0;

	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;
	strncpy(na->name, nmr->nr_name, sizeof(na->name));

	/* bound checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	/* validate number of pipes. We want at least 1,
	 * but probably can do with some more.
	 * So let's use 2 as default (when 0 is supplied)
	 */
	npipes = nmr->nr_arg1;
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	nmr->nr_arg1 = npipes;	/* write back */
	/* validate extra bufs */
	nm_bound_var(&nmr->nr_arg3, 0, 0,
			128*NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->virt_hdr_len = 0;
	vpna->mfs = 1514;
	vpna->last_smac = ~0llu;
	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		D("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP;
	na->nm_txsync = netmap_vp_txsync;
	na->nm_rxsync = netmap_vp_rxsync;
	na->nm_register = netmap_vp_reg;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_dtor = netmap_vp_dtor;
	na->nm_mem = netmap_mem_private_new(na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			nmr->nr_arg3, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	na->nm_bdg_attach = netmap_vp_bdg_attach;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	*ret = vpna;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_delete(na->nm_mem);
	free(vpna, M_DEVBUF);
	return error;
}

/* Bridge wrapper code (bwrap).
 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
 * VALE switch.
 * The main task is to swap the meaning of tx and rx rings to match the
 * expectations of the VALE switch code (see nm_bdg_flush).
 *
 * The bwrap works by interposing a netmap_bwrap_adapter between the
 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
 * a netmap_vp_adapter to the rest of the system, but, internally, it
 * translates all callbacks to what the hwna expects.
 *
 * Note that we have to intercept callbacks coming from two sides:
 *
 *  - callbacks coming from the netmap module are intercepted by
 *    passing around the netmap_bwrap_adapter instead of the hwna
 *
 *  - callbacks coming from outside of the netmap module only know
 *    about the hwna. This, however, only happens in interrupt
 *    handlers, where only the hwna->nm_notify callback is called.
 *    What the bwrap does is to overwrite the hwna->nm_notify callback
 *    with its own netmap_bwrap_intr_notify.
 *    XXX This assumes that the hwna->nm_notify callback was the
 *    standard netmap_notify(), as it is the case for nic adapters.
 *    Any additional action performed by hwna->nm_notify will not be
 *    performed by netmap_bwrap_intr_notify.
 *
 * Additionally, the bwrap can optionally attach the host rings pair
 * of the wrapped adapter to a different port of the switch.
 */


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("na %p", na);
	/* drop reference to hwna->ifp.
	 * If we don't do this, netmap_detach_common(na)
	 * will think it has set NA(na->ifp) to NULL
	 */
	na->ifp = NULL;
	/* for safety, also drop the possible reference
	 * in the hostna
	 */
	bna->host.up.ifp = NULL;

	hwna->nm_mem = bna->save_nmd;
	hwna->na_private = NULL;
	hwna->na_vp = hwna->na_hostvp = NULL;
	hwna->na_flags &= ~NAF_BUSY;
	netmap_adapter_put(hwna);

}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space ?)
 * and pass received packets from nic to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only part of the ring that is significant are the slots,
 * and head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
 * The bridge wrapper then sends the packets through the bridge.
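 * Data path sketch for a NIC attached to a VALE switch:
 *
 *	hw intr -> netmap_bwrap_intr_notify() -> kring->nm_sync()
 *	  -> netmap_vp_txsync() on the paired bwrap tx kring
 *	  -> nm_bdg_preflush() -> nm_bdg_flush() -> destination rx rings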
/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space?)
 * and pass received packets from the nic to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only significant part of the ring is the slots;
 * head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * This is the callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on
 * an hwna rx ring. The bridge wrapper then sends the packets through
 * the bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_kring *bkring;
	struct netmap_ring *ring;
	struct netmap_vp_adapter *vpna = &bna->up;
	u_int ring_nr = kring->ring_id;
	int error = 0;

	if (netmap_verbose)
		D("%s %s 0x%x", na->name, kring->name, flags);

	if (!nm_netmap_on(na))
		return 0;

	bkring = &vpna->up.tx_rings[ring_nr];
	ring = kring->ring; /* == bkring->ring */

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring))
		return 0;

	if (netmap_verbose)
		D("%s head %d cur %d tail %d", na->name,
			kring->rhead, kring->rcur, kring->rtail);

	/* simulate a user wakeup on the rx ring, i.e.
	 * fetch the packets that have arrived.
	 */
	error = kring->nm_sync(kring, 0);
	if (error)
		goto put_out;
	if (kring->nr_hwcur == kring->nr_hwtail) {
		if (netmap_verbose)
			D("how strange, interrupt with no packets on %s",
				na->name);
		goto put_out;
	}

	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
	 * still has hwcur == bkring->rhead. So advance bkring->rhead (and
	 * rcur) to kring->nr_hwtail to push all packets out.
	 */
	bkring->rhead = bkring->rcur = kring->nr_hwtail;

	netmap_vp_txsync(bkring, flags);

	/* mark all buffers as released on this ring */
	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
	/* another call to actually release the buffers */
	error = kring->nm_sync(kring, 0);

put_out:
	nm_kr_put(kring);
	return error;
}
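/* Worked example of the pointer dance above (numbers are arbitrary):
 * assume nr_hwcur == rhead == 5 on the hwna rx kring, and an
 * interrupt whose first nm_sync() moves nr_hwtail from 5 to 9.
 *
 *   - slots 5..8 hold the newly received packets;
 *   - bkring->rhead = bkring->rcur = 9 tells netmap_vp_txsync()
 *     to forward exactly those four slots into the switch;
 *   - kring->rhead = rcur = rtail = 9 then returns the four slots
 *     to the NIC, and the second nm_sync() makes that visible to
 *     the hardware.
 */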
/* nm_register callback for bwrap */
static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error;
	enum txrx t;

	ND("%s %s", na->name, onoff ? "on" : "off");

	if (onoff) {
		int i;

		/* netmap_do_regif has been called on the bwrap na.
		 * We need to pass the information about the
		 * memory allocator down to the hwna before
		 * putting it in netmap mode
		 */
		hwna->na_lut = na->na_lut;

		if (hostna->na_bdg) {
			/* if the host rings have been attached to the switch,
			 * we need to copy the memory allocator information
			 * into the hostna as well
			 */
			hostna->up.na_lut = na->na_lut;
		}

		/* cross-link the netmap rings.
		 * The original number of rings comes from hwna,
		 * rx rings on one side equal tx rings on the other.
		 * We need to do this now, after the initialization
		 * of the kring->ring pointers
		 */
		for_rx_tx(t) {
			enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
			for (i = 0; i < nma_get_nrings(na, r) + 1; i++) {
				NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots;
				NMR(hwna, t)[i].ring = NMR(na, r)[i].ring;
			}
		}
	}

	/* forward the request to the hwna */
	error = hwna->nm_register(hwna, onoff);
	if (error)
		return error;

	/* impersonate a netmap_vp_adapter */
	netmap_vp_reg(na, onoff);
	if (hostna->na_bdg)
		netmap_vp_reg(&hostna->up, onoff);

	if (onoff) {
		u_int i;
		/* intercept the hwna nm_notify callback on the hw rings */
		for (i = 0; i < hwna->num_rx_rings; i++) {
			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
		}
		i = hwna->num_rx_rings; /* for safety */
		/* save the host ring notify unconditionally */
		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
		if (hostna->na_bdg) {
			/* also intercept the host ring notify */
			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
		}
	} else {
		u_int i;
		/* reset all notify callbacks (including the host ring) */
		for (i = 0; i <= hwna->num_rx_rings; i++) {
			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
			hwna->rx_rings[i].save_notify = NULL;
		}
		hwna->na_lut.lut = NULL;
		hwna->na_lut.objtotal = 0;
		hwna->na_lut.objsize = 0;
	}

	return 0;
}

/* nm_config callback for bwrap */
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
	u_int *rxr, u_int *rxd)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	/* forward the request */
	netmap_update_config(hwna);
	/* swap the results */
	*txr = hwna->num_rx_rings;
	*txd = hwna->num_rx_desc;
	*rxr = hwna->num_tx_rings;
	*rxd = hwna->num_tx_desc;

	return 0;
}
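/* For example (hypothetical NIC, sketch only): wrapping an adapter
 * with 4 tx and 2 rx hardware rings yields a bwrap that advertises
 * 2 tx and 4 rx rings. After the cross-linking above, bwrap tx ring i
 * is the same netmap_ring as hwna rx ring i, so frames received by
 * the NIC are already "queued for transmission" towards the switch;
 * symmetrically, frames that the switch forwards onto the bwrap rx
 * rings sit in the hwna tx rings, ready to be sent on the wire.
 */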
/* nm_krings_create callback for bwrap */
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_adapter *hostna = &bna->host.up;
	int error;

	ND("%s", na->name);

	/* impersonate a netmap_vp_adapter */
	error = netmap_vp_krings_create(na);
	if (error)
		return error;

	/* also create the hwna krings */
	error = hwna->nm_krings_create(hwna);
	if (error) {
		netmap_vp_krings_delete(na);
		return error;
	}
	/* the connection between the bwrap krings and the hwna krings
	 * will be performed later, in the nm_register callback, since
	 * now the kring->ring pointers have not been initialized yet
	 */

	if (na->na_flags & NAF_HOST_RINGS) {
		/* the hostna rings are the host rings of the bwrap.
		 * The corresponding krings must point back to the
		 * hostna
		 */
		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
		hostna->tx_rings[0].na = hostna;
		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
		hostna->rx_rings[0].na = hostna;
	}

	return 0;
}


static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("%s", na->name);

	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}


/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *hwna = bna->hwna;
	u_int ring_n = kring->ring_id;
	u_int lim = kring->nkr_num_slots - 1;
	struct netmap_kring *hw_kring;
	int error = 0;

	ND("%s: na %s hwna %s",
		(kring ? kring->name : "NULL!"),
		(na ? na->name : "NULL!"),
		(hwna ? hwna->name : "NULL!"));
	hw_kring = &hwna->tx_rings[ring_n];

	if (nm_kr_tryget(hw_kring))
		return 0;

	if (!nm_netmap_on(hwna))
		goto out;	/* release the kring acquired above */
	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		kring->rhead, kring->rcur, kring->rtail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
	error = hw_kring->nm_sync(hw_kring, flags);
	if (error)
		goto out;

	/* third step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past the reserved slot */

	/* fourth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		kring->rhead, kring->rcur, kring->rtail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
out:
	nm_kr_put(hw_kring);
	return error;
}
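/* Data path sketch for the opposite direction (switch --> NIC),
 * complementing the interrupt path shown before intr_notify:
 *
 *   sender on another switch port
 *     -> nm_bdg_flush() copies frames into the bwrap rx kring
 *          -> kring->nm_notify == netmap_bwrap_notify()
 *               -> netmap_vp_rxsync()      (collect the new frames)
 *               -> hw_kring->nm_sync()     (hwna txsync: on the wire)
 *               -> netmap_vp_rxsync()      (mark those slots consumed)
 */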
/* nm_bdg_ctl callback for the bwrap.
 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
 * On attach, it needs to provide a fake netmap_priv_d structure and
 * perform a netmap_do_regif() on the bwrap. This will put both the
 * bwrap and the hwna in netmap mode, with the netmap rings shared
 * and cross-linked. Moreover, it will start intercepting interrupts
 * directed to hwna.
 */
static int
netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	int error = 0;

	if (attach) {
		if (NETMAP_OWNED_BY_ANY(na)) {
			return EBUSY;
		}
		if (bna->na_kpriv) {
			/* nothing to do */
			return 0;
		}
		npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
		if (npriv == NULL)
			return ENOMEM;
		error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags);
		if (error) {
			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			return error;
		}
		bna->na_kpriv = npriv;
		na->na_flags |= NAF_BUSY;
	} else {
		int last_instance;

		if (na->active_fds == 0) /* not registered */
			return EINVAL;
		last_instance = netmap_dtor_locked(bna->na_kpriv);
		if (!last_instance) {
			D("--- error, trying to detach an entry with active mmaps");
			error = EINVAL;
		} else {
			struct nm_bridge *b = bna->up.na_bdg,
				*bh = bna->host.na_bdg;
			npriv = bna->na_kpriv;
			bna->na_kpriv = NULL;
			D("deleting priv");

			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			if (b) {
				/* XXX the bwrap dtor should take care
				 * of this (2014-06-16)
				 */
				netmap_bdg_detach_common(b, bna->up.bdg_port,
					(bh ? bna->host.bdg_port : -1));
			}
			na->na_flags &= ~NAF_BUSY;
		}
	}
	return error;
}
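/* Example (illustrative sketch): the vale-ctl tool reaches the
 * callback above through a NIOCREGIF carrying a nr_cmd; attaching
 * the host rings as well is requested with NETMAP_BDG_HOST in
 * nr_arg1. The interface and switch names are arbitrary, and fd is
 * an open descriptor on /dev/netmap as in the earlier example.
 *
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;		// vale-ctl -a vale0:em1
 *	// nmr.nr_arg1 = NETMAP_BDG_HOST;	// vale-ctl -h: host rings too
 *	if (ioctl(fd, NIOCREGIF, &nmr) < 0)
 *		perror("NIOCREGIF");
 *	...
 *	nmr.nr_cmd = NETMAP_BDG_DETACH;		// vale-ctl -d vale0:em1
 */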
/* attach a bridge wrapper to the 'real' device */
int
netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error = 0;
	enum txrx t;

	/* make sure the NIC is not already in use */
	if (NETMAP_OWNED_BY_ANY(hwna)) {
		D("NIC %s busy, cannot attach to bridge", hwna->name);
		return EBUSY;
	}

	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (bna == NULL) {
		return ENOMEM;
	}

	na = &bna->up.up;
	na->na_private = bna;
	strncpy(na->name, nr_name, sizeof(na->name));
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * once all the krings have been created.
	 */
	for_rx_tx(t) {
		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
	}
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_register;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
	na->pdev = hwna->pdev;
	na->nm_mem = netmap_mem_private_new(na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			0, 0, &error);
	na->na_flags |= NAF_MEM_OWNER;
	if (na->nm_mem == NULL)
		goto err_put;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */
	hwna->na_vp = &bna->up;

	if (hwna->na_flags & NAF_HOST_RINGS) {
		if (hwna->na_flags & NAF_SW_ONLY)
			na->na_flags |= NAF_SW_ONLY;
		na->na_flags |= NAF_HOST_RINGS;
		hostna = &bna->host.up;
		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
		hostna->ifp = hwna->ifp;
		for_rx_tx(t) {
			enum txrx r = nm_txrx_swap(t);
			nma_set_nrings(hostna, t, 1);
			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
		}
		// hostna->nm_txsync = netmap_bwrap_host_txsync;
		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
		hostna->nm_notify = netmap_bwrap_notify;
		hostna->nm_mem = na->nm_mem;
		hostna->na_private = bna;
		hostna->na_vp = &bna->up;
		na->na_hostvp = hwna->na_hostvp =
			hostna->na_hostvp = &bna->host;
		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
	}

	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
		na->name, hwna->name,
		na->num_tx_rings, na->num_tx_desc,
		na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		goto err_free;
	}
	/* make the bwrap ifp point to the real ifp.
	 * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
	 * as a request to make the ifp point to the na. Since we
	 * do not want to change the na already pointed to by hwna->ifp,
	 * the following assignment has to be delayed until now
	 */
	na->ifp = hwna->ifp;
	hwna->na_flags |= NAF_BUSY;
	/* make hwna point to the allocator we are actually using,
	 * so that monitors will be able to find it
	 */
	bna->save_nmd = hwna->nm_mem;
	hwna->nm_mem = na->nm_mem;
	return 0;

err_free:
	netmap_mem_delete(na->nm_mem);
err_put:
	hwna->na_vp = hwna->na_hostvp = NULL;
	netmap_adapter_put(hwna);
	free(bna, M_DEVBUF);
	return error;
}
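/* What follows is the global init/uninit for the bridge array.
 * Sketch of the intended usage (an assumption based on the #ifdefs
 * below): without CONFIG_NET_NS the module init path allocates a
 * single global array of NM_BRIDGES switches; with CONFIG_NET_NS,
 * netmap_bns_register() is expected to invoke
 * netmap_init_bridges2()/netmap_uninit_bridges2() once per network
 * namespace, giving each netns its own private set of VALE switches.
 */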
struct nm_bridge *
netmap_init_bridges2(u_int n)
{
	int i;
	struct nm_bridge *b;

	b = malloc(sizeof(struct nm_bridge) * n, M_DEVBUF,
		M_NOWAIT | M_ZERO);
	if (b == NULL)
		return NULL;
	for (i = 0; i < n; i++)
		BDG_RWINIT(&b[i]);
	return b;
}

void
netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
{
	int i;

	if (b == NULL)
		return;

	for (i = 0; i < n; i++)
		BDG_RWDESTROY(&b[i]);
	free(b, M_DEVBUF);
}

int
netmap_init_bridges(void)
{
#ifdef CONFIG_NET_NS
	return netmap_bns_register();
#else
	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
	if (nm_bridges == NULL)
		return ENOMEM;
	return 0;
#endif
}

void
netmap_uninit_bridges(void)
{
#ifdef CONFIG_NET_NS
	netmap_bns_unregister();
#else
	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
#endif
}
#endif /* WITH_VALE */