/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2013-2016 Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
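
/*
 * A minimal illustrative sketch (not code from this module) of the
 * forwarding cycle described above, under the locking rules stated
 * there. The real logic lives in nm_bdg_preflush() and nm_bdg_flush()
 * below; all names here are placeholders:
 *
 *	BDG_RLOCK(b);				// shared lock, sleepable for user ports
 *	  lease = reserve slots on the dst rx ring;	// under the kring q_lock
 *	  copy packets from source to destination;	// q_lock dropped, may page fault
 *	  complete the lease and update the rx ring;	// q_lock taken again
 *	BDG_RUNLOCK(b);
 */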

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>	/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock	// struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined(_WIN32)
#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_BDG_NAME		prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES		max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
/* Default size for the Maximum Frame Size. */
#define NM_BDG_MFS_DEFAULT	1514
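
/*
 * Example port names (purely illustrative): "vale0:1" names a virtual
 * port "1" on switch "vale0", while "vale0:em0" attaches the physical
 * interface em0 to the same switch, because the part after ':' matches
 * an existing interface name.
 */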

/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
		"Max batch size to be used in the bridge");
SYSEND;

static int netmap_vp_create(struct nmreq *, struct ifnet *,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size; an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
	 * forward this packet. ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function
	 */
	struct nm_hash_ent *ht;	// allocated on attach

#ifdef CONFIG_NET_NS
	struct net *ns;
#endif /* CONFIG_NET_NS */
};

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
static struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */


/*
 * this is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}


static int
nm_is_id_char(const char c)
{
	return (c >= 'a' && c <= 'z') ||
	       (c >= 'A' && c <= 'Z') ||
	       (c >= '0' && c <= '9') ||
	       (c == '_');
}

/* Validate the name of a VALE bridge port and return the
 * position of the ":" character. */
static int
nm_vale_name_validate(const char *name)
{
	int colon_pos = -1;
	int i;

	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
		return -1;
	}

	for (i = 0; name[i]; i++) {
		if (name[i] == ':') {
			if (colon_pos != -1) {
				return -1;
			}
			colon_pos = i;
		} else if (!nm_is_id_char(name[i])) {
			return -1;
		}
	}

	if (i >= IFNAMSIZ) {
		return -1;
	}

	return colon_pos;
}

/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, namelen;
	struct nm_bridge *b = NULL, *bridges;
	u_int num_bridges;

	NMG_LOCK_ASSERT();

	netmap_bns_getbridges(&bridges, &num_bridges);

	namelen = nm_vale_name_validate(name);
	if (namelen < 0) {
		D("invalid bridge name %s", name ? name : NULL);
		return NULL;
	}

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < num_bridges; i++) {
		struct nm_bridge *x = bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == num_bridges && b) { /* name not found, can create entry */
		/* initialize the bridge */
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		if (b->ht == NULL) {
			D("failed to allocate hash table");
			return NULL;
		}
		strncpy(b->bdg_basename, name, namelen);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops.lookup = netmap_bdg_learning;
		NM_BNS_GET(b);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			nm_os_free(kring[i].nkr_ft);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = nm_os_malloc(l);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	*/

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--;	/* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		nm_os_free(b->ht);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
		NM_BNS_PUT(b);
	}
}
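
/*
 * Worked example of the detach algorithm above (hypothetical values):
 * with bdg_active_ports = 3 and bdg_port_index = [ 2, 0, 5 | 1, 3, 4 ],
 * detaching hw = 0 swaps its entry with the last active one, giving
 * [ 2, 5 | 0, 1, 3, 4 ] and bdg_active_ports = 2; the entries past the
 * active count are free slots that the next attach can reuse.
 */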

/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	(void)nmr;	// XXX merge ?
	if (attach)
		return 0; /* nothing to do */
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* I took the reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	if (na->ifp != NULL && !nm_iszombie(na)) {
		WNA(na->ifp) = NULL;
		if (vpna->autodelete) {
			ND("releasing %s", na->ifp->if_xname);
			NMG_UNLOCK();
			nm_os_vi_detach(na->ifp);
			NMG_LOCK();
		}
	}
}

/* remove a persistent VALE port from the system */
static int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	vpna = (struct netmap_vp_adapter *)NA(ifp);

	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
	if (vpna->autodelete) {
		error = EINVAL;
		goto err;
	}

	/* also make sure that nobody is using the interface */
	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
		error = EBUSY;
		goto err;
	}

	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires all the references to be released
	 * before unregister
	 */
	netmap_detach(ifp);
	if_rele(ifp);
	nm_os_vi_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

static int
nm_update_info(struct nmreq *nmr, struct netmap_adapter *na)
{
	uint64_t memsize;
	int ret;
	nmr->nr_rx_rings = na->num_rx_rings;
	nmr->nr_tx_rings = na->num_tx_rings;
	nmr->nr_rx_slots = na->num_rx_desc;
	nmr->nr_tx_slots = na->num_tx_desc;
	ret = netmap_mem_get_info(na->nm_mem, &memsize, NULL, &nmr->nr_arg2);
	nmr->nr_memsize = (uint32_t)memsize;
	return ret;
}

/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
int
netmap_vi_create(struct nmreq *nmr, int autodelete)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	struct netmap_mem_d *nmd = NULL;
	int error;

	/* don't include VALE prefix */
	if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
		return EINVAL;
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp) { /* already exists, cannot create a new one */
		error = EEXIST;
		NMG_LOCK();
		if (NM_NA_VALID(ifp)) {
			int update_err = nm_update_info(nmr, NA(ifp));
			if (update_err)
				error = update_err;
		}
		NMG_UNLOCK();
		if_rele(ifp);
		return error;
	}
	error = nm_os_vi_persist(nmr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	if (nmr->nr_arg2) {
		nmd = netmap_mem_find(nmr->nr_arg2);
		if (nmd == NULL) {
			error = EINVAL;
			goto err_1;
		}
	}
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(nmr, ifp, nmd, &vpna);
	if (error) {
		D("error %d", error);
		goto err_1;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	if (!autodelete) {
		netmap_adapter_get(&vpna->up);
	} else {
		vpna->autodelete = 1;
	}
	NM_ATTACH_NA(ifp, &vpna->up);
	/* return the updated info */
	error = nm_update_info(nmr, &vpna->up);
	if (error) {
		goto err_2;
	}
	D("returning nr_arg2 %d", nmr->nr_arg2);
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	D("created %s", ifp->if_xname);
	return 0;

err_2:
	netmap_detach(ifp);
err_1:
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	nm_os_vi_detach(ifp);

	return error;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na,
	struct netmap_mem_d *nmd, int create)
{
	char *nr_name = nmr->nr_name;
	const char *ifname;
	struct ifnet *ifp = NULL;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;	/* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that the name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		ND("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifname = nr_name + b->bdg_namelen + 1;
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			error = EINVAL;
			goto out;
		}

		/* netmap_vp_create creates a struct netmap_vp_adapter */
		error = netmap_vp_create(nmr, NULL, nmd, &vpna);
		if (error) {
			D("error %d", error);
			goto out;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */
	} else {
		struct netmap_adapter *hw;

		/* the vale:nic syntax is only valid for some commands */
		switch (nmr->nr_cmd) {
		case NETMAP_BDG_ATTACH:
		case NETMAP_BDG_DETACH:
		case NETMAP_BDG_POLLING_ON:
		case NETMAP_BDG_POLLING_OFF:
			break; /* ok */
		default:
			error = EINVAL;
			goto out;
		}

		error = netmap_get_hw_na(ifp, nmd, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			hostna = NULL;
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);

out:
	if (ifp)
		if_rele(ifp);

	return error;
}
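
/*
 * Hedged userspace sketch (not part of this module) of how a request
 * reaches nm_bdg_ctl_attach() below; in the legacy API the command is
 * passed in an nmreq through NIOCREGIF. The switch and interface names
 * are placeholders:
 *
 *	struct nmreq nmr;
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &nmr);	// fd is an open /dev/netmap
 *
 * See the vale(4) manual page and the vale-ctl tool for the real usage.
 */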

/* Process NETMAP_BDG_ATTACH */
static int
nm_bdg_ctl_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	struct netmap_mem_d *nmd = NULL;
	int error;

	NMG_LOCK();

	if (nmr->nr_arg2) {
		nmd = netmap_mem_find(nmr->nr_arg2);
		if (nmd == NULL) {
			error = EINVAL;
			goto unlock_exit;
		}
	}

	/* XXX check existing one */
	error = netmap_get_bdg_na(nmr, &na, nmd, 0);
	if (!error) {
		error = EBUSY;
		goto unref_exit;
	}
	error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */);
	if (error) /* no device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(na, nmr, 1);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}

static inline int
nm_is_bwrap(struct netmap_adapter *na)
{
	return na->nm_register == netmap_bwrap_reg;
}

/* process NETMAP_BDG_DETACH */
static int
nm_bdg_ctl_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	} else if (nm_is_bwrap(na) &&
		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
		/* Don't detach a NIC with polling */
		error = EBUSY;
		netmap_adapter_put(na);
		goto unlock_exit;
	}
	if (na->nm_bdg_ctl) {
		/* remove the port from bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(na, nmr, 0);
	}

	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}

struct nm_bdg_polling_state;
struct nm_bdg_kthread {
	struct nm_kctx *nmk;
	u_int qfirst;
	u_int qlast;
	struct nm_bdg_polling_state *bps;
};

struct nm_bdg_polling_state {
	bool configured;
	bool stopped;
	struct netmap_bwrap_adapter *bna;
	u_int reg;
	u_int qfirst;
	u_int qlast;
	u_int cpu_from;
	u_int ncpus;
	struct nm_bdg_kthread *kthreads;
};

static void
netmap_bwrap_polling(void *data, int is_kthread)
{
	struct nm_bdg_kthread *nbk = data;
	struct netmap_bwrap_adapter *bna;
	u_int qfirst, qlast, i;
	struct netmap_kring *kring0, *kring;

	if (!nbk)
		return;
	qfirst = nbk->qfirst;
	qlast = nbk->qlast;
	bna = nbk->bps->bna;
	kring0 = NMR(bna->hwna, NR_RX);

	for (i = qfirst; i < qlast; i++) {
		kring = kring0 + i;
		kring->nm_notify(kring, 0);
	}
}

static int
nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
{
	struct nm_kctx_cfg kcfg;
	int i, j;

	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
	if (bps->kthreads == NULL)
		return ENOMEM;

	bzero(&kcfg, sizeof(kcfg));
	kcfg.worker_fn = netmap_bwrap_polling;
	kcfg.use_kthread = 1;
	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
		int affinity = bps->cpu_from + i;

		t->bps = bps;
		t->qfirst = all ? bps->qfirst /* must be 0 */ : affinity;
		t->qlast = all ?
			bps->qlast : t->qfirst + 1;
		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
			t->qlast);

		kcfg.type = i;
		kcfg.worker_private = t;
		t->nmk = nm_os_kctx_create(&kcfg, 0, NULL);
		if (t->nmk == NULL) {
			goto cleanup;
		}
		nm_os_kctx_worker_setaff(t->nmk, affinity);
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_destroy(t->nmk);
	}
	nm_os_free(bps->kthreads);
	return EFAULT;
}

/* A variant of ptnetmap_start_kthreads() */
static int
nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
{
	int error, i, j;

	if (!bps) {
		D("polling is not configured");
		return EFAULT;
	}
	bps->stopped = false;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		error = nm_os_kctx_worker_start(t->nmk);
		if (error) {
			D("error in nm_kthread_start()");
			goto cleanup;
		}
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_worker_stop(t->nmk);
	}
	bps->stopped = true;
	return error;
}

static void
nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
{
	int i;

	if (!bps)
		return;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		nm_os_kctx_worker_stop(t->nmk);
		nm_os_kctx_destroy(t->nmk);
	}
	bps->stopped = true;
}

static int
get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
	struct nm_bdg_polling_state *bps)
{
	int req_cpus, avail_cpus, core_from;
	u_int reg, i, qfirst, qlast;

	avail_cpus = nm_os_ncpus();
	req_cpus = nmr->nr_arg1;

	if (req_cpus == 0) {
		D("req_cpus must be > 0");
		return EINVAL;
	} else if (req_cpus >= avail_cpus) {
		D("for safety, we need at least one core left in the system");
		return EINVAL;
	}
	reg = nmr->nr_flags & NR_REG_MASK;
	i = nmr->nr_ringid & NETMAP_RING_MASK;
	/*
	 * ONE_NIC: dedicate one core to one ring. If multiple cores
	 *          are specified, consecutive rings are also polled.
	 *          For example, if ringid=2 and 2 cores are given,
	 *          ring 2 and 3 are polled by core 2 and 3, respectively.
	 * ALL_NIC: poll all the rings using a core specified by ringid.
	 *          the number of cores must be 1.
	 */
	if (reg == NR_REG_ONE_NIC) {
		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
			D("only %d rings exist (rings %u-%u are given)",
				nma_get_nrings(na, NR_RX), i, i+req_cpus);
			return EINVAL;
		}
		qfirst = i;
		qlast = qfirst + req_cpus;
		core_from = qfirst;
	} else if (reg == NR_REG_ALL_NIC) {
		if (req_cpus != 1) {
			D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
			return EINVAL;
		}
		qfirst = 0;
		qlast = nma_get_nrings(na, NR_RX);
		core_from = i;
	} else {
		D("reg must be ALL_NIC or ONE_NIC");
		return EINVAL;
	}

	bps->reg = reg;
	bps->qfirst = qfirst;
	bps->qlast = qlast;
	bps->cpu_from = core_from;
	bps->ncpus = req_cpus;
	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
		reg == NR_REG_ALL_NIC ?
"REG_ALL_NIC" : "REG_ONE_NIC", 1146 qfirst, qlast, core_from, req_cpus); 1147 return 0; 1148 } 1149 1150 static int 1151 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na) 1152 { 1153 struct nm_bdg_polling_state *bps; 1154 struct netmap_bwrap_adapter *bna; 1155 int error; 1156 1157 bna = (struct netmap_bwrap_adapter *)na; 1158 if (bna->na_polling_state) { 1159 D("ERROR adapter already in polling mode"); 1160 return EFAULT; 1161 } 1162 1163 bps = nm_os_malloc(sizeof(*bps)); 1164 if (!bps) 1165 return ENOMEM; 1166 bps->configured = false; 1167 bps->stopped = true; 1168 1169 if (get_polling_cfg(nmr, na, bps)) { 1170 nm_os_free(bps); 1171 return EINVAL; 1172 } 1173 1174 if (nm_bdg_create_kthreads(bps)) { 1175 nm_os_free(bps); 1176 return EFAULT; 1177 } 1178 1179 bps->configured = true; 1180 bna->na_polling_state = bps; 1181 bps->bna = bna; 1182 1183 /* disable interrupts if possible */ 1184 nma_intr_enable(bna->hwna, 0); 1185 /* start kthread now */ 1186 error = nm_bdg_polling_start_kthreads(bps); 1187 if (error) { 1188 D("ERROR nm_bdg_polling_start_kthread()"); 1189 nm_os_free(bps->kthreads); 1190 nm_os_free(bps); 1191 bna->na_polling_state = NULL; 1192 nma_intr_enable(bna->hwna, 1); 1193 } 1194 return error; 1195 } 1196 1197 static int 1198 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na) 1199 { 1200 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; 1201 struct nm_bdg_polling_state *bps; 1202 1203 if (!bna->na_polling_state) { 1204 D("ERROR adapter is not in polling mode"); 1205 return EFAULT; 1206 } 1207 bps = bna->na_polling_state; 1208 nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); 1209 bps->configured = false; 1210 nm_os_free(bps); 1211 bna->na_polling_state = NULL; 1212 /* reenable interrupts */ 1213 nma_intr_enable(bna->hwna, 1); 1214 return 0; 1215 } 1216 1217 /* Called by either user's context (netmap_ioctl()) 1218 * or external kernel modules (e.g., Openvswitch). 1219 * Operation is indicated in nmr->nr_cmd. 1220 * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge 1221 * requires bdg_ops argument; the other commands ignore this argument. 1222 * 1223 * Called without NMG_LOCK. 

/* Called by either user's context (netmap_ioctl())
 * or external kernel modules (e.g., Openvswitch).
 * Operation is indicated in nmr->nr_cmd.
 * NETMAP_BDG_REGOPS, which sets the config/lookup/dtor callbacks of the
 * bridge, requires the bdg_ops argument; the other commands ignore it.
 *
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
	struct nm_bridge *b, *bridges;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	switch (cmd) {
	case NETMAP_BDG_NEWIF:
		error = netmap_vi_create(nmr, 0 /* no autodelete */);
		break;

	case NETMAP_BDG_DELIF:
		error = nm_vi_destroy(nmr->nr_name);
		break;

	case NETMAP_BDG_ATTACH:
		error = nm_bdg_ctl_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_ctl_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = 0;
			nmr->nr_arg1 = b - bridges; /* bridge index */
			nmr->nr_arg2 = NM_BDG_NOPORT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(vpna->up.name, name)) {
					nmr->nr_arg2 = i; /* port index */
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = bridges + i;
				for ( ; j < NM_BDG_MAXPORTS; j++) {
					if (b->bdg_ports[j] == NULL)
						continue;
					vpna = b->bdg_ports[j];
					strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
					error = 0;
					goto out;
				}
				j = 0; /* following bridges scan from 0 */
			}
		out:
			nmr->nr_arg1 = i;
			nmr->nr_arg2 = j;
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
		/* register callbacks to the given bridge.
		 * nmr->nr_name may be just the bridge's name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!bdg_ops) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->bdg_ops = *bdg_ops;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		 * 10 and 12.
		 */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
			nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			na->virt_hdr_len = nmr->nr_arg1;
			if (na->virt_hdr_len) {
				vpna->mfs = NETMAP_BUF_SIZE(na);
			}
			D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
			netmap_adapter_put(na);
		} else if (!na) {
			error = ENXIO;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_POLLING_ON:
	case NETMAP_BDG_POLLING_OFF:
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
		if (na && !error) {
			if (!nm_is_bwrap(na)) {
				error = EOPNOTSUPP;
			} else if (cmd == NETMAP_BDG_POLLING_ON) {
				error = nm_bdg_ctl_polling_start(nmr, na);
				if (!error)
					netmap_adapter_get(na);
			} else {
				error = nm_bdg_ctl_polling_stop(nmr, na);
				if (!error)
					netmap_adapter_put(na);
			}
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}

int
netmap_bdg_config(struct nmreq *nmr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nmr->nr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return j;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
		 * have to fix frags count. */
		frags--;
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags;
		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1555 * 1556 * http://www.burtleburtle.net/bob/hash/spooky.html 1557 */ 1558 #define mix(a, b, c) \ 1559 do { \ 1560 a -= b; a -= c; a ^= (c >> 13); \ 1561 b -= c; b -= a; b ^= (a << 8); \ 1562 c -= a; c -= b; c ^= (b >> 13); \ 1563 a -= b; a -= c; a ^= (c >> 12); \ 1564 b -= c; b -= a; b ^= (a << 16); \ 1565 c -= a; c -= b; c ^= (b >> 5); \ 1566 a -= b; a -= c; a ^= (c >> 3); \ 1567 b -= c; b -= a; b ^= (a << 10); \ 1568 c -= a; c -= b; c ^= (b >> 15); \ 1569 } while (/*CONSTCOND*/0) 1570 1571 1572 static __inline uint32_t 1573 nm_bridge_rthash(const uint8_t *addr) 1574 { 1575 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 1576 1577 b += addr[5] << 8; 1578 b += addr[4]; 1579 a += addr[3] << 24; 1580 a += addr[2] << 16; 1581 a += addr[1] << 8; 1582 a += addr[0]; 1583 1584 mix(a, b, c); 1585 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 1586 return (c & BRIDGE_RTHASH_MASK); 1587 } 1588 1589 #undef mix 1590 1591 1592 /* nm_register callback for VALE ports */ 1593 static int 1594 netmap_vp_reg(struct netmap_adapter *na, int onoff) 1595 { 1596 struct netmap_vp_adapter *vpna = 1597 (struct netmap_vp_adapter*)na; 1598 enum txrx t; 1599 int i; 1600 1601 /* persistent ports may be put in netmap mode 1602 * before being attached to a bridge 1603 */ 1604 if (vpna->na_bdg) 1605 BDG_WLOCK(vpna->na_bdg); 1606 if (onoff) { 1607 for_rx_tx(t) { 1608 for (i = 0; i < netmap_real_rings(na, t); i++) { 1609 struct netmap_kring *kring = &NMR(na, t)[i]; 1610 1611 if (nm_kring_pending_on(kring)) 1612 kring->nr_mode = NKR_NETMAP_ON; 1613 } 1614 } 1615 if (na->active_fds == 0) 1616 na->na_flags |= NAF_NETMAP_ON; 1617 /* XXX on FreeBSD, persistent VALE ports should also 1618 * toggle IFCAP_NETMAP in na->ifp (2014-03-16) 1619 */ 1620 } else { 1621 if (na->active_fds == 0) 1622 na->na_flags &= ~NAF_NETMAP_ON; 1623 for_rx_tx(t) { 1624 for (i = 0; i < netmap_real_rings(na, t); i++) { 1625 struct netmap_kring *kring = &NMR(na, t)[i]; 1626 1627 if (nm_kring_pending_off(kring)) 1628 kring->nr_mode = NKR_NETMAP_OFF; 1629 } 1630 } 1631 } 1632 if (vpna->na_bdg) 1633 BDG_WUNLOCK(vpna->na_bdg); 1634 return 0; 1635 } 1636 1637 1638 /* 1639 * Lookup function for a learning bridge. 
 * Update the hash table with the source address,
 * and then return the destination port index, and the
 * ring in *dst_ring (at the moment, always ring 0)
 */
u_int
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	uint8_t *buf = ft->ft_buf;
	u_int buf_len = ft->ft_len;
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;
	uint8_t indbuf[12];

	/* safety check, unfortunately we have many cases */
	if (buf_len >= 14 + na->up.virt_hdr_len) {
		/* virthdr + mac_hdr in the same slot */
		buf += na->up.virt_hdr_len;
		buf_len -= na->up.virt_hdr_len;
	} else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
		/* only header in first fragment */
		ft++;
		buf = ft->ft_buf;
		buf_len = ft->ft_len;
	} else {
		RD(5, "invalid buf format, length %d", buf_len);
		return NM_BDG_NOPORT;
	}

	if (ft->ft_flags & NS_INDIRECT) {
		if (copyin(buf, indbuf, sizeof(indbuf))) {
			return NM_BDG_NOPORT;
		}
		buf = indbuf;
	}

	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); /* hash of source */
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); /* hash of dst */
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
	}
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}
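
/*
 * Worked example for nm_kr_space() with is_rx = 1 (hypothetical values):
 * nkr_num_slots = 1024, nr_hwcur = 10, nkr_hwlease = 1000 gives
 * busy = 990 and space = 1024 - 1 - 990 = 33 slots still available
 * for new leases.
 */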

/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
	u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not into the first
		 * fragment nor at the very beginning of the second.
		 */
		if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port >= NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
			!b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is expensive.
	 * We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		u_int j;
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed, and
		 * we will need to handle the leftover ones when we regain
		 * the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
			if (netmap_verbose) {
				RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
					dst_na->up.virt_hdr_len);
			}
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * dst_na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ?
"virtual" : "nic/host"); 1939 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 1940 nrings = dst_na->up.num_rx_rings; 1941 if (dst_nr >= nrings) 1942 dst_nr = dst_nr % nrings; 1943 kring = &dst_na->up.rx_rings[dst_nr]; 1944 ring = kring->ring; 1945 /* the destination ring may have not been opened for RX */ 1946 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON)) 1947 goto cleanup; 1948 lim = kring->nkr_num_slots - 1; 1949 1950 retry: 1951 1952 if (dst_na->retry && retry) { 1953 /* try to get some free slot from the previous run */ 1954 kring->nm_notify(kring, 0); 1955 /* actually useful only for bwraps, since there 1956 * the notify will trigger a txsync on the hwna. VALE ports 1957 * have dst_na->retry == 0 1958 */ 1959 } 1960 /* reserve the buffers in the queue and an entry 1961 * to report completion, and drop lock. 1962 * XXX this might become a helper function. 1963 */ 1964 mtx_lock(&kring->q_lock); 1965 if (kring->nkr_stopped) { 1966 mtx_unlock(&kring->q_lock); 1967 goto cleanup; 1968 } 1969 my_start = j = kring->nkr_hwlease; 1970 howmany = nm_kr_space(kring, 1); 1971 if (needed < howmany) 1972 howmany = needed; 1973 lease_idx = nm_kr_lease(kring, howmany, 1); 1974 mtx_unlock(&kring->q_lock); 1975 1976 /* only retry if we need more than available slots */ 1977 if (retry && needed <= howmany) 1978 retry = 0; 1979 1980 /* copy to the destination queue */ 1981 while (howmany > 0) { 1982 struct netmap_slot *slot; 1983 struct nm_bdg_fwd *ft_p, *ft_end; 1984 u_int cnt; 1985 1986 /* find the queue from which we pick next packet. 1987 * NM_FT_NULL is always higher than valid indexes 1988 * so we never dereference it if the other list 1989 * has packets (and if both are empty we never 1990 * get here). 1991 */ 1992 if (next < brd_next) { 1993 ft_p = ft + next; 1994 next = ft_p->ft_next; 1995 } else { /* insert broadcast */ 1996 ft_p = ft + brd_next; 1997 brd_next = ft_p->ft_next; 1998 } 1999 cnt = ft_p->ft_frags; // cnt > 0 2000 if (unlikely(cnt > howmany)) 2001 break; /* no more space */ 2002 if (netmap_verbose && cnt > 1) 2003 RD(5, "rx %d frags to %d", cnt, j); 2004 ft_end = ft_p + cnt; 2005 if (unlikely(virt_hdr_mismatch)) { 2006 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 2007 } else { 2008 howmany -= cnt; 2009 do { 2010 char *dst, *src = ft_p->ft_buf; 2011 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 2012 2013 slot = &ring->slot[j]; 2014 dst = NMB(&dst_na->up, slot); 2015 2016 ND("send [%d] %d(%d) bytes at %s:%d", 2017 i, (int)copy_len, (int)dst_len, 2018 NM_IFPNAME(dst_ifp), j); 2019 /* round to a multiple of 64 */ 2020 copy_len = (copy_len + 63) & ~63; 2021 2022 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || 2023 copy_len > NETMAP_BUF_SIZE(&na->up))) { 2024 RD(5, "invalid len %d, down to 64", (int)copy_len); 2025 copy_len = dst_len = 64; // XXX 2026 } 2027 if (ft_p->ft_flags & NS_INDIRECT) { 2028 if (copyin(src, dst, copy_len)) { 2029 // invalid user pointer, pretend len is 0 2030 dst_len = 0; 2031 } 2032 } else { 2033 //memcpy(dst, src, copy_len); 2034 pkt_copy(src, dst, (int)copy_len); 2035 } 2036 slot->len = dst_len; 2037 slot->flags = (cnt << 8)| NS_MOREFRAG; 2038 j = nm_next(j, lim); 2039 needed--; 2040 ft_p++; 2041 } while (ft_p != ft_end); 2042 slot->flags = (cnt << 8); /* clear flag on last entry */ 2043 } 2044 /* are we done ? 
 */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
			/* current position */
			uint32_t *p = kring->nkr_leases; /* shorthand */
			uint32_t update_pos;
			int still_locked = 1;

			mtx_lock(&kring->q_lock);
			if (unlikely(howmany > 0)) {
				/* we have not used all the buffers. If I am the
				 * last one, I can recover the slots, otherwise I
				 * must fill them with 0 to mark empty packets.
				 */
				ND("leftover %d bufs", howmany);
				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
					/* yes, I am the last one */
					ND("roll back nkr_hwlease to %d", j);
					kring->nkr_hwlease = j;
				} else {
					while (howmany-- > 0) {
						ring->slot[j].len = 0;
						ring->slot[j].flags = 0;
						j = nm_next(j, lim);
					}
				}
			}
			p[lease_idx] = j; /* report I am done */

			update_pos = kring->nr_hwtail;

			if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other ranges
				 * have been completed, and do a selwakeup or txsync.
				 */
				while (lease_idx != kring->nkr_lease_idx &&
					p[lease_idx] != NR_NOSLOT) {
					j = p[lease_idx];
					p[lease_idx] = NR_NOSLOT;
					lease_idx = nm_next(lease_idx, lim);
				}
				/* j is the new 'write' position. j != my_start
				 * means there are new buffers to report
				 */
				if (likely(j != my_start)) {
					kring->nr_hwtail = j;
					still_locked = 0;
					mtx_unlock(&kring->q_lock);
					kring->nm_notify(kring, 0);
					/* this is netmap_notify for VALE ports and
					 * netmap_bwrap_notify for bwrap. The latter will
					 * trigger a txsync on the underlying hwna
					 */
					if (dst_na->retry && retry--) {
						/* XXX this is going to call nm_notify again.
						 * Only useful for bwrap in virtual machines
						 */
						goto retry;
					}
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports */
static int
netmap_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;

	if (bridge_batch <= 0) { /* testing only */
		done = head; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = head;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(kring, head);
done:
	if (done != head)
		D("early break at %d/%d, tail %d", done, head, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
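	 * For a VALE port the forwarding itself happens synchronously
	 * inside nm_bdg_preflush() above, so all slots up to 'done' have
	 * already been copied (or dropped) by the switch; setting hwtail
	 * to the slot right before hwcur below marks the whole ring as
	 * available to the sender again.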
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	if (netmap_verbose)
		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* rxsync code used by the VALE ports' nm_rxsync callback and also
 * internally by the bwrap
 */
static int
netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = kring->rhead;
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = NMB(na, slot);

			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	n = 0;
done:
	return n;
}

/*
 * nm_rxsync callback for VALE ports, i.e. a
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync_locked(kring, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


/* nm_bdg_attach callback for VALE ports
 * The na_vp port is this same netmap_adapter. There is no host port.
 */
static int
netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;

	if (vpna->na_bdg)
		return netmap_bwrap_attach(name, na);
	na->na_vp = vpna;
	strncpy(na->name, name, sizeof(na->name));
	na->na_hostvp = NULL;
	return 0;
}

/* create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
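 * (persistent ports are created explicitly, e.g. with vale-ctl -n, and
 * are backed by an ifnet; ephemeral ports only exist while the file
 * descriptor that created them is open.)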
2237 */ 2238 static int 2239 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, 2240 struct netmap_mem_d *nmd, 2241 struct netmap_vp_adapter **ret) 2242 { 2243 struct netmap_vp_adapter *vpna; 2244 struct netmap_adapter *na; 2245 int error = 0; 2246 u_int npipes = 0; 2247 2248 vpna = nm_os_malloc(sizeof(*vpna)); 2249 if (vpna == NULL) 2250 return ENOMEM; 2251 2252 na = &vpna->up; 2253 2254 na->ifp = ifp; 2255 strncpy(na->name, nmr->nr_name, sizeof(na->name)); 2256 2257 /* bound checking */ 2258 na->num_tx_rings = nmr->nr_tx_rings; 2259 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 2260 nmr->nr_tx_rings = na->num_tx_rings; // write back 2261 na->num_rx_rings = nmr->nr_rx_rings; 2262 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 2263 nmr->nr_rx_rings = na->num_rx_rings; // write back 2264 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 2265 1, NM_BDG_MAXSLOTS, NULL); 2266 na->num_tx_desc = nmr->nr_tx_slots; 2267 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 2268 1, NM_BDG_MAXSLOTS, NULL); 2269 /* validate number of pipes. We want at least 1, 2270 * but probably can do with some more. 2271 * So let's use 2 as default (when 0 is supplied) 2272 */ 2273 npipes = nmr->nr_arg1; 2274 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 2275 nmr->nr_arg1 = npipes; /* write back */ 2276 /* validate extra bufs */ 2277 nm_bound_var(&nmr->nr_arg3, 0, 0, 2278 128*NM_BDG_MAXSLOTS, NULL); 2279 na->num_rx_desc = nmr->nr_rx_slots; 2280 /* Set the mfs to a default value, as it is needed on the VALE 2281 * mismatch datapath. XXX We should set it according to the MTU 2282 * known to the kernel. */ 2283 vpna->mfs = NM_BDG_MFS_DEFAULT; 2284 vpna->last_smac = ~0llu; 2285 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 2286 vpna->mfs = netmap_buf_size; */ 2287 if (netmap_verbose) 2288 D("max frame size %u", vpna->mfs); 2289 2290 na->na_flags |= NAF_BDG_MAYSLEEP; 2291 /* persistent VALE ports look like hw devices 2292 * with a native netmap adapter 2293 */ 2294 if (ifp) 2295 na->na_flags |= NAF_NATIVE; 2296 na->nm_txsync = netmap_vp_txsync; 2297 na->nm_rxsync = netmap_vp_rxsync; 2298 na->nm_register = netmap_vp_reg; 2299 na->nm_krings_create = netmap_vp_krings_create; 2300 na->nm_krings_delete = netmap_vp_krings_delete; 2301 na->nm_dtor = netmap_vp_dtor; 2302 D("nr_arg2 %d", nmr->nr_arg2); 2303 na->nm_mem = nmd ? 2304 netmap_mem_get(nmd): 2305 netmap_mem_private_new( 2306 na->num_tx_rings, na->num_tx_desc, 2307 na->num_rx_rings, na->num_rx_desc, 2308 nmr->nr_arg3, npipes, &error); 2309 if (na->nm_mem == NULL) 2310 goto err; 2311 na->nm_bdg_attach = netmap_vp_bdg_attach; 2312 /* other nmd fields are set in the common routine */ 2313 error = netmap_attach_common(na); 2314 if (error) 2315 goto err; 2316 *ret = vpna; 2317 return 0; 2318 2319 err: 2320 if (na->nm_mem != NULL) 2321 netmap_mem_put(na->nm_mem); 2322 nm_os_free(vpna); 2323 return error; 2324 } 2325 2326 /* Bridge wrapper code (bwrap). 2327 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a 2328 * VALE switch. 2329 * The main task is to swap the meaning of tx and rx rings to match the 2330 * expectations of the VALE switch code (see nm_bdg_flush). 2331 * 2332 * The bwrap works by interposing a netmap_bwrap_adapter between the 2333 * rest of the system and the hwna. The netmap_bwrap_adapter looks like 2334 * a netmap_vp_adapter to the rest the system, but, internally, it 2335 * translates all callbacks to what the hwna expects. 
2336 * 2337 * Note that we have to intercept callbacks coming from two sides: 2338 * 2339 * - callbacks coming from the netmap module are intercepted by 2340 * passing around the netmap_bwrap_adapter instead of the hwna 2341 * 2342 * - callbacks coming from outside of the netmap module only know 2343 * about the hwna. This, however, only happens in interrupt 2344 * handlers, where only the hwna->nm_notify callback is called. 2345 * What the bwrap does is to overwrite the hwna->nm_notify callback 2346 * with its own netmap_bwrap_intr_notify. 2347 * XXX This assumes that the hwna->nm_notify callback was the 2348 * standard netmap_notify(), as it is the case for nic adapters. 2349 * Any additional action performed by hwna->nm_notify will not be 2350 * performed by netmap_bwrap_intr_notify. 2351 * 2352 * Additionally, the bwrap can optionally attach the host rings pair 2353 * of the wrapped adapter to a different port of the switch. 2354 */ 2355 2356 2357 static void 2358 netmap_bwrap_dtor(struct netmap_adapter *na) 2359 { 2360 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; 2361 struct netmap_adapter *hwna = bna->hwna; 2362 struct nm_bridge *b = bna->up.na_bdg, 2363 *bh = bna->host.na_bdg; 2364 2365 if (bna->host.up.nm_mem) 2366 netmap_mem_put(bna->host.up.nm_mem); 2367 2368 if (b) { 2369 netmap_bdg_detach_common(b, bna->up.bdg_port, 2370 (bh ? bna->host.bdg_port : -1)); 2371 } 2372 2373 ND("na %p", na); 2374 na->ifp = NULL; 2375 bna->host.up.ifp = NULL; 2376 hwna->na_private = NULL; 2377 hwna->na_vp = hwna->na_hostvp = NULL; 2378 hwna->na_flags &= ~NAF_BUSY; 2379 netmap_adapter_put(hwna); 2380 2381 } 2382 2383 2384 /* 2385 * Intr callback for NICs connected to a bridge. 2386 * Simply ignore tx interrupts (maybe we could try to recover space ?) 2387 * and pass received packets from nic to the bridge. 2388 * 2389 * XXX TODO check locking: this is called from the interrupt 2390 * handler so we should make sure that the interface is not 2391 * disconnected while passing down an interrupt. 2392 * 2393 * Note, no user process can access this NIC or the host stack. 2394 * The only part of the ring that is significant are the slots, 2395 * and head/cur/tail are set from the kring as needed 2396 * (part as a receive ring, part as a transmit ring). 2397 * 2398 * callback that overwrites the hwna notify callback. 2399 * Packets come from the outside or from the host stack and are put on an 2400 * hwna rx ring. 2401 * The bridge wrapper then sends the packets through the bridge. 2402 */ 2403 static int 2404 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) 2405 { 2406 struct netmap_adapter *na = kring->na; 2407 struct netmap_bwrap_adapter *bna = na->na_private; 2408 struct netmap_kring *bkring; 2409 struct netmap_vp_adapter *vpna = &bna->up; 2410 u_int ring_nr = kring->ring_id; 2411 int ret = NM_IRQ_COMPLETED; 2412 int error; 2413 2414 if (netmap_verbose) 2415 D("%s %s 0x%x", na->name, kring->name, flags); 2416 2417 bkring = &vpna->up.tx_rings[ring_nr]; 2418 2419 /* make sure the ring is not disabled */ 2420 if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) { 2421 return EIO; 2422 } 2423 2424 if (netmap_verbose) 2425 D("%s head %d cur %d tail %d", na->name, 2426 kring->rhead, kring->rcur, kring->rtail); 2427 2428 /* simulate a user wakeup on the rx ring 2429 * fetch packets that have arrived. 
2430 */ 2431 error = kring->nm_sync(kring, 0); 2432 if (error) 2433 goto put_out; 2434 if (kring->nr_hwcur == kring->nr_hwtail) { 2435 if (netmap_verbose) 2436 D("how strange, interrupt with no packets on %s", 2437 na->name); 2438 goto put_out; 2439 } 2440 2441 /* new packets are kring->rcur to kring->nr_hwtail, and the bkring 2442 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail 2443 * to push all packets out. 2444 */ 2445 bkring->rhead = bkring->rcur = kring->nr_hwtail; 2446 2447 netmap_vp_txsync(bkring, flags); 2448 2449 /* mark all buffers as released on this ring */ 2450 kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail; 2451 /* another call to actually release the buffers */ 2452 error = kring->nm_sync(kring, 0); 2453 2454 /* The second rxsync may have further advanced hwtail. If this happens, 2455 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */ 2456 if (kring->rcur != kring->nr_hwtail) { 2457 ret = NM_IRQ_RESCHED; 2458 } 2459 put_out: 2460 nm_kr_put(kring); 2461 2462 return error ? error : ret; 2463 } 2464 2465 2466 /* nm_register callback for bwrap */ 2467 static int 2468 netmap_bwrap_reg(struct netmap_adapter *na, int onoff) 2469 { 2470 struct netmap_bwrap_adapter *bna = 2471 (struct netmap_bwrap_adapter *)na; 2472 struct netmap_adapter *hwna = bna->hwna; 2473 struct netmap_vp_adapter *hostna = &bna->host; 2474 int error, i; 2475 enum txrx t; 2476 2477 ND("%s %s", na->name, onoff ? "on" : "off"); 2478 2479 if (onoff) { 2480 /* netmap_do_regif has been called on the bwrap na. 2481 * We need to pass the information about the 2482 * memory allocator down to the hwna before 2483 * putting it in netmap mode 2484 */ 2485 hwna->na_lut = na->na_lut; 2486 2487 if (hostna->na_bdg) { 2488 /* if the host rings have been attached to switch, 2489 * we need to copy the memory allocator information 2490 * in the hostna also 2491 */ 2492 hostna->up.na_lut = na->na_lut; 2493 } 2494 2495 } 2496 2497 /* pass down the pending ring state information */ 2498 for_rx_tx(t) { 2499 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) 2500 NMR(hwna, t)[i].nr_pending_mode = 2501 NMR(na, t)[i].nr_pending_mode; 2502 } 2503 2504 /* forward the request to the hwna */ 2505 error = hwna->nm_register(hwna, onoff); 2506 if (error) 2507 return error; 2508 2509 /* copy up the current ring state information */ 2510 for_rx_tx(t) { 2511 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { 2512 struct netmap_kring *kring = &NMR(hwna, t)[i]; 2513 NMR(na, t)[i].nr_mode = kring->nr_mode; 2514 } 2515 } 2516 2517 /* impersonate a netmap_vp_adapter */ 2518 netmap_vp_reg(na, onoff); 2519 if (hostna->na_bdg) 2520 netmap_vp_reg(&hostna->up, onoff); 2521 2522 if (onoff) { 2523 u_int i; 2524 /* intercept the hwna nm_nofify callback on the hw rings */ 2525 for (i = 0; i < hwna->num_rx_rings; i++) { 2526 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify; 2527 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; 2528 } 2529 i = hwna->num_rx_rings; /* for safety */ 2530 /* save the host ring notify unconditionally */ 2531 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify; 2532 if (hostna->na_bdg) { 2533 /* also intercept the host ring notify */ 2534 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; 2535 } 2536 if (na->active_fds == 0) 2537 na->na_flags |= NAF_NETMAP_ON; 2538 } else { 2539 u_int i; 2540 2541 if (na->active_fds == 0) 2542 na->na_flags &= ~NAF_NETMAP_ON; 2543 2544 /* reset all notify callbacks (including host ring) */ 2545 for (i = 0; i <= 
hwna->num_rx_rings; i++) { 2546 hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify; 2547 hwna->rx_rings[i].save_notify = NULL; 2548 } 2549 hwna->na_lut.lut = NULL; 2550 hwna->na_lut.objtotal = 0; 2551 hwna->na_lut.objsize = 0; 2552 2553 /* pass ownership of the netmap rings to the hwna */ 2554 for_rx_tx(t) { 2555 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { 2556 NMR(na, t)[i].ring = NULL; 2557 } 2558 } 2559 2560 } 2561 2562 return 0; 2563 } 2564 2565 /* nm_config callback for bwrap */ 2566 static int 2567 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, 2568 u_int *rxr, u_int *rxd) 2569 { 2570 struct netmap_bwrap_adapter *bna = 2571 (struct netmap_bwrap_adapter *)na; 2572 struct netmap_adapter *hwna = bna->hwna; 2573 2574 /* forward the request */ 2575 netmap_update_config(hwna); 2576 /* swap the results */ 2577 *txr = hwna->num_rx_rings; 2578 *txd = hwna->num_rx_desc; 2579 *rxr = hwna->num_tx_rings; 2580 *rxd = hwna->num_rx_desc; 2581 2582 return 0; 2583 } 2584 2585 2586 /* nm_krings_create callback for bwrap */ 2587 static int 2588 netmap_bwrap_krings_create(struct netmap_adapter *na) 2589 { 2590 struct netmap_bwrap_adapter *bna = 2591 (struct netmap_bwrap_adapter *)na; 2592 struct netmap_adapter *hwna = bna->hwna; 2593 struct netmap_adapter *hostna = &bna->host.up; 2594 int i, error = 0; 2595 enum txrx t; 2596 2597 ND("%s", na->name); 2598 2599 /* impersonate a netmap_vp_adapter */ 2600 error = netmap_vp_krings_create(na); 2601 if (error) 2602 return error; 2603 2604 /* also create the hwna krings */ 2605 error = hwna->nm_krings_create(hwna); 2606 if (error) { 2607 goto err_del_vp_rings; 2608 } 2609 2610 /* increment the usage counter for all the hwna krings */ 2611 for_rx_tx(t) { 2612 for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) { 2613 NMR(hwna, t)[i].users++; 2614 } 2615 } 2616 2617 /* now create the actual rings */ 2618 error = netmap_mem_rings_create(hwna); 2619 if (error) { 2620 goto err_dec_users; 2621 } 2622 2623 /* cross-link the netmap rings 2624 * The original number of rings comes from hwna, 2625 * rx rings on one side equals tx rings on the other. 2626 */ 2627 for_rx_tx(t) { 2628 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ 2629 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { 2630 NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots; 2631 NMR(na, t)[i].ring = NMR(hwna, r)[i].ring; 2632 } 2633 } 2634 2635 if (na->na_flags & NAF_HOST_RINGS) { 2636 /* the hostna rings are the host rings of the bwrap. 
 * The corresponding krings must point back to the
		 * hostna
		 */
		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
		hostna->tx_rings[0].na = hostna;
		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
		hostna->rx_rings[0].na = hostna;
	}

	return 0;

err_dec_users:
	/* undo the increment done above on all the hwna krings */
	for_rx_tx(t) {
		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
			NMR(hwna, t)[i].users--;
		}
	}
	hwna->nm_krings_delete(hwna);
err_del_vp_rings:
	netmap_vp_krings_delete(na);

	return error;
}


static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	enum txrx t;
	int i;

	ND("%s", na->name);

	/* decrement the usage counter for all the hwna krings */
	for_rx_tx(t) {
		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
			NMR(hwna, t)[i].users--;
		}
	}

	/* delete any netmap rings that are no longer needed */
	netmap_mem_rings_delete(hwna);
	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}


/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *hwna = bna->hwna;
	u_int ring_n = kring->ring_id;
	u_int lim = kring->nkr_num_slots - 1;
	struct netmap_kring *hw_kring;
	int error;

	ND("%s: na %s hwna %s",
		(kring ? kring->name : "NULL!"),
		(na ? na->name : "NULL!"),
		(hwna ? hwna->name : "NULL!"));
	hw_kring = &hwna->tx_rings[ring_n];

	if (nm_kr_tryget(hw_kring, 0, NULL)) {
		return ENXIO;
	}

	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
	error = hw_kring->nm_sync(hw_kring, flags);
	if (error)
		goto put_out;

	/* third step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */

	/* fourth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
put_out:
	nm_kr_put(hw_kring);

	return error ? error : NM_IRQ_COMPLETED;
}


/* nm_bdg_ctl callback for the bwrap.
 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
 * On attach, it needs to provide a fake netmap_priv_d structure and
 * perform a netmap_do_regif() on the bwrap. This will put both the
 * bwrap and the hwna in netmap mode, with the netmap rings shared
 * and cross linked. Moreover, it will start intercepting interrupts
 * directed to hwna.
 */
static int
netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	int error = 0;

	if (attach) {
		if (NETMAP_OWNED_BY_ANY(na)) {
			return EBUSY;
		}
		if (bna->na_kpriv) {
			/* nothing to do */
			return 0;
		}
		npriv = netmap_priv_new();
		if (npriv == NULL)
			return ENOMEM;
		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
		error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags);
		if (error) {
			netmap_priv_delete(npriv);
			return error;
		}
		bna->na_kpriv = npriv;
		na->na_flags |= NAF_BUSY;
	} else {
		if (na->active_fds == 0) /* not registered */
			return EINVAL;
		netmap_priv_delete(bna->na_kpriv);
		bna->na_kpriv = NULL;
		na->na_flags &= ~NAF_BUSY;
	}
	return error;

}

/* attach a bridge wrapper to the 'real' device */
int
netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error = 0;
	enum txrx t;

	/* make sure the NIC is not already in use */
	if (NETMAP_OWNED_BY_ANY(hwna)) {
		D("NIC %s busy, cannot attach to bridge", hwna->name);
		return EBUSY;
	}

	bna = nm_os_malloc(sizeof(*bna));
	if (bna == NULL) {
		return ENOMEM;
	}

	na = &bna->up.up;
	/* make bwrap ifp point to the real ifp */
	na->ifp = hwna->ifp;
	if_ref(na->ifp);
	na->na_private = bna;
	strncpy(na->name, nr_name, sizeof(na->name));
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	for_rx_tx(t) {
		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
	}
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_reg;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
	na->pdev = hwna->pdev;
	na->nm_mem = netmap_mem_get(hwna->nm_mem);
	na->virt_hdr_len = hwna->virt_hdr_len;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
	/* Set the mfs, needed on the VALE mismatch datapath.
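	 * As for plain VALE ports this is only a default; ideally it should
	 * reflect the MTU of the wrapped interface (see the XXX note in
	 * netmap_vp_create()).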
*/ 2835 bna->up.mfs = NM_BDG_MFS_DEFAULT; 2836 2837 bna->hwna = hwna; 2838 netmap_adapter_get(hwna); 2839 hwna->na_private = bna; /* weak reference */ 2840 hwna->na_vp = &bna->up; 2841 2842 if (hwna->na_flags & NAF_HOST_RINGS) { 2843 if (hwna->na_flags & NAF_SW_ONLY) 2844 na->na_flags |= NAF_SW_ONLY; 2845 na->na_flags |= NAF_HOST_RINGS; 2846 hostna = &bna->host.up; 2847 snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name); 2848 hostna->ifp = hwna->ifp; 2849 for_rx_tx(t) { 2850 enum txrx r = nm_txrx_swap(t); 2851 nma_set_nrings(hostna, t, 1); 2852 nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r)); 2853 } 2854 // hostna->nm_txsync = netmap_bwrap_host_txsync; 2855 // hostna->nm_rxsync = netmap_bwrap_host_rxsync; 2856 hostna->nm_notify = netmap_bwrap_notify; 2857 hostna->nm_mem = netmap_mem_get(na->nm_mem); 2858 hostna->na_private = bna; 2859 hostna->na_vp = &bna->up; 2860 na->na_hostvp = hwna->na_hostvp = 2861 hostna->na_hostvp = &bna->host; 2862 hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */ 2863 bna->host.mfs = NM_BDG_MFS_DEFAULT; 2864 } 2865 2866 ND("%s<->%s txr %d txd %d rxr %d rxd %d", 2867 na->name, ifp->if_xname, 2868 na->num_tx_rings, na->num_tx_desc, 2869 na->num_rx_rings, na->num_rx_desc); 2870 2871 error = netmap_attach_common(na); 2872 if (error) { 2873 goto err_free; 2874 } 2875 hwna->na_flags |= NAF_BUSY; 2876 return 0; 2877 2878 err_free: 2879 hwna->na_vp = hwna->na_hostvp = NULL; 2880 netmap_adapter_put(hwna); 2881 nm_os_free(bna); 2882 return error; 2883 2884 } 2885 2886 struct nm_bridge * 2887 netmap_init_bridges2(u_int n) 2888 { 2889 int i; 2890 struct nm_bridge *b; 2891 2892 b = nm_os_malloc(sizeof(struct nm_bridge) * n); 2893 if (b == NULL) 2894 return NULL; 2895 for (i = 0; i < n; i++) 2896 BDG_RWINIT(&b[i]); 2897 return b; 2898 } 2899 2900 void 2901 netmap_uninit_bridges2(struct nm_bridge *b, u_int n) 2902 { 2903 int i; 2904 2905 if (b == NULL) 2906 return; 2907 2908 for (i = 0; i < n; i++) 2909 BDG_RWDESTROY(&b[i]); 2910 nm_os_free(b); 2911 } 2912 2913 int 2914 netmap_init_bridges(void) 2915 { 2916 #ifdef CONFIG_NET_NS 2917 return netmap_bns_register(); 2918 #else 2919 nm_bridges = netmap_init_bridges2(NM_BRIDGES); 2920 if (nm_bridges == NULL) 2921 return ENOMEM; 2922 return 0; 2923 #endif 2924 } 2925 2926 void 2927 netmap_uninit_bridges(void) 2928 { 2929 #ifdef CONFIG_NET_NS 2930 netmap_bns_unregister(); 2931 #else 2932 netmap_uninit_bridges2(nm_bridges, NM_BRIDGES); 2933 #endif 2934 } 2935 #endif /* WITH_VALE */ 2936