/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * $FreeBSD$
 *
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.

		LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


	--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
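
/*
 * To make the workflow above concrete, here is a minimal userspace
 * sketch of steps 1-6. It assumes the helper macros from netmap_user.h
 * (NETMAP_IF, NETMAP_TXRING, NETMAP_BUF, nm_ring_empty, nm_ring_next)
 * and is an illustration only, with error handling omitted and "em0"
 * used as a placeholder interface name:
 *
 *	struct nmreq req;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *txring;
 *	void *mem;
 *	int fd = open("/dev/netmap", O_RDWR);		// step 1
 *
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);			// step 2: bind to em0
 *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	nifp = NETMAP_IF(mem, req.nr_offset);		// step 4
 *	txring = NETMAP_TXRING(nifp, 0);
 *
 *	for (;;) {
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *		poll(&pfd, 1, -1);			// step 6: wait for space
 *		while (!nm_ring_empty(txring)) {
 *			struct netmap_slot *slot = &txring->slot[txring->cur];
 *			// fill NETMAP_BUF(txring, slot->buf_idx), set slot->len
 *			txring->head = txring->cur =
 *				nm_ring_next(txring, txring->cur);
 *		}
 *		ioctl(fd, NIOCTXSYNC, NULL);		// step 5: sync with the kernel
 *	}
 */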

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h>		/* prerequisite */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>		/* defines used in kernel.h */
#include <sys/kernel.h>		/* types used in module initialization */
#include <sys/conf.h>		/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>		/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


/* reduce conditional code */
#define init_waitqueue_head(x)	// only needed in linux


#elif defined(linux)

#include "bsd_glue.h"


#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>


MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");

/*
 * The following variables are used by the drivers and replicate
 * fields in the global memory pool. They only refer to buffers
 * used by physical interfaces.
 */
u_int netmap_total_buffers;
u_int netmap_buf_size;
char *netmap_buffer_base;	/* also address of an invalid buffer */

/* user-controlled variables */
int netmap_verbose;

static int netmap_no_timestamp; /* don't timestamp on rxsync */

SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
int netmap_mitigate = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
int netmap_no_pendintr = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
    CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
int netmap_txsync_retry = 2;
SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
    &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");

int netmap_flags = 0;	/* debug flags */
int netmap_fwd = 0;	/* force transparent mode */
int netmap_mmap_unreg = 0;	/* allow mmap of unregistered fds */

/*
 * netmap_admode selects the netmap mode to use.
 * Invalid values are reset to NETMAP_ADMODE_BEST
 */
enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
	NETMAP_ADMODE_NATIVE,	/* either native or none */
	NETMAP_ADMODE_GENERIC,	/* force generic */
	NETMAP_ADMODE_LAST };
static int netmap_admode = NETMAP_ADMODE_BEST;

int netmap_generic_mit = 100*1000;	/* Generic mitigation interval in nanoseconds. */
int netmap_generic_ringsize = 1024;	/* Generic ringsize. */

SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0, "");

NMG_LOCK_T	netmap_global_lock;


static void
nm_kr_get(struct netmap_kring *kr)
{
	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
		tsleep(kr, 0, "NM_KR_GET", 4);
}


/*
 * mark the ring as stopped, and run through the locks
 * to make sure other users get to see it.
 */
void
netmap_disable_ring(struct netmap_kring *kr)
{
	kr->nkr_stopped = 1;
	nm_kr_get(kr);
	mtx_lock(&kr->q_lock);
	mtx_unlock(&kr->q_lock);
	nm_kr_put(kr);
}


static void
netmap_set_all_rings(struct ifnet *ifp, int stopped)
{
	struct netmap_adapter *na;
	int i;

	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return;

	na = NA(ifp);

	for (i = 0; i <= na->num_tx_rings; i++) {
		if (stopped)
			netmap_disable_ring(na->tx_rings + i);
		else
			na->tx_rings[i].nkr_stopped = 0;
		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY : 0));
	}

	for (i = 0; i <= na->num_rx_rings; i++) {
		if (stopped)
			netmap_disable_ring(na->rx_rings + i);
		else
			na->rx_rings[i].nkr_stopped = 0;
		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY : 0));
	}
}


void
netmap_disable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(ifp, 1 /* stopped */);
}


void
netmap_enable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(ifp, 0 /* enabled */);
}


/*
 * generic bound_checking function
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	}
	if (op && msg)
		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
	return *v;
}

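
/*
 * A typical (hypothetical) use of nm_bound_var() is to sanitize a
 * user-settable tunable before using it, e.g.
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic_ringsize");
 *
 * The tunable and the bounds above are only an example; the actual
 * callers and limits live in the memory allocator and generic code.
 */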

/*
 * packet-dump function, user-supplied or static buffer.
 * The destination buffer must be at least 30+4*len
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	static char _dst[8192];
	int i, j, i0;
	static char hex[] = "0123456789abcdef";
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (!dst)
		dst = _dst;
	if (lim <= 0 || lim > len)
		lim = len;
	o = dst;
	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		sprintf(o, "%5d: ", i);
		o += strlen(o);
		memset(o, ' ', 48);
		i0 = i;
		for (j = 0; j < 16 && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		i = i0;
		for (j = 0; j < 16 && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}


/*
 * Fetch configuration from the device, to cope with dynamic
 * reconfigurations after loading the module.
 */
int
netmap_update_config(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;
	u_int txr, txd, rxr, rxd;

	txr = txd = rxr = rxd = 0;
	if (na->nm_config) {
		na->nm_config(na, &txr, &txd, &rxr, &rxd);
	} else {
		/* take whatever we had at init time */
		txr = na->num_tx_rings;
		txd = na->num_tx_desc;
		rxr = na->num_rx_rings;
		rxd = na->num_rx_desc;
	}

	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
		return 0; /* nothing changed */
	if (netmap_verbose || na->active_fds > 0) {
		D("stored config %s: txring %d x %d, rxring %d x %d",
			NM_IFPNAME(ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc);
		D("new config %s: txring %d x %d, rxring %d x %d",
			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
	}
	if (na->active_fds == 0) {
		D("configuration changed (but fine)");
		na->num_tx_rings = txr;
		na->num_tx_desc = txd;
		na->num_rx_rings = rxr;
		na->num_rx_desc = rxd;
		return 0;
	}
	D("configuration changed while active, this is bad...");
	return 1;
}


int
netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
{
	u_int i, len, ndesc;
	struct netmap_kring *kring;

	// XXX additional space for extra rings ?
	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;

	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (na->tx_rings == NULL) {
		D("Cannot allocate krings");
		return ENOMEM;
	}
	na->rx_rings = na->tx_rings + ntx;

	/*
	 * All fields in krings are 0 except the ones initialized below,
	 * but better be explicit on important kring fields.
	 */
	ndesc = na->num_tx_desc;
	for (i = 0; i < ntx; i++) { /* Transmit rings */
		kring = &na->tx_rings[i];
		bzero(kring, sizeof(*kring));
		kring->na = na;
		kring->ring_id = i;
		kring->nkr_num_slots = ndesc;
		/*
		 * IMPORTANT: Always keep one slot empty.
		 */
		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
		kring->rtail = kring->nr_hwtail = ndesc - 1;
		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
		init_waitqueue_head(&kring->si);
	}

	ndesc = na->num_rx_desc;
	for (i = 0; i < nrx; i++) { /* Receive rings */
		kring = &na->rx_rings[i];
		bzero(kring, sizeof(*kring));
		kring->na = na;
		kring->ring_id = i;
		kring->nkr_num_slots = ndesc;
		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
		kring->rtail = kring->nr_hwtail = 0;
		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
		init_waitqueue_head(&kring->si);
	}
	init_waitqueue_head(&na->tx_si);
	init_waitqueue_head(&na->rx_si);

	na->tailroom = na->rx_rings + nrx;

	return 0;
}


/* XXX check boundaries */
void
netmap_krings_delete(struct netmap_adapter *na)
{
	int i;

	for (i = 0; i < na->num_tx_rings + 1; i++) {
		mtx_destroy(&na->tx_rings[i].q_lock);
	}
	for (i = 0; i < na->num_rx_rings + 1; i++) {
		mtx_destroy(&na->rx_rings[i].q_lock);
	}
	free(na->tx_rings, M_DEVBUF);
	na->tx_rings = na->rx_rings = na->tailroom = NULL;
}


/*
 * Destructor for NIC ports. They also have an mbuf queue
 * on the rings connected to the host so we need to purge
 * them first.
 */
static void
netmap_hw_krings_delete(struct netmap_adapter *na)
{
	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;

	ND("destroy sw mbq with len %d", mbq_len(q));
	mbq_purge(q);
	mbq_safe_destroy(q);
	netmap_krings_delete(na);
}


static struct netmap_if*
netmap_if_new(const char *ifname, struct netmap_adapter *na)
{
	struct netmap_if *nifp;

	if (netmap_update_config(na)) {
		/* configuration mismatch, report and fail */
		return NULL;
	}

	if (na->active_fds)
		goto final;

	if (na->nm_krings_create(na))
		goto cleanup;

	if (netmap_mem_rings_create(na))
		goto cleanup;

final:

	nifp = netmap_mem_if_new(ifname, na);
	if (nifp == NULL)
		goto cleanup;

	return (nifp);

cleanup:

	if (na->active_fds == 0) {
		netmap_mem_rings_delete(na);
		na->nm_krings_delete(na);
	}

	return NULL;
}


/* grab a reference to the memory allocator, if we don't have one already.
 * The reference is taken from the netmap_adapter registered with the priv.
 */
static int
netmap_get_memory_locked(struct netmap_priv_d* p)
{
	struct netmap_mem_d *nmd;
	int error = 0;

	if (p->np_na == NULL) {
		if (!netmap_mmap_unreg)
			return ENODEV;
		/* for compatibility with older versions of the API
		 * we use the global allocator when no interface has been
		 * registered
		 */
		nmd = &nm_mem;
	} else {
		nmd = p->np_na->nm_mem;
	}
	if (p->np_mref == NULL) {
		error = netmap_mem_finalize(nmd);
		if (!error)
			p->np_mref = nmd;
	} else if (p->np_mref != nmd) {
		/* a virtual port has been registered, but previous
		 * syscalls already used the global allocator.
		 * We cannot continue.
		 */
		error = ENODEV;
	}
	return error;
}


int
netmap_get_memory(struct netmap_priv_d* p)
{
	int error;
	NMG_LOCK();
	error = netmap_get_memory_locked(p);
	NMG_UNLOCK();
	return error;
}


static int
netmap_have_memory_locked(struct netmap_priv_d* p)
{
	return p->np_mref != NULL;
}


static void
netmap_drop_memory_locked(struct netmap_priv_d* p)
{
	if (p->np_mref) {
		netmap_mem_deref(p->np_mref);
		p->np_mref = NULL;
	}
}


/*
 * File descriptor's private data destructor.
 *
 * Call nm_register(ifp,0) to stop netmap mode on the interface and
 * revert to normal operation. We expect that np_na->ifp has not gone
 * away. The second argument is the nifp to work on. In some cases it is
 * not attached yet to the netmap_priv_d so we need to pass it as
 * a separate argument.
 */
/* call with NMG_LOCK held */
static void
netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
{
	struct netmap_adapter *na = priv->np_na;
	struct ifnet *ifp = na->ifp;

	NMG_LOCK_ASSERT();
	na->active_fds--;
	if (na->active_fds <= 0) {	/* last instance */

		if (netmap_verbose)
			D("deleting last instance for %s", NM_IFPNAME(ifp));
		/*
		 * (TO CHECK) This function is only called
		 * when the last reference to this file descriptor goes
		 * away. This means we cannot have any pending poll()
		 * or interrupt routine operating on the structure.
		 * XXX The file may be closed in a thread while
		 * another thread is using it.
		 * Linux keeps the file opened until the last reference
		 * by any outstanding ioctl/poll or mmap is gone.
		 * FreeBSD does not track mmap()s (but we do) and
		 * wakes up any sleeping poll(). Need to check what
		 * happens if the close() occurs while a concurrent
		 * syscall is running.
		 */
		if (ifp)
			na->nm_register(na, 0); /* off, clear flags */
		/* Wake up any sleeping threads. netmap_poll will
		 * then return POLLERR
		 * XXX The wake up now must happen during *_down(), when
		 * we order all activities to stop. -gl
		 */
		/* XXX kqueue(9) needed; these will mirror knlist_init. */
		/* knlist_destroy(&na->tx_si.si_note); */
		/* knlist_destroy(&na->rx_si.si_note); */

		/* delete rings and buffers */
		netmap_mem_rings_delete(na);
		na->nm_krings_delete(na);
	}
	/* delete the nifp */
	netmap_mem_if_delete(na, nifp);
}


/*
 * returns 1 if this is the last instance and we can free priv
 */
int
netmap_dtor_locked(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

#ifdef __FreeBSD__
	/*
	 * np_refcount is the number of active mmaps on
	 * this file descriptor
	 */
	if (--priv->np_refcount > 0) {
		return 0;
	}
#endif /* __FreeBSD__ */
	if (!na) {
		return 1; //XXX is it correct?
	}
	netmap_do_unregif(priv, priv->np_nifp);
	priv->np_nifp = NULL;
	netmap_drop_memory_locked(priv);
	if (priv->np_na) {
		netmap_adapter_put(na);
		priv->np_na = NULL;
	}
	return 1;
}


void
netmap_dtor(void *data)
{
	struct netmap_priv_d *priv = data;
	int last_instance;

	NMG_LOCK();
	last_instance = netmap_dtor_locked(priv);
	NMG_UNLOCK();
	if (last_instance) {
		bzero(priv, sizeof(*priv));	/* for safety */
		free(priv, M_DEVBUF);
	}
}




/*
 * Handlers for synchronization of the queues from/to the host.
 * Netmap has two operating modes:
 * - in the default mode, the rings connected to the host stack are
 *   just another ring pair managed by userspace;
 * - in transparent mode (XXX to be defined) incoming packets
 *   (from the host or the NIC) are marked as NS_FORWARD upon
 *   arrival, and the user application has a chance to reset the
 *   flag for packets that should be dropped.
 *   On the RXSYNC or poll(), packets in RX rings between
 *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
 *   to the other side.
 * The transfer NIC --> host is relatively easy, just encapsulate
 * into mbufs and we are done. The host --> NIC side is slightly
 * harder because there might not be room in the tx ring so it
 * might take a while before releasing the buffer.
 */
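
/*
 * A hedged sketch of how a userspace application might drive the
 * NIC-to-host direction of transparent mode (assuming NR_FORWARD is
 * set in ring->flags, or dev.netmap.fwd=1, and the nm_ring_next()
 * helper from netmap_user.h): after inspecting received slots it
 * marks the ones to hand to the host stack and releases them, e.g.
 *
 *	for (; ring->head != ring->tail;
 *	     ring->head = nm_ring_next(ring, ring->head)) {
 *		struct netmap_slot *slot = &ring->slot[ring->head];
 *		if (should_go_to_host_stack(slot))	// application policy
 *			slot->flags |= NS_FORWARD;
 *	}
 *	ring->cur = ring->head;
 *	// the next rxsync/poll() moves the NS_FORWARD slots (see below)
 *
 * should_go_to_host_stack() is a placeholder for application logic.
 */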

/*
 * pass a chain of buffers to the host stack as coming from 'dst'
 * We do not need to lock because the queue is private.
 */
static void
netmap_send_up(struct ifnet *dst, struct mbq *q)
{
	struct mbuf *m;

	/* send packets up, outside the lock */
	while ((m = mbq_dequeue(q)) != NULL) {
		if (netmap_verbose & NM_VERB_HOST)
			D("sending up pkt %p size %d", m, MBUF_LEN(m));
		NM_SEND_UP(dst, m);
	}
	mbq_destroy(q);
}


/*
 * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
 * and pass them up. Drop remaining packets in the unlikely event
 * of an mbuf shortage.
 */
static void
netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
{
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->ring->head;
	u_int n;
	struct netmap_adapter *na = kring->na;

	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
		struct mbuf *m;
		struct netmap_slot *slot = &kring->ring->slot[n];

		if ((slot->flags & NS_FORWARD) == 0 && !force)
			continue;
		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
			RD(5, "bad pkt at %d len %d", n, slot->len);
			continue;
		}
		slot->flags &= ~NS_FORWARD; // XXX needed ?
		/* XXX TODO: adapt to the case of a multisegment packet */
		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);

		if (m == NULL)
			break;
		mbq_enqueue(q, m);
	}
}


/*
 * Send to the NIC rings packets marked NS_FORWARD between
 * kring->nr_hwcur and kring->rhead.
 * Called under kring->rx_queue.lock on the sw rx ring.
 */
static u_int
netmap_sw_to_nic(struct netmap_adapter *na)
{
	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
	struct netmap_slot *rxslot = kring->ring->slot;
	u_int i, rxcur = kring->nr_hwcur;
	u_int const head = kring->rhead;
	u_int const src_lim = kring->nkr_num_slots - 1;
	u_int sent = 0;

	/* scan rings to find space, then fill as much as possible */
	for (i = 0; i < na->num_tx_rings; i++) {
		struct netmap_kring *kdst = &na->tx_rings[i];
		struct netmap_ring *rdst = kdst->ring;
		u_int const dst_lim = kdst->nkr_num_slots - 1;

		/* XXX do we trust ring or kring->rcur,rtail ? */
		for (; rxcur != head && !nm_ring_empty(rdst);
		     rxcur = nm_next(rxcur, src_lim) ) {
			struct netmap_slot *src, *dst, tmp;
			u_int dst_cur = rdst->cur;

			src = &rxslot[rxcur];
			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
				continue;

			sent++;

			dst = &rdst->slot[dst_cur];

			tmp = *src;

			src->buf_idx = dst->buf_idx;
			src->flags = NS_BUF_CHANGED;

			dst->buf_idx = tmp.buf_idx;
			dst->len = tmp.len;
			dst->flags = NS_BUF_CHANGED;

			rdst->cur = nm_next(dst_cur, dst_lim);
		}
		/* if (sent) XXX txsync ? */
	}
	return sent;
}


/*
 * netmap_txsync_to_host() passes packets up. We are called from a
 * system call in user process context, and the only contention
 * can be among multiple user threads erroneously calling
 * this routine concurrently.
 */
void
netmap_txsync_to_host(struct netmap_adapter *na)
{
	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
	struct netmap_ring *ring = kring->ring;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = nm_txsync_prologue(kring);
	struct mbq q;
	int error;

	error = nm_kr_tryget(kring);
	if (error) {
		if (error == NM_KR_BUSY)
			D("ring %p busy (user error)", kring);
		return;
	}
	if (head > lim) {
		D("invalid ring index in stack TX kring %p", kring);
		netmap_ring_reinit(kring);
		nm_kr_put(kring);
		return;
	}

	/* Take packets from hwcur to head and pass them up.
	 * force head = cur since netmap_grab_packets() stops at head
	 * In case of no buffers we give up. At the end of the loop,
	 * the queue is drained in all cases.
	 */
	mbq_init(&q);
	ring->cur = head;
	netmap_grab_packets(kring, &q, 1 /* force */);
	ND("have %d pkts in queue", mbq_len(&q));
	kring->nr_hwcur = head;
	kring->nr_hwtail = head + lim;
	if (kring->nr_hwtail > lim)
		kring->nr_hwtail -= lim + 1;
	nm_txsync_finalize(kring);

	nm_kr_put(kring);
	netmap_send_up(na->ifp, &q);
}


/*
 * rxsync backend for packets coming from the host stack.
 * They have been put in kring->rx_queue by netmap_transmit().
 * We protect access to the kring using kring->rx_queue.lock
 *
 * This routine also does the selrecord if called from the poll handler
 * (we know because td != NULL).
 *
 * NOTE: on linux, selrecord() is defined as a macro and uses pwait
 * as an additional hidden argument.
 * returns the number of packets delivered to tx queues in
 * transparent mode, or a negative value if error
 */
int
netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
{
	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, n;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = nm_rxsync_prologue(kring);
	int ret = 0;
	struct mbq *q = &kring->rx_queue;

	(void)pwait;	/* disable unused warnings */

	if (head > lim) {
		netmap_ring_reinit(kring);
		return EINVAL;
	}

	if (kring->nkr_stopped) /* check a first time without lock */
		return EBUSY;

	mtx_lock(&q->lock);

	if (kring->nkr_stopped) {  /* check again with lock held */
		ret = EBUSY;
		goto unlock_out;
	}

	/* First part: import newly received packets */
	n = mbq_len(q);
	if (n) { /* grab packets from the queue */
		struct mbuf *m;
		uint32_t stop_i;

		nm_i = kring->nr_hwtail;
		stop_i = nm_prev(nm_i, lim);
		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
			int len = MBUF_LEN(m);
			struct netmap_slot *slot = &ring->slot[nm_i];

			m_copydata(m, 0, len, BDG_NMB(na, slot));
			ND("nm %d len %d", nm_i, len);
			if (netmap_verbose)
				D("%s", nm_dump_buf(BDG_NMB(na, slot), len, 128, NULL));

			slot->len = len;
			slot->flags = kring->nkr_slot_flags;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwtail = nm_i;
	}

	/*
	 * Second part: skip past packets that userspace has released.
	 */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) { /* something was released */
		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
			ret = netmap_sw_to_nic(na);
		kring->nr_hwcur = head;
	}

	nm_rxsync_finalize(kring);

	/* access copies of cur,tail in the kring */
	if (kring->rcur == kring->rtail && td) /* no bufs available */
		selrecord(td, &kring->si);

unlock_out:

	mtx_unlock(&q->lock);
	return ret;
}


/* Get a netmap adapter for the port.
 *
 * If it is possible to satisfy the request, return 0
 * with *na containing the netmap adapter found.
 * Otherwise return an error code, with *na containing NULL.
 *
 * When the port is attached to a bridge, we always return
 * EBUSY.
 * Otherwise, if the port is already bound to a file descriptor,
 * then we unconditionally return the existing adapter into *na.
 * In all the other cases, we return (into *na) either native,
 * generic or NULL, according to the following table:
 *
 *					native_support
 * active_fds   dev.netmap.admode         YES      NO
 * -------------------------------------------------------
 *    >0        *                         NA(ifp)  NA(ifp)
 *
 *     0        NETMAP_ADMODE_BEST        NATIVE   GENERIC
 *     0        NETMAP_ADMODE_NATIVE      NATIVE   NULL
 *     0        NETMAP_ADMODE_GENERIC     GENERIC  GENERIC
 *
 */

int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
{
	/* generic support */
	int i = netmap_admode;	/* Take a snapshot. */
	int error = 0;
	struct netmap_adapter *prev_na;
	struct netmap_generic_adapter *gna;

	*na = NULL; /* default */

	/* reset in case of invalid value */
	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
		i = netmap_admode = NETMAP_ADMODE_BEST;

	if (NETMAP_CAPABLE(ifp)) {
		/* If an adapter already exists, but is
		 * attached to a vale port, we report that the
		 * port is busy.
		 */
		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
			return EBUSY;

		/* If an adapter already exists, return it if
		 * there are active file descriptors or if
		 * netmap is not forced to use generic
		 * adapters.
		 */
		if (NA(ifp)->active_fds > 0 ||
				i != NETMAP_ADMODE_GENERIC) {
			*na = NA(ifp);
			return 0;
		}
	}

	/* If there isn't native support and netmap is not allowed
	 * to use generic adapters, we cannot satisfy the request.
	 */
	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
		return EINVAL;

	/* Otherwise, create a generic adapter and return it,
	 * saving the previously used netmap adapter, if any.
	 *
	 * Note that here 'prev_na', if not NULL, MUST be a
	 * native adapter, and CANNOT be a generic one. This is
	 * true because generic adapters are created on demand, and
	 * destroyed when not used anymore. Therefore, if the adapter
	 * currently attached to an interface 'ifp' is generic, it
	 * must be that
	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
	 * Consequently, if NA(ifp) is generic, we will enter one of
	 * the branches above. This ensures that we never override
	 * a generic adapter with another generic adapter.
	 */
	prev_na = NA(ifp);
	error = generic_netmap_attach(ifp);
	if (error)
		return error;

	*na = NA(ifp);
	gna = (struct netmap_generic_adapter*)NA(ifp);
	gna->prev = prev_na; /* save old na */
	if (prev_na != NULL) {
		ifunit_ref(ifp->if_xname);
		// XXX add a refcount ?
		netmap_adapter_get(prev_na);
	}
	ND("Created generic NA %p (prev %p)", gna, gna->prev);

	return 0;
}


/*
 * MUST BE CALLED UNDER NMG_LOCK()
 *
 * get a refcounted reference to an interface.
 * This is always called in the execution of an ioctl().
 *
 * Return ENXIO if the interface does not exist, EINVAL if netmap
 * is not supported by the interface.
 * If successful, hold a reference.
 *
 * When the NIC is attached to a bridge, the reference is managed
 * at na->na_bdg_refcount using ADD/DROP_BDG_REF(), as for
 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC
 * is detached from the bridge, then ifp's refcount is dropped
 * (in the case of virtual ports this is equivalent to destroying the ifp).
 *
 * This function uses if_rele() when we want to prevent the NIC from
 * being detached from the bridge in error handling. But once the refcount
 * is acquired by this function, it must be released using nm_if_rele().
 */
int
netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	struct ifnet *ifp;
	int error = 0;
	struct netmap_adapter *ret;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();

	error = netmap_get_bdg_na(nmr, na, create);
	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
		return error;

	ifp = ifunit_ref(nmr->nr_name);
	if (ifp == NULL) {
		return ENXIO;
	}

	error = netmap_get_hw_na(ifp, &ret);
	if (error)
		goto out;

	if (ret != NULL) {
		/* Users cannot use the NIC attached to a bridge directly */
		if (NETMAP_OWNED_BY_KERN(ret)) {
			error = EINVAL;
			goto out;
		}
		error = 0;
		*na = ret;
		netmap_adapter_get(ret);
	}
out:
	if_rele(ifp);

	return error;
}


/*
 * validate parameters on entry for *_txsync()
 * Returns ring->head if ok, or something >= kring->nkr_num_slots
 * in case of error.
 *
 * rhead, rcur and rtail=hwtail are stored from previous round.
 * hwcur is the next packet to send to the ring.
 *
 * We want
 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 *
 * hwcur, rhead, rtail and hwtail are reliable
 */
u_int
nm_txsync_prologue(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int head = ring->head; /* read only once */
	u_int cur = ring->cur; /* read only once */
	u_int n = kring->nkr_num_slots;

	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
	if (kring->nr_hwcur >= n || kring->rhead >= n ||
	    kring->rtail >= n ||  kring->nr_hwtail >= n)
		goto error;
#endif /* kernel sanity checks */
	/*
	 * user sanity checks. We only use 'cur',
	 * A, B, ... are possible positions for cur:
	 *
	 *  0    A  cur   B  tail  C  n-1
	 *  0    D  tail  E  cur   F  n-1
	 *
	 * B, F, D are valid. A, C, E are wrong
	 */
	if (kring->rtail >= kring->rhead) {
		/* want rhead <= head <= rtail */
		if (head < kring->rhead || head > kring->rtail)
			goto error;
		/* and also head <= cur <= rtail */
		if (cur < head || cur > kring->rtail)
			goto error;
	} else { /* here rtail < rhead */
		/* we need head outside rtail .. rhead */
		if (head > kring->rtail && head < kring->rhead)
			goto error;

		/* two cases now: head <= rtail or head >= rhead  */
		if (head <= kring->rtail) {
			/* want head <= cur <= rtail */
			if (cur < head || cur > kring->rtail)
				goto error;
		} else { /* head >= rhead */
			/* cur must be outside rtail..head */
			if (cur > kring->rtail && cur < head)
				goto error;
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "tail overwritten was %d need %d",
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	kring->rhead = head;
	kring->rcur = cur;
	return head;

error:
	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur,
		kring->rcur, kring->nr_hwtail,
		cur, ring->tail);
	return n;
}


/*
 * validate parameters on entry for *_rxsync()
 * Returns ring->head if ok, kring->nkr_num_slots on error.
 *
 * For a valid configuration,
 *   hwcur <= head <= cur <= tail <= hwtail
 *
 * We only consider head and cur.
 * hwcur and hwtail are reliable.
 *
 */
u_int
nm_rxsync_prologue(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	uint32_t const n = kring->nkr_num_slots;
	uint32_t head, cur;

	ND("%s kc %d kt %d h %d c %d t %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However:
	 * - head is not an issue because the previous value is hwcur;
	 * - cur could in principle go back, however it does not matter
	 *   because we are processing a brand new rxsync()
	 */
	cur = kring->rcur = ring->cur;	/* read only once */
	head = kring->rhead = ring->head;	/* read only once */
#if 1 /* kernel sanity checks */
	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
		goto error;
#endif /* kernel sanity checks */
	/* user sanity checks */
	if (kring->nr_hwtail >= kring->nr_hwcur) {
		/* want hwcur <= rhead <= hwtail */
		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
			goto error;
		/* and also rhead <= rcur <= hwtail */
		if (cur < head || cur > kring->nr_hwtail)
			goto error;
	} else {
		/* we need rhead outside hwtail..hwcur */
		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
			goto error;
		/* two cases now: head <= hwtail or head >= hwcur  */
		if (head <= kring->nr_hwtail) {
			/* want head <= cur <= hwtail */
			if (cur < head || cur > kring->nr_hwtail)
				goto error;
		} else {
			/* cur must be outside hwtail..head */
			if (cur < head && cur > kring->nr_hwtail)
				goto error;
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "%s tail overwritten was %d need %d",
			kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	return head;

error:
	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
		kring->nr_hwcur,
		kring->rcur, kring->nr_hwtail,
		kring->rhead, kring->rcur, ring->tail);
	return n;
}


/*
 * Error routine called when txsync/rxsync detects an error.
 * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
 * Return 1 on reinit.
 *
 * This routine is only called by the upper half of the kernel.
 * It only reads hwcur (which is changed only by the upper half, too)
 * and hwtail (which may be changed by the lower half, but only on
 * a tx ring and only to increase it, so any error will be recovered
 * on the next call). For the above, we don't strictly need to call
 * it under lock.
 */
int
netmap_ring_reinit(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int i, lim = kring->nkr_num_slots - 1;
	int errors = 0;

	// XXX KASSERT nm_kr_tryget
	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
	// XXX probably wrong to trust userspace
	kring->rhead = ring->head;
	kring->rcur  = ring->cur;
	kring->rtail = ring->tail;

	if (ring->cur > lim)
		errors++;
	if (ring->head > lim)
		errors++;
	if (ring->tail > lim)
		errors++;
	for (i = 0; i <= lim; i++) {
		u_int idx = ring->slot[i].buf_idx;
		u_int len = ring->slot[i].len;
		if (idx < 2 || idx >= netmap_total_buffers) {
			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
			ring->slot[i].buf_idx = 0;
			ring->slot[i].len = 0;
		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
			ring->slot[i].len = 0;
			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
		}
	}
	if (errors) {
		RD(10, "total %d errors", errors);
		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
			kring->name,
			ring->cur, kring->nr_hwcur,
			ring->tail, kring->nr_hwtail);
		ring->head = kring->rhead = kring->nr_hwcur;
		ring->cur  = kring->rcur  = kring->nr_hwcur;
		ring->tail = kring->rtail = kring->nr_hwtail;
	}
	return (errors ? 1 : 0);
}


/*
 * Set the ring ID. For devices with a single queue, a request
 * for all rings is the same as a single ring.
 */
static int
netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
{
	struct netmap_adapter *na = priv->np_na;
	struct ifnet *ifp = na->ifp;
	u_int i = ringid & NETMAP_RING_MASK;
	/* initially (np_qfirst == np_qlast) we don't want to lock */
	u_int lim = na->num_rx_rings;

	if (na->num_tx_rings > lim)
		lim = na->num_tx_rings;
	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
		D("invalid ring id %d", i);
		return (EINVAL);
	}
	priv->np_ringid = ringid;
	if (ringid & NETMAP_SW_RING) {
		priv->np_qfirst = NETMAP_SW_RING;
		priv->np_qlast = 0;
	} else if (ringid & NETMAP_HW_RING) {
		priv->np_qfirst = i;
		priv->np_qlast = i + 1;
	} else {
		priv->np_qfirst = 0;
		priv->np_qlast = NETMAP_HW_RING;
	}
	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
	if (netmap_verbose) {
		if (ringid & NETMAP_SW_RING)
			D("ringid %s set to SW RING", NM_IFPNAME(ifp));
		else if (ringid & NETMAP_HW_RING)
			D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
				priv->np_qfirst);
		else
			D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
	}
	return 0;
}

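
/*
 * For reference, the nr_ringid values interpreted above are built by
 * userspace before NIOCREGIF, roughly as follows (a sketch, not a
 * complete program):
 *
 *	req.nr_ringid = 0;				// all hardware rings
 *	req.nr_ringid = NETMAP_HW_RING | 2;		// only hardware ring 2
 *	req.nr_ringid = NETMAP_SW_RING;			// only the host stack ring
 *	req.nr_ringid |= NETMAP_NO_TX_POLL;		// don't txsync on poll()
 *
 * where 'req' is the struct nmreq later passed to ioctl(fd, NIOCREGIF, &req).
 */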
1412 */ 1413 struct netmap_if * 1414 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1415 uint16_t ringid, int *err) 1416 { 1417 struct ifnet *ifp = na->ifp; 1418 struct netmap_if *nifp = NULL; 1419 int error, need_mem = 0; 1420 1421 NMG_LOCK_ASSERT(); 1422 /* ring configuration may have changed, fetch from the card */ 1423 netmap_update_config(na); 1424 priv->np_na = na; /* store the reference */ 1425 error = netmap_set_ringid(priv, ringid); 1426 if (error) 1427 goto out; 1428 /* ensure allocators are ready */ 1429 need_mem = !netmap_have_memory_locked(priv); 1430 if (need_mem) { 1431 error = netmap_get_memory_locked(priv); 1432 ND("get_memory returned %d", error); 1433 if (error) 1434 goto out; 1435 } 1436 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1437 if (nifp == NULL) { /* allocation failed */ 1438 /* we should drop the allocator, but only 1439 * if we were the ones who grabbed it 1440 */ 1441 error = ENOMEM; 1442 goto out; 1443 } 1444 na->active_fds++; 1445 if (ifp->if_capenable & IFCAP_NETMAP) { 1446 /* was already set */ 1447 } else { 1448 /* Otherwise set the card in netmap mode 1449 * and make it use the shared buffers. 1450 * 1451 * do not core lock because the race is harmless here, 1452 * there cannot be any traffic to netmap_transmit() 1453 */ 1454 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1455 ND("%p->na_lut == %p", na, na->na_lut); 1456 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1457 error = na->nm_register(na, 1); /* mode on */ 1458 if (error) { 1459 netmap_do_unregif(priv, nifp); 1460 nifp = NULL; 1461 } 1462 } 1463 out: 1464 *err = error; 1465 if (error) { 1466 priv->np_na = NULL; 1467 if (need_mem) 1468 netmap_drop_memory_locked(priv); 1469 } 1470 if (nifp != NULL) { 1471 /* 1472 * advertise that the interface is ready bt setting ni_nifp. 1473 * The barrier is needed because readers (poll and *SYNC) 1474 * check for priv->np_nifp != NULL without locking 1475 */ 1476 wmb(); /* make sure previous writes are visible to all CPUs */ 1477 priv->np_nifp = nifp; 1478 } 1479 return nifp; 1480 } 1481 1482 1483 1484 /* 1485 * ioctl(2) support for the "netmap" device. 1486 * 1487 * Following a list of accepted commands: 1488 * - NIOCGINFO 1489 * - SIOCGIFADDR just for convenience 1490 * - NIOCREGIF 1491 * - NIOCTXSYNC 1492 * - NIOCRXSYNC 1493 * 1494 * Return 0 on success, errno otherwise. 1495 */ 1496 int 1497 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 1498 int fflag, struct thread *td) 1499 { 1500 struct netmap_priv_d *priv = NULL; 1501 struct ifnet *ifp = NULL; 1502 struct nmreq *nmr = (struct nmreq *) data; 1503 struct netmap_adapter *na = NULL; 1504 int error; 1505 u_int i, lim; 1506 struct netmap_if *nifp; 1507 struct netmap_kring *krings; 1508 1509 (void)dev; /* UNUSED */ 1510 (void)fflag; /* UNUSED */ 1511 #ifdef linux 1512 #define devfs_get_cdevpriv(pp) \ 1513 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 1514 (*pp ? 0 : ENOENT); }) 1515 1516 /* devfs_set_cdevpriv cannot fail on linux */ 1517 #define devfs_set_cdevpriv(p, fn) \ 1518 ({ ((struct file *)td)->private_data = p; (p ? 
0 : EINVAL); }) 1519 1520 1521 #define devfs_clear_cdevpriv() do { \ 1522 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 1523 } while (0) 1524 #endif /* linux */ 1525 1526 if (cmd == NIOCGINFO || cmd == NIOCREGIF) { 1527 /* truncate name */ 1528 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; 1529 if (nmr->nr_version != NETMAP_API) { 1530 D("API mismatch for %s got %d need %d", 1531 nmr->nr_name, 1532 nmr->nr_version, NETMAP_API); 1533 nmr->nr_version = NETMAP_API; 1534 return EINVAL; 1535 } 1536 } 1537 CURVNET_SET(TD_TO_VNET(td)); 1538 1539 error = devfs_get_cdevpriv((void **)&priv); 1540 if (error) { 1541 CURVNET_RESTORE(); 1542 /* XXX ENOENT should be impossible, since the priv 1543 * is now created in the open */ 1544 return (error == ENOENT ? ENXIO : error); 1545 } 1546 1547 switch (cmd) { 1548 case NIOCGINFO: /* return capabilities etc */ 1549 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 1550 error = netmap_bdg_ctl(nmr, NULL); 1551 break; 1552 } 1553 1554 NMG_LOCK(); 1555 do { 1556 /* memsize is always valid */ 1557 struct netmap_mem_d *nmd = &nm_mem; 1558 u_int memflags; 1559 1560 if (nmr->nr_name[0] != '\0') { 1561 /* get a refcount */ 1562 error = netmap_get_na(nmr, &na, 1 /* create */); 1563 if (error) 1564 break; 1565 nmd = na->nm_mem; /* get memory allocator */ 1566 } 1567 1568 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 1569 if (error) 1570 break; 1571 if (na == NULL) /* only memory info */ 1572 break; 1573 nmr->nr_offset = 0; 1574 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 1575 netmap_update_config(na); 1576 nmr->nr_rx_rings = na->num_rx_rings; 1577 nmr->nr_tx_rings = na->num_tx_rings; 1578 nmr->nr_rx_slots = na->num_rx_desc; 1579 nmr->nr_tx_slots = na->num_tx_desc; 1580 if (memflags & NETMAP_MEM_PRIVATE) 1581 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1582 netmap_adapter_put(na); 1583 } while (0); 1584 NMG_UNLOCK(); 1585 break; 1586 1587 case NIOCREGIF: 1588 /* possibly attach/detach NIC and VALE switch */ 1589 i = nmr->nr_cmd; 1590 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 1591 || i == NETMAP_BDG_OFFSET) { 1592 error = netmap_bdg_ctl(nmr, NULL); 1593 break; 1594 } else if (i != 0) { 1595 D("nr_cmd must be 0 not %d", i); 1596 error = EINVAL; 1597 break; 1598 } 1599 1600 /* protect access to priv from concurrent NIOCREGIF */ 1601 NMG_LOCK(); 1602 do { 1603 u_int memflags; 1604 1605 if (priv->np_na != NULL) { /* thread already registered */ 1606 error = netmap_set_ringid(priv, nmr->nr_ringid); 1607 break; 1608 } 1609 /* find the interface and a reference */ 1610 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 1611 if (error) 1612 break; 1613 ifp = na->ifp; 1614 if (NETMAP_OWNED_BY_KERN(na)) { 1615 netmap_adapter_put(na); 1616 error = EBUSY; 1617 break; 1618 } 1619 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); 1620 if (!nifp) { /* reg. 
failed, release priv and ref */ 1621 netmap_adapter_put(na); 1622 priv->np_nifp = NULL; 1623 break; 1624 } 1625 1626 /* return the offset of the netmap_if object */ 1627 nmr->nr_rx_rings = na->num_rx_rings; 1628 nmr->nr_tx_rings = na->num_tx_rings; 1629 nmr->nr_rx_slots = na->num_rx_desc; 1630 nmr->nr_tx_slots = na->num_tx_desc; 1631 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 1632 if (error) { 1633 netmap_adapter_put(na); 1634 break; 1635 } 1636 if (memflags & NETMAP_MEM_PRIVATE) { 1637 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1638 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 1639 } 1640 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 1641 } while (0); 1642 NMG_UNLOCK(); 1643 break; 1644 1645 case NIOCTXSYNC: 1646 case NIOCRXSYNC: 1647 nifp = priv->np_nifp; 1648 1649 if (nifp == NULL) { 1650 error = ENXIO; 1651 break; 1652 } 1653 rmb(); /* make sure following reads are not from cache */ 1654 1655 na = priv->np_na; /* we have a reference */ 1656 1657 if (na == NULL) { 1658 D("Internal error: nifp != NULL && na == NULL"); 1659 error = ENXIO; 1660 break; 1661 } 1662 1663 ifp = na->ifp; 1664 if (ifp == NULL) { 1665 RD(1, "the ifp is gone"); 1666 error = ENXIO; 1667 break; 1668 } 1669 1670 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 1671 if (cmd == NIOCTXSYNC) 1672 netmap_txsync_to_host(na); 1673 else 1674 netmap_rxsync_from_host(na, NULL, NULL); 1675 break; 1676 } 1677 /* find the last ring to scan */ 1678 lim = priv->np_qlast; 1679 if (lim == NETMAP_HW_RING) 1680 lim = (cmd == NIOCTXSYNC) ? 1681 na->num_tx_rings : na->num_rx_rings; 1682 1683 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 1684 for (i = priv->np_qfirst; i < lim; i++) { 1685 struct netmap_kring *kring = krings + i; 1686 if (nm_kr_tryget(kring)) { 1687 error = EBUSY; 1688 goto out; 1689 } 1690 if (cmd == NIOCTXSYNC) { 1691 if (netmap_verbose & NM_VERB_TXSYNC) 1692 D("pre txsync ring %d cur %d hwcur %d", 1693 i, kring->ring->cur, 1694 kring->nr_hwcur); 1695 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 1696 netmap_ring_reinit(kring); 1697 } else { 1698 na->nm_txsync(na, i, NAF_FORCE_RECLAIM); 1699 } 1700 if (netmap_verbose & NM_VERB_TXSYNC) 1701 D("post txsync ring %d cur %d hwcur %d", 1702 i, kring->ring->cur, 1703 kring->nr_hwcur); 1704 } else { 1705 na->nm_rxsync(na, i, NAF_FORCE_READ); 1706 microtime(&na->rx_rings[i].ring->ts); 1707 } 1708 nm_kr_put(kring); 1709 } 1710 1711 break; 1712 1713 #ifdef __FreeBSD__ 1714 case BIOCIMMEDIATE: 1715 case BIOCGHDRCMPLT: 1716 case BIOCSHDRCMPLT: 1717 case BIOCSSEESENT: 1718 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 1719 break; 1720 1721 default: /* allow device-specific ioctls */ 1722 { 1723 struct socket so; 1724 1725 bzero(&so, sizeof(so)); 1726 NMG_LOCK(); 1727 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 1728 if (error) { 1729 netmap_adapter_put(na); 1730 NMG_UNLOCK(); 1731 break; 1732 } 1733 ifp = na->ifp; 1734 so.so_vnet = ifp->if_vnet; 1735 // so->so_proto not null. 1736 error = ifioctl(&so, cmd, data, td); 1737 netmap_adapter_put(na); 1738 NMG_UNLOCK(); 1739 break; 1740 } 1741 1742 #else /* linux */ 1743 default: 1744 error = EOPNOTSUPP; 1745 #endif /* linux */ 1746 } 1747 out: 1748 1749 CURVNET_RESTORE(); 1750 return (error); 1751 } 1752 1753 1754 /* 1755 * select(2) and poll(2) handlers for the "netmap" device. 1756 * 1757 * Can be called for one or more queues. 1758 * Return true the event mask corresponding to ready events. 
1759 * If there are no ready events, do a selrecord on either individual 1760 * selinfo or on the global one. 1761 * Device-dependent parts (locking and sync of tx/rx rings) 1762 * are done through callbacks. 1763 * 1764 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1765 * The first one is remapped to pwait as selrecord() uses the name as an 1766 * hidden argument. 1767 */ 1768 int 1769 netmap_poll(struct cdev *dev, int events, struct thread *td) 1770 { 1771 struct netmap_priv_d *priv = NULL; 1772 struct netmap_adapter *na; 1773 struct ifnet *ifp; 1774 struct netmap_kring *kring; 1775 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 1776 u_int lim_tx, lim_rx; 1777 struct mbq q; /* packets from hw queues to host stack */ 1778 void *pwait = dev; /* linux compatibility */ 1779 1780 /* 1781 * In order to avoid nested locks, we need to "double check" 1782 * txsync and rxsync if we decide to do a selrecord(). 1783 * retry_tx (and retry_rx, later) prevent looping forever. 1784 */ 1785 int retry_tx = 1, retry_rx = 1; 1786 1787 (void)pwait; 1788 mbq_init(&q); 1789 1790 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 1791 return POLLERR; 1792 1793 if (priv->np_nifp == NULL) { 1794 D("No if registered"); 1795 return POLLERR; 1796 } 1797 rmb(); /* make sure following reads are not from cache */ 1798 1799 na = priv->np_na; 1800 ifp = na->ifp; 1801 // check for deleted 1802 if (ifp == NULL) { 1803 RD(1, "the ifp is gone"); 1804 return POLLERR; 1805 } 1806 1807 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1808 return POLLERR; 1809 1810 if (netmap_verbose & 0x8000) 1811 D("device %s events 0x%x", NM_IFPNAME(ifp), events); 1812 want_tx = events & (POLLOUT | POLLWRNORM); 1813 want_rx = events & (POLLIN | POLLRDNORM); 1814 1815 lim_tx = na->num_tx_rings; 1816 lim_rx = na->num_rx_rings; 1817 1818 if (priv->np_qfirst == NETMAP_SW_RING) { 1819 // XXX locking ? 1820 /* handle the host stack ring */ 1821 if (priv->np_txpoll || want_tx) { 1822 /* push any packets up, then we are always ready */ 1823 netmap_txsync_to_host(na); 1824 revents |= want_tx; 1825 } 1826 if (want_rx) { 1827 kring = &na->rx_rings[lim_rx]; 1828 /* XXX replace with rxprologue etc. */ 1829 if (nm_ring_empty(kring->ring)) 1830 netmap_rxsync_from_host(na, td, dev); 1831 if (!nm_ring_empty(kring->ring)) 1832 revents |= want_rx; 1833 } 1834 return (revents); 1835 } 1836 1837 1838 /* 1839 * check_all_{tx|rx} are set if the card has more than one queue AND 1840 * the file descriptor is bound to all of them. If so, we sleep on 1841 * the "global" selinfo, otherwise we sleep on individual selinfo 1842 * (FreeBSD only allows two selinfo's per file descriptor). 1843 * The interrupt routine in the driver wake one or the other 1844 * (or both) depending on which clients are active. 1845 * 1846 * rxsync() is only called if we run out of buffers on a POLLIN. 1847 * txsync() is called if we run out of buffers on POLLOUT, or 1848 * there are pending packets to send. The latter can be disabled 1849 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 1850 */ 1851 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 1852 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 1853 1854 if (priv->np_qlast != NETMAP_HW_RING) { 1855 lim_tx = lim_rx = priv->np_qlast; 1856 } 1857 1858 /* 1859 * We start with a lock free round which is cheap if we have 1860 * slots available. If this fails, then lock and call the sync 1861 * routines. 
1862 */ 1863 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 1864 kring = &na->rx_rings[i]; 1865 /* XXX compare ring->cur and kring->tail */ 1866 if (!nm_ring_empty(kring->ring)) { 1867 revents |= want_rx; 1868 want_rx = 0; /* also breaks the loop */ 1869 } 1870 } 1871 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 1872 kring = &na->tx_rings[i]; 1873 /* XXX compare ring->cur and kring->tail */ 1874 if (!nm_ring_empty(kring->ring)) { 1875 revents |= want_tx; 1876 want_tx = 0; /* also breaks the loop */ 1877 } 1878 } 1879 1880 /* 1881 * If we want to push packets out (priv->np_txpoll) or 1882 * want_tx is still set, we must issue txsync calls 1883 * (on all rings, to avoid that the tx rings stall). 1884 * XXX should also check cur != hwcur on the tx rings. 1885 * Fortunately, normal tx mode has np_txpoll set. 1886 */ 1887 if (priv->np_txpoll || want_tx) { 1888 /* 1889 * The first round checks if anyone is ready, if not 1890 * do a selrecord and another round to handle races. 1891 * want_tx goes to 0 if any space is found, and is 1892 * used to skip rings with no pending transmissions. 1893 */ 1894 flush_tx: 1895 for (i = priv->np_qfirst; i < lim_tx; i++) { 1896 int found = 0; 1897 1898 kring = &na->tx_rings[i]; 1899 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 1900 continue; 1901 /* only one thread does txsync */ 1902 if (nm_kr_tryget(kring)) { 1903 D("%p lost race on txring %d, ok", priv, i); 1904 continue; 1905 } 1906 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 1907 netmap_ring_reinit(kring); 1908 revents |= POLLERR; 1909 } else { 1910 if (na->nm_txsync(na, i, 0)) 1911 revents |= POLLERR; 1912 } 1913 1914 /* 1915 * If we found new slots, notify potential 1916 * listeners on the same ring. 1917 * Since we just did a txsync, look at the copies 1918 * of cur,tail in the kring. 1919 */ 1920 found = kring->rcur != kring->rtail; 1921 nm_kr_put(kring); 1922 if (found) { /* notify other listeners */ 1923 revents |= want_tx; 1924 want_tx = 0; 1925 na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY); 1926 } 1927 } 1928 if (want_tx && retry_tx) { 1929 selrecord(td, check_all_tx ? 1930 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 1931 retry_tx = 0; 1932 goto flush_tx; 1933 } 1934 } 1935 1936 /* 1937 * If want_rx is still set scan receive rings. 1938 * Do it on all rings because otherwise we starve. 1939 */ 1940 if (want_rx) { 1941 int send_down = 0; /* transparent mode */ 1942 /* two rounds here to for race avoidance */ 1943 do_retry_rx: 1944 for (i = priv->np_qfirst; i < lim_rx; i++) { 1945 int found = 0; 1946 1947 kring = &na->rx_rings[i]; 1948 1949 if (nm_kr_tryget(kring)) { 1950 D("%p lost race on rxring %d, ok", priv, i); 1951 continue; 1952 } 1953 1954 /* 1955 * transparent mode support: collect packets 1956 * from the rxring(s). 
			 * XXX NR_FORWARD should only be read on
			 * physical or NIC ports
			 */
			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
				ND(10, "forwarding some buffers up %d to %d",
				    kring->nr_hwcur, kring->ring->cur);
				netmap_grab_packets(kring, &q, netmap_fwd);
			}

			if (na->nm_rxsync(na, i, 0))
				revents |= POLLERR;
			if (netmap_no_timestamp == 0 ||
			    kring->ring->flags & NR_TIMESTAMP) {
				microtime(&kring->ring->ts);
			}
			/* after an rxsync we can use kring->rcur, rtail */
			found = kring->rcur != kring->rtail;
			nm_kr_put(kring);
			if (found) {
				revents |= want_rx;
				retry_rx = 0;
				na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY);
			}
		}

		/* transparent mode XXX only during first pass ? */
		kring = &na->rx_rings[lim_rx];
		if (check_all_rx
		    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
			/* XXX fix to use kring fields */
			if (nm_ring_empty(kring->ring))
				send_down = netmap_rxsync_from_host(na, td, dev);
			if (!nm_ring_empty(kring->ring))
				revents |= want_rx;
		}

		if (retry_rx)
			selrecord(td, check_all_rx ?
			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
		if (send_down > 0 || retry_rx) {
			retry_rx = 0;
			if (send_down)
				goto flush_tx; /* and retry_rx */
			else
				goto do_retry_rx;
		}
	}

	/*
	 * Transparent mode: marked bufs on rx rings between
	 * kring->nr_hwcur and ring->head
	 * are passed to the other endpoint.
	 *
	 * In this mode we also scan the sw rxring, which in
	 * turn passes packets up.
	 *
	 * XXX Transparent mode at the moment requires binding all
	 * rings to a single file descriptor.
	 */

	if (q.head)
		netmap_send_up(na->ifp, &q);

	return (revents);
}

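/*
 * Userspace view of the protocol implemented by netmap_poll() above:
 * a minimal, illustrative sketch (not compiled here; it belongs in a
 * userspace program that includes <poll.h> and <net/netmap_user.h>).
 * It assumes 'fd' is already bound to the first hw ring pair with
 * NIOCREGIF and 'nifp' was obtained via mmap() and NETMAP_IF();
 * error handling is omitted.
 */
#if 0
static void
example_poll_loop(int fd, struct netmap_if *nifp)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);
	struct netmap_ring *txring = NETMAP_TXRING(nifp, 0);

	for (;;) {
		/*
		 * The kernel runs rxsync only if the rx ring was empty,
		 * and txsync unless NETMAP_NO_TX_POLL was passed at
		 * NIOCREGIF time (see the comments in netmap_poll()).
		 */
		poll(&pfd, 1, 1000);

		while (!nm_ring_empty(rxring)) {	/* drain received slots */
			uint32_t i = rxring->cur;
			char *buf = NETMAP_BUF(rxring, rxring->slot[i].buf_idx);

			/* ... consume rxring->slot[i].len bytes at buf ... */
			(void)buf;
			rxring->head = rxring->cur = nm_ring_next(rxring, i);
		}

		if (nm_ring_space(txring)) {
			/* ... fill txring->slot[txring->cur], then advance
			 * txring->head and txring->cur as above ... */
		}
	}
}
#endif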

/*-------------------- driver support routines -------------------*/

static int netmap_hw_krings_create(struct netmap_adapter *);

static int
netmap_notify(struct netmap_adapter *na, u_int n_ring,
	enum txrx tx, int flags)
{
	struct netmap_kring *kring;

	if (tx == NR_TX) {
		kring = na->tx_rings + n_ring;
		selwakeuppri(&kring->si, PI_NET);
		if (flags & NAF_GLOBAL_NOTIFY)
			selwakeuppri(&na->tx_si, PI_NET);
	} else {
		kring = na->rx_rings + n_ring;
		selwakeuppri(&kring->si, PI_NET);
		if (flags & NAF_GLOBAL_NOTIFY)
			selwakeuppri(&na->rx_si, PI_NET);
	}
	return 0;
}


// XXX check handling of failures
int
netmap_attach_common(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
		D("%s: invalid rings tx %d rx %d",
			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
		return EINVAL;
	}
	WNA(ifp) = na;

	/* the following is only needed for na that use the host port.
	 * XXX do we have something similar for linux ?
	 */
#ifdef __FreeBSD__
	na->if_input = ifp->if_input; /* for netmap_send_up */
#endif /* __FreeBSD__ */

	NETMAP_SET_CAPABLE(ifp);
	if (na->nm_krings_create == NULL) {
		na->nm_krings_create = netmap_hw_krings_create;
		na->nm_krings_delete = netmap_hw_krings_delete;
	}
	if (na->nm_notify == NULL)
		na->nm_notify = netmap_notify;
	na->active_fds = 0;

	if (na->nm_mem == NULL)
		na->nm_mem = &nm_mem;
	return 0;
}


void
netmap_detach_common(struct netmap_adapter *na)
{
	if (na->ifp)
		WNA(na->ifp) = NULL; /* XXX do we need this? */

	if (na->tx_rings) { /* XXX should not happen */
		D("freeing leftover tx_rings");
		na->nm_krings_delete(na);
	}
	if (na->na_flags & NAF_MEM_OWNER)
		netmap_mem_private_delete(na->nm_mem);
	bzero(na, sizeof(*na));
	free(na, M_DEVBUF);
}


/*
 * Initialize a ``netmap_adapter`` object created by driver on attach.
 * We allocate a block of memory with room for a struct netmap_adapter
 * plus two sets of N+2 struct netmap_kring (where N is the number
 * of hardware rings):
 * krings 0..N-1 are for the hardware queues.
 * kring N is for the host stack queue
 * kring N+1 is only used for the selinfo for all queues. // XXX still true ?
 * Return 0 on success, ENOMEM otherwise.
 */
int
netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_hw_adapter *hwna = NULL;
	// XXX when is arg == NULL ?
	struct ifnet *ifp = arg ? arg->ifp : NULL;

	if (arg == NULL || ifp == NULL)
		goto fail;
	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (hwna == NULL)
		goto fail;
	hwna->up = *arg;
	if (netmap_attach_common(&hwna->up)) {
		free(hwna, M_DEVBUF);
		goto fail;
	}
	netmap_adapter_get(&hwna->up);

#ifdef linux
	if (ifp->netdev_ops) {
		/* prepare a clone of the netdev ops */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
#else
		hwna->nm_ndo = *ifp->netdev_ops;
#endif
	}
	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
#endif /* linux */

	D("success for %s", NM_IFPNAME(ifp));
	return 0;

fail:
	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
	netmap_detach(ifp);
	return (hwna ? EINVAL : ENOMEM);
}

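/*
 * A minimal sketch of the driver side of netmap_attach(), modeled on
 * the netmap patches for the Intel drivers; it is illustrative only
 * (not compiled here) and the 'foo_softc' fields and foo_netmap_*
 * callbacks are placeholders.
 */
#if 0
static void
foo_netmap_attach(struct foo_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;	/* slots per hw ring */
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_txsync = foo_netmap_txsync;	/* called from poll/ioctl */
	na.nm_rxsync = foo_netmap_rxsync;
	na.nm_register = foo_netmap_reg;	/* enter/exit netmap mode */
	netmap_attach(&na);	/* copies 'na' into a new netmap_hw_adapter */
}
#endif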

void
NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
{
	if (!na) {
		return;
	}

	refcount_acquire(&na->na_refcount);
}


/* returns 1 iff the netmap_adapter is destroyed */
int
NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
{
	if (!na)
		return 1;

	if (!refcount_release(&na->na_refcount))
		return 0;

	if (na->nm_dtor)
		na->nm_dtor(na);

	netmap_detach_common(na);

	return 1;
}


int
netmap_hw_krings_create(struct netmap_adapter *na)
{
	int ret = netmap_krings_create(na,
		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
	if (ret == 0) {
		/* initialize the mbq for the sw rx ring */
		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
		ND("initialized sw rx queue %d", na->num_rx_rings);
	}
	return ret;
}



/*
 * Free the allocated memory linked to the given ``netmap_adapter``
 * object.
 */
void
netmap_detach(struct ifnet *ifp)
{
	struct netmap_adapter *na = NA(ifp);

	if (!na)
		return;

	NMG_LOCK();
	netmap_disable_all_rings(ifp);
	if (!netmap_adapter_put(na)) {
		/* someone is still using the adapter,
		 * tell them that the interface is gone
		 */
		na->ifp = NULL;
		/* give them a chance to notice */
		netmap_enable_all_rings(ifp);
	}
	NMG_UNLOCK();
}


/*
 * Intercept packets from the network stack and pass them
 * to netmap as incoming packets on the 'software' ring.
 *
 * We only store packets in a bounded mbq and then copy them
 * in the relevant rxsync routine.
 *
 * We rely on the OS to make sure that the ifp and na do not go
 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
 * In nm_register() or whenever there is a reinitialization,
 * we make sure to make the mode change visible here.
 */
int
netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;
	u_int len = MBUF_LEN(m);
	u_int error = ENOBUFS;
	struct mbq *q;
	int space;

	// XXX [Linux] we do not need this lock
	// if we follow the down/configure/up protocol -gl
	// mtx_lock(&na->core_lock);

	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
		error = ENXIO;
		goto done;
	}

	kring = &na->rx_rings[na->num_rx_rings];
	q = &kring->rx_queue;

	// XXX reconsider long packets if we handle fragments
	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
		goto done;
	}

	/* protect against rxsync_from_host(), netmap_sw_to_nic()
	 * and maybe other instances of netmap_transmit (the latter
	 * not possible on Linux).
	 * Also avoid overflowing the queue.
	 */
	mtx_lock(&q->lock);

	space = kring->nr_hwtail - kring->nr_hwcur;
	if (space < 0)
		space += kring->nkr_num_slots;
	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
			NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
			len, m);
	} else {
		mbq_enqueue(q, m);
		ND(10, "%s %d bufs in queue len %d m %p",
			NM_IFPNAME(ifp), mbq_len(q), len, m);
		/* notify outside the lock */
		m = NULL;
		error = 0;
	}
	mtx_unlock(&q->lock);

done:
	if (m)
		m_freem(m);
	/* unconditionally wake up listeners */
	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);

	return (error);
}

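/*
 * Worked example (with made-up numbers) for the queue-full check in
 * netmap_transmit() above: on a 256-slot host rx kring with
 * nr_hwcur = 10 and nr_hwtail = 5 the ring has wrapped, so
 *
 *	space = 5 - 10 = -5  ->  space += 256  ->  251
 *
 * i.e. 251 slots already hold data not yet released by userspace.
 * With 4 mbufs already queued, 251 + 4 >= 255 is true and the new
 * mbuf is dropped instead of overflowing the ring.
 */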

/*
 * netmap_reset() is called by the driver routines when reinitializing
 * a ring. The driver is in charge of locking to protect the kring.
 * If native netmap mode is not set just return NULL.
 */
struct netmap_slot *
netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
	u_int new_cur)
{
	struct netmap_kring *kring;
	int new_hwofs, lim;

	if (na == NULL) {
		D("NULL na, should not happen");
		return NULL;	/* no netmap support here */
	}
	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
		ND("interface not in netmap mode");
		return NULL;	/* nothing to reinitialize */
	}

	/* XXX note: in the new scheme, we are not guaranteed to be
	 * under lock (e.g. when called on a device reset).
	 * In this case, we should set a flag and not trust the
	 * values too much. In practice: TODO
	 * - set a RESET flag somewhere in the kring
	 * - do the processing in a conservative way
	 * - let the *sync() routines fix things up at the end.
	 */
	if (tx == NR_TX) {
		if (n >= na->num_tx_rings)
			return NULL;
		kring = na->tx_rings + n;
		// XXX check whether we should use hwcur or rcur
		new_hwofs = kring->nr_hwcur - new_cur;
	} else {
		if (n >= na->num_rx_rings)
			return NULL;
		kring = na->rx_rings + n;
		new_hwofs = kring->nr_hwtail - new_cur;
	}
	lim = kring->nkr_num_slots - 1;
	if (new_hwofs > lim)
		new_hwofs -= lim + 1;

	/* Always set the new offset value and realign the ring. */
	if (netmap_verbose)
	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
		NM_IFPNAME(na->ifp),
		tx == NR_TX ? "TX" : "RX", n,
		kring->nkr_hwofs, new_hwofs,
		kring->nr_hwtail,
		tx == NR_TX ? lim : kring->nr_hwtail);
	kring->nkr_hwofs = new_hwofs;
	if (tx == NR_TX) {
		kring->nr_hwtail = kring->nr_hwcur + lim;
		if (kring->nr_hwtail > lim)
			kring->nr_hwtail -= lim + 1;
	}

#if 0 // def linux
	/* XXX check that the mappings are correct */
	/* need ring_nr, adapter->pdev, direction */
	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
		D("error mapping rx netmap buffer %d", i);
		// XXX fix error handling
	}

#endif /* linux */
	/*
	 * Wake up on the individual and global selwait.
	 * We do the wakeup here, but the ring is not yet reconfigured.
	 * However, we are under lock so there are no races.
	 */
	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
	return kring->ring->slot;
}

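/*
 * Sketch of the typical netmap_reset() call site in a driver's TX ring
 * init path, modeled on the netmap patches for the Intel drivers; it is
 * illustrative only (not compiled here) and 'foo_softc', 'foo_txring'
 * and the descriptor layout are placeholders.
 */
#if 0
static void
foo_netmap_txring_init(struct foo_softc *sc, struct foo_txring *txr)
{
	struct netmap_adapter *na = NA(sc->ifp);
	struct netmap_slot *slot = netmap_reset(na, NR_TX, txr->me, 0);
	int l;

	if (slot == NULL)	/* not in native netmap mode */
		return;
	for (l = 0; l < na->num_tx_desc; l++) {
		/* netmap slot backing the l-th hardware descriptor */
		int si = netmap_idx_n2k(&na->tx_rings[txr->me], l);
		uint64_t paddr;

		PNMB(slot + si, &paddr);
		/* point the l-th descriptor at the netmap buffer */
		txr->base[l].buffer_addr = htole64(paddr);
	}
}
#endif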

/*
 * Dispatch rx/tx interrupts to the netmap rings.
 *
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * The 'notify' routine depends on what the ring is attached to.
 * - for a netmap file descriptor, do a selwakeup on the individual
 *   waitqueue, plus one on the global one if needed
 * - for a switch, call the proper forwarding routine
 * - XXX more ?
 */
void
netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;

	q &= NETMAP_RING_MASK;

	if (netmap_verbose) {
		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
	}

	if (work_done) { /* RX path */
		if (q >= na->num_rx_rings)
			return;	// not a physical queue
		kring = na->rx_rings + q;
		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
		na->nm_notify(na, q, NR_RX,
			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
		*work_done = 1; /* do not fire napi again */
	} else { /* TX path */
		if (q >= na->num_tx_rings)
			return;	// not a physical queue
		kring = na->tx_rings + q;
		na->nm_notify(na, q, NR_TX,
			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
	}
}


/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 *
 * If the card is not in netmap mode, simply return 0,
 * so that the caller proceeds with regular processing.
 * Otherwise call netmap_common_irq() and return 1.
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return 1.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * call the proper forwarding routine and return 1.
 */
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	// XXX could we check NAF_NATIVE_ON ?
	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
		ND("use regular interrupt");
		return 0;
	}

	netmap_common_irq(ifp, q, work_done);
	return 1;
}


/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. Returns 0 on success, an errno on failure
 * (though failure is not expected).
 *
 * netmap_fini() destroys everything.
 */

static struct cdev *netmap_dev; /* /dev/netmap character device. */
extern struct cdevsw netmap_cdevsw;


void
netmap_fini(void)
{
	// XXX destroy_bridges() ?
	if (netmap_dev)
		destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	printf("netmap: unloaded module.\n");
}


int
netmap_init(void)
{
	int error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0)
		goto fail;
	/* XXX could use make_dev_credv() to get error number */
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
			      "netmap");
	if (!netmap_dev)
		goto fail;

	netmap_init_bridges();
	printf("netmap: loaded module\n");
	return (0);
fail:
	netmap_fini();
	return (EINVAL); /* may be incorrect */
}
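
/*
 * Sketch of how a hypothetical driver hands its interrupts to netmap,
 * following the description above netmap_rx_irq(); illustrative only
 * (not compiled here), with 'foo_softc' and the ring structures as
 * placeholders.
 */
#if 0
static void
foo_rxeof(struct foo_softc *sc, struct foo_rxring *rxr)
{
	u_int work_done;

	/* netmap mode: just wake up the pollers and skip the mbuf path */
	if (netmap_rx_irq(sc->ifp, rxr->me, &work_done))
		return;
	/* ... regular mbuf-based rx processing ... */
}

static void
foo_txeof(struct foo_softc *sc, struct foo_txring *txr)
{
	/* TX path: work_done is NULL, see netmap_common_irq() */
	if (netmap_rx_irq(sc->ifp, txr->me, NULL))
		return;
	/* ... regular transmit completion processing ... */
}
#endif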