/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * $FreeBSD$
 *
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *    A minimal usage sketch is shown after the SYNCHRONIZATION notes below.
 *

SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this; it only guarantees that invalid usage will not crash the
system.
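
As an illustration of steps 1-6 above, here is a minimal userspace
transmit sketch. It is not part of this module: error handling is
omitted, "em0" is just an example interface name, build_packet() is a
hypothetical helper, and the exact nmreq fields depend on the API
version in use.

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <net/netmap_user.h>	// NETMAP_IF(), NETMAP_TXRING(), ...

	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *txring;
	struct netmap_slot *slot;
	char *mem, *buf;
	int fd;
	uint32_t i;

	fd = open("/dev/netmap", O_RDWR);
	memset(&req, 0, sizeof(req));
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	req.nr_flags = NR_REG_ALL_NIC;		// bind all hardware rings
	ioctl(fd, NIOCREGIF, &req);		// switch the NIC to netmap mode
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
		MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, req.nr_offset);
	txring = NETMAP_TXRING(nifp, 0);	// first hardware tx ring

	while (!nm_ring_empty(txring)) {	// fill the available slots
		i = txring->cur;
		slot = &txring->slot[i];
		buf = NETMAP_BUF(txring, slot->buf_idx);
		slot->len = build_packet(buf);	// hypothetical, <= buffer size
		txring->head = txring->cur = nm_ring_next(txring, i);
	}
	ioctl(fd, NIOCTXSYNC, NULL);		// push the frames to the NIC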

LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initialization of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
129 * Other OS-specific code that must be accessed by drivers 130 * is present in netmap_kern.h 131 */ 132 133 #if defined(__FreeBSD__) 134 #include <sys/cdefs.h> /* prerequisite */ 135 #include <sys/types.h> 136 #include <sys/errno.h> 137 #include <sys/param.h> /* defines used in kernel.h */ 138 #include <sys/kernel.h> /* types used in module initialization */ 139 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 140 #include <sys/sockio.h> 141 #include <sys/socketvar.h> /* struct socket */ 142 #include <sys/malloc.h> 143 #include <sys/poll.h> 144 #include <sys/rwlock.h> 145 #include <sys/socket.h> /* sockaddrs */ 146 #include <sys/selinfo.h> 147 #include <sys/sysctl.h> 148 #include <sys/jail.h> 149 #include <net/vnet.h> 150 #include <net/if.h> 151 #include <net/if_var.h> 152 #include <net/bpf.h> /* BIOCIMMEDIATE */ 153 #include <machine/bus.h> /* bus_dmamap_* */ 154 #include <sys/endian.h> 155 #include <sys/refcount.h> 156 157 158 /* reduce conditional code */ 159 // linux API, use for the knlist in FreeBSD 160 #define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL) 161 162 void freebsd_selwakeup(struct selinfo *si, int pri); 163 #define OS_selwakeup(a, b) freebsd_selwakeup(a, b) 164 165 #elif defined(linux) 166 167 #include "bsd_glue.h" 168 169 170 171 #elif defined(__APPLE__) 172 173 #warning OSX support is only partial 174 #include "osx_glue.h" 175 176 #else 177 178 #error Unsupported platform 179 180 #endif /* unsupported */ 181 182 /* 183 * common headers 184 */ 185 #include <net/netmap.h> 186 #include <dev/netmap/netmap_kern.h> 187 #include <dev/netmap/netmap_mem2.h> 188 189 190 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 191 192 /* 193 * The following variables are used by the drivers and replicate 194 * fields in the global memory pool. They only refer to buffers 195 * used by physical interfaces. 196 */ 197 u_int netmap_total_buffers; 198 u_int netmap_buf_size; 199 char *netmap_buffer_base; /* also address of an invalid buffer */ 200 201 /* user-controlled variables */ 202 int netmap_verbose; 203 204 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 205 206 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 207 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 208 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 209 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 210 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 211 int netmap_mitigate = 1; 212 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 213 int netmap_no_pendintr = 1; 214 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 215 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 216 int netmap_txsync_retry = 2; 217 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 218 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 219 220 int netmap_flags = 0; /* debug flags */ 221 int netmap_fwd = 0; /* force transparent mode */ 222 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 223 224 /* 225 * netmap_admode selects the netmap mode to use. 226 * Invalid values are reset to NETMAP_ADMODE_BEST 227 */ 228 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ 229 NETMAP_ADMODE_NATIVE, /* either native or none */ 230 NETMAP_ADMODE_GENERIC, /* force generic */ 231 NETMAP_ADMODE_LAST }; 232 static int netmap_admode = NETMAP_ADMODE_BEST; 233 234 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. 
*/ 235 int netmap_generic_ringsize = 1024; /* Generic ringsize. */ 236 int netmap_generic_rings = 1; /* number of queues in generic. */ 237 238 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 239 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 240 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 241 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); 242 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); 243 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); 244 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); 245 246 NMG_LOCK_T netmap_global_lock; 247 248 249 static void 250 nm_kr_get(struct netmap_kring *kr) 251 { 252 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 253 tsleep(kr, 0, "NM_KR_GET", 4); 254 } 255 256 257 /* 258 * mark the ring as stopped, and run through the locks 259 * to make sure other users get to see it. 260 */ 261 void 262 netmap_disable_ring(struct netmap_kring *kr) 263 { 264 kr->nkr_stopped = 1; 265 nm_kr_get(kr); 266 mtx_lock(&kr->q_lock); 267 mtx_unlock(&kr->q_lock); 268 nm_kr_put(kr); 269 } 270 271 272 static void 273 netmap_set_all_rings(struct ifnet *ifp, int stopped) 274 { 275 struct netmap_adapter *na; 276 int i; 277 u_int ntx, nrx; 278 279 if (!(ifp->if_capenable & IFCAP_NETMAP)) 280 return; 281 282 na = NA(ifp); 283 284 ntx = netmap_real_tx_rings(na); 285 nrx = netmap_real_rx_rings(na); 286 287 for (i = 0; i < ntx; i++) { 288 if (stopped) 289 netmap_disable_ring(na->tx_rings + i); 290 else 291 na->tx_rings[i].nkr_stopped = 0; 292 na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY); 293 } 294 295 for (i = 0; i < nrx; i++) { 296 if (stopped) 297 netmap_disable_ring(na->rx_rings + i); 298 else 299 na->rx_rings[i].nkr_stopped = 0; 300 na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY); 301 } 302 } 303 304 305 void 306 netmap_disable_all_rings(struct ifnet *ifp) 307 { 308 netmap_set_all_rings(ifp, 1 /* stopped */); 309 } 310 311 312 void 313 netmap_enable_all_rings(struct ifnet *ifp) 314 { 315 netmap_set_all_rings(ifp, 0 /* enabled */); 316 } 317 318 319 /* 320 * generic bound_checking function 321 */ 322 u_int 323 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 324 { 325 u_int oldv = *v; 326 const char *op = NULL; 327 328 if (dflt < lo) 329 dflt = lo; 330 if (dflt > hi) 331 dflt = hi; 332 if (oldv < lo) { 333 *v = dflt; 334 op = "Bump"; 335 } else if (oldv > hi) { 336 *v = hi; 337 op = "Clamp"; 338 } 339 if (op && msg) 340 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 341 return *v; 342 } 343 344 345 /* 346 * packet-dump function, user-supplied or static buffer. 347 * The destination buffer must be at least 30+4*len 348 */ 349 const char * 350 nm_dump_buf(char *p, int len, int lim, char *dst) 351 { 352 static char _dst[8192]; 353 int i, j, i0; 354 static char hex[] ="0123456789abcdef"; 355 char *o; /* output position */ 356 357 #define P_HI(x) hex[((x) & 0xf0)>>4] 358 #define P_LO(x) hex[((x) & 0xf)] 359 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? 
(x) : '.') 360 if (!dst) 361 dst = _dst; 362 if (lim <= 0 || lim > len) 363 lim = len; 364 o = dst; 365 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 366 o += strlen(o); 367 /* hexdump routine */ 368 for (i = 0; i < lim; ) { 369 sprintf(o, "%5d: ", i); 370 o += strlen(o); 371 memset(o, ' ', 48); 372 i0 = i; 373 for (j=0; j < 16 && i < lim; i++, j++) { 374 o[j*3] = P_HI(p[i]); 375 o[j*3+1] = P_LO(p[i]); 376 } 377 i = i0; 378 for (j=0; j < 16 && i < lim; i++, j++) 379 o[j + 48] = P_C(p[i]); 380 o[j+48] = '\n'; 381 o += j+49; 382 } 383 *o = '\0'; 384 #undef P_HI 385 #undef P_LO 386 #undef P_C 387 return dst; 388 } 389 390 391 /* 392 * Fetch configuration from the device, to cope with dynamic 393 * reconfigurations after loading the module. 394 */ 395 int 396 netmap_update_config(struct netmap_adapter *na) 397 { 398 struct ifnet *ifp = na->ifp; 399 u_int txr, txd, rxr, rxd; 400 401 txr = txd = rxr = rxd = 0; 402 if (na->nm_config) { 403 na->nm_config(na, &txr, &txd, &rxr, &rxd); 404 } else { 405 /* take whatever we had at init time */ 406 txr = na->num_tx_rings; 407 txd = na->num_tx_desc; 408 rxr = na->num_rx_rings; 409 rxd = na->num_rx_desc; 410 } 411 412 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 413 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 414 return 0; /* nothing changed */ 415 if (netmap_verbose || na->active_fds > 0) { 416 D("stored config %s: txring %d x %d, rxring %d x %d", 417 NM_IFPNAME(ifp), 418 na->num_tx_rings, na->num_tx_desc, 419 na->num_rx_rings, na->num_rx_desc); 420 D("new config %s: txring %d x %d, rxring %d x %d", 421 NM_IFPNAME(ifp), txr, txd, rxr, rxd); 422 } 423 if (na->active_fds == 0) { 424 D("configuration changed (but fine)"); 425 na->num_tx_rings = txr; 426 na->num_tx_desc = txd; 427 na->num_rx_rings = rxr; 428 na->num_rx_desc = rxd; 429 return 0; 430 } 431 D("configuration changed while active, this is bad..."); 432 return 1; 433 } 434 435 static int 436 netmap_txsync_compat(struct netmap_kring *kring, int flags) 437 { 438 struct netmap_adapter *na = kring->na; 439 return na->nm_txsync(na, kring->ring_id, flags); 440 } 441 442 static int 443 netmap_rxsync_compat(struct netmap_kring *kring, int flags) 444 { 445 struct netmap_adapter *na = kring->na; 446 return na->nm_rxsync(na, kring->ring_id, flags); 447 } 448 449 static int 450 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) 451 { 452 (void)flags; 453 netmap_txsync_to_host(kring->na); 454 return 0; 455 } 456 457 static int 458 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) 459 { 460 (void)flags; 461 netmap_rxsync_from_host(kring->na, NULL, NULL); 462 return 0; 463 } 464 465 466 467 /* create the krings array and initialize the fields common to all adapters. 468 * The array layout is this: 469 * 470 * +----------+ 471 * na->tx_rings ----->| | \ 472 * | | } na->num_tx_ring 473 * | | / 474 * +----------+ 475 * | | host tx kring 476 * na->rx_rings ----> +----------+ 477 * | | \ 478 * | | } na->num_rx_rings 479 * | | / 480 * +----------+ 481 * | | host rx kring 482 * +----------+ 483 * na->tailroom ----->| | \ 484 * | | } tailroom bytes 485 * | | / 486 * +----------+ 487 * 488 * Note: for compatibility, host krings are created even when not needed. 489 * The tailroom space is currently used by vale ports for allocating leases. 
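 *
 * As a sketch, the layout above translates into the following relations,
 * mirroring the assignments made in netmap_krings_create() below
 * (illustration only, not an additional API):
 *
 *	na->rx_rings == na->tx_rings + na->num_tx_rings + 1
 *	na->tailroom == na->rx_rings + na->num_rx_rings + 1
 *	&na->tx_rings[na->num_tx_rings]    is the host tx kring
 *	&na->rx_rings[na->num_rx_rings]    is the host rx kring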
490 */ 491 int 492 netmap_krings_create(struct netmap_adapter *na, u_int tailroom) 493 { 494 u_int i, len, ndesc; 495 struct netmap_kring *kring; 496 u_int ntx, nrx; 497 498 /* account for the (possibly fake) host rings */ 499 ntx = na->num_tx_rings + 1; 500 nrx = na->num_rx_rings + 1; 501 502 len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; 503 504 na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); 505 if (na->tx_rings == NULL) { 506 D("Cannot allocate krings"); 507 return ENOMEM; 508 } 509 na->rx_rings = na->tx_rings + ntx; 510 511 /* 512 * All fields in krings are 0 except the one initialized below. 513 * but better be explicit on important kring fields. 514 */ 515 ndesc = na->num_tx_desc; 516 for (i = 0; i < ntx; i++) { /* Transmit rings */ 517 kring = &na->tx_rings[i]; 518 bzero(kring, sizeof(*kring)); 519 kring->na = na; 520 kring->ring_id = i; 521 kring->nkr_num_slots = ndesc; 522 if (i < na->num_tx_rings) { 523 kring->nm_sync = netmap_txsync_compat; // XXX 524 } else if (i == na->num_tx_rings) { 525 kring->nm_sync = netmap_txsync_to_host_compat; 526 } 527 /* 528 * IMPORTANT: Always keep one slot empty. 529 */ 530 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 531 kring->rtail = kring->nr_hwtail = ndesc - 1; 532 snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); 533 ND("ktx %s h %d c %d t %d", 534 kring->name, kring->rhead, kring->rcur, kring->rtail); 535 mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); 536 init_waitqueue_head(&kring->si); 537 } 538 539 ndesc = na->num_rx_desc; 540 for (i = 0; i < nrx; i++) { /* Receive rings */ 541 kring = &na->rx_rings[i]; 542 bzero(kring, sizeof(*kring)); 543 kring->na = na; 544 kring->ring_id = i; 545 kring->nkr_num_slots = ndesc; 546 if (i < na->num_rx_rings) { 547 kring->nm_sync = netmap_rxsync_compat; // XXX 548 } else if (i == na->num_rx_rings) { 549 kring->nm_sync = netmap_rxsync_from_host_compat; 550 } 551 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 552 kring->rtail = kring->nr_hwtail = 0; 553 snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); 554 ND("krx %s h %d c %d t %d", 555 kring->name, kring->rhead, kring->rcur, kring->rtail); 556 mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); 557 init_waitqueue_head(&kring->si); 558 } 559 init_waitqueue_head(&na->tx_si); 560 init_waitqueue_head(&na->rx_si); 561 562 na->tailroom = na->rx_rings + nrx; 563 564 return 0; 565 } 566 567 568 /* undo the actions performed by netmap_krings_create */ 569 void 570 netmap_krings_delete(struct netmap_adapter *na) 571 { 572 struct netmap_kring *kring = na->tx_rings; 573 574 /* we rely on the krings layout described above */ 575 for ( ; kring != na->tailroom; kring++) { 576 mtx_destroy(&kring->q_lock); 577 } 578 free(na->tx_rings, M_DEVBUF); 579 na->tx_rings = na->rx_rings = na->tailroom = NULL; 580 } 581 582 583 /* 584 * Destructor for NIC ports. They also have an mbuf queue 585 * on the rings connected to the host so we need to purge 586 * them first. 
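 *
 * For reference, that queue is filled by netmap_transmit() and drained
 * by netmap_rxsync_from_host(); a sketch of its life cycle, using only
 * helpers that appear elsewhere in this file:
 *
 *	q = &na->rx_rings[na->num_rx_rings].rx_queue;
 *	   ... the host stack enqueues mbufs via netmap_transmit() ...
 *	m = mbq_dequeue(q);	   consumed by netmap_rxsync_from_host()
 *	mbq_purge(q);		   leftovers are dropped here, and
 *	mbq_safe_destroy(q);	   the queue is finally destroyed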
587 */ 588 static void 589 netmap_hw_krings_delete(struct netmap_adapter *na) 590 { 591 struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; 592 593 ND("destroy sw mbq with len %d", mbq_len(q)); 594 mbq_purge(q); 595 mbq_safe_destroy(q); 596 netmap_krings_delete(na); 597 } 598 599 600 static struct netmap_if* 601 netmap_if_new(const char *ifname, struct netmap_adapter *na) 602 { 603 struct netmap_if *nifp; 604 605 if (netmap_update_config(na)) { 606 /* configuration mismatch, report and fail */ 607 return NULL; 608 } 609 610 if (na->active_fds) 611 goto final; 612 613 if (na->nm_krings_create(na)) 614 goto cleanup; 615 616 if (netmap_mem_rings_create(na)) 617 goto cleanup; 618 619 final: 620 621 nifp = netmap_mem_if_new(ifname, na); 622 if (nifp == NULL) 623 goto cleanup; 624 625 return (nifp); 626 627 cleanup: 628 629 if (na->active_fds == 0) { 630 netmap_mem_rings_delete(na); 631 na->nm_krings_delete(na); 632 } 633 634 return NULL; 635 } 636 637 638 /* grab a reference to the memory allocator, if we don't have one already. The 639 * reference is taken from the netmap_adapter registered with the priv. 640 * 641 */ 642 static int 643 netmap_get_memory_locked(struct netmap_priv_d* p) 644 { 645 struct netmap_mem_d *nmd; 646 int error = 0; 647 648 if (p->np_na == NULL) { 649 if (!netmap_mmap_unreg) 650 return ENODEV; 651 /* for compatibility with older versions of the API 652 * we use the global allocator when no interface has been 653 * registered 654 */ 655 nmd = &nm_mem; 656 } else { 657 nmd = p->np_na->nm_mem; 658 } 659 if (p->np_mref == NULL) { 660 error = netmap_mem_finalize(nmd); 661 if (!error) 662 p->np_mref = nmd; 663 } else if (p->np_mref != nmd) { 664 /* a virtual port has been registered, but previous 665 * syscalls already used the global allocator. 666 * We cannot continue 667 */ 668 error = ENODEV; 669 } 670 return error; 671 } 672 673 674 int 675 netmap_get_memory(struct netmap_priv_d* p) 676 { 677 int error; 678 NMG_LOCK(); 679 error = netmap_get_memory_locked(p); 680 NMG_UNLOCK(); 681 return error; 682 } 683 684 685 static int 686 netmap_have_memory_locked(struct netmap_priv_d* p) 687 { 688 return p->np_mref != NULL; 689 } 690 691 692 static void 693 netmap_drop_memory_locked(struct netmap_priv_d* p) 694 { 695 if (p->np_mref) { 696 netmap_mem_deref(p->np_mref); 697 p->np_mref = NULL; 698 } 699 } 700 701 702 /* 703 * File descriptor's private data destructor. 704 * 705 * Call nm_register(ifp,0) to stop netmap mode on the interface and 706 * revert to normal operation. We expect that np_na->ifp has not gone. 707 * The second argument is the nifp to work on. In some cases it is 708 * not attached yet to the netmap_priv_d so we need to pass it as 709 * a separate argument. 710 */ 711 /* call with NMG_LOCK held */ 712 static void 713 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 714 { 715 struct netmap_adapter *na = priv->np_na; 716 struct ifnet *ifp = na->ifp; 717 718 NMG_LOCK_ASSERT(); 719 na->active_fds--; 720 if (na->active_fds <= 0) { /* last instance */ 721 722 if (netmap_verbose) 723 D("deleting last instance for %s", NM_IFPNAME(ifp)); 724 /* 725 * (TO CHECK) This function is only called 726 * when the last reference to this file descriptor goes 727 * away. This means we cannot have any pending poll() 728 * or interrupt routine operating on the structure. 729 * XXX The file may be closed in a thread while 730 * another thread is using it. 
731 * Linux keeps the file opened until the last reference 732 * by any outstanding ioctl/poll or mmap is gone. 733 * FreeBSD does not track mmap()s (but we do) and 734 * wakes up any sleeping poll(). Need to check what 735 * happens if the close() occurs while a concurrent 736 * syscall is running. 737 */ 738 if (ifp) 739 na->nm_register(na, 0); /* off, clear flags */ 740 /* Wake up any sleeping threads. netmap_poll will 741 * then return POLLERR 742 * XXX The wake up now must happen during *_down(), when 743 * we order all activities to stop. -gl 744 */ 745 /* XXX kqueue(9) needed; these will mirror knlist_init. */ 746 /* knlist_destroy(&na->tx_si.si_note); */ 747 /* knlist_destroy(&na->rx_si.si_note); */ 748 749 /* delete rings and buffers */ 750 netmap_mem_rings_delete(na); 751 na->nm_krings_delete(na); 752 } 753 /* delete the nifp */ 754 netmap_mem_if_delete(na, nifp); 755 } 756 757 static __inline int 758 nm_tx_si_user(struct netmap_priv_d *priv) 759 { 760 return (priv->np_na != NULL && 761 (priv->np_txqlast - priv->np_txqfirst > 1)); 762 } 763 764 static __inline int 765 nm_rx_si_user(struct netmap_priv_d *priv) 766 { 767 return (priv->np_na != NULL && 768 (priv->np_rxqlast - priv->np_rxqfirst > 1)); 769 } 770 771 772 /* 773 * returns 1 if this is the last instance and we can free priv 774 */ 775 int 776 netmap_dtor_locked(struct netmap_priv_d *priv) 777 { 778 struct netmap_adapter *na = priv->np_na; 779 780 #ifdef __FreeBSD__ 781 /* 782 * np_refcount is the number of active mmaps on 783 * this file descriptor 784 */ 785 if (--priv->np_refcount > 0) { 786 return 0; 787 } 788 #endif /* __FreeBSD__ */ 789 if (!na) { 790 return 1; //XXX is it correct? 791 } 792 netmap_do_unregif(priv, priv->np_nifp); 793 priv->np_nifp = NULL; 794 netmap_drop_memory_locked(priv); 795 if (priv->np_na) { 796 if (nm_tx_si_user(priv)) 797 na->tx_si_users--; 798 if (nm_rx_si_user(priv)) 799 na->rx_si_users--; 800 netmap_adapter_put(na); 801 priv->np_na = NULL; 802 } 803 return 1; 804 } 805 806 807 void 808 netmap_dtor(void *data) 809 { 810 struct netmap_priv_d *priv = data; 811 int last_instance; 812 813 NMG_LOCK(); 814 last_instance = netmap_dtor_locked(priv); 815 NMG_UNLOCK(); 816 if (last_instance) { 817 bzero(priv, sizeof(*priv)); /* for safety */ 818 free(priv, M_DEVBUF); 819 } 820 } 821 822 823 824 825 /* 826 * Handlers for synchronization of the queues from/to the host. 827 * Netmap has two operating modes: 828 * - in the default mode, the rings connected to the host stack are 829 * just another ring pair managed by userspace; 830 * - in transparent mode (XXX to be defined) incoming packets 831 * (from the host or the NIC) are marked as NS_FORWARD upon 832 * arrival, and the user application has a chance to reset the 833 * flag for packets that should be dropped. 834 * On the RXSYNC or poll(), packets in RX rings between 835 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 836 * to the other side. 837 * The transfer NIC --> host is relatively easy, just encapsulate 838 * into mbufs and we are done. The host --> NIC side is slightly 839 * harder because there might not be room in the tx ring so it 840 * might take a while before releasing the buffer. 841 */ 842 843 844 /* 845 * pass a chain of buffers to the host stack as coming from 'dst' 846 * We do not need to lock because the queue is private. 
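 *
 * Typical usage, as in netmap_txsync_to_host() and netmap_poll() below
 * (sketch only; note that netmap_send_up() also destroys the queue):
 *
 *	struct mbq q;
 *
 *	mbq_init(&q);
 *	netmap_grab_packets(kring, &q, 1);	force: forward every slot
 *	netmap_send_up(na->ifp, &q);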
847 */ 848 static void 849 netmap_send_up(struct ifnet *dst, struct mbq *q) 850 { 851 struct mbuf *m; 852 853 /* send packets up, outside the lock */ 854 while ((m = mbq_dequeue(q)) != NULL) { 855 if (netmap_verbose & NM_VERB_HOST) 856 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 857 NM_SEND_UP(dst, m); 858 } 859 mbq_destroy(q); 860 } 861 862 863 /* 864 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 865 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) 866 * and pass them up. Drop remaining packets in the unlikely event 867 * of an mbuf shortage. 868 */ 869 static void 870 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 871 { 872 u_int const lim = kring->nkr_num_slots - 1; 873 u_int const head = kring->ring->head; 874 u_int n; 875 struct netmap_adapter *na = kring->na; 876 877 for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { 878 struct mbuf *m; 879 struct netmap_slot *slot = &kring->ring->slot[n]; 880 881 if ((slot->flags & NS_FORWARD) == 0 && !force) 882 continue; 883 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { 884 RD(5, "bad pkt at %d len %d", n, slot->len); 885 continue; 886 } 887 slot->flags &= ~NS_FORWARD; // XXX needed ? 888 /* XXX TODO: adapt to the case of a multisegment packet */ 889 m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); 890 891 if (m == NULL) 892 break; 893 mbq_enqueue(q, m); 894 } 895 } 896 897 898 /* 899 * Send to the NIC rings packets marked NS_FORWARD between 900 * kring->nr_hwcur and kring->rhead 901 * Called under kring->rx_queue.lock on the sw rx ring, 902 */ 903 static u_int 904 netmap_sw_to_nic(struct netmap_adapter *na) 905 { 906 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 907 struct netmap_slot *rxslot = kring->ring->slot; 908 u_int i, rxcur = kring->nr_hwcur; 909 u_int const head = kring->rhead; 910 u_int const src_lim = kring->nkr_num_slots - 1; 911 u_int sent = 0; 912 913 /* scan rings to find space, then fill as much as possible */ 914 for (i = 0; i < na->num_tx_rings; i++) { 915 struct netmap_kring *kdst = &na->tx_rings[i]; 916 struct netmap_ring *rdst = kdst->ring; 917 u_int const dst_lim = kdst->nkr_num_slots - 1; 918 919 /* XXX do we trust ring or kring->rcur,rtail ? */ 920 for (; rxcur != head && !nm_ring_empty(rdst); 921 rxcur = nm_next(rxcur, src_lim) ) { 922 struct netmap_slot *src, *dst, tmp; 923 u_int dst_cur = rdst->cur; 924 925 src = &rxslot[rxcur]; 926 if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) 927 continue; 928 929 sent++; 930 931 dst = &rdst->slot[dst_cur]; 932 933 tmp = *src; 934 935 src->buf_idx = dst->buf_idx; 936 src->flags = NS_BUF_CHANGED; 937 938 dst->buf_idx = tmp.buf_idx; 939 dst->len = tmp.len; 940 dst->flags = NS_BUF_CHANGED; 941 942 rdst->cur = nm_next(dst_cur, dst_lim); 943 } 944 /* if (sent) XXX txsync ? */ 945 } 946 return sent; 947 } 948 949 950 /* 951 * netmap_txsync_to_host() passes packets up. We are called from a 952 * system call in user process context, and the only contention 953 * can be among multiple user threads erroneously calling 954 * this routine concurrently. 955 */ 956 void 957 netmap_txsync_to_host(struct netmap_adapter *na) 958 { 959 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 960 struct netmap_ring *ring = kring->ring; 961 u_int const lim = kring->nkr_num_slots - 1; 962 u_int const head = kring->rhead; 963 struct mbq q; 964 965 /* Take packets from hwcur to head and pass them up. 
966 * force head = cur since netmap_grab_packets() stops at head 967 * In case of no buffers we give up. At the end of the loop, 968 * the queue is drained in all cases. 969 */ 970 mbq_init(&q); 971 ring->cur = head; 972 netmap_grab_packets(kring, &q, 1 /* force */); 973 ND("have %d pkts in queue", mbq_len(&q)); 974 kring->nr_hwcur = head; 975 kring->nr_hwtail = head + lim; 976 if (kring->nr_hwtail > lim) 977 kring->nr_hwtail -= lim + 1; 978 nm_txsync_finalize(kring); 979 980 netmap_send_up(na->ifp, &q); 981 } 982 983 984 /* 985 * rxsync backend for packets coming from the host stack. 986 * They have been put in kring->rx_queue by netmap_transmit(). 987 * We protect access to the kring using kring->rx_queue.lock 988 * 989 * This routine also does the selrecord if called from the poll handler 990 * (we know because td != NULL). 991 * 992 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 993 * as an additional hidden argument. 994 * returns the number of packets delivered to tx queues in 995 * transparent mode, or a negative value if error 996 */ 997 int 998 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 999 { 1000 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1001 struct netmap_ring *ring = kring->ring; 1002 u_int nm_i, n; 1003 u_int const lim = kring->nkr_num_slots - 1; 1004 u_int const head = kring->rhead; 1005 int ret = 0; 1006 struct mbq *q = &kring->rx_queue; 1007 1008 (void)pwait; /* disable unused warnings */ 1009 (void)td; 1010 1011 mtx_lock(&q->lock); 1012 1013 /* First part: import newly received packets */ 1014 n = mbq_len(q); 1015 if (n) { /* grab packets from the queue */ 1016 struct mbuf *m; 1017 uint32_t stop_i; 1018 1019 nm_i = kring->nr_hwtail; 1020 stop_i = nm_prev(nm_i, lim); 1021 while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { 1022 int len = MBUF_LEN(m); 1023 struct netmap_slot *slot = &ring->slot[nm_i]; 1024 1025 m_copydata(m, 0, len, BDG_NMB(na, slot)); 1026 ND("nm %d len %d", nm_i, len); 1027 if (netmap_verbose) 1028 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL)); 1029 1030 slot->len = len; 1031 slot->flags = kring->nkr_slot_flags; 1032 nm_i = nm_next(nm_i, lim); 1033 } 1034 kring->nr_hwtail = nm_i; 1035 } 1036 1037 /* 1038 * Second part: skip past packets that userspace has released. 1039 */ 1040 nm_i = kring->nr_hwcur; 1041 if (nm_i != head) { /* something was released */ 1042 if (netmap_fwd || kring->ring->flags & NR_FORWARD) 1043 ret = netmap_sw_to_nic(na); 1044 kring->nr_hwcur = head; 1045 } 1046 1047 nm_rxsync_finalize(kring); 1048 1049 /* access copies of cur,tail in the kring */ 1050 if (kring->rcur == kring->rtail && td) /* no bufs available */ 1051 selrecord(td, &kring->si); 1052 1053 mtx_unlock(&q->lock); 1054 return ret; 1055 } 1056 1057 1058 /* Get a netmap adapter for the port. 1059 * 1060 * If it is possible to satisfy the request, return 0 1061 * with *na containing the netmap adapter found. 1062 * Otherwise return an error code, with *na containing NULL. 1063 * 1064 * When the port is attached to a bridge, we always return 1065 * EBUSY. 1066 * Otherwise, if the port is already bound to a file descriptor, 1067 * then we unconditionally return the existing adapter into *na. 
1068 * In all the other cases, we return (into *na) either native, 1069 * generic or NULL, according to the following table: 1070 * 1071 * native_support 1072 * active_fds dev.netmap.admode YES NO 1073 * ------------------------------------------------------- 1074 * >0 * NA(ifp) NA(ifp) 1075 * 1076 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC 1077 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL 1078 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC 1079 * 1080 */ 1081 1082 int 1083 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) 1084 { 1085 /* generic support */ 1086 int i = netmap_admode; /* Take a snapshot. */ 1087 int error = 0; 1088 struct netmap_adapter *prev_na; 1089 struct netmap_generic_adapter *gna; 1090 1091 *na = NULL; /* default */ 1092 1093 /* reset in case of invalid value */ 1094 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) 1095 i = netmap_admode = NETMAP_ADMODE_BEST; 1096 1097 if (NETMAP_CAPABLE(ifp)) { 1098 /* If an adapter already exists, but is 1099 * attached to a vale port, we report that the 1100 * port is busy. 1101 */ 1102 if (NETMAP_OWNED_BY_KERN(NA(ifp))) 1103 return EBUSY; 1104 1105 /* If an adapter already exists, return it if 1106 * there are active file descriptors or if 1107 * netmap is not forced to use generic 1108 * adapters. 1109 */ 1110 if (NA(ifp)->active_fds > 0 || 1111 i != NETMAP_ADMODE_GENERIC) { 1112 *na = NA(ifp); 1113 return 0; 1114 } 1115 } 1116 1117 /* If there isn't native support and netmap is not allowed 1118 * to use generic adapters, we cannot satisfy the request. 1119 */ 1120 if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) 1121 return EOPNOTSUPP; 1122 1123 /* Otherwise, create a generic adapter and return it, 1124 * saving the previously used netmap adapter, if any. 1125 * 1126 * Note that here 'prev_na', if not NULL, MUST be a 1127 * native adapter, and CANNOT be a generic one. This is 1128 * true because generic adapters are created on demand, and 1129 * destroyed when not used anymore. Therefore, if the adapter 1130 * currently attached to an interface 'ifp' is generic, it 1131 * must be that 1132 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). 1133 * Consequently, if NA(ifp) is generic, we will enter one of 1134 * the branches above. This ensures that we never override 1135 * a generic adapter with another generic adapter. 1136 */ 1137 prev_na = NA(ifp); 1138 error = generic_netmap_attach(ifp); 1139 if (error) 1140 return error; 1141 1142 *na = NA(ifp); 1143 gna = (struct netmap_generic_adapter*)NA(ifp); 1144 gna->prev = prev_na; /* save old na */ 1145 if (prev_na != NULL) { 1146 ifunit_ref(ifp->if_xname); 1147 // XXX add a refcount ? 1148 netmap_adapter_get(prev_na); 1149 } 1150 ND("Created generic NA %p (prev %p)", gna, gna->prev); 1151 1152 return 0; 1153 } 1154 1155 1156 /* 1157 * MUST BE CALLED UNDER NMG_LOCK() 1158 * 1159 * Get a refcounted reference to a netmap adapter attached 1160 * to the interface specified by nmr. 1161 * This is always called in the execution of an ioctl(). 1162 * 1163 * Return ENXIO if the interface specified by the request does 1164 * not exist, ENOTSUP if netmap is not supported by the interface, 1165 * EBUSY if the interface is already attached to a bridge, 1166 * EINVAL if parameters are invalid, ENOMEM if needed resources 1167 * could not be allocated. 1168 * If successful, hold a reference to the netmap adapter. 1169 * 1170 * No reference is kept on the real interface, which may then 1171 * disappear at any time. 
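 *
 * Typical caller pattern (a sketch of what netmap_ioctl() does below):
 *
 *	NMG_LOCK();
 *	error = netmap_get_na(nmr, &na, 1);	1 = create if needed
 *	if (error == 0) {
 *		... use na ...
 *		netmap_adapter_put(na);		drop the reference
 *	}
 *	NMG_UNLOCK();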
1172 */ 1173 int 1174 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1175 { 1176 struct ifnet *ifp = NULL; 1177 int error = 0; 1178 struct netmap_adapter *ret = NULL; 1179 1180 *na = NULL; /* default return value */ 1181 1182 /* first try to see if this is a bridge port. */ 1183 NMG_LOCK_ASSERT(); 1184 1185 error = netmap_get_pipe_na(nmr, na, create); 1186 if (error || *na != NULL) 1187 return error; 1188 1189 error = netmap_get_bdg_na(nmr, na, create); 1190 if (error) 1191 return error; 1192 1193 if (*na != NULL) /* valid match in netmap_get_bdg_na() */ 1194 goto pipes; 1195 1196 ifp = ifunit_ref(nmr->nr_name); 1197 if (ifp == NULL) { 1198 return ENXIO; 1199 } 1200 1201 error = netmap_get_hw_na(ifp, &ret); 1202 if (error) 1203 goto out; 1204 1205 /* Users cannot use the NIC attached to a bridge directly */ 1206 if (NETMAP_OWNED_BY_KERN(ret)) { 1207 error = EBUSY; 1208 goto out; 1209 } 1210 *na = ret; 1211 netmap_adapter_get(ret); 1212 1213 pipes: 1214 error = netmap_pipe_alloc(*na, nmr); 1215 1216 out: 1217 if (error && ret != NULL) 1218 netmap_adapter_put(ret); 1219 1220 if (ifp) 1221 if_rele(ifp); 1222 1223 return error; 1224 } 1225 1226 1227 /* 1228 * validate parameters on entry for *_txsync() 1229 * Returns ring->cur if ok, or something >= kring->nkr_num_slots 1230 * in case of error. 1231 * 1232 * rhead, rcur and rtail=hwtail are stored from previous round. 1233 * hwcur is the next packet to send to the ring. 1234 * 1235 * We want 1236 * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail 1237 * 1238 * hwcur, rhead, rtail and hwtail are reliable 1239 */ 1240 u_int 1241 nm_txsync_prologue(struct netmap_kring *kring) 1242 { 1243 struct netmap_ring *ring = kring->ring; 1244 u_int head = ring->head; /* read only once */ 1245 u_int cur = ring->cur; /* read only once */ 1246 u_int n = kring->nkr_num_slots; 1247 1248 ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", 1249 kring->name, 1250 kring->nr_hwcur, kring->nr_hwtail, 1251 ring->head, ring->cur, ring->tail); 1252 #if 1 /* kernel sanity checks; but we can trust the kring. */ 1253 if (kring->nr_hwcur >= n || kring->rhead >= n || 1254 kring->rtail >= n || kring->nr_hwtail >= n) 1255 goto error; 1256 #endif /* kernel sanity checks */ 1257 /* 1258 * user sanity checks. We only use 'cur', 1259 * A, B, ... are possible positions for cur: 1260 * 1261 * 0 A cur B tail C n-1 1262 * 0 D tail E cur F n-1 1263 * 1264 * B, F, D are valid. A, C, E are wrong 1265 */ 1266 if (kring->rtail >= kring->rhead) { 1267 /* want rhead <= head <= rtail */ 1268 if (head < kring->rhead || head > kring->rtail) 1269 goto error; 1270 /* and also head <= cur <= rtail */ 1271 if (cur < head || cur > kring->rtail) 1272 goto error; 1273 } else { /* here rtail < rhead */ 1274 /* we need head outside rtail .. 
rhead */ 1275 if (head > kring->rtail && head < kring->rhead) 1276 goto error; 1277 1278 /* two cases now: head <= rtail or head >= rhead */ 1279 if (head <= kring->rtail) { 1280 /* want head <= cur <= rtail */ 1281 if (cur < head || cur > kring->rtail) 1282 goto error; 1283 } else { /* head >= rhead */ 1284 /* cur must be outside rtail..head */ 1285 if (cur > kring->rtail && cur < head) 1286 goto error; 1287 } 1288 } 1289 if (ring->tail != kring->rtail) { 1290 RD(5, "tail overwritten was %d need %d", 1291 ring->tail, kring->rtail); 1292 ring->tail = kring->rtail; 1293 } 1294 kring->rhead = head; 1295 kring->rcur = cur; 1296 return head; 1297 1298 error: 1299 RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", 1300 kring->name, 1301 kring->nr_hwcur, 1302 kring->rcur, kring->nr_hwtail, 1303 cur, ring->tail); 1304 return n; 1305 } 1306 1307 1308 /* 1309 * validate parameters on entry for *_rxsync() 1310 * Returns ring->head if ok, kring->nkr_num_slots on error. 1311 * 1312 * For a valid configuration, 1313 * hwcur <= head <= cur <= tail <= hwtail 1314 * 1315 * We only consider head and cur. 1316 * hwcur and hwtail are reliable. 1317 * 1318 */ 1319 u_int 1320 nm_rxsync_prologue(struct netmap_kring *kring) 1321 { 1322 struct netmap_ring *ring = kring->ring; 1323 uint32_t const n = kring->nkr_num_slots; 1324 uint32_t head, cur; 1325 1326 ND("%s kc %d kt %d h %d c %d t %d", 1327 kring->name, 1328 kring->nr_hwcur, kring->nr_hwtail, 1329 ring->head, ring->cur, ring->tail); 1330 /* 1331 * Before storing the new values, we should check they do not 1332 * move backwards. However: 1333 * - head is not an issue because the previous value is hwcur; 1334 * - cur could in principle go back, however it does not matter 1335 * because we are processing a brand new rxsync() 1336 */ 1337 cur = kring->rcur = ring->cur; /* read only once */ 1338 head = kring->rhead = ring->head; /* read only once */ 1339 #if 1 /* kernel sanity checks */ 1340 if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) 1341 goto error; 1342 #endif /* kernel sanity checks */ 1343 /* user sanity checks */ 1344 if (kring->nr_hwtail >= kring->nr_hwcur) { 1345 /* want hwcur <= rhead <= hwtail */ 1346 if (head < kring->nr_hwcur || head > kring->nr_hwtail) 1347 goto error; 1348 /* and also rhead <= rcur <= hwtail */ 1349 if (cur < head || cur > kring->nr_hwtail) 1350 goto error; 1351 } else { 1352 /* we need rhead outside hwtail..hwcur */ 1353 if (head < kring->nr_hwcur && head > kring->nr_hwtail) 1354 goto error; 1355 /* two cases now: head <= hwtail or head >= hwcur */ 1356 if (head <= kring->nr_hwtail) { 1357 /* want head <= cur <= hwtail */ 1358 if (cur < head || cur > kring->nr_hwtail) 1359 goto error; 1360 } else { 1361 /* cur must be outside hwtail..head */ 1362 if (cur < head && cur > kring->nr_hwtail) 1363 goto error; 1364 } 1365 } 1366 if (ring->tail != kring->rtail) { 1367 RD(5, "%s tail overwritten was %d need %d", 1368 kring->name, 1369 ring->tail, kring->rtail); 1370 ring->tail = kring->rtail; 1371 } 1372 return head; 1373 1374 error: 1375 RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", 1376 kring->nr_hwcur, 1377 kring->rcur, kring->nr_hwtail, 1378 kring->rhead, kring->rcur, ring->tail); 1379 return n; 1380 } 1381 1382 1383 /* 1384 * Error routine called when txsync/rxsync detects an error. 1385 * Can't do much more than resetting head =cur = hwcur, tail = hwtail 1386 * Return 1 on reinit. 1387 * 1388 * This routine is only called by the upper half of the kernel. 
1389 * It only reads hwcur (which is changed only by the upper half, too) 1390 * and hwtail (which may be changed by the lower half, but only on 1391 * a tx ring and only to increase it, so any error will be recovered 1392 * on the next call). For the above, we don't strictly need to call 1393 * it under lock. 1394 */ 1395 int 1396 netmap_ring_reinit(struct netmap_kring *kring) 1397 { 1398 struct netmap_ring *ring = kring->ring; 1399 u_int i, lim = kring->nkr_num_slots - 1; 1400 int errors = 0; 1401 1402 // XXX KASSERT nm_kr_tryget 1403 RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); 1404 // XXX probably wrong to trust userspace 1405 kring->rhead = ring->head; 1406 kring->rcur = ring->cur; 1407 kring->rtail = ring->tail; 1408 1409 if (ring->cur > lim) 1410 errors++; 1411 if (ring->head > lim) 1412 errors++; 1413 if (ring->tail > lim) 1414 errors++; 1415 for (i = 0; i <= lim; i++) { 1416 u_int idx = ring->slot[i].buf_idx; 1417 u_int len = ring->slot[i].len; 1418 if (idx < 2 || idx >= netmap_total_buffers) { 1419 RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); 1420 ring->slot[i].buf_idx = 0; 1421 ring->slot[i].len = 0; 1422 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1423 ring->slot[i].len = 0; 1424 RD(5, "bad len at slot %d idx %d len %d", i, idx, len); 1425 } 1426 } 1427 if (errors) { 1428 RD(10, "total %d errors", errors); 1429 RD(10, "%s reinit, cur %d -> %d tail %d -> %d", 1430 kring->name, 1431 ring->cur, kring->nr_hwcur, 1432 ring->tail, kring->nr_hwtail); 1433 ring->head = kring->rhead = kring->nr_hwcur; 1434 ring->cur = kring->rcur = kring->nr_hwcur; 1435 ring->tail = kring->rtail = kring->nr_hwtail; 1436 } 1437 return (errors ? 1 : 0); 1438 } 1439 1440 1441 /* 1442 * Set the ring ID. For devices with a single queue, a request 1443 * for all rings is the same as a single ring. 1444 */ 1445 static int 1446 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) 1447 { 1448 struct netmap_adapter *na = priv->np_na; 1449 u_int j, i = ringid & NETMAP_RING_MASK; 1450 u_int reg = flags & NR_REG_MASK; 1451 1452 if (reg == NR_REG_DEFAULT) { 1453 /* convert from old ringid to flags */ 1454 if (ringid & NETMAP_SW_RING) { 1455 reg = NR_REG_SW; 1456 } else if (ringid & NETMAP_HW_RING) { 1457 reg = NR_REG_ONE_NIC; 1458 } else { 1459 reg = NR_REG_ALL_NIC; 1460 } 1461 D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); 1462 } 1463 switch (reg) { 1464 case NR_REG_ALL_NIC: 1465 case NR_REG_PIPE_MASTER: 1466 case NR_REG_PIPE_SLAVE: 1467 priv->np_txqfirst = 0; 1468 priv->np_txqlast = na->num_tx_rings; 1469 priv->np_rxqfirst = 0; 1470 priv->np_rxqlast = na->num_rx_rings; 1471 ND("%s %d %d", "ALL/PIPE", 1472 priv->np_rxqfirst, priv->np_rxqlast); 1473 break; 1474 case NR_REG_SW: 1475 case NR_REG_NIC_SW: 1476 if (!(na->na_flags & NAF_HOST_RINGS)) { 1477 D("host rings not supported"); 1478 return EINVAL; 1479 } 1480 priv->np_txqfirst = (reg == NR_REG_SW ? 1481 na->num_tx_rings : 0); 1482 priv->np_txqlast = na->num_tx_rings + 1; 1483 priv->np_rxqfirst = (reg == NR_REG_SW ? 1484 na->num_rx_rings : 0); 1485 priv->np_rxqlast = na->num_rx_rings + 1; 1486 ND("%s %d %d", reg == NR_REG_SW ? 
"SW" : "NIC+SW", 1487 priv->np_rxqfirst, priv->np_rxqlast); 1488 break; 1489 case NR_REG_ONE_NIC: 1490 if (i >= na->num_tx_rings && i >= na->num_rx_rings) { 1491 D("invalid ring id %d", i); 1492 return EINVAL; 1493 } 1494 /* if not enough rings, use the first one */ 1495 j = i; 1496 if (j >= na->num_tx_rings) 1497 j = 0; 1498 priv->np_txqfirst = j; 1499 priv->np_txqlast = j + 1; 1500 j = i; 1501 if (j >= na->num_rx_rings) 1502 j = 0; 1503 priv->np_rxqfirst = j; 1504 priv->np_rxqlast = j + 1; 1505 break; 1506 default: 1507 D("invalid regif type %d", reg); 1508 return EINVAL; 1509 } 1510 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 1511 priv->np_flags = (flags & ~NR_REG_MASK) | reg; 1512 if (nm_tx_si_user(priv)) 1513 na->tx_si_users++; 1514 if (nm_rx_si_user(priv)) 1515 na->rx_si_users++; 1516 if (netmap_verbose) { 1517 D("%s: tx [%d,%d) rx [%d,%d) id %d", 1518 NM_IFPNAME(na->ifp), 1519 priv->np_txqfirst, 1520 priv->np_txqlast, 1521 priv->np_rxqfirst, 1522 priv->np_rxqlast, 1523 i); 1524 } 1525 return 0; 1526 } 1527 1528 /* 1529 * possibly move the interface to netmap-mode. 1530 * If success it returns a pointer to netmap_if, otherwise NULL. 1531 * This must be called with NMG_LOCK held. 1532 */ 1533 struct netmap_if * 1534 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1535 uint16_t ringid, uint32_t flags, int *err) 1536 { 1537 struct ifnet *ifp = na->ifp; 1538 struct netmap_if *nifp = NULL; 1539 int error, need_mem = 0; 1540 1541 NMG_LOCK_ASSERT(); 1542 /* ring configuration may have changed, fetch from the card */ 1543 netmap_update_config(na); 1544 priv->np_na = na; /* store the reference */ 1545 error = netmap_set_ringid(priv, ringid, flags); 1546 if (error) 1547 goto out; 1548 /* ensure allocators are ready */ 1549 need_mem = !netmap_have_memory_locked(priv); 1550 if (need_mem) { 1551 error = netmap_get_memory_locked(priv); 1552 ND("get_memory returned %d", error); 1553 if (error) 1554 goto out; 1555 } 1556 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1557 if (nifp == NULL) { /* allocation failed */ 1558 /* we should drop the allocator, but only 1559 * if we were the ones who grabbed it 1560 */ 1561 error = ENOMEM; 1562 goto out; 1563 } 1564 na->active_fds++; 1565 if (ifp->if_capenable & IFCAP_NETMAP) { 1566 /* was already set */ 1567 } else { 1568 /* Otherwise set the card in netmap mode 1569 * and make it use the shared buffers. 1570 * 1571 * do not core lock because the race is harmless here, 1572 * there cannot be any traffic to netmap_transmit() 1573 */ 1574 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1575 ND("%p->na_lut == %p", na, na->na_lut); 1576 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1577 error = na->nm_register(na, 1); /* mode on */ 1578 if (error) { 1579 netmap_do_unregif(priv, nifp); 1580 nifp = NULL; 1581 } 1582 } 1583 out: 1584 *err = error; 1585 if (error) { 1586 priv->np_na = NULL; 1587 if (need_mem) 1588 netmap_drop_memory_locked(priv); 1589 } 1590 if (nifp != NULL) { 1591 /* 1592 * advertise that the interface is ready bt setting ni_nifp. 1593 * The barrier is needed because readers (poll and *SYNC) 1594 * check for priv->np_nifp != NULL without locking 1595 */ 1596 wmb(); /* make sure previous writes are visible to all CPUs */ 1597 priv->np_nifp = nifp; 1598 } 1599 return nifp; 1600 } 1601 1602 1603 1604 /* 1605 * ioctl(2) support for the "netmap" device. 
1606 * 1607 * Following a list of accepted commands: 1608 * - NIOCGINFO 1609 * - SIOCGIFADDR just for convenience 1610 * - NIOCREGIF 1611 * - NIOCTXSYNC 1612 * - NIOCRXSYNC 1613 * 1614 * Return 0 on success, errno otherwise. 1615 */ 1616 int 1617 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 1618 int fflag, struct thread *td) 1619 { 1620 struct netmap_priv_d *priv = NULL; 1621 struct ifnet *ifp = NULL; 1622 struct nmreq *nmr = (struct nmreq *) data; 1623 struct netmap_adapter *na = NULL; 1624 int error; 1625 u_int i, qfirst, qlast; 1626 struct netmap_if *nifp; 1627 struct netmap_kring *krings; 1628 1629 (void)dev; /* UNUSED */ 1630 (void)fflag; /* UNUSED */ 1631 1632 if (cmd == NIOCGINFO || cmd == NIOCREGIF) { 1633 /* truncate name */ 1634 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; 1635 if (nmr->nr_version != NETMAP_API) { 1636 D("API mismatch for %s got %d need %d", 1637 nmr->nr_name, 1638 nmr->nr_version, NETMAP_API); 1639 nmr->nr_version = NETMAP_API; 1640 } 1641 if (nmr->nr_version < NETMAP_MIN_API || 1642 nmr->nr_version > NETMAP_MAX_API) { 1643 return EINVAL; 1644 } 1645 } 1646 CURVNET_SET(TD_TO_VNET(td)); 1647 1648 error = devfs_get_cdevpriv((void **)&priv); 1649 if (error) { 1650 CURVNET_RESTORE(); 1651 /* XXX ENOENT should be impossible, since the priv 1652 * is now created in the open */ 1653 return (error == ENOENT ? ENXIO : error); 1654 } 1655 1656 switch (cmd) { 1657 case NIOCGINFO: /* return capabilities etc */ 1658 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 1659 error = netmap_bdg_ctl(nmr, NULL); 1660 break; 1661 } 1662 1663 NMG_LOCK(); 1664 do { 1665 /* memsize is always valid */ 1666 struct netmap_mem_d *nmd = &nm_mem; 1667 u_int memflags; 1668 1669 if (nmr->nr_name[0] != '\0') { 1670 /* get a refcount */ 1671 error = netmap_get_na(nmr, &na, 1 /* create */); 1672 if (error) 1673 break; 1674 nmd = na->nm_mem; /* get memory allocator */ 1675 } 1676 1677 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, 1678 &nmr->nr_arg2); 1679 if (error) 1680 break; 1681 if (na == NULL) /* only memory info */ 1682 break; 1683 nmr->nr_offset = 0; 1684 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 1685 netmap_update_config(na); 1686 nmr->nr_rx_rings = na->num_rx_rings; 1687 nmr->nr_tx_rings = na->num_tx_rings; 1688 nmr->nr_rx_slots = na->num_rx_desc; 1689 nmr->nr_tx_slots = na->num_tx_desc; 1690 netmap_adapter_put(na); 1691 } while (0); 1692 NMG_UNLOCK(); 1693 break; 1694 1695 case NIOCREGIF: 1696 /* possibly attach/detach NIC and VALE switch */ 1697 i = nmr->nr_cmd; 1698 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 1699 || i == NETMAP_BDG_VNET_HDR) { 1700 error = netmap_bdg_ctl(nmr, NULL); 1701 break; 1702 } else if (i != 0) { 1703 D("nr_cmd must be 0 not %d", i); 1704 error = EINVAL; 1705 break; 1706 } 1707 1708 /* protect access to priv from concurrent NIOCREGIF */ 1709 NMG_LOCK(); 1710 do { 1711 u_int memflags; 1712 1713 if (priv->np_na != NULL) { /* thread already registered */ 1714 error = EBUSY; 1715 break; 1716 } 1717 /* find the interface and a reference */ 1718 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 1719 if (error) 1720 break; 1721 ifp = na->ifp; 1722 if (NETMAP_OWNED_BY_KERN(na)) { 1723 netmap_adapter_put(na); 1724 error = EBUSY; 1725 break; 1726 } 1727 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); 1728 if (!nifp) { /* reg. 
failed, release priv and ref */ 1729 netmap_adapter_put(na); 1730 priv->np_nifp = NULL; 1731 break; 1732 } 1733 priv->np_td = td; // XXX kqueue, debugging only 1734 1735 /* return the offset of the netmap_if object */ 1736 nmr->nr_rx_rings = na->num_rx_rings; 1737 nmr->nr_tx_rings = na->num_tx_rings; 1738 nmr->nr_rx_slots = na->num_rx_desc; 1739 nmr->nr_tx_slots = na->num_tx_desc; 1740 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, 1741 &nmr->nr_arg2); 1742 if (error) { 1743 netmap_adapter_put(na); 1744 break; 1745 } 1746 if (memflags & NETMAP_MEM_PRIVATE) { 1747 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 1748 } 1749 priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? 1750 &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; 1751 priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? 1752 &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; 1753 1754 if (nmr->nr_arg3) { 1755 D("requested %d extra buffers", nmr->nr_arg3); 1756 nmr->nr_arg3 = netmap_extra_alloc(na, 1757 &nifp->ni_bufs_head, nmr->nr_arg3); 1758 D("got %d extra buffers", nmr->nr_arg3); 1759 } 1760 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 1761 } while (0); 1762 NMG_UNLOCK(); 1763 break; 1764 1765 case NIOCTXSYNC: 1766 case NIOCRXSYNC: 1767 nifp = priv->np_nifp; 1768 1769 if (nifp == NULL) { 1770 error = ENXIO; 1771 break; 1772 } 1773 rmb(); /* make sure following reads are not from cache */ 1774 1775 na = priv->np_na; /* we have a reference */ 1776 1777 if (na == NULL) { 1778 D("Internal error: nifp != NULL && na == NULL"); 1779 error = ENXIO; 1780 break; 1781 } 1782 1783 ifp = na->ifp; 1784 if (ifp == NULL) { 1785 RD(1, "the ifp is gone"); 1786 error = ENXIO; 1787 break; 1788 } 1789 1790 if (cmd == NIOCTXSYNC) { 1791 krings = na->tx_rings; 1792 qfirst = priv->np_txqfirst; 1793 qlast = priv->np_txqlast; 1794 } else { 1795 krings = na->rx_rings; 1796 qfirst = priv->np_rxqfirst; 1797 qlast = priv->np_rxqlast; 1798 } 1799 1800 for (i = qfirst; i < qlast; i++) { 1801 struct netmap_kring *kring = krings + i; 1802 if (nm_kr_tryget(kring)) { 1803 error = EBUSY; 1804 goto out; 1805 } 1806 if (cmd == NIOCTXSYNC) { 1807 if (netmap_verbose & NM_VERB_TXSYNC) 1808 D("pre txsync ring %d cur %d hwcur %d", 1809 i, kring->ring->cur, 1810 kring->nr_hwcur); 1811 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 1812 netmap_ring_reinit(kring); 1813 } else { 1814 kring->nm_sync(kring, NAF_FORCE_RECLAIM); 1815 } 1816 if (netmap_verbose & NM_VERB_TXSYNC) 1817 D("post txsync ring %d cur %d hwcur %d", 1818 i, kring->ring->cur, 1819 kring->nr_hwcur); 1820 } else { 1821 kring->nm_sync(kring, NAF_FORCE_READ); 1822 microtime(&na->rx_rings[i].ring->ts); 1823 } 1824 nm_kr_put(kring); 1825 } 1826 1827 break; 1828 1829 #ifdef __FreeBSD__ 1830 case BIOCIMMEDIATE: 1831 case BIOCGHDRCMPLT: 1832 case BIOCSHDRCMPLT: 1833 case BIOCSSEESENT: 1834 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 1835 break; 1836 1837 default: /* allow device-specific ioctls */ 1838 { 1839 struct socket so; 1840 1841 bzero(&so, sizeof(so)); 1842 NMG_LOCK(); 1843 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 1844 if (error) { 1845 netmap_adapter_put(na); 1846 NMG_UNLOCK(); 1847 break; 1848 } 1849 ifp = na->ifp; 1850 so.so_vnet = ifp->if_vnet; 1851 // so->so_proto not null. 
1852 error = ifioctl(&so, cmd, data, td); 1853 netmap_adapter_put(na); 1854 NMG_UNLOCK(); 1855 break; 1856 } 1857 1858 #else /* linux */ 1859 default: 1860 error = EOPNOTSUPP; 1861 #endif /* linux */ 1862 } 1863 out: 1864 1865 CURVNET_RESTORE(); 1866 return (error); 1867 } 1868 1869 1870 /* 1871 * select(2) and poll(2) handlers for the "netmap" device. 1872 * 1873 * Can be called for one or more queues. 1874 * Return true the event mask corresponding to ready events. 1875 * If there are no ready events, do a selrecord on either individual 1876 * selinfo or on the global one. 1877 * Device-dependent parts (locking and sync of tx/rx rings) 1878 * are done through callbacks. 1879 * 1880 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1881 * The first one is remapped to pwait as selrecord() uses the name as an 1882 * hidden argument. 1883 */ 1884 int 1885 netmap_poll(struct cdev *dev, int events, struct thread *td) 1886 { 1887 struct netmap_priv_d *priv = NULL; 1888 struct netmap_adapter *na; 1889 struct ifnet *ifp; 1890 struct netmap_kring *kring; 1891 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 1892 struct mbq q; /* packets from hw queues to host stack */ 1893 void *pwait = dev; /* linux compatibility */ 1894 int is_kevent = 0; 1895 1896 /* 1897 * In order to avoid nested locks, we need to "double check" 1898 * txsync and rxsync if we decide to do a selrecord(). 1899 * retry_tx (and retry_rx, later) prevent looping forever. 1900 */ 1901 int retry_tx = 1, retry_rx = 1; 1902 1903 (void)pwait; 1904 mbq_init(&q); 1905 1906 /* 1907 * XXX kevent has curthread->tp_fop == NULL, 1908 * so devfs_get_cdevpriv() fails. We circumvent this by passing 1909 * priv as the first argument, which is also useful to avoid 1910 * the selrecord() which are not necessary in that case. 1911 */ 1912 if (devfs_get_cdevpriv((void **)&priv) != 0) { 1913 is_kevent = 1; 1914 if (netmap_verbose) 1915 D("called from kevent"); 1916 priv = (struct netmap_priv_d *)dev; 1917 } 1918 if (priv == NULL) 1919 return POLLERR; 1920 1921 if (priv->np_nifp == NULL) { 1922 D("No if registered"); 1923 return POLLERR; 1924 } 1925 rmb(); /* make sure following reads are not from cache */ 1926 1927 na = priv->np_na; 1928 ifp = na->ifp; 1929 // check for deleted 1930 if (ifp == NULL) { 1931 RD(1, "the ifp is gone"); 1932 return POLLERR; 1933 } 1934 1935 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1936 return POLLERR; 1937 1938 if (netmap_verbose & 0x8000) 1939 D("device %s events 0x%x", NM_IFPNAME(ifp), events); 1940 want_tx = events & (POLLOUT | POLLWRNORM); 1941 want_rx = events & (POLLIN | POLLRDNORM); 1942 1943 1944 /* 1945 * check_all_{tx|rx} are set if the card has more than one queue AND 1946 * the file descriptor is bound to all of them. If so, we sleep on 1947 * the "global" selinfo, otherwise we sleep on individual selinfo 1948 * (FreeBSD only allows two selinfo's per file descriptor). 1949 * The interrupt routine in the driver wake one or the other 1950 * (or both) depending on which clients are active. 1951 * 1952 * rxsync() is only called if we run out of buffers on a POLLIN. 1953 * txsync() is called if we run out of buffers on POLLOUT, or 1954 * there are pending packets to send. The latter can be disabled 1955 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 1956 */ 1957 check_all_tx = nm_tx_si_user(priv); 1958 check_all_rx = nm_rx_si_user(priv); 1959 1960 /* 1961 * We start with a lock free round which is cheap if we have 1962 * slots available. 
If this fails, then lock and call the sync 1963 * routines. 1964 */ 1965 for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { 1966 kring = &na->rx_rings[i]; 1967 /* XXX compare ring->cur and kring->tail */ 1968 if (!nm_ring_empty(kring->ring)) { 1969 revents |= want_rx; 1970 want_rx = 0; /* also breaks the loop */ 1971 } 1972 } 1973 for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { 1974 kring = &na->tx_rings[i]; 1975 /* XXX compare ring->cur and kring->tail */ 1976 if (!nm_ring_empty(kring->ring)) { 1977 revents |= want_tx; 1978 want_tx = 0; /* also breaks the loop */ 1979 } 1980 } 1981 1982 /* 1983 * If we want to push packets out (priv->np_txpoll) or 1984 * want_tx is still set, we must issue txsync calls 1985 * (on all rings, to avoid that the tx rings stall). 1986 * XXX should also check cur != hwcur on the tx rings. 1987 * Fortunately, normal tx mode has np_txpoll set. 1988 */ 1989 if (priv->np_txpoll || want_tx) { 1990 /* 1991 * The first round checks if anyone is ready, if not 1992 * do a selrecord and another round to handle races. 1993 * want_tx goes to 0 if any space is found, and is 1994 * used to skip rings with no pending transmissions. 1995 */ 1996 flush_tx: 1997 for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { 1998 int found = 0; 1999 2000 kring = &na->tx_rings[i]; 2001 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 2002 continue; 2003 /* only one thread does txsync */ 2004 if (nm_kr_tryget(kring)) { 2005 D("%p lost race on txring %d, ok", priv, i); 2006 continue; 2007 } 2008 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 2009 netmap_ring_reinit(kring); 2010 revents |= POLLERR; 2011 } else { 2012 if (kring->nm_sync(kring, 0)) 2013 revents |= POLLERR; 2014 } 2015 2016 /* 2017 * If we found new slots, notify potential 2018 * listeners on the same ring. 2019 * Since we just did a txsync, look at the copies 2020 * of cur,tail in the kring. 2021 */ 2022 found = kring->rcur != kring->rtail; 2023 nm_kr_put(kring); 2024 if (found) { /* notify other listeners */ 2025 revents |= want_tx; 2026 want_tx = 0; 2027 na->nm_notify(na, i, NR_TX, 0); 2028 } 2029 } 2030 if (want_tx && retry_tx && !is_kevent) { 2031 selrecord(td, check_all_tx ? 2032 &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); 2033 retry_tx = 0; 2034 goto flush_tx; 2035 } 2036 } 2037 2038 /* 2039 * If want_rx is still set scan receive rings. 2040 * Do it on all rings because otherwise we starve. 2041 */ 2042 if (want_rx) { 2043 int send_down = 0; /* transparent mode */ 2044 /* two rounds here to for race avoidance */ 2045 do_retry_rx: 2046 for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { 2047 int found = 0; 2048 2049 kring = &na->rx_rings[i]; 2050 2051 if (nm_kr_tryget(kring)) { 2052 D("%p lost race on rxring %d, ok", priv, i); 2053 continue; 2054 } 2055 2056 /* 2057 * transparent mode support: collect packets 2058 * from the rxring(s). 
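* Slots marked NS_FORWARD (or all slots, if netmap_fwd is set) are collected into the local queue 'q' and handed to the host stack by netmap_send_up() before poll returns.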
2059 * XXX NR_FORWARD should only be read on 2060 * physical or NIC ports 2061 */ 2062 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2063 ND(10, "forwarding some buffers up %d to %d", 2064 kring->nr_hwcur, kring->ring->cur); 2065 netmap_grab_packets(kring, &q, netmap_fwd); 2066 } 2067 2068 if (kring->nm_sync(kring, 0)) 2069 revents |= POLLERR; 2070 if (netmap_no_timestamp == 0 || 2071 kring->ring->flags & NR_TIMESTAMP) { 2072 microtime(&kring->ring->ts); 2073 } 2074 /* after an rxsync we can use kring->rcur, rtail */ 2075 found = kring->rcur != kring->rtail; 2076 nm_kr_put(kring); 2077 if (found) { 2078 revents |= want_rx; 2079 retry_rx = 0; 2080 na->nm_notify(na, i, NR_RX, 0); 2081 } 2082 } 2083 2084 /* transparent mode XXX only during first pass ? */ 2085 if (na->na_flags & NAF_HOST_RINGS) { 2086 kring = &na->rx_rings[na->num_rx_rings]; 2087 if (check_all_rx 2088 && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { 2089 /* XXX fix to use kring fields */ 2090 if (nm_ring_empty(kring->ring)) 2091 send_down = netmap_rxsync_from_host(na, td, dev); 2092 if (!nm_ring_empty(kring->ring)) 2093 revents |= want_rx; 2094 } 2095 } 2096 2097 if (retry_rx && !is_kevent) 2098 selrecord(td, check_all_rx ? 2099 &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); 2100 if (send_down > 0 || retry_rx) { 2101 retry_rx = 0; 2102 if (send_down) 2103 goto flush_tx; /* and retry_rx */ 2104 else 2105 goto do_retry_rx; 2106 } 2107 } 2108 2109 /* 2110 * Transparent mode: marked bufs on rx rings between 2111 * kring->nr_hwcur and ring->head 2112 * are passed to the other endpoint. 2113 * 2114 * In this mode we also scan the sw rxring, which in 2115 * turn passes packets up. 2116 * 2117 * XXX Transparent mode at the moment requires to bind all 2118 * rings to a single file descriptor. 2119 */ 2120 2121 if (q.head) 2122 netmap_send_up(na->ifp, &q); 2123 2124 return (revents); 2125 } 2126 2127 2128 /*-------------------- driver support routines -------------------*/ 2129 2130 static int netmap_hw_krings_create(struct netmap_adapter *); 2131 2132 static int 2133 netmap_notify(struct netmap_adapter *na, u_int n_ring, 2134 enum txrx tx, int flags) 2135 { 2136 struct netmap_kring *kring; 2137 2138 if (tx == NR_TX) { 2139 kring = na->tx_rings + n_ring; 2140 OS_selwakeup(&kring->si, PI_NET); 2141 if (na->tx_si_users > 0) 2142 OS_selwakeup(&na->tx_si, PI_NET); 2143 } else { 2144 kring = na->rx_rings + n_ring; 2145 OS_selwakeup(&kring->si, PI_NET); 2146 if (na->rx_si_users > 0) 2147 OS_selwakeup(&na->rx_si, PI_NET); 2148 } 2149 return 0; 2150 } 2151 2152 2153 // XXX check handling of failures 2154 int 2155 netmap_attach_common(struct netmap_adapter *na) 2156 { 2157 struct ifnet *ifp = na->ifp; 2158 2159 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { 2160 D("%s: invalid rings tx %d rx %d", 2161 ifp->if_xname, na->num_tx_rings, na->num_rx_rings); 2162 return EINVAL; 2163 } 2164 WNA(ifp) = na; 2165 2166 /* the following is only needed for na that use the host port. 2167 * XXX do we have something similar for linux ? 
2168 */ 2169 #ifdef __FreeBSD__ 2170 na->if_input = ifp->if_input; /* for netmap_send_up */ 2171 #endif /* __FreeBSD__ */ 2172 2173 NETMAP_SET_CAPABLE(ifp); 2174 if (na->nm_krings_create == NULL) { 2175 na->nm_krings_create = netmap_hw_krings_create; 2176 na->nm_krings_delete = netmap_hw_krings_delete; 2177 } 2178 if (na->nm_notify == NULL) 2179 na->nm_notify = netmap_notify; 2180 na->active_fds = 0; 2181 2182 if (na->nm_mem == NULL) 2183 na->nm_mem = &nm_mem; 2184 return 0; 2185 } 2186 2187 2188 void 2189 netmap_detach_common(struct netmap_adapter *na) 2190 { 2191 if (na->ifp) 2192 WNA(na->ifp) = NULL; /* XXX do we need this? */ 2193 2194 if (na->tx_rings) { /* XXX should not happen */ 2195 D("freeing leftover tx_rings"); 2196 na->nm_krings_delete(na); 2197 } 2198 netmap_pipe_dealloc(na); 2199 if (na->na_flags & NAF_MEM_OWNER) 2200 netmap_mem_private_delete(na->nm_mem); 2201 bzero(na, sizeof(*na)); 2202 free(na, M_DEVBUF); 2203 } 2204 2205 2206 /* 2207 * Initialize a ``netmap_adapter`` object created by driver on attach. 2208 * We allocate a block of memory with room for a struct netmap_adapter 2209 * plus two sets of N+2 struct netmap_kring (where N is the number 2210 * of hardware rings): 2211 * krings 0..N-1 are for the hardware queues. 2212 * kring N is for the host stack queue 2213 * kring N+1 is only used for the selinfo for all queues. // XXX still true ? 2214 * Return 0 on success, ENOMEM otherwise. 2215 */ 2216 int 2217 netmap_attach(struct netmap_adapter *arg) 2218 { 2219 struct netmap_hw_adapter *hwna = NULL; 2220 // XXX when is arg == NULL ? 2221 struct ifnet *ifp = arg ? arg->ifp : NULL; 2222 2223 if (arg == NULL || ifp == NULL) 2224 goto fail; 2225 hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); 2226 if (hwna == NULL) 2227 goto fail; 2228 hwna->up = *arg; 2229 hwna->up.na_flags |= NAF_HOST_RINGS; 2230 if (netmap_attach_common(&hwna->up)) { 2231 free(hwna, M_DEVBUF); 2232 goto fail; 2233 } 2234 netmap_adapter_get(&hwna->up); 2235 2236 #ifdef linux 2237 if (ifp->netdev_ops) { 2238 /* prepare a clone of the netdev ops */ 2239 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2240 hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2241 #else 2242 hwna->nm_ndo = *ifp->netdev_ops; 2243 #endif 2244 } 2245 hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2246 #endif /* linux */ 2247 2248 D("success for %s", NM_IFPNAME(ifp)); 2249 return 0; 2250 2251 fail: 2252 D("fail, arg %p ifp %p na %p", arg, ifp, hwna); 2253 netmap_detach(ifp); 2254 return (hwna ? EINVAL : ENOMEM); 2255 } 2256 2257 2258 void 2259 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) 2260 { 2261 if (!na) { 2262 return; 2263 } 2264 2265 refcount_acquire(&na->na_refcount); 2266 } 2267 2268 2269 /* returns 1 iff the netmap_adapter is destroyed */ 2270 int 2271 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) 2272 { 2273 if (!na) 2274 return 1; 2275 2276 if (!refcount_release(&na->na_refcount)) 2277 return 0; 2278 2279 if (na->nm_dtor) 2280 na->nm_dtor(na); 2281 2282 netmap_detach_common(na); 2283 2284 return 1; 2285 } 2286 2287 int 2288 netmap_hw_krings_create(struct netmap_adapter *na) 2289 { 2290 int ret = netmap_krings_create(na, 0); 2291 if (ret == 0) { 2292 /* initialize the mbq for the sw rx ring */ 2293 mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); 2294 ND("initialized sw rx queue %d", na->num_rx_rings); 2295 } 2296 return ret; 2297 } 2298 2299 2300 2301 /* 2302 * Free the allocated memory linked to the given ``netmap_adapter`` 2303 * object. 
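* If other users still hold references to the adapter, na->ifp is only cleared here and the memory is freed by the last netmap_adapter_put().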
2304 */ 2305 void 2306 netmap_detach(struct ifnet *ifp) 2307 { 2308 struct netmap_adapter *na = NA(ifp); 2309 2310 if (!na) 2311 return; 2312 2313 NMG_LOCK(); 2314 netmap_disable_all_rings(ifp); 2315 if (!netmap_adapter_put(na)) { 2316 /* someone is still using the adapter, 2317 * tell them that the interface is gone 2318 */ 2319 na->ifp = NULL; 2320 /* give them a chance to notice */ 2321 netmap_enable_all_rings(ifp); 2322 } 2323 NMG_UNLOCK(); 2324 } 2325 2326 2327 /* 2328 * Intercept packets from the network stack and pass them 2329 * to netmap as incoming packets on the 'software' ring. 2330 * 2331 * We only store packets in a bounded mbq and then copy them 2332 * in the relevant rxsync routine. 2333 * 2334 * We rely on the OS to make sure that the ifp and na do not go 2335 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 2336 * In nm_register() or whenever there is a reinitialization, 2337 * we make sure to make the mode change visible here. 2338 */ 2339 int 2340 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 2341 { 2342 struct netmap_adapter *na = NA(ifp); 2343 struct netmap_kring *kring; 2344 u_int len = MBUF_LEN(m); 2345 u_int error = ENOBUFS; 2346 struct mbq *q; 2347 int space; 2348 2349 // XXX [Linux] we do not need this lock 2350 // if we follow the down/configure/up protocol -gl 2351 // mtx_lock(&na->core_lock); 2352 2353 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { 2354 D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); 2355 error = ENXIO; 2356 goto done; 2357 } 2358 2359 kring = &na->rx_rings[na->num_rx_rings]; 2360 q = &kring->rx_queue; 2361 2362 // XXX reconsider long packets if we handle fragments 2363 if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ 2364 D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), 2365 len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); 2366 goto done; 2367 } 2368 2369 /* protect against rxsync_from_host(), netmap_sw_to_nic() 2370 * and maybe other instances of netmap_transmit (the latter 2371 * not possible on Linux). 2372 * Also avoid overflowing the queue. 2373 */ 2374 mtx_lock(&q->lock); 2375 2376 space = kring->nr_hwtail - kring->nr_hwcur; 2377 if (space < 0) 2378 space += kring->nkr_num_slots; 2379 if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX 2380 RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", 2381 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), 2382 len, m); 2383 } else { 2384 mbq_enqueue(q, m); 2385 ND(10, "%s %d bufs in queue len %d m %p", 2386 NM_IFPNAME(ifp), mbq_len(q), len, m); 2387 /* notify outside the lock */ 2388 m = NULL; 2389 error = 0; 2390 } 2391 mtx_unlock(&q->lock); 2392 2393 done: 2394 if (m) 2395 m_freem(m); 2396 /* unconditionally wake up listeners */ 2397 na->nm_notify(na, na->num_rx_rings, NR_RX, 0); 2398 2399 return (error); 2400 } 2401 2402 2403 /* 2404 * netmap_reset() is called by the driver routines when reinitializing 2405 * a ring. The driver is in charge of locking to protect the kring. 2406 * If native netmap mode is not set just return NULL. 
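 *
 * A driver's ring-initialization path typically uses it along these
 * lines (a sketch only; 'rxr' and its 'me' ring index are hypothetical,
 * driver-specific names):
 *
 *	slot = netmap_reset(na, NR_RX, rxr->me, 0);
 *	if (slot) {
 *		// netmap mode: attach netmap buffers instead of mbufs
 *	}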
2407 */ 2408 struct netmap_slot * 2409 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 2410 u_int new_cur) 2411 { 2412 struct netmap_kring *kring; 2413 int new_hwofs, lim; 2414 2415 if (na == NULL) { 2416 D("NULL na, should not happen"); 2417 return NULL; /* no netmap support here */ 2418 } 2419 if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { 2420 ND("interface not in netmap mode"); 2421 return NULL; /* nothing to reinitialize */ 2422 } 2423 2424 /* XXX note- in the new scheme, we are not guaranteed to be 2425 * under lock (e.g. when called on a device reset). 2426 * In this case, we should set a flag and do not trust too 2427 * much the values. In practice: TODO 2428 * - set a RESET flag somewhere in the kring 2429 * - do the processing in a conservative way 2430 * - let the *sync() fixup at the end. 2431 */ 2432 if (tx == NR_TX) { 2433 if (n >= na->num_tx_rings) 2434 return NULL; 2435 kring = na->tx_rings + n; 2436 // XXX check whether we should use hwcur or rcur 2437 new_hwofs = kring->nr_hwcur - new_cur; 2438 } else { 2439 if (n >= na->num_rx_rings) 2440 return NULL; 2441 kring = na->rx_rings + n; 2442 new_hwofs = kring->nr_hwtail - new_cur; 2443 } 2444 lim = kring->nkr_num_slots - 1; 2445 if (new_hwofs > lim) 2446 new_hwofs -= lim + 1; 2447 2448 /* Always set the new offset value and realign the ring. */ 2449 if (netmap_verbose) 2450 D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", 2451 NM_IFPNAME(na->ifp), 2452 tx == NR_TX ? "TX" : "RX", n, 2453 kring->nkr_hwofs, new_hwofs, 2454 kring->nr_hwtail, 2455 tx == NR_TX ? lim : kring->nr_hwtail); 2456 kring->nkr_hwofs = new_hwofs; 2457 if (tx == NR_TX) { 2458 kring->nr_hwtail = kring->nr_hwcur + lim; 2459 if (kring->nr_hwtail > lim) 2460 kring->nr_hwtail -= lim + 1; 2461 } 2462 2463 #if 0 // def linux 2464 /* XXX check that the mappings are correct */ 2465 /* need ring_nr, adapter->pdev, direction */ 2466 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 2467 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 2468 D("error mapping rx netmap buffer %d", i); 2469 // XXX fix error handling 2470 } 2471 2472 #endif /* linux */ 2473 /* 2474 * Wakeup on the individual and global selwait 2475 * We do the wakeup here, but the ring is not yet reconfigured. 2476 * However, we are under lock so there are no races. 2477 */ 2478 na->nm_notify(na, n, tx, 0); 2479 return kring->ring->slot; 2480 } 2481 2482 2483 /* 2484 * Dispatch rx/tx interrupts to the netmap rings. 2485 * 2486 * "work_done" is non-null on the RX path, NULL for the TX path. 2487 * We rely on the OS to make sure that there is only one active 2488 * instance per queue, and that there is appropriate locking. 2489 * 2490 * The 'notify' routine depends on what the ring is attached to. 2491 * - for a netmap file descriptor, do a selwakeup on the individual 2492 * waitqueue, plus one on the global one if needed 2493 * - for a switch, call the proper forwarding routine 2494 * - XXX more ? 2495 */ 2496 void 2497 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) 2498 { 2499 struct netmap_adapter *na = NA(ifp); 2500 struct netmap_kring *kring; 2501 2502 q &= NETMAP_RING_MASK; 2503 2504 if (netmap_verbose) { 2505 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 2506 } 2507 2508 if (work_done) { /* RX path */ 2509 if (q >= na->num_rx_rings) 2510 return; // not a physical queue 2511 kring = na->rx_rings + q; 2512 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 
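/* NKR_PENDINTR tells the next rxsync that the NIC may have new packets; the notify below wakes any thread sleeping on this ring (and on the global selinfo, if in use) */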
2513 na->nm_notify(na, q, NR_RX, 0); 2514 *work_done = 1; /* do not fire napi again */ 2515 } else { /* TX path */ 2516 if (q >= na->num_tx_rings) 2517 return; // not a physical queue 2518 kring = na->tx_rings + q; 2519 na->nm_notify(na, q, NR_TX, 0); 2520 } 2521 } 2522 2523 2524 /* 2525 * Default functions to handle rx/tx interrupts from a physical device. 2526 * "work_done" is non-null on the RX path, NULL for the TX path. 2527 * 2528 * If the card is not in netmap mode, simply return 0, 2529 * so that the caller proceeds with regular processing. 2530 * Otherwise call netmap_common_irq() and return 1. 2531 * 2532 * If the card is connected to a netmap file descriptor, 2533 * do a selwakeup on the individual queue, plus one on the global one 2534 * if needed (multiqueue card _and_ there are multiqueue listeners), 2535 * and return 1. 2536 * 2537 * Finally, if called on rx from an interface connected to a switch, 2538 * calls the proper forwarding routine, and return 1. 2539 */ 2540 int 2541 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 2542 { 2543 // XXX could we check NAF_NATIVE_ON ? 2544 if (!(ifp->if_capenable & IFCAP_NETMAP)) 2545 return 0; 2546 2547 if (NA(ifp)->na_flags & NAF_SKIP_INTR) { 2548 ND("use regular interrupt"); 2549 return 0; 2550 } 2551 2552 netmap_common_irq(ifp, q, work_done); 2553 return 1; 2554 } 2555 2556 2557 /* 2558 * Module loader and unloader 2559 * 2560 * netmap_init() creates the /dev/netmap device and initializes 2561 * all global variables. Returns 0 on success, errno on failure 2562 * (but there is no chance) 2563 * 2564 * netmap_fini() destroys everything. 2565 */ 2566 2567 static struct cdev *netmap_dev; /* /dev/netmap character device. */ 2568 extern struct cdevsw netmap_cdevsw; 2569 2570 2571 void 2572 netmap_fini(void) 2573 { 2574 // XXX destroy_bridges() ? 2575 if (netmap_dev) 2576 destroy_dev(netmap_dev); 2577 netmap_mem_fini(); 2578 NMG_LOCK_DESTROY(); 2579 printf("netmap: unloaded module.\n"); 2580 } 2581 2582 2583 int 2584 netmap_init(void) 2585 { 2586 int error; 2587 2588 NMG_LOCK_INIT(); 2589 2590 error = netmap_mem_init(); 2591 if (error != 0) 2592 goto fail; 2593 /* XXX could use make_dev_credv() to get error number */ 2594 netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, 2595 "netmap"); 2596 if (!netmap_dev) 2597 goto fail; 2598 2599 netmap_init_bridges(); 2600 printf("netmap: loaded module\n"); 2601 return (0); 2602 fail: 2603 netmap_fini(); 2604 return (EINVAL); /* may be incorrect */ 2605 } 2606
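/*
 * The disabled sketch below illustrates how a physical NIC driver is
 * expected to plug into the routines above: netmap_attach() from the
 * driver attach path, netmap_rx_irq() from the interrupt handler, and
 * netmap_detach() on removal. It is an illustrative sketch only: the
 * mydrv_* names and softc fields are hypothetical, and the callback
 * assignments assume the nm_txsync/nm_rxsync/nm_register fields used
 * by the existing driver patches (ixgbe, em, ...).
 */
#if 0	/* example, not compiled */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>

/* called once at the end of the driver attach routine */
static void
mydrv_netmap_attach(struct mydrv_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_txsync = mydrv_netmap_txsync;	/* driver-specific callbacks */
	na.nm_rxsync = mydrv_netmap_rxsync;
	na.nm_register = mydrv_netmap_reg;
	netmap_attach(&na);	/* allocates and links the hw adapter */
}

/* RX interrupt handler: let netmap steal the event when active */
static void
mydrv_rx_intr(struct mydrv_softc *sc, u_int ring_nr)
{
	u_int work_done;

	if (netmap_rx_irq(sc->ifp, ring_nr, &work_done))
		return;	/* ring in netmap mode, wakeup already done */
	/* ... regular mbuf-based RX processing ... */
}

/* driver detach path */
static void
mydrv_netmap_detach(struct mydrv_softc *sc)
{
	netmap_detach(sc->ifp);
}
#endif /* example */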