/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * $FreeBSD$
 *
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
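
A minimal userspace sketch of steps 1-6 above (illustrative only, not code
from this module; the interface name and the use of the net/netmap_user.h
helpers are assumptions, and error handling is omitted):

    // needs <net/netmap_user.h>, <sys/ioctl.h>, <sys/mman.h>, <poll.h>, <fcntl.h>
    struct nmreq req = { .nr_version = NETMAP_API, .nr_flags = NR_REG_ALL_NIC };
    int fd = open("/dev/netmap", O_RDWR);               // step 1
    strncpy(req.nr_name, "em0", sizeof(req.nr_name));   // example interface
    ioctl(fd, NIOCREGIF, &req);                         // step 2: bind the fd
    void *mem = mmap(NULL, req.nr_memsize,              // step 3: shared region
        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
    struct netmap_ring *txring = NETMAP_TXRING(nifp, 0); // step 4: rings/buffers
    ioctl(fd, NIOCTXSYNC, NULL);                        // step 5: sync with kernel
    struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
    poll(&pfd, 1, -1);                                  // step 6: wait for events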

LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring a new port or deleting one, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */


/* --- internals ----
 *
 * Roadmap to the code that implements the above.
 *
 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
 * >    select()able file descriptors on which events are reported.
 *
 *  Internally, we allocate a netmap_priv_d structure, that will be
 *  initialized on ioctl(NIOCREGIF).
 *
 *  os-specific:
 *	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
 *		     per-thread.
 *	    linux:   linux_netmap_open (netmap_linux.c). The priv is
 *		     per-open.
 *
 * > 2. on each descriptor, the process issues an ioctl() to identify
 * >    the interface that should report events to the file descriptor.
 *
 *  Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
 *  Most important things happen in netmap_get_na() and
 *  netmap_do_regif(), called from there. Additional details can be
 *  found in the comments above those functions.
 *
 *  In all cases, this action creates/takes-a-reference-to a
 *  netmap_*_adapter describing the port, and allocates a netmap_if
 *  and all necessary netmap rings, filling them with netmap buffers.
 *
 *  In this phase, the sync callbacks for each ring are set (these are used
 *  in steps 5 and 6 below). The callbacks depend on the type of adapter.
 *  The adapter creation/initialization code puts them in the
 *  netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they
 *  are copied from there to the netmap_kring's during netmap_do_regif(), by
 *  the nm_krings_create() callback. All the nm_krings_create callbacks
 *  actually call netmap_krings_create() to perform this and the other
 *  common stuff. netmap_krings_create() also takes care of the host rings,
 *  if needed, by setting their sync callbacks appropriately.
 *
 *  Additional actions depend on the kind of netmap_adapter that has been
 *  registered:
 *
 *  - netmap_hw_adapter:		[netmap.c]
 *	This is a system netdev/ifp with native netmap support.
 *	The ifp is detached from the host stack by redirecting:
 *	  - transmissions (from the network stack) to netmap_transmit()
 *	  - receive notifications to the nm_notify() callback for
 *	    this adapter. The callback is normally netmap_notify(), unless
 *	    the ifp is attached to a bridge using bwrap, in which case it
 *	    is netmap_bwrap_intr_notify().
 *
 *  - netmap_generic_adapter:		[netmap_generic.c]
 *	A system netdev/ifp without native netmap support.
 *
 *	(the decision about native/non native support is taken in
 *	 netmap_get_hw_na(), called by netmap_get_na())
 *
 *  - netmap_vp_adapter		[netmap_vale.c]
 *	Returned by netmap_get_bdg_na().
 *	This is a persistent or ephemeral VALE port. Ephemeral ports
 *	are created on the fly if they don't already exist, and are
 *	always attached to a bridge.
 *	Persistent VALE ports must be created separately, and then
 *	attached like normal NICs. The NIOCREGIF we are examining
 *	will find them only if they had previously been created and
 *	attached (see VALE_CTL below).
 *
 *  - netmap_pipe_adapter		[netmap_pipe.c]
 *	Returned by netmap_get_pipe_na().
 *	Both pipe ends are created, if they didn't already exist.
 *
 *  - netmap_monitor_adapter		[netmap_monitor.c]
 *	Returned by netmap_get_monitor_na().
 *	If successful, the nm_sync callbacks of the monitored adapter
 *	will be intercepted by the returned monitor.
 *
 *  - netmap_bwrap_adapter		[netmap_vale.c]
 *	Cannot be obtained in this way, see VALE_CTL below
 *
 *
 *  os-specific:
 *	linux: we first go through linux_netmap_ioctl() to
 *	       adapt the FreeBSD interface to the linux one.
 *
 *
 * > 3. on each descriptor, the process issues an mmap() request to
 * >    map the shared memory region within the process' address space.
 * >    The list of interesting queues is indicated by a location in
 * >    the shared memory region.
 *
 *  os-specific:
 *	FreeBSD: netmap_mmap_single (netmap_freebsd.c).
 *	linux:   linux_netmap_mmap (netmap_linux.c).
 *
 * > 4. using the functions in the netmap(4) userspace API, a process
 * >    can look up the occupation state of a queue, access memory buffers,
 * >    and retrieve received packets or enqueue packets to transmit.
 *
 *  These actions do not involve the kernel.
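 *
 *  As an illustration of step 4 (a sketch, not code from this module; it
 *  assumes a descriptor already registered and mmap()ed as in the example
 *  near the top of this file, and uses the net/netmap_user.h helpers):
 *
 *	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);
 *	while (!nm_ring_empty(rxring)) {
 *		u_int i = rxring->cur;
 *		struct netmap_slot *slot = &rxring->slot[i];
 *		char *buf = NETMAP_BUF(rxring, slot->buf_idx);
 *		consume(buf, slot->len);	// hypothetical consumer
 *		rxring->head = rxring->cur = nm_ring_next(rxring, i);
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);	// step 5: tell the kernel (or poll())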
 *
 * > 5. using some ioctl()s the process can synchronize the userspace view
 * >    of the queue with the actual status in the kernel. This includes both
 * >    receiving the notification of new packets, and transmitting new
 * >    packets on the output interface.
 *
 *  These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
 *  cases. They invoke the nm_sync callbacks on the netmap_kring
 *  structures, as initialized in step 2 and maybe later modified
 *  by a monitor. Monitors, however, will always call the original
 *  callback before doing anything else.
 *
 *
 * > 6. select() or poll() can be used to wait for events on individual
 * >    transmit or receive queues (or all queues for a given interface).
 *
 *  Implemented in netmap_poll(). This will call the same nm_sync()
 *  callbacks as in step 5 above.
 *
 *  os-specific:
 *	linux: we first go through linux_netmap_poll() to adapt
 *	       the FreeBSD interface to the linux one.
 *
 *
 *  ---- VALE_CTL -----
 *
 *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
 *  nr_cmd in the nmreq structure. These subcommands are handled by
 *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
 *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
 *  subcommands, respectively.
 *
 *  Any network interface known to the system (including a persistent VALE
 *  port) can be attached to a VALE switch by issuing the
 *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
 *  look exactly like ephemeral VALE ports (as created in step 2 above). The
 *  attachment of other interfaces, instead, requires the creation of a
 *  netmap_bwrap_adapter. Moreover, the attached interface must be put in
 *  netmap mode. This may require the creation of a netmap_generic_adapter if
 *  we have no native support for the interface, or if generic adapters have
 *  been forced by sysctl.
 *
 *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
 *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
 *  callback. In the case of the bwrap, the callback creates the
 *  netmap_bwrap_adapter. The initialization of the bwrap is then
 *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
 *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
 *  A generic adapter for the wrapped ifp will be created if needed, when
 *  netmap_get_bdg_na() calls netmap_get_hw_na().
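 *
 *  For example (a sketch, not code from this module; switch and interface
 *  names are examples), attaching an existing interface to a VALE switch
 *  from userspace looks roughly like:
 *
 *	struct nmreq req;
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "vale0:em1", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &req);	// fd is an open /dev/netmap descriptor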
 *
 *
 *  ---- DATAPATHS -----
 *
 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
 *
 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
 *
 *    - tx from netmap userspace:
 *	 concurrently:
 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_txsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - rx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_rxsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - rx from host stack
 *       concurrently:
 *           1) host stack
 *                netmap_transmit()
 *                  na->nm_notify  == netmap_notify()
 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == netmap_rxsync_from_host_compat
 *                  netmap_rxsync_from_host(na, NULL, NULL)
 *    - tx to host stack
 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *             kring->nm_sync() == netmap_txsync_to_host_compat
 *               netmap_txsync_to_host(na)
 *                 NM_SEND_UP()
 *                   FreeBSD: na->if_input() == ?? XXX
 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
 *
 *
 *
 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
 *
 *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
 *
 *    - tx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *               kring->nm_sync() == generic_netmap_txsync()
 *                   linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
 *                       generic_ndo_start_xmit()
 *                           orig. dev. start_xmit
 *                   FreeBSD: na->if_transmit() == orig. dev if_transmit
 *           2) generic_mbuf_destructor()
 *                   na->nm_notify() == netmap_notify()
 *    - rx from netmap userspace:
 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *               kring->nm_sync() == generic_netmap_rxsync()
 *                   mbq_safe_dequeue()
 *           2) device driver
 *               generic_rx_handler()
 *                   mbq_safe_enqueue()
 *                   na->nm_notify() == netmap_notify()
 *    - rx from host stack:
 *        concurrently:
 *           1) host stack
 *               linux: generic_ndo_start_xmit()
 *                   netmap_transmit()
 *               FreeBSD: ifp->if_input() == netmap_transmit
 *               both:
 *                   na->nm_notify() == netmap_notify()
 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *               kring->nm_sync() == netmap_rxsync_from_host_compat
 *                   netmap_rxsync_from_host(na, NULL, NULL)
 *    - tx to host stack:
 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *             kring->nm_sync() == netmap_txsync_to_host_compat
 *               netmap_txsync_to_host(na)
 *                 NM_SEND_UP()
 *                   FreeBSD: na->if_input() == ??? XXX
 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
 *
 *
 *               -= VALE =-
 *
 *   INCOMING:
 *
 *      - VALE ports:
 *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *              kring->nm_sync() == netmap_vp_txsync()
 *
 *      - system device with native support:
 *         from cable:
 *             interrupt
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
 *                     netmap_vp_txsync()
 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
 *         from host stack:
 *             netmap_transmit()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
 *                     kring->nm_sync() == netmap_rxsync_from_host_compat()
 *                     netmap_vp_txsync()
 *
 *      - system device with generic support:
 *         from device driver:
 *            generic_rx_handler()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
 *                     kring->nm_sync() == generic_netmap_rxsync()
 *                     netmap_vp_txsync()
 *                     kring->nm_sync() == generic_netmap_rxsync()
 *         from host stack:
 *            netmap_transmit()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
 *                     kring->nm_sync() == netmap_rxsync_from_host_compat()
 *                     netmap_vp_txsync()
 *
 *      (all cases) --> nm_bdg_flush()
 *                         dest_na->nm_notify() == (see below)
 *
 *   OUTGOING:
 *
 *      - VALE ports:
 *         concurrently:
 *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                    kring->nm_sync() == netmap_vp_rxsync()
 *             2) from nm_bdg_flush()
 *                    na->nm_notify() == netmap_notify()
 *
 *      - system device with native support:
 *          to cable:
 *             na->nm_notify() == netmap_bwrap_notify()
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == DEVICE_netmap_txsync()
 *                 netmap_vp_rxsync()
 *          to host stack:
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == netmap_txsync_to_host_compat
 *                 netmap_vp_rxsync_locked()
 *
 *      - system device with generic adapter:
 *          to device driver:
 *             na->nm_notify() == netmap_bwrap_notify()
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == generic_netmap_txsync()
 *                 netmap_vp_rxsync()
 *          to host stack:
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == netmap_txsync_to_host_compat
 *                 netmap_vp_rxsync()
 *
 */


/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/filio.h>	/* FIONBIO */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <net/vnet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


/* reduce conditional code */
// linux API, use for the knlist in FreeBSD
/* use a private mutex for the knlist */
#define init_waitqueue_head(x) do {			\
	struct mtx *m = &(x)->m;			\
	mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);	\
	knlist_init_mtx(&(x)->si.si_note, m);		\
    } while (0)

#define OS_selrecord(a, b)	selrecord(a, &((b)->si))
#define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)

#elif defined(linux)

#include "bsd_glue.h"



#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>


MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");

/* user-controlled variables */
int netmap_verbose;

static int netmap_no_timestamp; /* don't timestamp on rxsync */

SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
int netmap_mitigate = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
int netmap_no_pendintr = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
    CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
int netmap_txsync_retry = 2;
SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
    &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");

int netmap_adaptive_io = 0;
SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
    &netmap_adaptive_io, 0, "Adaptive I/O on paravirt");

int netmap_flags = 0;	/* debug flags */
int netmap_fwd = 0;	/* force transparent mode */

/*
 * netmap_admode selects the netmap mode to use.
 * Invalid values are reset to NETMAP_ADMODE_BEST
 */
enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
	NETMAP_ADMODE_NATIVE,	/* either native or none */
	NETMAP_ADMODE_GENERIC,	/* force generic */
	NETMAP_ADMODE_LAST };
static int netmap_admode = NETMAP_ADMODE_BEST;

int netmap_generic_mit = 100*1000;	/* Generic mitigation interval in nanoseconds. */
int netmap_generic_ringsize = 1024;	/* Generic ringsize. */
int netmap_generic_rings = 1;	/* number of queues in generic. */

SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0, "");

NMG_LOCK_T	netmap_global_lock;

/*
 * mark the ring as stopped, and run through the locks
 * to make sure other users get to see it.
 */
static void
netmap_disable_ring(struct netmap_kring *kr)
{
	kr->nkr_stopped = 1;
	nm_kr_get(kr);
	mtx_lock(&kr->q_lock);
	mtx_unlock(&kr->q_lock);
	nm_kr_put(kr);
}

/* stop or enable a single ring */
void
netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
{
	if (stopped)
		netmap_disable_ring(NMR(na, t) + ring_id);
	else
		NMR(na, t)[ring_id].nkr_stopped = 0;
}


/* stop or enable all the rings of na */
void
netmap_set_all_rings(struct netmap_adapter *na, int stopped)
{
	int i;
	enum txrx t;

	if (!nm_netmap_on(na))
		return;

	for_rx_tx(t) {
		for (i = 0; i < netmap_real_rings(na, t); i++) {
			netmap_set_ring(na, i, t, stopped);
		}
	}
}

/*
 * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
 * to finish and prevents any new one from starting.  Call this before turning
 * netmap mode off, or before removing the hardware rings (e.g., on module
 * unload).  As a rule of thumb for linux drivers, this should be placed near
 * each napi_disable().
 */
void
netmap_disable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
}

/*
 * Convenience function used in drivers.  Re-enables rxsync and txsync on the
 * adapter's rings.  In linux drivers, this should be placed near each
 * napi_enable().
 */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
}
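
/*
 * Illustrative sketch of the intended usage (not part of this file;
 * DEVICE_reinit() and DEVICE_reset_hw() are hypothetical driver routines):
 * a native driver brackets a hardware reset with the two helpers above.
 *
 *	void
 *	DEVICE_reinit(struct DEVICE_softc *sc)
 *	{
 *		struct ifnet *ifp = sc->ifp;
 *
 *		netmap_disable_all_rings(ifp);	// wait for pending *_*xsync()
 *		DEVICE_reset_hw(sc);		// hypothetical reset routine
 *		netmap_enable_all_rings(ifp);	// allow new syncs again
 *	}
 */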


/*
 * generic bound_checking function
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	}
	if (op && msg)
		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
	return *v;
}


/*
 * packet-dump function, user-supplied or static buffer.
 * The destination buffer must be at least 30+4*len
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	static char _dst[8192];
	int i, j, i0;
	static char hex[] ="0123456789abcdef";
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (!dst)
		dst = _dst;
	if (lim <= 0 || lim > len)
		lim = len;
	o = dst;
	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		sprintf(o, "%5d: ", i);
		o += strlen(o);
		memset(o, ' ', 48);
		i0 = i;
		for (j=0; j < 16 && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		i = i0;
		for (j=0; j < 16 && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}


/*
 * Fetch configuration from the device, to cope with dynamic
 * reconfigurations after loading the module.
 */
/* call with NMG_LOCK held */
int
netmap_update_config(struct netmap_adapter *na)
{
	u_int txr, txd, rxr, rxd;

	txr = txd = rxr = rxd = 0;
	if (na->nm_config == NULL ||
	    na->nm_config(na, &txr, &txd, &rxr, &rxd))
	{
		/* take whatever we had at init time */
		txr = na->num_tx_rings;
		txd = na->num_tx_desc;
		rxr = na->num_rx_rings;
		rxd = na->num_rx_desc;
	}

	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
		return 0; /* nothing changed */
	if (netmap_verbose || na->active_fds > 0) {
		D("stored config %s: txring %d x %d, rxring %d x %d",
			na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc);
		D("new config %s: txring %d x %d, rxring %d x %d",
			na->name, txr, txd, rxr, rxd);
	}
	if (na->active_fds == 0) {
		D("configuration changed (but fine)");
		na->num_tx_rings = txr;
		na->num_tx_desc = txd;
		na->num_rx_rings = rxr;
		na->num_rx_desc = rxd;
		return 0;
	}
	D("configuration changed while active, this is bad...");
	return 1;
}

static void netmap_txsync_to_host(struct netmap_adapter *na);
static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);

/* kring->nm_sync callback for the host tx ring */
static int
netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
{
	(void)flags; /* unused */
	netmap_txsync_to_host(kring->na);
	return 0;
}

/* kring->nm_sync callback for the host rx ring */
static int
netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
{
	(void)flags; /* unused */
	netmap_rxsync_from_host(kring->na, NULL, NULL);
	return 0;
}



/* create the krings array and initialize the fields common to all adapters.
 * The array layout is this:
 *
 *                    +----------+
 * na->tx_rings ----->|          | \
 *                    |          |  } na->num_tx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host tx kring
 * na->rx_rings ----> +----------+
 *                    |          | \
 *                    |          |  } na->num_rx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host rx kring
 *                    +----------+
 * na->tailroom ----->|          | \
 *                    |          |  } tailroom bytes
 *                    |          | /
 *                    +----------+
 *
 * Note: for compatibility, host krings are created even when not needed.
 * The tailroom space is currently used by vale ports for allocating leases.
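 *
 * As an illustration (a sketch, not code from this file): a hardware
 * adapter's nm_krings_create callback typically just wraps this function,
 * optionally asking for tailroom. Something like:
 *
 *	static int
 *	DEVICE_netmap_krings_create(struct netmap_adapter *na)	// hypothetical
 *	{
 *		// no extra tailroom needed for a plain NIC
 *		return netmap_krings_create(na, 0);
 *	}
 *
 * VALE ports instead pass a non-zero tailroom, which ends up in the
 * na->tailroom area shown above.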
 */
/* call with NMG_LOCK held */
int
netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
{
	u_int i, len, ndesc;
	struct netmap_kring *kring;
	u_int n[NR_TXRX];
	enum txrx t;

	/* account for the (possibly fake) host rings */
	n[NR_TX] = na->num_tx_rings + 1;
	n[NR_RX] = na->num_rx_rings + 1;

	len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom;

	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (na->tx_rings == NULL) {
		D("Cannot allocate krings");
		return ENOMEM;
	}
	na->rx_rings = na->tx_rings + n[NR_TX];

	/*
	 * All fields in krings are 0 except the ones initialized below,
	 * but better be explicit on important kring fields.
	 */
	for_rx_tx(t) {
		ndesc = nma_get_ndesc(na, t);
		for (i = 0; i < n[t]; i++) {
			kring = &NMR(na, t)[i];
			bzero(kring, sizeof(*kring));
			kring->na = na;
			kring->ring_id = i;
			kring->tx = t;
			kring->nkr_num_slots = ndesc;
			if (i < nma_get_nrings(na, t)) {
				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
			} else if (i == na->num_tx_rings) {
				kring->nm_sync = (t == NR_TX ?
					netmap_txsync_to_host_compat :
					netmap_rxsync_from_host_compat);
			}
			kring->nm_notify = na->nm_notify;
			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
			/*
			 * IMPORTANT: Always keep one slot empty.
			 */
			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
					nm_txrx2str(t), i);
			ND("ktx %s h %d c %d t %d",
				kring->name, kring->rhead, kring->rcur, kring->rtail);
			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
			init_waitqueue_head(&kring->si);
		}
		init_waitqueue_head(&na->si[t]);
	}

	na->tailroom = na->rx_rings + n[NR_RX];

	return 0;
}


#ifdef __FreeBSD__
static void
netmap_knlist_destroy(NM_SELINFO_T *si)
{
	/* XXX kqueue(9) needed; these will mirror knlist_init. */
	knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
	knlist_destroy(&si->si.si_note);
	/* now we don't need the mutex anymore */
	mtx_destroy(&si->m);
}
#endif /* __FreeBSD__ */


/* undo the actions performed by netmap_krings_create */
/* call with NMG_LOCK held */
void
netmap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_kring *kring = na->tx_rings;
	enum txrx t;

	for_rx_tx(t)
		netmap_knlist_destroy(&na->si[t]);

	/* we rely on the krings layout described above */
	for ( ; kring != na->tailroom; kring++) {
		mtx_destroy(&kring->q_lock);
		netmap_knlist_destroy(&kring->si);
	}
	free(na->tx_rings, M_DEVBUF);
	na->tx_rings = na->rx_rings = na->tailroom = NULL;
}


/*
 * Destructor for NIC ports. They also have an mbuf queue
 * on the rings connected to the host so we need to purge
 * them first.
 */
/* call with NMG_LOCK held */
static void
netmap_hw_krings_delete(struct netmap_adapter *na)
{
	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;

	ND("destroy sw mbq with len %d", mbq_len(q));
	mbq_purge(q);
	mbq_safe_destroy(q);
	netmap_krings_delete(na);
}



/*
 * Undo everything that was done in netmap_do_regif(). In particular,
 * call nm_register(ifp,0) to stop netmap mode on the interface and
 * revert to normal operation.
 */
/* call with NMG_LOCK held */
static void netmap_unset_ringid(struct netmap_priv_d *);
static void netmap_rel_exclusive(struct netmap_priv_d *);
static void
netmap_do_unregif(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

	NMG_LOCK_ASSERT();
	na->active_fds--;
	/* release exclusive use if it was requested on regif */
	netmap_rel_exclusive(priv);
	if (na->active_fds <= 0) {	/* last instance */

		if (netmap_verbose)
			D("deleting last instance for %s", na->name);

#ifdef WITH_MONITOR
		/* walk through all the rings and tell any monitor
		 * that the port is going to exit netmap mode
		 */
		netmap_monitor_stop(na);
#endif
		/*
		 * (TO CHECK) This function is only called
		 * when the last reference to this file descriptor goes
		 * away. This means we cannot have any pending poll()
		 * or interrupt routine operating on the structure.
		 * XXX The file may be closed in a thread while
		 * another thread is using it.
		 * Linux keeps the file opened until the last reference
		 * by any outstanding ioctl/poll or mmap is gone.
		 * FreeBSD does not track mmap()s (but we do) and
		 * wakes up any sleeping poll(). Need to check what
		 * happens if the close() occurs while a concurrent
		 * syscall is running.
		 */
		na->nm_register(na, 0); /* off, clear flags */
		/* Wake up any sleeping threads. netmap_poll will
		 * then return POLLERR
		 * XXX The wake up now must happen during *_down(), when
		 * we order all activities to stop. -gl
		 */
		/* delete rings and buffers */
		netmap_mem_rings_delete(na);
		na->nm_krings_delete(na);
	}
	/* possibly decrement counter of tx_si/rx_si users */
	netmap_unset_ringid(priv);
	/* delete the nifp */
	netmap_mem_if_delete(na, priv->np_nifp);
	/* drop the allocator */
	netmap_mem_deref(na->nm_mem, na);
	/* mark the priv as unregistered */
	priv->np_na = NULL;
	priv->np_nifp = NULL;
}

/* call with NMG_LOCK held */
static __inline int
nm_si_user(struct netmap_priv_d *priv, enum txrx t)
{
	return (priv->np_na != NULL &&
		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
}

/*
 * Destructor of the netmap_priv_d, called when the fd is closed.
 * Action: undo all the things done by NIOCREGIF.
 * On FreeBSD we need to track whether there are active mmap()s,
 * and we use np_active_mmaps for that. On linux, the field is always 0.
 * Return: 1 if we can free priv, 0 otherwise.
 *
 */
/* call with NMG_LOCK held */
int
netmap_dtor_locked(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

	/* number of active mmaps on this fd (FreeBSD only) */
	if (--priv->np_refs > 0) {
		return 0;
	}

	if (!na) {
		return 1; //XXX is it correct?
	}
	netmap_do_unregif(priv);
	netmap_adapter_put(na);
	return 1;
}


/* call with NMG_LOCK *not* held */
void
netmap_dtor(void *data)
{
	struct netmap_priv_d *priv = data;
	int last_instance;

	NMG_LOCK();
	last_instance = netmap_dtor_locked(priv);
	NMG_UNLOCK();
	if (last_instance) {
		bzero(priv, sizeof(*priv)); /* for safety */
		free(priv, M_DEVBUF);
	}
}




/*
 * Handlers for synchronization of the queues from/to the host.
 * Netmap has two operating modes:
 * - in the default mode, the rings connected to the host stack are
 *   just another ring pair managed by userspace;
 * - in transparent mode (XXX to be defined) incoming packets
 *   (from the host or the NIC) are marked as NS_FORWARD upon
 *   arrival, and the user application has a chance to reset the
 *   flag for packets that should be dropped.
 *   On the RXSYNC or poll(), packets in RX rings between
 *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
 *   to the other side.
 * The transfer NIC --> host is relatively easy, just encapsulate
 * into mbufs and we are done. The host --> NIC side is slightly
 * harder because there might not be room in the tx ring so it
 * might take a while before releasing the buffer.
 */
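
/*
 * Illustrative sketch of transparent mode from the application side (not
 * code from this module; it assumes an fd bound to the host ring, the
 * NR_FORWARD ring flag or the dev.netmap.fwd sysctl enabled, and a
 * hypothetical keep() predicate; num_hw_rx_rings stands for the number of
 * hardware rx rings): after a poll() wakeup the application clears
 * NS_FORWARD on the packets it wants dropped, then releases the slots;
 * the still-marked ones are moved to the other side on the next rxsync.
 *
 *	struct netmap_ring *r = NETMAP_RXRING(nifp, num_hw_rx_rings);
 *	u_int cur;
 *	for (cur = r->cur; cur != r->tail; cur = nm_ring_next(r, cur)) {
 *		struct netmap_slot *slot = &r->slot[cur];
 *		if (!keep(slot))
 *			slot->flags &= ~NS_FORWARD;
 *	}
 *	r->head = r->cur = cur;		// release the scanned slots
 */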


/*
 * pass a chain of buffers to the host stack as coming from 'dst'
 * We do not need to lock because the queue is private.
 */
static void
netmap_send_up(struct ifnet *dst, struct mbq *q)
{
	struct mbuf *m;

	/* send packets up, outside the lock */
	while ((m = mbq_dequeue(q)) != NULL) {
		if (netmap_verbose & NM_VERB_HOST)
			D("sending up pkt %p size %d", m, MBUF_LEN(m));
		NM_SEND_UP(dst, m);
	}
	mbq_destroy(q);
}


/*
 * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
 * and pass them up. Drop remaining packets in the unlikely event
 * of an mbuf shortage.
 */
static void
netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
{
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	u_int n;
	struct netmap_adapter *na = kring->na;

	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
		struct mbuf *m;
		struct netmap_slot *slot = &kring->ring->slot[n];

		if ((slot->flags & NS_FORWARD) == 0 && !force)
			continue;
		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
			RD(5, "bad pkt at %d len %d", n, slot->len);
			continue;
		}
		slot->flags &= ~NS_FORWARD; // XXX needed ?
		/* XXX TODO: adapt to the case of a multisegment packet */
		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);

		if (m == NULL)
			break;
		mbq_enqueue(q, m);
	}
}


/*
 * Send to the NIC rings packets marked NS_FORWARD between
 * kring->nr_hwcur and kring->rhead.
 * Called under kring->rx_queue.lock on the sw rx ring.
 */
static u_int
netmap_sw_to_nic(struct netmap_adapter *na)
{
	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
	struct netmap_slot *rxslot = kring->ring->slot;
	u_int i, rxcur = kring->nr_hwcur;
	u_int const head = kring->rhead;
	u_int const src_lim = kring->nkr_num_slots - 1;
	u_int sent = 0;

	/* scan rings to find space, then fill as much as possible */
	for (i = 0; i < na->num_tx_rings; i++) {
		struct netmap_kring *kdst = &na->tx_rings[i];
		struct netmap_ring *rdst = kdst->ring;
		u_int const dst_lim = kdst->nkr_num_slots - 1;

		/* XXX do we trust ring or kring->rcur,rtail ? */
		for (; rxcur != head && !nm_ring_empty(rdst);
		     rxcur = nm_next(rxcur, src_lim) ) {
			struct netmap_slot *src, *dst, tmp;
			u_int dst_cur = rdst->cur;

			src = &rxslot[rxcur];
			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
				continue;

			sent++;

			dst = &rdst->slot[dst_cur];

			tmp = *src;

			src->buf_idx = dst->buf_idx;
			src->flags = NS_BUF_CHANGED;

			dst->buf_idx = tmp.buf_idx;
			dst->len = tmp.len;
			dst->flags = NS_BUF_CHANGED;

			rdst->cur = nm_next(dst_cur, dst_lim);
		}
		/* if (sent) XXX txsync ? */
	}
	return sent;
}


/*
 * netmap_txsync_to_host() passes packets up. We are called from a
 * system call in user process context, and the only contention
 * can be among multiple user threads erroneously calling
 * this routine concurrently.
 */
static void
netmap_txsync_to_host(struct netmap_adapter *na)
{
	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	struct mbq q;

	/* Take packets from hwcur to head and pass them up.
	 * force head = cur since netmap_grab_packets() stops at head
	 * In case of no buffers we give up. At the end of the loop,
	 * the queue is drained in all cases.
	 */
	mbq_init(&q);
	netmap_grab_packets(kring, &q, 1 /* force */);
	ND("have %d pkts in queue", mbq_len(&q));
	kring->nr_hwcur = head;
	kring->nr_hwtail = head + lim;
	if (kring->nr_hwtail > lim)
		kring->nr_hwtail -= lim + 1;

	netmap_send_up(na->ifp, &q);
}


/*
 * rxsync backend for packets coming from the host stack.
 * They have been put in kring->rx_queue by netmap_transmit().
 * We protect access to the kring using kring->rx_queue.lock
 *
 * This routine also does the selrecord if called from the poll handler
 * (we know because td != NULL).
 *
 * NOTE: on linux, selrecord() is defined as a macro and uses pwait
 * as an additional hidden argument.
 * returns the number of packets delivered to tx queues in
 * transparent mode, or a negative value if error
 */
static int
netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
{
	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, n;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	int ret = 0;
	struct mbq *q = &kring->rx_queue, fq;

	(void)pwait;	/* disable unused warnings */
	(void)td;

	mbq_init(&fq); /* fq holds packets to be freed */

	mbq_lock(q);

	/* First part: import newly received packets */
	n = mbq_len(q);
	if (n) { /* grab packets from the queue */
		struct mbuf *m;
		uint32_t stop_i;

		nm_i = kring->nr_hwtail;
		stop_i = nm_prev(nm_i, lim);
		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
			int len = MBUF_LEN(m);
			struct netmap_slot *slot = &ring->slot[nm_i];

			m_copydata(m, 0, len, NMB(na, slot));
			ND("nm %d len %d", nm_i, len);
			if (netmap_verbose)
				D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));

			slot->len = len;
			slot->flags = kring->nkr_slot_flags;
			nm_i = nm_next(nm_i, lim);
			mbq_enqueue(&fq, m);
		}
		kring->nr_hwtail = nm_i;
	}

	/*
	 * Second part: skip past packets that userspace has released.
	 */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) { /* something was released */
		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
			ret = netmap_sw_to_nic(na);
		kring->nr_hwcur = head;
	}

	/* access copies of cur,tail in the kring */
	if (kring->rcur == kring->rtail && td) /* no bufs available */
		OS_selrecord(td, &kring->si);

	mbq_unlock(q);

	mbq_purge(&fq);
	mbq_destroy(&fq);

	return ret;
}


/* Get a netmap adapter for the port.
 *
 * If it is possible to satisfy the request, return 0
 * with *na containing the netmap adapter found.
 * Otherwise return an error code, with *na containing NULL.
 *
 * When the port is attached to a bridge, we always return
 * EBUSY.
 * Otherwise, if the port is already bound to a file descriptor,
 * then we unconditionally return the existing adapter into *na.
 * In all the other cases, we return (into *na) either native,
 * generic or NULL, according to the following table:
 *
 *					native_support
 * active_fds   dev.netmap.admode         YES     NO
 * -------------------------------------------------------
 *    >0              *                 NA(ifp) NA(ifp)
 *
 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
 *
 */

int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
{
	/* generic support */
	int i = netmap_admode;	/* Take a snapshot. */
	struct netmap_adapter *prev_na;
#ifdef WITH_GENERIC
	struct netmap_generic_adapter *gna;
	int error = 0;
#endif

	*na = NULL; /* default */

	/* reset in case of invalid value */
	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
		i = netmap_admode = NETMAP_ADMODE_BEST;

	if (NETMAP_CAPABLE(ifp)) {
		prev_na = NA(ifp);
		/* If an adapter already exists, return it if
		 * there are active file descriptors or if
		 * netmap is not forced to use generic
		 * adapters.
		 */
		if (NETMAP_OWNED_BY_ANY(prev_na)
			|| i != NETMAP_ADMODE_GENERIC
			|| prev_na->na_flags & NAF_FORCE_NATIVE
#ifdef WITH_PIPES
			/* ugly, but we cannot allow an adapter switch
			 * if some pipe is referring to this one
			 */
			|| prev_na->na_next_pipe > 0
#endif
		) {
			*na = prev_na;
			return 0;
		}
	}

	/* If there isn't native support and netmap is not allowed
	 * to use generic adapters, we cannot satisfy the request.
	 */
	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
		return EOPNOTSUPP;

#ifdef WITH_GENERIC
	/* Otherwise, create a generic adapter and return it,
	 * saving the previously used netmap adapter, if any.
	 *
	 * Note that here 'prev_na', if not NULL, MUST be a
	 * native adapter, and CANNOT be a generic one. This is
	 * true because generic adapters are created on demand, and
	 * destroyed when not used anymore. Therefore, if the adapter
	 * currently attached to an interface 'ifp' is generic, it
	 * must be that
	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
	 * Consequently, if NA(ifp) is generic, we will enter one of
	 * the branches above. This ensures that we never override
	 * a generic adapter with another generic adapter.
	 */
	prev_na = NA(ifp);
	error = generic_netmap_attach(ifp);
	if (error)
		return error;

	*na = NA(ifp);
	gna = (struct netmap_generic_adapter*)NA(ifp);
	gna->prev = prev_na; /* save old na */
	if (prev_na != NULL) {
		ifunit_ref(ifp->if_xname);
		// XXX add a refcount ?
		netmap_adapter_get(prev_na);
	}
	ND("Created generic NA %p (prev %p)", gna, gna->prev);

	return 0;
#else /* !WITH_GENERIC */
	return EOPNOTSUPP;
#endif
}


/*
 * MUST BE CALLED UNDER NMG_LOCK()
 *
 * Get a refcounted reference to a netmap adapter attached
 * to the interface specified by nmr.
 * This is always called in the execution of an ioctl().
 *
 * Return ENXIO if the interface specified by the request does
 * not exist, ENOTSUP if netmap is not supported by the interface,
 * EBUSY if the interface is already attached to a bridge,
 * EINVAL if parameters are invalid, ENOMEM if needed resources
 * could not be allocated.
 * If successful, hold a reference to the netmap adapter.
 *
 * No reference is kept on the real interface, which may then
 * disappear at any time.
 */
int
netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	struct ifnet *ifp = NULL;
	int error = 0;
	struct netmap_adapter *ret = NULL;

	*na = NULL; /* default return value */

	NMG_LOCK_ASSERT();

	/* we cascade through all possible types of netmap adapter.
	 * All netmap_get_*_na() functions return an error and an na,
	 * with the following combinations:
	 *
	 * error    na
	 *   0	   NULL		type doesn't match
	 *  !0	   NULL		type matches, but na creation/lookup failed
	 *   0	  !NULL		type matches and na created/found
	 *  !0	  !NULL		impossible
	 */

	/* try to see if this is a monitor port */
	error = netmap_get_monitor_na(nmr, na, create);
	if (error || *na != NULL)
		return error;

	/* try to see if this is a pipe port */
	error = netmap_get_pipe_na(nmr, na, create);
	if (error || *na != NULL)
		return error;

	/* try to see if this is a bridge port */
	error = netmap_get_bdg_na(nmr, na, create);
	if (error)
		return error;

	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
		goto out;

	/*
	 * This must be a hardware na, lookup the name in the system.
	 * Note that by hardware we actually mean "it shows up in ifconfig".
	 * This may still be a tap, a veth/epair, or even a
	 * persistent VALE port.
	 */
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp == NULL) {
		return ENXIO;
	}

	error = netmap_get_hw_na(ifp, &ret);
	if (error)
		goto out;

	*na = ret;
	netmap_adapter_get(ret);

out:
	if (error && ret != NULL)
		netmap_adapter_put(ret);

	if (ifp)
		if_rele(ifp); /* allow live unloading of driver modules */

	return error;
}
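
/*
 * Sketch of the convention used above (illustrative only, not a real
 * backend): a netmap_get_xxx_na() backend returns 0 and leaves *na == NULL
 * when the request is simply not for it, so that netmap_get_na() can fall
 * through to the next candidate.
 *
 *	static int
 *	netmap_get_xxx_na(struct nmreq *nmr, struct netmap_adapter **na,
 *		int create)	// hypothetical backend
 *	{
 *		if (strncmp(nmr->nr_name, "xxx:", 4) != 0)
 *			return 0;	// type doesn't match, *na stays NULL
 *		// otherwise create or look up the adapter: set *na and
 *		// return 0 on success, or an error code on failure
 *		return 0;
 *	}
 */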


/*
 * validate parameters on entry for *_txsync()
 * Returns ring->cur if ok, or something >= kring->nkr_num_slots
 * in case of error.
 *
 * rhead, rcur and rtail=hwtail are stored from previous round.
 * hwcur is the next packet to send to the ring.
 *
 * We want
 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 *
 * hwcur, rhead, rtail and hwtail are reliable
 */
static u_int
nm_txsync_prologue(struct netmap_kring *kring)
{
#define NM_ASSERT(t) if (t) { D("fail " #t); goto error; }
	struct netmap_ring *ring = kring->ring;
	u_int head = ring->head; /* read only once */
	u_int cur = ring->cur; /* read only once */
	u_int n = kring->nkr_num_slots;

	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
	if (kring->nr_hwcur >= n || kring->rhead >= n ||
	    kring->rtail >= n ||  kring->nr_hwtail >= n)
		goto error;
#endif /* kernel sanity checks */
	/*
	 * user sanity checks. We only use 'cur',
	 * A, B, ... are possible positions for cur:
	 *
	 *  0    A  cur   B  tail  C  n-1
	 *  0    D  tail  E  cur   F  n-1
	 *
	 * B, F, D are valid. A, C, E are wrong
	 */
	if (kring->rtail >= kring->rhead) {
		/* want rhead <= head <= rtail */
		NM_ASSERT(head < kring->rhead || head > kring->rtail);
		/* and also head <= cur <= rtail */
		NM_ASSERT(cur < head || cur > kring->rtail);
	} else { /* here rtail < rhead */
		/* we need head outside rtail .. rhead */
		NM_ASSERT(head > kring->rtail && head < kring->rhead);

		/* two cases now: head <= rtail or head >= rhead  */
		if (head <= kring->rtail) {
			/* want head <= cur <= rtail */
			NM_ASSERT(cur < head || cur > kring->rtail);
		} else { /* head >= rhead */
			/* cur must be outside rtail..head */
			NM_ASSERT(cur > kring->rtail && cur < head);
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "tail overwritten was %d need %d",
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	kring->rhead = head;
	kring->rcur = cur;
	return head;

error:
	RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d",
		kring->name,
		head, cur, ring->tail,
		kring->rhead, kring->rcur, kring->rtail,
		kring->nr_hwcur, kring->nr_hwtail);
	return n;
#undef NM_ASSERT
}


/*
 * validate parameters on entry for *_rxsync()
 * Returns ring->head if ok, kring->nkr_num_slots on error.
 *
 * For a valid configuration,
 * hwcur <= head <= cur <= tail <= hwtail
 *
 * We only consider head and cur.
 * hwcur and hwtail are reliable.
 *
 */
static u_int
nm_rxsync_prologue(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	uint32_t const n = kring->nkr_num_slots;
	uint32_t head, cur;

	ND(5,"%s kc %d kt %d h %d c %d t %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However:
	 * - head is not an issue because the previous value is hwcur;
	 * - cur could in principle go back, however it does not matter
	 *   because we are processing a brand new rxsync()
	 */
	cur = kring->rcur = ring->cur;	/* read only once */
	head = kring->rhead = ring->head;	/* read only once */
#if 1 /* kernel sanity checks */
	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
		goto error;
#endif /* kernel sanity checks */
	/* user sanity checks */
	if (kring->nr_hwtail >= kring->nr_hwcur) {
		/* want hwcur <= rhead <= hwtail */
		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
			goto error;
		/* and also rhead <= rcur <= hwtail */
		if (cur < head || cur > kring->nr_hwtail)
			goto error;
	} else {
		/* we need rhead outside hwtail..hwcur */
		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
			goto error;
		/* two cases now: head <= hwtail or head >= hwcur  */
		if (head <= kring->nr_hwtail) {
			/* want head <= cur <= hwtail */
			if (cur < head || cur > kring->nr_hwtail)
				goto error;
		} else {
			/* cur must be outside hwtail..head */
			if (cur < head && cur > kring->nr_hwtail)
				goto error;
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "%s tail overwritten was %d need %d",
			kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	return head;

error:
	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
		kring->nr_hwcur,
		kring->rcur, kring->nr_hwtail,
		kring->rhead, kring->rcur, ring->tail);
	return n;
}


/*
 * Error routine called when txsync/rxsync detects an error.
 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
 * Return 1 on reinit.
 *
 * This routine is only called by the upper half of the kernel.
 * It only reads hwcur (which is changed only by the upper half, too)
 * and hwtail (which may be changed by the lower half, but only on
 * a tx ring and only to increase it, so any error will be recovered
 * on the next call). For the above, we don't strictly need to call
 * it under lock.
 */
int
netmap_ring_reinit(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int i, lim = kring->nkr_num_slots - 1;
	int errors = 0;

	// XXX KASSERT nm_kr_tryget
	RD(10, "called for %s", kring->name);
	// XXX probably wrong to trust userspace
	kring->rhead = ring->head;
	kring->rcur  = ring->cur;
	kring->rtail = ring->tail;

	if (ring->cur > lim)
		errors++;
	if (ring->head > lim)
		errors++;
	if (ring->tail > lim)
		errors++;
	for (i = 0; i <= lim; i++) {
		u_int idx = ring->slot[i].buf_idx;
		u_int len = ring->slot[i].len;
		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
			ring->slot[i].buf_idx = 0;
			ring->slot[i].len = 0;
		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
			ring->slot[i].len = 0;
			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
		}
	}
	if (errors) {
		RD(10, "total %d errors", errors);
		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
			kring->name,
			ring->cur, kring->nr_hwcur,
			ring->tail, kring->nr_hwtail);
		ring->head = kring->rhead = kring->nr_hwcur;
		ring->cur  = kring->rcur  = kring->nr_hwcur;
		ring->tail = kring->rtail = kring->nr_hwtail;
	}
	return (errors ? 1 : 0);
}

/* interpret the ringid and flags fields of an nmreq, by translating them
 * into a pair of intervals of ring indices:
 *
 * [priv->np_txqfirst, priv->np_txqlast) and
 * [priv->np_rxqfirst, priv->np_rxqlast)
 *
 */
int
netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
{
	struct netmap_adapter *na = priv->np_na;
	u_int j, i = ringid & NETMAP_RING_MASK;
	u_int reg = flags & NR_REG_MASK;
	enum txrx t;

	if (reg == NR_REG_DEFAULT) {
		/* convert from old ringid to flags */
		if (ringid & NETMAP_SW_RING) {
			reg = NR_REG_SW;
		} else if (ringid & NETMAP_HW_RING) {
			reg = NR_REG_ONE_NIC;
		} else {
			reg = NR_REG_ALL_NIC;
		}
		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
	}
	switch (reg) {
	case NR_REG_ALL_NIC:
	case NR_REG_PIPE_MASTER:
	case NR_REG_PIPE_SLAVE:
		for_rx_tx(t) {
			priv->np_qfirst[t] = 0;
			priv->np_qlast[t] = nma_get_nrings(na, t);
		}
		ND("%s %d %d", "ALL/PIPE",
			priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
		break;
	case NR_REG_SW:
	case NR_REG_NIC_SW:
		if (!(na->na_flags & NAF_HOST_RINGS)) {
			D("host rings not supported");
			return EINVAL;
		}
		for_rx_tx(t) {
			priv->np_qfirst[t] = (reg == NR_REG_SW ?
				nma_get_nrings(na, t) : 0);
			priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
		}
		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
			priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
		break;
	case NR_REG_ONE_NIC:
		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
			D("invalid ring id %d", i);
			return EINVAL;
		}
		for_rx_tx(t) {
			/* if not enough rings, use the first one */
			j = i;
			if (j >= nma_get_nrings(na, t))
				j = 0;
			priv->np_qfirst[t] = j;
			priv->np_qlast[t] = j + 1;
		}
		break;
	default:
		D("invalid regif type %d", reg);
		return EINVAL;
	}
	priv->np_flags = (flags & ~NR_REG_MASK) | reg;

	if (netmap_verbose) {
		D("%s: tx [%d,%d) rx [%d,%d) id %d",
			na->name,
			priv->np_qfirst[NR_TX],
			priv->np_qlast[NR_TX],
			priv->np_qfirst[NR_RX],
			priv->np_qlast[NR_RX],
			i);
	}
	return 0;
}


/*
 * Set the ring ID. For devices with a single queue, a request
 * for all rings is the same as a single ring.
 */
static int
netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
{
	struct netmap_adapter *na = priv->np_na;
	int error;
	enum txrx t;

	error = netmap_interp_ringid(priv, ringid, flags);
	if (error) {
		return error;
	}

	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;

	/* optimization: count the users registered for more than
	 * one ring, which are the ones sleeping on the global queue.
	 * The default netmap_notify() callback will then
	 * avoid signaling the global queue if nobody is using it
	 */
	for_rx_tx(t) {
		if (nm_si_user(priv, t))
			na->si_users[t]++;
	}
	return 0;
}

static void
netmap_unset_ringid(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;
	enum txrx t;

	for_rx_tx(t) {
		if (nm_si_user(priv, t))
			na->si_users[t]--;
		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
	}
	priv->np_flags = 0;
	priv->np_txpoll = 0;
}


/* check that the rings we want to bind are not exclusively owned by a previous
 * bind.  If exclusive ownership has been requested, we also mark the rings.
1780 */ 1781 static int 1782 netmap_get_exclusive(struct netmap_priv_d *priv) 1783 { 1784 struct netmap_adapter *na = priv->np_na; 1785 u_int i; 1786 struct netmap_kring *kring; 1787 int excl = (priv->np_flags & NR_EXCLUSIVE); 1788 enum txrx t; 1789 1790 ND("%s: grabbing tx [%d, %d) rx [%d, %d)", 1791 na->name, 1792 priv->np_qfirst[NR_TX], 1793 priv->np_qlast[NR_TX], 1794 priv->np_qfirst[NR_RX], 1795 priv->np_qlast[NR_RX]); 1796 1797 /* first round: check that all the requested rings 1798 * are neither alread exclusively owned, nor we 1799 * want exclusive ownership when they are already in use 1800 */ 1801 for_rx_tx(t) { 1802 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { 1803 kring = &NMR(na, t)[i]; 1804 if ((kring->nr_kflags & NKR_EXCLUSIVE) || 1805 (kring->users && excl)) 1806 { 1807 ND("ring %s busy", kring->name); 1808 return EBUSY; 1809 } 1810 } 1811 } 1812 1813 /* second round: increment usage cound and possibly 1814 * mark as exclusive 1815 */ 1816 1817 for_rx_tx(t) { 1818 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { 1819 kring = &NMR(na, t)[i]; 1820 kring->users++; 1821 if (excl) 1822 kring->nr_kflags |= NKR_EXCLUSIVE; 1823 } 1824 } 1825 1826 return 0; 1827 1828 } 1829 1830 /* undo netmap_get_ownership() */ 1831 static void 1832 netmap_rel_exclusive(struct netmap_priv_d *priv) 1833 { 1834 struct netmap_adapter *na = priv->np_na; 1835 u_int i; 1836 struct netmap_kring *kring; 1837 int excl = (priv->np_flags & NR_EXCLUSIVE); 1838 enum txrx t; 1839 1840 ND("%s: releasing tx [%d, %d) rx [%d, %d)", 1841 na->name, 1842 priv->np_qfirst[NR_TX], 1843 priv->np_qlast[NR_TX], 1844 priv->np_qfirst[NR_RX], 1845 priv->np_qlast[MR_RX]); 1846 1847 1848 for_rx_tx(t) { 1849 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { 1850 kring = &NMR(na, t)[i]; 1851 if (excl) 1852 kring->nr_kflags &= ~NKR_EXCLUSIVE; 1853 kring->users--; 1854 } 1855 } 1856 } 1857 1858 /* 1859 * possibly move the interface to netmap-mode. 1860 * If success it returns a pointer to netmap_if, otherwise NULL. 1861 * This must be called with NMG_LOCK held. 1862 * 1863 * The following na callbacks are called in the process: 1864 * 1865 * na->nm_config() [by netmap_update_config] 1866 * (get current number and size of rings) 1867 * 1868 * We have a generic one for linux (netmap_linux_config). 1869 * The bwrap has to override this, since it has to forward 1870 * the request to the wrapped adapter (netmap_bwrap_config). 1871 * 1872 * 1873 * na->nm_krings_create() 1874 * (create and init the krings array) 1875 * 1876 * One of the following: 1877 * 1878 * * netmap_hw_krings_create, (hw ports) 1879 * creates the standard layout for the krings 1880 * and adds the mbq (used for the host rings). 
1881 *
1882 * * netmap_vp_krings_create (VALE ports)
1883 * add leases and scratchpads
1884 *
1885 * * netmap_pipe_krings_create (pipes)
1886 * create the krings and rings of both ends and
1887 * cross-link them
1888 *
1889 * * netmap_monitor_krings_create (monitors)
1890 * avoid allocating the mbq
1891 *
1892 * * netmap_bwrap_krings_create (bwraps)
1893 * create both the bwrap krings array,
1894 * the krings array of the wrapped adapter, and
1895 * (if needed) the fake array for the host adapter
1896 *
1897 * na->nm_register(na, 1)
1898 * (put the adapter in netmap mode)
1899 *
1900 * This may be one of the following:
1901 * (XXX these should be either all *_register or all *_reg 2014-03-15)
1902 *
1903 * * netmap_hw_register (hw ports)
1904 * checks that the ifp is still there, then calls
1905 * the hardware specific callback;
1906 *
1907 * * netmap_vp_reg (VALE ports)
1908 * If the port is connected to a bridge,
1909 * set the NAF_NETMAP_ON flag under the
1910 * bridge write lock.
1911 *
1912 * * netmap_pipe_reg (pipes)
1913 * inform the other pipe end that it is no
1914 * longer responsible for the lifetime of this
1915 * pipe end
1916 *
1917 * * netmap_monitor_reg (monitors)
1918 * intercept the sync callbacks of the monitored
1919 * rings
1920 *
1921 * * netmap_bwrap_register (bwraps)
1922 * cross-link the bwrap and hwna rings,
1923 * forward the request to the hwna, override
1924 * the hwna notify callback (so that frames
1925 * coming from the outside go through the bridge).
1926 *
1927 *
1928 */
1929 int
1930 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1931 uint16_t ringid, uint32_t flags)
1932 {
1933 struct netmap_if *nifp = NULL;
1934 int error;
1935
1936 NMG_LOCK_ASSERT();
1937 /* ring configuration may have changed, fetch from the card */
1938 netmap_update_config(na);
1939 priv->np_na = na; /* store the reference */
1940 error = netmap_set_ringid(priv, ringid, flags);
1941 if (error)
1942 goto err;
1943 error = netmap_mem_finalize(na->nm_mem, na);
1944 if (error)
1945 goto err;
1946
1947 if (na->active_fds == 0) {
1948 /*
1949 * If this is the first registration of the adapter,
1950 * also create the netmap rings and their in-kernel view,
1951 * the netmap krings.
1952 */
1953
1954 /*
1955 * Depending on the adapter, this may also create
1956 * the netmap rings themselves
1957 */
1958 error = na->nm_krings_create(na);
1959 if (error)
1960 goto err_drop_mem;
1961
1962 /* create all missing netmap rings */
1963 error = netmap_mem_rings_create(na);
1964 if (error)
1965 goto err_del_krings;
1966 }
1967
1968 /* now the krings must exist and we can check whether some
1969 * previous bind has exclusive ownership of them
1970 */
1971 error = netmap_get_exclusive(priv);
1972 if (error)
1973 goto err_del_rings;
1974
1975 /* in all cases, create a new netmap if */
1976 nifp = netmap_mem_if_new(na);
1977 if (nifp == NULL) {
1978 error = ENOMEM;
1979 goto err_rel_excl;
1980 }
1981
1982 na->active_fds++;
1983 if (!nm_netmap_on(na)) {
1984 /* Netmap not active, set the card in netmap mode
1985 * and make it use the shared buffers.
1986 */
1987 /* cache the allocator info in the na */
1988 netmap_mem_get_lut(na->nm_mem, &na->na_lut);
1989 ND("%p->na_lut == %p", na, na->na_lut.lut);
1990 error = na->nm_register(na, 1); /* mode on */
1991 if (error)
1992 goto err_del_if;
1993 }
1994
1995 /*
1996 * advertise that the interface is ready by setting np_nifp.
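 * As a sketch of the reader side (this is what netmap_poll() and the
 * NIOC*SYNC cases further below already do, shown only for
 * illustration):
 *
 *	if (priv->np_nifp == NULL)	// not registered yet
 *		return ENXIO;		// (POLLERR in the poll path)
 *	mb();				// paired with the mb() right below
 *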
1997 * The barrier is needed because readers (poll, *SYNC and mmap) 1998 * check for priv->np_nifp != NULL without locking 1999 */ 2000 mb(); /* make sure previous writes are visible to all CPUs */ 2001 priv->np_nifp = nifp; 2002 2003 return 0; 2004 2005 err_del_if: 2006 memset(&na->na_lut, 0, sizeof(na->na_lut)); 2007 na->active_fds--; 2008 netmap_mem_if_delete(na, nifp); 2009 err_rel_excl: 2010 netmap_rel_exclusive(priv); 2011 err_del_rings: 2012 if (na->active_fds == 0) 2013 netmap_mem_rings_delete(na); 2014 err_del_krings: 2015 if (na->active_fds == 0) 2016 na->nm_krings_delete(na); 2017 err_drop_mem: 2018 netmap_mem_deref(na->nm_mem, na); 2019 err: 2020 priv->np_na = NULL; 2021 return error; 2022 } 2023 2024 2025 /* 2026 * update kring and ring at the end of txsync. 2027 */ 2028 static inline void 2029 nm_txsync_finalize(struct netmap_kring *kring) 2030 { 2031 /* update ring tail to what the kernel knows */ 2032 kring->ring->tail = kring->rtail = kring->nr_hwtail; 2033 2034 /* note, head/rhead/hwcur might be behind cur/rcur 2035 * if no carrier 2036 */ 2037 ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", 2038 kring->name, kring->nr_hwcur, kring->nr_hwtail, 2039 kring->rhead, kring->rcur, kring->rtail); 2040 } 2041 2042 2043 /* 2044 * update kring and ring at the end of rxsync 2045 */ 2046 static inline void 2047 nm_rxsync_finalize(struct netmap_kring *kring) 2048 { 2049 /* tell userspace that there might be new packets */ 2050 //struct netmap_ring *ring = kring->ring; 2051 ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, 2052 kring->nr_hwtail); 2053 kring->ring->tail = kring->rtail = kring->nr_hwtail; 2054 /* make a copy of the state for next round */ 2055 kring->rhead = kring->ring->head; 2056 kring->rcur = kring->ring->cur; 2057 } 2058 2059 2060 2061 /* 2062 * ioctl(2) support for the "netmap" device. 2063 * 2064 * Following a list of accepted commands: 2065 * - NIOCGINFO 2066 * - SIOCGIFADDR just for convenience 2067 * - NIOCREGIF 2068 * - NIOCTXSYNC 2069 * - NIOCRXSYNC 2070 * 2071 * Return 0 on success, errno otherwise. 2072 */ 2073 int 2074 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 2075 int fflag, struct thread *td) 2076 { 2077 struct netmap_priv_d *priv = NULL; 2078 struct nmreq *nmr = (struct nmreq *) data; 2079 struct netmap_adapter *na = NULL; 2080 int error; 2081 u_int i, qfirst, qlast; 2082 struct netmap_if *nifp; 2083 struct netmap_kring *krings; 2084 enum txrx t; 2085 2086 (void)dev; /* UNUSED */ 2087 (void)fflag; /* UNUSED */ 2088 2089 if (cmd == NIOCGINFO || cmd == NIOCREGIF) { 2090 /* truncate name */ 2091 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; 2092 if (nmr->nr_version != NETMAP_API) { 2093 D("API mismatch for %s got %d need %d", 2094 nmr->nr_name, 2095 nmr->nr_version, NETMAP_API); 2096 nmr->nr_version = NETMAP_API; 2097 } 2098 if (nmr->nr_version < NETMAP_MIN_API || 2099 nmr->nr_version > NETMAP_MAX_API) { 2100 return EINVAL; 2101 } 2102 } 2103 CURVNET_SET(TD_TO_VNET(td)); 2104 2105 error = devfs_get_cdevpriv((void **)&priv); 2106 if (error) { 2107 CURVNET_RESTORE(); 2108 /* XXX ENOENT should be impossible, since the priv 2109 * is now created in the open */ 2110 return (error == ENOENT ? 
ENXIO : error); 2111 } 2112 2113 switch (cmd) { 2114 case NIOCGINFO: /* return capabilities etc */ 2115 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 2116 error = netmap_bdg_ctl(nmr, NULL); 2117 break; 2118 } 2119 2120 NMG_LOCK(); 2121 do { 2122 /* memsize is always valid */ 2123 struct netmap_mem_d *nmd = &nm_mem; 2124 u_int memflags; 2125 2126 if (nmr->nr_name[0] != '\0') { 2127 /* get a refcount */ 2128 error = netmap_get_na(nmr, &na, 1 /* create */); 2129 if (error) 2130 break; 2131 nmd = na->nm_mem; /* get memory allocator */ 2132 } 2133 2134 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, 2135 &nmr->nr_arg2); 2136 if (error) 2137 break; 2138 if (na == NULL) /* only memory info */ 2139 break; 2140 nmr->nr_offset = 0; 2141 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 2142 netmap_update_config(na); 2143 nmr->nr_rx_rings = na->num_rx_rings; 2144 nmr->nr_tx_rings = na->num_tx_rings; 2145 nmr->nr_rx_slots = na->num_rx_desc; 2146 nmr->nr_tx_slots = na->num_tx_desc; 2147 netmap_adapter_put(na); 2148 } while (0); 2149 NMG_UNLOCK(); 2150 break; 2151 2152 case NIOCREGIF: 2153 /* possibly attach/detach NIC and VALE switch */ 2154 i = nmr->nr_cmd; 2155 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 2156 || i == NETMAP_BDG_VNET_HDR 2157 || i == NETMAP_BDG_NEWIF 2158 || i == NETMAP_BDG_DELIF) { 2159 error = netmap_bdg_ctl(nmr, NULL); 2160 break; 2161 } else if (i != 0) { 2162 D("nr_cmd must be 0 not %d", i); 2163 error = EINVAL; 2164 break; 2165 } 2166 2167 /* protect access to priv from concurrent NIOCREGIF */ 2168 NMG_LOCK(); 2169 do { 2170 u_int memflags; 2171 2172 if (priv->np_nifp != NULL) { /* thread already registered */ 2173 error = EBUSY; 2174 break; 2175 } 2176 /* find the interface and a reference */ 2177 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 2178 if (error) 2179 break; 2180 if (NETMAP_OWNED_BY_KERN(na)) { 2181 netmap_adapter_put(na); 2182 error = EBUSY; 2183 break; 2184 } 2185 error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags); 2186 if (error) { /* reg. failed, release priv and ref */ 2187 netmap_adapter_put(na); 2188 break; 2189 } 2190 nifp = priv->np_nifp; 2191 priv->np_td = td; // XXX kqueue, debugging only 2192 2193 /* return the offset of the netmap_if object */ 2194 nmr->nr_rx_rings = na->num_rx_rings; 2195 nmr->nr_tx_rings = na->num_tx_rings; 2196 nmr->nr_rx_slots = na->num_rx_desc; 2197 nmr->nr_tx_slots = na->num_tx_desc; 2198 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, 2199 &nmr->nr_arg2); 2200 if (error) { 2201 netmap_do_unregif(priv); 2202 netmap_adapter_put(na); 2203 break; 2204 } 2205 if (memflags & NETMAP_MEM_PRIVATE) { 2206 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 2207 } 2208 for_rx_tx(t) { 2209 priv->np_si[t] = nm_si_user(priv, t) ? 
2210 &na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si; 2211 } 2212 2213 if (nmr->nr_arg3) { 2214 D("requested %d extra buffers", nmr->nr_arg3); 2215 nmr->nr_arg3 = netmap_extra_alloc(na, 2216 &nifp->ni_bufs_head, nmr->nr_arg3); 2217 D("got %d extra buffers", nmr->nr_arg3); 2218 } 2219 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 2220 } while (0); 2221 NMG_UNLOCK(); 2222 break; 2223 2224 case NIOCTXSYNC: 2225 case NIOCRXSYNC: 2226 nifp = priv->np_nifp; 2227 2228 if (nifp == NULL) { 2229 error = ENXIO; 2230 break; 2231 } 2232 mb(); /* make sure following reads are not from cache */ 2233 2234 na = priv->np_na; /* we have a reference */ 2235 2236 if (na == NULL) { 2237 D("Internal error: nifp != NULL && na == NULL"); 2238 error = ENXIO; 2239 break; 2240 } 2241 2242 if (!nm_netmap_on(na)) { 2243 error = ENXIO; 2244 break; 2245 } 2246 2247 t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX); 2248 krings = NMR(na, t); 2249 qfirst = priv->np_qfirst[t]; 2250 qlast = priv->np_qlast[t]; 2251 2252 for (i = qfirst; i < qlast; i++) { 2253 struct netmap_kring *kring = krings + i; 2254 if (nm_kr_tryget(kring)) { 2255 error = EBUSY; 2256 goto out; 2257 } 2258 if (cmd == NIOCTXSYNC) { 2259 if (netmap_verbose & NM_VERB_TXSYNC) 2260 D("pre txsync ring %d cur %d hwcur %d", 2261 i, kring->ring->cur, 2262 kring->nr_hwcur); 2263 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 2264 netmap_ring_reinit(kring); 2265 } else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) { 2266 nm_txsync_finalize(kring); 2267 } 2268 if (netmap_verbose & NM_VERB_TXSYNC) 2269 D("post txsync ring %d cur %d hwcur %d", 2270 i, kring->ring->cur, 2271 kring->nr_hwcur); 2272 } else { 2273 if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) { 2274 netmap_ring_reinit(kring); 2275 } else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) { 2276 nm_rxsync_finalize(kring); 2277 } 2278 microtime(&na->rx_rings[i].ring->ts); 2279 } 2280 nm_kr_put(kring); 2281 } 2282 2283 break; 2284 2285 #ifdef WITH_VALE 2286 case NIOCCONFIG: 2287 error = netmap_bdg_config(nmr); 2288 break; 2289 #endif 2290 #ifdef __FreeBSD__ 2291 case FIONBIO: 2292 case FIOASYNC: 2293 ND("FIONBIO/FIOASYNC are no-ops"); 2294 break; 2295 2296 case BIOCIMMEDIATE: 2297 case BIOCGHDRCMPLT: 2298 case BIOCSHDRCMPLT: 2299 case BIOCSSEESENT: 2300 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 2301 break; 2302 2303 default: /* allow device-specific ioctls */ 2304 { 2305 struct ifnet *ifp = ifunit_ref(nmr->nr_name); 2306 if (ifp == NULL) { 2307 error = ENXIO; 2308 } else { 2309 struct socket so; 2310 2311 bzero(&so, sizeof(so)); 2312 so.so_vnet = ifp->if_vnet; 2313 // so->so_proto not null. 2314 error = ifioctl(&so, cmd, data, td); 2315 if_rele(ifp); 2316 } 2317 break; 2318 } 2319 2320 #else /* linux */ 2321 default: 2322 error = EOPNOTSUPP; 2323 #endif /* linux */ 2324 } 2325 out: 2326 2327 CURVNET_RESTORE(); 2328 return (error); 2329 } 2330 2331 2332 /* 2333 * select(2) and poll(2) handlers for the "netmap" device. 2334 * 2335 * Can be called for one or more queues. 2336 * Return true the event mask corresponding to ready events. 2337 * If there are no ready events, do a selrecord on either individual 2338 * selinfo or on the global one. 2339 * Device-dependent parts (locking and sync of tx/rx rings) 2340 * are done through callbacks. 2341 * 2342 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 2343 * The first one is remapped to pwait as selrecord() uses the name as an 2344 * hidden argument. 
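 *
 * Purely as an illustrative sketch of the userspace side (netmap_fd and
 * timeout_ms are placeholder names, not part of this module), a process
 * typically blocks on a netmap file descriptor like this:
 *
 *	struct pollfd pfd = { .fd = netmap_fd, .events = POLLIN };
 *	poll(&pfd, 1, timeout_ms);
 *	// on wakeup, rx slots between ring->head and ring->tail are valid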
2345 */ 2346 int 2347 netmap_poll(struct cdev *dev, int events, struct thread *td) 2348 { 2349 struct netmap_priv_d *priv = NULL; 2350 struct netmap_adapter *na; 2351 struct netmap_kring *kring; 2352 u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0; 2353 #define want_tx want[NR_TX] 2354 #define want_rx want[NR_RX] 2355 struct mbq q; /* packets from hw queues to host stack */ 2356 void *pwait = dev; /* linux compatibility */ 2357 int is_kevent = 0; 2358 enum txrx t; 2359 2360 /* 2361 * In order to avoid nested locks, we need to "double check" 2362 * txsync and rxsync if we decide to do a selrecord(). 2363 * retry_tx (and retry_rx, later) prevent looping forever. 2364 */ 2365 int retry_tx = 1, retry_rx = 1; 2366 2367 (void)pwait; 2368 mbq_init(&q); 2369 2370 /* 2371 * XXX kevent has curthread->tp_fop == NULL, 2372 * so devfs_get_cdevpriv() fails. We circumvent this by passing 2373 * priv as the first argument, which is also useful to avoid 2374 * the selrecord() which are not necessary in that case. 2375 */ 2376 if (devfs_get_cdevpriv((void **)&priv) != 0) { 2377 is_kevent = 1; 2378 if (netmap_verbose) 2379 D("called from kevent"); 2380 priv = (struct netmap_priv_d *)dev; 2381 } 2382 if (priv == NULL) 2383 return POLLERR; 2384 2385 if (priv->np_nifp == NULL) { 2386 D("No if registered"); 2387 return POLLERR; 2388 } 2389 mb(); /* make sure following reads are not from cache */ 2390 2391 na = priv->np_na; 2392 2393 if (!nm_netmap_on(na)) 2394 return POLLERR; 2395 2396 if (netmap_verbose & 0x8000) 2397 D("device %s events 0x%x", na->name, events); 2398 want_tx = events & (POLLOUT | POLLWRNORM); 2399 want_rx = events & (POLLIN | POLLRDNORM); 2400 2401 2402 /* 2403 * check_all_{tx|rx} are set if the card has more than one queue AND 2404 * the file descriptor is bound to all of them. If so, we sleep on 2405 * the "global" selinfo, otherwise we sleep on individual selinfo 2406 * (FreeBSD only allows two selinfo's per file descriptor). 2407 * The interrupt routine in the driver wake one or the other 2408 * (or both) depending on which clients are active. 2409 * 2410 * rxsync() is only called if we run out of buffers on a POLLIN. 2411 * txsync() is called if we run out of buffers on POLLOUT, or 2412 * there are pending packets to send. The latter can be disabled 2413 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 2414 */ 2415 check_all_tx = nm_si_user(priv, NR_TX); 2416 check_all_rx = nm_si_user(priv, NR_RX); 2417 2418 /* 2419 * We start with a lock free round which is cheap if we have 2420 * slots available. If this fails, then lock and call the sync 2421 * routines. 2422 */ 2423 for_rx_tx(t) { 2424 for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { 2425 kring = &NMR(na, t)[i]; 2426 /* XXX compare ring->cur and kring->tail */ 2427 if (!nm_ring_empty(kring->ring)) { 2428 revents |= want[t]; 2429 want[t] = 0; /* also breaks the loop */ 2430 } 2431 } 2432 } 2433 2434 /* 2435 * If we want to push packets out (priv->np_txpoll) or 2436 * want_tx is still set, we must issue txsync calls 2437 * (on all rings, to avoid that the tx rings stall). 2438 * XXX should also check cur != hwcur on the tx rings. 2439 * Fortunately, normal tx mode has np_txpoll set. 2440 */ 2441 if (priv->np_txpoll || want_tx) { 2442 /* 2443 * The first round checks if anyone is ready, if not 2444 * do a selrecord and another round to handle races. 2445 * want_tx goes to 0 if any space is found, and is 2446 * used to skip rings with no pending transmissions. 
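 *
 * Schematically, as an informal sketch of the loop below (not
 * additional logic):
 *
 *	flush_tx:
 *		for each bound tx ring {
 *			txsync; if new slots appeared,
 *				set revents and clear want_tx;
 *		}
 *		if (want_tx && first pass) {
 *			selrecord(); goto flush_tx;	// close the race
 *		}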
2447 */
2448 flush_tx:
2449 for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
2450 int found = 0;
2451
2452 kring = &na->tx_rings[i];
2453 if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2454 continue;
2455 /* only one thread does txsync */
2456 if (nm_kr_tryget(kring)) {
2457 /* either busy or stopped
2458 * XXX if the ring is stopped, sleeping would
2459 * be better. In current code, however, we only
2460 * stop the rings for brief intervals (2014-03-14)
2461 */
2462 if (netmap_verbose)
2463 RD(2, "%p lost race on txring %d, ok",
2464 priv, i);
2465 continue;
2466 }
2467 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2468 netmap_ring_reinit(kring);
2469 revents |= POLLERR;
2470 } else {
2471 if (kring->nm_sync(kring, 0))
2472 revents |= POLLERR;
2473 else
2474 nm_txsync_finalize(kring);
2475 }
2476
2477 /*
2478 * If we found new slots, notify potential
2479 * listeners on the same ring.
2480 * Since we just did a txsync, look at the copies
2481 * of cur,tail in the kring.
2482 */
2483 found = kring->rcur != kring->rtail;
2484 nm_kr_put(kring);
2485 if (found) { /* notify other listeners */
2486 revents |= want_tx;
2487 want_tx = 0;
2488 kring->nm_notify(kring, 0);
2489 }
2490 }
2491 if (want_tx && retry_tx && !is_kevent) {
2492 OS_selrecord(td, check_all_tx ?
2493 &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2494 retry_tx = 0;
2495 goto flush_tx;
2496 }
2497 }
2498
2499 /*
2500 * If want_rx is still set, scan receive rings.
2501 * Do it on all rings because otherwise we starve.
2502 */
2503 if (want_rx) {
2504 int send_down = 0; /* transparent mode */
2505 /* two rounds here for race avoidance */
2506 do_retry_rx:
2507 for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
2508 int found = 0;
2509
2510 kring = &na->rx_rings[i];
2511
2512 if (nm_kr_tryget(kring)) {
2513 if (netmap_verbose)
2514 RD(2, "%p lost race on rxring %d, ok",
2515 priv, i);
2516 continue;
2517 }
2518
2519 if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
2520 netmap_ring_reinit(kring);
2521 revents |= POLLERR;
2522 }
2523 /* now we can use kring->rcur, rtail */
2524
2525 /*
2526 * transparent mode support: collect packets
2527 * from the rxring(s).
2528 * XXX NR_FORWARD should only be read on
2529 * physical or NIC ports
2530 */
2531 if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2532 ND(10, "forwarding some buffers up %d to %d",
2533 kring->nr_hwcur, kring->ring->cur);
2534 netmap_grab_packets(kring, &q, netmap_fwd);
2535 }
2536
2537 if (kring->nm_sync(kring, 0))
2538 revents |= POLLERR;
2539 else
2540 nm_rxsync_finalize(kring);
2541 if (netmap_no_timestamp == 0 ||
2542 kring->ring->flags & NR_TIMESTAMP) {
2543 microtime(&kring->ring->ts);
2544 }
2545 found = kring->rcur != kring->rtail;
2546 nm_kr_put(kring);
2547 if (found) {
2548 revents |= want_rx;
2549 retry_rx = 0;
2550 kring->nm_notify(kring, 0);
2551 }
2552 }
2553
2554 /* transparent mode XXX only during first pass ? */
2555 if (na->na_flags & NAF_HOST_RINGS) {
2556 kring = &na->rx_rings[na->num_rx_rings];
2557 if (check_all_rx
2558 && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2559 /* XXX fix to use kring fields */
2560 if (nm_ring_empty(kring->ring))
2561 send_down = netmap_rxsync_from_host(na, td, dev);
2562 if (!nm_ring_empty(kring->ring))
2563 revents |= want_rx;
2564 }
2565 }
2566
2567 if (retry_rx && !is_kevent)
2568 OS_selrecord(td, check_all_rx ?
2569 &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si); 2570 if (send_down > 0 || retry_rx) { 2571 retry_rx = 0; 2572 if (send_down) 2573 goto flush_tx; /* and retry_rx */ 2574 else 2575 goto do_retry_rx; 2576 } 2577 } 2578 2579 /* 2580 * Transparent mode: marked bufs on rx rings between 2581 * kring->nr_hwcur and ring->head 2582 * are passed to the other endpoint. 2583 * 2584 * In this mode we also scan the sw rxring, which in 2585 * turn passes packets up. 2586 * 2587 * XXX Transparent mode at the moment requires to bind all 2588 * rings to a single file descriptor. 2589 */ 2590 2591 if (q.head && na->ifp != NULL) 2592 netmap_send_up(na->ifp, &q); 2593 2594 return (revents); 2595 #undef want_tx 2596 #undef want_rx 2597 } 2598 2599 2600 /*-------------------- driver support routines -------------------*/ 2601 2602 static int netmap_hw_krings_create(struct netmap_adapter *); 2603 2604 /* default notify callback */ 2605 static int 2606 netmap_notify(struct netmap_kring *kring, int flags) 2607 { 2608 struct netmap_adapter *na = kring->na; 2609 enum txrx t = kring->tx; 2610 2611 OS_selwakeup(&kring->si, PI_NET); 2612 /* optimization: avoid a wake up on the global 2613 * queue if nobody has registered for more 2614 * than one ring 2615 */ 2616 if (na->si_users[t] > 0) 2617 OS_selwakeup(&na->si[t], PI_NET); 2618 2619 return 0; 2620 } 2621 2622 2623 /* called by all routines that create netmap_adapters. 2624 * Attach na to the ifp (if any) and provide defaults 2625 * for optional callbacks. Defaults assume that we 2626 * are creating an hardware netmap_adapter. 2627 */ 2628 int 2629 netmap_attach_common(struct netmap_adapter *na) 2630 { 2631 struct ifnet *ifp = na->ifp; 2632 2633 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { 2634 D("%s: invalid rings tx %d rx %d", 2635 na->name, na->num_tx_rings, na->num_rx_rings); 2636 return EINVAL; 2637 } 2638 /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports, 2639 * pipes, monitors). For bwrap we actually have a non-null ifp for 2640 * use by the external modules, but that is set after this 2641 * function has been called. 2642 * XXX this is ugly, maybe split this function in two (2014-03-14) 2643 */ 2644 if (ifp != NULL) { 2645 WNA(ifp) = na; 2646 2647 /* the following is only needed for na that use the host port. 2648 * XXX do we have something similar for linux ? 2649 */ 2650 #ifdef __FreeBSD__ 2651 na->if_input = ifp->if_input; /* for netmap_send_up */ 2652 #endif /* __FreeBSD__ */ 2653 2654 NETMAP_SET_CAPABLE(ifp); 2655 } 2656 if (na->nm_krings_create == NULL) { 2657 /* we assume that we have been called by a driver, 2658 * since other port types all provide their own 2659 * nm_krings_create 2660 */ 2661 na->nm_krings_create = netmap_hw_krings_create; 2662 na->nm_krings_delete = netmap_hw_krings_delete; 2663 } 2664 if (na->nm_notify == NULL) 2665 na->nm_notify = netmap_notify; 2666 na->active_fds = 0; 2667 2668 if (na->nm_mem == NULL) 2669 /* use the global allocator */ 2670 na->nm_mem = &nm_mem; 2671 netmap_mem_get(na->nm_mem); 2672 #ifdef WITH_VALE 2673 if (na->nm_bdg_attach == NULL) 2674 /* no special nm_bdg_attach callback. On VALE 2675 * attach, we need to interpose a bwrap 2676 */ 2677 na->nm_bdg_attach = netmap_bwrap_attach; 2678 #endif 2679 return 0; 2680 } 2681 2682 2683 /* standard cleanup, called by all destructors */ 2684 void 2685 netmap_detach_common(struct netmap_adapter *na) 2686 { 2687 if (na->ifp != NULL) 2688 WNA(na->ifp) = NULL; /* XXX do we need this? 
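 * (for reference: this clears the adapter pointer stored in the ifp,
 * so NA(ifp) no longer returns the adapter that is being destroyed)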
*/ 2689
2690 if (na->tx_rings) { /* XXX should not happen */
2691 D("freeing leftover tx_rings");
2692 na->nm_krings_delete(na);
2693 }
2694 netmap_pipe_dealloc(na);
2695 if (na->nm_mem)
2696 netmap_mem_put(na->nm_mem);
2697 bzero(na, sizeof(*na));
2698 free(na, M_DEVBUF);
2699 }
2700
2701 /* Wrapper for the register callback provided by hardware drivers.
2702 * na->ifp == NULL means the driver module has been
2703 * unloaded, so we cannot call into it.
2704 * Note that module unloading, in our patched linux drivers,
2705 * happens under NMG_LOCK and after having stopped all the
2706 * nic rings (see netmap_detach). This provides sufficient
2707 * protection for the other driver-provided callbacks
2708 * (i.e., nm_config and nm_*xsync), which therefore don't need
2709 * to be wrapped.
2710 */
2711 static int
2712 netmap_hw_register(struct netmap_adapter *na, int onoff)
2713 {
2714 struct netmap_hw_adapter *hwna =
2715 (struct netmap_hw_adapter*)na;
2716
2717 if (na->ifp == NULL)
2718 return onoff ? ENXIO : 0;
2719
2720 return hwna->nm_hw_register(na, onoff);
2721 }
2722
2723
2724 /*
2725 * Initialize a ``netmap_adapter`` object created by a driver on attach.
2726 * We allocate a block of memory with room for a struct netmap_adapter
2727 * plus two sets of N+2 struct netmap_kring (where N is the number
2728 * of hardware rings):
2729 * krings 0..N-1 are for the hardware queues.
2730 * kring N is for the host stack queue
2731 * kring N+1 is only used for the selinfo for all queues. // XXX still true ?
2732 * Return 0 on success, ENOMEM otherwise.
2733 */
2734 int
2735 netmap_attach(struct netmap_adapter *arg)
2736 {
2737 struct netmap_hw_adapter *hwna = NULL;
2738 // XXX when is arg == NULL ?
2739 struct ifnet *ifp = arg ? arg->ifp : NULL;
2740
2741 if (arg == NULL || ifp == NULL)
2742 goto fail;
2743 hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2744 if (hwna == NULL)
2745 goto fail;
2746 hwna->up = *arg;
2747 hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
2748 strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2749 hwna->nm_hw_register = hwna->up.nm_register;
2750 hwna->up.nm_register = netmap_hw_register;
2751 if (netmap_attach_common(&hwna->up)) {
2752 free(hwna, M_DEVBUF);
2753 goto fail;
2754 }
2755 netmap_adapter_get(&hwna->up);
2756
2757 #ifdef linux
2758 if (ifp->netdev_ops) {
2759 /* prepare a clone of the netdev ops */
2760 #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS
2761 hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2762 #else
2763 hwna->nm_ndo = *ifp->netdev_ops;
2764 #endif
2765 }
2766 hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2767 if (ifp->ethtool_ops) {
2768 hwna->nm_eto = *ifp->ethtool_ops;
2769 }
2770 hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2771 #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
2772 hwna->nm_eto.set_channels = linux_netmap_set_channels;
2773 #endif
2774 if (arg->nm_config == NULL) {
2775 hwna->up.nm_config = netmap_linux_config;
2776 }
2777 #endif /* linux */
2778
2779 if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
2780 hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2781 hwna->up.num_rx_rings, hwna->up.num_rx_desc);
2782 return 0;
2783
2784 fail:
2785 D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2786 if (ifp)
2787 netmap_detach(ifp);
2788 return (hwna ?
EINVAL : ENOMEM); 2789 } 2790 2791 2792 void 2793 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) 2794 { 2795 if (!na) { 2796 return; 2797 } 2798 2799 refcount_acquire(&na->na_refcount); 2800 } 2801 2802 2803 /* returns 1 iff the netmap_adapter is destroyed */ 2804 int 2805 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) 2806 { 2807 if (!na) 2808 return 1; 2809 2810 if (!refcount_release(&na->na_refcount)) 2811 return 0; 2812 2813 if (na->nm_dtor) 2814 na->nm_dtor(na); 2815 2816 netmap_detach_common(na); 2817 2818 return 1; 2819 } 2820 2821 /* nm_krings_create callback for all hardware native adapters */ 2822 int 2823 netmap_hw_krings_create(struct netmap_adapter *na) 2824 { 2825 int ret = netmap_krings_create(na, 0); 2826 if (ret == 0) { 2827 /* initialize the mbq for the sw rx ring */ 2828 mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); 2829 ND("initialized sw rx queue %d", na->num_rx_rings); 2830 } 2831 return ret; 2832 } 2833 2834 2835 2836 /* 2837 * Called on module unload by the netmap-enabled drivers 2838 */ 2839 void 2840 netmap_detach(struct ifnet *ifp) 2841 { 2842 struct netmap_adapter *na = NA(ifp); 2843 2844 if (!na) 2845 return; 2846 2847 NMG_LOCK(); 2848 netmap_disable_all_rings(ifp); 2849 na->ifp = NULL; 2850 na->na_flags &= ~NAF_NETMAP_ON; 2851 /* 2852 * if the netmap adapter is not native, somebody 2853 * changed it, so we can not release it here. 2854 * The NULL na->ifp will notify the new owner that 2855 * the driver is gone. 2856 */ 2857 if (na->na_flags & NAF_NATIVE) { 2858 netmap_adapter_put(na); 2859 } 2860 /* give them a chance to notice */ 2861 netmap_enable_all_rings(ifp); 2862 NMG_UNLOCK(); 2863 } 2864 2865 2866 /* 2867 * Intercept packets from the network stack and pass them 2868 * to netmap as incoming packets on the 'software' ring. 2869 * 2870 * We only store packets in a bounded mbq and then copy them 2871 * in the relevant rxsync routine. 2872 * 2873 * We rely on the OS to make sure that the ifp and na do not go 2874 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 2875 * In nm_register() or whenever there is a reinitialization, 2876 * we make sure to make the mode change visible here. 2877 */ 2878 int 2879 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 2880 { 2881 struct netmap_adapter *na = NA(ifp); 2882 struct netmap_kring *kring; 2883 u_int len = MBUF_LEN(m); 2884 u_int error = ENOBUFS; 2885 struct mbq *q; 2886 int space; 2887 2888 kring = &na->rx_rings[na->num_rx_rings]; 2889 // XXX [Linux] we do not need this lock 2890 // if we follow the down/configure/up protocol -gl 2891 // mtx_lock(&na->core_lock); 2892 2893 if (!nm_netmap_on(na)) { 2894 D("%s not in netmap mode anymore", na->name); 2895 error = ENXIO; 2896 goto done; 2897 } 2898 2899 q = &kring->rx_queue; 2900 2901 // XXX reconsider long packets if we handle fragments 2902 if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */ 2903 D("%s from_host, drop packet size %d > %d", na->name, 2904 len, NETMAP_BUF_SIZE(na)); 2905 goto done; 2906 } 2907 2908 /* protect against rxsync_from_host(), netmap_sw_to_nic() 2909 * and maybe other instances of netmap_transmit (the latter 2910 * not possible on Linux). 2911 * Also avoid overflowing the queue. 
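 *
 * A worked example of the wrap-around computation below (numbers are
 * illustrative only): with nkr_num_slots = 1024, nr_hwcur = 1000 and
 * nr_hwtail = 10, space = 10 - 1000 + 1024 = 34 slots are already held
 * by the ring, so up to 1024 - 1 - 34 = 989 mbufs can still be queued
 * before the check below starts dropping.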
2912 */ 2913 mbq_lock(q); 2914 2915 space = kring->nr_hwtail - kring->nr_hwcur; 2916 if (space < 0) 2917 space += kring->nkr_num_slots; 2918 if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX 2919 RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", 2920 na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), 2921 len, m); 2922 } else { 2923 mbq_enqueue(q, m); 2924 ND(10, "%s %d bufs in queue len %d m %p", 2925 na->name, mbq_len(q), len, m); 2926 /* notify outside the lock */ 2927 m = NULL; 2928 error = 0; 2929 } 2930 mbq_unlock(q); 2931 2932 done: 2933 if (m) 2934 m_freem(m); 2935 /* unconditionally wake up listeners */ 2936 kring->nm_notify(kring, 0); 2937 /* this is normally netmap_notify(), but for nics 2938 * connected to a bridge it is netmap_bwrap_intr_notify(), 2939 * that possibly forwards the frames through the switch 2940 */ 2941 2942 return (error); 2943 } 2944 2945 2946 /* 2947 * netmap_reset() is called by the driver routines when reinitializing 2948 * a ring. The driver is in charge of locking to protect the kring. 2949 * If native netmap mode is not set just return NULL. 2950 */ 2951 struct netmap_slot * 2952 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 2953 u_int new_cur) 2954 { 2955 struct netmap_kring *kring; 2956 int new_hwofs, lim; 2957 2958 if (!nm_native_on(na)) { 2959 ND("interface not in native netmap mode"); 2960 return NULL; /* nothing to reinitialize */ 2961 } 2962 2963 /* XXX note- in the new scheme, we are not guaranteed to be 2964 * under lock (e.g. when called on a device reset). 2965 * In this case, we should set a flag and do not trust too 2966 * much the values. In practice: TODO 2967 * - set a RESET flag somewhere in the kring 2968 * - do the processing in a conservative way 2969 * - let the *sync() fixup at the end. 2970 */ 2971 if (tx == NR_TX) { 2972 if (n >= na->num_tx_rings) 2973 return NULL; 2974 kring = na->tx_rings + n; 2975 // XXX check whether we should use hwcur or rcur 2976 new_hwofs = kring->nr_hwcur - new_cur; 2977 } else { 2978 if (n >= na->num_rx_rings) 2979 return NULL; 2980 kring = na->rx_rings + n; 2981 new_hwofs = kring->nr_hwtail - new_cur; 2982 } 2983 lim = kring->nkr_num_slots - 1; 2984 if (new_hwofs > lim) 2985 new_hwofs -= lim + 1; 2986 2987 /* Always set the new offset value and realign the ring. */ 2988 if (netmap_verbose) 2989 D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", 2990 na->name, 2991 tx == NR_TX ? "TX" : "RX", n, 2992 kring->nkr_hwofs, new_hwofs, 2993 kring->nr_hwtail, 2994 tx == NR_TX ? lim : kring->nr_hwtail); 2995 kring->nkr_hwofs = new_hwofs; 2996 if (tx == NR_TX) { 2997 kring->nr_hwtail = kring->nr_hwcur + lim; 2998 if (kring->nr_hwtail > lim) 2999 kring->nr_hwtail -= lim + 1; 3000 } 3001 3002 #if 0 // def linux 3003 /* XXX check that the mappings are correct */ 3004 /* need ring_nr, adapter->pdev, direction */ 3005 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 3006 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 3007 D("error mapping rx netmap buffer %d", i); 3008 // XXX fix error handling 3009 } 3010 3011 #endif /* linux */ 3012 /* 3013 * Wakeup on the individual and global selwait 3014 * We do the wakeup here, but the ring is not yet reconfigured. 3015 * However, we are under lock so there are no races. 3016 */ 3017 kring->nm_notify(kring, 0); 3018 return kring->ring->slot; 3019 } 3020 3021 3022 /* 3023 * Dispatch rx/tx interrupts to the netmap rings. 
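 *
 * For illustration, a netmap-aware driver interrupt handler usually
 * hands off to netmap along these lines (a sketch only; the exact hook
 * and the queue index variable, here called ring_nr, vary per driver):
 *
 *	if (netmap_rx_irq(ifp, ring_nr, &work_done))
 *		return;	// rings in netmap mode, listeners already notified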
3024 * 3025 * "work_done" is non-null on the RX path, NULL for the TX path. 3026 * We rely on the OS to make sure that there is only one active 3027 * instance per queue, and that there is appropriate locking. 3028 * 3029 * The 'notify' routine depends on what the ring is attached to. 3030 * - for a netmap file descriptor, do a selwakeup on the individual 3031 * waitqueue, plus one on the global one if needed 3032 * (see netmap_notify) 3033 * - for a nic connected to a switch, call the proper forwarding routine 3034 * (see netmap_bwrap_intr_notify) 3035 */ 3036 void 3037 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3038 { 3039 struct netmap_adapter *na = NA(ifp); 3040 struct netmap_kring *kring; 3041 enum txrx t = (work_done ? NR_RX : NR_TX); 3042 3043 q &= NETMAP_RING_MASK; 3044 3045 if (netmap_verbose) { 3046 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 3047 } 3048 3049 if (q >= nma_get_nrings(na, t)) 3050 return; // not a physical queue 3051 3052 kring = NMR(na, t) + q; 3053 3054 if (t == NR_RX) { 3055 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 3056 *work_done = 1; /* do not fire napi again */ 3057 } 3058 kring->nm_notify(kring, 0); 3059 } 3060 3061 3062 /* 3063 * Default functions to handle rx/tx interrupts from a physical device. 3064 * "work_done" is non-null on the RX path, NULL for the TX path. 3065 * 3066 * If the card is not in netmap mode, simply return 0, 3067 * so that the caller proceeds with regular processing. 3068 * Otherwise call netmap_common_irq() and return 1. 3069 * 3070 * If the card is connected to a netmap file descriptor, 3071 * do a selwakeup on the individual queue, plus one on the global one 3072 * if needed (multiqueue card _and_ there are multiqueue listeners), 3073 * and return 1. 3074 * 3075 * Finally, if called on rx from an interface connected to a switch, 3076 * calls the proper forwarding routine, and return 1. 3077 */ 3078 int 3079 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3080 { 3081 struct netmap_adapter *na = NA(ifp); 3082 3083 /* 3084 * XXX emulated netmap mode sets NAF_SKIP_INTR so 3085 * we still use the regular driver even though the previous 3086 * check fails. It is unclear whether we should use 3087 * nm_native_on() here. 3088 */ 3089 if (!nm_netmap_on(na)) 3090 return 0; 3091 3092 if (na->na_flags & NAF_SKIP_INTR) { 3093 ND("use regular interrupt"); 3094 return 0; 3095 } 3096 3097 netmap_common_irq(ifp, q, work_done); 3098 return 1; 3099 } 3100 3101 3102 /* 3103 * Module loader and unloader 3104 * 3105 * netmap_init() creates the /dev/netmap device and initializes 3106 * all global variables. Returns 0 on success, errno on failure 3107 * (but there is no chance) 3108 * 3109 * netmap_fini() destroys everything. 3110 */ 3111 3112 static struct cdev *netmap_dev; /* /dev/netmap character device. */ 3113 extern struct cdevsw netmap_cdevsw; 3114 3115 3116 void 3117 netmap_fini(void) 3118 { 3119 netmap_uninit_bridges(); 3120 if (netmap_dev) 3121 destroy_dev(netmap_dev); 3122 netmap_mem_fini(); 3123 NMG_LOCK_DESTROY(); 3124 printf("netmap: unloaded module.\n"); 3125 } 3126 3127 3128 int 3129 netmap_init(void) 3130 { 3131 int error; 3132 3133 NMG_LOCK_INIT(); 3134 3135 error = netmap_mem_init(); 3136 if (error != 0) 3137 goto fail; 3138 /* 3139 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls 3140 * when the module is compiled in. 
3141 * XXX could use make_dev_credv() to get error number 3142 */ 3143 netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD, 3144 &netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, 3145 "netmap"); 3146 if (!netmap_dev) 3147 goto fail; 3148 3149 error = netmap_init_bridges(); 3150 if (error) 3151 goto fail; 3152 3153 #ifdef __FreeBSD__ 3154 nm_vi_init_index(); 3155 #endif 3156 3157 printf("netmap: loaded module\n"); 3158 return (0); 3159 fail: 3160 netmap_fini(); 3161 return (EINVAL); /* may be incorrect */ 3162 } 3163
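
/*
 * Illustrative sketch only: the foo_* names and softc fields below are
 * placeholders, not taken from any real driver. A native driver
 * typically fills a netmap_adapter on the stack and registers it at
 * the end of its attach routine, then unregisters on detach:
 *
 *	static void
 *	foo_netmap_attach(struct foo_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.nm_txsync = foo_netmap_txsync;	// driver callbacks
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		na.nm_register = foo_netmap_reg;
 *		netmap_attach(&na);
 *	}
 *
 * and, from the driver detach path, simply:
 *
 *	netmap_detach(sc->ifp);
 */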