/*
 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#ifdef __FreeBSD__
#define TEST_STUFF	// test code, does not compile yet on linux
#endif /* __FreeBSD__ */

/*
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *    (A minimal userspace sketch of this sequence follows this comment.)
 *

SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.

LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(), this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */
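/*
 * Illustrative only: a minimal userspace sketch of the sequence described
 * above, using the netmap(4) API macros from net/netmap_user.h. Error
 * handling is omitted and "em0" is just an example interface name; this
 * snippet is not part of the kernel module itself.
 *
 *	struct nmreq req;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *txring;
 *	char *mem;
 *	int fd = open("/dev/netmap", O_RDWR);			// 1. open
 *
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// 2. bind to em0
 *	mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);					// 3. map shared region
 *	nifp = NETMAP_IF(mem, req.nr_offset);			// 4. locate the rings
 *	txring = NETMAP_TXRING(nifp, 0);
 *	// ... fill slots, advance txring->cur, decrease txring->avail ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// 5. sync with the kernel
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);					// 6. wait for free slots
 */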
/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h>		/* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/module.h>
#include <sys/errno.h>
#include <sys/param.h>		/* defines used in kernel.h */
#include <sys/jail.h>
#include <sys/kernel.h>		/* types used in module initialization */
#include <sys/conf.h>		/* cdevsw struct */
#include <sys/uio.h>		/* uio struct */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/mman.h>		/* PROT_EXEC */
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <vm/vm.h>		/* vtophys */
#include <vm/pmap.h>		/* vtophys */
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/uma.h>
#include <sys/socket.h>		/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <net/vnet.h>
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>

#define prefetch(x)	__builtin_prefetch(x)

#define BDG_RWLOCK_T		struct rwlock	// struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)

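/*
 * Illustrative only: the intended usage of the BDG_* macros above, matching
 * the VALE SWITCH locking rules in the header comment. Port attach/detach
 * (control path, under NMG_LOCK()) takes the lock exclusively, while the
 * forwarding path only needs it in shared mode:
 *
 *	BDG_WLOCK(b);				// control path
 *	b->bdg_ports[port] = na;		// or NULL on detach
 *	BDG_WUNLOCK(b);
 *
 *	BDG_RLOCK(b);				// data path (forwarding)
 *	... copy packets to the destination rings ...
 *	BDG_RUNLOCK(b);
 */
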
/* netmap global lock.
 * normally called within the user thread (upon a system call)
 * or when a file descriptor or process is terminated
 * (last close or last munmap)
 */

#define NMG_LOCK_T		struct mtx
#define NMG_LOCK_INIT()		mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF)
#define NMG_LOCK_DESTROY()	mtx_destroy(&netmap_global_lock)
#define NMG_LOCK()		mtx_lock(&netmap_global_lock)
#define NMG_UNLOCK()		mtx_unlock(&netmap_global_lock)
#define NMG_LOCK_ASSERT()	mtx_assert(&netmap_global_lock, MA_OWNED)


/* atomic operations */
#include <machine/atomic.h>
#define NM_ATOMIC_TEST_AND_SET(p)	(!atomic_cmpset_acq_int((p), 0, 1))
#define NM_ATOMIC_CLEAR(p)		atomic_store_rel_int((p), 0)


#elif defined(linux)

#include "bsd_glue.h"

static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *);

static struct device_driver*
linux_netmap_find_driver(struct device *dev)
{
	struct device_driver *dd;

	while ( (dd = dev->driver) == NULL ) {
		if ( (dev = dev->parent) == NULL )
			return NULL;
	}
	return dd;
}

static struct net_device*
ifunit_ref(const char *name)
{
	struct net_device *ifp = dev_get_by_name(&init_net, name);
	struct device_driver *dd;

	if (ifp == NULL)
		return NULL;

	if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL )
		goto error;

	if (!try_module_get(dd->owner))
		goto error;

	return ifp;
error:
	dev_put(ifp);
	return NULL;
}

static void
if_rele(struct net_device *ifp)
{
	struct device_driver *dd;
	dd = linux_netmap_find_driver(&ifp->dev);
	dev_put(ifp);
	if (dd)
		module_put(dd->owner);
}

// XXX a mtx would suffice here too	20130404 gl
#define NMG_LOCK_T		struct semaphore
#define NMG_LOCK_INIT()		sema_init(&netmap_global_lock, 1)
#define NMG_LOCK_DESTROY()
#define NMG_LOCK()		down(&netmap_global_lock)
#define NMG_UNLOCK()		up(&netmap_global_lock)
#define NMG_LOCK_ASSERT()	// XXX to be completed


#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

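/*
 * Illustrative only: NMG_LOCK()/NMG_UNLOCK() (defined per platform above)
 * bracket control-path operations; the typical pattern in this file is a
 * plain critical section around a *_locked() helper, e.g.:
 *
 *	NMG_LOCK();
 *	error = netmap_get_memory_locked(priv);
 *	NMG_UNLOCK();
 */
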
/*
 * common headers
 */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>


MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");

/*
 * The following variables are used by the drivers and replicate
 * fields in the global memory pool. They only refer to buffers
 * used by physical interfaces.
 */
u_int netmap_total_buffers;
u_int netmap_buf_size;
char *netmap_buffer_base;	/* also address of an invalid buffer */

/* user-controlled variables */
int netmap_verbose;

static int netmap_no_timestamp; /* don't timestamp on rxsync */

SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
int netmap_mitigate = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
int netmap_no_pendintr = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
    CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
int netmap_txsync_retry = 2;
SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
    &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");

int netmap_drop = 0;	/* debugging */
int netmap_flags = 0;	/* debug flags */
int netmap_fwd = 0;	/* force transparent mode */
int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */

SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");

NMG_LOCK_T	netmap_global_lock;

/*
 * protect against multiple threads using the same ring.
 * also check that the ring has not been stopped.
 */
#define NM_KR_BUSY	1
#define NM_KR_STOPPED	2
static void nm_kr_put(struct netmap_kring *kr);
static __inline int nm_kr_tryget(struct netmap_kring *kr)
{
	/* check a first time without taking the lock
	 * to avoid starvation for nm_kr_get()
	 */
	if (unlikely(kr->nkr_stopped)) {
		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
		return NM_KR_STOPPED;
	}
	if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)))
		return NM_KR_BUSY;
	/* check a second time with lock held */
	if (unlikely(kr->nkr_stopped)) {
		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
		nm_kr_put(kr);
		return NM_KR_STOPPED;
	}
	return 0;
}

static __inline void nm_kr_put(struct netmap_kring *kr)
{
	NM_ATOMIC_CLEAR(&kr->nr_busy);
}

static void nm_kr_get(struct netmap_kring *kr)
{
	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
		tsleep(kr, 0, "NM_KR_GET", 4);
}

static void nm_disable_ring(struct netmap_kring *kr)
{
	kr->nkr_stopped = 1;
	nm_kr_get(kr);
	mtx_lock(&kr->q_lock);
	mtx_unlock(&kr->q_lock);
	nm_kr_put(kr);
}

void netmap_disable_all_rings(struct ifnet *ifp)
{
	struct netmap_adapter *na;
	int i;

	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return;

	na = NA(ifp);

	for (i = 0; i < na->num_tx_rings + 1; i++) {
		nm_disable_ring(na->tx_rings + i);
		selwakeuppri(&na->tx_rings[i].si, PI_NET);
	}
	for (i = 0; i < na->num_rx_rings + 1; i++) {
		nm_disable_ring(na->rx_rings + i);
		selwakeuppri(&na->rx_rings[i].si, PI_NET);
	}
	selwakeuppri(&na->tx_si, PI_NET);
	selwakeuppri(&na->rx_si, PI_NET);
}

void netmap_enable_all_rings(struct ifnet *ifp)
{
	struct netmap_adapter *na;
	int i;

	if (!(ifp->if_capenable &
IFCAP_NETMAP)) 400 return; 401 402 na = NA(ifp); 403 for (i = 0; i < na->num_tx_rings + 1; i++) { 404 D("enabling %p", na->tx_rings + i); 405 na->tx_rings[i].nkr_stopped = 0; 406 } 407 for (i = 0; i < na->num_rx_rings + 1; i++) { 408 D("enabling %p", na->rx_rings + i); 409 na->rx_rings[i].nkr_stopped = 0; 410 } 411 } 412 413 414 /* 415 * generic bound_checking function 416 */ 417 u_int 418 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 419 { 420 u_int oldv = *v; 421 const char *op = NULL; 422 423 if (dflt < lo) 424 dflt = lo; 425 if (dflt > hi) 426 dflt = hi; 427 if (oldv < lo) { 428 *v = dflt; 429 op = "Bump"; 430 } else if (oldv > hi) { 431 *v = hi; 432 op = "Clamp"; 433 } 434 if (op && msg) 435 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 436 return *v; 437 } 438 439 /* 440 * packet-dump function, user-supplied or static buffer. 441 * The destination buffer must be at least 30+4*len 442 */ 443 const char * 444 nm_dump_buf(char *p, int len, int lim, char *dst) 445 { 446 static char _dst[8192]; 447 int i, j, i0; 448 static char hex[] ="0123456789abcdef"; 449 char *o; /* output position */ 450 451 #define P_HI(x) hex[((x) & 0xf0)>>4] 452 #define P_LO(x) hex[((x) & 0xf)] 453 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') 454 if (!dst) 455 dst = _dst; 456 if (lim <= 0 || lim > len) 457 lim = len; 458 o = dst; 459 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 460 o += strlen(o); 461 /* hexdump routine */ 462 for (i = 0; i < lim; ) { 463 sprintf(o, "%5d: ", i); 464 o += strlen(o); 465 memset(o, ' ', 48); 466 i0 = i; 467 for (j=0; j < 16 && i < lim; i++, j++) { 468 o[j*3] = P_HI(p[i]); 469 o[j*3+1] = P_LO(p[i]); 470 } 471 i = i0; 472 for (j=0; j < 16 && i < lim; i++, j++) 473 o[j + 48] = P_C(p[i]); 474 o[j+48] = '\n'; 475 o += j+49; 476 } 477 *o = '\0'; 478 #undef P_HI 479 #undef P_LO 480 #undef P_C 481 return dst; 482 } 483 484 /* 485 * system parameters (most of them in netmap_kern.h) 486 * NM_NAME prefix for switch port names, default "vale" 487 * NM_BDG_MAXPORTS number of ports 488 * NM_BRIDGES max number of switches in the system. 489 * XXX should become a sysctl or tunable 490 * 491 * Switch ports are named valeX:Y where X is the switch name and Y 492 * is the port. If Y matches a physical interface name, the port is 493 * connected to a physical device. 494 * 495 * Unlike physical interfaces, switch ports use their own memory region 496 * for rings and buffers. 497 * The virtual interfaces use per-queue lock instead of core lock. 498 * In the tx loop, we aggregate traffic in batches to make all operations 499 * faster. The batch size is bridge_batch. 500 */ 501 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ 502 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 503 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 504 #define NM_BDG_HASH 1024 /* forwarding table entries */ 505 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 506 #define NM_MULTISEG 64 /* max size of a chain of bufs */ 507 /* actual size of the tables */ 508 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) 509 /* NM_FT_NULL terminates a list of slots in the ft */ 510 #define NM_FT_NULL NM_BDG_BATCH_MAX 511 #define NM_BRIDGES 8 /* number of bridges */ 512 513 514 /* 515 * bridge_batch is set via sysctl to the max batch size to be 516 * used in the bridge. The actual value may be larger as the 517 * last packet in the block may overflow the size. 
518 */ 519 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 520 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); 521 522 523 /* 524 * These are used to handle reference counters for bridge ports. 525 */ 526 #define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) 527 #define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) 528 529 /* The bridge references the buffers using the device specific look up table */ 530 static inline void * 531 BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot) 532 { 533 struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut; 534 uint32_t i = slot->buf_idx; 535 return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? lut[0].vaddr : lut[i].vaddr; 536 } 537 538 static int bdg_netmap_attach(struct netmap_adapter *); 539 static int bdg_netmap_reg(struct ifnet *ifp, int onoff); 540 int kern_netmap_regif(struct nmreq *nmr); 541 542 /* 543 * Each transmit queue accumulates a batch of packets into 544 * a structure before forwarding. Packets to the same 545 * destination are put in a list using ft_next as a link field. 546 * ft_frags and ft_next are valid only on the first fragment. 547 */ 548 struct nm_bdg_fwd { /* forwarding entry for a bridge */ 549 void *ft_buf; /* netmap or indirect buffer */ 550 uint8_t ft_frags; /* how many fragments (only on 1st frag) */ 551 uint8_t _ft_port; /* dst port (unused) */ 552 uint16_t ft_flags; /* flags, e.g. indirect */ 553 uint16_t ft_len; /* src fragment len */ 554 uint16_t ft_next; /* next packet to same destination */ 555 }; 556 557 /* 558 * For each output interface, nm_bdg_q is used to construct a list. 559 * bq_len is the number of output buffers (we can have coalescing 560 * during the copy). 561 */ 562 struct nm_bdg_q { 563 uint16_t bq_head; 564 uint16_t bq_tail; 565 uint32_t bq_len; /* number of buffers */ 566 }; 567 568 /* XXX revise this */ 569 struct nm_hash_ent { 570 uint64_t mac; /* the top 2 bytes are the epoch */ 571 uint64_t ports; 572 }; 573 574 /* 575 * nm_bridge is a descriptor for a VALE switch. 576 * Interfaces for a bridge are all in bdg_ports[]. 577 * The array has fixed size, an empty entry does not terminate 578 * the search, but lookups only occur on attach/detach so we 579 * don't mind if they are slow. 580 * 581 * The bridge is non blocking on the transmit ports: excess 582 * packets are dropped if there is no room on the output port. 583 * 584 * bdg_lock protects accesses to the bdg_ports array. 585 * This is a rw lock (or equivalent). 586 */ 587 struct nm_bridge { 588 /* XXX what is the proper alignment/layout ? */ 589 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ 590 int bdg_namelen; 591 uint32_t bdg_active_ports; /* 0 means free */ 592 char bdg_basename[IFNAMSIZ]; 593 594 /* Indexes of active ports (up to active_ports) 595 * and all other remaining ports. 596 */ 597 uint8_t bdg_port_index[NM_BDG_MAXPORTS]; 598 599 struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; 600 601 602 /* 603 * The function to decide the destination port. 604 * It returns either of an index of the destination port, 605 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to 606 * forward this packet. ring_nr is the source ring index, and the 607 * function may overwrite this value to forward this packet to a 608 * different ring index. 609 * This function must be set by netmap_bdgctl(). 610 */ 611 bdg_lookup_fn_t nm_bdg_lookup; 612 613 /* the forwarding table, MAC+ports. 
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * A few functions to tell which kind of port we are using.
 * XXX should we hold a lock ?
 *
 * nma_is_vp()		virtual port
 * nma_is_host()	port connected to the host stack
 * nma_is_hw()		port connected to a NIC
 */
int nma_is_vp(struct netmap_adapter *na);
int
nma_is_vp(struct netmap_adapter *na)
{
	return na->nm_register == bdg_netmap_reg;
}

static __inline int
nma_is_host(struct netmap_adapter *na)
{
	return na->nm_register == NULL;
}

static __inline int
nma_is_hw(struct netmap_adapter *na)
{
	/* In case of sw adapter, nm_register is NULL */
	return !nma_is_vp(na) && !nma_is_host(na);
}


/*
 * If the NIC is owned by the kernel
 * (i.e., bridge), neither another bridge nor a user can use it;
 * if the NIC is owned by a user, only users can share it.
 * Evaluation must be done under NMG_LOCK().
 */
#define NETMAP_OWNED_BY_KERN(ifp)	(!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
#define NETMAP_OWNED_BY_ANY(ifp) \
	(NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))

/*
 * NA(ifp)->bdg_port	port index
 */


/*
 * this is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
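 *
 * Illustrative note (not from the original comment): since the loop below
 * copies whole 64-byte blocks, a call such as
 *	pkt_copy(src, dst, slot->len);
 * may read and write up to 63 bytes past 'len'; this is safe only because
 * netmap buffers are sized with enough slack, as stated above.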
680 */ 681 static inline void 682 pkt_copy(void *_src, void *_dst, int l) 683 { 684 uint64_t *src = _src; 685 uint64_t *dst = _dst; 686 if (unlikely(l >= 1024)) { 687 memcpy(dst, src, l); 688 return; 689 } 690 for (; likely(l > 0); l-=64) { 691 *dst++ = *src++; 692 *dst++ = *src++; 693 *dst++ = *src++; 694 *dst++ = *src++; 695 *dst++ = *src++; 696 *dst++ = *src++; 697 *dst++ = *src++; 698 *dst++ = *src++; 699 } 700 } 701 702 703 #ifdef TEST_STUFF 704 struct xxx { 705 char *name; 706 void (*fn)(uint32_t); 707 }; 708 709 710 static void 711 nm_test_defmtx(uint32_t n) 712 { 713 uint32_t i; 714 struct mtx m; 715 mtx_init(&m, "test", NULL, MTX_DEF); 716 for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } 717 mtx_destroy(&m); 718 return; 719 } 720 721 static void 722 nm_test_spinmtx(uint32_t n) 723 { 724 uint32_t i; 725 struct mtx m; 726 mtx_init(&m, "test", NULL, MTX_SPIN); 727 for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } 728 mtx_destroy(&m); 729 return; 730 } 731 732 static void 733 nm_test_rlock(uint32_t n) 734 { 735 uint32_t i; 736 struct rwlock m; 737 rw_init(&m, "test"); 738 for (i = 0; i < n; i++) { rw_rlock(&m); rw_runlock(&m); } 739 rw_destroy(&m); 740 return; 741 } 742 743 static void 744 nm_test_wlock(uint32_t n) 745 { 746 uint32_t i; 747 struct rwlock m; 748 rw_init(&m, "test"); 749 for (i = 0; i < n; i++) { rw_wlock(&m); rw_wunlock(&m); } 750 rw_destroy(&m); 751 return; 752 } 753 754 static void 755 nm_test_slock(uint32_t n) 756 { 757 uint32_t i; 758 struct sx m; 759 sx_init(&m, "test"); 760 for (i = 0; i < n; i++) { sx_slock(&m); sx_sunlock(&m); } 761 sx_destroy(&m); 762 return; 763 } 764 765 static void 766 nm_test_xlock(uint32_t n) 767 { 768 uint32_t i; 769 struct sx m; 770 sx_init(&m, "test"); 771 for (i = 0; i < n; i++) { sx_xlock(&m); sx_xunlock(&m); } 772 sx_destroy(&m); 773 return; 774 } 775 776 777 struct xxx nm_tests[] = { 778 { "defmtx", nm_test_defmtx }, 779 { "spinmtx", nm_test_spinmtx }, 780 { "rlock", nm_test_rlock }, 781 { "wlock", nm_test_wlock }, 782 { "slock", nm_test_slock }, 783 { "xlock", nm_test_xlock }, 784 }; 785 786 static int 787 nm_test(struct nmreq *nmr) 788 { 789 uint32_t scale, n, test; 790 static int old_test = -1; 791 792 test = nmr->nr_cmd; 793 scale = nmr->nr_offset; 794 n = sizeof(nm_tests) / sizeof(struct xxx) - 1; 795 if (test > n) { 796 D("test index too high, max %d", n); 797 return 0; 798 } 799 800 if (old_test != test) { 801 D("test %s scale %d", nm_tests[test].name, scale); 802 old_test = test; 803 } 804 nm_tests[test].fn(scale); 805 return 0; 806 } 807 #endif /* TEST_STUFF */ 808 809 /* 810 * locate a bridge among the existing ones. 811 * MUST BE CALLED WITH NMG_LOCK() 812 * 813 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. 814 * We assume that this is called with a name of at least NM_NAME chars. 815 */ 816 static struct nm_bridge * 817 nm_find_bridge(const char *name, int create) 818 { 819 int i, l, namelen; 820 struct nm_bridge *b = NULL; 821 822 NMG_LOCK_ASSERT(); 823 824 namelen = strlen(NM_NAME); /* base length */ 825 l = name ? strlen(name) : 0; /* actual length */ 826 if (l < namelen) { 827 D("invalid bridge name %s", name ? 
name : NULL); 828 return NULL; 829 } 830 for (i = namelen + 1; i < l; i++) { 831 if (name[i] == ':') { 832 namelen = i; 833 break; 834 } 835 } 836 if (namelen >= IFNAMSIZ) 837 namelen = IFNAMSIZ; 838 ND("--- prefix is '%.*s' ---", namelen, name); 839 840 /* lookup the name, remember empty slot if there is one */ 841 for (i = 0; i < NM_BRIDGES; i++) { 842 struct nm_bridge *x = nm_bridges + i; 843 844 if (x->bdg_active_ports == 0) { 845 if (create && b == NULL) 846 b = x; /* record empty slot */ 847 } else if (x->bdg_namelen != namelen) { 848 continue; 849 } else if (strncmp(name, x->bdg_basename, namelen) == 0) { 850 ND("found '%.*s' at %d", namelen, name, i); 851 b = x; 852 break; 853 } 854 } 855 if (i == NM_BRIDGES && b) { /* name not found, can create entry */ 856 /* initialize the bridge */ 857 strncpy(b->bdg_basename, name, namelen); 858 ND("create new bridge %s with ports %d", b->bdg_basename, 859 b->bdg_active_ports); 860 b->bdg_namelen = namelen; 861 b->bdg_active_ports = 0; 862 for (i = 0; i < NM_BDG_MAXPORTS; i++) 863 b->bdg_port_index[i] = i; 864 /* set the default function */ 865 b->nm_bdg_lookup = netmap_bdg_learning; 866 /* reset the MAC address table */ 867 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); 868 } 869 return b; 870 } 871 872 873 /* 874 * Free the forwarding tables for rings attached to switch ports. 875 */ 876 static void 877 nm_free_bdgfwd(struct netmap_adapter *na) 878 { 879 int nrings, i; 880 struct netmap_kring *kring; 881 882 NMG_LOCK_ASSERT(); 883 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 884 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 885 for (i = 0; i < nrings; i++) { 886 if (kring[i].nkr_ft) { 887 free(kring[i].nkr_ft, M_DEVBUF); 888 kring[i].nkr_ft = NULL; /* protect from freeing twice */ 889 } 890 } 891 if (nma_is_hw(na)) 892 nm_free_bdgfwd(SWNA(na->ifp)); 893 } 894 895 896 /* 897 * Allocate the forwarding tables for the rings attached to the bridge ports. 898 */ 899 static int 900 nm_alloc_bdgfwd(struct netmap_adapter *na) 901 { 902 int nrings, l, i, num_dstq; 903 struct netmap_kring *kring; 904 905 NMG_LOCK_ASSERT(); 906 /* all port:rings + broadcast */ 907 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 908 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 909 l += sizeof(struct nm_bdg_q) * num_dstq; 910 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 911 912 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 913 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 914 for (i = 0; i < nrings; i++) { 915 struct nm_bdg_fwd *ft; 916 struct nm_bdg_q *dstq; 917 int j; 918 919 ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); 920 if (!ft) { 921 nm_free_bdgfwd(na); 922 return ENOMEM; 923 } 924 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 925 for (j = 0; j < num_dstq; j++) { 926 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 927 dstq[j].bq_len = 0; 928 } 929 kring[i].nkr_ft = ft; 930 } 931 if (nma_is_hw(na)) 932 nm_alloc_bdgfwd(SWNA(na->ifp)); 933 return 0; 934 } 935 936 937 /* 938 * Fetch configuration from the device, to cope with dynamic 939 * reconfigurations after loading the module. 
940 */ 941 static int 942 netmap_update_config(struct netmap_adapter *na) 943 { 944 struct ifnet *ifp = na->ifp; 945 u_int txr, txd, rxr, rxd; 946 947 txr = txd = rxr = rxd = 0; 948 if (na->nm_config) { 949 na->nm_config(ifp, &txr, &txd, &rxr, &rxd); 950 } else { 951 /* take whatever we had at init time */ 952 txr = na->num_tx_rings; 953 txd = na->num_tx_desc; 954 rxr = na->num_rx_rings; 955 rxd = na->num_rx_desc; 956 } 957 958 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 959 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 960 return 0; /* nothing changed */ 961 if (netmap_verbose || na->refcount > 0) { 962 D("stored config %s: txring %d x %d, rxring %d x %d", 963 ifp->if_xname, 964 na->num_tx_rings, na->num_tx_desc, 965 na->num_rx_rings, na->num_rx_desc); 966 D("new config %s: txring %d x %d, rxring %d x %d", 967 ifp->if_xname, txr, txd, rxr, rxd); 968 } 969 if (na->refcount == 0) { 970 D("configuration changed (but fine)"); 971 na->num_tx_rings = txr; 972 na->num_tx_desc = txd; 973 na->num_rx_rings = rxr; 974 na->num_rx_desc = rxd; 975 return 0; 976 } 977 D("configuration changed while active, this is bad..."); 978 return 1; 979 } 980 981 static struct netmap_if * 982 netmap_if_new(const char *ifname, struct netmap_adapter *na) 983 { 984 if (netmap_update_config(na)) { 985 /* configuration mismatch, report and fail */ 986 return NULL; 987 } 988 return netmap_mem_if_new(ifname, na); 989 } 990 991 992 /* Structure associated to each thread which registered an interface. 993 * 994 * The first 4 fields of this structure are written by NIOCREGIF and 995 * read by poll() and NIOC?XSYNC. 996 * There is low contention among writers (actually, a correct user program 997 * should have no contention among writers) and among writers and readers, 998 * so we use a single global lock to protect the structure initialization. 999 * Since initialization involves the allocation of memory, we reuse the memory 1000 * allocator lock. 1001 * Read access to the structure is lock free. Readers must check that 1002 * np_nifp is not NULL before using the other fields. 1003 * If np_nifp is NULL initialization has not been performed, so they should 1004 * return an error to userlevel. 1005 * 1006 * The ref_done field is used to regulate access to the refcount in the 1007 * memory allocator. The refcount must be incremented at most once for 1008 * each open("/dev/netmap"). The increment is performed by the first 1009 * function that calls netmap_get_memory() (currently called by 1010 * mmap(), NIOCGINFO and NIOCREGIF). 1011 * If the refcount is incremented, it is then decremented when the 1012 * private structure is destroyed. 1013 */ 1014 struct netmap_priv_d { 1015 struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ 1016 1017 struct ifnet *np_ifp; /* device for which we hold a ref. */ 1018 int np_ringid; /* from the ioctl */ 1019 u_int np_qfirst, np_qlast; /* range of rings to scan */ 1020 uint16_t np_txpoll; 1021 1022 struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ 1023 #ifdef __FreeBSD__ 1024 int np_refcount; /* use with NMG_LOCK held */ 1025 #endif /* __FreeBSD__ */ 1026 }; 1027 1028 /* grab a reference to the memory allocator, if we don't have one already. The 1029 * reference is taken from the netmap_adapter registered with the priv. 
1030 * 1031 */ 1032 static int 1033 netmap_get_memory_locked(struct netmap_priv_d* p) 1034 { 1035 struct netmap_mem_d *nmd; 1036 int error = 0; 1037 1038 if (p->np_ifp == NULL) { 1039 if (!netmap_mmap_unreg) 1040 return ENODEV; 1041 /* for compatibility with older versions of the API 1042 * we use the global allocator when no interface has been 1043 * registered 1044 */ 1045 nmd = &nm_mem; 1046 } else { 1047 nmd = NA(p->np_ifp)->nm_mem; 1048 } 1049 if (p->np_mref == NULL) { 1050 error = netmap_mem_finalize(nmd); 1051 if (!error) 1052 p->np_mref = nmd; 1053 } else if (p->np_mref != nmd) { 1054 /* a virtual port has been registered, but previous 1055 * syscalls already used the global allocator. 1056 * We cannot continue 1057 */ 1058 error = ENODEV; 1059 } 1060 return error; 1061 } 1062 1063 static int 1064 netmap_get_memory(struct netmap_priv_d* p) 1065 { 1066 int error; 1067 NMG_LOCK(); 1068 error = netmap_get_memory_locked(p); 1069 NMG_UNLOCK(); 1070 return error; 1071 } 1072 1073 static int 1074 netmap_have_memory_locked(struct netmap_priv_d* p) 1075 { 1076 return p->np_mref != NULL; 1077 } 1078 1079 static void 1080 netmap_drop_memory_locked(struct netmap_priv_d* p) 1081 { 1082 if (p->np_mref) { 1083 netmap_mem_deref(p->np_mref); 1084 p->np_mref = NULL; 1085 } 1086 } 1087 1088 /* 1089 * File descriptor's private data destructor. 1090 * 1091 * Call nm_register(ifp,0) to stop netmap mode on the interface and 1092 * revert to normal operation. We expect that np_ifp has not gone. 1093 * The second argument is the nifp to work on. In some cases it is 1094 * not attached yet to the netmap_priv_d so we need to pass it as 1095 * a separate argument. 1096 */ 1097 /* call with NMG_LOCK held */ 1098 static void 1099 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 1100 { 1101 struct ifnet *ifp = priv->np_ifp; 1102 struct netmap_adapter *na = NA(ifp); 1103 1104 NMG_LOCK_ASSERT(); 1105 na->refcount--; 1106 if (na->refcount <= 0) { /* last instance */ 1107 u_int i; 1108 1109 if (netmap_verbose) 1110 D("deleting last instance for %s", ifp->if_xname); 1111 /* 1112 * (TO CHECK) This function is only called 1113 * when the last reference to this file descriptor goes 1114 * away. This means we cannot have any pending poll() 1115 * or interrupt routine operating on the structure. 1116 * XXX The file may be closed in a thread while 1117 * another thread is using it. 1118 * Linux keeps the file opened until the last reference 1119 * by any outstanding ioctl/poll or mmap is gone. 1120 * FreeBSD does not track mmap()s (but we do) and 1121 * wakes up any sleeping poll(). Need to check what 1122 * happens if the close() occurs while a concurrent 1123 * syscall is running. 1124 */ 1125 na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ 1126 /* Wake up any sleeping threads. netmap_poll will 1127 * then return POLLERR 1128 * XXX The wake up now must happen during *_down(), when 1129 * we order all activities to stop. -gl 1130 */ 1131 nm_free_bdgfwd(na); 1132 for (i = 0; i < na->num_tx_rings + 1; i++) { 1133 mtx_destroy(&na->tx_rings[i].q_lock); 1134 } 1135 for (i = 0; i < na->num_rx_rings + 1; i++) { 1136 mtx_destroy(&na->rx_rings[i].q_lock); 1137 } 1138 /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ 1139 /* knlist_destroy(&na->tx_si.si_note); */ 1140 /* knlist_destroy(&na->rx_si.si_note); */ 1141 if (nma_is_hw(na)) 1142 SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; 1143 } 1144 /* 1145 * netmap_mem_if_delete() deletes the nifp, and if this is 1146 * the last instance also buffers, rings and krings. 1147 */ 1148 netmap_mem_if_delete(na, nifp); 1149 } 1150 1151 1152 /* we assume netmap adapter exists 1153 * Called with NMG_LOCK held 1154 */ 1155 static void 1156 nm_if_rele(struct ifnet *ifp) 1157 { 1158 int i, is_hw, hw, sw, lim; 1159 struct nm_bridge *b; 1160 struct netmap_adapter *na; 1161 uint8_t tmp[NM_BDG_MAXPORTS]; 1162 1163 NMG_LOCK_ASSERT(); 1164 /* I can be called not only for get_ifp()-ed references where netmap's 1165 * capability is guaranteed, but also for non-netmap-capable NICs. 1166 */ 1167 if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { 1168 if_rele(ifp); 1169 return; 1170 } 1171 na = NA(ifp); 1172 b = na->na_bdg; 1173 is_hw = nma_is_hw(na); 1174 1175 ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); 1176 1177 if (!DROP_BDG_REF(ifp)) 1178 return; 1179 1180 /* 1181 New algorithm: 1182 make a copy of bdg_port_index; 1183 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port 1184 in the array of bdg_port_index, replacing them with 1185 entries from the bottom of the array; 1186 decrement bdg_active_ports; 1187 acquire BDG_WLOCK() and copy back the array. 1188 */ 1189 1190 hw = NA(ifp)->bdg_port; 1191 sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; 1192 lim = b->bdg_active_ports; 1193 1194 ND("detach %d and %d (lim %d)", hw, sw, lim); 1195 /* make a copy of the list of active ports, update it, 1196 * and then copy back within BDG_WLOCK(). 1197 */ 1198 memcpy(tmp, b->bdg_port_index, sizeof(tmp)); 1199 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { 1200 if (hw >= 0 && tmp[i] == hw) { 1201 ND("detach hw %d at %d", hw, i); 1202 lim--; /* point to last active port */ 1203 tmp[i] = tmp[lim]; /* swap with i */ 1204 tmp[lim] = hw; /* now this is inactive */ 1205 hw = -1; 1206 } else if (sw >= 0 && tmp[i] == sw) { 1207 ND("detach sw %d at %d", sw, i); 1208 lim--; 1209 tmp[i] = tmp[lim]; 1210 tmp[lim] = sw; 1211 sw = -1; 1212 } else { 1213 i++; 1214 } 1215 } 1216 if (hw >= 0 || sw >= 0) { 1217 D("XXX delete failed hw %d sw %d, should panic...", hw, sw); 1218 } 1219 hw = NA(ifp)->bdg_port; 1220 sw = (is_hw && SWNA(ifp)->na_bdg) ? 
SWNA(ifp)->bdg_port : -1; 1221 1222 BDG_WLOCK(b); 1223 b->bdg_ports[hw] = NULL; 1224 na->na_bdg = NULL; 1225 if (sw >= 0) { 1226 b->bdg_ports[sw] = NULL; 1227 SWNA(ifp)->na_bdg = NULL; 1228 } 1229 memcpy(b->bdg_port_index, tmp, sizeof(tmp)); 1230 b->bdg_active_ports = lim; 1231 BDG_WUNLOCK(b); 1232 1233 ND("now %d active ports", lim); 1234 if (lim == 0) { 1235 ND("marking bridge %s as free", b->bdg_basename); 1236 b->nm_bdg_lookup = NULL; 1237 } 1238 1239 if (is_hw) { 1240 if_rele(ifp); 1241 } else { 1242 if (na->na_flags & NAF_MEM_OWNER) 1243 netmap_mem_private_delete(na->nm_mem); 1244 bzero(na, sizeof(*na)); 1245 free(na, M_DEVBUF); 1246 bzero(ifp, sizeof(*ifp)); 1247 free(ifp, M_DEVBUF); 1248 } 1249 } 1250 1251 1252 /* 1253 * returns 1 if this is the last instance and we can free priv 1254 */ 1255 static int 1256 netmap_dtor_locked(struct netmap_priv_d *priv) 1257 { 1258 struct ifnet *ifp = priv->np_ifp; 1259 1260 #ifdef __FreeBSD__ 1261 /* 1262 * np_refcount is the number of active mmaps on 1263 * this file descriptor 1264 */ 1265 if (--priv->np_refcount > 0) { 1266 return 0; 1267 } 1268 #endif /* __FreeBSD__ */ 1269 if (ifp) { 1270 netmap_do_unregif(priv, priv->np_nifp); 1271 } 1272 netmap_drop_memory_locked(priv); 1273 if (ifp) { 1274 nm_if_rele(ifp); /* might also destroy *na */ 1275 } 1276 return 1; 1277 } 1278 1279 static void 1280 netmap_dtor(void *data) 1281 { 1282 struct netmap_priv_d *priv = data; 1283 int last_instance; 1284 1285 NMG_LOCK(); 1286 last_instance = netmap_dtor_locked(priv); 1287 NMG_UNLOCK(); 1288 if (last_instance) { 1289 bzero(priv, sizeof(*priv)); /* for safety */ 1290 free(priv, M_DEVBUF); 1291 } 1292 } 1293 1294 1295 #ifdef __FreeBSD__ 1296 1297 /* 1298 * In order to track whether pages are still mapped, we hook into 1299 * the standard cdev_pager and intercept the constructor and 1300 * destructor. 1301 */ 1302 1303 struct netmap_vm_handle_t { 1304 struct cdev *dev; 1305 struct netmap_priv_d *priv; 1306 }; 1307 1308 static int 1309 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, 1310 vm_ooffset_t foff, struct ucred *cred, u_short *color) 1311 { 1312 struct netmap_vm_handle_t *vmh = handle; 1313 D("handle %p size %jd prot %d foff %jd", 1314 handle, (intmax_t)size, prot, (intmax_t)foff); 1315 dev_ref(vmh->dev); 1316 return 0; 1317 } 1318 1319 1320 static void 1321 netmap_dev_pager_dtor(void *handle) 1322 { 1323 struct netmap_vm_handle_t *vmh = handle; 1324 struct cdev *dev = vmh->dev; 1325 struct netmap_priv_d *priv = vmh->priv; 1326 D("handle %p", handle); 1327 netmap_dtor(priv); 1328 free(vmh, M_DEVBUF); 1329 dev_rel(dev); 1330 } 1331 1332 static int 1333 netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, 1334 int prot, vm_page_t *mres) 1335 { 1336 struct netmap_vm_handle_t *vmh = object->handle; 1337 struct netmap_priv_d *priv = vmh->priv; 1338 vm_paddr_t paddr; 1339 vm_page_t page; 1340 vm_memattr_t memattr; 1341 vm_pindex_t pidx; 1342 1343 ND("object %p offset %jd prot %d mres %p", 1344 object, (intmax_t)offset, prot, mres); 1345 memattr = object->memattr; 1346 pidx = OFF_TO_IDX(offset); 1347 paddr = netmap_mem_ofstophys(priv->np_mref, offset); 1348 if (paddr == 0) 1349 return VM_PAGER_FAIL; 1350 1351 if (((*mres)->flags & PG_FICTITIOUS) != 0) { 1352 /* 1353 * If the passed in result page is a fake page, update it with 1354 * the new physical address. 
1355 */ 1356 page = *mres; 1357 vm_page_updatefake(page, paddr, memattr); 1358 } else { 1359 /* 1360 * Replace the passed in reqpage page with our own fake page and 1361 * free up the all of the original pages. 1362 */ 1363 #ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ 1364 #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK 1365 #define VM_OBJECT_WLOCK VM_OBJECT_LOCK 1366 #endif /* VM_OBJECT_WUNLOCK */ 1367 1368 VM_OBJECT_WUNLOCK(object); 1369 page = vm_page_getfake(paddr, memattr); 1370 VM_OBJECT_WLOCK(object); 1371 vm_page_lock(*mres); 1372 vm_page_free(*mres); 1373 vm_page_unlock(*mres); 1374 *mres = page; 1375 vm_page_insert(page, object, pidx); 1376 } 1377 page->valid = VM_PAGE_BITS_ALL; 1378 return (VM_PAGER_OK); 1379 } 1380 1381 1382 static struct cdev_pager_ops netmap_cdev_pager_ops = { 1383 .cdev_pg_ctor = netmap_dev_pager_ctor, 1384 .cdev_pg_dtor = netmap_dev_pager_dtor, 1385 .cdev_pg_fault = netmap_dev_pager_fault, 1386 }; 1387 1388 1389 static int 1390 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, 1391 vm_size_t objsize, vm_object_t *objp, int prot) 1392 { 1393 int error; 1394 struct netmap_vm_handle_t *vmh; 1395 struct netmap_priv_d *priv; 1396 vm_object_t obj; 1397 1398 D("cdev %p foff %jd size %jd objp %p prot %d", cdev, 1399 (intmax_t )*foff, (intmax_t )objsize, objp, prot); 1400 1401 vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, 1402 M_NOWAIT | M_ZERO); 1403 if (vmh == NULL) 1404 return ENOMEM; 1405 vmh->dev = cdev; 1406 1407 NMG_LOCK(); 1408 error = devfs_get_cdevpriv((void**)&priv); 1409 if (error) 1410 goto err_unlock; 1411 vmh->priv = priv; 1412 priv->np_refcount++; 1413 NMG_UNLOCK(); 1414 1415 error = netmap_get_memory(priv); 1416 if (error) 1417 goto err_deref; 1418 1419 obj = cdev_pager_allocate(vmh, OBJT_DEVICE, 1420 &netmap_cdev_pager_ops, objsize, prot, 1421 *foff, NULL); 1422 if (obj == NULL) { 1423 D("cdev_pager_allocate failed"); 1424 error = EINVAL; 1425 goto err_deref; 1426 } 1427 1428 *objp = obj; 1429 return 0; 1430 1431 err_deref: 1432 NMG_LOCK(); 1433 priv->np_refcount--; 1434 err_unlock: 1435 NMG_UNLOCK(); 1436 // err: 1437 free(vmh, M_DEVBUF); 1438 return error; 1439 } 1440 1441 1442 // XXX can we remove this ? 1443 static int 1444 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) 1445 { 1446 if (netmap_verbose) 1447 D("dev %p fflag 0x%x devtype %d td %p", 1448 dev, fflag, devtype, td); 1449 return 0; 1450 } 1451 1452 1453 static int 1454 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 1455 { 1456 struct netmap_priv_d *priv; 1457 int error; 1458 1459 (void)dev; 1460 (void)oflags; 1461 (void)devtype; 1462 (void)td; 1463 1464 // XXX wait or nowait ? 1465 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 1466 M_NOWAIT | M_ZERO); 1467 if (priv == NULL) 1468 return ENOMEM; 1469 1470 error = devfs_set_cdevpriv(priv, netmap_dtor); 1471 if (error) 1472 return error; 1473 1474 priv->np_refcount = 1; 1475 1476 return 0; 1477 } 1478 #endif /* __FreeBSD__ */ 1479 1480 1481 /* 1482 * Handlers for synchronization of the queues from/to the host. 1483 * Netmap has two operating modes: 1484 * - in the default mode, the rings connected to the host stack are 1485 * just another ring pair managed by userspace; 1486 * - in transparent mode (XXX to be defined) incoming packets 1487 * (from the host or the NIC) are marked as NS_FORWARD upon 1488 * arrival, and the user application has a chance to reset the 1489 * flag for packets that should be dropped. 
1490 * On the RXSYNC or poll(), packets in RX rings between 1491 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 1492 * to the other side. 1493 * The transfer NIC --> host is relatively easy, just encapsulate 1494 * into mbufs and we are done. The host --> NIC side is slightly 1495 * harder because there might not be room in the tx ring so it 1496 * might take a while before releasing the buffer. 1497 */ 1498 1499 1500 /* 1501 * pass a chain of buffers to the host stack as coming from 'dst' 1502 */ 1503 static void 1504 netmap_send_up(struct ifnet *dst, struct mbuf *head) 1505 { 1506 struct mbuf *m; 1507 1508 /* send packets up, outside the lock */ 1509 while ((m = head) != NULL) { 1510 head = head->m_nextpkt; 1511 m->m_nextpkt = NULL; 1512 if (netmap_verbose & NM_VERB_HOST) 1513 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 1514 NM_SEND_UP(dst, m); 1515 } 1516 } 1517 1518 struct mbq { 1519 struct mbuf *head; 1520 struct mbuf *tail; 1521 int count; 1522 }; 1523 1524 1525 /* 1526 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 1527 * Run from hwcur to cur - reserved 1528 */ 1529 static void 1530 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 1531 { 1532 /* Take packets from hwcur to cur-reserved and pass them up. 1533 * In case of no buffers we give up. At the end of the loop, 1534 * the queue is drained in all cases. 1535 * XXX handle reserved 1536 */ 1537 u_int lim = kring->nkr_num_slots - 1; 1538 struct mbuf *m, *tail = q->tail; 1539 u_int k = kring->ring->cur, n = kring->ring->reserved; 1540 struct netmap_mem_d *nmd = kring->na->nm_mem; 1541 1542 /* compute the final position, ring->cur - ring->reserved */ 1543 if (n > 0) { 1544 if (k < n) 1545 k += kring->nkr_num_slots; 1546 k += n; 1547 } 1548 for (n = kring->nr_hwcur; n != k;) { 1549 struct netmap_slot *slot = &kring->ring->slot[n]; 1550 1551 n = nm_next(n, lim); 1552 if ((slot->flags & NS_FORWARD) == 0 && !force) 1553 continue; 1554 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { 1555 D("bad pkt at %d len %d", n, slot->len); 1556 continue; 1557 } 1558 slot->flags &= ~NS_FORWARD; // XXX needed ? 1559 /* XXX adapt to the case of a multisegment packet */ 1560 m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); 1561 1562 if (m == NULL) 1563 break; 1564 if (tail) 1565 tail->m_nextpkt = m; 1566 else 1567 q->head = m; 1568 tail = m; 1569 q->count++; 1570 m->m_nextpkt = NULL; 1571 } 1572 q->tail = tail; 1573 } 1574 1575 1576 /* 1577 * The host ring has packets from nr_hwcur to (cur - reserved) 1578 * to be sent down to the NIC. 1579 * We need to use the queue lock on the source (host RX ring) 1580 * to protect against netmap_transmit. 1581 * If the user is well behaved we do not need to acquire locks 1582 * on the destination(s), 1583 * so we only need to make sure that there are no panics because 1584 * of user errors. 1585 * XXX verify 1586 * 1587 * We scan the tx rings, which have just been 1588 * flushed so nr_hwcur == cur. Pushing packets down means 1589 * increment cur and decrement avail. 
1590 * XXX to be verified 1591 */ 1592 static void 1593 netmap_sw_to_nic(struct netmap_adapter *na) 1594 { 1595 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1596 struct netmap_kring *k1 = &na->tx_rings[0]; 1597 u_int i, howmany, src_lim, dst_lim; 1598 1599 /* XXX we should also check that the carrier is on */ 1600 if (kring->nkr_stopped) 1601 return; 1602 1603 mtx_lock(&kring->q_lock); 1604 1605 if (kring->nkr_stopped) 1606 goto out; 1607 1608 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 1609 1610 src_lim = kring->nkr_num_slots - 1; 1611 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 1612 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 1613 dst_lim = k1->nkr_num_slots - 1; 1614 while (howmany > 0 && k1->ring->avail > 0) { 1615 struct netmap_slot *src, *dst, tmp; 1616 src = &kring->ring->slot[kring->nr_hwcur]; 1617 dst = &k1->ring->slot[k1->ring->cur]; 1618 tmp = *src; 1619 src->buf_idx = dst->buf_idx; 1620 src->flags = NS_BUF_CHANGED; 1621 1622 dst->buf_idx = tmp.buf_idx; 1623 dst->len = tmp.len; 1624 dst->flags = NS_BUF_CHANGED; 1625 ND("out len %d buf %d from %d to %d", 1626 dst->len, dst->buf_idx, 1627 kring->nr_hwcur, k1->ring->cur); 1628 1629 kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 1630 howmany--; 1631 kring->nr_hwavail--; 1632 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 1633 k1->ring->avail--; 1634 } 1635 kring->ring->cur = kring->nr_hwcur; // XXX 1636 k1++; // XXX why? 1637 } 1638 out: 1639 mtx_unlock(&kring->q_lock); 1640 } 1641 1642 1643 /* 1644 * netmap_txsync_to_host() passes packets up. We are called from a 1645 * system call in user process context, and the only contention 1646 * can be among multiple user threads erroneously calling 1647 * this routine concurrently. 1648 */ 1649 static void 1650 netmap_txsync_to_host(struct netmap_adapter *na) 1651 { 1652 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 1653 struct netmap_ring *ring = kring->ring; 1654 u_int k, lim = kring->nkr_num_slots - 1; 1655 struct mbq q = { NULL, NULL, 0 }; 1656 1657 if (nm_kr_tryget(kring)) { 1658 D("ring %p busy (user error)", kring); 1659 return; 1660 } 1661 k = ring->cur; 1662 if (k > lim) { 1663 D("invalid ring index in stack TX kring %p", kring); 1664 netmap_ring_reinit(kring); 1665 nm_kr_put(kring); 1666 return; 1667 } 1668 1669 /* Take packets from hwcur to cur and pass them up. 1670 * In case of no buffers we give up. At the end of the loop, 1671 * the queue is drained in all cases. 1672 */ 1673 netmap_grab_packets(kring, &q, 1); 1674 kring->nr_hwcur = k; 1675 kring->nr_hwavail = ring->avail = lim; 1676 1677 nm_kr_put(kring); 1678 netmap_send_up(na->ifp, q.head); 1679 } 1680 1681 1682 /* 1683 * This is the 'txsync' handler to send from a software ring to the 1684 * host stack. 1685 */ 1686 /* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ 1687 static int 1688 netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) 1689 { 1690 (void)ring_nr; 1691 (void)flags; 1692 if (netmap_verbose > 255) 1693 RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); 1694 netmap_txsync_to_host(NA(ifp)); 1695 return 0; 1696 } 1697 1698 1699 /* 1700 * rxsync backend for packets coming from the host stack. 1701 * They have been put in the queue by netmap_transmit() so we 1702 * need to protect access to the kring using a lock. 1703 * 1704 * This routine also does the selrecord if called from the poll handler 1705 * (we know because td != NULL). 
1706 * 1707 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 1708 * as an additional hidden argument. 1709 */ 1710 static void 1711 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 1712 { 1713 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1714 struct netmap_ring *ring = kring->ring; 1715 u_int j, n, lim = kring->nkr_num_slots; 1716 u_int k = ring->cur, resvd = ring->reserved; 1717 1718 (void)pwait; /* disable unused warnings */ 1719 1720 if (kring->nkr_stopped) /* check a first time without lock */ 1721 return; 1722 1723 /* XXX as an optimization we could reuse na->core_lock */ 1724 mtx_lock(&kring->q_lock); 1725 1726 if (kring->nkr_stopped) /* check again with lock held */ 1727 goto unlock_out; 1728 1729 if (k >= lim) { 1730 netmap_ring_reinit(kring); 1731 goto unlock_out; 1732 } 1733 /* new packets are already set in nr_hwavail */ 1734 /* skip past packets that userspace has released */ 1735 j = kring->nr_hwcur; 1736 if (resvd > 0) { 1737 if (resvd + ring->avail >= lim + 1) { 1738 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 1739 ring->reserved = resvd = 0; // XXX panic... 1740 } 1741 k = (k >= resvd) ? k - resvd : k + lim - resvd; 1742 } 1743 if (j != k) { 1744 n = k >= j ? k - j : k + lim - j; 1745 kring->nr_hwavail -= n; 1746 kring->nr_hwcur = k; 1747 } 1748 k = ring->avail = kring->nr_hwavail - resvd; 1749 if (k == 0 && td) 1750 selrecord(td, &kring->si); 1751 if (k && (netmap_verbose & NM_VERB_HOST)) 1752 D("%d pkts from stack", k); 1753 unlock_out: 1754 1755 mtx_unlock(&kring->q_lock); 1756 } 1757 1758 1759 /* 1760 * MUST BE CALLED UNDER NMG_LOCK() 1761 * 1762 * get a refcounted reference to an interface. 1763 * This is always called in the execution of an ioctl(). 1764 * 1765 * Return ENXIO if the interface does not exist, EINVAL if netmap 1766 * is not supported by the interface. 1767 * If successful, hold a reference. 1768 * 1769 * When the NIC is attached to a bridge, reference is managed 1770 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1771 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1772 * is detached from the bridge, then ifp's refcount is dropped (this 1773 * is equivalent to that ifp is destroyed in case of virtual ports. 1774 * 1775 * This function uses if_rele() when we want to prevent the NIC from 1776 * being detached from the bridge in error handling. But once refcount 1777 * is acquired by this function, it must be released using nm_if_rele(). 1778 */ 1779 static int 1780 get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) 1781 { 1782 const char *name = nmr->nr_name; 1783 int namelen = strlen(name); 1784 struct ifnet *iter = NULL; 1785 int no_prefix = 0; 1786 1787 /* first try to see if this is a bridge port. */ 1788 struct nm_bridge *b; 1789 struct netmap_adapter *na; 1790 int i, j, cand = -1, cand2 = -1; 1791 int needed; 1792 1793 NMG_LOCK_ASSERT(); 1794 *ifp = NULL; /* default */ 1795 if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { 1796 no_prefix = 1; /* no VALE prefix */ 1797 goto no_bridge_port; 1798 } 1799 1800 b = nm_find_bridge(name, create); 1801 if (b == NULL) { 1802 D("no bridges available for '%s'", name); 1803 return (ENXIO); 1804 } 1805 1806 /* Now we are sure that name starts with the bridge's name, 1807 * lookup the port in the bridge. We need to scan the entire 1808 * list. 
It is not important to hold a WLOCK on the bridge 1809 * during the search because NMG_LOCK already guarantees 1810 * that there are no other possible writers. 1811 */ 1812 1813 /* lookup in the local list of ports */ 1814 for (j = 0; j < b->bdg_active_ports; j++) { 1815 i = b->bdg_port_index[j]; 1816 na = b->bdg_ports[i]; 1817 // KASSERT(na != NULL); 1818 iter = na->ifp; 1819 /* XXX make sure the name only contains one : */ 1820 if (!strcmp(iter->if_xname, name) /* virtual port */ || 1821 (namelen > b->bdg_namelen && !strcmp(iter->if_xname, 1822 name + b->bdg_namelen + 1)) /* NIC */) { 1823 ADD_BDG_REF(iter); 1824 ND("found existing if %s refs %d", name, 1825 NA(iter)->na_bdg_refcount); 1826 *ifp = iter; 1827 /* we are done, this is surely netmap capable */ 1828 return 0; 1829 } 1830 } 1831 /* not found, should we create it? */ 1832 if (!create) 1833 return ENXIO; 1834 /* yes we should, see if we have space to attach entries */ 1835 needed = 2; /* in some cases we only need 1 */ 1836 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { 1837 D("bridge full %d, cannot create new port", b->bdg_active_ports); 1838 return EINVAL; 1839 } 1840 /* record the next two ports available, but do not allocate yet */ 1841 cand = b->bdg_port_index[b->bdg_active_ports]; 1842 cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; 1843 ND("+++ bridge %s port %s used %d avail %d %d", 1844 b->bdg_basename, name, b->bdg_active_ports, cand, cand2); 1845 1846 /* 1847 * try see if there is a matching NIC with this name 1848 * (after the bridge's name) 1849 */ 1850 iter = ifunit_ref(name + b->bdg_namelen + 1); 1851 if (!iter) { /* this is a virtual port */ 1852 /* Create a temporary NA with arguments, then 1853 * bdg_netmap_attach() will allocate the real one 1854 * and attach it to the ifp 1855 */ 1856 struct netmap_adapter tmp_na; 1857 int error; 1858 1859 if (nmr->nr_cmd) { 1860 /* nr_cmd must be 0 for a virtual port */ 1861 return EINVAL; 1862 } 1863 bzero(&tmp_na, sizeof(tmp_na)); 1864 /* bound checking */ 1865 tmp_na.num_tx_rings = nmr->nr_tx_rings; 1866 nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1867 nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back 1868 tmp_na.num_rx_rings = nmr->nr_rx_rings; 1869 nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1870 nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back 1871 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1872 1, NM_BDG_MAXSLOTS, NULL); 1873 tmp_na.num_tx_desc = nmr->nr_tx_slots; 1874 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1875 1, NM_BDG_MAXSLOTS, NULL); 1876 tmp_na.num_rx_desc = nmr->nr_rx_slots; 1877 1878 /* create a struct ifnet for the new port. 
1879 * need M_NOWAIT as we are under nma_lock 1880 */ 1881 iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); 1882 if (!iter) 1883 return ENOMEM; 1884 1885 strcpy(iter->if_xname, name); 1886 tmp_na.ifp = iter; 1887 /* bdg_netmap_attach creates a struct netmap_adapter */ 1888 error = bdg_netmap_attach(&tmp_na); 1889 if (error) { 1890 D("error %d", error); 1891 free(iter, M_DEVBUF); 1892 return error; 1893 } 1894 cand2 = -1; /* only need one port */ 1895 } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ 1896 /* make sure the NIC is not already in use */ 1897 if (NETMAP_OWNED_BY_ANY(iter)) { 1898 D("NIC %s busy, cannot attach to bridge", 1899 iter->if_xname); 1900 if_rele(iter); /* don't detach from bridge */ 1901 return EINVAL; 1902 } 1903 if (nmr->nr_arg1 != NETMAP_BDG_HOST) 1904 cand2 = -1; /* only need one port */ 1905 } else { /* not a netmap-capable NIC */ 1906 if_rele(iter); /* don't detach from bridge */ 1907 return EINVAL; 1908 } 1909 na = NA(iter); 1910 1911 BDG_WLOCK(b); 1912 na->bdg_port = cand; 1913 ND("NIC %p to bridge port %d", NA(iter), cand); 1914 /* bind the port to the bridge (virtual ports are not active) */ 1915 b->bdg_ports[cand] = na; 1916 na->na_bdg = b; 1917 b->bdg_active_ports++; 1918 if (cand2 >= 0) { 1919 /* also bind the host stack to the bridge */ 1920 b->bdg_ports[cand2] = SWNA(iter); 1921 SWNA(iter)->bdg_port = cand2; 1922 SWNA(iter)->na_bdg = b; 1923 b->bdg_active_ports++; 1924 ND("host %p to bridge port %d", SWNA(iter), cand2); 1925 } 1926 ADD_BDG_REF(iter); // XXX one or two ? 1927 ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); 1928 BDG_WUNLOCK(b); 1929 *ifp = iter; 1930 return 0; 1931 1932 no_bridge_port: 1933 *ifp = iter; 1934 if (! *ifp) 1935 *ifp = ifunit_ref(name); 1936 if (*ifp == NULL) 1937 return (ENXIO); 1938 1939 if (NETMAP_CAPABLE(*ifp)) { 1940 /* Users cannot use the NIC attached to a bridge directly */ 1941 if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { 1942 if_rele(*ifp); /* don't detach from bridge */ 1943 return EINVAL; 1944 } else 1945 return 0; /* valid pointer, we hold the refcount */ 1946 } 1947 nm_if_rele(*ifp); 1948 return EINVAL; // not NETMAP capable 1949 } 1950 1951 1952 /* 1953 * Error routine called when txsync/rxsync detects an error. 1954 * Can't do much more than resetting cur = hwcur, avail = hwavail. 1955 * Return 1 on reinit. 1956 * 1957 * This routine is only called by the upper half of the kernel. 1958 * It only reads hwcur (which is changed only by the upper half, too) 1959 * and hwavail (which may be changed by the lower half, but only on 1960 * a tx ring and only to increase it, so any error will be recovered 1961 * on the next call). For the above, we don't strictly need to call 1962 * it under lock. 
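 *
 * A minimal usage sketch (hedged; it mirrors the check in
 * netmap_rxsync_from_host() above, real drivers differ in detail):
 *
 *	if (k >= lim) {			// userspace handed us a bogus cur
 *		netmap_ring_reinit(kring);	// cur = hwcur, avail = hwavail
 *		return;			// the caller typically reports an error
 *	}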
1963 */ 1964 int 1965 netmap_ring_reinit(struct netmap_kring *kring) 1966 { 1967 struct netmap_ring *ring = kring->ring; 1968 u_int i, lim = kring->nkr_num_slots - 1; 1969 int errors = 0; 1970 1971 // XXX KASSERT nm_kr_tryget 1972 RD(10, "called for %s", kring->na->ifp->if_xname); 1973 if (ring->cur > lim) 1974 errors++; 1975 for (i = 0; i <= lim; i++) { 1976 u_int idx = ring->slot[i].buf_idx; 1977 u_int len = ring->slot[i].len; 1978 if (idx < 2 || idx >= netmap_total_buffers) { 1979 if (!errors++) 1980 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 1981 ring->slot[i].buf_idx = 0; 1982 ring->slot[i].len = 0; 1983 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1984 ring->slot[i].len = 0; 1985 if (!errors++) 1986 D("bad len %d at slot %d idx %d", 1987 len, i, idx); 1988 } 1989 } 1990 if (errors) { 1991 int pos = kring - kring->na->tx_rings; 1992 int n = kring->na->num_tx_rings + 1; 1993 1994 RD(10, "total %d errors", errors); 1995 errors++; 1996 RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 1997 kring->na->ifp->if_xname, 1998 pos < n ? "TX" : "RX", pos < n ? pos : pos - n, 1999 ring->cur, kring->nr_hwcur, 2000 ring->avail, kring->nr_hwavail); 2001 ring->cur = kring->nr_hwcur; 2002 ring->avail = kring->nr_hwavail; 2003 } 2004 return (errors ? 1 : 0); 2005 } 2006 2007 2008 /* 2009 * Set the ring ID. For devices with a single queue, a request 2010 * for all rings is the same as a single ring. 2011 */ 2012 static int 2013 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 2014 { 2015 struct ifnet *ifp = priv->np_ifp; 2016 struct netmap_adapter *na = NA(ifp); 2017 u_int i = ringid & NETMAP_RING_MASK; 2018 /* initially (np_qfirst == np_qlast) we don't want to lock */ 2019 u_int lim = na->num_rx_rings; 2020 2021 if (na->num_tx_rings > lim) 2022 lim = na->num_tx_rings; 2023 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 2024 D("invalid ring id %d", i); 2025 return (EINVAL); 2026 } 2027 priv->np_ringid = ringid; 2028 if (ringid & NETMAP_SW_RING) { 2029 priv->np_qfirst = NETMAP_SW_RING; 2030 priv->np_qlast = 0; 2031 } else if (ringid & NETMAP_HW_RING) { 2032 priv->np_qfirst = i; 2033 priv->np_qlast = i + 1; 2034 } else { 2035 priv->np_qfirst = 0; 2036 priv->np_qlast = NETMAP_HW_RING ; 2037 } 2038 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 2039 if (netmap_verbose) { 2040 if (ringid & NETMAP_SW_RING) 2041 D("ringid %s set to SW RING", ifp->if_xname); 2042 else if (ringid & NETMAP_HW_RING) 2043 D("ringid %s set to HW RING %d", ifp->if_xname, 2044 priv->np_qfirst); 2045 else 2046 D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); 2047 } 2048 return 0; 2049 } 2050 2051 2052 /* 2053 * possibly move the interface to netmap-mode. 2054 * If success it returns a pointer to netmap_if, otherwise NULL. 2055 * This must be called with NMG_LOCK held. 
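 *
 * A sketch of the expected calling sequence (this mirrors the NIOCREGIF
 * handler and nm_bdg_attach() below; error handling omitted):
 *
 *	NMG_LOCK();
 *	error = get_ifp(nmr, &ifp, 1 /* create */);
 *	nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
 *	NMG_UNLOCK();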
2056 */ 2057 static struct netmap_if * 2058 netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, 2059 uint16_t ringid, int *err) 2060 { 2061 struct netmap_adapter *na = NA(ifp); 2062 struct netmap_if *nifp = NULL; 2063 int error, need_mem; 2064 2065 NMG_LOCK_ASSERT(); 2066 /* ring configuration may have changed, fetch from the card */ 2067 netmap_update_config(na); 2068 priv->np_ifp = ifp; /* store the reference */ 2069 error = netmap_set_ringid(priv, ringid); 2070 if (error) 2071 goto out; 2072 /* ensure allocators are ready */ 2073 need_mem = !netmap_have_memory_locked(priv); 2074 if (need_mem) { 2075 error = netmap_get_memory_locked(priv); 2076 ND("get_memory returned %d", error); 2077 if (error) 2078 goto out; 2079 } 2080 nifp = netmap_if_new(ifp->if_xname, na); 2081 if (nifp == NULL) { /* allocation failed */ 2082 /* we should drop the allocator, but only 2083 * if we were the ones who grabbed it 2084 */ 2085 if (need_mem) 2086 netmap_drop_memory_locked(priv); 2087 error = ENOMEM; 2088 goto out; 2089 } 2090 na->refcount++; 2091 if (ifp->if_capenable & IFCAP_NETMAP) { 2092 /* was already set */ 2093 } else { 2094 u_int i; 2095 /* Otherwise set the card in netmap mode 2096 * and make it use the shared buffers. 2097 * 2098 * If the interface is attached to a bridge, lock it. 2099 */ 2100 if (NETMAP_OWNED_BY_KERN(ifp)) 2101 BDG_WLOCK(NA(ifp)->na_bdg); 2102 for (i = 0 ; i < na->num_tx_rings + 1; i++) 2103 mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", 2104 NULL, MTX_DEF); 2105 for (i = 0 ; i < na->num_rx_rings + 1; i++) { 2106 mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", 2107 NULL, MTX_DEF); 2108 } 2109 if (nma_is_hw(na)) { 2110 SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; 2111 SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; 2112 } 2113 /* 2114 * do not core lock because the race is harmless here, 2115 * there cannot be any traffic to netmap_transmit() 2116 */ 2117 error = na->nm_register(ifp, 1); /* mode on */ 2118 // XXX do we need to nm_alloc_bdgfwd() in all cases ? 2119 if (!error) 2120 error = nm_alloc_bdgfwd(na); 2121 if (error) { 2122 netmap_do_unregif(priv, nifp); 2123 nifp = NULL; 2124 } 2125 if (NETMAP_OWNED_BY_KERN(ifp)) 2126 BDG_WUNLOCK(NA(ifp)->na_bdg); 2127 2128 } 2129 out: 2130 *err = error; 2131 if (nifp != NULL) { 2132 /* 2133 * advertise that the interface is ready bt setting ni_nifp. 2134 * The barrier is needed because readers (poll and *SYNC) 2135 * check for priv->np_nifp != NULL without locking 2136 */ 2137 wmb(); /* make sure previous writes are visible to all CPUs */ 2138 priv->np_nifp = nifp; 2139 } 2140 return nifp; 2141 } 2142 2143 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ 2144 static int 2145 nm_bdg_attach(struct nmreq *nmr) 2146 { 2147 struct ifnet *ifp; 2148 struct netmap_if *nifp; 2149 struct netmap_priv_d *npriv; 2150 int error; 2151 2152 npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); 2153 if (npriv == NULL) 2154 return ENOMEM; 2155 NMG_LOCK(); 2156 error = get_ifp(nmr, &ifp, 1 /* create if not exists */); 2157 if (error) /* no device, or another bridge or user owns the device */ 2158 goto unlock_exit; 2159 /* get_ifp() sets na_bdg if this is a physical interface 2160 * that we can attach to a switch. 2161 */ 2162 if (!NETMAP_OWNED_BY_KERN(ifp)) { 2163 /* got reference to a virtual port or direct access to a NIC. 
2164 * perhaps specified no bridge prefix or wrong NIC name 2165 */ 2166 error = EINVAL; 2167 goto unref_exit; 2168 } 2169 2170 if (NA(ifp)->refcount > 0) { /* already registered */ 2171 error = EBUSY; 2172 DROP_BDG_REF(ifp); 2173 goto unlock_exit; 2174 } 2175 2176 nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error); 2177 if (!nifp) { 2178 goto unref_exit; 2179 } 2180 2181 NA(ifp)->na_kpriv = npriv; 2182 NMG_UNLOCK(); 2183 ND("registered %s to netmap-mode", ifp->if_xname); 2184 return 0; 2185 2186 unref_exit: 2187 nm_if_rele(ifp); 2188 unlock_exit: 2189 NMG_UNLOCK(); 2190 bzero(npriv, sizeof(*npriv)); 2191 free(npriv, M_DEVBUF); 2192 return error; 2193 } 2194 2195 static int 2196 nm_bdg_detach(struct nmreq *nmr) 2197 { 2198 struct ifnet *ifp; 2199 int error; 2200 int last_instance; 2201 2202 NMG_LOCK(); 2203 error = get_ifp(nmr, &ifp, 0 /* don't create */); 2204 if (error) { /* no device, or another bridge or user owns the device */ 2205 goto unlock_exit; 2206 } 2207 /* XXX do we need to check this ? */ 2208 if (!NETMAP_OWNED_BY_KERN(ifp)) { 2209 /* got reference to a virtual port or direct access to a NIC. 2210 * perhaps specified no bridge's prefix or wrong NIC's name 2211 */ 2212 error = EINVAL; 2213 goto unref_exit; 2214 } 2215 2216 if (NA(ifp)->refcount == 0) { /* not registered */ 2217 error = EINVAL; 2218 goto unref_exit; 2219 } 2220 2221 DROP_BDG_REF(ifp); /* the one from get_ifp */ 2222 last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */ 2223 NMG_UNLOCK(); 2224 if (!last_instance) { 2225 D("--- error, trying to detach an entry with active mmaps"); 2226 error = EINVAL; 2227 } else { 2228 struct netmap_priv_d *npriv = NA(ifp)->na_kpriv; 2229 NA(ifp)->na_kpriv = NULL; 2230 2231 bzero(npriv, sizeof(*npriv)); 2232 free(npriv, M_DEVBUF); 2233 } 2234 return error; 2235 2236 unref_exit: 2237 nm_if_rele(ifp); 2238 unlock_exit: 2239 NMG_UNLOCK(); 2240 return error; 2241 } 2242 2243 2244 /* Initialize necessary fields of sw adapter located in right after hw's 2245 * one. sw adapter attaches a pair of sw rings of the netmap-mode NIC. 2246 * It is always activated and deactivated at the same tie with the hw's one. 2247 * Thus we don't need refcounting on the sw adapter. 2248 * Regardless of NIC's feature we use separate lock so that anybody can lock 2249 * me independently from the hw adapter. 2250 * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw 2251 */ 2252 static void 2253 netmap_attach_sw(struct ifnet *ifp) 2254 { 2255 struct netmap_adapter *hw_na = NA(ifp); 2256 struct netmap_adapter *na = SWNA(ifp); 2257 2258 na->ifp = ifp; 2259 na->num_rx_rings = na->num_tx_rings = 1; 2260 na->num_tx_desc = hw_na->num_tx_desc; 2261 na->num_rx_desc = hw_na->num_rx_desc; 2262 na->nm_txsync = netmap_bdg_to_host; 2263 /* we use the same memory allocator as the 2264 * the hw adapter */ 2265 na->nm_mem = hw_na->nm_mem; 2266 } 2267 2268 2269 /* exported to kernel callers, e.g. OVS ? 2270 * Entry point. 2271 * Called without NMG_LOCK. 
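 *
 * A hedged sketch of a kernel caller attaching a NIC to a VALE switch
 * ("valeX:em0" is only an example name: bridge prefix, ':', NIC name;
 * error handling omitted):
 *
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	strncpy(nmr.nr_name, "valeX:em0", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	error = netmap_bdg_ctl(&nmr, NULL);	// func only needed for LOOKUP_REG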
2272 */ 2273 int 2274 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) 2275 { 2276 struct nm_bridge *b; 2277 struct netmap_adapter *na; 2278 struct ifnet *iter; 2279 char *name = nmr->nr_name; 2280 int cmd = nmr->nr_cmd, namelen = strlen(name); 2281 int error = 0, i, j; 2282 2283 switch (cmd) { 2284 case NETMAP_BDG_ATTACH: 2285 error = nm_bdg_attach(nmr); 2286 break; 2287 2288 case NETMAP_BDG_DETACH: 2289 error = nm_bdg_detach(nmr); 2290 break; 2291 2292 case NETMAP_BDG_LIST: 2293 /* this is used to enumerate bridges and ports */ 2294 if (namelen) { /* look up indexes of bridge and port */ 2295 if (strncmp(name, NM_NAME, strlen(NM_NAME))) { 2296 error = EINVAL; 2297 break; 2298 } 2299 NMG_LOCK(); 2300 b = nm_find_bridge(name, 0 /* don't create */); 2301 if (!b) { 2302 error = ENOENT; 2303 NMG_UNLOCK(); 2304 break; 2305 } 2306 2307 error = ENOENT; 2308 for (j = 0; j < b->bdg_active_ports; j++) { 2309 i = b->bdg_port_index[j]; 2310 na = b->bdg_ports[i]; 2311 if (na == NULL) { 2312 D("---AAAAAAAAARGH-------"); 2313 continue; 2314 } 2315 iter = na->ifp; 2316 /* the former and the latter identify a 2317 * virtual port and a NIC, respectively 2318 */ 2319 if (!strcmp(iter->if_xname, name) || 2320 (namelen > b->bdg_namelen && 2321 !strcmp(iter->if_xname, 2322 name + b->bdg_namelen + 1))) { 2323 /* bridge index */ 2324 nmr->nr_arg1 = b - nm_bridges; 2325 nmr->nr_arg2 = i; /* port index */ 2326 error = 0; 2327 break; 2328 } 2329 } 2330 NMG_UNLOCK(); 2331 } else { 2332 /* return the first non-empty entry starting from 2333 * bridge nr_arg1 and port nr_arg2. 2334 * 2335 * Users can detect the end of the same bridge by 2336 * seeing the new and old value of nr_arg1, and can 2337 * detect the end of all the bridge by error != 0 2338 */ 2339 i = nmr->nr_arg1; 2340 j = nmr->nr_arg2; 2341 2342 NMG_LOCK(); 2343 for (error = ENOENT; i < NM_BRIDGES; i++) { 2344 b = nm_bridges + i; 2345 if (j >= b->bdg_active_ports) { 2346 j = 0; /* following bridges scan from 0 */ 2347 continue; 2348 } 2349 nmr->nr_arg1 = i; 2350 nmr->nr_arg2 = j; 2351 j = b->bdg_port_index[j]; 2352 na = b->bdg_ports[j]; 2353 iter = na->ifp; 2354 strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); 2355 error = 0; 2356 break; 2357 } 2358 NMG_UNLOCK(); 2359 } 2360 break; 2361 2362 case NETMAP_BDG_LOOKUP_REG: 2363 /* register a lookup function to the given bridge. 2364 * nmr->nr_name may be just bridge's name (including ':' 2365 * if it is not just NM_NAME). 2366 */ 2367 if (!func) { 2368 error = EINVAL; 2369 break; 2370 } 2371 NMG_LOCK(); 2372 b = nm_find_bridge(name, 0 /* don't create */); 2373 if (!b) { 2374 error = EINVAL; 2375 } else { 2376 b->nm_bdg_lookup = func; 2377 } 2378 NMG_UNLOCK(); 2379 break; 2380 2381 default: 2382 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); 2383 error = EINVAL; 2384 break; 2385 } 2386 return error; 2387 } 2388 2389 2390 /* 2391 * ioctl(2) support for the "netmap" device. 2392 * 2393 * Following a list of accepted commands: 2394 * - NIOCGINFO 2395 * - SIOCGIFADDR just for convenience 2396 * - NIOCREGIF 2397 * - NIOCUNREGIF 2398 * - NIOCTXSYNC 2399 * - NIOCRXSYNC 2400 * 2401 * Return 0 on success, errno otherwise. 
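 *
 * A minimal userspace sketch for NIOCGINFO (hedged; 'fd' is an open
 * /dev/netmap descriptor, "em0" just an example name, errors ignored):
 *
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name) - 1);
 *	ioctl(fd, NIOCGINFO, &nmr);	// fills nr_memsize and ring/slot counts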
2402 */ 2403 static int 2404 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 2405 int fflag, struct thread *td) 2406 { 2407 struct netmap_priv_d *priv = NULL; 2408 struct ifnet *ifp = NULL; 2409 struct nmreq *nmr = (struct nmreq *) data; 2410 struct netmap_adapter *na = NULL; 2411 int error; 2412 u_int i, lim; 2413 struct netmap_if *nifp; 2414 struct netmap_kring *krings; 2415 2416 (void)dev; /* UNUSED */ 2417 (void)fflag; /* UNUSED */ 2418 #ifdef linux 2419 #define devfs_get_cdevpriv(pp) \ 2420 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 2421 (*pp ? 0 : ENOENT); }) 2422 2423 /* devfs_set_cdevpriv cannot fail on linux */ 2424 #define devfs_set_cdevpriv(p, fn) \ 2425 ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) 2426 2427 2428 #define devfs_clear_cdevpriv() do { \ 2429 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 2430 } while (0) 2431 #endif /* linux */ 2432 2433 CURVNET_SET(TD_TO_VNET(td)); 2434 2435 error = devfs_get_cdevpriv((void **)&priv); 2436 if (error) { 2437 CURVNET_RESTORE(); 2438 /* XXX ENOENT should be impossible, since the priv 2439 * is now created in the open */ 2440 return (error == ENOENT ? ENXIO : error); 2441 } 2442 2443 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 2444 switch (cmd) { 2445 case NIOCGINFO: /* return capabilities etc */ 2446 if (nmr->nr_version != NETMAP_API) { 2447 #ifdef TEST_STUFF 2448 /* some test code for locks etc */ 2449 if (nmr->nr_version == 666) { 2450 error = nm_test(nmr); 2451 break; 2452 } 2453 #endif /* TEST_STUFF */ 2454 D("API mismatch got %d have %d", 2455 nmr->nr_version, NETMAP_API); 2456 nmr->nr_version = NETMAP_API; 2457 error = EINVAL; 2458 break; 2459 } 2460 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 2461 error = netmap_bdg_ctl(nmr, NULL); 2462 break; 2463 } 2464 2465 NMG_LOCK(); 2466 do { 2467 /* memsize is always valid */ 2468 struct netmap_mem_d *nmd = &nm_mem; 2469 u_int memflags; 2470 2471 if (nmr->nr_name[0] != '\0') { 2472 /* get a refcount */ 2473 error = get_ifp(nmr, &ifp, 1 /* create */); 2474 if (error) 2475 break; 2476 na = NA(ifp); /* retrieve the netmap adapter */ 2477 nmd = na->nm_mem; /* and its memory allocator */ 2478 } 2479 2480 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 2481 if (error) 2482 break; 2483 if (na == NULL) /* only memory info */ 2484 break; 2485 nmr->nr_offset = 0; 2486 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 2487 netmap_update_config(na); 2488 nmr->nr_rx_rings = na->num_rx_rings; 2489 nmr->nr_tx_rings = na->num_tx_rings; 2490 nmr->nr_rx_slots = na->num_rx_desc; 2491 nmr->nr_tx_slots = na->num_tx_desc; 2492 if (memflags & NETMAP_MEM_PRIVATE) 2493 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2494 } while (0); 2495 if (ifp) 2496 nm_if_rele(ifp); /* return the refcount */ 2497 NMG_UNLOCK(); 2498 break; 2499 2500 case NIOCREGIF: 2501 if (nmr->nr_version != NETMAP_API) { 2502 nmr->nr_version = NETMAP_API; 2503 error = EINVAL; 2504 break; 2505 } 2506 /* possibly attach/detach NIC and VALE switch */ 2507 i = nmr->nr_cmd; 2508 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { 2509 error = netmap_bdg_ctl(nmr, NULL); 2510 break; 2511 } else if (i != 0) { 2512 D("nr_cmd must be 0 not %d", i); 2513 error = EINVAL; 2514 break; 2515 } 2516 2517 /* protect access to priv from concurrent NIOCREGIF */ 2518 NMG_LOCK(); 2519 do { 2520 u_int memflags; 2521 2522 if (priv->np_ifp != NULL) { /* thread already registered */ 2523 error = netmap_set_ringid(priv, nmr->nr_ringid); 2524 break; 2525 } 2526 /* find the interface and 
a reference */ 2527 error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ 2528 if (error) 2529 break; 2530 if (NETMAP_OWNED_BY_KERN(ifp)) { 2531 nm_if_rele(ifp); 2532 error = EBUSY; 2533 break; 2534 } 2535 nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); 2536 if (!nifp) { /* reg. failed, release priv and ref */ 2537 nm_if_rele(ifp); /* return the refcount */ 2538 priv->np_ifp = NULL; 2539 priv->np_nifp = NULL; 2540 break; 2541 } 2542 2543 /* return the offset of the netmap_if object */ 2544 na = NA(ifp); /* retrieve netmap adapter */ 2545 nmr->nr_rx_rings = na->num_rx_rings; 2546 nmr->nr_tx_rings = na->num_tx_rings; 2547 nmr->nr_rx_slots = na->num_rx_desc; 2548 nmr->nr_tx_slots = na->num_tx_desc; 2549 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 2550 if (error) { 2551 nm_if_rele(ifp); 2552 break; 2553 } 2554 if (memflags & NETMAP_MEM_PRIVATE) { 2555 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2556 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 2557 } 2558 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 2559 } while (0); 2560 NMG_UNLOCK(); 2561 break; 2562 2563 case NIOCUNREGIF: 2564 // XXX we have no data here ? 2565 D("deprecated, data is %p", nmr); 2566 error = EINVAL; 2567 break; 2568 2569 case NIOCTXSYNC: 2570 case NIOCRXSYNC: 2571 nifp = priv->np_nifp; 2572 2573 if (nifp == NULL) { 2574 error = ENXIO; 2575 break; 2576 } 2577 rmb(); /* make sure following reads are not from cache */ 2578 2579 ifp = priv->np_ifp; /* we have a reference */ 2580 2581 if (ifp == NULL) { 2582 D("Internal error: nifp != NULL && ifp == NULL"); 2583 error = ENXIO; 2584 break; 2585 } 2586 2587 na = NA(ifp); /* retrieve netmap adapter */ 2588 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 2589 if (cmd == NIOCTXSYNC) 2590 netmap_txsync_to_host(na); 2591 else 2592 netmap_rxsync_from_host(na, NULL, NULL); 2593 break; 2594 } 2595 /* find the last ring to scan */ 2596 lim = priv->np_qlast; 2597 if (lim == NETMAP_HW_RING) 2598 lim = (cmd == NIOCTXSYNC) ? 2599 na->num_tx_rings : na->num_rx_rings; 2600 2601 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 2602 for (i = priv->np_qfirst; i < lim; i++) { 2603 struct netmap_kring *kring = krings + i; 2604 if (nm_kr_tryget(kring)) { 2605 error = EBUSY; 2606 goto out; 2607 } 2608 if (cmd == NIOCTXSYNC) { 2609 if (netmap_verbose & NM_VERB_TXSYNC) 2610 D("pre txsync ring %d cur %d hwcur %d", 2611 i, kring->ring->cur, 2612 kring->nr_hwcur); 2613 na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); 2614 if (netmap_verbose & NM_VERB_TXSYNC) 2615 D("post txsync ring %d cur %d hwcur %d", 2616 i, kring->ring->cur, 2617 kring->nr_hwcur); 2618 } else { 2619 na->nm_rxsync(ifp, i, NAF_FORCE_READ); 2620 microtime(&na->rx_rings[i].ring->ts); 2621 } 2622 nm_kr_put(kring); 2623 } 2624 2625 break; 2626 2627 #ifdef __FreeBSD__ 2628 case BIOCIMMEDIATE: 2629 case BIOCGHDRCMPLT: 2630 case BIOCSHDRCMPLT: 2631 case BIOCSSEESENT: 2632 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 2633 break; 2634 2635 default: /* allow device-specific ioctls */ 2636 { 2637 struct socket so; 2638 2639 bzero(&so, sizeof(so)); 2640 NMG_LOCK(); 2641 error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ 2642 if (error) { 2643 NMG_UNLOCK(); 2644 break; 2645 } 2646 so.so_vnet = ifp->if_vnet; 2647 // so->so_proto not null. 
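			// Forward device-specific ioctls through ifioctl(),
			// using the zeroed stack socket built above (only
			// so_vnet is filled in).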
2648 error = ifioctl(&so, cmd, data, td); 2649 nm_if_rele(ifp); 2650 NMG_UNLOCK(); 2651 break; 2652 } 2653 2654 #else /* linux */ 2655 default: 2656 error = EOPNOTSUPP; 2657 #endif /* linux */ 2658 } 2659 out: 2660 2661 CURVNET_RESTORE(); 2662 return (error); 2663 } 2664 2665 2666 /* 2667 * select(2) and poll(2) handlers for the "netmap" device. 2668 * 2669 * Can be called for one or more queues. 2670 * Return true the event mask corresponding to ready events. 2671 * If there are no ready events, do a selrecord on either individual 2672 * selinfo or on the global one. 2673 * Device-dependent parts (locking and sync of tx/rx rings) 2674 * are done through callbacks. 2675 * 2676 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 2677 * The first one is remapped to pwait as selrecord() uses the name as an 2678 * hidden argument. 2679 */ 2680 static int 2681 netmap_poll(struct cdev *dev, int events, struct thread *td) 2682 { 2683 struct netmap_priv_d *priv = NULL; 2684 struct netmap_adapter *na; 2685 struct ifnet *ifp; 2686 struct netmap_kring *kring; 2687 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 2688 u_int lim_tx, lim_rx, host_forwarded = 0; 2689 struct mbq q = { NULL, NULL, 0 }; 2690 void *pwait = dev; /* linux compatibility */ 2691 2692 int retry_tx = 1; 2693 2694 (void)pwait; 2695 2696 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 2697 return POLLERR; 2698 2699 if (priv->np_nifp == NULL) { 2700 D("No if registered"); 2701 return POLLERR; 2702 } 2703 rmb(); /* make sure following reads are not from cache */ 2704 2705 ifp = priv->np_ifp; 2706 // XXX check for deleting() ? 2707 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 2708 return POLLERR; 2709 2710 if (netmap_verbose & 0x8000) 2711 D("device %s events 0x%x", ifp->if_xname, events); 2712 want_tx = events & (POLLOUT | POLLWRNORM); 2713 want_rx = events & (POLLIN | POLLRDNORM); 2714 2715 na = NA(ifp); /* retrieve netmap adapter */ 2716 2717 lim_tx = na->num_tx_rings; 2718 lim_rx = na->num_rx_rings; 2719 2720 if (priv->np_qfirst == NETMAP_SW_RING) { 2721 /* handle the host stack ring */ 2722 if (priv->np_txpoll || want_tx) { 2723 /* push any packets up, then we are always ready */ 2724 netmap_txsync_to_host(na); 2725 revents |= want_tx; 2726 } 2727 if (want_rx) { 2728 kring = &na->rx_rings[lim_rx]; 2729 if (kring->ring->avail == 0) 2730 netmap_rxsync_from_host(na, td, dev); 2731 if (kring->ring->avail > 0) { 2732 revents |= want_rx; 2733 } 2734 } 2735 return (revents); 2736 } 2737 2738 /* if we are in transparent mode, check also the host rx ring */ 2739 kring = &na->rx_rings[lim_rx]; 2740 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2741 && want_rx 2742 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 2743 if (kring->ring->avail == 0) 2744 netmap_rxsync_from_host(na, td, dev); 2745 if (kring->ring->avail > 0) 2746 revents |= want_rx; 2747 } 2748 2749 /* 2750 * check_all is set if the card has more than one queue AND 2751 * the client is polling all of them. If true, we sleep on 2752 * the "global" selinfo, otherwise we sleep on individual selinfo 2753 * (FreeBSD only allows two selinfo's per file descriptor). 2754 * The interrupt routine in the driver wake one or the other 2755 * (or both) depending on which clients are active. 2756 * 2757 * rxsync() is only called if we run out of buffers on a POLLIN. 2758 * txsync() is called if we run out of buffers on POLLOUT, or 2759 * there are pending packets to send. 
The latter can be disabled 2760 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 2761 */ 2762 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 2763 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 2764 2765 if (priv->np_qlast != NETMAP_HW_RING) { 2766 lim_tx = lim_rx = priv->np_qlast; 2767 } 2768 2769 /* 2770 * We start with a lock free round which is good if we have 2771 * data available. If this fails, then lock and call the sync 2772 * routines. 2773 */ 2774 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 2775 kring = &na->rx_rings[i]; 2776 if (kring->ring->avail > 0) { 2777 revents |= want_rx; 2778 want_rx = 0; /* also breaks the loop */ 2779 } 2780 } 2781 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 2782 kring = &na->tx_rings[i]; 2783 if (kring->ring->avail > 0) { 2784 revents |= want_tx; 2785 want_tx = 0; /* also breaks the loop */ 2786 } 2787 } 2788 2789 /* 2790 * If we to push packets out (priv->np_txpoll) or want_tx is 2791 * still set, we do need to run the txsync calls (on all rings, 2792 * to avoid that the tx rings stall). 2793 */ 2794 if (priv->np_txpoll || want_tx) { 2795 /* If we really want to be woken up (want_tx), 2796 * do a selrecord, either on the global or on 2797 * the private structure. Then issue the txsync 2798 * so there is no race in the selrecord/selwait 2799 */ 2800 flush_tx: 2801 for (i = priv->np_qfirst; i < lim_tx; i++) { 2802 kring = &na->tx_rings[i]; 2803 /* 2804 * Skip this ring if want_tx == 0 2805 * (we have already done a successful sync on 2806 * a previous ring) AND kring->cur == kring->hwcur 2807 * (there are no pending transmissions for this ring). 2808 */ 2809 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 2810 continue; 2811 /* make sure only one user thread is doing this */ 2812 if (nm_kr_tryget(kring)) { 2813 ND("ring %p busy is %d", kring, (int)kring->nr_busy); 2814 revents |= POLLERR; 2815 goto out; 2816 } 2817 2818 if (netmap_verbose & NM_VERB_TXSYNC) 2819 D("send %d on %s %d", 2820 kring->ring->cur, ifp->if_xname, i); 2821 if (na->nm_txsync(ifp, i, 0)) 2822 revents |= POLLERR; 2823 2824 /* Check avail/call selrecord only if called with POLLOUT */ 2825 if (want_tx) { 2826 if (kring->ring->avail > 0) { 2827 /* stop at the first ring. We don't risk 2828 * starvation. 2829 */ 2830 revents |= want_tx; 2831 want_tx = 0; 2832 } 2833 } 2834 nm_kr_put(kring); 2835 } 2836 if (want_tx && retry_tx) { 2837 selrecord(td, check_all_tx ? 2838 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 2839 retry_tx = 0; 2840 goto flush_tx; 2841 } 2842 } 2843 2844 /* 2845 * now if want_rx is still set we need to lock and rxsync. 2846 * Do it on all rings because otherwise we starve. 
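 * The structure mirrors the tx path above (a sketch of what follows):
 *
 *	for each rx ring: trylock, rxsync(), note if avail > 0, unlock;
 *	if nothing became available: selrecord() and retry once.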
2847 */ 2848 if (want_rx) { 2849 int retry_rx = 1; 2850 do_retry_rx: 2851 for (i = priv->np_qfirst; i < lim_rx; i++) { 2852 kring = &na->rx_rings[i]; 2853 2854 if (nm_kr_tryget(kring)) { 2855 revents |= POLLERR; 2856 goto out; 2857 } 2858 2859 /* XXX NR_FORWARD should only be read on 2860 * physical or NIC ports 2861 */ 2862 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2863 ND(10, "forwarding some buffers up %d to %d", 2864 kring->nr_hwcur, kring->ring->cur); 2865 netmap_grab_packets(kring, &q, netmap_fwd); 2866 } 2867 2868 if (na->nm_rxsync(ifp, i, 0)) 2869 revents |= POLLERR; 2870 if (netmap_no_timestamp == 0 || 2871 kring->ring->flags & NR_TIMESTAMP) { 2872 microtime(&kring->ring->ts); 2873 } 2874 2875 if (kring->ring->avail > 0) { 2876 revents |= want_rx; 2877 retry_rx = 0; 2878 } 2879 nm_kr_put(kring); 2880 } 2881 if (retry_rx) { 2882 retry_rx = 0; 2883 selrecord(td, check_all_rx ? 2884 &na->rx_si : &na->rx_rings[priv->np_qfirst].si); 2885 goto do_retry_rx; 2886 } 2887 } 2888 2889 /* forward host to the netmap ring. 2890 * I am accessing nr_hwavail without lock, but netmap_transmit 2891 * can only increment it, so the operation is safe. 2892 */ 2893 kring = &na->rx_rings[lim_rx]; 2894 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2895 && (netmap_fwd || kring->ring->flags & NR_FORWARD) 2896 && kring->nr_hwavail > 0 && !host_forwarded) { 2897 netmap_sw_to_nic(na); 2898 host_forwarded = 1; /* prevent another pass */ 2899 want_rx = 0; 2900 goto flush_tx; 2901 } 2902 2903 if (q.head) 2904 netmap_send_up(na->ifp, q.head); 2905 2906 out: 2907 2908 return (revents); 2909 } 2910 2911 /*------- driver support routines ------*/ 2912 2913 2914 /* 2915 * Initialize a ``netmap_adapter`` object created by driver on attach. 2916 * We allocate a block of memory with room for a struct netmap_adapter 2917 * plus two sets of N+2 struct netmap_kring (where N is the number 2918 * of hardware rings): 2919 * krings 0..N-1 are for the hardware queues. 2920 * kring N is for the host stack queue 2921 * kring N+1 is only used for the selinfo for all queues. 2922 * Return 0 on success, ENOMEM otherwise. 2923 * 2924 * By default the receive and transmit adapter ring counts are both initialized 2925 * to num_queues. na->num_tx_rings can be set for cards with different tx/rx 2926 * setups. 2927 */ 2928 int 2929 netmap_attach(struct netmap_adapter *arg, u_int num_queues) 2930 { 2931 struct netmap_adapter *na = NULL; 2932 struct ifnet *ifp = arg ? arg->ifp : NULL; 2933 size_t len; 2934 2935 if (arg == NULL || ifp == NULL) 2936 goto fail; 2937 /* a VALE port uses two endpoints */ 2938 len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2; 2939 na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); 2940 if (na == NULL) 2941 goto fail; 2942 WNA(ifp) = na; 2943 *na = *arg; /* copy everything, trust the driver to not pass junk */ 2944 NETMAP_SET_CAPABLE(ifp); 2945 if (na->num_tx_rings == 0) 2946 na->num_tx_rings = num_queues; 2947 na->num_rx_rings = num_queues; 2948 na->refcount = na->na_single = na->na_multi = 0; 2949 /* Core lock initialized here, others after netmap_if_new. 
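 * The per-ring q_lock mutexes are created later, in netmap_do_regif().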
*/ 2950 mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); 2951 #ifdef linux 2952 if (ifp->netdev_ops) { 2953 ND("netdev_ops %p", ifp->netdev_ops); 2954 /* prepare a clone of the netdev ops */ 2955 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2956 na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2957 #else 2958 na->nm_ndo = *ifp->netdev_ops; 2959 #endif 2960 } 2961 na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2962 #endif /* linux */ 2963 na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem; 2964 if (!nma_is_vp(arg)) 2965 netmap_attach_sw(ifp); 2966 D("success for %s", ifp->if_xname); 2967 return 0; 2968 2969 fail: 2970 D("fail, arg %p ifp %p na %p", arg, ifp, na); 2971 netmap_detach(ifp); 2972 return (na ? EINVAL : ENOMEM); 2973 } 2974 2975 2976 /* 2977 * Free the allocated memory linked to the given ``netmap_adapter`` 2978 * object. 2979 */ 2980 void 2981 netmap_detach(struct ifnet *ifp) 2982 { 2983 struct netmap_adapter *na = NA(ifp); 2984 2985 if (!na) 2986 return; 2987 2988 mtx_destroy(&na->core_lock); 2989 2990 if (na->tx_rings) { /* XXX should not happen */ 2991 D("freeing leftover tx_rings"); 2992 free(na->tx_rings, M_DEVBUF); 2993 } 2994 if (na->na_flags & NAF_MEM_OWNER) 2995 netmap_mem_private_delete(na->nm_mem); 2996 bzero(na, sizeof(*na)); 2997 WNA(ifp) = NULL; 2998 free(na, M_DEVBUF); 2999 } 3000 3001 3002 int 3003 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, 3004 struct netmap_adapter *na, u_int ring_nr); 3005 3006 3007 /* 3008 * Intercept packets from the network stack and pass them 3009 * to netmap as incoming packets on the 'software' ring. 3010 * We rely on the OS to make sure that the ifp and na do not go 3011 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 3012 * In nm_register() or whenever there is a reinitialization, 3013 * we make sure to access the core lock and per-ring locks 3014 * so that IFCAP_NETMAP is visible here. 3015 */ 3016 int 3017 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 3018 { 3019 struct netmap_adapter *na = NA(ifp); 3020 struct netmap_kring *kring; 3021 u_int i, len = MBUF_LEN(m); 3022 u_int error = EBUSY, lim; 3023 struct netmap_slot *slot; 3024 3025 // XXX [Linux] we do not need this lock 3026 // if we follow the down/configure/up protocol -gl 3027 // mtx_lock(&na->core_lock); 3028 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { 3029 /* interface not in netmap mode anymore */ 3030 error = ENXIO; 3031 goto done; 3032 } 3033 3034 kring = &na->rx_rings[na->num_rx_rings]; 3035 lim = kring->nkr_num_slots - 1; 3036 if (netmap_verbose & NM_VERB_HOST) 3037 D("%s packet %d len %d from the stack", ifp->if_xname, 3038 kring->nr_hwcur + kring->nr_hwavail, len); 3039 // XXX reconsider long packets if we handle fragments 3040 if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ 3041 D("%s from_host, drop packet size %d > %d", ifp->if_xname, 3042 len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); 3043 goto done; 3044 } 3045 if (SWNA(ifp)->na_bdg) { 3046 struct nm_bdg_fwd *ft; 3047 char *dst; 3048 3049 na = SWNA(ifp); /* we operate on the host port */ 3050 ft = na->rx_rings[0].nkr_ft; 3051 dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); 3052 3053 /* use slot 0 in the ft, there is nothing queued here */ 3054 /* XXX we can save the copy calling m_copydata in nm_bdg_flush, 3055 * need a special flag for this. 
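 * For now the frame is copied twice: m_copydata() below fills this
 * staging buffer, and nm_bdg_flush() copies it again into the
 * destination ring(s).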
3056 */ 3057 m_copydata(m, 0, (int)len, dst); 3058 ft->ft_flags = 0; 3059 ft->ft_len = len; 3060 ft->ft_buf = dst; 3061 ft->ft_next = NM_FT_NULL; 3062 ft->ft_frags = 1; 3063 if (netmap_verbose & NM_VERB_HOST) 3064 RD(5, "pkt %p size %d to bridge port %d", 3065 dst, len, na->bdg_port); 3066 nm_bdg_flush(ft, 1, na, 0); 3067 na = NA(ifp); /* back to the regular object/lock */ 3068 error = 0; 3069 goto done; 3070 } 3071 3072 /* protect against other instances of netmap_transmit, 3073 * and userspace invocations of rxsync(). 3074 * XXX could reuse core_lock 3075 */ 3076 // XXX [Linux] there can be no other instances of netmap_transmit 3077 // on this same ring, but we still need this lock to protect 3078 // concurrent access from netmap_sw_to_nic() -gl 3079 mtx_lock(&kring->q_lock); 3080 if (kring->nr_hwavail >= lim) { 3081 if (netmap_verbose) 3082 D("stack ring %s full\n", ifp->if_xname); 3083 } else { 3084 /* compute the insert position */ 3085 i = nm_kr_rxpos(kring); 3086 slot = &kring->ring->slot[i]; 3087 m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); 3088 slot->len = len; 3089 slot->flags = kring->nkr_slot_flags; 3090 kring->nr_hwavail++; 3091 if (netmap_verbose & NM_VERB_HOST) 3092 D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); 3093 selwakeuppri(&kring->si, PI_NET); 3094 error = 0; 3095 } 3096 mtx_unlock(&kring->q_lock); 3097 3098 done: 3099 // mtx_unlock(&na->core_lock); 3100 3101 /* release the mbuf in either cases of success or failure. As an 3102 * alternative, put the mbuf in a free list and free the list 3103 * only when really necessary. 3104 */ 3105 m_freem(m); 3106 3107 return (error); 3108 } 3109 3110 3111 /* 3112 * netmap_reset() is called by the driver routines when reinitializing 3113 * a ring. The driver is in charge of locking to protect the kring. 3114 * If netmap mode is not set just return NULL. 3115 */ 3116 struct netmap_slot * 3117 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 3118 u_int new_cur) 3119 { 3120 struct netmap_kring *kring; 3121 int new_hwofs, lim; 3122 3123 if (na == NULL) { 3124 D("NULL na, should not happen"); 3125 return NULL; /* no netmap support here */ 3126 } 3127 if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { 3128 D("interface not in netmap mode"); 3129 return NULL; /* nothing to reinitialize */ 3130 } 3131 3132 /* XXX note- in the new scheme, we are not guaranteed to be 3133 * under lock (e.g. when called on a device reset). 3134 * In this case, we should set a flag and do not trust too 3135 * much the values. In practice: TODO 3136 * - set a RESET flag somewhere in the kring 3137 * - do the processing in a conservative way 3138 * - let the *sync() fixup at the end. 3139 */ 3140 if (tx == NR_TX) { 3141 if (n >= na->num_tx_rings) 3142 return NULL; 3143 kring = na->tx_rings + n; 3144 new_hwofs = kring->nr_hwcur - new_cur; 3145 } else { 3146 if (n >= na->num_rx_rings) 3147 return NULL; 3148 kring = na->rx_rings + n; 3149 new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; 3150 } 3151 lim = kring->nkr_num_slots - 1; 3152 if (new_hwofs > lim) 3153 new_hwofs -= lim + 1; 3154 3155 /* Always set the new offset value and realign the ring. */ 3156 D("%s hwofs %d -> %d, hwavail %d -> %d", 3157 tx == NR_TX ? "TX" : "RX", 3158 kring->nkr_hwofs, new_hwofs, 3159 kring->nr_hwavail, 3160 tx == NR_TX ? 
lim : kring->nr_hwavail); 3161 kring->nkr_hwofs = new_hwofs; 3162 if (tx == NR_TX) 3163 kring->nr_hwavail = lim; 3164 3165 #if 0 // def linux 3166 /* XXX check that the mappings are correct */ 3167 /* need ring_nr, adapter->pdev, direction */ 3168 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 3169 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 3170 D("error mapping rx netmap buffer %d", i); 3171 // XXX fix error handling 3172 } 3173 3174 #endif /* linux */ 3175 /* 3176 * Wakeup on the individual and global selwait 3177 * We do the wakeup here, but the ring is not yet reconfigured. 3178 * However, we are under lock so there are no races. 3179 */ 3180 selwakeuppri(&kring->si, PI_NET); 3181 selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET); 3182 return kring->ring->slot; 3183 } 3184 3185 3186 /* 3187 * Grab packets from a kring, move them into the ft structure 3188 * associated to the tx (input) port. Max one instance per port, 3189 * filtered on input (ioctl, poll or XXX). 3190 * Returns the next position in the ring. 3191 */ 3192 static int 3193 nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, 3194 struct netmap_kring *kring, u_int end) 3195 { 3196 struct netmap_ring *ring = kring->ring; 3197 struct nm_bdg_fwd *ft; 3198 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 3199 u_int ft_i = 0; /* start from 0 */ 3200 u_int frags = 1; /* how many frags ? */ 3201 struct nm_bridge *b = na->na_bdg; 3202 3203 /* To protect against modifications to the bridge we acquire a 3204 * shared lock, waiting if we can sleep (if the source port is 3205 * attached to a user process) or with a trylock otherwise (NICs). 3206 */ 3207 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3208 if (na->na_flags & NAF_BDG_MAYSLEEP) 3209 BDG_RLOCK(b); 3210 else if (!BDG_RTRYLOCK(b)) 3211 return 0; 3212 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3213 ft = kring->nkr_ft; 3214 3215 for (; likely(j != end); j = nm_next(j, lim)) { 3216 struct netmap_slot *slot = &ring->slot[j]; 3217 char *buf; 3218 3219 ft[ft_i].ft_len = slot->len; 3220 ft[ft_i].ft_flags = slot->flags; 3221 3222 ND("flags is 0x%x", slot->flags); 3223 /* this slot goes into a list so initialize the link field */ 3224 ft[ft_i].ft_next = NM_FT_NULL; 3225 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 3226 (void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot); 3227 prefetch(buf); 3228 ++ft_i; 3229 if (slot->flags & NS_MOREFRAG) { 3230 frags++; 3231 continue; 3232 } 3233 if (unlikely(netmap_verbose && frags > 1)) 3234 RD(5, "%d frags at %d", frags, ft_i - frags); 3235 ft[ft_i - frags].ft_frags = frags; 3236 frags = 1; 3237 if (unlikely((int)ft_i >= bridge_batch)) 3238 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3239 } 3240 if (frags > 1) { 3241 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); 3242 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG 3243 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; 3244 ft[ft_i - frags].ft_frags = frags - 1; 3245 } 3246 if (ft_i) 3247 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3248 BDG_RUNLOCK(b); 3249 return j; 3250 } 3251 3252 3253 /* 3254 * Pass packets from nic to the bridge. 3255 * XXX TODO check locking: this is called from the interrupt 3256 * handler so we should make sure that the interface is not 3257 * disconnected while passing down an interrupt. 3258 * 3259 * Note, no user process can access this NIC so we can ignore 3260 * the info in the 'ring'. 
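 *
 * In short, the body below does (a sketch):
 *
 *	nm_rxsync();				// fetch arrived packets
 *	j = nm_bdg_preflush(...);		// push them into the switch
 *	ring->cur = j; ring->avail = 0;		// mark everything consumed
 *	nm_rxsync();				// give the slots back to the NIC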
3261 */ 3262 static void 3263 netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) 3264 { 3265 struct netmap_adapter *na = NA(ifp); 3266 struct netmap_kring *kring = &na->rx_rings[ring_nr]; 3267 struct netmap_ring *ring = kring->ring; 3268 u_int j, k; 3269 3270 /* make sure that only one thread is ever in here, 3271 * after which we can unlock. Probably unnecessary XXX. 3272 */ 3273 if (nm_kr_tryget(kring)) 3274 return; 3275 /* fetch packets that have arrived. 3276 * XXX maybe do this in a loop ? 3277 */ 3278 if (na->nm_rxsync(ifp, ring_nr, 0)) 3279 goto put_out; 3280 if (kring->nr_hwavail == 0 && netmap_verbose) { 3281 D("how strange, interrupt with no packets on %s", 3282 ifp->if_xname); 3283 goto put_out; 3284 } 3285 k = nm_kr_rxpos(kring); 3286 3287 j = nm_bdg_preflush(na, ring_nr, kring, k); 3288 3289 /* we consume everything, but we cannot update kring directly 3290 * because the nic may have destroyed the info in the NIC ring. 3291 * So we need to call rxsync again to restore it. 3292 */ 3293 ring->cur = j; 3294 ring->avail = 0; 3295 na->nm_rxsync(ifp, ring_nr, 0); 3296 3297 put_out: 3298 nm_kr_put(kring); 3299 return; 3300 } 3301 3302 3303 /* 3304 * Default functions to handle rx/tx interrupts from a physical device. 3305 * "work_done" is non-null on the RX path, NULL for the TX path. 3306 * We rely on the OS to make sure that there is only one active 3307 * instance per queue, and that there is appropriate locking. 3308 * 3309 * If the card is not in netmap mode, simply return 0, 3310 * so that the caller proceeds with regular processing. 3311 * 3312 * If the card is connected to a netmap file descriptor, 3313 * do a selwakeup on the individual queue, plus one on the global one 3314 * if needed (multiqueue card _and_ there are multiqueue listeners), 3315 * and return 1. 3316 * 3317 * Finally, if called on rx from an interface connected to a switch, 3318 * calls the proper forwarding routine, and return 1. 3319 */ 3320 int 3321 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3322 { 3323 struct netmap_adapter *na; 3324 struct netmap_kring *kring; 3325 3326 if (!(ifp->if_capenable & IFCAP_NETMAP)) 3327 return 0; 3328 3329 q &= NETMAP_RING_MASK; 3330 3331 if (netmap_verbose) 3332 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 3333 na = NA(ifp); 3334 if (na->na_flags & NAF_SKIP_INTR) { 3335 ND("use regular interrupt"); 3336 return 0; 3337 } 3338 3339 if (work_done) { /* RX path */ 3340 if (q >= na->num_rx_rings) 3341 return 0; // not a physical queue 3342 kring = na->rx_rings + q; 3343 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 3344 if (na->na_bdg != NULL) { 3345 netmap_nic_to_bdg(ifp, q); 3346 } else { 3347 selwakeuppri(&kring->si, PI_NET); 3348 if (na->num_rx_rings > 1 /* or multiple listeners */ ) 3349 selwakeuppri(&na->rx_si, PI_NET); 3350 } 3351 *work_done = 1; /* do not fire napi again */ 3352 } else { /* TX path */ 3353 if (q >= na->num_tx_rings) 3354 return 0; // not a physical queue 3355 kring = na->tx_rings + q; 3356 selwakeuppri(&kring->si, PI_NET); 3357 if (na->num_tx_rings > 1 /* or multiple listeners */ ) 3358 selwakeuppri(&na->tx_si, PI_NET); 3359 } 3360 return 1; 3361 } 3362 3363 3364 #ifdef linux /* linux-specific routines */ 3365 3366 3367 /* 3368 * Remap linux arguments into the FreeBSD call. 3369 * - pwait is the poll table, passed as 'dev'; 3370 * If pwait == NULL someone else already woke up before. We can report 3371 * events but they are filtered upstream. 3372 * If pwait != NULL, then pwait->key contains the list of events. 
3373 * - events is computed from pwait as above. 3374 * - file is passed as 'td'; 3375 */ 3376 static u_int 3377 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) 3378 { 3379 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) 3380 int events = POLLIN | POLLOUT; /* XXX maybe... */ 3381 #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) 3382 int events = pwait ? pwait->key : POLLIN | POLLOUT; 3383 #else /* in 3.4.0 field 'key' was renamed to '_key' */ 3384 int events = pwait ? pwait->_key : POLLIN | POLLOUT; 3385 #endif 3386 return netmap_poll((void *)pwait, events, (void *)file); 3387 } 3388 3389 3390 static int 3391 linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) 3392 { 3393 int error = 0; 3394 unsigned long off, va; 3395 vm_ooffset_t pa; 3396 struct netmap_priv_d *priv = f->private_data; 3397 /* 3398 * vma->vm_start: start of mapping user address space 3399 * vma->vm_end: end of the mapping user address space 3400 * vma->vm_pfoff: offset of first page in the device 3401 */ 3402 3403 // XXX security checks 3404 3405 error = netmap_get_memory(priv); 3406 ND("get_memory returned %d", error); 3407 if (error) 3408 return -error; 3409 3410 if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { 3411 ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); 3412 return -EINVAL; 3413 } 3414 3415 for (va = vma->vm_start, off = vma->vm_pgoff; 3416 va < vma->vm_end; 3417 va += PAGE_SIZE, off++) 3418 { 3419 pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); 3420 if (pa == 0) 3421 return -EINVAL; 3422 3423 ND("va %lx pa %p", va, pa); 3424 error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); 3425 if (error) 3426 return error; 3427 } 3428 return 0; 3429 } 3430 3431 3432 /* 3433 * This one is probably already protected by the netif lock XXX 3434 */ 3435 static netdev_tx_t 3436 linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) 3437 { 3438 netmap_transmit(dev, skb); 3439 return (NETDEV_TX_OK); 3440 } 3441 3442 3443 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 3444 #define LIN_IOCTL_NAME .ioctl 3445 int 3446 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) 3447 #else 3448 #define LIN_IOCTL_NAME .unlocked_ioctl 3449 long 3450 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) 3451 #endif 3452 { 3453 int ret; 3454 struct nmreq nmr; 3455 bzero(&nmr, sizeof(nmr)); 3456 3457 if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { 3458 data = 0; /* no argument required here */ 3459 } 3460 if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) 3461 return -EFAULT; 3462 ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); 3463 if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) 3464 return -EFAULT; 3465 return -ret; 3466 } 3467 3468 3469 static int 3470 netmap_release(struct inode *inode, struct file *file) 3471 { 3472 (void)inode; /* UNUSED */ 3473 if (file->private_data) 3474 netmap_dtor(file->private_data); 3475 return (0); 3476 } 3477 3478 3479 static int 3480 linux_netmap_open(struct inode *inode, struct file *file) 3481 { 3482 struct netmap_priv_d *priv; 3483 (void)inode; /* UNUSED */ 3484 3485 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 3486 M_NOWAIT | M_ZERO); 3487 if (priv == NULL) 3488 return -ENOMEM; 3489 3490 file->private_data = priv; 3491 3492 return (0); 3493 } 3494 3495 3496 static struct file_operations netmap_fops = { 3497 .owner = THIS_MODULE, 3498 .open = linux_netmap_open, 3499 
.mmap = linux_netmap_mmap, 3500 LIN_IOCTL_NAME = linux_netmap_ioctl, 3501 .poll = linux_netmap_poll, 3502 .release = netmap_release, 3503 }; 3504 3505 3506 static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ 3507 MISC_DYNAMIC_MINOR, 3508 "netmap", 3509 &netmap_fops, 3510 }; 3511 3512 static int netmap_init(void); 3513 static void netmap_fini(void); 3514 3515 3516 /* Errors have negative values on linux */ 3517 static int linux_netmap_init(void) 3518 { 3519 return -netmap_init(); 3520 } 3521 3522 module_init(linux_netmap_init); 3523 module_exit(netmap_fini); 3524 /* export certain symbols to other modules */ 3525 EXPORT_SYMBOL(netmap_attach); // driver attach routines 3526 EXPORT_SYMBOL(netmap_detach); // driver detach routines 3527 EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error 3528 EXPORT_SYMBOL(netmap_buffer_lut); 3529 EXPORT_SYMBOL(netmap_total_buffers); // index check 3530 EXPORT_SYMBOL(netmap_buffer_base); 3531 EXPORT_SYMBOL(netmap_reset); // ring init routines 3532 EXPORT_SYMBOL(netmap_buf_size); 3533 EXPORT_SYMBOL(netmap_rx_irq); // default irq handler 3534 EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away 3535 EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine 3536 EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function 3537 EXPORT_SYMBOL(netmap_disable_all_rings); 3538 EXPORT_SYMBOL(netmap_enable_all_rings); 3539 3540 3541 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); 3542 MODULE_DESCRIPTION("The netmap packet I/O framework"); 3543 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ 3544 3545 #else /* __FreeBSD__ */ 3546 3547 3548 static struct cdevsw netmap_cdevsw = { 3549 .d_version = D_VERSION, 3550 .d_name = "netmap", 3551 .d_open = netmap_open, 3552 .d_mmap_single = netmap_mmap_single, 3553 .d_ioctl = netmap_ioctl, 3554 .d_poll = netmap_poll, 3555 .d_close = netmap_close, 3556 }; 3557 #endif /* __FreeBSD__ */ 3558 3559 /* 3560 *---- support for virtual bridge ----- 3561 */ 3562 3563 /* ----- FreeBSD if_bridge hash function ------- */ 3564 3565 /* 3566 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 3567 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 3568 * 3569 * http://www.burtleburtle.net/bob/hash/spooky.html 3570 */ 3571 #define mix(a, b, c) \ 3572 do { \ 3573 a -= b; a -= c; a ^= (c >> 13); \ 3574 b -= c; b -= a; b ^= (a << 8); \ 3575 c -= a; c -= b; c ^= (b >> 13); \ 3576 a -= b; a -= c; a ^= (c >> 12); \ 3577 b -= c; b -= a; b ^= (a << 16); \ 3578 c -= a; c -= b; c ^= (b >> 5); \ 3579 a -= b; a -= c; a ^= (c >> 3); \ 3580 b -= c; b -= a; b ^= (a << 10); \ 3581 c -= a; c -= b; c ^= (b >> 15); \ 3582 } while (/*CONSTCOND*/0) 3583 3584 static __inline uint32_t 3585 nm_bridge_rthash(const uint8_t *addr) 3586 { 3587 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 3588 3589 b += addr[5] << 8; 3590 b += addr[4]; 3591 a += addr[3] << 24; 3592 a += addr[2] << 16; 3593 a += addr[1] << 8; 3594 a += addr[0]; 3595 3596 mix(a, b, c); 3597 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 3598 return (c & BRIDGE_RTHASH_MASK); 3599 } 3600 3601 #undef mix 3602 3603 3604 static int 3605 bdg_netmap_reg(struct ifnet *ifp, int onoff) 3606 { 3607 /* the interface is already attached to the bridge, 3608 * so we only need to toggle IFCAP_NETMAP. 3609 */ 3610 if (onoff) { 3611 ifp->if_capenable |= IFCAP_NETMAP; 3612 } else { 3613 ifp->if_capenable &= ~IFCAP_NETMAP; 3614 } 3615 return 0; 3616 } 3617 3618 3619 /* 3620 * Lookup function for a learning bridge. 
3621 * Update the hash table with the source address, 3622 * and then returns the destination port index, and the 3623 * ring in *dst_ring (at the moment, always use ring 0) 3624 */ 3625 u_int 3626 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, 3627 struct netmap_adapter *na) 3628 { 3629 struct nm_hash_ent *ht = na->na_bdg->ht; 3630 uint32_t sh, dh; 3631 u_int dst, mysrc = na->bdg_port; 3632 uint64_t smac, dmac; 3633 3634 if (buf_len < 14) { 3635 D("invalid buf length %d", buf_len); 3636 return NM_BDG_NOPORT; 3637 } 3638 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 3639 smac = le64toh(*(uint64_t *)(buf + 4)); 3640 smac >>= 16; 3641 3642 /* 3643 * The hash is somewhat expensive, there might be some 3644 * worthwhile optimizations here. 3645 */ 3646 if ((buf[6] & 1) == 0) { /* valid src */ 3647 uint8_t *s = buf+6; 3648 sh = nm_bridge_rthash(s); // XXX hash of source 3649 /* update source port forwarding entry */ 3650 ht[sh].mac = smac; /* XXX expire ? */ 3651 ht[sh].ports = mysrc; 3652 if (netmap_verbose) 3653 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 3654 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 3655 } 3656 dst = NM_BDG_BROADCAST; 3657 if ((buf[0] & 1) == 0) { /* unicast */ 3658 dh = nm_bridge_rthash(buf); // XXX hash of dst 3659 if (ht[dh].mac == dmac) { /* found dst */ 3660 dst = ht[dh].ports; 3661 } 3662 /* XXX otherwise return NM_BDG_UNKNOWN ? */ 3663 } 3664 *dst_ring = 0; 3665 return dst; 3666 } 3667 3668 3669 /* 3670 * This flush routine supports only unicast and broadcast but a large 3671 * number of ports, and lets us replace the learn and dispatch functions. 3672 */ 3673 int 3674 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, 3675 u_int ring_nr) 3676 { 3677 struct nm_bdg_q *dst_ents, *brddst; 3678 uint16_t num_dsts = 0, *dsts; 3679 struct nm_bridge *b = na->na_bdg; 3680 u_int i, j, me = na->bdg_port; 3681 3682 /* 3683 * The work area (pointed by ft) is followed by an array of 3684 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 3685 * queues per port plus one for the broadcast traffic. 3686 * Then we have an array of destination indexes. 
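 * Roughly, the layout is (sizes as used right below):
 *
 *	struct nm_bdg_fwd ft[NM_BDG_BATCH_MAX];			// the batch
 *	struct nm_bdg_q dst_ents[NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1];
 *	uint16_t dsts[];					// active dest. indexes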
3687 */ 3688 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 3689 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 3690 3691 /* first pass: find a destination for each packet in the batch */ 3692 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 3693 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 3694 uint16_t dst_port, d_i; 3695 struct nm_bdg_q *d; 3696 3697 ND("slot %d frags %d", i, ft[i].ft_frags); 3698 dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, 3699 &dst_ring, na); 3700 if (netmap_verbose > 255) 3701 RD(5, "slot %d port %d -> %d", i, me, dst_port); 3702 if (dst_port == NM_BDG_NOPORT) 3703 continue; /* this packet is identified to be dropped */ 3704 else if (unlikely(dst_port > NM_BDG_MAXPORTS)) 3705 continue; 3706 else if (dst_port == NM_BDG_BROADCAST) 3707 dst_ring = 0; /* broadcasts always go to ring 0 */ 3708 else if (unlikely(dst_port == me || 3709 !b->bdg_ports[dst_port])) 3710 continue; 3711 3712 /* get a position in the scratch pad */ 3713 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 3714 d = dst_ents + d_i; 3715 3716 /* append the first fragment to the list */ 3717 if (d->bq_head == NM_FT_NULL) { /* new destination */ 3718 d->bq_head = d->bq_tail = i; 3719 /* remember this position to be scanned later */ 3720 if (dst_port != NM_BDG_BROADCAST) 3721 dsts[num_dsts++] = d_i; 3722 } else { 3723 ft[d->bq_tail].ft_next = i; 3724 d->bq_tail = i; 3725 } 3726 d->bq_len += ft[i].ft_frags; 3727 } 3728 3729 /* 3730 * Broadcast traffic goes to ring 0 on all destinations. 3731 * So we need to add these rings to the list of ports to scan. 3732 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 3733 * expensive. We should keep a compact list of active destinations 3734 * so we could shorten this loop. 3735 */ 3736 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 3737 if (brddst->bq_head != NM_FT_NULL) { 3738 for (j = 0; likely(j < b->bdg_active_ports); j++) { 3739 uint16_t d_i; 3740 i = b->bdg_port_index[j]; 3741 if (unlikely(i == me)) 3742 continue; 3743 d_i = i * NM_BDG_MAXRINGS; 3744 if (dst_ents[d_i].bq_head == NM_FT_NULL) 3745 dsts[num_dsts++] = d_i; 3746 } 3747 } 3748 3749 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 3750 /* second pass: scan destinations (XXX will be modular somehow) */ 3751 for (i = 0; i < num_dsts; i++) { 3752 struct ifnet *dst_ifp; 3753 struct netmap_adapter *dst_na; 3754 struct netmap_kring *kring; 3755 struct netmap_ring *ring; 3756 u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; 3757 u_int needed, howmany; 3758 int retry = netmap_txsync_retry; 3759 struct nm_bdg_q *d; 3760 uint32_t my_start = 0, lease_idx = 0; 3761 int nrings; 3762 3763 d_i = dsts[i]; 3764 ND("second pass %d port %d", i, d_i); 3765 d = dst_ents + d_i; 3766 // XXX fix the division 3767 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 3768 /* protect from the lookup function returning an inactive 3769 * destination port 3770 */ 3771 if (unlikely(dst_na == NULL)) 3772 goto cleanup; 3773 if (dst_na->na_flags & NAF_SW_ONLY) 3774 goto cleanup; 3775 dst_ifp = dst_na->ifp; 3776 /* 3777 * The interface may be in !netmap mode in two cases: 3778 * - when na is attached but not activated yet; 3779 * - when na is being deactivated but is still attached. 
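 * In both cases the packets queued for this destination are simply
 * dropped (see the cleanup label below).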
3780 */ 3781 if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { 3782 ND("not in netmap mode!"); 3783 goto cleanup; 3784 } 3785 3786 /* there is at least one either unicast or broadcast packet */ 3787 brd_next = brddst->bq_head; 3788 next = d->bq_head; 3789 /* we need to reserve this many slots. If fewer are 3790 * available, some packets will be dropped. 3791 * Packets may have multiple fragments, so we may not use 3792 * there is a chance that we may not use all of the slots 3793 * we have claimed, so we will need to handle the leftover 3794 * ones when we regain the lock. 3795 */ 3796 needed = d->bq_len + brddst->bq_len; 3797 3798 is_vp = nma_is_vp(dst_na); 3799 ND(5, "pass 2 dst %d is %x %s", 3800 i, d_i, is_vp ? "virtual" : "nic/host"); 3801 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 3802 if (is_vp) { /* virtual port */ 3803 nrings = dst_na->num_rx_rings; 3804 } else { 3805 nrings = dst_na->num_tx_rings; 3806 } 3807 if (dst_nr >= nrings) 3808 dst_nr = dst_nr % nrings; 3809 kring = is_vp ? &dst_na->rx_rings[dst_nr] : 3810 &dst_na->tx_rings[dst_nr]; 3811 ring = kring->ring; 3812 lim = kring->nkr_num_slots - 1; 3813 3814 retry: 3815 3816 /* reserve the buffers in the queue and an entry 3817 * to report completion, and drop lock. 3818 * XXX this might become a helper function. 3819 */ 3820 mtx_lock(&kring->q_lock); 3821 if (kring->nkr_stopped) { 3822 mtx_unlock(&kring->q_lock); 3823 goto cleanup; 3824 } 3825 /* on physical interfaces, do a txsync to recover 3826 * slots for packets already transmitted. 3827 * XXX maybe we could be optimistic and rely on a retry 3828 * in case of failure. 3829 */ 3830 if (nma_is_hw(dst_na)) { 3831 dst_na->nm_txsync(dst_ifp, dst_nr, 0); 3832 } 3833 my_start = j = kring->nkr_hwlease; 3834 howmany = nm_kr_space(kring, is_vp); 3835 if (needed < howmany) 3836 howmany = needed; 3837 lease_idx = nm_kr_lease(kring, howmany, is_vp); 3838 mtx_unlock(&kring->q_lock); 3839 3840 /* only retry if we need more than available slots */ 3841 if (retry && needed <= howmany) 3842 retry = 0; 3843 3844 /* copy to the destination queue */ 3845 while (howmany > 0) { 3846 struct netmap_slot *slot; 3847 struct nm_bdg_fwd *ft_p, *ft_end; 3848 u_int cnt; 3849 3850 /* find the queue from which we pick next packet. 3851 * NM_FT_NULL is always higher than valid indexes 3852 * so we never dereference it if the other list 3853 * has packets (and if both are empty we never 3854 * get here). 
3855 */ 3856 if (next < brd_next) { 3857 ft_p = ft + next; 3858 next = ft_p->ft_next; 3859 } else { /* insert broadcast */ 3860 ft_p = ft + brd_next; 3861 brd_next = ft_p->ft_next; 3862 } 3863 cnt = ft_p->ft_frags; // cnt > 0 3864 if (unlikely(cnt > howmany)) 3865 break; /* no more space */ 3866 howmany -= cnt; 3867 if (netmap_verbose && cnt > 1) 3868 RD(5, "rx %d frags to %d", cnt, j); 3869 ft_end = ft_p + cnt; 3870 do { 3871 void *dst, *src = ft_p->ft_buf; 3872 size_t len = (ft_p->ft_len + 63) & ~63; 3873 3874 slot = &ring->slot[j]; 3875 dst = BDG_NMB(dst_na->nm_mem, slot); 3876 /* round to a multiple of 64 */ 3877 3878 ND("send %d %d bytes at %s:%d", 3879 i, ft_p->ft_len, dst_ifp->if_xname, j); 3880 if (ft_p->ft_flags & NS_INDIRECT) { 3881 if (copyin(src, dst, len)) { 3882 // invalid user pointer, pretend len is 0 3883 ft_p->ft_len = 0; 3884 } 3885 } else { 3886 //memcpy(dst, src, len); 3887 pkt_copy(src, dst, (int)len); 3888 } 3889 slot->len = ft_p->ft_len; 3890 slot->flags = (cnt << 8)| NS_MOREFRAG; 3891 j = nm_next(j, lim); 3892 ft_p++; 3893 sent++; 3894 } while (ft_p != ft_end); 3895 slot->flags = (cnt << 8); /* clear flag on last entry */ 3896 /* are we done ? */ 3897 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 3898 break; 3899 } 3900 { 3901 /* current position */ 3902 uint32_t *p = kring->nkr_leases; /* shorthand */ 3903 uint32_t update_pos; 3904 int still_locked = 1; 3905 3906 mtx_lock(&kring->q_lock); 3907 if (unlikely(howmany > 0)) { 3908 /* not used all bufs. If i am the last one 3909 * i can recover the slots, otherwise must 3910 * fill them with 0 to mark empty packets. 3911 */ 3912 ND("leftover %d bufs", howmany); 3913 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 3914 /* yes i am the last one */ 3915 ND("roll back nkr_hwlease to %d", j); 3916 kring->nkr_hwlease = j; 3917 } else { 3918 while (howmany-- > 0) { 3919 ring->slot[j].len = 0; 3920 ring->slot[j].flags = 0; 3921 j = nm_next(j, lim); 3922 } 3923 } 3924 } 3925 p[lease_idx] = j; /* report I am done */ 3926 3927 update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur; 3928 3929 if (my_start == update_pos) { 3930 /* all slots before my_start have been reported, 3931 * so scan subsequent leases to see if other ranges 3932 * have been completed, and to a selwakeup or txsync. 3933 */ 3934 while (lease_idx != kring->nkr_lease_idx && 3935 p[lease_idx] != NR_NOSLOT) { 3936 j = p[lease_idx]; 3937 p[lease_idx] = NR_NOSLOT; 3938 lease_idx = nm_next(lease_idx, lim); 3939 } 3940 /* j is the new 'write' position. j != my_start 3941 * means there are new buffers to report 3942 */ 3943 if (likely(j != my_start)) { 3944 if (is_vp) { 3945 uint32_t old_avail = kring->nr_hwavail; 3946 3947 kring->nr_hwavail = (j >= kring->nr_hwcur) ? 3948 j - kring->nr_hwcur : 3949 j + lim + 1 - kring->nr_hwcur; 3950 if (kring->nr_hwavail < old_avail) { 3951 D("avail shrink %d -> %d", 3952 old_avail, kring->nr_hwavail); 3953 } 3954 still_locked = 0; 3955 mtx_unlock(&kring->q_lock); 3956 selwakeuppri(&kring->si, PI_NET); 3957 } else { 3958 ring->cur = j; 3959 /* XXX update avail ? 
						still_locked = 0;
						dst_na->nm_txsync(dst_ifp, dst_nr, 0);
						mtx_unlock(&kring->q_lock);

						/* retry to send more packets */
						if (nma_is_hw(dst_na) && retry--)
							goto retry;
					}
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}


/*
 * Main dispatch routine for the bridge.
 * We already know that only one thread is running this;
 * we must run nm_bdg_preflush without holding the lock.
 */
static int
bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring = &na->tx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int j, k, lim = kring->nkr_num_slots - 1;

	k = ring->cur;
	if (k > lim)
		return netmap_ring_reinit(kring);

	if (bridge_batch <= 0) { /* testing only */
		j = k; // used all
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	j = nm_bdg_preflush(na, ring_nr, kring, k);
	if (j != k)
		D("early break at %d/%d, avail %d", j, k, kring->nr_hwavail);
	/* k-j modulo ring size is the number of slots processed */
	if (k < j)
		k += kring->nkr_num_slots;
	kring->nr_hwavail = lim - (k - j);

done:
	kring->nr_hwcur = j;
	ring->avail = kring->nr_hwavail;
	if (netmap_verbose)
		D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags);
	return 0;
}


/*
 * User process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int j, lim = kring->nkr_num_slots - 1;
	u_int k = ring->cur, resvd = ring->reserved;
	int n;

	mtx_lock(&kring->q_lock);
	if (k > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* skip past packets that userspace has released */
	j = kring->nr_hwcur;	/* netmap ring index */
	if (resvd > 0) {
		if (resvd + ring->avail >= lim + 1) {
			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
			ring->reserved = resvd = 0; // XXX panic...
		}
		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
	}

	if (j != k) { /* userspace has released some packets */
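		/* The slots between hwcur and the reserve-adjusted cur
		 * have been returned by the application: count them,
		 * sanity-check the buffer indexes, clear NS_BUF_CHANGED
		 * and advance hwcur so the senders can reuse them.
		 */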
		n = k - j;
		if (n < 0)
			n += kring->nkr_num_slots;
		ND("userspace releases %d packets", n);
		for (n = 0; likely(j != k); n++) {
			struct netmap_slot *slot = &ring->slot[j];
			void *addr = BDG_NMB(na->nm_mem, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			j = nm_next(j, lim);
		}
		kring->nr_hwavail -= n;
		kring->nr_hwcur = k;
	}
	/* tell userspace that there are new packets */
	ring->avail = kring->nr_hwavail - resvd;
	n = 0;
done:
	mtx_unlock(&kring->q_lock);
	return n;
}


static int
bdg_netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_adapter na;

	ND("attaching virtual bridge");
	bzero(&na, sizeof(na));

	na.ifp = arg->ifp;
	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na.num_tx_rings = arg->num_tx_rings;
	na.num_rx_rings = arg->num_rx_rings;
	na.num_tx_desc = arg->num_tx_desc;
	na.num_rx_desc = arg->num_rx_desc;
	na.nm_txsync = bdg_netmap_txsync;
	na.nm_rxsync = bdg_netmap_rxsync;
	na.nm_register = bdg_netmap_reg;
	na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname,
			na.num_tx_rings, na.num_tx_desc,
			na.num_rx_rings, na.num_rx_desc);
	return netmap_attach(&na, na.num_tx_rings);
}


static struct cdev *netmap_dev; /* /dev/netmap character device. */


/*
 * Module loader.
 *
 * Create the /dev/netmap device and initialize all global
 * variables.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_init(void)
{
	int i, error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0) {
		printf("netmap: unable to initialize the memory allocator.\n");
		return (error);
	}
	printf("netmap: loaded module\n");
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
			"netmap");

	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
	for (i = 0; i < NM_BRIDGES; i++)
		BDG_RWINIT(&nm_bridges[i]);
	return (error);
}


/*
 * Module unloader.
 *
 * Free all the memory, and destroy the ``/dev/netmap`` device.
 */
static void
netmap_fini(void)
{
	destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	printf("netmap: unloaded module.\n");
}


#ifdef __FreeBSD__
/*
 * Kernel entry point.
 *
 * Initialize/finalize the module and return.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_loader(__unused struct module *module, int event, __unused void *arg)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		error = netmap_init();
		break;

	case MOD_UNLOAD:
		netmap_fini();
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}


DEV_MODULE(netmap, netmap_loader, NULL);
#endif /* __FreeBSD__ */