/*
 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#ifdef __FreeBSD__
#define TEST_STUFF	// test code, does not compile yet on linux
#endif /* __FreeBSD__ */

/*
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
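
	A minimal userspace sketch of steps 1-6 above (illustrative only:
	error handling is omitted, and NETMAP_IF(), NETMAP_TXRING() and
	struct nmreq are the ones exported by net/netmap.h and
	net/netmap_user.h):

		struct nmreq req;
		struct pollfd pfd;
		int fd = open("/dev/netmap", O_RDWR);		// step 1

		bzero(&req, sizeof(req));
		strncpy(req.nr_name, "em0", sizeof(req.nr_name));
		req.nr_version = NETMAP_API;
		ioctl(fd, NIOCREGIF, &req);			// step 2
		char *mem = mmap(0, req.nr_memsize,		// step 3
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
		struct netmap_ring *txring = NETMAP_TXRING(nifp, 0); // step 4

		// fill slots, advance txring->cur and decrease txring->avail,
		// then flush and/or wait for room on the ring:
		ioctl(fd, NIOCTXSYNC, NULL);			// step 5
		pfd.fd = fd;
		pfd.events = POLLOUT;
		poll(&pfd, 1, -1);				// step 6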

LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
131 * Other OS-specific code that must be accessed by drivers 132 * is present in netmap_kern.h 133 */ 134 135 #if defined(__FreeBSD__) 136 #include <sys/cdefs.h> /* prerequisite */ 137 __FBSDID("$FreeBSD$"); 138 139 #include <sys/types.h> 140 #include <sys/module.h> 141 #include <sys/errno.h> 142 #include <sys/param.h> /* defines used in kernel.h */ 143 #include <sys/jail.h> 144 #include <sys/kernel.h> /* types used in module initialization */ 145 #include <sys/conf.h> /* cdevsw struct */ 146 #include <sys/uio.h> /* uio struct */ 147 #include <sys/sockio.h> 148 #include <sys/socketvar.h> /* struct socket */ 149 #include <sys/malloc.h> 150 #include <sys/mman.h> /* PROT_EXEC */ 151 #include <sys/poll.h> 152 #include <sys/proc.h> 153 #include <sys/rwlock.h> 154 #include <vm/vm.h> /* vtophys */ 155 #include <vm/pmap.h> /* vtophys */ 156 #include <vm/vm_param.h> 157 #include <vm/vm_object.h> 158 #include <vm/vm_page.h> 159 #include <vm/vm_pager.h> 160 #include <vm/uma.h> 161 #include <sys/socket.h> /* sockaddrs */ 162 #include <sys/selinfo.h> 163 #include <sys/sysctl.h> 164 #include <net/if.h> 165 #include <net/if_var.h> 166 #include <net/bpf.h> /* BIOCIMMEDIATE */ 167 #include <net/vnet.h> 168 #include <machine/bus.h> /* bus_dmamap_* */ 169 #include <sys/endian.h> 170 #include <sys/refcount.h> 171 172 #define prefetch(x) __builtin_prefetch(x) 173 174 #define BDG_RWLOCK_T struct rwlock // struct rwlock 175 176 #define BDG_RWINIT(b) \ 177 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) 178 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) 179 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) 180 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) 181 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) 182 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) 183 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) 184 185 186 /* netmap global lock. 
187 * normally called within the user thread (upon a system call) 188 * or when a file descriptor or process is terminated 189 * (last close or last munmap) 190 */ 191 192 #define NMG_LOCK_T struct mtx 193 #define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF) 194 #define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) 195 #define NMG_LOCK() mtx_lock(&netmap_global_lock) 196 #define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) 197 #define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) 198 199 200 /* atomic operations */ 201 #include <machine/atomic.h> 202 #define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) 203 #define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) 204 205 206 #elif defined(linux) 207 208 #include "bsd_glue.h" 209 210 static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *); 211 212 static struct device_driver* 213 linux_netmap_find_driver(struct device *dev) 214 { 215 struct device_driver *dd; 216 217 while ( (dd = dev->driver) == NULL ) { 218 if ( (dev = dev->parent) == NULL ) 219 return NULL; 220 } 221 return dd; 222 } 223 224 static struct net_device* 225 ifunit_ref(const char *name) 226 { 227 struct net_device *ifp = dev_get_by_name(&init_net, name); 228 struct device_driver *dd; 229 230 if (ifp == NULL) 231 return NULL; 232 233 if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL ) 234 goto error; 235 236 if (!try_module_get(dd->owner)) 237 goto error; 238 239 return ifp; 240 error: 241 dev_put(ifp); 242 return NULL; 243 } 244 245 static void 246 if_rele(struct net_device *ifp) 247 { 248 struct device_driver *dd; 249 dd = linux_netmap_find_driver(&ifp->dev); 250 dev_put(ifp); 251 if (dd) 252 module_put(dd->owner); 253 } 254 255 // XXX a mtx would suffice here too 20130404 gl 256 #define NMG_LOCK_T struct semaphore 257 #define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) 258 #define NMG_LOCK_DESTROY() 259 #define NMG_LOCK() down(&netmap_global_lock) 260 #define NMG_UNLOCK() up(&netmap_global_lock) 261 #define NMG_LOCK_ASSERT() // XXX to be completed 262 263 264 #elif defined(__APPLE__) 265 266 #warning OSX support is only partial 267 #include "osx_glue.h" 268 269 #else 270 271 #error Unsupported platform 272 273 #endif /* unsupported */ 274 275 /* 276 * common headers 277 */ 278 #include <net/netmap.h> 279 #include <dev/netmap/netmap_kern.h> 280 #include <dev/netmap/netmap_mem2.h> 281 282 283 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 284 285 /* 286 * The following variables are used by the drivers and replicate 287 * fields in the global memory pool. They only refer to buffers 288 * used by physical interfaces. 
289 */ 290 u_int netmap_total_buffers; 291 u_int netmap_buf_size; 292 char *netmap_buffer_base; /* also address of an invalid buffer */ 293 294 /* user-controlled variables */ 295 int netmap_verbose; 296 297 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 298 299 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 300 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 301 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 302 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 303 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 304 int netmap_mitigate = 1; 305 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 306 int netmap_no_pendintr = 1; 307 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 308 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 309 int netmap_txsync_retry = 2; 310 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 311 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 312 313 int netmap_drop = 0; /* debugging */ 314 int netmap_flags = 0; /* debug flags */ 315 int netmap_fwd = 0; /* force transparent mode */ 316 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 317 318 SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); 319 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 320 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 321 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 322 323 NMG_LOCK_T netmap_global_lock; 324 325 /* 326 * protect against multiple threads using the same ring. 327 * also check that the ring has not been stopped. 328 */ 329 #define NM_KR_BUSY 1 330 #define NM_KR_STOPPED 2 331 static void nm_kr_put(struct netmap_kring *kr); 332 static __inline int nm_kr_tryget(struct netmap_kring *kr) 333 { 334 /* check a first time without taking the lock 335 * to avoid starvation for nm_kr_get() 336 */ 337 if (unlikely(kr->nkr_stopped)) { 338 ND("ring %p stopped (%d)", kr, kr->nkr_stopped); 339 return NM_KR_STOPPED; 340 } 341 if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) 342 return NM_KR_BUSY; 343 /* check a second time with lock held */ 344 if (unlikely(kr->nkr_stopped)) { 345 ND("ring %p stopped (%d)", kr, kr->nkr_stopped); 346 nm_kr_put(kr); 347 return NM_KR_STOPPED; 348 } 349 return 0; 350 } 351 352 static __inline void nm_kr_put(struct netmap_kring *kr) 353 { 354 NM_ATOMIC_CLEAR(&kr->nr_busy); 355 } 356 357 static void nm_kr_get(struct netmap_kring *kr) 358 { 359 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 360 tsleep(kr, 0, "NM_KR_GET", 4); 361 } 362 363 static void nm_disable_ring(struct netmap_kring *kr) 364 { 365 kr->nkr_stopped = 1; 366 nm_kr_get(kr); 367 mtx_lock(&kr->q_lock); 368 mtx_unlock(&kr->q_lock); 369 nm_kr_put(kr); 370 } 371 372 void netmap_disable_all_rings(struct ifnet *ifp) 373 { 374 struct netmap_adapter *na; 375 int i; 376 377 if (!(ifp->if_capenable & IFCAP_NETMAP)) 378 return; 379 380 na = NA(ifp); 381 382 for (i = 0; i < na->num_tx_rings + 1; i++) { 383 nm_disable_ring(na->tx_rings + i); 384 selwakeuppri(&na->tx_rings[i].si, PI_NET); 385 } 386 for (i = 0; i < na->num_rx_rings + 1; i++) { 387 nm_disable_ring(na->rx_rings + i); 388 selwakeuppri(&na->rx_rings[i].si, PI_NET); 389 } 390 selwakeuppri(&na->tx_si, PI_NET); 391 selwakeuppri(&na->rx_si, PI_NET); 392 } 393 394 void netmap_enable_all_rings(struct ifnet *ifp) 395 { 396 struct netmap_adapter *na; 397 int i; 398 399 if (!(ifp->if_capenable & 
IFCAP_NETMAP)) 400 return; 401 402 na = NA(ifp); 403 for (i = 0; i < na->num_tx_rings + 1; i++) { 404 D("enabling %p", na->tx_rings + i); 405 na->tx_rings[i].nkr_stopped = 0; 406 } 407 for (i = 0; i < na->num_rx_rings + 1; i++) { 408 D("enabling %p", na->rx_rings + i); 409 na->rx_rings[i].nkr_stopped = 0; 410 } 411 } 412 413 414 /* 415 * generic bound_checking function 416 */ 417 u_int 418 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 419 { 420 u_int oldv = *v; 421 const char *op = NULL; 422 423 if (dflt < lo) 424 dflt = lo; 425 if (dflt > hi) 426 dflt = hi; 427 if (oldv < lo) { 428 *v = dflt; 429 op = "Bump"; 430 } else if (oldv > hi) { 431 *v = hi; 432 op = "Clamp"; 433 } 434 if (op && msg) 435 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 436 return *v; 437 } 438 439 /* 440 * packet-dump function, user-supplied or static buffer. 441 * The destination buffer must be at least 30+4*len 442 */ 443 const char * 444 nm_dump_buf(char *p, int len, int lim, char *dst) 445 { 446 static char _dst[8192]; 447 int i, j, i0; 448 static char hex[] ="0123456789abcdef"; 449 char *o; /* output position */ 450 451 #define P_HI(x) hex[((x) & 0xf0)>>4] 452 #define P_LO(x) hex[((x) & 0xf)] 453 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') 454 if (!dst) 455 dst = _dst; 456 if (lim <= 0 || lim > len) 457 lim = len; 458 o = dst; 459 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 460 o += strlen(o); 461 /* hexdump routine */ 462 for (i = 0; i < lim; ) { 463 sprintf(o, "%5d: ", i); 464 o += strlen(o); 465 memset(o, ' ', 48); 466 i0 = i; 467 for (j=0; j < 16 && i < lim; i++, j++) { 468 o[j*3] = P_HI(p[i]); 469 o[j*3+1] = P_LO(p[i]); 470 } 471 i = i0; 472 for (j=0; j < 16 && i < lim; i++, j++) 473 o[j + 48] = P_C(p[i]); 474 o[j+48] = '\n'; 475 o += j+49; 476 } 477 *o = '\0'; 478 #undef P_HI 479 #undef P_LO 480 #undef P_C 481 return dst; 482 } 483 484 /* 485 * system parameters (most of them in netmap_kern.h) 486 * NM_NAME prefix for switch port names, default "vale" 487 * NM_BDG_MAXPORTS number of ports 488 * NM_BRIDGES max number of switches in the system. 489 * XXX should become a sysctl or tunable 490 * 491 * Switch ports are named valeX:Y where X is the switch name and Y 492 * is the port. If Y matches a physical interface name, the port is 493 * connected to a physical device. 494 * 495 * Unlike physical interfaces, switch ports use their own memory region 496 * for rings and buffers. 497 * The virtual interfaces use per-queue lock instead of core lock. 498 * In the tx loop, we aggregate traffic in batches to make all operations 499 * faster. The batch size is bridge_batch. 500 */ 501 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ 502 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 503 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 504 #define NM_BDG_HASH 1024 /* forwarding table entries */ 505 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 506 #define NM_MULTISEG 64 /* max size of a chain of bufs */ 507 /* actual size of the tables */ 508 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) 509 /* NM_FT_NULL terminates a list of slots in the ft */ 510 #define NM_FT_NULL NM_BDG_BATCH_MAX 511 #define NM_BRIDGES 8 /* number of bridges */ 512 513 514 /* 515 * bridge_batch is set via sysctl to the max batch size to be 516 * used in the bridge. The actual value may be larger as the 517 * last packet in the block may overflow the size. 
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");


/*
 * These are used to handle reference counters for bridge ports.
 */
#define	ADD_BDG_REF(ifp)	refcount_acquire(&NA(ifp)->na_bdg_refcount)
#define	DROP_BDG_REF(ifp)	refcount_release(&NA(ifp)->na_bdg_refcount)

/* The bridge references the buffers using the device specific look up table */
static inline void *
BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot)
{
	struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut;
	uint32_t i = slot->buf_idx;
	return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? lut[0].vaddr : lut[i].vaddr;
}

static int bdg_netmap_attach(struct netmap_adapter *);
static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
int kern_netmap_regif(struct nmreq *nmr);

/*
 * Each transmit queue accumulates a batch of packets into
 * a structure before forwarding. Packets to the same
 * destination are put in a list using ft_next as a link field.
 * ft_frags and ft_next are valid only on the first fragment.
 */
struct nm_bdg_fwd {	/* forwarding entry for a bridge */
	void *ft_buf;		/* netmap or indirect buffer */
	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
	uint8_t _ft_port;	/* dst port (unused) */
	uint16_t ft_flags;	/* flags, e.g. indirect */
	uint16_t ft_len;	/* src fragment len */
	uint16_t ft_next;	/* next packet to same destination */
};

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t mac;	/* the top 2 bytes are the epoch */
	uint64_t ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either an index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
	 * forward this packet. ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdgctl().
	 */
	bdg_lookup_fn_t nm_bdg_lookup;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * A few functions to tell which kind of port we are using.
 * XXX should we hold a lock ?
 *
 * nma_is_vp()		virtual port
 * nma_is_host()	port connected to the host stack
 * nma_is_hw()		port connected to a NIC
 */
int nma_is_vp(struct netmap_adapter *na);
int
nma_is_vp(struct netmap_adapter *na)
{
	return na->nm_register == bdg_netmap_reg;
}

static __inline int
nma_is_host(struct netmap_adapter *na)
{
	return na->nm_register == NULL;
}

static __inline int
nma_is_hw(struct netmap_adapter *na)
{
	/* In case of sw adapter, nm_register is NULL */
	return !nma_is_vp(na) && !nma_is_host(na);
}


/*
 * If the NIC is owned by the kernel
 * (i.e., bridge), neither another bridge nor a user can use it;
 * if the NIC is owned by a user, only users can share it.
 * Evaluation must be done under NMG_LOCK().
 */
#define NETMAP_OWNED_BY_KERN(ifp)	(!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
#define NETMAP_OWNED_BY_ANY(ifp) \
	(NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))

/*
 * NA(ifp)->bdg_port	port index
 */


/*
 * this is a slightly optimized copy routine which rounds
 * to multiples of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
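 *
 * Callers are expected to round the length up themselves; an
 * illustrative call (assuming src and dst point to buffers at least
 * as large as the rounded length) would be:
 *
 *	pkt_copy(src, dst, (len + 63) & ~63);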
680 */ 681 static inline void 682 pkt_copy(void *_src, void *_dst, int l) 683 { 684 uint64_t *src = _src; 685 uint64_t *dst = _dst; 686 if (unlikely(l >= 1024)) { 687 memcpy(dst, src, l); 688 return; 689 } 690 for (; likely(l > 0); l-=64) { 691 *dst++ = *src++; 692 *dst++ = *src++; 693 *dst++ = *src++; 694 *dst++ = *src++; 695 *dst++ = *src++; 696 *dst++ = *src++; 697 *dst++ = *src++; 698 *dst++ = *src++; 699 } 700 } 701 702 703 #ifdef TEST_STUFF 704 struct xxx { 705 char *name; 706 void (*fn)(uint32_t); 707 }; 708 709 710 static void 711 nm_test_defmtx(uint32_t n) 712 { 713 uint32_t i; 714 struct mtx m; 715 mtx_init(&m, "test", NULL, MTX_DEF); 716 for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } 717 mtx_destroy(&m); 718 return; 719 } 720 721 static void 722 nm_test_spinmtx(uint32_t n) 723 { 724 uint32_t i; 725 struct mtx m; 726 mtx_init(&m, "test", NULL, MTX_SPIN); 727 for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } 728 mtx_destroy(&m); 729 return; 730 } 731 732 static void 733 nm_test_rlock(uint32_t n) 734 { 735 uint32_t i; 736 struct rwlock m; 737 rw_init(&m, "test"); 738 for (i = 0; i < n; i++) { rw_rlock(&m); rw_runlock(&m); } 739 rw_destroy(&m); 740 return; 741 } 742 743 static void 744 nm_test_wlock(uint32_t n) 745 { 746 uint32_t i; 747 struct rwlock m; 748 rw_init(&m, "test"); 749 for (i = 0; i < n; i++) { rw_wlock(&m); rw_wunlock(&m); } 750 rw_destroy(&m); 751 return; 752 } 753 754 static void 755 nm_test_slock(uint32_t n) 756 { 757 uint32_t i; 758 struct sx m; 759 sx_init(&m, "test"); 760 for (i = 0; i < n; i++) { sx_slock(&m); sx_sunlock(&m); } 761 sx_destroy(&m); 762 return; 763 } 764 765 static void 766 nm_test_xlock(uint32_t n) 767 { 768 uint32_t i; 769 struct sx m; 770 sx_init(&m, "test"); 771 for (i = 0; i < n; i++) { sx_xlock(&m); sx_xunlock(&m); } 772 sx_destroy(&m); 773 return; 774 } 775 776 777 struct xxx nm_tests[] = { 778 { "defmtx", nm_test_defmtx }, 779 { "spinmtx", nm_test_spinmtx }, 780 { "rlock", nm_test_rlock }, 781 { "wlock", nm_test_wlock }, 782 { "slock", nm_test_slock }, 783 { "xlock", nm_test_xlock }, 784 }; 785 786 static int 787 nm_test(struct nmreq *nmr) 788 { 789 uint32_t scale, n, test; 790 static int old_test = -1; 791 792 test = nmr->nr_cmd; 793 scale = nmr->nr_offset; 794 n = sizeof(nm_tests) / sizeof(struct xxx) - 1; 795 if (test > n) { 796 D("test index too high, max %d", n); 797 return 0; 798 } 799 800 if (old_test != test) { 801 D("test %s scale %d", nm_tests[test].name, scale); 802 old_test = test; 803 } 804 nm_tests[test].fn(scale); 805 return 0; 806 } 807 #endif /* TEST_STUFF */ 808 809 /* 810 * locate a bridge among the existing ones. 811 * MUST BE CALLED WITH NMG_LOCK() 812 * 813 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. 814 * We assume that this is called with a name of at least NM_NAME chars. 815 */ 816 static struct nm_bridge * 817 nm_find_bridge(const char *name, int create) 818 { 819 int i, l, namelen; 820 struct nm_bridge *b = NULL; 821 822 NMG_LOCK_ASSERT(); 823 824 namelen = strlen(NM_NAME); /* base length */ 825 l = name ? strlen(name) : 0; /* actual length */ 826 if (l < namelen) { 827 D("invalid bridge name %s", name ? 
name : NULL); 828 return NULL; 829 } 830 for (i = namelen + 1; i < l; i++) { 831 if (name[i] == ':') { 832 namelen = i; 833 break; 834 } 835 } 836 if (namelen >= IFNAMSIZ) 837 namelen = IFNAMSIZ; 838 ND("--- prefix is '%.*s' ---", namelen, name); 839 840 /* lookup the name, remember empty slot if there is one */ 841 for (i = 0; i < NM_BRIDGES; i++) { 842 struct nm_bridge *x = nm_bridges + i; 843 844 if (x->bdg_active_ports == 0) { 845 if (create && b == NULL) 846 b = x; /* record empty slot */ 847 } else if (x->bdg_namelen != namelen) { 848 continue; 849 } else if (strncmp(name, x->bdg_basename, namelen) == 0) { 850 ND("found '%.*s' at %d", namelen, name, i); 851 b = x; 852 break; 853 } 854 } 855 if (i == NM_BRIDGES && b) { /* name not found, can create entry */ 856 /* initialize the bridge */ 857 strncpy(b->bdg_basename, name, namelen); 858 ND("create new bridge %s with ports %d", b->bdg_basename, 859 b->bdg_active_ports); 860 b->bdg_namelen = namelen; 861 b->bdg_active_ports = 0; 862 for (i = 0; i < NM_BDG_MAXPORTS; i++) 863 b->bdg_port_index[i] = i; 864 /* set the default function */ 865 b->nm_bdg_lookup = netmap_bdg_learning; 866 /* reset the MAC address table */ 867 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); 868 } 869 return b; 870 } 871 872 873 /* 874 * Free the forwarding tables for rings attached to switch ports. 875 */ 876 static void 877 nm_free_bdgfwd(struct netmap_adapter *na) 878 { 879 int nrings, i; 880 struct netmap_kring *kring; 881 882 NMG_LOCK_ASSERT(); 883 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 884 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 885 for (i = 0; i < nrings; i++) { 886 if (kring[i].nkr_ft) { 887 free(kring[i].nkr_ft, M_DEVBUF); 888 kring[i].nkr_ft = NULL; /* protect from freeing twice */ 889 } 890 } 891 if (nma_is_hw(na)) 892 nm_free_bdgfwd(SWNA(na->ifp)); 893 } 894 895 896 /* 897 * Allocate the forwarding tables for the rings attached to the bridge ports. 898 */ 899 static int 900 nm_alloc_bdgfwd(struct netmap_adapter *na) 901 { 902 int nrings, l, i, num_dstq; 903 struct netmap_kring *kring; 904 905 NMG_LOCK_ASSERT(); 906 /* all port:rings + broadcast */ 907 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 908 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 909 l += sizeof(struct nm_bdg_q) * num_dstq; 910 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 911 912 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 913 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 914 for (i = 0; i < nrings; i++) { 915 struct nm_bdg_fwd *ft; 916 struct nm_bdg_q *dstq; 917 int j; 918 919 ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); 920 if (!ft) { 921 nm_free_bdgfwd(na); 922 return ENOMEM; 923 } 924 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 925 for (j = 0; j < num_dstq; j++) { 926 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 927 dstq[j].bq_len = 0; 928 } 929 kring[i].nkr_ft = ft; 930 } 931 if (nma_is_hw(na)) 932 nm_alloc_bdgfwd(SWNA(na->ifp)); 933 return 0; 934 } 935 936 937 /* 938 * Fetch configuration from the device, to cope with dynamic 939 * reconfigurations after loading the module. 
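 *
 * Drivers that support reconfiguration supply an na->nm_config callback
 * that fills in the current ring counts. A sketch (the driver-side names
 * are hypothetical; the signature matches the call in
 * netmap_update_config() below):
 *
 *	static int
 *	foo_netmap_config(struct ifnet *ifp, u_int *txr, u_int *txd,
 *	    u_int *rxr, u_int *rxd)
 *	{
 *		struct foo_adapter *sc = ifp->if_softc;
 *
 *		*txr = sc->num_tx_queues;
 *		*txd = sc->num_tx_desc;
 *		*rxr = sc->num_rx_queues;
 *		*rxd = sc->num_rx_desc;
 *		return 0;
 *	}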
940 */ 941 static int 942 netmap_update_config(struct netmap_adapter *na) 943 { 944 struct ifnet *ifp = na->ifp; 945 u_int txr, txd, rxr, rxd; 946 947 txr = txd = rxr = rxd = 0; 948 if (na->nm_config) { 949 na->nm_config(ifp, &txr, &txd, &rxr, &rxd); 950 } else { 951 /* take whatever we had at init time */ 952 txr = na->num_tx_rings; 953 txd = na->num_tx_desc; 954 rxr = na->num_rx_rings; 955 rxd = na->num_rx_desc; 956 } 957 958 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 959 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 960 return 0; /* nothing changed */ 961 if (netmap_verbose || na->refcount > 0) { 962 D("stored config %s: txring %d x %d, rxring %d x %d", 963 ifp->if_xname, 964 na->num_tx_rings, na->num_tx_desc, 965 na->num_rx_rings, na->num_rx_desc); 966 D("new config %s: txring %d x %d, rxring %d x %d", 967 ifp->if_xname, txr, txd, rxr, rxd); 968 } 969 if (na->refcount == 0) { 970 D("configuration changed (but fine)"); 971 na->num_tx_rings = txr; 972 na->num_tx_desc = txd; 973 na->num_rx_rings = rxr; 974 na->num_rx_desc = rxd; 975 return 0; 976 } 977 D("configuration changed while active, this is bad..."); 978 return 1; 979 } 980 981 static struct netmap_if * 982 netmap_if_new(const char *ifname, struct netmap_adapter *na) 983 { 984 if (netmap_update_config(na)) { 985 /* configuration mismatch, report and fail */ 986 return NULL; 987 } 988 return netmap_mem_if_new(ifname, na); 989 } 990 991 992 /* Structure associated to each thread which registered an interface. 993 * 994 * The first 4 fields of this structure are written by NIOCREGIF and 995 * read by poll() and NIOC?XSYNC. 996 * There is low contention among writers (actually, a correct user program 997 * should have no contention among writers) and among writers and readers, 998 * so we use a single global lock to protect the structure initialization. 999 * Since initialization involves the allocation of memory, we reuse the memory 1000 * allocator lock. 1001 * Read access to the structure is lock free. Readers must check that 1002 * np_nifp is not NULL before using the other fields. 1003 * If np_nifp is NULL initialization has not been performed, so they should 1004 * return an error to userlevel. 1005 * 1006 * The ref_done field is used to regulate access to the refcount in the 1007 * memory allocator. The refcount must be incremented at most once for 1008 * each open("/dev/netmap"). The increment is performed by the first 1009 * function that calls netmap_get_memory() (currently called by 1010 * mmap(), NIOCGINFO and NIOCREGIF). 1011 * If the refcount is incremented, it is then decremented when the 1012 * private structure is destroyed. 1013 */ 1014 struct netmap_priv_d { 1015 struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ 1016 1017 struct ifnet *np_ifp; /* device for which we hold a ref. */ 1018 int np_ringid; /* from the ioctl */ 1019 u_int np_qfirst, np_qlast; /* range of rings to scan */ 1020 uint16_t np_txpoll; 1021 1022 struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ 1023 #ifdef __FreeBSD__ 1024 int np_refcount; /* use with NMG_LOCK held */ 1025 #endif /* __FreeBSD__ */ 1026 }; 1027 1028 /* grab a reference to the memory allocator, if we don't have one already. The 1029 * reference is taken from the netmap_adapter registered with the priv. 
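 *
 * Callers that may touch the shared memory region (mmap(), NIOCGINFO,
 * NIOCREGIF, as noted above) go through the netmap_get_memory() wrapper
 * defined below; a sketch of the common prologue (error path simplified,
 * see netmap_mmap_single() for an actual user):
 *
 *	error = netmap_get_memory(priv);	// at most one ref per open()
 *	if (error)
 *		return error;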
1030 * 1031 */ 1032 static int 1033 netmap_get_memory_locked(struct netmap_priv_d* p) 1034 { 1035 struct netmap_mem_d *nmd; 1036 int error = 0; 1037 1038 if (p->np_ifp == NULL) { 1039 if (!netmap_mmap_unreg) 1040 return ENODEV; 1041 /* for compatibility with older versions of the API 1042 * we use the global allocator when no interface has been 1043 * registered 1044 */ 1045 nmd = &nm_mem; 1046 } else { 1047 nmd = NA(p->np_ifp)->nm_mem; 1048 } 1049 if (p->np_mref == NULL) { 1050 error = netmap_mem_finalize(nmd); 1051 if (!error) 1052 p->np_mref = nmd; 1053 } else if (p->np_mref != nmd) { 1054 /* a virtual port has been registered, but previous 1055 * syscalls already used the global allocator. 1056 * We cannot continue 1057 */ 1058 error = ENODEV; 1059 } 1060 return error; 1061 } 1062 1063 static int 1064 netmap_get_memory(struct netmap_priv_d* p) 1065 { 1066 int error; 1067 NMG_LOCK(); 1068 error = netmap_get_memory_locked(p); 1069 NMG_UNLOCK(); 1070 return error; 1071 } 1072 1073 static int 1074 netmap_have_memory_locked(struct netmap_priv_d* p) 1075 { 1076 return p->np_mref != NULL; 1077 } 1078 1079 static void 1080 netmap_drop_memory_locked(struct netmap_priv_d* p) 1081 { 1082 if (p->np_mref) { 1083 netmap_mem_deref(p->np_mref); 1084 p->np_mref = NULL; 1085 } 1086 } 1087 1088 /* 1089 * File descriptor's private data destructor. 1090 * 1091 * Call nm_register(ifp,0) to stop netmap mode on the interface and 1092 * revert to normal operation. We expect that np_ifp has not gone. 1093 * The second argument is the nifp to work on. In some cases it is 1094 * not attached yet to the netmap_priv_d so we need to pass it as 1095 * a separate argument. 1096 */ 1097 /* call with NMG_LOCK held */ 1098 static void 1099 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 1100 { 1101 struct ifnet *ifp = priv->np_ifp; 1102 struct netmap_adapter *na = NA(ifp); 1103 1104 NMG_LOCK_ASSERT(); 1105 na->refcount--; 1106 if (na->refcount <= 0) { /* last instance */ 1107 u_int i; 1108 1109 if (netmap_verbose) 1110 D("deleting last instance for %s", ifp->if_xname); 1111 /* 1112 * (TO CHECK) This function is only called 1113 * when the last reference to this file descriptor goes 1114 * away. This means we cannot have any pending poll() 1115 * or interrupt routine operating on the structure. 1116 * XXX The file may be closed in a thread while 1117 * another thread is using it. 1118 * Linux keeps the file opened until the last reference 1119 * by any outstanding ioctl/poll or mmap is gone. 1120 * FreeBSD does not track mmap()s (but we do) and 1121 * wakes up any sleeping poll(). Need to check what 1122 * happens if the close() occurs while a concurrent 1123 * syscall is running. 1124 */ 1125 na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ 1126 /* Wake up any sleeping threads. netmap_poll will 1127 * then return POLLERR 1128 * XXX The wake up now must happen during *_down(), when 1129 * we order all activities to stop. -gl 1130 */ 1131 nm_free_bdgfwd(na); 1132 for (i = 0; i < na->num_tx_rings + 1; i++) { 1133 mtx_destroy(&na->tx_rings[i].q_lock); 1134 } 1135 for (i = 0; i < na->num_rx_rings + 1; i++) { 1136 mtx_destroy(&na->rx_rings[i].q_lock); 1137 } 1138 /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ 1139 /* knlist_destroy(&na->tx_si.si_note); */ 1140 /* knlist_destroy(&na->rx_si.si_note); */ 1141 if (nma_is_hw(na)) 1142 SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; 1143 } 1144 /* 1145 * netmap_mem_if_delete() deletes the nifp, and if this is 1146 * the last instance also buffers, rings and krings. 1147 */ 1148 netmap_mem_if_delete(na, nifp); 1149 } 1150 1151 1152 /* we assume netmap adapter exists 1153 * Called with NMG_LOCK held 1154 */ 1155 static void 1156 nm_if_rele(struct ifnet *ifp) 1157 { 1158 int i, is_hw, hw, sw, lim; 1159 struct nm_bridge *b; 1160 struct netmap_adapter *na; 1161 uint8_t tmp[NM_BDG_MAXPORTS]; 1162 1163 NMG_LOCK_ASSERT(); 1164 /* I can be called not only for get_ifp()-ed references where netmap's 1165 * capability is guaranteed, but also for non-netmap-capable NICs. 1166 */ 1167 if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { 1168 if_rele(ifp); 1169 return; 1170 } 1171 na = NA(ifp); 1172 b = na->na_bdg; 1173 is_hw = nma_is_hw(na); 1174 1175 ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); 1176 1177 if (!DROP_BDG_REF(ifp)) 1178 return; 1179 1180 /* 1181 New algorithm: 1182 make a copy of bdg_port_index; 1183 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port 1184 in the array of bdg_port_index, replacing them with 1185 entries from the bottom of the array; 1186 decrement bdg_active_ports; 1187 acquire BDG_WLOCK() and copy back the array. 1188 */ 1189 1190 hw = NA(ifp)->bdg_port; 1191 sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; 1192 lim = b->bdg_active_ports; 1193 1194 ND("detach %d and %d (lim %d)", hw, sw, lim); 1195 /* make a copy of the list of active ports, update it, 1196 * and then copy back within BDG_WLOCK(). 1197 */ 1198 memcpy(tmp, b->bdg_port_index, sizeof(tmp)); 1199 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { 1200 if (hw >= 0 && tmp[i] == hw) { 1201 ND("detach hw %d at %d", hw, i); 1202 lim--; /* point to last active port */ 1203 tmp[i] = tmp[lim]; /* swap with i */ 1204 tmp[lim] = hw; /* now this is inactive */ 1205 hw = -1; 1206 } else if (sw >= 0 && tmp[i] == sw) { 1207 ND("detach sw %d at %d", sw, i); 1208 lim--; 1209 tmp[i] = tmp[lim]; 1210 tmp[lim] = sw; 1211 sw = -1; 1212 } else { 1213 i++; 1214 } 1215 } 1216 if (hw >= 0 || sw >= 0) { 1217 D("XXX delete failed hw %d sw %d, should panic...", hw, sw); 1218 } 1219 hw = NA(ifp)->bdg_port; 1220 sw = (is_hw && SWNA(ifp)->na_bdg) ? 
SWNA(ifp)->bdg_port : -1; 1221 1222 BDG_WLOCK(b); 1223 b->bdg_ports[hw] = NULL; 1224 na->na_bdg = NULL; 1225 if (sw >= 0) { 1226 b->bdg_ports[sw] = NULL; 1227 SWNA(ifp)->na_bdg = NULL; 1228 } 1229 memcpy(b->bdg_port_index, tmp, sizeof(tmp)); 1230 b->bdg_active_ports = lim; 1231 BDG_WUNLOCK(b); 1232 1233 ND("now %d active ports", lim); 1234 if (lim == 0) { 1235 ND("marking bridge %s as free", b->bdg_basename); 1236 b->nm_bdg_lookup = NULL; 1237 } 1238 1239 if (is_hw) { 1240 if_rele(ifp); 1241 } else { 1242 if (na->na_flags & NAF_MEM_OWNER) 1243 netmap_mem_private_delete(na->nm_mem); 1244 bzero(na, sizeof(*na)); 1245 free(na, M_DEVBUF); 1246 bzero(ifp, sizeof(*ifp)); 1247 free(ifp, M_DEVBUF); 1248 } 1249 } 1250 1251 1252 /* 1253 * returns 1 if this is the last instance and we can free priv 1254 */ 1255 static int 1256 netmap_dtor_locked(struct netmap_priv_d *priv) 1257 { 1258 struct ifnet *ifp = priv->np_ifp; 1259 1260 #ifdef __FreeBSD__ 1261 /* 1262 * np_refcount is the number of active mmaps on 1263 * this file descriptor 1264 */ 1265 if (--priv->np_refcount > 0) { 1266 return 0; 1267 } 1268 #endif /* __FreeBSD__ */ 1269 if (ifp) { 1270 netmap_do_unregif(priv, priv->np_nifp); 1271 } 1272 netmap_drop_memory_locked(priv); 1273 if (ifp) { 1274 nm_if_rele(ifp); /* might also destroy *na */ 1275 } 1276 return 1; 1277 } 1278 1279 static void 1280 netmap_dtor(void *data) 1281 { 1282 struct netmap_priv_d *priv = data; 1283 int last_instance; 1284 1285 NMG_LOCK(); 1286 last_instance = netmap_dtor_locked(priv); 1287 NMG_UNLOCK(); 1288 if (last_instance) { 1289 bzero(priv, sizeof(*priv)); /* for safety */ 1290 free(priv, M_DEVBUF); 1291 } 1292 } 1293 1294 1295 #ifdef __FreeBSD__ 1296 1297 /* 1298 * In order to track whether pages are still mapped, we hook into 1299 * the standard cdev_pager and intercept the constructor and 1300 * destructor. 1301 */ 1302 1303 struct netmap_vm_handle_t { 1304 struct cdev *dev; 1305 struct netmap_priv_d *priv; 1306 }; 1307 1308 static int 1309 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, 1310 vm_ooffset_t foff, struct ucred *cred, u_short *color) 1311 { 1312 struct netmap_vm_handle_t *vmh = handle; 1313 D("handle %p size %jd prot %d foff %jd", 1314 handle, (intmax_t)size, prot, (intmax_t)foff); 1315 dev_ref(vmh->dev); 1316 return 0; 1317 } 1318 1319 1320 static void 1321 netmap_dev_pager_dtor(void *handle) 1322 { 1323 struct netmap_vm_handle_t *vmh = handle; 1324 struct cdev *dev = vmh->dev; 1325 struct netmap_priv_d *priv = vmh->priv; 1326 D("handle %p", handle); 1327 netmap_dtor(priv); 1328 free(vmh, M_DEVBUF); 1329 dev_rel(dev); 1330 } 1331 1332 static int 1333 netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, 1334 int prot, vm_page_t *mres) 1335 { 1336 struct netmap_vm_handle_t *vmh = object->handle; 1337 struct netmap_priv_d *priv = vmh->priv; 1338 vm_paddr_t paddr; 1339 vm_page_t page; 1340 vm_memattr_t memattr; 1341 vm_pindex_t pidx; 1342 1343 ND("object %p offset %jd prot %d mres %p", 1344 object, (intmax_t)offset, prot, mres); 1345 memattr = object->memattr; 1346 pidx = OFF_TO_IDX(offset); 1347 paddr = netmap_mem_ofstophys(priv->np_mref, offset); 1348 if (paddr == 0) 1349 return VM_PAGER_FAIL; 1350 1351 if (((*mres)->flags & PG_FICTITIOUS) != 0) { 1352 /* 1353 * If the passed in result page is a fake page, update it with 1354 * the new physical address. 
1355 */ 1356 page = *mres; 1357 vm_page_updatefake(page, paddr, memattr); 1358 } else { 1359 /* 1360 * Replace the passed in reqpage page with our own fake page and 1361 * free up the all of the original pages. 1362 */ 1363 #ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ 1364 #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK 1365 #define VM_OBJECT_WLOCK VM_OBJECT_LOCK 1366 #endif /* VM_OBJECT_WUNLOCK */ 1367 1368 VM_OBJECT_WUNLOCK(object); 1369 page = vm_page_getfake(paddr, memattr); 1370 VM_OBJECT_WLOCK(object); 1371 vm_page_lock(*mres); 1372 vm_page_free(*mres); 1373 vm_page_unlock(*mres); 1374 *mres = page; 1375 vm_page_insert(page, object, pidx); 1376 } 1377 page->valid = VM_PAGE_BITS_ALL; 1378 return (VM_PAGER_OK); 1379 } 1380 1381 1382 static struct cdev_pager_ops netmap_cdev_pager_ops = { 1383 .cdev_pg_ctor = netmap_dev_pager_ctor, 1384 .cdev_pg_dtor = netmap_dev_pager_dtor, 1385 .cdev_pg_fault = netmap_dev_pager_fault, 1386 }; 1387 1388 1389 static int 1390 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, 1391 vm_size_t objsize, vm_object_t *objp, int prot) 1392 { 1393 int error; 1394 struct netmap_vm_handle_t *vmh; 1395 struct netmap_priv_d *priv; 1396 vm_object_t obj; 1397 1398 D("cdev %p foff %jd size %jd objp %p prot %d", cdev, 1399 (intmax_t )*foff, (intmax_t )objsize, objp, prot); 1400 1401 vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, 1402 M_NOWAIT | M_ZERO); 1403 if (vmh == NULL) 1404 return ENOMEM; 1405 vmh->dev = cdev; 1406 1407 NMG_LOCK(); 1408 error = devfs_get_cdevpriv((void**)&priv); 1409 if (error) 1410 goto err_unlock; 1411 vmh->priv = priv; 1412 priv->np_refcount++; 1413 NMG_UNLOCK(); 1414 1415 error = netmap_get_memory(priv); 1416 if (error) 1417 goto err_deref; 1418 1419 obj = cdev_pager_allocate(vmh, OBJT_DEVICE, 1420 &netmap_cdev_pager_ops, objsize, prot, 1421 *foff, NULL); 1422 if (obj == NULL) { 1423 D("cdev_pager_allocate failed"); 1424 error = EINVAL; 1425 goto err_deref; 1426 } 1427 1428 *objp = obj; 1429 return 0; 1430 1431 err_deref: 1432 NMG_LOCK(); 1433 priv->np_refcount--; 1434 err_unlock: 1435 NMG_UNLOCK(); 1436 // err: 1437 free(vmh, M_DEVBUF); 1438 return error; 1439 } 1440 1441 1442 // XXX can we remove this ? 1443 static int 1444 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) 1445 { 1446 if (netmap_verbose) 1447 D("dev %p fflag 0x%x devtype %d td %p", 1448 dev, fflag, devtype, td); 1449 return 0; 1450 } 1451 1452 1453 static int 1454 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 1455 { 1456 struct netmap_priv_d *priv; 1457 int error; 1458 1459 (void)dev; 1460 (void)oflags; 1461 (void)devtype; 1462 (void)td; 1463 1464 // XXX wait or nowait ? 1465 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 1466 M_NOWAIT | M_ZERO); 1467 if (priv == NULL) 1468 return ENOMEM; 1469 1470 error = devfs_set_cdevpriv(priv, netmap_dtor); 1471 if (error) 1472 return error; 1473 1474 priv->np_refcount = 1; 1475 1476 return 0; 1477 } 1478 #endif /* __FreeBSD__ */ 1479 1480 1481 /* 1482 * Handlers for synchronization of the queues from/to the host. 1483 * Netmap has two operating modes: 1484 * - in the default mode, the rings connected to the host stack are 1485 * just another ring pair managed by userspace; 1486 * - in transparent mode (XXX to be defined) incoming packets 1487 * (from the host or the NIC) are marked as NS_FORWARD upon 1488 * arrival, and the user application has a chance to reset the 1489 * flag for packets that should be dropped. 
1490 * On the RXSYNC or poll(), packets in RX rings between 1491 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 1492 * to the other side. 1493 * The transfer NIC --> host is relatively easy, just encapsulate 1494 * into mbufs and we are done. The host --> NIC side is slightly 1495 * harder because there might not be room in the tx ring so it 1496 * might take a while before releasing the buffer. 1497 */ 1498 1499 1500 /* 1501 * pass a chain of buffers to the host stack as coming from 'dst' 1502 */ 1503 static void 1504 netmap_send_up(struct ifnet *dst, struct mbuf *head) 1505 { 1506 struct mbuf *m; 1507 1508 /* send packets up, outside the lock */ 1509 while ((m = head) != NULL) { 1510 head = head->m_nextpkt; 1511 m->m_nextpkt = NULL; 1512 if (netmap_verbose & NM_VERB_HOST) 1513 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 1514 NM_SEND_UP(dst, m); 1515 } 1516 } 1517 1518 struct mbq { 1519 struct mbuf *head; 1520 struct mbuf *tail; 1521 int count; 1522 }; 1523 1524 1525 /* 1526 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 1527 * Run from hwcur to cur - reserved 1528 */ 1529 static void 1530 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 1531 { 1532 /* Take packets from hwcur to cur-reserved and pass them up. 1533 * In case of no buffers we give up. At the end of the loop, 1534 * the queue is drained in all cases. 1535 * XXX handle reserved 1536 */ 1537 u_int lim = kring->nkr_num_slots - 1; 1538 struct mbuf *m, *tail = q->tail; 1539 u_int k = kring->ring->cur, n = kring->ring->reserved; 1540 struct netmap_mem_d *nmd = kring->na->nm_mem; 1541 1542 /* compute the final position, ring->cur - ring->reserved */ 1543 if (n > 0) { 1544 if (k < n) 1545 k += kring->nkr_num_slots; 1546 k += n; 1547 } 1548 for (n = kring->nr_hwcur; n != k;) { 1549 struct netmap_slot *slot = &kring->ring->slot[n]; 1550 1551 n = nm_next(n, lim); 1552 if ((slot->flags & NS_FORWARD) == 0 && !force) 1553 continue; 1554 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { 1555 D("bad pkt at %d len %d", n, slot->len); 1556 continue; 1557 } 1558 slot->flags &= ~NS_FORWARD; // XXX needed ? 1559 /* XXX adapt to the case of a multisegment packet */ 1560 m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); 1561 1562 if (m == NULL) 1563 break; 1564 if (tail) 1565 tail->m_nextpkt = m; 1566 else 1567 q->head = m; 1568 tail = m; 1569 q->count++; 1570 m->m_nextpkt = NULL; 1571 } 1572 q->tail = tail; 1573 } 1574 1575 1576 /* 1577 * The host ring has packets from nr_hwcur to (cur - reserved) 1578 * to be sent down to the NIC. 1579 * We need to use the queue lock on the source (host RX ring) 1580 * to protect against netmap_transmit. 1581 * If the user is well behaved we do not need to acquire locks 1582 * on the destination(s), 1583 * so we only need to make sure that there are no panics because 1584 * of user errors. 1585 * XXX verify 1586 * 1587 * We scan the tx rings, which have just been 1588 * flushed so nr_hwcur == cur. Pushing packets down means 1589 * increment cur and decrement avail. 
1590 * XXX to be verified 1591 */ 1592 static void 1593 netmap_sw_to_nic(struct netmap_adapter *na) 1594 { 1595 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1596 struct netmap_kring *k1 = &na->tx_rings[0]; 1597 u_int i, howmany, src_lim, dst_lim; 1598 1599 /* XXX we should also check that the carrier is on */ 1600 if (kring->nkr_stopped) 1601 return; 1602 1603 mtx_lock(&kring->q_lock); 1604 1605 if (kring->nkr_stopped) 1606 goto out; 1607 1608 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 1609 1610 src_lim = kring->nkr_num_slots - 1; 1611 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 1612 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 1613 dst_lim = k1->nkr_num_slots - 1; 1614 while (howmany > 0 && k1->ring->avail > 0) { 1615 struct netmap_slot *src, *dst, tmp; 1616 src = &kring->ring->slot[kring->nr_hwcur]; 1617 dst = &k1->ring->slot[k1->ring->cur]; 1618 tmp = *src; 1619 src->buf_idx = dst->buf_idx; 1620 src->flags = NS_BUF_CHANGED; 1621 1622 dst->buf_idx = tmp.buf_idx; 1623 dst->len = tmp.len; 1624 dst->flags = NS_BUF_CHANGED; 1625 ND("out len %d buf %d from %d to %d", 1626 dst->len, dst->buf_idx, 1627 kring->nr_hwcur, k1->ring->cur); 1628 1629 kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 1630 howmany--; 1631 kring->nr_hwavail--; 1632 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 1633 k1->ring->avail--; 1634 } 1635 kring->ring->cur = kring->nr_hwcur; // XXX 1636 k1++; // XXX why? 1637 } 1638 out: 1639 mtx_unlock(&kring->q_lock); 1640 } 1641 1642 1643 /* 1644 * netmap_txsync_to_host() passes packets up. We are called from a 1645 * system call in user process context, and the only contention 1646 * can be among multiple user threads erroneously calling 1647 * this routine concurrently. 1648 */ 1649 static void 1650 netmap_txsync_to_host(struct netmap_adapter *na) 1651 { 1652 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 1653 struct netmap_ring *ring = kring->ring; 1654 u_int k, lim = kring->nkr_num_slots - 1; 1655 struct mbq q = { NULL, NULL, 0 }; 1656 1657 if (nm_kr_tryget(kring)) { 1658 D("ring %p busy (user error)", kring); 1659 return; 1660 } 1661 k = ring->cur; 1662 if (k > lim) { 1663 D("invalid ring index in stack TX kring %p", kring); 1664 netmap_ring_reinit(kring); 1665 nm_kr_put(kring); 1666 return; 1667 } 1668 1669 /* Take packets from hwcur to cur and pass them up. 1670 * In case of no buffers we give up. At the end of the loop, 1671 * the queue is drained in all cases. 1672 */ 1673 netmap_grab_packets(kring, &q, 1); 1674 kring->nr_hwcur = k; 1675 kring->nr_hwavail = ring->avail = lim; 1676 1677 nm_kr_put(kring); 1678 netmap_send_up(na->ifp, q.head); 1679 } 1680 1681 1682 /* 1683 * This is the 'txsync' handler to send from a software ring to the 1684 * host stack. 1685 */ 1686 /* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ 1687 static int 1688 netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) 1689 { 1690 (void)ring_nr; 1691 (void)flags; 1692 if (netmap_verbose > 255) 1693 RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); 1694 netmap_txsync_to_host(NA(ifp)); 1695 return 0; 1696 } 1697 1698 1699 /* 1700 * rxsync backend for packets coming from the host stack. 1701 * They have been put in the queue by netmap_transmit() so we 1702 * need to protect access to the kring using a lock. 1703 * 1704 * This routine also does the selrecord if called from the poll handler 1705 * (we know because td != NULL). 
1706 * 1707 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 1708 * as an additional hidden argument. 1709 */ 1710 static void 1711 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 1712 { 1713 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1714 struct netmap_ring *ring = kring->ring; 1715 u_int j, n, lim = kring->nkr_num_slots; 1716 u_int k = ring->cur, resvd = ring->reserved; 1717 1718 (void)pwait; /* disable unused warnings */ 1719 1720 if (kring->nkr_stopped) /* check a first time without lock */ 1721 return; 1722 1723 /* XXX as an optimization we could reuse na->core_lock */ 1724 mtx_lock(&kring->q_lock); 1725 1726 if (kring->nkr_stopped) /* check again with lock held */ 1727 goto unlock_out; 1728 1729 if (k >= lim) { 1730 netmap_ring_reinit(kring); 1731 goto unlock_out; 1732 } 1733 /* new packets are already set in nr_hwavail */ 1734 /* skip past packets that userspace has released */ 1735 j = kring->nr_hwcur; 1736 if (resvd > 0) { 1737 if (resvd + ring->avail >= lim + 1) { 1738 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 1739 ring->reserved = resvd = 0; // XXX panic... 1740 } 1741 k = (k >= resvd) ? k - resvd : k + lim - resvd; 1742 } 1743 if (j != k) { 1744 n = k >= j ? k - j : k + lim - j; 1745 kring->nr_hwavail -= n; 1746 kring->nr_hwcur = k; 1747 } 1748 k = ring->avail = kring->nr_hwavail - resvd; 1749 if (k == 0 && td) 1750 selrecord(td, &kring->si); 1751 if (k && (netmap_verbose & NM_VERB_HOST)) 1752 D("%d pkts from stack", k); 1753 unlock_out: 1754 1755 mtx_unlock(&kring->q_lock); 1756 } 1757 1758 1759 /* 1760 * MUST BE CALLED UNDER NMG_LOCK() 1761 * 1762 * get a refcounted reference to an interface. 1763 * This is always called in the execution of an ioctl(). 1764 * 1765 * Return ENXIO if the interface does not exist, EINVAL if netmap 1766 * is not supported by the interface. 1767 * If successful, hold a reference. 1768 * 1769 * When the NIC is attached to a bridge, reference is managed 1770 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1771 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1772 * is detached from the bridge, then ifp's refcount is dropped (this 1773 * is equivalent to that ifp is destroyed in case of virtual ports. 1774 * 1775 * This function uses if_rele() when we want to prevent the NIC from 1776 * being detached from the bridge in error handling. But once refcount 1777 * is acquired by this function, it must be released using nm_if_rele(). 1778 */ 1779 static int 1780 get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) 1781 { 1782 const char *name = nmr->nr_name; 1783 int namelen = strlen(name); 1784 struct ifnet *iter = NULL; 1785 int no_prefix = 0; 1786 1787 /* first try to see if this is a bridge port. */ 1788 struct nm_bridge *b; 1789 struct netmap_adapter *na; 1790 int i, j, cand = -1, cand2 = -1; 1791 int needed; 1792 1793 NMG_LOCK_ASSERT(); 1794 *ifp = NULL; /* default */ 1795 if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { 1796 no_prefix = 1; /* no VALE prefix */ 1797 goto no_bridge_port; 1798 } 1799 1800 b = nm_find_bridge(name, create); 1801 if (b == NULL) { 1802 D("no bridges available for '%s'", name); 1803 return (ENXIO); 1804 } 1805 1806 /* Now we are sure that name starts with the bridge's name, 1807 * lookup the port in the bridge. We need to scan the entire 1808 * list. 
It is not important to hold a WLOCK on the bridge 1809 * during the search because NMG_LOCK already guarantees 1810 * that there are no other possible writers. 1811 */ 1812 1813 /* lookup in the local list of ports */ 1814 for (j = 0; j < b->bdg_active_ports; j++) { 1815 i = b->bdg_port_index[j]; 1816 na = b->bdg_ports[i]; 1817 // KASSERT(na != NULL); 1818 iter = na->ifp; 1819 /* XXX make sure the name only contains one : */ 1820 if (!strcmp(iter->if_xname, name) /* virtual port */ || 1821 (namelen > b->bdg_namelen && !strcmp(iter->if_xname, 1822 name + b->bdg_namelen + 1)) /* NIC */) { 1823 ADD_BDG_REF(iter); 1824 ND("found existing if %s refs %d", name, 1825 NA(iter)->na_bdg_refcount); 1826 *ifp = iter; 1827 /* we are done, this is surely netmap capable */ 1828 return 0; 1829 } 1830 } 1831 /* not found, should we create it? */ 1832 if (!create) 1833 return ENXIO; 1834 /* yes we should, see if we have space to attach entries */ 1835 needed = 2; /* in some cases we only need 1 */ 1836 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { 1837 D("bridge full %d, cannot create new port", b->bdg_active_ports); 1838 return EINVAL; 1839 } 1840 /* record the next two ports available, but do not allocate yet */ 1841 cand = b->bdg_port_index[b->bdg_active_ports]; 1842 cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; 1843 ND("+++ bridge %s port %s used %d avail %d %d", 1844 b->bdg_basename, name, b->bdg_active_ports, cand, cand2); 1845 1846 /* 1847 * try see if there is a matching NIC with this name 1848 * (after the bridge's name) 1849 */ 1850 iter = ifunit_ref(name + b->bdg_namelen + 1); 1851 if (!iter) { /* this is a virtual port */ 1852 /* Create a temporary NA with arguments, then 1853 * bdg_netmap_attach() will allocate the real one 1854 * and attach it to the ifp 1855 */ 1856 struct netmap_adapter tmp_na; 1857 int error; 1858 1859 if (nmr->nr_cmd) { 1860 /* nr_cmd must be 0 for a virtual port */ 1861 return EINVAL; 1862 } 1863 bzero(&tmp_na, sizeof(tmp_na)); 1864 /* bound checking */ 1865 tmp_na.num_tx_rings = nmr->nr_tx_rings; 1866 nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1867 nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back 1868 tmp_na.num_rx_rings = nmr->nr_rx_rings; 1869 nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1870 nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back 1871 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1872 1, NM_BDG_MAXSLOTS, NULL); 1873 tmp_na.num_tx_desc = nmr->nr_tx_slots; 1874 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1875 1, NM_BDG_MAXSLOTS, NULL); 1876 tmp_na.num_rx_desc = nmr->nr_rx_slots; 1877 1878 /* create a struct ifnet for the new port. 
1879 * need M_NOWAIT as we are under nma_lock 1880 */ 1881 iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); 1882 if (!iter) 1883 return ENOMEM; 1884 1885 strcpy(iter->if_xname, name); 1886 tmp_na.ifp = iter; 1887 /* bdg_netmap_attach creates a struct netmap_adapter */ 1888 error = bdg_netmap_attach(&tmp_na); 1889 if (error) { 1890 D("error %d", error); 1891 free(iter, M_DEVBUF); 1892 return error; 1893 } 1894 cand2 = -1; /* only need one port */ 1895 } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ 1896 /* make sure the NIC is not already in use */ 1897 if (NETMAP_OWNED_BY_ANY(iter)) { 1898 D("NIC %s busy, cannot attach to bridge", 1899 iter->if_xname); 1900 if_rele(iter); /* don't detach from bridge */ 1901 return EINVAL; 1902 } 1903 if (nmr->nr_arg1 != NETMAP_BDG_HOST) 1904 cand2 = -1; /* only need one port */ 1905 } else { /* not a netmap-capable NIC */ 1906 if_rele(iter); /* don't detach from bridge */ 1907 return EINVAL; 1908 } 1909 na = NA(iter); 1910 1911 BDG_WLOCK(b); 1912 na->bdg_port = cand; 1913 ND("NIC %p to bridge port %d", NA(iter), cand); 1914 /* bind the port to the bridge (virtual ports are not active) */ 1915 b->bdg_ports[cand] = na; 1916 na->na_bdg = b; 1917 b->bdg_active_ports++; 1918 if (cand2 >= 0) { 1919 /* also bind the host stack to the bridge */ 1920 b->bdg_ports[cand2] = SWNA(iter); 1921 SWNA(iter)->bdg_port = cand2; 1922 SWNA(iter)->na_bdg = b; 1923 b->bdg_active_ports++; 1924 ND("host %p to bridge port %d", SWNA(iter), cand2); 1925 } 1926 ADD_BDG_REF(iter); // XXX one or two ? 1927 ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); 1928 BDG_WUNLOCK(b); 1929 *ifp = iter; 1930 return 0; 1931 1932 no_bridge_port: 1933 *ifp = iter; 1934 if (! *ifp) 1935 *ifp = ifunit_ref(name); 1936 if (*ifp == NULL) 1937 return (ENXIO); 1938 1939 if (NETMAP_CAPABLE(*ifp)) { 1940 /* Users cannot use the NIC attached to a bridge directly */ 1941 if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { 1942 if_rele(*ifp); /* don't detach from bridge */ 1943 return EINVAL; 1944 } else 1945 return 0; /* valid pointer, we hold the refcount */ 1946 } 1947 nm_if_rele(*ifp); 1948 return EINVAL; // not NETMAP capable 1949 } 1950 1951 1952 /* 1953 * Error routine called when txsync/rxsync detects an error. 1954 * Can't do much more than resetting cur = hwcur, avail = hwavail. 1955 * Return 1 on reinit. 1956 * 1957 * This routine is only called by the upper half of the kernel. 1958 * It only reads hwcur (which is changed only by the upper half, too) 1959 * and hwavail (which may be changed by the lower half, but only on 1960 * a tx ring and only to increase it, so any error will be recovered 1961 * on the next call). For the above, we don't strictly need to call 1962 * it under lock. 
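 *
 * A driver's own *xsync() is expected to fall back to this routine as
 * soon as it detects an inconsistent ring. A minimal sketch (hypothetical
 * driver code, not taken from any particular driver):
 *
 *	u_int k = ring->cur;			// index exported to userspace
 *	if (k > lim)				// out of range, ring corrupted
 *		return netmap_ring_reinit(kring);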
1963 */ 1964 int 1965 netmap_ring_reinit(struct netmap_kring *kring) 1966 { 1967 struct netmap_ring *ring = kring->ring; 1968 u_int i, lim = kring->nkr_num_slots - 1; 1969 int errors = 0; 1970 1971 // XXX KASSERT nm_kr_tryget 1972 RD(10, "called for %s", kring->na->ifp->if_xname); 1973 if (ring->cur > lim) 1974 errors++; 1975 for (i = 0; i <= lim; i++) { 1976 u_int idx = ring->slot[i].buf_idx; 1977 u_int len = ring->slot[i].len; 1978 if (idx < 2 || idx >= netmap_total_buffers) { 1979 if (!errors++) 1980 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 1981 ring->slot[i].buf_idx = 0; 1982 ring->slot[i].len = 0; 1983 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1984 ring->slot[i].len = 0; 1985 if (!errors++) 1986 D("bad len %d at slot %d idx %d", 1987 len, i, idx); 1988 } 1989 } 1990 if (errors) { 1991 int pos = kring - kring->na->tx_rings; 1992 int n = kring->na->num_tx_rings + 1; 1993 1994 RD(10, "total %d errors", errors); 1995 errors++; 1996 RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 1997 kring->na->ifp->if_xname, 1998 pos < n ? "TX" : "RX", pos < n ? pos : pos - n, 1999 ring->cur, kring->nr_hwcur, 2000 ring->avail, kring->nr_hwavail); 2001 ring->cur = kring->nr_hwcur; 2002 ring->avail = kring->nr_hwavail; 2003 } 2004 return (errors ? 1 : 0); 2005 } 2006 2007 2008 /* 2009 * Set the ring ID. For devices with a single queue, a request 2010 * for all rings is the same as a single ring. 2011 */ 2012 static int 2013 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 2014 { 2015 struct ifnet *ifp = priv->np_ifp; 2016 struct netmap_adapter *na = NA(ifp); 2017 u_int i = ringid & NETMAP_RING_MASK; 2018 /* initially (np_qfirst == np_qlast) we don't want to lock */ 2019 u_int lim = na->num_rx_rings; 2020 2021 if (na->num_tx_rings > lim) 2022 lim = na->num_tx_rings; 2023 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 2024 D("invalid ring id %d", i); 2025 return (EINVAL); 2026 } 2027 priv->np_ringid = ringid; 2028 if (ringid & NETMAP_SW_RING) { 2029 priv->np_qfirst = NETMAP_SW_RING; 2030 priv->np_qlast = 0; 2031 } else if (ringid & NETMAP_HW_RING) { 2032 priv->np_qfirst = i; 2033 priv->np_qlast = i + 1; 2034 } else { 2035 priv->np_qfirst = 0; 2036 priv->np_qlast = NETMAP_HW_RING ; 2037 } 2038 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 2039 if (netmap_verbose) { 2040 if (ringid & NETMAP_SW_RING) 2041 D("ringid %s set to SW RING", ifp->if_xname); 2042 else if (ringid & NETMAP_HW_RING) 2043 D("ringid %s set to HW RING %d", ifp->if_xname, 2044 priv->np_qfirst); 2045 else 2046 D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); 2047 } 2048 return 0; 2049 } 2050 2051 2052 /* 2053 * possibly move the interface to netmap-mode. 2054 * If success it returns a pointer to netmap_if, otherwise NULL. 2055 * This must be called with NMG_LOCK held. 
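 *
 * The callers in this file (the NIOCREGIF case of netmap_ioctl() and
 * nm_bdg_attach()) follow roughly this pattern:
 *
 *	NMG_LOCK();
 *	error = get_ifp(nmr, &ifp, 1);		// grab a reference
 *	if (error == 0)
 *		nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
 *	NMG_UNLOCK();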
2056 */ 2057 static struct netmap_if * 2058 netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, 2059 uint16_t ringid, int *err) 2060 { 2061 struct netmap_adapter *na = NA(ifp); 2062 struct netmap_if *nifp = NULL; 2063 int error, need_mem; 2064 2065 NMG_LOCK_ASSERT(); 2066 /* ring configuration may have changed, fetch from the card */ 2067 netmap_update_config(na); 2068 priv->np_ifp = ifp; /* store the reference */ 2069 error = netmap_set_ringid(priv, ringid); 2070 if (error) 2071 goto out; 2072 /* ensure allocators are ready */ 2073 need_mem = !netmap_have_memory_locked(priv); 2074 if (need_mem) { 2075 error = netmap_get_memory_locked(priv); 2076 ND("get_memory returned %d", error); 2077 if (error) 2078 goto out; 2079 } 2080 nifp = netmap_if_new(ifp->if_xname, na); 2081 if (nifp == NULL) { /* allocation failed */ 2082 /* we should drop the allocator, but only 2083 * if we were the ones who grabbed it 2084 */ 2085 if (need_mem) 2086 netmap_drop_memory_locked(priv); 2087 error = ENOMEM; 2088 goto out; 2089 } 2090 na->refcount++; 2091 if (ifp->if_capenable & IFCAP_NETMAP) { 2092 /* was already set */ 2093 } else { 2094 u_int i; 2095 /* Otherwise set the card in netmap mode 2096 * and make it use the shared buffers. 2097 * 2098 * If the interface is attached to a bridge, lock it. 2099 */ 2100 if (NETMAP_OWNED_BY_KERN(ifp)) 2101 BDG_WLOCK(NA(ifp)->na_bdg); 2102 for (i = 0 ; i < na->num_tx_rings + 1; i++) 2103 mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", 2104 NULL, MTX_DEF); 2105 for (i = 0 ; i < na->num_rx_rings + 1; i++) { 2106 mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", 2107 NULL, MTX_DEF); 2108 } 2109 if (nma_is_hw(na)) { 2110 SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; 2111 SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; 2112 } 2113 /* 2114 * do not core lock because the race is harmless here, 2115 * there cannot be any traffic to netmap_transmit() 2116 */ 2117 error = na->nm_register(ifp, 1); /* mode on */ 2118 // XXX do we need to nm_alloc_bdgfwd() in all cases ? 2119 if (!error) 2120 error = nm_alloc_bdgfwd(na); 2121 if (error) { 2122 netmap_do_unregif(priv, nifp); 2123 nifp = NULL; 2124 } 2125 if (NETMAP_OWNED_BY_KERN(ifp)) 2126 BDG_WUNLOCK(NA(ifp)->na_bdg); 2127 2128 } 2129 out: 2130 *err = error; 2131 if (nifp != NULL) { 2132 /* 2133 * advertise that the interface is ready bt setting ni_nifp. 2134 * The barrier is needed because readers (poll and *SYNC) 2135 * check for priv->np_nifp != NULL without locking 2136 */ 2137 wmb(); /* make sure previous writes are visible to all CPUs */ 2138 priv->np_nifp = nifp; 2139 } 2140 return nifp; 2141 } 2142 2143 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ 2144 static int 2145 nm_bdg_attach(struct nmreq *nmr) 2146 { 2147 struct ifnet *ifp; 2148 struct netmap_if *nifp; 2149 struct netmap_priv_d *npriv; 2150 int error; 2151 2152 npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); 2153 if (npriv == NULL) 2154 return ENOMEM; 2155 NMG_LOCK(); 2156 error = get_ifp(nmr, &ifp, 1 /* create if not exists */); 2157 if (error) /* no device, or another bridge or user owns the device */ 2158 goto unlock_exit; 2159 /* get_ifp() sets na_bdg if this is a physical interface 2160 * that we can attach to a switch. 2161 */ 2162 if (!NETMAP_OWNED_BY_KERN(ifp)) { 2163 /* got reference to a virtual port or direct access to a NIC. 
2164 * perhaps specified no bridge prefix or wrong NIC name 2165 */ 2166 error = EINVAL; 2167 goto unref_exit; 2168 } 2169 2170 if (NA(ifp)->refcount > 0) { /* already registered */ 2171 error = EBUSY; 2172 DROP_BDG_REF(ifp); 2173 goto unlock_exit; 2174 } 2175 2176 nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error); 2177 if (!nifp) { 2178 goto unref_exit; 2179 } 2180 2181 NA(ifp)->na_kpriv = npriv; 2182 NMG_UNLOCK(); 2183 ND("registered %s to netmap-mode", ifp->if_xname); 2184 return 0; 2185 2186 unref_exit: 2187 nm_if_rele(ifp); 2188 unlock_exit: 2189 NMG_UNLOCK(); 2190 bzero(npriv, sizeof(*npriv)); 2191 free(npriv, M_DEVBUF); 2192 return error; 2193 } 2194 2195 static int 2196 nm_bdg_detach(struct nmreq *nmr) 2197 { 2198 struct ifnet *ifp; 2199 int error; 2200 int last_instance; 2201 2202 NMG_LOCK(); 2203 error = get_ifp(nmr, &ifp, 0 /* don't create */); 2204 if (error) { /* no device, or another bridge or user owns the device */ 2205 goto unlock_exit; 2206 } 2207 /* XXX do we need to check this ? */ 2208 if (!NETMAP_OWNED_BY_KERN(ifp)) { 2209 /* got reference to a virtual port or direct access to a NIC. 2210 * perhaps specified no bridge's prefix or wrong NIC's name 2211 */ 2212 error = EINVAL; 2213 goto unref_exit; 2214 } 2215 2216 if (NA(ifp)->refcount == 0) { /* not registered */ 2217 error = EINVAL; 2218 goto unref_exit; 2219 } 2220 2221 DROP_BDG_REF(ifp); /* the one from get_ifp */ 2222 last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */ 2223 NMG_UNLOCK(); 2224 if (!last_instance) { 2225 D("--- error, trying to detach an entry with active mmaps"); 2226 error = EINVAL; 2227 } else { 2228 struct netmap_priv_d *npriv = NA(ifp)->na_kpriv; 2229 NA(ifp)->na_kpriv = NULL; 2230 2231 bzero(npriv, sizeof(*npriv)); 2232 free(npriv, M_DEVBUF); 2233 } 2234 return error; 2235 2236 unref_exit: 2237 nm_if_rele(ifp); 2238 unlock_exit: 2239 NMG_UNLOCK(); 2240 return error; 2241 } 2242 2243 2244 /* Initialize necessary fields of sw adapter located in right after hw's 2245 * one. sw adapter attaches a pair of sw rings of the netmap-mode NIC. 2246 * It is always activated and deactivated at the same tie with the hw's one. 2247 * Thus we don't need refcounting on the sw adapter. 2248 * Regardless of NIC's feature we use separate lock so that anybody can lock 2249 * me independently from the hw adapter. 2250 * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw 2251 */ 2252 static void 2253 netmap_attach_sw(struct ifnet *ifp) 2254 { 2255 struct netmap_adapter *hw_na = NA(ifp); 2256 struct netmap_adapter *na = SWNA(ifp); 2257 2258 na->ifp = ifp; 2259 na->num_rx_rings = na->num_tx_rings = 1; 2260 na->num_tx_desc = hw_na->num_tx_desc; 2261 na->num_rx_desc = hw_na->num_rx_desc; 2262 na->nm_txsync = netmap_bdg_to_host; 2263 /* we use the same memory allocator as the 2264 * the hw adapter */ 2265 na->nm_mem = hw_na->nm_mem; 2266 } 2267 2268 2269 /* exported to kernel callers, e.g. OVS ? 2270 * Entry point. 2271 * Called without NMG_LOCK. 
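 *
 * A kernel client would build an nmreq by hand and call us directly.
 * Minimal sketch to attach a NIC to a switch (the names "valeA" and
 * "em0" are just examples):
 *
 *	struct nmreq nmr;
 *	int error;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	strncpy(nmr.nr_name, "valeA:em0", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	error = netmap_bdg_ctl(&nmr, NULL);	// func is only used by
 *						// NETMAP_BDG_LOOKUP_REG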
2272 */ 2273 int 2274 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) 2275 { 2276 struct nm_bridge *b; 2277 struct netmap_adapter *na; 2278 struct ifnet *iter; 2279 char *name = nmr->nr_name; 2280 int cmd = nmr->nr_cmd, namelen = strlen(name); 2281 int error = 0, i, j; 2282 2283 switch (cmd) { 2284 case NETMAP_BDG_ATTACH: 2285 error = nm_bdg_attach(nmr); 2286 break; 2287 2288 case NETMAP_BDG_DETACH: 2289 error = nm_bdg_detach(nmr); 2290 break; 2291 2292 case NETMAP_BDG_LIST: 2293 /* this is used to enumerate bridges and ports */ 2294 if (namelen) { /* look up indexes of bridge and port */ 2295 if (strncmp(name, NM_NAME, strlen(NM_NAME))) { 2296 error = EINVAL; 2297 break; 2298 } 2299 NMG_LOCK(); 2300 b = nm_find_bridge(name, 0 /* don't create */); 2301 if (!b) { 2302 error = ENOENT; 2303 NMG_UNLOCK(); 2304 break; 2305 } 2306 2307 error = ENOENT; 2308 for (j = 0; j < b->bdg_active_ports; j++) { 2309 i = b->bdg_port_index[j]; 2310 na = b->bdg_ports[i]; 2311 if (na == NULL) { 2312 D("---AAAAAAAAARGH-------"); 2313 continue; 2314 } 2315 iter = na->ifp; 2316 /* the former and the latter identify a 2317 * virtual port and a NIC, respectively 2318 */ 2319 if (!strcmp(iter->if_xname, name) || 2320 (namelen > b->bdg_namelen && 2321 !strcmp(iter->if_xname, 2322 name + b->bdg_namelen + 1))) { 2323 /* bridge index */ 2324 nmr->nr_arg1 = b - nm_bridges; 2325 nmr->nr_arg2 = i; /* port index */ 2326 error = 0; 2327 break; 2328 } 2329 } 2330 NMG_UNLOCK(); 2331 } else { 2332 /* return the first non-empty entry starting from 2333 * bridge nr_arg1 and port nr_arg2. 2334 * 2335 * Users can detect the end of the same bridge by 2336 * seeing the new and old value of nr_arg1, and can 2337 * detect the end of all the bridge by error != 0 2338 */ 2339 i = nmr->nr_arg1; 2340 j = nmr->nr_arg2; 2341 2342 NMG_LOCK(); 2343 for (error = ENOENT; i < NM_BRIDGES; i++) { 2344 b = nm_bridges + i; 2345 if (j >= b->bdg_active_ports) { 2346 j = 0; /* following bridges scan from 0 */ 2347 continue; 2348 } 2349 nmr->nr_arg1 = i; 2350 nmr->nr_arg2 = j; 2351 j = b->bdg_port_index[j]; 2352 na = b->bdg_ports[j]; 2353 iter = na->ifp; 2354 strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); 2355 error = 0; 2356 break; 2357 } 2358 NMG_UNLOCK(); 2359 } 2360 break; 2361 2362 case NETMAP_BDG_LOOKUP_REG: 2363 /* register a lookup function to the given bridge. 2364 * nmr->nr_name may be just bridge's name (including ':' 2365 * if it is not just NM_NAME). 2366 */ 2367 if (!func) { 2368 error = EINVAL; 2369 break; 2370 } 2371 NMG_LOCK(); 2372 b = nm_find_bridge(name, 0 /* don't create */); 2373 if (!b) { 2374 error = EINVAL; 2375 } else { 2376 b->nm_bdg_lookup = func; 2377 } 2378 NMG_UNLOCK(); 2379 break; 2380 2381 default: 2382 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); 2383 error = EINVAL; 2384 break; 2385 } 2386 return error; 2387 } 2388 2389 2390 /* 2391 * ioctl(2) support for the "netmap" device. 2392 * 2393 * Following a list of accepted commands: 2394 * - NIOCGINFO 2395 * - SIOCGIFADDR just for convenience 2396 * - NIOCREGIF 2397 * - NIOCUNREGIF 2398 * - NIOCTXSYNC 2399 * - NIOCRXSYNC 2400 * 2401 * Return 0 on success, errno otherwise. 
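 *
 * For reference, a minimal userspace session driving these ioctls looks
 * roughly as follows (error handling omitted, "em0" is just an example):
 *
 *	struct nmreq nmr;
 *	struct netmap_if *nifp;
 *	void *mem;
 *	int fd;
 *
 *	fd = open("/dev/netmap", O_RDWR);
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name) - 1);
 *	ioctl(fd, NIOCREGIF, &nmr);		// bind the fd to the interface
 *	mem = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *		MAP_SHARED, fd, 0);
 *	nifp = NETMAP_IF(mem, nmr.nr_offset);	// from netmap_user.h
 *	...					// fill tx / drain rx rings
 *	ioctl(fd, NIOCTXSYNC, NULL);		// push pending packets out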
2402 */ 2403 static int 2404 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 2405 int fflag, struct thread *td) 2406 { 2407 struct netmap_priv_d *priv = NULL; 2408 struct ifnet *ifp = NULL; 2409 struct nmreq *nmr = (struct nmreq *) data; 2410 struct netmap_adapter *na = NULL; 2411 int error; 2412 u_int i, lim; 2413 struct netmap_if *nifp; 2414 struct netmap_kring *krings; 2415 2416 (void)dev; /* UNUSED */ 2417 (void)fflag; /* UNUSED */ 2418 #ifdef linux 2419 #define devfs_get_cdevpriv(pp) \ 2420 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 2421 (*pp ? 0 : ENOENT); }) 2422 2423 /* devfs_set_cdevpriv cannot fail on linux */ 2424 #define devfs_set_cdevpriv(p, fn) \ 2425 ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) 2426 2427 2428 #define devfs_clear_cdevpriv() do { \ 2429 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 2430 } while (0) 2431 #endif /* linux */ 2432 2433 CURVNET_SET(TD_TO_VNET(td)); 2434 2435 error = devfs_get_cdevpriv((void **)&priv); 2436 if (error) { 2437 CURVNET_RESTORE(); 2438 /* XXX ENOENT should be impossible, since the priv 2439 * is now created in the open */ 2440 return (error == ENOENT ? ENXIO : error); 2441 } 2442 2443 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 2444 switch (cmd) { 2445 case NIOCGINFO: /* return capabilities etc */ 2446 if (nmr->nr_version != NETMAP_API) { 2447 #ifdef TEST_STUFF 2448 /* some test code for locks etc */ 2449 if (nmr->nr_version == 666) { 2450 error = nm_test(nmr); 2451 break; 2452 } 2453 #endif /* TEST_STUFF */ 2454 D("API mismatch got %d have %d", 2455 nmr->nr_version, NETMAP_API); 2456 nmr->nr_version = NETMAP_API; 2457 error = EINVAL; 2458 break; 2459 } 2460 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 2461 error = netmap_bdg_ctl(nmr, NULL); 2462 break; 2463 } 2464 2465 NMG_LOCK(); 2466 do { 2467 /* memsize is always valid */ 2468 struct netmap_mem_d *nmd = &nm_mem; 2469 u_int memflags; 2470 2471 if (nmr->nr_name[0] != '\0') { 2472 /* get a refcount */ 2473 error = get_ifp(nmr, &ifp, 1 /* create */); 2474 if (error) 2475 break; 2476 na = NA(ifp); /* retrieve the netmap adapter */ 2477 nmd = na->nm_mem; /* and its memory allocator */ 2478 } 2479 2480 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 2481 if (error) 2482 break; 2483 if (na == NULL) /* only memory info */ 2484 break; 2485 nmr->nr_offset = 0; 2486 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 2487 netmap_update_config(na); 2488 nmr->nr_rx_rings = na->num_rx_rings; 2489 nmr->nr_tx_rings = na->num_tx_rings; 2490 nmr->nr_rx_slots = na->num_rx_desc; 2491 nmr->nr_tx_slots = na->num_tx_desc; 2492 if (memflags & NETMAP_MEM_PRIVATE) 2493 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2494 } while (0); 2495 if (ifp) 2496 nm_if_rele(ifp); /* return the refcount */ 2497 NMG_UNLOCK(); 2498 break; 2499 2500 case NIOCREGIF: 2501 if (nmr->nr_version != NETMAP_API) { 2502 nmr->nr_version = NETMAP_API; 2503 error = EINVAL; 2504 break; 2505 } 2506 /* possibly attach/detach NIC and VALE switch */ 2507 i = nmr->nr_cmd; 2508 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { 2509 error = netmap_bdg_ctl(nmr, NULL); 2510 break; 2511 } else if (i != 0) { 2512 D("nr_cmd must be 0 not %d", i); 2513 error = EINVAL; 2514 break; 2515 } 2516 2517 /* protect access to priv from concurrent NIOCREGIF */ 2518 NMG_LOCK(); 2519 do { 2520 u_int memflags; 2521 2522 if (priv->np_ifp != NULL) { /* thread already registered */ 2523 error = netmap_set_ringid(priv, nmr->nr_ringid); 2524 break; 2525 } 2526 /* find the interface and 
a reference */ 2527 error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ 2528 if (error) 2529 break; 2530 if (NETMAP_OWNED_BY_KERN(ifp)) { 2531 nm_if_rele(ifp); 2532 error = EBUSY; 2533 break; 2534 } 2535 nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); 2536 if (!nifp) { /* reg. failed, release priv and ref */ 2537 nm_if_rele(ifp); /* return the refcount */ 2538 priv->np_ifp = NULL; 2539 priv->np_nifp = NULL; 2540 break; 2541 } 2542 2543 /* return the offset of the netmap_if object */ 2544 na = NA(ifp); /* retrieve netmap adapter */ 2545 nmr->nr_rx_rings = na->num_rx_rings; 2546 nmr->nr_tx_rings = na->num_tx_rings; 2547 nmr->nr_rx_slots = na->num_rx_desc; 2548 nmr->nr_tx_slots = na->num_tx_desc; 2549 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 2550 if (error) { 2551 nm_if_rele(ifp); 2552 break; 2553 } 2554 if (memflags & NETMAP_MEM_PRIVATE) { 2555 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2556 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 2557 } 2558 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 2559 } while (0); 2560 NMG_UNLOCK(); 2561 break; 2562 2563 case NIOCUNREGIF: 2564 // XXX we have no data here ? 2565 D("deprecated, data is %p", nmr); 2566 error = EINVAL; 2567 break; 2568 2569 case NIOCTXSYNC: 2570 case NIOCRXSYNC: 2571 nifp = priv->np_nifp; 2572 2573 if (nifp == NULL) { 2574 error = ENXIO; 2575 break; 2576 } 2577 rmb(); /* make sure following reads are not from cache */ 2578 2579 ifp = priv->np_ifp; /* we have a reference */ 2580 2581 if (ifp == NULL) { 2582 D("Internal error: nifp != NULL && ifp == NULL"); 2583 error = ENXIO; 2584 break; 2585 } 2586 2587 na = NA(ifp); /* retrieve netmap adapter */ 2588 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 2589 if (cmd == NIOCTXSYNC) 2590 netmap_txsync_to_host(na); 2591 else 2592 netmap_rxsync_from_host(na, NULL, NULL); 2593 break; 2594 } 2595 /* find the last ring to scan */ 2596 lim = priv->np_qlast; 2597 if (lim == NETMAP_HW_RING) 2598 lim = (cmd == NIOCTXSYNC) ? 2599 na->num_tx_rings : na->num_rx_rings; 2600 2601 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 2602 for (i = priv->np_qfirst; i < lim; i++) { 2603 struct netmap_kring *kring = krings + i; 2604 if (nm_kr_tryget(kring)) { 2605 error = EBUSY; 2606 goto out; 2607 } 2608 if (cmd == NIOCTXSYNC) { 2609 if (netmap_verbose & NM_VERB_TXSYNC) 2610 D("pre txsync ring %d cur %d hwcur %d", 2611 i, kring->ring->cur, 2612 kring->nr_hwcur); 2613 na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); 2614 if (netmap_verbose & NM_VERB_TXSYNC) 2615 D("post txsync ring %d cur %d hwcur %d", 2616 i, kring->ring->cur, 2617 kring->nr_hwcur); 2618 } else { 2619 na->nm_rxsync(ifp, i, NAF_FORCE_READ); 2620 microtime(&na->rx_rings[i].ring->ts); 2621 } 2622 nm_kr_put(kring); 2623 } 2624 2625 break; 2626 2627 #ifdef __FreeBSD__ 2628 case BIOCIMMEDIATE: 2629 case BIOCGHDRCMPLT: 2630 case BIOCSHDRCMPLT: 2631 case BIOCSSEESENT: 2632 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 2633 break; 2634 2635 default: /* allow device-specific ioctls */ 2636 { 2637 struct socket so; 2638 2639 bzero(&so, sizeof(so)); 2640 NMG_LOCK(); 2641 error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ 2642 if (error) { 2643 NMG_UNLOCK(); 2644 break; 2645 } 2646 so.so_vnet = ifp->if_vnet; 2647 // so->so_proto not null. 
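	// forward the request to the interface's own ioctl handler;
	// the zeroed socket built above only carries the vnet, which is
	// presumably all that ifioctl() needs for these device ioctls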
2648 error = ifioctl(&so, cmd, data, td); 2649 nm_if_rele(ifp); 2650 NMG_UNLOCK(); 2651 break; 2652 } 2653 2654 #else /* linux */ 2655 default: 2656 error = EOPNOTSUPP; 2657 #endif /* linux */ 2658 } 2659 out: 2660 2661 CURVNET_RESTORE(); 2662 return (error); 2663 } 2664 2665 2666 /* 2667 * select(2) and poll(2) handlers for the "netmap" device. 2668 * 2669 * Can be called for one or more queues. 2670 * Return true the event mask corresponding to ready events. 2671 * If there are no ready events, do a selrecord on either individual 2672 * selinfo or on the global one. 2673 * Device-dependent parts (locking and sync of tx/rx rings) 2674 * are done through callbacks. 2675 * 2676 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 2677 * The first one is remapped to pwait as selrecord() uses the name as an 2678 * hidden argument. 2679 */ 2680 static int 2681 netmap_poll(struct cdev *dev, int events, struct thread *td) 2682 { 2683 struct netmap_priv_d *priv = NULL; 2684 struct netmap_adapter *na; 2685 struct ifnet *ifp; 2686 struct netmap_kring *kring; 2687 u_int i, check_all, want_tx, want_rx, revents = 0; 2688 u_int lim_tx, lim_rx, host_forwarded = 0; 2689 struct mbq q = { NULL, NULL, 0 }; 2690 void *pwait = dev; /* linux compatibility */ 2691 2692 int retry_tx = 1; 2693 2694 (void)pwait; 2695 2696 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 2697 return POLLERR; 2698 2699 if (priv->np_nifp == NULL) { 2700 D("No if registered"); 2701 return POLLERR; 2702 } 2703 rmb(); /* make sure following reads are not from cache */ 2704 2705 ifp = priv->np_ifp; 2706 // XXX check for deleting() ? 2707 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 2708 return POLLERR; 2709 2710 if (netmap_verbose & 0x8000) 2711 D("device %s events 0x%x", ifp->if_xname, events); 2712 want_tx = events & (POLLOUT | POLLWRNORM); 2713 want_rx = events & (POLLIN | POLLRDNORM); 2714 2715 na = NA(ifp); /* retrieve netmap adapter */ 2716 2717 lim_tx = na->num_tx_rings; 2718 lim_rx = na->num_rx_rings; 2719 2720 if (priv->np_qfirst == NETMAP_SW_RING) { 2721 /* handle the host stack ring */ 2722 if (priv->np_txpoll || want_tx) { 2723 /* push any packets up, then we are always ready */ 2724 netmap_txsync_to_host(na); 2725 revents |= want_tx; 2726 } 2727 if (want_rx) { 2728 kring = &na->rx_rings[lim_rx]; 2729 if (kring->ring->avail == 0) 2730 netmap_rxsync_from_host(na, td, dev); 2731 if (kring->ring->avail > 0) { 2732 revents |= want_rx; 2733 } 2734 } 2735 return (revents); 2736 } 2737 2738 /* if we are in transparent mode, check also the host rx ring */ 2739 kring = &na->rx_rings[lim_rx]; 2740 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2741 && want_rx 2742 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 2743 if (kring->ring->avail == 0) 2744 netmap_rxsync_from_host(na, td, dev); 2745 if (kring->ring->avail > 0) 2746 revents |= want_rx; 2747 } 2748 2749 /* 2750 * check_all is set if the card has more than one queue AND 2751 * the client is polling all of them. If true, we sleep on 2752 * the "global" selinfo, otherwise we sleep on individual selinfo 2753 * (FreeBSD only allows two selinfo's per file descriptor). 2754 * The interrupt routine in the driver wake one or the other 2755 * (or both) depending on which clients are active. 2756 * 2757 * rxsync() is only called if we run out of buffers on a POLLIN. 2758 * txsync() is called if we run out of buffers on POLLOUT, or 2759 * there are pending packets to send. 
The latter can be disabled 2760 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 2761 */ 2762 check_all = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1 || lim_rx > 1); 2763 2764 if (priv->np_qlast != NETMAP_HW_RING) { 2765 lim_tx = lim_rx = priv->np_qlast; 2766 } 2767 2768 /* 2769 * We start with a lock free round which is good if we have 2770 * data available. If this fails, then lock and call the sync 2771 * routines. 2772 */ 2773 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 2774 kring = &na->rx_rings[i]; 2775 if (kring->ring->avail > 0) { 2776 revents |= want_rx; 2777 want_rx = 0; /* also breaks the loop */ 2778 } 2779 } 2780 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 2781 kring = &na->tx_rings[i]; 2782 if (kring->ring->avail > 0) { 2783 revents |= want_tx; 2784 want_tx = 0; /* also breaks the loop */ 2785 } 2786 } 2787 2788 /* 2789 * If we to push packets out (priv->np_txpoll) or want_tx is 2790 * still set, we do need to run the txsync calls (on all rings, 2791 * to avoid that the tx rings stall). 2792 */ 2793 if (priv->np_txpoll || want_tx) { 2794 /* If we really want to be woken up (want_tx), 2795 * do a selrecord, either on the global or on 2796 * the private structure. Then issue the txsync 2797 * so there is no race in the selrecord/selwait 2798 */ 2799 flush_tx: 2800 for (i = priv->np_qfirst; i < lim_tx; i++) { 2801 kring = &na->tx_rings[i]; 2802 /* 2803 * Skip this ring if want_tx == 0 2804 * (we have already done a successful sync on 2805 * a previous ring) AND kring->cur == kring->hwcur 2806 * (there are no pending transmissions for this ring). 2807 */ 2808 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 2809 continue; 2810 /* make sure only one user thread is doing this */ 2811 if (nm_kr_tryget(kring)) { 2812 ND("ring %p busy is %d", kring, (int)kring->nr_busy); 2813 revents |= POLLERR; 2814 goto out; 2815 } 2816 2817 if (netmap_verbose & NM_VERB_TXSYNC) 2818 D("send %d on %s %d", 2819 kring->ring->cur, ifp->if_xname, i); 2820 if (na->nm_txsync(ifp, i, 0)) 2821 revents |= POLLERR; 2822 2823 /* Check avail/call selrecord only if called with POLLOUT */ 2824 if (want_tx) { 2825 if (kring->ring->avail > 0) { 2826 /* stop at the first ring. We don't risk 2827 * starvation. 2828 */ 2829 revents |= want_tx; 2830 want_tx = 0; 2831 } 2832 } 2833 nm_kr_put(kring); 2834 } 2835 if (want_tx && retry_tx) { 2836 selrecord(td, check_all ? 2837 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 2838 retry_tx = 0; 2839 goto flush_tx; 2840 } 2841 } 2842 2843 /* 2844 * now if want_rx is still set we need to lock and rxsync. 2845 * Do it on all rings because otherwise we starve. 2846 */ 2847 if (want_rx) { 2848 int retry_rx = 1; 2849 do_retry_rx: 2850 for (i = priv->np_qfirst; i < lim_rx; i++) { 2851 kring = &na->rx_rings[i]; 2852 2853 if (nm_kr_tryget(kring)) { 2854 revents |= POLLERR; 2855 goto out; 2856 } 2857 2858 /* XXX NR_FORWARD should only be read on 2859 * physical or NIC ports 2860 */ 2861 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2862 ND(10, "forwarding some buffers up %d to %d", 2863 kring->nr_hwcur, kring->ring->cur); 2864 netmap_grab_packets(kring, &q, netmap_fwd); 2865 } 2866 2867 if (na->nm_rxsync(ifp, i, 0)) 2868 revents |= POLLERR; 2869 if (netmap_no_timestamp == 0 || 2870 kring->ring->flags & NR_TIMESTAMP) { 2871 microtime(&kring->ring->ts); 2872 } 2873 2874 if (kring->ring->avail > 0) { 2875 revents |= want_rx; 2876 retry_rx = 0; 2877 } 2878 nm_kr_put(kring); 2879 } 2880 if (retry_rx) { 2881 retry_rx = 0; 2882 selrecord(td, check_all ? 
2883 &na->rx_si : &na->rx_rings[priv->np_qfirst].si); 2884 goto do_retry_rx; 2885 } 2886 } 2887 2888 /* forward host to the netmap ring. 2889 * I am accessing nr_hwavail without lock, but netmap_transmit 2890 * can only increment it, so the operation is safe. 2891 */ 2892 kring = &na->rx_rings[lim_rx]; 2893 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2894 && (netmap_fwd || kring->ring->flags & NR_FORWARD) 2895 && kring->nr_hwavail > 0 && !host_forwarded) { 2896 netmap_sw_to_nic(na); 2897 host_forwarded = 1; /* prevent another pass */ 2898 want_rx = 0; 2899 goto flush_tx; 2900 } 2901 2902 if (q.head) 2903 netmap_send_up(na->ifp, q.head); 2904 2905 out: 2906 2907 return (revents); 2908 } 2909 2910 /*------- driver support routines ------*/ 2911 2912 2913 /* 2914 * Initialize a ``netmap_adapter`` object created by driver on attach. 2915 * We allocate a block of memory with room for a struct netmap_adapter 2916 * plus two sets of N+2 struct netmap_kring (where N is the number 2917 * of hardware rings): 2918 * krings 0..N-1 are for the hardware queues. 2919 * kring N is for the host stack queue 2920 * kring N+1 is only used for the selinfo for all queues. 2921 * Return 0 on success, ENOMEM otherwise. 2922 * 2923 * By default the receive and transmit adapter ring counts are both initialized 2924 * to num_queues. na->num_tx_rings can be set for cards with different tx/rx 2925 * setups. 2926 */ 2927 int 2928 netmap_attach(struct netmap_adapter *arg, u_int num_queues) 2929 { 2930 struct netmap_adapter *na = NULL; 2931 struct ifnet *ifp = arg ? arg->ifp : NULL; 2932 size_t len; 2933 2934 if (arg == NULL || ifp == NULL) 2935 goto fail; 2936 /* a VALE port uses two endpoints */ 2937 len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2; 2938 na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); 2939 if (na == NULL) 2940 goto fail; 2941 WNA(ifp) = na; 2942 *na = *arg; /* copy everything, trust the driver to not pass junk */ 2943 NETMAP_SET_CAPABLE(ifp); 2944 if (na->num_tx_rings == 0) 2945 na->num_tx_rings = num_queues; 2946 na->num_rx_rings = num_queues; 2947 na->refcount = na->na_single = na->na_multi = 0; 2948 /* Core lock initialized here, others after netmap_if_new. */ 2949 mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); 2950 #ifdef linux 2951 if (ifp->netdev_ops) { 2952 ND("netdev_ops %p", ifp->netdev_ops); 2953 /* prepare a clone of the netdev ops */ 2954 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2955 na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2956 #else 2957 na->nm_ndo = *ifp->netdev_ops; 2958 #endif 2959 } 2960 na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2961 #endif /* linux */ 2962 na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem; 2963 if (!nma_is_vp(arg)) 2964 netmap_attach_sw(ifp); 2965 D("success for %s", ifp->if_xname); 2966 return 0; 2967 2968 fail: 2969 D("fail, arg %p ifp %p na %p", arg, ifp, na); 2970 netmap_detach(ifp); 2971 return (na ? EINVAL : ENOMEM); 2972 } 2973 2974 2975 /* 2976 * Free the allocated memory linked to the given ``netmap_adapter`` 2977 * object. 
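 *
 * This is the counterpart of netmap_attach() above. A driver typically
 * pairs the two like this (sketch only; 'foo' and 'adapter' are
 * hypothetical driver names/fields):
 *
 *	// in foo_attach():
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_txsync = foo_netmap_txsync;	// driver-supplied callbacks
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na, adapter->num_queues);
 *
 *	// in foo_detach():
 *	netmap_detach(ifp);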
2978 */ 2979 void 2980 netmap_detach(struct ifnet *ifp) 2981 { 2982 struct netmap_adapter *na = NA(ifp); 2983 2984 if (!na) 2985 return; 2986 2987 mtx_destroy(&na->core_lock); 2988 2989 if (na->tx_rings) { /* XXX should not happen */ 2990 D("freeing leftover tx_rings"); 2991 free(na->tx_rings, M_DEVBUF); 2992 } 2993 if (na->na_flags & NAF_MEM_OWNER) 2994 netmap_mem_private_delete(na->nm_mem); 2995 bzero(na, sizeof(*na)); 2996 WNA(ifp) = NULL; 2997 free(na, M_DEVBUF); 2998 } 2999 3000 3001 int 3002 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, 3003 struct netmap_adapter *na, u_int ring_nr); 3004 3005 3006 /* 3007 * Intercept packets from the network stack and pass them 3008 * to netmap as incoming packets on the 'software' ring. 3009 * We rely on the OS to make sure that the ifp and na do not go 3010 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 3011 * In nm_register() or whenever there is a reinitialization, 3012 * we make sure to access the core lock and per-ring locks 3013 * so that IFCAP_NETMAP is visible here. 3014 */ 3015 int 3016 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 3017 { 3018 struct netmap_adapter *na = NA(ifp); 3019 struct netmap_kring *kring; 3020 u_int i, len = MBUF_LEN(m); 3021 u_int error = EBUSY, lim; 3022 struct netmap_slot *slot; 3023 3024 // XXX [Linux] we do not need this lock 3025 // if we follow the down/configure/up protocol -gl 3026 // mtx_lock(&na->core_lock); 3027 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { 3028 /* interface not in netmap mode anymore */ 3029 error = ENXIO; 3030 goto done; 3031 } 3032 3033 kring = &na->rx_rings[na->num_rx_rings]; 3034 lim = kring->nkr_num_slots - 1; 3035 if (netmap_verbose & NM_VERB_HOST) 3036 D("%s packet %d len %d from the stack", ifp->if_xname, 3037 kring->nr_hwcur + kring->nr_hwavail, len); 3038 // XXX reconsider long packets if we handle fragments 3039 if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ 3040 D("%s from_host, drop packet size %d > %d", ifp->if_xname, 3041 len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); 3042 goto done; 3043 } 3044 if (SWNA(ifp)->na_bdg) { 3045 struct nm_bdg_fwd *ft; 3046 char *dst; 3047 3048 na = SWNA(ifp); /* we operate on the host port */ 3049 ft = na->rx_rings[0].nkr_ft; 3050 dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); 3051 3052 /* use slot 0 in the ft, there is nothing queued here */ 3053 /* XXX we can save the copy calling m_copydata in nm_bdg_flush, 3054 * need a special flag for this. 3055 */ 3056 m_copydata(m, 0, (int)len, dst); 3057 ft->ft_flags = 0; 3058 ft->ft_len = len; 3059 ft->ft_buf = dst; 3060 ft->ft_next = NM_FT_NULL; 3061 ft->ft_frags = 1; 3062 if (netmap_verbose & NM_VERB_HOST) 3063 RD(5, "pkt %p size %d to bridge port %d", 3064 dst, len, na->bdg_port); 3065 nm_bdg_flush(ft, 1, na, 0); 3066 na = NA(ifp); /* back to the regular object/lock */ 3067 error = 0; 3068 goto done; 3069 } 3070 3071 /* protect against other instances of netmap_transmit, 3072 * and userspace invocations of rxsync(). 
3073 * XXX could reuse core_lock 3074 */ 3075 // XXX [Linux] there can be no other instances of netmap_transmit 3076 // on this same ring, but we still need this lock to protect 3077 // concurrent access from netmap_sw_to_nic() -gl 3078 mtx_lock(&kring->q_lock); 3079 if (kring->nr_hwavail >= lim) { 3080 if (netmap_verbose) 3081 D("stack ring %s full\n", ifp->if_xname); 3082 } else { 3083 /* compute the insert position */ 3084 i = nm_kr_rxpos(kring); 3085 slot = &kring->ring->slot[i]; 3086 m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); 3087 slot->len = len; 3088 slot->flags = kring->nkr_slot_flags; 3089 kring->nr_hwavail++; 3090 if (netmap_verbose & NM_VERB_HOST) 3091 D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); 3092 selwakeuppri(&kring->si, PI_NET); 3093 error = 0; 3094 } 3095 mtx_unlock(&kring->q_lock); 3096 3097 done: 3098 // mtx_unlock(&na->core_lock); 3099 3100 /* release the mbuf in either cases of success or failure. As an 3101 * alternative, put the mbuf in a free list and free the list 3102 * only when really necessary. 3103 */ 3104 m_freem(m); 3105 3106 return (error); 3107 } 3108 3109 3110 /* 3111 * netmap_reset() is called by the driver routines when reinitializing 3112 * a ring. The driver is in charge of locking to protect the kring. 3113 * If netmap mode is not set just return NULL. 3114 */ 3115 struct netmap_slot * 3116 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 3117 u_int new_cur) 3118 { 3119 struct netmap_kring *kring; 3120 int new_hwofs, lim; 3121 3122 if (na == NULL) { 3123 D("NULL na, should not happen"); 3124 return NULL; /* no netmap support here */ 3125 } 3126 if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { 3127 D("interface not in netmap mode"); 3128 return NULL; /* nothing to reinitialize */ 3129 } 3130 3131 /* XXX note- in the new scheme, we are not guaranteed to be 3132 * under lock (e.g. when called on a device reset). 3133 * In this case, we should set a flag and do not trust too 3134 * much the values. In practice: TODO 3135 * - set a RESET flag somewhere in the kring 3136 * - do the processing in a conservative way 3137 * - let the *sync() fixup at the end. 3138 */ 3139 if (tx == NR_TX) { 3140 if (n >= na->num_tx_rings) 3141 return NULL; 3142 kring = na->tx_rings + n; 3143 new_hwofs = kring->nr_hwcur - new_cur; 3144 } else { 3145 if (n >= na->num_rx_rings) 3146 return NULL; 3147 kring = na->rx_rings + n; 3148 new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; 3149 } 3150 lim = kring->nkr_num_slots - 1; 3151 if (new_hwofs > lim) 3152 new_hwofs -= lim + 1; 3153 3154 /* Always set the new offset value and realign the ring. */ 3155 D("%s hwofs %d -> %d, hwavail %d -> %d", 3156 tx == NR_TX ? "TX" : "RX", 3157 kring->nkr_hwofs, new_hwofs, 3158 kring->nr_hwavail, 3159 tx == NR_TX ? lim : kring->nr_hwavail); 3160 kring->nkr_hwofs = new_hwofs; 3161 if (tx == NR_TX) 3162 kring->nr_hwavail = lim; 3163 3164 #if 0 // def linux 3165 /* XXX check that the mappings are correct */ 3166 /* need ring_nr, adapter->pdev, direction */ 3167 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 3168 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 3169 D("error mapping rx netmap buffer %d", i); 3170 // XXX fix error handling 3171 } 3172 3173 #endif /* linux */ 3174 /* 3175 * Wakeup on the individual and global selwait 3176 * We do the wakeup here, but the ring is not yet reconfigured. 3177 * However, we are under lock so there are no races. 
3178 */ 3179 selwakeuppri(&kring->si, PI_NET); 3180 selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET); 3181 return kring->ring->slot; 3182 } 3183 3184 3185 /* 3186 * Grab packets from a kring, move them into the ft structure 3187 * associated to the tx (input) port. Max one instance per port, 3188 * filtered on input (ioctl, poll or XXX). 3189 * Returns the next position in the ring. 3190 */ 3191 static int 3192 nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, 3193 struct netmap_kring *kring, u_int end) 3194 { 3195 struct netmap_ring *ring = kring->ring; 3196 struct nm_bdg_fwd *ft; 3197 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 3198 u_int ft_i = 0; /* start from 0 */ 3199 u_int frags = 1; /* how many frags ? */ 3200 struct nm_bridge *b = na->na_bdg; 3201 3202 /* To protect against modifications to the bridge we acquire a 3203 * shared lock, waiting if we can sleep (if the source port is 3204 * attached to a user process) or with a trylock otherwise (NICs). 3205 */ 3206 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3207 if (na->na_flags & NAF_BDG_MAYSLEEP) 3208 BDG_RLOCK(b); 3209 else if (!BDG_RTRYLOCK(b)) 3210 return 0; 3211 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3212 ft = kring->nkr_ft; 3213 3214 for (; likely(j != end); j = nm_next(j, lim)) { 3215 struct netmap_slot *slot = &ring->slot[j]; 3216 char *buf; 3217 3218 ft[ft_i].ft_len = slot->len; 3219 ft[ft_i].ft_flags = slot->flags; 3220 3221 ND("flags is 0x%x", slot->flags); 3222 /* this slot goes into a list so initialize the link field */ 3223 ft[ft_i].ft_next = NM_FT_NULL; 3224 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 3225 (void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot); 3226 prefetch(buf); 3227 ++ft_i; 3228 if (slot->flags & NS_MOREFRAG) { 3229 frags++; 3230 continue; 3231 } 3232 if (unlikely(netmap_verbose && frags > 1)) 3233 RD(5, "%d frags at %d", frags, ft_i - frags); 3234 ft[ft_i - frags].ft_frags = frags; 3235 frags = 1; 3236 if (unlikely((int)ft_i >= bridge_batch)) 3237 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3238 } 3239 if (frags > 1) { 3240 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); 3241 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG 3242 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; 3243 ft[ft_i - frags].ft_frags = frags - 1; 3244 } 3245 if (ft_i) 3246 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3247 BDG_RUNLOCK(b); 3248 return j; 3249 } 3250 3251 3252 /* 3253 * Pass packets from nic to the bridge. 3254 * XXX TODO check locking: this is called from the interrupt 3255 * handler so we should make sure that the interface is not 3256 * disconnected while passing down an interrupt. 3257 * 3258 * Note, no user process can access this NIC so we can ignore 3259 * the info in the 'ring'. 3260 */ 3261 static void 3262 netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) 3263 { 3264 struct netmap_adapter *na = NA(ifp); 3265 struct netmap_kring *kring = &na->rx_rings[ring_nr]; 3266 struct netmap_ring *ring = kring->ring; 3267 u_int j, k; 3268 3269 /* make sure that only one thread is ever in here, 3270 * after which we can unlock. Probably unnecessary XXX. 3271 */ 3272 if (nm_kr_tryget(kring)) 3273 return; 3274 /* fetch packets that have arrived. 3275 * XXX maybe do this in a loop ? 
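	 * (a single pass is probably enough: if more packets arrive
	 * in the meantime, the next rx interrupt brings us back here)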
3276 */ 3277 if (na->nm_rxsync(ifp, ring_nr, 0)) 3278 goto put_out; 3279 if (kring->nr_hwavail == 0 && netmap_verbose) { 3280 D("how strange, interrupt with no packets on %s", 3281 ifp->if_xname); 3282 goto put_out; 3283 } 3284 k = nm_kr_rxpos(kring); 3285 3286 j = nm_bdg_preflush(na, ring_nr, kring, k); 3287 3288 /* we consume everything, but we cannot update kring directly 3289 * because the nic may have destroyed the info in the NIC ring. 3290 * So we need to call rxsync again to restore it. 3291 */ 3292 ring->cur = j; 3293 ring->avail = 0; 3294 na->nm_rxsync(ifp, ring_nr, 0); 3295 3296 put_out: 3297 nm_kr_put(kring); 3298 return; 3299 } 3300 3301 3302 /* 3303 * Default functions to handle rx/tx interrupts from a physical device. 3304 * "work_done" is non-null on the RX path, NULL for the TX path. 3305 * We rely on the OS to make sure that there is only one active 3306 * instance per queue, and that there is appropriate locking. 3307 * 3308 * If the card is not in netmap mode, simply return 0, 3309 * so that the caller proceeds with regular processing. 3310 * 3311 * If the card is connected to a netmap file descriptor, 3312 * do a selwakeup on the individual queue, plus one on the global one 3313 * if needed (multiqueue card _and_ there are multiqueue listeners), 3314 * and return 1. 3315 * 3316 * Finally, if called on rx from an interface connected to a switch, 3317 * calls the proper forwarding routine, and return 1. 3318 */ 3319 int 3320 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3321 { 3322 struct netmap_adapter *na; 3323 struct netmap_kring *kring; 3324 3325 if (!(ifp->if_capenable & IFCAP_NETMAP)) 3326 return 0; 3327 3328 q &= NETMAP_RING_MASK; 3329 3330 if (netmap_verbose) 3331 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 3332 na = NA(ifp); 3333 if (na->na_flags & NAF_SKIP_INTR) { 3334 ND("use regular interrupt"); 3335 return 0; 3336 } 3337 3338 if (work_done) { /* RX path */ 3339 if (q >= na->num_rx_rings) 3340 return 0; // not a physical queue 3341 kring = na->rx_rings + q; 3342 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 3343 if (na->na_bdg != NULL) { 3344 netmap_nic_to_bdg(ifp, q); 3345 } else { 3346 selwakeuppri(&kring->si, PI_NET); 3347 if (na->num_rx_rings > 1 /* or multiple listeners */ ) 3348 selwakeuppri(&na->rx_si, PI_NET); 3349 } 3350 *work_done = 1; /* do not fire napi again */ 3351 } else { /* TX path */ 3352 if (q >= na->num_tx_rings) 3353 return 0; // not a physical queue 3354 kring = na->tx_rings + q; 3355 selwakeuppri(&kring->si, PI_NET); 3356 if (na->num_tx_rings > 1 /* or multiple listeners */ ) 3357 selwakeuppri(&na->tx_si, PI_NET); 3358 } 3359 return 1; 3360 } 3361 3362 3363 #ifdef linux /* linux-specific routines */ 3364 3365 3366 /* 3367 * Remap linux arguments into the FreeBSD call. 3368 * - pwait is the poll table, passed as 'dev'; 3369 * If pwait == NULL someone else already woke up before. We can report 3370 * events but they are filtered upstream. 3371 * If pwait != NULL, then pwait->key contains the list of events. 3372 * - events is computed from pwait as above. 3373 * - file is passed as 'td'; 3374 */ 3375 static u_int 3376 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) 3377 { 3378 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) 3379 int events = POLLIN | POLLOUT; /* XXX maybe... */ 3380 #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) 3381 int events = pwait ? pwait->key : POLLIN | POLLOUT; 3382 #else /* in 3.4.0 field 'key' was renamed to '_key' */ 3383 int events = pwait ? 
pwait->_key : POLLIN | POLLOUT; 3384 #endif 3385 return netmap_poll((void *)pwait, events, (void *)file); 3386 } 3387 3388 3389 static int 3390 linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) 3391 { 3392 int error = 0; 3393 unsigned long off, va; 3394 vm_ooffset_t pa; 3395 struct netmap_priv_d *priv = f->private_data; 3396 /* 3397 * vma->vm_start: start of mapping user address space 3398 * vma->vm_end: end of the mapping user address space 3399 * vma->vm_pfoff: offset of first page in the device 3400 */ 3401 3402 // XXX security checks 3403 3404 error = netmap_get_memory(priv); 3405 ND("get_memory returned %d", error); 3406 if (error) 3407 return -error; 3408 3409 if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { 3410 ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); 3411 return -EINVAL; 3412 } 3413 3414 for (va = vma->vm_start, off = vma->vm_pgoff; 3415 va < vma->vm_end; 3416 va += PAGE_SIZE, off++) 3417 { 3418 pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); 3419 if (pa == 0) 3420 return -EINVAL; 3421 3422 ND("va %lx pa %p", va, pa); 3423 error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); 3424 if (error) 3425 return error; 3426 } 3427 return 0; 3428 } 3429 3430 3431 /* 3432 * This one is probably already protected by the netif lock XXX 3433 */ 3434 static netdev_tx_t 3435 linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) 3436 { 3437 netmap_transmit(dev, skb); 3438 return (NETDEV_TX_OK); 3439 } 3440 3441 3442 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 3443 #define LIN_IOCTL_NAME .ioctl 3444 int 3445 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) 3446 #else 3447 #define LIN_IOCTL_NAME .unlocked_ioctl 3448 long 3449 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) 3450 #endif 3451 { 3452 int ret; 3453 struct nmreq nmr; 3454 bzero(&nmr, sizeof(nmr)); 3455 3456 if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { 3457 data = 0; /* no argument required here */ 3458 } 3459 if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) 3460 return -EFAULT; 3461 ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); 3462 if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) 3463 return -EFAULT; 3464 return -ret; 3465 } 3466 3467 3468 static int 3469 netmap_release(struct inode *inode, struct file *file) 3470 { 3471 (void)inode; /* UNUSED */ 3472 if (file->private_data) 3473 netmap_dtor(file->private_data); 3474 return (0); 3475 } 3476 3477 3478 static int 3479 linux_netmap_open(struct inode *inode, struct file *file) 3480 { 3481 struct netmap_priv_d *priv; 3482 (void)inode; /* UNUSED */ 3483 3484 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 3485 M_NOWAIT | M_ZERO); 3486 if (priv == NULL) 3487 return -ENOMEM; 3488 3489 file->private_data = priv; 3490 3491 return (0); 3492 } 3493 3494 3495 static struct file_operations netmap_fops = { 3496 .owner = THIS_MODULE, 3497 .open = linux_netmap_open, 3498 .mmap = linux_netmap_mmap, 3499 LIN_IOCTL_NAME = linux_netmap_ioctl, 3500 .poll = linux_netmap_poll, 3501 .release = netmap_release, 3502 }; 3503 3504 3505 static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ 3506 MISC_DYNAMIC_MINOR, 3507 "netmap", 3508 &netmap_fops, 3509 }; 3510 3511 static int netmap_init(void); 3512 static void netmap_fini(void); 3513 3514 3515 /* Errors have negative values on linux */ 3516 static int linux_netmap_init(void) 3517 { 3518 return 
-netmap_init(); 3519 } 3520 3521 module_init(linux_netmap_init); 3522 module_exit(netmap_fini); 3523 /* export certain symbols to other modules */ 3524 EXPORT_SYMBOL(netmap_attach); // driver attach routines 3525 EXPORT_SYMBOL(netmap_detach); // driver detach routines 3526 EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error 3527 EXPORT_SYMBOL(netmap_buffer_lut); 3528 EXPORT_SYMBOL(netmap_total_buffers); // index check 3529 EXPORT_SYMBOL(netmap_buffer_base); 3530 EXPORT_SYMBOL(netmap_reset); // ring init routines 3531 EXPORT_SYMBOL(netmap_buf_size); 3532 EXPORT_SYMBOL(netmap_rx_irq); // default irq handler 3533 EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away 3534 EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine 3535 EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function 3536 EXPORT_SYMBOL(netmap_disable_all_rings); 3537 EXPORT_SYMBOL(netmap_enable_all_rings); 3538 3539 3540 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); 3541 MODULE_DESCRIPTION("The netmap packet I/O framework"); 3542 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ 3543 3544 #else /* __FreeBSD__ */ 3545 3546 3547 static struct cdevsw netmap_cdevsw = { 3548 .d_version = D_VERSION, 3549 .d_name = "netmap", 3550 .d_open = netmap_open, 3551 .d_mmap_single = netmap_mmap_single, 3552 .d_ioctl = netmap_ioctl, 3553 .d_poll = netmap_poll, 3554 .d_close = netmap_close, 3555 }; 3556 #endif /* __FreeBSD__ */ 3557 3558 /* 3559 *---- support for virtual bridge ----- 3560 */ 3561 3562 /* ----- FreeBSD if_bridge hash function ------- */ 3563 3564 /* 3565 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 3566 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 3567 * 3568 * http://www.burtleburtle.net/bob/hash/spooky.html 3569 */ 3570 #define mix(a, b, c) \ 3571 do { \ 3572 a -= b; a -= c; a ^= (c >> 13); \ 3573 b -= c; b -= a; b ^= (a << 8); \ 3574 c -= a; c -= b; c ^= (b >> 13); \ 3575 a -= b; a -= c; a ^= (c >> 12); \ 3576 b -= c; b -= a; b ^= (a << 16); \ 3577 c -= a; c -= b; c ^= (b >> 5); \ 3578 a -= b; a -= c; a ^= (c >> 3); \ 3579 b -= c; b -= a; b ^= (a << 10); \ 3580 c -= a; c -= b; c ^= (b >> 15); \ 3581 } while (/*CONSTCOND*/0) 3582 3583 static __inline uint32_t 3584 nm_bridge_rthash(const uint8_t *addr) 3585 { 3586 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 3587 3588 b += addr[5] << 8; 3589 b += addr[4]; 3590 a += addr[3] << 24; 3591 a += addr[2] << 16; 3592 a += addr[1] << 8; 3593 a += addr[0]; 3594 3595 mix(a, b, c); 3596 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 3597 return (c & BRIDGE_RTHASH_MASK); 3598 } 3599 3600 #undef mix 3601 3602 3603 static int 3604 bdg_netmap_reg(struct ifnet *ifp, int onoff) 3605 { 3606 /* the interface is already attached to the bridge, 3607 * so we only need to toggle IFCAP_NETMAP. 3608 */ 3609 if (onoff) { 3610 ifp->if_capenable |= IFCAP_NETMAP; 3611 } else { 3612 ifp->if_capenable &= ~IFCAP_NETMAP; 3613 } 3614 return 0; 3615 } 3616 3617 3618 /* 3619 * Lookup function for a learning bridge. 
3620 * Update the hash table with the source address, 3621 * and then returns the destination port index, and the 3622 * ring in *dst_ring (at the moment, always use ring 0) 3623 */ 3624 u_int 3625 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, 3626 struct netmap_adapter *na) 3627 { 3628 struct nm_hash_ent *ht = na->na_bdg->ht; 3629 uint32_t sh, dh; 3630 u_int dst, mysrc = na->bdg_port; 3631 uint64_t smac, dmac; 3632 3633 if (buf_len < 14) { 3634 D("invalid buf length %d", buf_len); 3635 return NM_BDG_NOPORT; 3636 } 3637 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 3638 smac = le64toh(*(uint64_t *)(buf + 4)); 3639 smac >>= 16; 3640 3641 /* 3642 * The hash is somewhat expensive, there might be some 3643 * worthwhile optimizations here. 3644 */ 3645 if ((buf[6] & 1) == 0) { /* valid src */ 3646 uint8_t *s = buf+6; 3647 sh = nm_bridge_rthash(s); // XXX hash of source 3648 /* update source port forwarding entry */ 3649 ht[sh].mac = smac; /* XXX expire ? */ 3650 ht[sh].ports = mysrc; 3651 if (netmap_verbose) 3652 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 3653 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 3654 } 3655 dst = NM_BDG_BROADCAST; 3656 if ((buf[0] & 1) == 0) { /* unicast */ 3657 dh = nm_bridge_rthash(buf); // XXX hash of dst 3658 if (ht[dh].mac == dmac) { /* found dst */ 3659 dst = ht[dh].ports; 3660 } 3661 /* XXX otherwise return NM_BDG_UNKNOWN ? */ 3662 } 3663 *dst_ring = 0; 3664 return dst; 3665 } 3666 3667 3668 /* 3669 * This flush routine supports only unicast and broadcast but a large 3670 * number of ports, and lets us replace the learn and dispatch functions. 3671 */ 3672 int 3673 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, 3674 u_int ring_nr) 3675 { 3676 struct nm_bdg_q *dst_ents, *brddst; 3677 uint16_t num_dsts = 0, *dsts; 3678 struct nm_bridge *b = na->na_bdg; 3679 u_int i, j, me = na->bdg_port; 3680 3681 /* 3682 * The work area (pointed by ft) is followed by an array of 3683 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 3684 * queues per port plus one for the broadcast traffic. 3685 * Then we have an array of destination indexes. 
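	 * So the layout prepared by the two assignments right below is:
	 *
	 *	ft[0 .. NM_BDG_BATCH_MAX-1]	the packet descriptors
	 *	dst_ents[]			NM_BDG_MAXPORTS * NM_BDG_MAXRINGS
	 *					per-destination queues, plus one
	 *					more for broadcast traffic
	 *	dsts[]				indexes of the queues actually
	 *					used, filled by the first pass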
3686 */ 3687 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 3688 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 3689 3690 /* first pass: find a destination for each packet in the batch */ 3691 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 3692 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 3693 uint16_t dst_port, d_i; 3694 struct nm_bdg_q *d; 3695 3696 ND("slot %d frags %d", i, ft[i].ft_frags); 3697 dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, 3698 &dst_ring, na); 3699 if (netmap_verbose > 255) 3700 RD(5, "slot %d port %d -> %d", i, me, dst_port); 3701 if (dst_port == NM_BDG_NOPORT) 3702 continue; /* this packet is identified to be dropped */ 3703 else if (unlikely(dst_port > NM_BDG_MAXPORTS)) 3704 continue; 3705 else if (dst_port == NM_BDG_BROADCAST) 3706 dst_ring = 0; /* broadcasts always go to ring 0 */ 3707 else if (unlikely(dst_port == me || 3708 !b->bdg_ports[dst_port])) 3709 continue; 3710 3711 /* get a position in the scratch pad */ 3712 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 3713 d = dst_ents + d_i; 3714 3715 /* append the first fragment to the list */ 3716 if (d->bq_head == NM_FT_NULL) { /* new destination */ 3717 d->bq_head = d->bq_tail = i; 3718 /* remember this position to be scanned later */ 3719 if (dst_port != NM_BDG_BROADCAST) 3720 dsts[num_dsts++] = d_i; 3721 } else { 3722 ft[d->bq_tail].ft_next = i; 3723 d->bq_tail = i; 3724 } 3725 d->bq_len += ft[i].ft_frags; 3726 } 3727 3728 /* 3729 * Broadcast traffic goes to ring 0 on all destinations. 3730 * So we need to add these rings to the list of ports to scan. 3731 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 3732 * expensive. We should keep a compact list of active destinations 3733 * so we could shorten this loop. 3734 */ 3735 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 3736 if (brddst->bq_head != NM_FT_NULL) { 3737 for (j = 0; likely(j < b->bdg_active_ports); j++) { 3738 uint16_t d_i; 3739 i = b->bdg_port_index[j]; 3740 if (unlikely(i == me)) 3741 continue; 3742 d_i = i * NM_BDG_MAXRINGS; 3743 if (dst_ents[d_i].bq_head == NM_FT_NULL) 3744 dsts[num_dsts++] = d_i; 3745 } 3746 } 3747 3748 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 3749 /* second pass: scan destinations (XXX will be modular somehow) */ 3750 for (i = 0; i < num_dsts; i++) { 3751 struct ifnet *dst_ifp; 3752 struct netmap_adapter *dst_na; 3753 struct netmap_kring *kring; 3754 struct netmap_ring *ring; 3755 u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; 3756 u_int needed, howmany; 3757 int retry = netmap_txsync_retry; 3758 struct nm_bdg_q *d; 3759 uint32_t my_start = 0, lease_idx = 0; 3760 int nrings; 3761 3762 d_i = dsts[i]; 3763 ND("second pass %d port %d", i, d_i); 3764 d = dst_ents + d_i; 3765 // XXX fix the division 3766 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 3767 /* protect from the lookup function returning an inactive 3768 * destination port 3769 */ 3770 if (unlikely(dst_na == NULL)) 3771 goto cleanup; 3772 if (dst_na->na_flags & NAF_SW_ONLY) 3773 goto cleanup; 3774 dst_ifp = dst_na->ifp; 3775 /* 3776 * The interface may be in !netmap mode in two cases: 3777 * - when na is attached but not activated yet; 3778 * - when na is being deactivated but is still attached. 
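	 * In both cases the frames queued for this destination are simply
	 * skipped (i.e. dropped), see the cleanup path below.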
3779 */ 3780 if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { 3781 ND("not in netmap mode!"); 3782 goto cleanup; 3783 } 3784 3785 /* there is at least one either unicast or broadcast packet */ 3786 brd_next = brddst->bq_head; 3787 next = d->bq_head; 3788 /* we need to reserve this many slots. If fewer are 3789 * available, some packets will be dropped. 3790 * Packets may have multiple fragments, so we may not use 3791 * there is a chance that we may not use all of the slots 3792 * we have claimed, so we will need to handle the leftover 3793 * ones when we regain the lock. 3794 */ 3795 needed = d->bq_len + brddst->bq_len; 3796 3797 is_vp = nma_is_vp(dst_na); 3798 ND(5, "pass 2 dst %d is %x %s", 3799 i, d_i, is_vp ? "virtual" : "nic/host"); 3800 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 3801 if (is_vp) { /* virtual port */ 3802 nrings = dst_na->num_rx_rings; 3803 } else { 3804 nrings = dst_na->num_tx_rings; 3805 } 3806 if (dst_nr >= nrings) 3807 dst_nr = dst_nr % nrings; 3808 kring = is_vp ? &dst_na->rx_rings[dst_nr] : 3809 &dst_na->tx_rings[dst_nr]; 3810 ring = kring->ring; 3811 lim = kring->nkr_num_slots - 1; 3812 3813 retry: 3814 3815 /* reserve the buffers in the queue and an entry 3816 * to report completion, and drop lock. 3817 * XXX this might become a helper function. 3818 */ 3819 mtx_lock(&kring->q_lock); 3820 if (kring->nkr_stopped) { 3821 mtx_unlock(&kring->q_lock); 3822 goto cleanup; 3823 } 3824 /* on physical interfaces, do a txsync to recover 3825 * slots for packets already transmitted. 3826 * XXX maybe we could be optimistic and rely on a retry 3827 * in case of failure. 3828 */ 3829 if (nma_is_hw(dst_na)) { 3830 dst_na->nm_txsync(dst_ifp, dst_nr, 0); 3831 } 3832 my_start = j = kring->nkr_hwlease; 3833 howmany = nm_kr_space(kring, is_vp); 3834 if (needed < howmany) 3835 howmany = needed; 3836 lease_idx = nm_kr_lease(kring, howmany, is_vp); 3837 mtx_unlock(&kring->q_lock); 3838 3839 /* only retry if we need more than available slots */ 3840 if (retry && needed <= howmany) 3841 retry = 0; 3842 3843 /* copy to the destination queue */ 3844 while (howmany > 0) { 3845 struct netmap_slot *slot; 3846 struct nm_bdg_fwd *ft_p, *ft_end; 3847 u_int cnt; 3848 3849 /* find the queue from which we pick next packet. 3850 * NM_FT_NULL is always higher than valid indexes 3851 * so we never dereference it if the other list 3852 * has packets (and if both are empty we never 3853 * get here). 
3854 */ 3855 if (next < brd_next) { 3856 ft_p = ft + next; 3857 next = ft_p->ft_next; 3858 } else { /* insert broadcast */ 3859 ft_p = ft + brd_next; 3860 brd_next = ft_p->ft_next; 3861 } 3862 cnt = ft_p->ft_frags; // cnt > 0 3863 if (unlikely(cnt > howmany)) 3864 break; /* no more space */ 3865 howmany -= cnt; 3866 if (netmap_verbose && cnt > 1) 3867 RD(5, "rx %d frags to %d", cnt, j); 3868 ft_end = ft_p + cnt; 3869 do { 3870 void *dst, *src = ft_p->ft_buf; 3871 size_t len = (ft_p->ft_len + 63) & ~63; 3872 3873 slot = &ring->slot[j]; 3874 dst = BDG_NMB(dst_na->nm_mem, slot); 3875 /* round to a multiple of 64 */ 3876 3877 ND("send %d %d bytes at %s:%d", 3878 i, ft_p->ft_len, dst_ifp->if_xname, j); 3879 if (ft_p->ft_flags & NS_INDIRECT) { 3880 if (copyin(src, dst, len)) { 3881 // invalid user pointer, pretend len is 0 3882 ft_p->ft_len = 0; 3883 } 3884 } else { 3885 //memcpy(dst, src, len); 3886 pkt_copy(src, dst, (int)len); 3887 } 3888 slot->len = ft_p->ft_len; 3889 slot->flags = (cnt << 8)| NS_MOREFRAG; 3890 j = nm_next(j, lim); 3891 ft_p++; 3892 sent++; 3893 } while (ft_p != ft_end); 3894 slot->flags = (cnt << 8); /* clear flag on last entry */ 3895 /* are we done ? */ 3896 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 3897 break; 3898 } 3899 { 3900 /* current position */ 3901 uint32_t *p = kring->nkr_leases; /* shorthand */ 3902 uint32_t update_pos; 3903 int still_locked = 1; 3904 3905 mtx_lock(&kring->q_lock); 3906 if (unlikely(howmany > 0)) { 3907 /* not used all bufs. If i am the last one 3908 * i can recover the slots, otherwise must 3909 * fill them with 0 to mark empty packets. 3910 */ 3911 ND("leftover %d bufs", howmany); 3912 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 3913 /* yes i am the last one */ 3914 ND("roll back nkr_hwlease to %d", j); 3915 kring->nkr_hwlease = j; 3916 } else { 3917 while (howmany-- > 0) { 3918 ring->slot[j].len = 0; 3919 ring->slot[j].flags = 0; 3920 j = nm_next(j, lim); 3921 } 3922 } 3923 } 3924 p[lease_idx] = j; /* report I am done */ 3925 3926 update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur; 3927 3928 if (my_start == update_pos) { 3929 /* all slots before my_start have been reported, 3930 * so scan subsequent leases to see if other ranges 3931 * have been completed, and to a selwakeup or txsync. 3932 */ 3933 while (lease_idx != kring->nkr_lease_idx && 3934 p[lease_idx] != NR_NOSLOT) { 3935 j = p[lease_idx]; 3936 p[lease_idx] = NR_NOSLOT; 3937 lease_idx = nm_next(lease_idx, lim); 3938 } 3939 /* j is the new 'write' position. j != my_start 3940 * means there are new buffers to report 3941 */ 3942 if (likely(j != my_start)) { 3943 if (is_vp) { 3944 uint32_t old_avail = kring->nr_hwavail; 3945 3946 kring->nr_hwavail = (j >= kring->nr_hwcur) ? 3947 j - kring->nr_hwcur : 3948 j + lim + 1 - kring->nr_hwcur; 3949 if (kring->nr_hwavail < old_avail) { 3950 D("avail shrink %d -> %d", 3951 old_avail, kring->nr_hwavail); 3952 } 3953 still_locked = 0; 3954 mtx_unlock(&kring->q_lock); 3955 selwakeuppri(&kring->si, PI_NET); 3956 } else { 3957 ring->cur = j; 3958 /* XXX update avail ? 

/*
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int j, lim = kring->nkr_num_slots - 1;
	u_int k = ring->cur, resvd = ring->reserved;
	int n;

	mtx_lock(&kring->q_lock);
	if (k > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* skip past packets that userspace has released */
	j = kring->nr_hwcur; /* netmap ring index */
	if (resvd > 0) {
		if (resvd + ring->avail >= lim + 1) {
			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
			ring->reserved = resvd = 0; // XXX panic...
		}
		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
	}

	if (j != k) { /* userspace has released some packets. */
		n = k - j;
		if (n < 0)
			n += kring->nkr_num_slots;
		ND("userspace releases %d packets", n);
		for (n = 0; likely(j != k); n++) {
			struct netmap_slot *slot = &ring->slot[j];
			void *addr = BDG_NMB(na->nm_mem, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			j = nm_next(j, lim);
		}
		kring->nr_hwavail -= n;
		kring->nr_hwcur = k;
	}
	/* tell userspace that there are new packets */
	ring->avail = kring->nr_hwavail - resvd;
	n = 0;
done:
	mtx_unlock(&kring->q_lock);
	return n;
}
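
/*
 * Illustrative sketch (not compiled): how bdg_netmap_rxsync() above backs
 * the userspace cursor up by ring->reserved slots before releasing
 * buffers, wrapping around the ring if needed. example_backup_cursor()
 * is a hypothetical helper; the values in the comment are arbitrary.
 */
#if 0
static u_int
example_backup_cursor(u_int cur, u_int resvd, u_int lim)
{
	/* e.g. cur = 1, resvd = 3, lim = 1023 -> 1022 */
	return (cur >= resvd) ? cur - resvd : cur + lim + 1 - resvd;
}
#endif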

static int
bdg_netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_adapter na;

	ND("attaching virtual bridge");
	bzero(&na, sizeof(na));

	na.ifp = arg->ifp;
	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na.num_tx_rings = arg->num_tx_rings;
	na.num_rx_rings = arg->num_rx_rings;
	na.num_tx_desc = arg->num_tx_desc;
	na.num_rx_desc = arg->num_rx_desc;
	na.nm_txsync = bdg_netmap_txsync;
	na.nm_rxsync = bdg_netmap_rxsync;
	na.nm_register = bdg_netmap_reg;
	na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname,
			na.num_tx_rings, na.num_tx_desc,
			na.num_rx_rings, na.num_rx_desc);
	return netmap_attach(&na, na.num_tx_rings);
}


static struct cdev *netmap_dev; /* /dev/netmap character device. */


/*
 * Module loader.
 *
 * Create the /dev/netmap device and initialize all global
 * variables.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_init(void)
{
	int i, error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0) {
		printf("netmap: unable to initialize the memory allocator.\n");
		return (error);
	}
	printf("netmap: loaded module\n");
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
			"netmap");

	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
	for (i = 0; i < NM_BRIDGES; i++)
		BDG_RWINIT(&nm_bridges[i]);
	return (error);
}


/*
 * Module unloader.
 *
 * Free all the memory, and destroy the ``/dev/netmap`` device.
 */
static void
netmap_fini(void)
{
	destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	printf("netmap: unloaded module.\n");
}


#ifdef __FreeBSD__
/*
 * Kernel entry point.
 *
 * Initialize/finalize the module and return.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_loader(__unused struct module *module, int event, __unused void *arg)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		error = netmap_init();
		break;

	case MOD_UNLOAD:
		netmap_fini();
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}


DEV_MODULE(netmap, netmap_loader, NULL);
#endif /* __FreeBSD__ */