/*
 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#ifdef __FreeBSD__
#define TEST_STUFF	// test code, does not compile yet on linux
#endif /* __FreeBSD__ */

/*
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
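
A minimal sketch of steps 1-6 above, modeled on the netmap(4) example
(NETMAP_IF()/NETMAP_TXRING() and struct nmreq come from the userspace
API headers, not from this file; error handling is omitted):

	fd = open("/dev/netmap", O_RDWR);
	strcpy(nmr.nr_name, "em0");		// struct nmreq nmr, zeroed first
	ioctl(fd, NIOCREGIF, &nmr);		// step 2: bind fd to the interface
	mem = mmap(0, nmr.nr_memsize,		// step 3: map the shared region
	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, nmr.nr_offset);	// step 4: locate the netmap_if
	ring = NETMAP_TXRING(nifp, 0);		//         and its rings
	... fill tx slots, advance ring->cur and decrease ring->avail ...
	ioctl(fd, NIOCTXSYNC, NULL);		// step 5: sync with the kernel
	poll(&fds, 1, -1);			// step 6: wait for free slots/new packets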

		LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file descriptors, an
  atomic_test_and_set() protects this, and the lock on the ring
  is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re); lem is missing
  the RING protection on rx_reset(), which should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


	--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring a new port or deleting one,
the lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
131 * Other OS-specific code that must be accessed by drivers 132 * is present in netmap_kern.h 133 */ 134 135 #if defined(__FreeBSD__) 136 #include <sys/cdefs.h> /* prerequisite */ 137 __FBSDID("$FreeBSD$"); 138 139 #include <sys/types.h> 140 #include <sys/module.h> 141 #include <sys/errno.h> 142 #include <sys/param.h> /* defines used in kernel.h */ 143 #include <sys/jail.h> 144 #include <sys/kernel.h> /* types used in module initialization */ 145 #include <sys/conf.h> /* cdevsw struct */ 146 #include <sys/uio.h> /* uio struct */ 147 #include <sys/sockio.h> 148 #include <sys/socketvar.h> /* struct socket */ 149 #include <sys/malloc.h> 150 #include <sys/mman.h> /* PROT_EXEC */ 151 #include <sys/poll.h> 152 #include <sys/proc.h> 153 #include <sys/rwlock.h> 154 #include <vm/vm.h> /* vtophys */ 155 #include <vm/pmap.h> /* vtophys */ 156 #include <vm/vm_param.h> 157 #include <vm/vm_object.h> 158 #include <vm/vm_page.h> 159 #include <vm/vm_pager.h> 160 #include <vm/uma.h> 161 #include <sys/socket.h> /* sockaddrs */ 162 #include <sys/selinfo.h> 163 #include <sys/sysctl.h> 164 #include <net/if.h> 165 #include <net/if_var.h> 166 #include <net/bpf.h> /* BIOCIMMEDIATE */ 167 #include <net/vnet.h> 168 #include <machine/bus.h> /* bus_dmamap_* */ 169 #include <sys/endian.h> 170 #include <sys/refcount.h> 171 172 #define prefetch(x) __builtin_prefetch(x) 173 174 #define BDG_RWLOCK_T struct rwlock // struct rwlock 175 176 #define BDG_RWINIT(b) \ 177 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) 178 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) 179 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) 180 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) 181 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) 182 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) 183 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) 184 185 186 /* netmap global lock. 
187 * normally called within the user thread (upon a system call) 188 * or when a file descriptor or process is terminated 189 * (last close or last munmap) 190 */ 191 192 #define NMG_LOCK_T struct mtx 193 #define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF) 194 #define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) 195 #define NMG_LOCK() mtx_lock(&netmap_global_lock) 196 #define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) 197 #define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) 198 199 200 /* atomic operations */ 201 #include <machine/atomic.h> 202 #define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) 203 #define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) 204 205 206 #elif defined(linux) 207 208 #include "bsd_glue.h" 209 210 static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *); 211 212 static struct device_driver* 213 linux_netmap_find_driver(struct device *dev) 214 { 215 struct device_driver *dd; 216 217 while ( (dd = dev->driver) == NULL ) { 218 if ( (dev = dev->parent) == NULL ) 219 return NULL; 220 } 221 return dd; 222 } 223 224 static struct net_device* 225 ifunit_ref(const char *name) 226 { 227 struct net_device *ifp = dev_get_by_name(&init_net, name); 228 struct device_driver *dd; 229 230 if (ifp == NULL) 231 return NULL; 232 233 if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL ) 234 goto error; 235 236 if (!try_module_get(dd->owner)) 237 goto error; 238 239 return ifp; 240 error: 241 dev_put(ifp); 242 return NULL; 243 } 244 245 static void 246 if_rele(struct net_device *ifp) 247 { 248 struct device_driver *dd; 249 dd = linux_netmap_find_driver(&ifp->dev); 250 dev_put(ifp); 251 if (dd) 252 module_put(dd->owner); 253 } 254 255 // XXX a mtx would suffice here too 20130404 gl 256 #define NMG_LOCK_T struct semaphore 257 #define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) 258 #define NMG_LOCK_DESTROY() 259 #define NMG_LOCK() down(&netmap_global_lock) 260 #define NMG_UNLOCK() up(&netmap_global_lock) 261 #define NMG_LOCK_ASSERT() // XXX to be completed 262 263 264 #elif defined(__APPLE__) 265 266 #warning OSX support is only partial 267 #include "osx_glue.h" 268 269 #else 270 271 #error Unsupported platform 272 273 #endif /* unsupported */ 274 275 /* 276 * common headers 277 */ 278 #include <net/netmap.h> 279 #include <dev/netmap/netmap_kern.h> 280 #include <dev/netmap/netmap_mem2.h> 281 282 283 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 284 285 /* 286 * The following variables are used by the drivers and replicate 287 * fields in the global memory pool. They only refer to buffers 288 * used by physical interfaces. 
289 */ 290 u_int netmap_total_buffers; 291 u_int netmap_buf_size; 292 char *netmap_buffer_base; /* also address of an invalid buffer */ 293 294 /* user-controlled variables */ 295 int netmap_verbose; 296 297 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 298 299 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 300 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 301 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 302 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 303 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 304 int netmap_mitigate = 1; 305 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 306 int netmap_no_pendintr = 1; 307 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 308 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 309 int netmap_txsync_retry = 2; 310 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 311 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 312 313 int netmap_drop = 0; /* debugging */ 314 int netmap_flags = 0; /* debug flags */ 315 int netmap_fwd = 0; /* force transparent mode */ 316 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 317 318 SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); 319 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 320 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 321 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 322 323 NMG_LOCK_T netmap_global_lock; 324 325 /* 326 * protect against multiple threads using the same ring. 327 * also check that the ring has not been stopped. 328 */ 329 #define NM_KR_BUSY 1 330 #define NM_KR_STOPPED 2 331 static void nm_kr_put(struct netmap_kring *kr); 332 static __inline int nm_kr_tryget(struct netmap_kring *kr) 333 { 334 /* check a first time without taking the lock 335 * to avoid starvation for nm_kr_get() 336 */ 337 if (unlikely(kr->nkr_stopped)) { 338 ND("ring %p stopped (%d)", kr, kr->nkr_stopped); 339 return NM_KR_STOPPED; 340 } 341 if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) 342 return NM_KR_BUSY; 343 /* check a second time with lock held */ 344 if (unlikely(kr->nkr_stopped)) { 345 ND("ring %p stopped (%d)", kr, kr->nkr_stopped); 346 nm_kr_put(kr); 347 return NM_KR_STOPPED; 348 } 349 return 0; 350 } 351 352 static __inline void nm_kr_put(struct netmap_kring *kr) 353 { 354 NM_ATOMIC_CLEAR(&kr->nr_busy); 355 } 356 357 static void nm_kr_get(struct netmap_kring *kr) 358 { 359 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 360 tsleep(kr, 0, "NM_KR_GET", 4); 361 } 362 363 static void nm_disable_ring(struct netmap_kring *kr) 364 { 365 kr->nkr_stopped = 1; 366 nm_kr_get(kr); 367 mtx_lock(&kr->q_lock); 368 mtx_unlock(&kr->q_lock); 369 nm_kr_put(kr); 370 } 371 372 void netmap_disable_all_rings(struct ifnet *ifp) 373 { 374 struct netmap_adapter *na; 375 int i; 376 377 if (!(ifp->if_capenable & IFCAP_NETMAP)) 378 return; 379 380 na = NA(ifp); 381 382 for (i = 0; i < na->num_tx_rings + 1; i++) { 383 nm_disable_ring(na->tx_rings + i); 384 selwakeuppri(&na->tx_rings[i].si, PI_NET); 385 } 386 for (i = 0; i < na->num_rx_rings + 1; i++) { 387 nm_disable_ring(na->rx_rings + i); 388 selwakeuppri(&na->rx_rings[i].si, PI_NET); 389 } 390 selwakeuppri(&na->tx_si, PI_NET); 391 selwakeuppri(&na->rx_si, PI_NET); 392 } 393 394 void netmap_enable_all_rings(struct ifnet *ifp) 395 { 396 struct netmap_adapter *na; 397 int i; 398 399 if (!(ifp->if_capenable & 
IFCAP_NETMAP)) 400 return; 401 402 na = NA(ifp); 403 for (i = 0; i < na->num_tx_rings + 1; i++) { 404 D("enabling %p", na->tx_rings + i); 405 na->tx_rings[i].nkr_stopped = 0; 406 } 407 for (i = 0; i < na->num_rx_rings + 1; i++) { 408 D("enabling %p", na->rx_rings + i); 409 na->rx_rings[i].nkr_stopped = 0; 410 } 411 } 412 413 414 /* 415 * generic bound_checking function 416 */ 417 u_int 418 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 419 { 420 u_int oldv = *v; 421 const char *op = NULL; 422 423 if (dflt < lo) 424 dflt = lo; 425 if (dflt > hi) 426 dflt = hi; 427 if (oldv < lo) { 428 *v = dflt; 429 op = "Bump"; 430 } else if (oldv > hi) { 431 *v = hi; 432 op = "Clamp"; 433 } 434 if (op && msg) 435 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 436 return *v; 437 } 438 439 /* 440 * packet-dump function, user-supplied or static buffer. 441 * The destination buffer must be at least 30+4*len 442 */ 443 const char * 444 nm_dump_buf(char *p, int len, int lim, char *dst) 445 { 446 static char _dst[8192]; 447 int i, j, i0; 448 static char hex[] ="0123456789abcdef"; 449 char *o; /* output position */ 450 451 #define P_HI(x) hex[((x) & 0xf0)>>4] 452 #define P_LO(x) hex[((x) & 0xf)] 453 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') 454 if (!dst) 455 dst = _dst; 456 if (lim <= 0 || lim > len) 457 lim = len; 458 o = dst; 459 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 460 o += strlen(o); 461 /* hexdump routine */ 462 for (i = 0; i < lim; ) { 463 sprintf(o, "%5d: ", i); 464 o += strlen(o); 465 memset(o, ' ', 48); 466 i0 = i; 467 for (j=0; j < 16 && i < lim; i++, j++) { 468 o[j*3] = P_HI(p[i]); 469 o[j*3+1] = P_LO(p[i]); 470 } 471 i = i0; 472 for (j=0; j < 16 && i < lim; i++, j++) 473 o[j + 48] = P_C(p[i]); 474 o[j+48] = '\n'; 475 o += j+49; 476 } 477 *o = '\0'; 478 #undef P_HI 479 #undef P_LO 480 #undef P_C 481 return dst; 482 } 483 484 /* 485 * system parameters (most of them in netmap_kern.h) 486 * NM_NAME prefix for switch port names, default "vale" 487 * NM_BDG_MAXPORTS number of ports 488 * NM_BRIDGES max number of switches in the system. 489 * XXX should become a sysctl or tunable 490 * 491 * Switch ports are named valeX:Y where X is the switch name and Y 492 * is the port. If Y matches a physical interface name, the port is 493 * connected to a physical device. 494 * 495 * Unlike physical interfaces, switch ports use their own memory region 496 * for rings and buffers. 497 * The virtual interfaces use per-queue lock instead of core lock. 498 * In the tx loop, we aggregate traffic in batches to make all operations 499 * faster. The batch size is bridge_batch. 500 */ 501 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ 502 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 503 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 504 #define NM_BDG_HASH 1024 /* forwarding table entries */ 505 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 506 #define NM_MULTISEG 64 /* max size of a chain of bufs */ 507 /* actual size of the tables */ 508 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) 509 /* NM_FT_NULL terminates a list of slots in the ft */ 510 #define NM_FT_NULL NM_BDG_BATCH_MAX 511 #define NM_BRIDGES 8 /* number of bridges */ 512 513 514 /* 515 * bridge_batch is set via sysctl to the max batch size to be 516 * used in the bridge. The actual value may be larger as the 517 * last packet in the block may overflow the size. 
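 * For example, with the defaults above a block can exceed bridge_batch
 * by up to one full chain of NM_MULTISEG fragments, which is why the
 * forwarding tables are dimensioned for
 * NM_BDG_BATCH_MAX = NM_BDG_BATCH + NM_MULTISEG = 1024 + 64 entries
 * rather than just NM_BDG_BATCH.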
518 */ 519 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 520 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); 521 522 523 /* 524 * These are used to handle reference counters for bridge ports. 525 */ 526 #define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) 527 #define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) 528 529 /* The bridge references the buffers using the device specific look up table */ 530 static inline void * 531 BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot) 532 { 533 struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut; 534 uint32_t i = slot->buf_idx; 535 return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? lut[0].vaddr : lut[i].vaddr; 536 } 537 538 static void bdg_netmap_attach(struct netmap_adapter *); 539 static int bdg_netmap_reg(struct ifnet *ifp, int onoff); 540 int kern_netmap_regif(struct nmreq *nmr); 541 542 /* 543 * Each transmit queue accumulates a batch of packets into 544 * a structure before forwarding. Packets to the same 545 * destination are put in a list using ft_next as a link field. 546 * ft_frags and ft_next are valid only on the first fragment. 547 */ 548 struct nm_bdg_fwd { /* forwarding entry for a bridge */ 549 void *ft_buf; /* netmap or indirect buffer */ 550 uint8_t ft_frags; /* how many fragments (only on 1st frag) */ 551 uint8_t _ft_port; /* dst port (unused) */ 552 uint16_t ft_flags; /* flags, e.g. indirect */ 553 uint16_t ft_len; /* src fragment len */ 554 uint16_t ft_next; /* next packet to same destination */ 555 }; 556 557 /* 558 * For each output interface, nm_bdg_q is used to construct a list. 559 * bq_len is the number of output buffers (we can have coalescing 560 * during the copy). 561 */ 562 struct nm_bdg_q { 563 uint16_t bq_head; 564 uint16_t bq_tail; 565 uint32_t bq_len; /* number of buffers */ 566 }; 567 568 /* XXX revise this */ 569 struct nm_hash_ent { 570 uint64_t mac; /* the top 2 bytes are the epoch */ 571 uint64_t ports; 572 }; 573 574 /* 575 * nm_bridge is a descriptor for a VALE switch. 576 * Interfaces for a bridge are all in bdg_ports[]. 577 * The array has fixed size, an empty entry does not terminate 578 * the search, but lookups only occur on attach/detach so we 579 * don't mind if they are slow. 580 * 581 * The bridge is non blocking on the transmit ports: excess 582 * packets are dropped if there is no room on the output port. 583 * 584 * bdg_lock protects accesses to the bdg_ports array. 585 * This is a rw lock (or equivalent). 586 */ 587 struct nm_bridge { 588 /* XXX what is the proper alignment/layout ? */ 589 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ 590 int bdg_namelen; 591 uint32_t bdg_active_ports; /* 0 means free */ 592 char bdg_basename[IFNAMSIZ]; 593 594 /* Indexes of active ports (up to active_ports) 595 * and all other remaining ports. 596 */ 597 uint8_t bdg_port_index[NM_BDG_MAXPORTS]; 598 599 struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; 600 601 602 /* 603 * The function to decide the destination port. 604 * It returns either of an index of the destination port, 605 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to 606 * forward this packet. ring_nr is the source ring index, and the 607 * function may overwrite this value to forward this packet to a 608 * different ring index. 609 * This function must be set by netmap_bdgctl(). 610 */ 611 bdg_lookup_fn_t nm_bdg_lookup; 612 613 /* the forwarding table, MAC+ports. 
 * XXX should be changed to an argument to be passed to
 * the lookup function, and allocated on attach
 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};


/*
 * XXX in principle nm_bridges could be created dynamically.
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * A few functions to tell which kind of port we are using.
 * XXX should we hold a lock?
 *
 * nma_is_vp()		virtual port
 * nma_is_host()	port connected to the host stack
 * nma_is_hw()		port connected to a NIC
 */
int nma_is_vp(struct netmap_adapter *na);
int
nma_is_vp(struct netmap_adapter *na)
{
	return na->nm_register == bdg_netmap_reg;
}

static __inline int
nma_is_host(struct netmap_adapter *na)
{
	return na->nm_register == NULL;
}

static __inline int
nma_is_hw(struct netmap_adapter *na)
{
	/* In case of sw adapter, nm_register is NULL */
	return !nma_is_vp(na) && !nma_is_host(na);
}


/*
 * If the NIC is owned by the kernel
 * (i.e., a bridge), neither another bridge nor a user can use it;
 * if the NIC is owned by a user, only users can share it.
 * Evaluation must be done under NMG_LOCK().
 */
#define NETMAP_OWNED_BY_KERN(ifp)	(!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
#define NETMAP_OWNED_BY_ANY(ifp) \
	(NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))

/*
 * NA(ifp)->bdg_port	port index
 */


/*
 * This is a slightly optimized copy routine which rounds
 * to multiples of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
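 *
 * Note that the unrolled loop below always copies whole 64-byte chunks,
 * so up to 63 bytes past 'l' may be read from the source and written to
 * the destination; this is why both buffers need the extra room.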
680 */ 681 static inline void 682 pkt_copy(void *_src, void *_dst, int l) 683 { 684 uint64_t *src = _src; 685 uint64_t *dst = _dst; 686 if (unlikely(l >= 1024)) { 687 memcpy(dst, src, l); 688 return; 689 } 690 for (; likely(l > 0); l-=64) { 691 *dst++ = *src++; 692 *dst++ = *src++; 693 *dst++ = *src++; 694 *dst++ = *src++; 695 *dst++ = *src++; 696 *dst++ = *src++; 697 *dst++ = *src++; 698 *dst++ = *src++; 699 } 700 } 701 702 703 #ifdef TEST_STUFF 704 struct xxx { 705 char *name; 706 void (*fn)(uint32_t); 707 }; 708 709 710 static void 711 nm_test_defmtx(uint32_t n) 712 { 713 uint32_t i; 714 struct mtx m; 715 mtx_init(&m, "test", NULL, MTX_DEF); 716 for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } 717 mtx_destroy(&m); 718 return; 719 } 720 721 static void 722 nm_test_spinmtx(uint32_t n) 723 { 724 uint32_t i; 725 struct mtx m; 726 mtx_init(&m, "test", NULL, MTX_SPIN); 727 for (i = 0; i < n; i++) { mtx_lock(&m); mtx_unlock(&m); } 728 mtx_destroy(&m); 729 return; 730 } 731 732 static void 733 nm_test_rlock(uint32_t n) 734 { 735 uint32_t i; 736 struct rwlock m; 737 rw_init(&m, "test"); 738 for (i = 0; i < n; i++) { rw_rlock(&m); rw_runlock(&m); } 739 rw_destroy(&m); 740 return; 741 } 742 743 static void 744 nm_test_wlock(uint32_t n) 745 { 746 uint32_t i; 747 struct rwlock m; 748 rw_init(&m, "test"); 749 for (i = 0; i < n; i++) { rw_wlock(&m); rw_wunlock(&m); } 750 rw_destroy(&m); 751 return; 752 } 753 754 static void 755 nm_test_slock(uint32_t n) 756 { 757 uint32_t i; 758 struct sx m; 759 sx_init(&m, "test"); 760 for (i = 0; i < n; i++) { sx_slock(&m); sx_sunlock(&m); } 761 sx_destroy(&m); 762 return; 763 } 764 765 static void 766 nm_test_xlock(uint32_t n) 767 { 768 uint32_t i; 769 struct sx m; 770 sx_init(&m, "test"); 771 for (i = 0; i < n; i++) { sx_xlock(&m); sx_xunlock(&m); } 772 sx_destroy(&m); 773 return; 774 } 775 776 777 struct xxx nm_tests[] = { 778 { "defmtx", nm_test_defmtx }, 779 { "spinmtx", nm_test_spinmtx }, 780 { "rlock", nm_test_rlock }, 781 { "wlock", nm_test_wlock }, 782 { "slock", nm_test_slock }, 783 { "xlock", nm_test_xlock }, 784 }; 785 786 static int 787 nm_test(struct nmreq *nmr) 788 { 789 uint32_t scale, n, test; 790 static int old_test = -1; 791 792 test = nmr->nr_cmd; 793 scale = nmr->nr_offset; 794 n = sizeof(nm_tests) / sizeof(struct xxx) - 1; 795 if (test > n) { 796 D("test index too high, max %d", n); 797 return 0; 798 } 799 800 if (old_test != test) { 801 D("test %s scale %d", nm_tests[test].name, scale); 802 old_test = test; 803 } 804 nm_tests[test].fn(scale); 805 return 0; 806 } 807 #endif /* TEST_STUFF */ 808 809 /* 810 * locate a bridge among the existing ones. 811 * MUST BE CALLED WITH NMG_LOCK() 812 * 813 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. 814 * We assume that this is called with a name of at least NM_NAME chars. 815 */ 816 static struct nm_bridge * 817 nm_find_bridge(const char *name, int create) 818 { 819 int i, l, namelen; 820 struct nm_bridge *b = NULL; 821 822 NMG_LOCK_ASSERT(); 823 824 namelen = strlen(NM_NAME); /* base length */ 825 l = name ? strlen(name) : 0; /* actual length */ 826 if (l < namelen) { 827 D("invalid bridge name %s", name ? 
name : NULL); 828 return NULL; 829 } 830 for (i = namelen + 1; i < l; i++) { 831 if (name[i] == ':') { 832 namelen = i; 833 break; 834 } 835 } 836 if (namelen >= IFNAMSIZ) 837 namelen = IFNAMSIZ; 838 ND("--- prefix is '%.*s' ---", namelen, name); 839 840 /* lookup the name, remember empty slot if there is one */ 841 for (i = 0; i < NM_BRIDGES; i++) { 842 struct nm_bridge *x = nm_bridges + i; 843 844 if (x->bdg_active_ports == 0) { 845 if (create && b == NULL) 846 b = x; /* record empty slot */ 847 } else if (x->bdg_namelen != namelen) { 848 continue; 849 } else if (strncmp(name, x->bdg_basename, namelen) == 0) { 850 ND("found '%.*s' at %d", namelen, name, i); 851 b = x; 852 break; 853 } 854 } 855 if (i == NM_BRIDGES && b) { /* name not found, can create entry */ 856 /* initialize the bridge */ 857 strncpy(b->bdg_basename, name, namelen); 858 ND("create new bridge %s with ports %d", b->bdg_basename, 859 b->bdg_active_ports); 860 b->bdg_namelen = namelen; 861 b->bdg_active_ports = 0; 862 for (i = 0; i < NM_BDG_MAXPORTS; i++) 863 b->bdg_port_index[i] = i; 864 /* set the default function */ 865 b->nm_bdg_lookup = netmap_bdg_learning; 866 /* reset the MAC address table */ 867 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); 868 } 869 return b; 870 } 871 872 873 /* 874 * Free the forwarding tables for rings attached to switch ports. 875 */ 876 static void 877 nm_free_bdgfwd(struct netmap_adapter *na) 878 { 879 int nrings, i; 880 struct netmap_kring *kring; 881 882 NMG_LOCK_ASSERT(); 883 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 884 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 885 for (i = 0; i < nrings; i++) { 886 if (kring[i].nkr_ft) { 887 free(kring[i].nkr_ft, M_DEVBUF); 888 kring[i].nkr_ft = NULL; /* protect from freeing twice */ 889 } 890 } 891 if (nma_is_hw(na)) 892 nm_free_bdgfwd(SWNA(na->ifp)); 893 } 894 895 896 /* 897 * Allocate the forwarding tables for the rings attached to the bridge ports. 898 */ 899 static int 900 nm_alloc_bdgfwd(struct netmap_adapter *na) 901 { 902 int nrings, l, i, num_dstq; 903 struct netmap_kring *kring; 904 905 NMG_LOCK_ASSERT(); 906 /* all port:rings + broadcast */ 907 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 908 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 909 l += sizeof(struct nm_bdg_q) * num_dstq; 910 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 911 912 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 913 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 914 for (i = 0; i < nrings; i++) { 915 struct nm_bdg_fwd *ft; 916 struct nm_bdg_q *dstq; 917 int j; 918 919 ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); 920 if (!ft) { 921 nm_free_bdgfwd(na); 922 return ENOMEM; 923 } 924 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 925 for (j = 0; j < num_dstq; j++) { 926 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 927 dstq[j].bq_len = 0; 928 } 929 kring[i].nkr_ft = ft; 930 } 931 if (nma_is_hw(na)) 932 nm_alloc_bdgfwd(SWNA(na->ifp)); 933 return 0; 934 } 935 936 937 /* 938 * Fetch configuration from the device, to cope with dynamic 939 * reconfigurations after loading the module. 
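 * If the adapter provides an nm_config callback it is queried for the
 * current number of tx/rx rings and descriptors; otherwise the values
 * captured at attach time are reused. A change is only accepted while
 * no file descriptor is registered on the interface (refcount == 0).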
940 */ 941 static int 942 netmap_update_config(struct netmap_adapter *na) 943 { 944 struct ifnet *ifp = na->ifp; 945 u_int txr, txd, rxr, rxd; 946 947 txr = txd = rxr = rxd = 0; 948 if (na->nm_config) { 949 na->nm_config(ifp, &txr, &txd, &rxr, &rxd); 950 } else { 951 /* take whatever we had at init time */ 952 txr = na->num_tx_rings; 953 txd = na->num_tx_desc; 954 rxr = na->num_rx_rings; 955 rxd = na->num_rx_desc; 956 } 957 958 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 959 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 960 return 0; /* nothing changed */ 961 if (netmap_verbose || na->refcount > 0) { 962 D("stored config %s: txring %d x %d, rxring %d x %d", 963 ifp->if_xname, 964 na->num_tx_rings, na->num_tx_desc, 965 na->num_rx_rings, na->num_rx_desc); 966 D("new config %s: txring %d x %d, rxring %d x %d", 967 ifp->if_xname, txr, txd, rxr, rxd); 968 } 969 if (na->refcount == 0) { 970 D("configuration changed (but fine)"); 971 na->num_tx_rings = txr; 972 na->num_tx_desc = txd; 973 na->num_rx_rings = rxr; 974 na->num_rx_desc = rxd; 975 return 0; 976 } 977 D("configuration changed while active, this is bad..."); 978 return 1; 979 } 980 981 static struct netmap_if * 982 netmap_if_new(const char *ifname, struct netmap_adapter *na) 983 { 984 if (netmap_update_config(na)) { 985 /* configuration mismatch, report and fail */ 986 return NULL; 987 } 988 return netmap_mem_if_new(ifname, na); 989 } 990 991 992 /* Structure associated to each thread which registered an interface. 993 * 994 * The first 4 fields of this structure are written by NIOCREGIF and 995 * read by poll() and NIOC?XSYNC. 996 * There is low contention among writers (actually, a correct user program 997 * should have no contention among writers) and among writers and readers, 998 * so we use a single global lock to protect the structure initialization. 999 * Since initialization involves the allocation of memory, we reuse the memory 1000 * allocator lock. 1001 * Read access to the structure is lock free. Readers must check that 1002 * np_nifp is not NULL before using the other fields. 1003 * If np_nifp is NULL initialization has not been performed, so they should 1004 * return an error to userlevel. 1005 * 1006 * The ref_done field is used to regulate access to the refcount in the 1007 * memory allocator. The refcount must be incremented at most once for 1008 * each open("/dev/netmap"). The increment is performed by the first 1009 * function that calls netmap_get_memory() (currently called by 1010 * mmap(), NIOCGINFO and NIOCREGIF). 1011 * If the refcount is incremented, it is then decremented when the 1012 * private structure is destroyed. 1013 */ 1014 struct netmap_priv_d { 1015 struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ 1016 1017 struct ifnet *np_ifp; /* device for which we hold a ref. */ 1018 int np_ringid; /* from the ioctl */ 1019 u_int np_qfirst, np_qlast; /* range of rings to scan */ 1020 uint16_t np_txpoll; 1021 1022 struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ 1023 #ifdef __FreeBSD__ 1024 int np_refcount; /* use with NMG_LOCK held */ 1025 #endif /* __FreeBSD__ */ 1026 }; 1027 1028 /* grab a reference to the memory allocator, if we don't have one already. The 1029 * reference is taken from the netmap_adapter registered with the priv. 
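 * If no interface has been registered yet, the global allocator is used
 * (and only if the netmap_mmap_unreg sysctl allows it, for compatibility
 * with older versions of the API); once a private allocator has been
 * recorded in np_mref, asking for a different one on the same file
 * descriptor fails with ENODEV.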
1030 * 1031 */ 1032 static int 1033 netmap_get_memory_locked(struct netmap_priv_d* p) 1034 { 1035 struct netmap_mem_d *nmd; 1036 int error = 0; 1037 1038 if (p->np_ifp == NULL) { 1039 if (!netmap_mmap_unreg) 1040 return ENODEV; 1041 /* for compatibility with older versions of the API 1042 * we use the global allocator when no interface has been 1043 * registered 1044 */ 1045 nmd = &nm_mem; 1046 } else { 1047 nmd = NA(p->np_ifp)->nm_mem; 1048 } 1049 if (p->np_mref == NULL) { 1050 error = netmap_mem_finalize(nmd); 1051 if (!error) 1052 p->np_mref = nmd; 1053 } else if (p->np_mref != nmd) { 1054 /* a virtual port has been registered, but previous 1055 * syscalls already used the global allocator. 1056 * We cannot continue 1057 */ 1058 error = ENODEV; 1059 } 1060 return error; 1061 } 1062 1063 static int 1064 netmap_get_memory(struct netmap_priv_d* p) 1065 { 1066 int error; 1067 NMG_LOCK(); 1068 error = netmap_get_memory_locked(p); 1069 NMG_UNLOCK(); 1070 return error; 1071 } 1072 1073 static int 1074 netmap_have_memory_locked(struct netmap_priv_d* p) 1075 { 1076 return p->np_mref != NULL; 1077 } 1078 1079 static void 1080 netmap_drop_memory_locked(struct netmap_priv_d* p) 1081 { 1082 if (p->np_mref) { 1083 netmap_mem_deref(p->np_mref); 1084 p->np_mref = NULL; 1085 } 1086 } 1087 1088 /* 1089 * File descriptor's private data destructor. 1090 * 1091 * Call nm_register(ifp,0) to stop netmap mode on the interface and 1092 * revert to normal operation. We expect that np_ifp has not gone. 1093 * The second argument is the nifp to work on. In some cases it is 1094 * not attached yet to the netmap_priv_d so we need to pass it as 1095 * a separate argument. 1096 */ 1097 /* call with NMG_LOCK held */ 1098 static void 1099 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 1100 { 1101 struct ifnet *ifp = priv->np_ifp; 1102 struct netmap_adapter *na = NA(ifp); 1103 1104 NMG_LOCK_ASSERT(); 1105 na->refcount--; 1106 if (na->refcount <= 0) { /* last instance */ 1107 u_int i; 1108 1109 if (netmap_verbose) 1110 D("deleting last instance for %s", ifp->if_xname); 1111 /* 1112 * (TO CHECK) This function is only called 1113 * when the last reference to this file descriptor goes 1114 * away. This means we cannot have any pending poll() 1115 * or interrupt routine operating on the structure. 1116 * XXX The file may be closed in a thread while 1117 * another thread is using it. 1118 * Linux keeps the file opened until the last reference 1119 * by any outstanding ioctl/poll or mmap is gone. 1120 * FreeBSD does not track mmap()s (but we do) and 1121 * wakes up any sleeping poll(). Need to check what 1122 * happens if the close() occurs while a concurrent 1123 * syscall is running. 1124 */ 1125 na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ 1126 /* Wake up any sleeping threads. netmap_poll will 1127 * then return POLLERR 1128 * XXX The wake up now must happen during *_down(), when 1129 * we order all activities to stop. -gl 1130 */ 1131 nm_free_bdgfwd(na); 1132 for (i = 0; i < na->num_tx_rings + 1; i++) { 1133 mtx_destroy(&na->tx_rings[i].q_lock); 1134 } 1135 for (i = 0; i < na->num_rx_rings + 1; i++) { 1136 mtx_destroy(&na->rx_rings[i].q_lock); 1137 } 1138 /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ 1139 /* knlist_destroy(&na->tx_si.si_note); */ 1140 /* knlist_destroy(&na->rx_si.si_note); */ 1141 if (nma_is_hw(na)) 1142 SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; 1143 } 1144 /* 1145 * netmap_mem_if_delete() deletes the nifp, and if this is 1146 * the last instance also buffers, rings and krings. 1147 */ 1148 netmap_mem_if_delete(na, nifp); 1149 } 1150 1151 1152 /* we assume netmap adapter exists 1153 * Called with NMG_LOCK held 1154 */ 1155 static void 1156 nm_if_rele(struct ifnet *ifp) 1157 { 1158 int i, is_hw, hw, sw, lim; 1159 struct nm_bridge *b; 1160 struct netmap_adapter *na; 1161 uint8_t tmp[NM_BDG_MAXPORTS]; 1162 1163 NMG_LOCK_ASSERT(); 1164 /* I can be called not only for get_ifp()-ed references where netmap's 1165 * capability is guaranteed, but also for non-netmap-capable NICs. 1166 */ 1167 if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { 1168 if_rele(ifp); 1169 return; 1170 } 1171 na = NA(ifp); 1172 b = na->na_bdg; 1173 is_hw = nma_is_hw(na); 1174 1175 ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); 1176 1177 if (!DROP_BDG_REF(ifp)) 1178 return; 1179 1180 /* 1181 New algorithm: 1182 make a copy of bdg_port_index; 1183 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port 1184 in the array of bdg_port_index, replacing them with 1185 entries from the bottom of the array; 1186 decrement bdg_active_ports; 1187 acquire BDG_WLOCK() and copy back the array. 1188 */ 1189 1190 hw = NA(ifp)->bdg_port; 1191 sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; 1192 lim = b->bdg_active_ports; 1193 1194 ND("detach %d and %d (lim %d)", hw, sw, lim); 1195 /* make a copy of the list of active ports, update it, 1196 * and then copy back within BDG_WLOCK(). 1197 */ 1198 memcpy(tmp, b->bdg_port_index, sizeof(tmp)); 1199 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { 1200 if (hw >= 0 && tmp[i] == hw) { 1201 ND("detach hw %d at %d", hw, i); 1202 lim--; /* point to last active port */ 1203 tmp[i] = tmp[lim]; /* swap with i */ 1204 tmp[lim] = hw; /* now this is inactive */ 1205 hw = -1; 1206 } else if (sw >= 0 && tmp[i] == sw) { 1207 ND("detach sw %d at %d", sw, i); 1208 lim--; 1209 tmp[i] = tmp[lim]; 1210 tmp[lim] = sw; 1211 sw = -1; 1212 } else { 1213 i++; 1214 } 1215 } 1216 if (hw >= 0 || sw >= 0) { 1217 D("XXX delete failed hw %d sw %d, should panic...", hw, sw); 1218 } 1219 hw = NA(ifp)->bdg_port; 1220 sw = (is_hw && SWNA(ifp)->na_bdg) ? 
SWNA(ifp)->bdg_port : -1; 1221 1222 BDG_WLOCK(b); 1223 b->bdg_ports[hw] = NULL; 1224 na->na_bdg = NULL; 1225 if (sw >= 0) { 1226 b->bdg_ports[sw] = NULL; 1227 SWNA(ifp)->na_bdg = NULL; 1228 } 1229 memcpy(b->bdg_port_index, tmp, sizeof(tmp)); 1230 b->bdg_active_ports = lim; 1231 BDG_WUNLOCK(b); 1232 1233 ND("now %d active ports", lim); 1234 if (lim == 0) { 1235 ND("marking bridge %s as free", b->bdg_basename); 1236 b->nm_bdg_lookup = NULL; 1237 } 1238 1239 if (is_hw) { 1240 if_rele(ifp); 1241 } else { 1242 if (na->na_flags & NAF_MEM_OWNER) 1243 netmap_mem_private_delete(na->nm_mem); 1244 bzero(na, sizeof(*na)); 1245 free(na, M_DEVBUF); 1246 bzero(ifp, sizeof(*ifp)); 1247 free(ifp, M_DEVBUF); 1248 } 1249 } 1250 1251 1252 /* 1253 * returns 1 if this is the last instance and we can free priv 1254 */ 1255 static int 1256 netmap_dtor_locked(struct netmap_priv_d *priv) 1257 { 1258 struct ifnet *ifp = priv->np_ifp; 1259 1260 #ifdef __FreeBSD__ 1261 /* 1262 * np_refcount is the number of active mmaps on 1263 * this file descriptor 1264 */ 1265 if (--priv->np_refcount > 0) { 1266 return 0; 1267 } 1268 #endif /* __FreeBSD__ */ 1269 if (ifp) { 1270 netmap_do_unregif(priv, priv->np_nifp); 1271 } 1272 netmap_drop_memory_locked(priv); 1273 if (ifp) { 1274 nm_if_rele(ifp); /* might also destroy *na */ 1275 } 1276 return 1; 1277 } 1278 1279 static void 1280 netmap_dtor(void *data) 1281 { 1282 struct netmap_priv_d *priv = data; 1283 int last_instance; 1284 1285 NMG_LOCK(); 1286 last_instance = netmap_dtor_locked(priv); 1287 NMG_UNLOCK(); 1288 if (last_instance) { 1289 bzero(priv, sizeof(*priv)); /* for safety */ 1290 free(priv, M_DEVBUF); 1291 } 1292 } 1293 1294 1295 #ifdef __FreeBSD__ 1296 1297 /* 1298 * In order to track whether pages are still mapped, we hook into 1299 * the standard cdev_pager and intercept the constructor and 1300 * destructor. 1301 */ 1302 1303 struct netmap_vm_handle_t { 1304 struct cdev *dev; 1305 struct netmap_priv_d *priv; 1306 }; 1307 1308 static int 1309 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, 1310 vm_ooffset_t foff, struct ucred *cred, u_short *color) 1311 { 1312 struct netmap_vm_handle_t *vmh = handle; 1313 D("handle %p size %jd prot %d foff %jd", 1314 handle, (intmax_t)size, prot, (intmax_t)foff); 1315 dev_ref(vmh->dev); 1316 return 0; 1317 } 1318 1319 1320 static void 1321 netmap_dev_pager_dtor(void *handle) 1322 { 1323 struct netmap_vm_handle_t *vmh = handle; 1324 struct cdev *dev = vmh->dev; 1325 struct netmap_priv_d *priv = vmh->priv; 1326 D("handle %p", handle); 1327 netmap_dtor(priv); 1328 free(vmh, M_DEVBUF); 1329 dev_rel(dev); 1330 } 1331 1332 static int 1333 netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, 1334 int prot, vm_page_t *mres) 1335 { 1336 struct netmap_vm_handle_t *vmh = object->handle; 1337 struct netmap_priv_d *priv = vmh->priv; 1338 vm_paddr_t paddr; 1339 vm_page_t page; 1340 vm_memattr_t memattr; 1341 vm_pindex_t pidx; 1342 1343 ND("object %p offset %jd prot %d mres %p", 1344 object, (intmax_t)offset, prot, mres); 1345 memattr = object->memattr; 1346 pidx = OFF_TO_IDX(offset); 1347 paddr = netmap_mem_ofstophys(priv->np_mref, offset); 1348 if (paddr == 0) 1349 return VM_PAGER_FAIL; 1350 1351 if (((*mres)->flags & PG_FICTITIOUS) != 0) { 1352 /* 1353 * If the passed in result page is a fake page, update it with 1354 * the new physical address. 
1355 */ 1356 page = *mres; 1357 vm_page_updatefake(page, paddr, memattr); 1358 } else { 1359 /* 1360 * Replace the passed in reqpage page with our own fake page and 1361 * free up the all of the original pages. 1362 */ 1363 #ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ 1364 #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK 1365 #define VM_OBJECT_WLOCK VM_OBJECT_LOCK 1366 #endif /* VM_OBJECT_WUNLOCK */ 1367 1368 VM_OBJECT_WUNLOCK(object); 1369 page = vm_page_getfake(paddr, memattr); 1370 VM_OBJECT_WLOCK(object); 1371 vm_page_lock(*mres); 1372 vm_page_free(*mres); 1373 vm_page_unlock(*mres); 1374 *mres = page; 1375 vm_page_insert(page, object, pidx); 1376 } 1377 page->valid = VM_PAGE_BITS_ALL; 1378 return (VM_PAGER_OK); 1379 } 1380 1381 1382 static struct cdev_pager_ops netmap_cdev_pager_ops = { 1383 .cdev_pg_ctor = netmap_dev_pager_ctor, 1384 .cdev_pg_dtor = netmap_dev_pager_dtor, 1385 .cdev_pg_fault = netmap_dev_pager_fault, 1386 }; 1387 1388 1389 static int 1390 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, 1391 vm_size_t objsize, vm_object_t *objp, int prot) 1392 { 1393 int error; 1394 struct netmap_vm_handle_t *vmh; 1395 struct netmap_priv_d *priv; 1396 vm_object_t obj; 1397 1398 D("cdev %p foff %jd size %jd objp %p prot %d", cdev, 1399 (intmax_t )*foff, (intmax_t )objsize, objp, prot); 1400 1401 vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, 1402 M_NOWAIT | M_ZERO); 1403 if (vmh == NULL) 1404 return ENOMEM; 1405 vmh->dev = cdev; 1406 1407 NMG_LOCK(); 1408 error = devfs_get_cdevpriv((void**)&priv); 1409 if (error) 1410 goto err_unlock; 1411 vmh->priv = priv; 1412 priv->np_refcount++; 1413 NMG_UNLOCK(); 1414 1415 error = netmap_get_memory(priv); 1416 if (error) 1417 goto err_deref; 1418 1419 obj = cdev_pager_allocate(vmh, OBJT_DEVICE, 1420 &netmap_cdev_pager_ops, objsize, prot, 1421 *foff, NULL); 1422 if (obj == NULL) { 1423 D("cdev_pager_allocate failed"); 1424 error = EINVAL; 1425 goto err_deref; 1426 } 1427 1428 *objp = obj; 1429 return 0; 1430 1431 err_deref: 1432 NMG_LOCK(); 1433 priv->np_refcount--; 1434 err_unlock: 1435 NMG_UNLOCK(); 1436 // err: 1437 free(vmh, M_DEVBUF); 1438 return error; 1439 } 1440 1441 1442 // XXX can we remove this ? 1443 static int 1444 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) 1445 { 1446 if (netmap_verbose) 1447 D("dev %p fflag 0x%x devtype %d td %p", 1448 dev, fflag, devtype, td); 1449 return 0; 1450 } 1451 1452 1453 static int 1454 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 1455 { 1456 struct netmap_priv_d *priv; 1457 int error; 1458 1459 (void)dev; 1460 (void)oflags; 1461 (void)devtype; 1462 (void)td; 1463 1464 // XXX wait or nowait ? 1465 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 1466 M_NOWAIT | M_ZERO); 1467 if (priv == NULL) 1468 return ENOMEM; 1469 1470 error = devfs_set_cdevpriv(priv, netmap_dtor); 1471 if (error) 1472 return error; 1473 1474 priv->np_refcount = 1; 1475 1476 return 0; 1477 } 1478 #endif /* __FreeBSD__ */ 1479 1480 1481 /* 1482 * Handlers for synchronization of the queues from/to the host. 1483 * Netmap has two operating modes: 1484 * - in the default mode, the rings connected to the host stack are 1485 * just another ring pair managed by userspace; 1486 * - in transparent mode (XXX to be defined) incoming packets 1487 * (from the host or the NIC) are marked as NS_FORWARD upon 1488 * arrival, and the user application has a chance to reset the 1489 * flag for packets that should be dropped. 
1490 * On the RXSYNC or poll(), packets in RX rings between 1491 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 1492 * to the other side. 1493 * The transfer NIC --> host is relatively easy, just encapsulate 1494 * into mbufs and we are done. The host --> NIC side is slightly 1495 * harder because there might not be room in the tx ring so it 1496 * might take a while before releasing the buffer. 1497 */ 1498 1499 1500 /* 1501 * pass a chain of buffers to the host stack as coming from 'dst' 1502 */ 1503 static void 1504 netmap_send_up(struct ifnet *dst, struct mbuf *head) 1505 { 1506 struct mbuf *m; 1507 1508 /* send packets up, outside the lock */ 1509 while ((m = head) != NULL) { 1510 head = head->m_nextpkt; 1511 m->m_nextpkt = NULL; 1512 if (netmap_verbose & NM_VERB_HOST) 1513 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 1514 NM_SEND_UP(dst, m); 1515 } 1516 } 1517 1518 struct mbq { 1519 struct mbuf *head; 1520 struct mbuf *tail; 1521 int count; 1522 }; 1523 1524 1525 /* 1526 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 1527 * Run from hwcur to cur - reserved 1528 */ 1529 static void 1530 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 1531 { 1532 /* Take packets from hwcur to cur-reserved and pass them up. 1533 * In case of no buffers we give up. At the end of the loop, 1534 * the queue is drained in all cases. 1535 * XXX handle reserved 1536 */ 1537 u_int lim = kring->nkr_num_slots - 1; 1538 struct mbuf *m, *tail = q->tail; 1539 u_int k = kring->ring->cur, n = kring->ring->reserved; 1540 struct netmap_mem_d *nmd = kring->na->nm_mem; 1541 1542 /* compute the final position, ring->cur - ring->reserved */ 1543 if (n > 0) { 1544 if (k < n) 1545 k += kring->nkr_num_slots; 1546 k += n; 1547 } 1548 for (n = kring->nr_hwcur; n != k;) { 1549 struct netmap_slot *slot = &kring->ring->slot[n]; 1550 1551 n = nm_next(n, lim); 1552 if ((slot->flags & NS_FORWARD) == 0 && !force) 1553 continue; 1554 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { 1555 D("bad pkt at %d len %d", n, slot->len); 1556 continue; 1557 } 1558 slot->flags &= ~NS_FORWARD; // XXX needed ? 1559 /* XXX adapt to the case of a multisegment packet */ 1560 m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); 1561 1562 if (m == NULL) 1563 break; 1564 if (tail) 1565 tail->m_nextpkt = m; 1566 else 1567 q->head = m; 1568 tail = m; 1569 q->count++; 1570 m->m_nextpkt = NULL; 1571 } 1572 q->tail = tail; 1573 } 1574 1575 1576 /* 1577 * The host ring has packets from nr_hwcur to (cur - reserved) 1578 * to be sent down to the NIC. 1579 * We need to use the queue lock on the source (host RX ring) 1580 * to protect against netmap_transmit. 1581 * If the user is well behaved we do not need to acquire locks 1582 * on the destination(s), 1583 * so we only need to make sure that there are no panics because 1584 * of user errors. 1585 * XXX verify 1586 * 1587 * We scan the tx rings, which have just been 1588 * flushed so nr_hwcur == cur. Pushing packets down means 1589 * increment cur and decrement avail. 
1590 * XXX to be verified 1591 */ 1592 static void 1593 netmap_sw_to_nic(struct netmap_adapter *na) 1594 { 1595 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1596 struct netmap_kring *k1 = &na->tx_rings[0]; 1597 u_int i, howmany, src_lim, dst_lim; 1598 1599 /* XXX we should also check that the carrier is on */ 1600 if (kring->nkr_stopped) 1601 return; 1602 1603 mtx_lock(&kring->q_lock); 1604 1605 if (kring->nkr_stopped) 1606 goto out; 1607 1608 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 1609 1610 src_lim = kring->nkr_num_slots - 1; 1611 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 1612 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 1613 dst_lim = k1->nkr_num_slots - 1; 1614 while (howmany > 0 && k1->ring->avail > 0) { 1615 struct netmap_slot *src, *dst, tmp; 1616 src = &kring->ring->slot[kring->nr_hwcur]; 1617 dst = &k1->ring->slot[k1->ring->cur]; 1618 tmp = *src; 1619 src->buf_idx = dst->buf_idx; 1620 src->flags = NS_BUF_CHANGED; 1621 1622 dst->buf_idx = tmp.buf_idx; 1623 dst->len = tmp.len; 1624 dst->flags = NS_BUF_CHANGED; 1625 ND("out len %d buf %d from %d to %d", 1626 dst->len, dst->buf_idx, 1627 kring->nr_hwcur, k1->ring->cur); 1628 1629 kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 1630 howmany--; 1631 kring->nr_hwavail--; 1632 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 1633 k1->ring->avail--; 1634 } 1635 kring->ring->cur = kring->nr_hwcur; // XXX 1636 k1++; // XXX why? 1637 } 1638 out: 1639 mtx_unlock(&kring->q_lock); 1640 } 1641 1642 1643 /* 1644 * netmap_txsync_to_host() passes packets up. We are called from a 1645 * system call in user process context, and the only contention 1646 * can be among multiple user threads erroneously calling 1647 * this routine concurrently. 1648 */ 1649 static void 1650 netmap_txsync_to_host(struct netmap_adapter *na) 1651 { 1652 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 1653 struct netmap_ring *ring = kring->ring; 1654 u_int k, lim = kring->nkr_num_slots - 1; 1655 struct mbq q = { NULL, NULL, 0 }; 1656 1657 if (nm_kr_tryget(kring)) { 1658 D("ring %p busy (user error)", kring); 1659 return; 1660 } 1661 k = ring->cur; 1662 if (k > lim) { 1663 D("invalid ring index in stack TX kring %p", kring); 1664 netmap_ring_reinit(kring); 1665 nm_kr_put(kring); 1666 return; 1667 } 1668 1669 /* Take packets from hwcur to cur and pass them up. 1670 * In case of no buffers we give up. At the end of the loop, 1671 * the queue is drained in all cases. 1672 */ 1673 netmap_grab_packets(kring, &q, 1); 1674 kring->nr_hwcur = k; 1675 kring->nr_hwavail = ring->avail = lim; 1676 1677 nm_kr_put(kring); 1678 netmap_send_up(na->ifp, q.head); 1679 } 1680 1681 1682 /* 1683 * This is the 'txsync' handler to send from a software ring to the 1684 * host stack. 1685 */ 1686 /* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ 1687 static int 1688 netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) 1689 { 1690 (void)ring_nr; 1691 (void)flags; 1692 if (netmap_verbose > 255) 1693 RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); 1694 netmap_txsync_to_host(NA(ifp)); 1695 return 0; 1696 } 1697 1698 1699 /* 1700 * rxsync backend for packets coming from the host stack. 1701 * They have been put in the queue by netmap_transmit() so we 1702 * need to protect access to the kring using a lock. 1703 * 1704 * This routine also does the selrecord if called from the poll handler 1705 * (we know because td != NULL). 
1706 * 1707 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 1708 * as an additional hidden argument. 1709 */ 1710 static void 1711 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 1712 { 1713 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1714 struct netmap_ring *ring = kring->ring; 1715 u_int j, n, lim = kring->nkr_num_slots; 1716 u_int k = ring->cur, resvd = ring->reserved; 1717 1718 (void)pwait; /* disable unused warnings */ 1719 1720 if (kring->nkr_stopped) /* check a first time without lock */ 1721 return; 1722 1723 /* XXX as an optimization we could reuse na->core_lock */ 1724 mtx_lock(&kring->q_lock); 1725 1726 if (kring->nkr_stopped) /* check again with lock held */ 1727 goto unlock_out; 1728 1729 if (k >= lim) { 1730 netmap_ring_reinit(kring); 1731 goto unlock_out; 1732 } 1733 /* new packets are already set in nr_hwavail */ 1734 /* skip past packets that userspace has released */ 1735 j = kring->nr_hwcur; 1736 if (resvd > 0) { 1737 if (resvd + ring->avail >= lim + 1) { 1738 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 1739 ring->reserved = resvd = 0; // XXX panic... 1740 } 1741 k = (k >= resvd) ? k - resvd : k + lim - resvd; 1742 } 1743 if (j != k) { 1744 n = k >= j ? k - j : k + lim - j; 1745 kring->nr_hwavail -= n; 1746 kring->nr_hwcur = k; 1747 } 1748 k = ring->avail = kring->nr_hwavail - resvd; 1749 if (k == 0 && td) 1750 selrecord(td, &kring->si); 1751 if (k && (netmap_verbose & NM_VERB_HOST)) 1752 D("%d pkts from stack", k); 1753 unlock_out: 1754 1755 mtx_unlock(&kring->q_lock); 1756 } 1757 1758 1759 /* 1760 * MUST BE CALLED UNDER NMG_LOCK() 1761 * 1762 * get a refcounted reference to an interface. 1763 * This is always called in the execution of an ioctl(). 1764 * 1765 * Return ENXIO if the interface does not exist, EINVAL if netmap 1766 * is not supported by the interface. 1767 * If successful, hold a reference. 1768 * 1769 * When the NIC is attached to a bridge, reference is managed 1770 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1771 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1772 * is detached from the bridge, then ifp's refcount is dropped (this 1773 * is equivalent to that ifp is destroyed in case of virtual ports. 1774 * 1775 * This function uses if_rele() when we want to prevent the NIC from 1776 * being detached from the bridge in error handling. But once refcount 1777 * is acquired by this function, it must be released using nm_if_rele(). 1778 */ 1779 static int 1780 get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) 1781 { 1782 const char *name = nmr->nr_name; 1783 int namelen = strlen(name); 1784 struct ifnet *iter = NULL; 1785 int no_prefix = 0; 1786 1787 /* first try to see if this is a bridge port. */ 1788 struct nm_bridge *b; 1789 struct netmap_adapter *na; 1790 int i, j, cand = -1, cand2 = -1; 1791 int needed; 1792 1793 NMG_LOCK_ASSERT(); 1794 *ifp = NULL; /* default */ 1795 if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { 1796 no_prefix = 1; /* no VALE prefix */ 1797 goto no_bridge_port; 1798 } 1799 1800 b = nm_find_bridge(name, create); 1801 if (b == NULL) { 1802 D("no bridges available for '%s'", name); 1803 return (ENXIO); 1804 } 1805 1806 /* Now we are sure that name starts with the bridge's name, 1807 * lookup the port in the bridge. We need to scan the entire 1808 * list. 
It is not important to hold a WLOCK on the bridge 1809 * during the search because NMG_LOCK already guarantees 1810 * that there are no other possible writers. 1811 */ 1812 1813 /* lookup in the local list of ports */ 1814 for (j = 0; j < b->bdg_active_ports; j++) { 1815 i = b->bdg_port_index[j]; 1816 na = b->bdg_ports[i]; 1817 // KASSERT(na != NULL); 1818 iter = na->ifp; 1819 /* XXX make sure the name only contains one : */ 1820 if (!strcmp(iter->if_xname, name) /* virtual port */ || 1821 (namelen > b->bdg_namelen && !strcmp(iter->if_xname, 1822 name + b->bdg_namelen + 1)) /* NIC */) { 1823 ADD_BDG_REF(iter); 1824 ND("found existing if %s refs %d", name, 1825 NA(iter)->na_bdg_refcount); 1826 *ifp = iter; 1827 /* we are done, this is surely netmap capable */ 1828 return 0; 1829 } 1830 } 1831 /* not found, should we create it? */ 1832 if (!create) 1833 return ENXIO; 1834 /* yes we should, see if we have space to attach entries */ 1835 needed = 2; /* in some cases we only need 1 */ 1836 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { 1837 D("bridge full %d, cannot create new port", b->bdg_active_ports); 1838 return EINVAL; 1839 } 1840 /* record the next two ports available, but do not allocate yet */ 1841 cand = b->bdg_port_index[b->bdg_active_ports]; 1842 cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; 1843 ND("+++ bridge %s port %s used %d avail %d %d", 1844 b->bdg_basename, name, b->bdg_active_ports, cand, cand2); 1845 1846 /* 1847 * try see if there is a matching NIC with this name 1848 * (after the bridge's name) 1849 */ 1850 iter = ifunit_ref(name + b->bdg_namelen + 1); 1851 if (!iter) { /* this is a virtual port */ 1852 /* Create a temporary NA with arguments, then 1853 * bdg_netmap_attach() will allocate the real one 1854 * and attach it to the ifp 1855 */ 1856 struct netmap_adapter tmp_na; 1857 1858 if (nmr->nr_cmd) { 1859 /* nr_cmd must be 0 for a virtual port */ 1860 return EINVAL; 1861 } 1862 bzero(&tmp_na, sizeof(tmp_na)); 1863 /* bound checking */ 1864 tmp_na.num_tx_rings = nmr->nr_tx_rings; 1865 nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1866 nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back 1867 tmp_na.num_rx_rings = nmr->nr_rx_rings; 1868 nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1869 nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back 1870 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1871 1, NM_BDG_MAXSLOTS, NULL); 1872 tmp_na.num_tx_desc = nmr->nr_tx_slots; 1873 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1874 1, NM_BDG_MAXSLOTS, NULL); 1875 tmp_na.num_rx_desc = nmr->nr_rx_slots; 1876 1877 /* create a struct ifnet for the new port. 
1878 * need M_NOWAIT as we are under nma_lock 1879 */ 1880 iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); 1881 if (!iter) 1882 return ENOMEM; 1883 1884 strcpy(iter->if_xname, name); 1885 tmp_na.ifp = iter; 1886 /* bdg_netmap_attach creates a struct netmap_adapter */ 1887 bdg_netmap_attach(&tmp_na); 1888 cand2 = -1; /* only need one port */ 1889 } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ 1890 /* make sure the NIC is not already in use */ 1891 if (NETMAP_OWNED_BY_ANY(iter)) { 1892 D("NIC %s busy, cannot attach to bridge", 1893 iter->if_xname); 1894 if_rele(iter); /* don't detach from bridge */ 1895 return EINVAL; 1896 } 1897 if (nmr->nr_arg1 != NETMAP_BDG_HOST) 1898 cand2 = -1; /* only need one port */ 1899 } else { /* not a netmap-capable NIC */ 1900 if_rele(iter); /* don't detach from bridge */ 1901 return EINVAL; 1902 } 1903 na = NA(iter); 1904 1905 BDG_WLOCK(b); 1906 na->bdg_port = cand; 1907 ND("NIC %p to bridge port %d", NA(iter), cand); 1908 /* bind the port to the bridge (virtual ports are not active) */ 1909 b->bdg_ports[cand] = na; 1910 na->na_bdg = b; 1911 b->bdg_active_ports++; 1912 if (cand2 >= 0) { 1913 /* also bind the host stack to the bridge */ 1914 b->bdg_ports[cand2] = SWNA(iter); 1915 SWNA(iter)->bdg_port = cand2; 1916 SWNA(iter)->na_bdg = b; 1917 b->bdg_active_ports++; 1918 ND("host %p to bridge port %d", SWNA(iter), cand2); 1919 } 1920 ADD_BDG_REF(iter); // XXX one or two ? 1921 ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); 1922 BDG_WUNLOCK(b); 1923 *ifp = iter; 1924 return 0; 1925 1926 no_bridge_port: 1927 *ifp = iter; 1928 if (! *ifp) 1929 *ifp = ifunit_ref(name); 1930 if (*ifp == NULL) 1931 return (ENXIO); 1932 1933 if (NETMAP_CAPABLE(*ifp)) { 1934 /* Users cannot use the NIC attached to a bridge directly */ 1935 if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { 1936 if_rele(*ifp); /* don't detach from bridge */ 1937 return EINVAL; 1938 } else 1939 return 0; /* valid pointer, we hold the refcount */ 1940 } 1941 nm_if_rele(*ifp); 1942 return EINVAL; // not NETMAP capable 1943 } 1944 1945 1946 /* 1947 * Error routine called when txsync/rxsync detects an error. 1948 * Can't do much more than resetting cur = hwcur, avail = hwavail. 1949 * Return 1 on reinit. 1950 * 1951 * This routine is only called by the upper half of the kernel. 1952 * It only reads hwcur (which is changed only by the upper half, too) 1953 * and hwavail (which may be changed by the lower half, but only on 1954 * a tx ring and only to increase it, so any error will be recovered 1955 * on the next call). For the above, we don't strictly need to call 1956 * it under lock. 
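 *
 * As a hedged usage sketch (a hypothetical foo_txsync(), not code from
 * this file), a driver that spots an out-of-range cursor simply bails
 * out through this routine:
 *
 *      u_int k = ring->cur, lim = kring->nkr_num_slots - 1;
 *
 *      if (k > lim)
 *              return netmap_ring_reinit(kring);
 *
 * and userspace then sees the repaired cur/avail values.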
1957 */ 1958 int 1959 netmap_ring_reinit(struct netmap_kring *kring) 1960 { 1961 struct netmap_ring *ring = kring->ring; 1962 u_int i, lim = kring->nkr_num_slots - 1; 1963 int errors = 0; 1964 1965 // XXX KASSERT nm_kr_tryget 1966 RD(10, "called for %s", kring->na->ifp->if_xname); 1967 if (ring->cur > lim) 1968 errors++; 1969 for (i = 0; i <= lim; i++) { 1970 u_int idx = ring->slot[i].buf_idx; 1971 u_int len = ring->slot[i].len; 1972 if (idx < 2 || idx >= netmap_total_buffers) { 1973 if (!errors++) 1974 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 1975 ring->slot[i].buf_idx = 0; 1976 ring->slot[i].len = 0; 1977 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1978 ring->slot[i].len = 0; 1979 if (!errors++) 1980 D("bad len %d at slot %d idx %d", 1981 len, i, idx); 1982 } 1983 } 1984 if (errors) { 1985 int pos = kring - kring->na->tx_rings; 1986 int n = kring->na->num_tx_rings + 1; 1987 1988 RD(10, "total %d errors", errors); 1989 errors++; 1990 RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 1991 kring->na->ifp->if_xname, 1992 pos < n ? "TX" : "RX", pos < n ? pos : pos - n, 1993 ring->cur, kring->nr_hwcur, 1994 ring->avail, kring->nr_hwavail); 1995 ring->cur = kring->nr_hwcur; 1996 ring->avail = kring->nr_hwavail; 1997 } 1998 return (errors ? 1 : 0); 1999 } 2000 2001 2002 /* 2003 * Set the ring ID. For devices with a single queue, a request 2004 * for all rings is the same as a single ring. 2005 */ 2006 static int 2007 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 2008 { 2009 struct ifnet *ifp = priv->np_ifp; 2010 struct netmap_adapter *na = NA(ifp); 2011 u_int i = ringid & NETMAP_RING_MASK; 2012 /* initially (np_qfirst == np_qlast) we don't want to lock */ 2013 u_int lim = na->num_rx_rings; 2014 2015 if (na->num_tx_rings > lim) 2016 lim = na->num_tx_rings; 2017 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 2018 D("invalid ring id %d", i); 2019 return (EINVAL); 2020 } 2021 priv->np_ringid = ringid; 2022 if (ringid & NETMAP_SW_RING) { 2023 priv->np_qfirst = NETMAP_SW_RING; 2024 priv->np_qlast = 0; 2025 } else if (ringid & NETMAP_HW_RING) { 2026 priv->np_qfirst = i; 2027 priv->np_qlast = i + 1; 2028 } else { 2029 priv->np_qfirst = 0; 2030 priv->np_qlast = NETMAP_HW_RING ; 2031 } 2032 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 2033 if (netmap_verbose) { 2034 if (ringid & NETMAP_SW_RING) 2035 D("ringid %s set to SW RING", ifp->if_xname); 2036 else if (ringid & NETMAP_HW_RING) 2037 D("ringid %s set to HW RING %d", ifp->if_xname, 2038 priv->np_qfirst); 2039 else 2040 D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); 2041 } 2042 return 0; 2043 } 2044 2045 2046 /* 2047 * possibly move the interface to netmap-mode. 2048 * If success it returns a pointer to netmap_if, otherwise NULL. 2049 * This must be called with NMG_LOCK held. 
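 *
 * A minimal sketch of the expected calling sequence (it mirrors what
 * nm_bdg_attach() and the NIOCREGIF handler below do; reference
 * counting and error recovery omitted):
 *
 *      NMG_LOCK();
 *      error = get_ifp(nmr, &ifp, 1);
 *      if (error == 0)
 *              nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
 *      NMG_UNLOCK();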
2050 */ 2051 static struct netmap_if * 2052 netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, 2053 uint16_t ringid, int *err) 2054 { 2055 struct netmap_adapter *na = NA(ifp); 2056 struct netmap_if *nifp = NULL; 2057 int error, need_mem; 2058 2059 NMG_LOCK_ASSERT(); 2060 /* ring configuration may have changed, fetch from the card */ 2061 netmap_update_config(na); 2062 priv->np_ifp = ifp; /* store the reference */ 2063 error = netmap_set_ringid(priv, ringid); 2064 if (error) 2065 goto out; 2066 /* ensure allocators are ready */ 2067 need_mem = !netmap_have_memory_locked(priv); 2068 if (need_mem) { 2069 error = netmap_get_memory_locked(priv); 2070 ND("get_memory returned %d", error); 2071 if (error) 2072 goto out; 2073 } 2074 nifp = netmap_if_new(ifp->if_xname, na); 2075 if (nifp == NULL) { /* allocation failed */ 2076 /* we should drop the allocator, but only 2077 * if we were the ones who grabbed it 2078 */ 2079 if (need_mem) 2080 netmap_drop_memory_locked(priv); 2081 error = ENOMEM; 2082 goto out; 2083 } 2084 na->refcount++; 2085 if (ifp->if_capenable & IFCAP_NETMAP) { 2086 /* was already set */ 2087 } else { 2088 u_int i; 2089 /* Otherwise set the card in netmap mode 2090 * and make it use the shared buffers. 2091 * 2092 * If the interface is attached to a bridge, lock it. 2093 */ 2094 if (NETMAP_OWNED_BY_KERN(ifp)) 2095 BDG_WLOCK(NA(ifp)->na_bdg); 2096 for (i = 0 ; i < na->num_tx_rings + 1; i++) 2097 mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", 2098 NULL, MTX_DEF); 2099 for (i = 0 ; i < na->num_rx_rings + 1; i++) { 2100 mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", 2101 NULL, MTX_DEF); 2102 } 2103 if (nma_is_hw(na)) { 2104 SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; 2105 SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; 2106 } 2107 /* 2108 * do not core lock because the race is harmless here, 2109 * there cannot be any traffic to netmap_transmit() 2110 */ 2111 error = na->nm_register(ifp, 1); /* mode on */ 2112 // XXX do we need to nm_alloc_bdgfwd() in all cases ? 2113 if (!error) 2114 error = nm_alloc_bdgfwd(na); 2115 if (error) { 2116 netmap_do_unregif(priv, nifp); 2117 nifp = NULL; 2118 } 2119 if (NETMAP_OWNED_BY_KERN(ifp)) 2120 BDG_WUNLOCK(NA(ifp)->na_bdg); 2121 2122 } 2123 out: 2124 *err = error; 2125 if (nifp != NULL) { 2126 /* 2127 * advertise that the interface is ready bt setting ni_nifp. 2128 * The barrier is needed because readers (poll and *SYNC) 2129 * check for priv->np_nifp != NULL without locking 2130 */ 2131 wmb(); /* make sure previous writes are visible to all CPUs */ 2132 priv->np_nifp = nifp; 2133 } 2134 return nifp; 2135 } 2136 2137 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ 2138 static int 2139 nm_bdg_attach(struct nmreq *nmr) 2140 { 2141 struct ifnet *ifp; 2142 struct netmap_if *nifp; 2143 struct netmap_priv_d *npriv; 2144 int error; 2145 2146 npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); 2147 if (npriv == NULL) 2148 return ENOMEM; 2149 NMG_LOCK(); 2150 error = get_ifp(nmr, &ifp, 1 /* create if not exists */); 2151 if (error) /* no device, or another bridge or user owns the device */ 2152 goto unlock_exit; 2153 /* get_ifp() sets na_bdg if this is a physical interface 2154 * that we can attach to a switch. 2155 */ 2156 if (!NETMAP_OWNED_BY_KERN(ifp)) { 2157 /* got reference to a virtual port or direct access to a NIC. 
2158 * perhaps specified no bridge prefix or wrong NIC name 2159 */ 2160 error = EINVAL; 2161 goto unref_exit; 2162 } 2163 2164 if (NA(ifp)->refcount > 0) { /* already registered */ 2165 error = EBUSY; 2166 DROP_BDG_REF(ifp); 2167 goto unlock_exit; 2168 } 2169 2170 nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error); 2171 if (!nifp) { 2172 goto unref_exit; 2173 } 2174 2175 NA(ifp)->na_kpriv = npriv; 2176 NMG_UNLOCK(); 2177 ND("registered %s to netmap-mode", ifp->if_xname); 2178 return 0; 2179 2180 unref_exit: 2181 nm_if_rele(ifp); 2182 unlock_exit: 2183 NMG_UNLOCK(); 2184 bzero(npriv, sizeof(*npriv)); 2185 free(npriv, M_DEVBUF); 2186 return error; 2187 } 2188 2189 static int 2190 nm_bdg_detach(struct nmreq *nmr) 2191 { 2192 struct ifnet *ifp; 2193 int error; 2194 int last_instance; 2195 2196 NMG_LOCK(); 2197 error = get_ifp(nmr, &ifp, 0 /* don't create */); 2198 if (error) { /* no device, or another bridge or user owns the device */ 2199 goto unlock_exit; 2200 } 2201 /* XXX do we need to check this ? */ 2202 if (!NETMAP_OWNED_BY_KERN(ifp)) { 2203 /* got reference to a virtual port or direct access to a NIC. 2204 * perhaps specified no bridge's prefix or wrong NIC's name 2205 */ 2206 error = EINVAL; 2207 goto unref_exit; 2208 } 2209 2210 if (NA(ifp)->refcount == 0) { /* not registered */ 2211 error = EINVAL; 2212 goto unref_exit; 2213 } 2214 2215 DROP_BDG_REF(ifp); /* the one from get_ifp */ 2216 last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */ 2217 NMG_UNLOCK(); 2218 if (!last_instance) { 2219 D("--- error, trying to detach an entry with active mmaps"); 2220 error = EINVAL; 2221 } else { 2222 struct netmap_priv_d *npriv = NA(ifp)->na_kpriv; 2223 NA(ifp)->na_kpriv = NULL; 2224 2225 bzero(npriv, sizeof(*npriv)); 2226 free(npriv, M_DEVBUF); 2227 } 2228 return error; 2229 2230 unref_exit: 2231 nm_if_rele(ifp); 2232 unlock_exit: 2233 NMG_UNLOCK(); 2234 return error; 2235 } 2236 2237 2238 /* Initialize necessary fields of sw adapter located in right after hw's 2239 * one. sw adapter attaches a pair of sw rings of the netmap-mode NIC. 2240 * It is always activated and deactivated at the same tie with the hw's one. 2241 * Thus we don't need refcounting on the sw adapter. 2242 * Regardless of NIC's feature we use separate lock so that anybody can lock 2243 * me independently from the hw adapter. 2244 * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw 2245 */ 2246 static void 2247 netmap_attach_sw(struct ifnet *ifp) 2248 { 2249 struct netmap_adapter *hw_na = NA(ifp); 2250 struct netmap_adapter *na = SWNA(ifp); 2251 2252 na->ifp = ifp; 2253 na->num_rx_rings = na->num_tx_rings = 1; 2254 na->num_tx_desc = hw_na->num_tx_desc; 2255 na->num_rx_desc = hw_na->num_rx_desc; 2256 na->nm_txsync = netmap_bdg_to_host; 2257 /* we use the same memory allocator as the 2258 * the hw adapter */ 2259 na->nm_mem = hw_na->nm_mem; 2260 } 2261 2262 2263 /* exported to kernel callers, e.g. OVS ? 2264 * Entry point. 2265 * Called without NMG_LOCK. 
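 *
 * A hedged sketch of how an in-kernel client could drive this entry
 * point (the switch and NIC names are made up; error handling omitted):
 *
 *      struct nmreq nmr;
 *      int error;
 *
 *      bzero(&nmr, sizeof(nmr));
 *      strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name) - 1);
 *      nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *      error = netmap_bdg_ctl(&nmr, NULL);
 *
 * A later call with nr_cmd = NETMAP_BDG_DETACH undoes the binding.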
2266 */ 2267 int 2268 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) 2269 { 2270 struct nm_bridge *b; 2271 struct netmap_adapter *na; 2272 struct ifnet *iter; 2273 char *name = nmr->nr_name; 2274 int cmd = nmr->nr_cmd, namelen = strlen(name); 2275 int error = 0, i, j; 2276 2277 switch (cmd) { 2278 case NETMAP_BDG_ATTACH: 2279 error = nm_bdg_attach(nmr); 2280 break; 2281 2282 case NETMAP_BDG_DETACH: 2283 error = nm_bdg_detach(nmr); 2284 break; 2285 2286 case NETMAP_BDG_LIST: 2287 /* this is used to enumerate bridges and ports */ 2288 if (namelen) { /* look up indexes of bridge and port */ 2289 if (strncmp(name, NM_NAME, strlen(NM_NAME))) { 2290 error = EINVAL; 2291 break; 2292 } 2293 NMG_LOCK(); 2294 b = nm_find_bridge(name, 0 /* don't create */); 2295 if (!b) { 2296 error = ENOENT; 2297 NMG_UNLOCK(); 2298 break; 2299 } 2300 2301 error = ENOENT; 2302 for (j = 0; j < b->bdg_active_ports; j++) { 2303 i = b->bdg_port_index[j]; 2304 na = b->bdg_ports[i]; 2305 if (na == NULL) { 2306 D("---AAAAAAAAARGH-------"); 2307 continue; 2308 } 2309 iter = na->ifp; 2310 /* the former and the latter identify a 2311 * virtual port and a NIC, respectively 2312 */ 2313 if (!strcmp(iter->if_xname, name) || 2314 (namelen > b->bdg_namelen && 2315 !strcmp(iter->if_xname, 2316 name + b->bdg_namelen + 1))) { 2317 /* bridge index */ 2318 nmr->nr_arg1 = b - nm_bridges; 2319 nmr->nr_arg2 = i; /* port index */ 2320 error = 0; 2321 break; 2322 } 2323 } 2324 NMG_UNLOCK(); 2325 } else { 2326 /* return the first non-empty entry starting from 2327 * bridge nr_arg1 and port nr_arg2. 2328 * 2329 * Users can detect the end of the same bridge by 2330 * seeing the new and old value of nr_arg1, and can 2331 * detect the end of all the bridge by error != 0 2332 */ 2333 i = nmr->nr_arg1; 2334 j = nmr->nr_arg2; 2335 2336 NMG_LOCK(); 2337 for (error = ENOENT; i < NM_BRIDGES; i++) { 2338 b = nm_bridges + i; 2339 if (j >= b->bdg_active_ports) { 2340 j = 0; /* following bridges scan from 0 */ 2341 continue; 2342 } 2343 nmr->nr_arg1 = i; 2344 nmr->nr_arg2 = j; 2345 j = b->bdg_port_index[j]; 2346 na = b->bdg_ports[j]; 2347 iter = na->ifp; 2348 strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); 2349 error = 0; 2350 break; 2351 } 2352 NMG_UNLOCK(); 2353 } 2354 break; 2355 2356 case NETMAP_BDG_LOOKUP_REG: 2357 /* register a lookup function to the given bridge. 2358 * nmr->nr_name may be just bridge's name (including ':' 2359 * if it is not just NM_NAME). 2360 */ 2361 if (!func) { 2362 error = EINVAL; 2363 break; 2364 } 2365 NMG_LOCK(); 2366 b = nm_find_bridge(name, 0 /* don't create */); 2367 if (!b) { 2368 error = EINVAL; 2369 } else { 2370 b->nm_bdg_lookup = func; 2371 } 2372 NMG_UNLOCK(); 2373 break; 2374 2375 default: 2376 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); 2377 error = EINVAL; 2378 break; 2379 } 2380 return error; 2381 } 2382 2383 2384 /* 2385 * ioctl(2) support for the "netmap" device. 2386 * 2387 * Following a list of accepted commands: 2388 * - NIOCGINFO 2389 * - SIOCGIFADDR just for convenience 2390 * - NIOCREGIF 2391 * - NIOCUNREGIF 2392 * - NIOCTXSYNC 2393 * - NIOCRXSYNC 2394 * 2395 * Return 0 on success, errno otherwise. 
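 *
 * A hedged sketch of the userspace side of these commands (error checks
 * and headers omitted; "em1" is a made-up NIC name and NETMAP_IF() is
 * assumed to be the lookup helper from the userspace headers):
 *
 *      struct nmreq req;
 *      struct netmap_if *nifp;
 *      void *mem;
 *      int fd = open("/dev/netmap", O_RDWR);
 *
 *      bzero(&req, sizeof(req));
 *      req.nr_version = NETMAP_API;
 *      strncpy(req.nr_name, "em1", sizeof(req.nr_name) - 1);
 *      ioctl(fd, NIOCREGIF, &req);             (bind fd to the NIC)
 *      mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
 *              MAP_SHARED, fd, 0);             (map the shared region)
 *      nifp = NETMAP_IF(mem, req.nr_offset);
 *      ioctl(fd, NIOCTXSYNC, NULL);            (push pending packets)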
2396 */ 2397 static int 2398 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 2399 int fflag, struct thread *td) 2400 { 2401 struct netmap_priv_d *priv = NULL; 2402 struct ifnet *ifp = NULL; 2403 struct nmreq *nmr = (struct nmreq *) data; 2404 struct netmap_adapter *na = NULL; 2405 int error; 2406 u_int i, lim; 2407 struct netmap_if *nifp; 2408 struct netmap_kring *krings; 2409 2410 (void)dev; /* UNUSED */ 2411 (void)fflag; /* UNUSED */ 2412 #ifdef linux 2413 #define devfs_get_cdevpriv(pp) \ 2414 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 2415 (*pp ? 0 : ENOENT); }) 2416 2417 /* devfs_set_cdevpriv cannot fail on linux */ 2418 #define devfs_set_cdevpriv(p, fn) \ 2419 ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) 2420 2421 2422 #define devfs_clear_cdevpriv() do { \ 2423 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 2424 } while (0) 2425 #endif /* linux */ 2426 2427 CURVNET_SET(TD_TO_VNET(td)); 2428 2429 error = devfs_get_cdevpriv((void **)&priv); 2430 if (error) { 2431 CURVNET_RESTORE(); 2432 /* XXX ENOENT should be impossible, since the priv 2433 * is now created in the open */ 2434 return (error == ENOENT ? ENXIO : error); 2435 } 2436 2437 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 2438 switch (cmd) { 2439 case NIOCGINFO: /* return capabilities etc */ 2440 if (nmr->nr_version != NETMAP_API) { 2441 #ifdef TEST_STUFF 2442 /* some test code for locks etc */ 2443 if (nmr->nr_version == 666) { 2444 error = nm_test(nmr); 2445 break; 2446 } 2447 #endif /* TEST_STUFF */ 2448 D("API mismatch got %d have %d", 2449 nmr->nr_version, NETMAP_API); 2450 nmr->nr_version = NETMAP_API; 2451 error = EINVAL; 2452 break; 2453 } 2454 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 2455 error = netmap_bdg_ctl(nmr, NULL); 2456 break; 2457 } 2458 2459 NMG_LOCK(); 2460 do { 2461 /* memsize is always valid */ 2462 struct netmap_mem_d *nmd = &nm_mem; 2463 u_int memflags; 2464 2465 if (nmr->nr_name[0] != '\0') { 2466 /* get a refcount */ 2467 error = get_ifp(nmr, &ifp, 1 /* create */); 2468 if (error) 2469 break; 2470 na = NA(ifp); /* retrieve the netmap adapter */ 2471 nmd = na->nm_mem; /* and its memory allocator */ 2472 } 2473 2474 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 2475 if (error) 2476 break; 2477 if (na == NULL) /* only memory info */ 2478 break; 2479 nmr->nr_offset = 0; 2480 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 2481 netmap_update_config(na); 2482 nmr->nr_rx_rings = na->num_rx_rings; 2483 nmr->nr_tx_rings = na->num_tx_rings; 2484 nmr->nr_rx_slots = na->num_rx_desc; 2485 nmr->nr_tx_slots = na->num_tx_desc; 2486 if (memflags & NETMAP_MEM_PRIVATE) 2487 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2488 } while (0); 2489 if (ifp) 2490 nm_if_rele(ifp); /* return the refcount */ 2491 NMG_UNLOCK(); 2492 break; 2493 2494 case NIOCREGIF: 2495 if (nmr->nr_version != NETMAP_API) { 2496 nmr->nr_version = NETMAP_API; 2497 error = EINVAL; 2498 break; 2499 } 2500 /* possibly attach/detach NIC and VALE switch */ 2501 i = nmr->nr_cmd; 2502 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { 2503 error = netmap_bdg_ctl(nmr, NULL); 2504 break; 2505 } else if (i != 0) { 2506 D("nr_cmd must be 0 not %d", i); 2507 error = EINVAL; 2508 break; 2509 } 2510 2511 /* protect access to priv from concurrent NIOCREGIF */ 2512 NMG_LOCK(); 2513 do { 2514 u_int memflags; 2515 2516 if (priv->np_ifp != NULL) { /* thread already registered */ 2517 error = netmap_set_ringid(priv, nmr->nr_ringid); 2518 break; 2519 } 2520 /* find the interface and 
a reference */ 2521 error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ 2522 if (error) 2523 break; 2524 if (NETMAP_OWNED_BY_KERN(ifp)) { 2525 nm_if_rele(ifp); 2526 error = EBUSY; 2527 break; 2528 } 2529 nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); 2530 if (!nifp) { /* reg. failed, release priv and ref */ 2531 nm_if_rele(ifp); /* return the refcount */ 2532 priv->np_ifp = NULL; 2533 priv->np_nifp = NULL; 2534 break; 2535 } 2536 2537 /* return the offset of the netmap_if object */ 2538 na = NA(ifp); /* retrieve netmap adapter */ 2539 nmr->nr_rx_rings = na->num_rx_rings; 2540 nmr->nr_tx_rings = na->num_tx_rings; 2541 nmr->nr_rx_slots = na->num_rx_desc; 2542 nmr->nr_tx_slots = na->num_tx_desc; 2543 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 2544 if (error) { 2545 nm_if_rele(ifp); 2546 break; 2547 } 2548 if (memflags & NETMAP_MEM_PRIVATE) { 2549 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2550 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 2551 } 2552 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 2553 } while (0); 2554 NMG_UNLOCK(); 2555 break; 2556 2557 case NIOCUNREGIF: 2558 // XXX we have no data here ? 2559 D("deprecated, data is %p", nmr); 2560 error = EINVAL; 2561 break; 2562 2563 case NIOCTXSYNC: 2564 case NIOCRXSYNC: 2565 nifp = priv->np_nifp; 2566 2567 if (nifp == NULL) { 2568 error = ENXIO; 2569 break; 2570 } 2571 rmb(); /* make sure following reads are not from cache */ 2572 2573 ifp = priv->np_ifp; /* we have a reference */ 2574 2575 if (ifp == NULL) { 2576 D("Internal error: nifp != NULL && ifp == NULL"); 2577 error = ENXIO; 2578 break; 2579 } 2580 2581 na = NA(ifp); /* retrieve netmap adapter */ 2582 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 2583 if (cmd == NIOCTXSYNC) 2584 netmap_txsync_to_host(na); 2585 else 2586 netmap_rxsync_from_host(na, NULL, NULL); 2587 break; 2588 } 2589 /* find the last ring to scan */ 2590 lim = priv->np_qlast; 2591 if (lim == NETMAP_HW_RING) 2592 lim = (cmd == NIOCTXSYNC) ? 2593 na->num_tx_rings : na->num_rx_rings; 2594 2595 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 2596 for (i = priv->np_qfirst; i < lim; i++) { 2597 struct netmap_kring *kring = krings + i; 2598 if (nm_kr_tryget(kring)) { 2599 error = EBUSY; 2600 goto out; 2601 } 2602 if (cmd == NIOCTXSYNC) { 2603 if (netmap_verbose & NM_VERB_TXSYNC) 2604 D("pre txsync ring %d cur %d hwcur %d", 2605 i, kring->ring->cur, 2606 kring->nr_hwcur); 2607 na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); 2608 if (netmap_verbose & NM_VERB_TXSYNC) 2609 D("post txsync ring %d cur %d hwcur %d", 2610 i, kring->ring->cur, 2611 kring->nr_hwcur); 2612 } else { 2613 na->nm_rxsync(ifp, i, NAF_FORCE_READ); 2614 microtime(&na->rx_rings[i].ring->ts); 2615 } 2616 nm_kr_put(kring); 2617 } 2618 2619 break; 2620 2621 #ifdef __FreeBSD__ 2622 case BIOCIMMEDIATE: 2623 case BIOCGHDRCMPLT: 2624 case BIOCSHDRCMPLT: 2625 case BIOCSSEESENT: 2626 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 2627 break; 2628 2629 default: /* allow device-specific ioctls */ 2630 { 2631 struct socket so; 2632 2633 bzero(&so, sizeof(so)); 2634 NMG_LOCK(); 2635 error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ 2636 if (error) { 2637 NMG_UNLOCK(); 2638 break; 2639 } 2640 so.so_vnet = ifp->if_vnet; 2641 // so->so_proto not null. 
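/* Device-specific requests (e.g. the SIOCGIFADDR convenience case
 * listed above) are not interpreted here: they are forwarded to the
 * interface through ifioctl() using the dummy socket built above,
 * so the NIC still answers its usual ioctls while in netmap mode.
 */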
2642 error = ifioctl(&so, cmd, data, td); 2643 nm_if_rele(ifp); 2644 NMG_UNLOCK(); 2645 break; 2646 } 2647 2648 #else /* linux */ 2649 default: 2650 error = EOPNOTSUPP; 2651 #endif /* linux */ 2652 } 2653 out: 2654 2655 CURVNET_RESTORE(); 2656 return (error); 2657 } 2658 2659 2660 /* 2661 * select(2) and poll(2) handlers for the "netmap" device. 2662 * 2663 * Can be called for one or more queues. 2664 * Return true the event mask corresponding to ready events. 2665 * If there are no ready events, do a selrecord on either individual 2666 * selinfo or on the global one. 2667 * Device-dependent parts (locking and sync of tx/rx rings) 2668 * are done through callbacks. 2669 * 2670 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 2671 * The first one is remapped to pwait as selrecord() uses the name as an 2672 * hidden argument. 2673 */ 2674 static int 2675 netmap_poll(struct cdev *dev, int events, struct thread *td) 2676 { 2677 struct netmap_priv_d *priv = NULL; 2678 struct netmap_adapter *na; 2679 struct ifnet *ifp; 2680 struct netmap_kring *kring; 2681 u_int i, check_all, want_tx, want_rx, revents = 0; 2682 u_int lim_tx, lim_rx, host_forwarded = 0; 2683 struct mbq q = { NULL, NULL, 0 }; 2684 void *pwait = dev; /* linux compatibility */ 2685 2686 int retry_tx = 1; 2687 2688 (void)pwait; 2689 2690 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 2691 return POLLERR; 2692 2693 if (priv->np_nifp == NULL) { 2694 D("No if registered"); 2695 return POLLERR; 2696 } 2697 rmb(); /* make sure following reads are not from cache */ 2698 2699 ifp = priv->np_ifp; 2700 // XXX check for deleting() ? 2701 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 2702 return POLLERR; 2703 2704 if (netmap_verbose & 0x8000) 2705 D("device %s events 0x%x", ifp->if_xname, events); 2706 want_tx = events & (POLLOUT | POLLWRNORM); 2707 want_rx = events & (POLLIN | POLLRDNORM); 2708 2709 na = NA(ifp); /* retrieve netmap adapter */ 2710 2711 lim_tx = na->num_tx_rings; 2712 lim_rx = na->num_rx_rings; 2713 2714 if (priv->np_qfirst == NETMAP_SW_RING) { 2715 /* handle the host stack ring */ 2716 if (priv->np_txpoll || want_tx) { 2717 /* push any packets up, then we are always ready */ 2718 netmap_txsync_to_host(na); 2719 revents |= want_tx; 2720 } 2721 if (want_rx) { 2722 kring = &na->rx_rings[lim_rx]; 2723 if (kring->ring->avail == 0) 2724 netmap_rxsync_from_host(na, td, dev); 2725 if (kring->ring->avail > 0) { 2726 revents |= want_rx; 2727 } 2728 } 2729 return (revents); 2730 } 2731 2732 /* if we are in transparent mode, check also the host rx ring */ 2733 kring = &na->rx_rings[lim_rx]; 2734 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2735 && want_rx 2736 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 2737 if (kring->ring->avail == 0) 2738 netmap_rxsync_from_host(na, td, dev); 2739 if (kring->ring->avail > 0) 2740 revents |= want_rx; 2741 } 2742 2743 /* 2744 * check_all is set if the card has more than one queue AND 2745 * the client is polling all of them. If true, we sleep on 2746 * the "global" selinfo, otherwise we sleep on individual selinfo 2747 * (FreeBSD only allows two selinfo's per file descriptor). 2748 * The interrupt routine in the driver wake one or the other 2749 * (or both) depending on which clients are active. 2750 * 2751 * rxsync() is only called if we run out of buffers on a POLLIN. 2752 * txsync() is called if we run out of buffers on POLLOUT, or 2753 * there are pending packets to send. 
The latter can be disabled 2754 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 2755 */ 2756 check_all = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1 || lim_rx > 1); 2757 2758 if (priv->np_qlast != NETMAP_HW_RING) { 2759 lim_tx = lim_rx = priv->np_qlast; 2760 } 2761 2762 /* 2763 * We start with a lock free round which is good if we have 2764 * data available. If this fails, then lock and call the sync 2765 * routines. 2766 */ 2767 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 2768 kring = &na->rx_rings[i]; 2769 if (kring->ring->avail > 0) { 2770 revents |= want_rx; 2771 want_rx = 0; /* also breaks the loop */ 2772 } 2773 } 2774 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 2775 kring = &na->tx_rings[i]; 2776 if (kring->ring->avail > 0) { 2777 revents |= want_tx; 2778 want_tx = 0; /* also breaks the loop */ 2779 } 2780 } 2781 2782 /* 2783 * If we to push packets out (priv->np_txpoll) or want_tx is 2784 * still set, we do need to run the txsync calls (on all rings, 2785 * to avoid that the tx rings stall). 2786 */ 2787 if (priv->np_txpoll || want_tx) { 2788 /* If we really want to be woken up (want_tx), 2789 * do a selrecord, either on the global or on 2790 * the private structure. Then issue the txsync 2791 * so there is no race in the selrecord/selwait 2792 */ 2793 flush_tx: 2794 for (i = priv->np_qfirst; i < lim_tx; i++) { 2795 kring = &na->tx_rings[i]; 2796 /* 2797 * Skip this ring if want_tx == 0 2798 * (we have already done a successful sync on 2799 * a previous ring) AND kring->cur == kring->hwcur 2800 * (there are no pending transmissions for this ring). 2801 */ 2802 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 2803 continue; 2804 /* make sure only one user thread is doing this */ 2805 if (nm_kr_tryget(kring)) { 2806 ND("ring %p busy is %d", kring, (int)kring->nr_busy); 2807 revents |= POLLERR; 2808 goto out; 2809 } 2810 2811 if (netmap_verbose & NM_VERB_TXSYNC) 2812 D("send %d on %s %d", 2813 kring->ring->cur, ifp->if_xname, i); 2814 if (na->nm_txsync(ifp, i, 0)) 2815 revents |= POLLERR; 2816 2817 /* Check avail/call selrecord only if called with POLLOUT */ 2818 if (want_tx) { 2819 if (kring->ring->avail > 0) { 2820 /* stop at the first ring. We don't risk 2821 * starvation. 2822 */ 2823 revents |= want_tx; 2824 want_tx = 0; 2825 } 2826 } 2827 nm_kr_put(kring); 2828 } 2829 if (want_tx && retry_tx) { 2830 selrecord(td, check_all ? 2831 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 2832 retry_tx = 0; 2833 goto flush_tx; 2834 } 2835 } 2836 2837 /* 2838 * now if want_rx is still set we need to lock and rxsync. 2839 * Do it on all rings because otherwise we starve. 2840 */ 2841 if (want_rx) { 2842 int retry_rx = 1; 2843 do_retry_rx: 2844 for (i = priv->np_qfirst; i < lim_rx; i++) { 2845 kring = &na->rx_rings[i]; 2846 2847 if (nm_kr_tryget(kring)) { 2848 revents |= POLLERR; 2849 goto out; 2850 } 2851 2852 /* XXX NR_FORWARD should only be read on 2853 * physical or NIC ports 2854 */ 2855 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2856 ND(10, "forwarding some buffers up %d to %d", 2857 kring->nr_hwcur, kring->ring->cur); 2858 netmap_grab_packets(kring, &q, netmap_fwd); 2859 } 2860 2861 if (na->nm_rxsync(ifp, i, 0)) 2862 revents |= POLLERR; 2863 if (netmap_no_timestamp == 0 || 2864 kring->ring->flags & NR_TIMESTAMP) { 2865 microtime(&kring->ring->ts); 2866 } 2867 2868 if (kring->ring->avail > 0) { 2869 revents |= want_rx; 2870 retry_rx = 0; 2871 } 2872 nm_kr_put(kring); 2873 } 2874 if (retry_rx) { 2875 retry_rx = 0; 2876 selrecord(td, check_all ? 
2877 &na->rx_si : &na->rx_rings[priv->np_qfirst].si); 2878 goto do_retry_rx; 2879 } 2880 } 2881 2882 /* forward host to the netmap ring. 2883 * I am accessing nr_hwavail without lock, but netmap_transmit 2884 * can only increment it, so the operation is safe. 2885 */ 2886 kring = &na->rx_rings[lim_rx]; 2887 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2888 && (netmap_fwd || kring->ring->flags & NR_FORWARD) 2889 && kring->nr_hwavail > 0 && !host_forwarded) { 2890 netmap_sw_to_nic(na); 2891 host_forwarded = 1; /* prevent another pass */ 2892 want_rx = 0; 2893 goto flush_tx; 2894 } 2895 2896 if (q.head) 2897 netmap_send_up(na->ifp, q.head); 2898 2899 out: 2900 2901 return (revents); 2902 } 2903 2904 /*------- driver support routines ------*/ 2905 2906 2907 /* 2908 * Initialize a ``netmap_adapter`` object created by driver on attach. 2909 * We allocate a block of memory with room for a struct netmap_adapter 2910 * plus two sets of N+2 struct netmap_kring (where N is the number 2911 * of hardware rings): 2912 * krings 0..N-1 are for the hardware queues. 2913 * kring N is for the host stack queue 2914 * kring N+1 is only used for the selinfo for all queues. 2915 * Return 0 on success, ENOMEM otherwise. 2916 * 2917 * By default the receive and transmit adapter ring counts are both initialized 2918 * to num_queues. na->num_tx_rings can be set for cards with different tx/rx 2919 * setups. 2920 */ 2921 int 2922 netmap_attach(struct netmap_adapter *arg, u_int num_queues) 2923 { 2924 struct netmap_adapter *na = NULL; 2925 struct ifnet *ifp = arg ? arg->ifp : NULL; 2926 size_t len; 2927 2928 if (arg == NULL || ifp == NULL) 2929 goto fail; 2930 /* a VALE port uses two endpoints */ 2931 len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2; 2932 na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); 2933 if (na == NULL) 2934 goto fail; 2935 WNA(ifp) = na; 2936 *na = *arg; /* copy everything, trust the driver to not pass junk */ 2937 NETMAP_SET_CAPABLE(ifp); 2938 if (na->num_tx_rings == 0) 2939 na->num_tx_rings = num_queues; 2940 na->num_rx_rings = num_queues; 2941 na->refcount = na->na_single = na->na_multi = 0; 2942 /* Core lock initialized here, others after netmap_if_new. */ 2943 mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); 2944 #ifdef linux 2945 if (ifp->netdev_ops) { 2946 ND("netdev_ops %p", ifp->netdev_ops); 2947 /* prepare a clone of the netdev ops */ 2948 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2949 na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2950 #else 2951 na->nm_ndo = *ifp->netdev_ops; 2952 #endif 2953 } 2954 na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2955 #endif /* linux */ 2956 na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem; 2957 if (!nma_is_vp(arg)) 2958 netmap_attach_sw(ifp); 2959 D("success for %s", ifp->if_xname); 2960 return 0; 2961 2962 fail: 2963 D("fail, arg %p ifp %p na %p", arg, ifp, na); 2964 netmap_detach(ifp); 2965 return (na ? EINVAL : ENOMEM); 2966 } 2967 2968 2969 /* 2970 * Free the allocated memory linked to the given ``netmap_adapter`` 2971 * object. 
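 *
 * For reference, a hedged sketch of the driver-side pairing with
 * netmap_attach() (a hypothetical foo(4) driver: foo_netmap_*() are its
 * own callbacks, num_queues its queue count, 1024 just an example ring
 * size; only the relevant fields are shown):
 *
 *      struct netmap_adapter na;
 *
 *      bzero(&na, sizeof(na));
 *      na.ifp = ifp;
 *      na.num_tx_desc = na.num_rx_desc = 1024;
 *      na.nm_txsync = foo_netmap_txsync;
 *      na.nm_rxsync = foo_netmap_rxsync;
 *      na.nm_register = foo_netmap_reg;
 *      netmap_attach(&na, num_queues);         (at attach time)
 *      ...
 *      netmap_detach(ifp);                     (at detach time)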
2972 */ 2973 void 2974 netmap_detach(struct ifnet *ifp) 2975 { 2976 struct netmap_adapter *na = NA(ifp); 2977 2978 if (!na) 2979 return; 2980 2981 mtx_destroy(&na->core_lock); 2982 2983 if (na->tx_rings) { /* XXX should not happen */ 2984 D("freeing leftover tx_rings"); 2985 free(na->tx_rings, M_DEVBUF); 2986 } 2987 if (na->na_flags & NAF_MEM_OWNER) 2988 netmap_mem_private_delete(na->nm_mem); 2989 bzero(na, sizeof(*na)); 2990 WNA(ifp) = NULL; 2991 free(na, M_DEVBUF); 2992 } 2993 2994 2995 int 2996 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, 2997 struct netmap_adapter *na, u_int ring_nr); 2998 2999 3000 /* 3001 * Intercept packets from the network stack and pass them 3002 * to netmap as incoming packets on the 'software' ring. 3003 * We rely on the OS to make sure that the ifp and na do not go 3004 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 3005 * In nm_register() or whenever there is a reinitialization, 3006 * we make sure to access the core lock and per-ring locks 3007 * so that IFCAP_NETMAP is visible here. 3008 */ 3009 int 3010 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 3011 { 3012 struct netmap_adapter *na = NA(ifp); 3013 struct netmap_kring *kring; 3014 u_int i, len = MBUF_LEN(m); 3015 u_int error = EBUSY, lim; 3016 struct netmap_slot *slot; 3017 3018 // XXX [Linux] we do not need this lock 3019 // if we follow the down/configure/up protocol -gl 3020 // mtx_lock(&na->core_lock); 3021 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { 3022 /* interface not in netmap mode anymore */ 3023 error = ENXIO; 3024 goto done; 3025 } 3026 3027 kring = &na->rx_rings[na->num_rx_rings]; 3028 lim = kring->nkr_num_slots - 1; 3029 if (netmap_verbose & NM_VERB_HOST) 3030 D("%s packet %d len %d from the stack", ifp->if_xname, 3031 kring->nr_hwcur + kring->nr_hwavail, len); 3032 // XXX reconsider long packets if we handle fragments 3033 if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ 3034 D("%s from_host, drop packet size %d > %d", ifp->if_xname, 3035 len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); 3036 goto done; 3037 } 3038 if (SWNA(ifp)->na_bdg) { 3039 struct nm_bdg_fwd *ft; 3040 char *dst; 3041 3042 na = SWNA(ifp); /* we operate on the host port */ 3043 ft = na->rx_rings[0].nkr_ft; 3044 dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); 3045 3046 /* use slot 0 in the ft, there is nothing queued here */ 3047 /* XXX we can save the copy calling m_copydata in nm_bdg_flush, 3048 * need a special flag for this. 3049 */ 3050 m_copydata(m, 0, (int)len, dst); 3051 ft->ft_flags = 0; 3052 ft->ft_len = len; 3053 ft->ft_buf = dst; 3054 ft->ft_next = NM_FT_NULL; 3055 ft->ft_frags = 1; 3056 if (netmap_verbose & NM_VERB_HOST) 3057 RD(5, "pkt %p size %d to bridge port %d", 3058 dst, len, na->bdg_port); 3059 nm_bdg_flush(ft, 1, na, 0); 3060 na = NA(ifp); /* back to the regular object/lock */ 3061 error = 0; 3062 goto done; 3063 } 3064 3065 /* protect against other instances of netmap_transmit, 3066 * and userspace invocations of rxsync(). 
3067 * XXX could reuse core_lock 3068 */ 3069 // XXX [Linux] there can be no other instances of netmap_transmit 3070 // on this same ring, but we still need this lock to protect 3071 // concurrent access from netmap_sw_to_nic() -gl 3072 mtx_lock(&kring->q_lock); 3073 if (kring->nr_hwavail >= lim) { 3074 if (netmap_verbose) 3075 D("stack ring %s full\n", ifp->if_xname); 3076 } else { 3077 /* compute the insert position */ 3078 i = nm_kr_rxpos(kring); 3079 slot = &kring->ring->slot[i]; 3080 m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); 3081 slot->len = len; 3082 slot->flags = kring->nkr_slot_flags; 3083 kring->nr_hwavail++; 3084 if (netmap_verbose & NM_VERB_HOST) 3085 D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); 3086 selwakeuppri(&kring->si, PI_NET); 3087 error = 0; 3088 } 3089 mtx_unlock(&kring->q_lock); 3090 3091 done: 3092 // mtx_unlock(&na->core_lock); 3093 3094 /* release the mbuf in either cases of success or failure. As an 3095 * alternative, put the mbuf in a free list and free the list 3096 * only when really necessary. 3097 */ 3098 m_freem(m); 3099 3100 return (error); 3101 } 3102 3103 3104 /* 3105 * netmap_reset() is called by the driver routines when reinitializing 3106 * a ring. The driver is in charge of locking to protect the kring. 3107 * If netmap mode is not set just return NULL. 3108 */ 3109 struct netmap_slot * 3110 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 3111 u_int new_cur) 3112 { 3113 struct netmap_kring *kring; 3114 int new_hwofs, lim; 3115 3116 if (na == NULL) { 3117 D("NULL na, should not happen"); 3118 return NULL; /* no netmap support here */ 3119 } 3120 if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { 3121 D("interface not in netmap mode"); 3122 return NULL; /* nothing to reinitialize */ 3123 } 3124 3125 /* XXX note- in the new scheme, we are not guaranteed to be 3126 * under lock (e.g. when called on a device reset). 3127 * In this case, we should set a flag and do not trust too 3128 * much the values. In practice: TODO 3129 * - set a RESET flag somewhere in the kring 3130 * - do the processing in a conservative way 3131 * - let the *sync() fixup at the end. 3132 */ 3133 if (tx == NR_TX) { 3134 if (n >= na->num_tx_rings) 3135 return NULL; 3136 kring = na->tx_rings + n; 3137 new_hwofs = kring->nr_hwcur - new_cur; 3138 } else { 3139 if (n >= na->num_rx_rings) 3140 return NULL; 3141 kring = na->rx_rings + n; 3142 new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; 3143 } 3144 lim = kring->nkr_num_slots - 1; 3145 if (new_hwofs > lim) 3146 new_hwofs -= lim + 1; 3147 3148 /* Always set the new offset value and realign the ring. */ 3149 D("%s hwofs %d -> %d, hwavail %d -> %d", 3150 tx == NR_TX ? "TX" : "RX", 3151 kring->nkr_hwofs, new_hwofs, 3152 kring->nr_hwavail, 3153 tx == NR_TX ? lim : kring->nr_hwavail); 3154 kring->nkr_hwofs = new_hwofs; 3155 if (tx == NR_TX) 3156 kring->nr_hwavail = lim; 3157 3158 #if 0 // def linux 3159 /* XXX check that the mappings are correct */ 3160 /* need ring_nr, adapter->pdev, direction */ 3161 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 3162 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 3163 D("error mapping rx netmap buffer %d", i); 3164 // XXX fix error handling 3165 } 3166 3167 #endif /* linux */ 3168 /* 3169 * Wakeup on the individual and global selwait 3170 * We do the wakeup here, but the ring is not yet reconfigured. 3171 * However, we are under lock so there are no races. 
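 *
 * For reference, a hedged sketch of how a driver consumes the returned
 * slot array on an RX ring reinit (hypothetical foo(4) code):
 *
 *      struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *      if (slot == NULL)       (not in netmap mode, keep using mbufs)
 *              return;
 *      ... program each NIC descriptor with the netmap buffer
 *      identified by slot[l].buf_idx instead of attaching an mbuf ...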
3172 */ 3173 selwakeuppri(&kring->si, PI_NET); 3174 selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET); 3175 return kring->ring->slot; 3176 } 3177 3178 3179 /* 3180 * Grab packets from a kring, move them into the ft structure 3181 * associated to the tx (input) port. Max one instance per port, 3182 * filtered on input (ioctl, poll or XXX). 3183 * Returns the next position in the ring. 3184 */ 3185 static int 3186 nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, 3187 struct netmap_kring *kring, u_int end) 3188 { 3189 struct netmap_ring *ring = kring->ring; 3190 struct nm_bdg_fwd *ft; 3191 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 3192 u_int ft_i = 0; /* start from 0 */ 3193 u_int frags = 1; /* how many frags ? */ 3194 struct nm_bridge *b = na->na_bdg; 3195 3196 /* To protect against modifications to the bridge we acquire a 3197 * shared lock, waiting if we can sleep (if the source port is 3198 * attached to a user process) or with a trylock otherwise (NICs). 3199 */ 3200 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3201 if (na->na_flags & NAF_BDG_MAYSLEEP) 3202 BDG_RLOCK(b); 3203 else if (!BDG_RTRYLOCK(b)) 3204 return 0; 3205 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3206 ft = kring->nkr_ft; 3207 3208 for (; likely(j != end); j = nm_next(j, lim)) { 3209 struct netmap_slot *slot = &ring->slot[j]; 3210 char *buf; 3211 3212 ft[ft_i].ft_len = slot->len; 3213 ft[ft_i].ft_flags = slot->flags; 3214 3215 ND("flags is 0x%x", slot->flags); 3216 /* this slot goes into a list so initialize the link field */ 3217 ft[ft_i].ft_next = NM_FT_NULL; 3218 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 3219 (void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot); 3220 prefetch(buf); 3221 ++ft_i; 3222 if (slot->flags & NS_MOREFRAG) { 3223 frags++; 3224 continue; 3225 } 3226 if (unlikely(netmap_verbose && frags > 1)) 3227 RD(5, "%d frags at %d", frags, ft_i - frags); 3228 ft[ft_i - frags].ft_frags = frags; 3229 frags = 1; 3230 if (unlikely((int)ft_i >= bridge_batch)) 3231 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3232 } 3233 if (frags > 1) { 3234 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); 3235 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG 3236 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; 3237 ft[ft_i - frags].ft_frags = frags - 1; 3238 } 3239 if (ft_i) 3240 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3241 BDG_RUNLOCK(b); 3242 return j; 3243 } 3244 3245 3246 /* 3247 * Pass packets from nic to the bridge. 3248 * XXX TODO check locking: this is called from the interrupt 3249 * handler so we should make sure that the interface is not 3250 * disconnected while passing down an interrupt. 3251 * 3252 * Note, no user process can access this NIC so we can ignore 3253 * the info in the 'ring'. 3254 */ 3255 static void 3256 netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) 3257 { 3258 struct netmap_adapter *na = NA(ifp); 3259 struct netmap_kring *kring = &na->rx_rings[ring_nr]; 3260 struct netmap_ring *ring = kring->ring; 3261 u_int j, k; 3262 3263 /* make sure that only one thread is ever in here, 3264 * after which we can unlock. Probably unnecessary XXX. 3265 */ 3266 if (nm_kr_tryget(kring)) 3267 return; 3268 /* fetch packets that have arrived. 3269 * XXX maybe do this in a loop ? 
3270 */ 3271 if (na->nm_rxsync(ifp, ring_nr, 0)) 3272 goto put_out; 3273 if (kring->nr_hwavail == 0 && netmap_verbose) { 3274 D("how strange, interrupt with no packets on %s", 3275 ifp->if_xname); 3276 goto put_out; 3277 } 3278 k = nm_kr_rxpos(kring); 3279 3280 j = nm_bdg_preflush(na, ring_nr, kring, k); 3281 3282 /* we consume everything, but we cannot update kring directly 3283 * because the nic may have destroyed the info in the NIC ring. 3284 * So we need to call rxsync again to restore it. 3285 */ 3286 ring->cur = j; 3287 ring->avail = 0; 3288 na->nm_rxsync(ifp, ring_nr, 0); 3289 3290 put_out: 3291 nm_kr_put(kring); 3292 return; 3293 } 3294 3295 3296 /* 3297 * Default functions to handle rx/tx interrupts from a physical device. 3298 * "work_done" is non-null on the RX path, NULL for the TX path. 3299 * We rely on the OS to make sure that there is only one active 3300 * instance per queue, and that there is appropriate locking. 3301 * 3302 * If the card is not in netmap mode, simply return 0, 3303 * so that the caller proceeds with regular processing. 3304 * 3305 * If the card is connected to a netmap file descriptor, 3306 * do a selwakeup on the individual queue, plus one on the global one 3307 * if needed (multiqueue card _and_ there are multiqueue listeners), 3308 * and return 1. 3309 * 3310 * Finally, if called on rx from an interface connected to a switch, 3311 * calls the proper forwarding routine, and return 1. 3312 */ 3313 int 3314 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3315 { 3316 struct netmap_adapter *na; 3317 struct netmap_kring *kring; 3318 3319 if (!(ifp->if_capenable & IFCAP_NETMAP)) 3320 return 0; 3321 3322 q &= NETMAP_RING_MASK; 3323 3324 if (netmap_verbose) 3325 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 3326 na = NA(ifp); 3327 if (na->na_flags & NAF_SKIP_INTR) { 3328 ND("use regular interrupt"); 3329 return 0; 3330 } 3331 3332 if (work_done) { /* RX path */ 3333 if (q >= na->num_rx_rings) 3334 return 0; // not a physical queue 3335 kring = na->rx_rings + q; 3336 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 3337 if (na->na_bdg != NULL) { 3338 netmap_nic_to_bdg(ifp, q); 3339 } else { 3340 selwakeuppri(&kring->si, PI_NET); 3341 if (na->num_rx_rings > 1 /* or multiple listeners */ ) 3342 selwakeuppri(&na->rx_si, PI_NET); 3343 } 3344 *work_done = 1; /* do not fire napi again */ 3345 } else { /* TX path */ 3346 if (q >= na->num_tx_rings) 3347 return 0; // not a physical queue 3348 kring = na->tx_rings + q; 3349 selwakeuppri(&kring->si, PI_NET); 3350 if (na->num_tx_rings > 1 /* or multiple listeners */ ) 3351 selwakeuppri(&na->tx_si, PI_NET); 3352 } 3353 return 1; 3354 } 3355 3356 3357 #ifdef linux /* linux-specific routines */ 3358 3359 3360 /* 3361 * Remap linux arguments into the FreeBSD call. 3362 * - pwait is the poll table, passed as 'dev'; 3363 * If pwait == NULL someone else already woke up before. We can report 3364 * events but they are filtered upstream. 3365 * If pwait != NULL, then pwait->key contains the list of events. 3366 * - events is computed from pwait as above. 3367 * - file is passed as 'td'; 3368 */ 3369 static u_int 3370 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) 3371 { 3372 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) 3373 int events = POLLIN | POLLOUT; /* XXX maybe... */ 3374 #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) 3375 int events = pwait ? pwait->key : POLLIN | POLLOUT; 3376 #else /* in 3.4.0 field 'key' was renamed to '_key' */ 3377 int events = pwait ? 
pwait->_key : POLLIN | POLLOUT; 3378 #endif 3379 return netmap_poll((void *)pwait, events, (void *)file); 3380 } 3381 3382 3383 static int 3384 linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) 3385 { 3386 int error = 0; 3387 unsigned long off, va; 3388 vm_ooffset_t pa; 3389 struct netmap_priv_d *priv = f->private_data; 3390 /* 3391 * vma->vm_start: start of mapping user address space 3392 * vma->vm_end: end of the mapping user address space 3393 * vma->vm_pfoff: offset of first page in the device 3394 */ 3395 3396 // XXX security checks 3397 3398 error = netmap_get_memory(priv); 3399 ND("get_memory returned %d", error); 3400 if (error) 3401 return -error; 3402 3403 if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { 3404 ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); 3405 return -EINVAL; 3406 } 3407 3408 for (va = vma->vm_start, off = vma->vm_pgoff; 3409 va < vma->vm_end; 3410 va += PAGE_SIZE, off++) 3411 { 3412 pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); 3413 if (pa == 0) 3414 return -EINVAL; 3415 3416 ND("va %lx pa %p", va, pa); 3417 error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); 3418 if (error) 3419 return error; 3420 } 3421 return 0; 3422 } 3423 3424 3425 /* 3426 * This one is probably already protected by the netif lock XXX 3427 */ 3428 static netdev_tx_t 3429 linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) 3430 { 3431 netmap_transmit(dev, skb); 3432 return (NETDEV_TX_OK); 3433 } 3434 3435 3436 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 3437 #define LIN_IOCTL_NAME .ioctl 3438 int 3439 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) 3440 #else 3441 #define LIN_IOCTL_NAME .unlocked_ioctl 3442 long 3443 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) 3444 #endif 3445 { 3446 int ret; 3447 struct nmreq nmr; 3448 bzero(&nmr, sizeof(nmr)); 3449 3450 if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { 3451 data = 0; /* no argument required here */ 3452 } 3453 if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) 3454 return -EFAULT; 3455 ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); 3456 if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) 3457 return -EFAULT; 3458 return -ret; 3459 } 3460 3461 3462 static int 3463 netmap_release(struct inode *inode, struct file *file) 3464 { 3465 (void)inode; /* UNUSED */ 3466 if (file->private_data) 3467 netmap_dtor(file->private_data); 3468 return (0); 3469 } 3470 3471 3472 static int 3473 linux_netmap_open(struct inode *inode, struct file *file) 3474 { 3475 struct netmap_priv_d *priv; 3476 (void)inode; /* UNUSED */ 3477 3478 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 3479 M_NOWAIT | M_ZERO); 3480 if (priv == NULL) 3481 return -ENOMEM; 3482 3483 file->private_data = priv; 3484 3485 return (0); 3486 } 3487 3488 3489 static struct file_operations netmap_fops = { 3490 .owner = THIS_MODULE, 3491 .open = linux_netmap_open, 3492 .mmap = linux_netmap_mmap, 3493 LIN_IOCTL_NAME = linux_netmap_ioctl, 3494 .poll = linux_netmap_poll, 3495 .release = netmap_release, 3496 }; 3497 3498 3499 static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ 3500 MISC_DYNAMIC_MINOR, 3501 "netmap", 3502 &netmap_fops, 3503 }; 3504 3505 static int netmap_init(void); 3506 static void netmap_fini(void); 3507 3508 3509 /* Errors have negative values on linux */ 3510 static int linux_netmap_init(void) 3511 { 3512 return 
-netmap_init(); 3513 } 3514 3515 module_init(linux_netmap_init); 3516 module_exit(netmap_fini); 3517 /* export certain symbols to other modules */ 3518 EXPORT_SYMBOL(netmap_attach); // driver attach routines 3519 EXPORT_SYMBOL(netmap_detach); // driver detach routines 3520 EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error 3521 EXPORT_SYMBOL(netmap_buffer_lut); 3522 EXPORT_SYMBOL(netmap_total_buffers); // index check 3523 EXPORT_SYMBOL(netmap_buffer_base); 3524 EXPORT_SYMBOL(netmap_reset); // ring init routines 3525 EXPORT_SYMBOL(netmap_buf_size); 3526 EXPORT_SYMBOL(netmap_rx_irq); // default irq handler 3527 EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away 3528 EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine 3529 EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function 3530 EXPORT_SYMBOL(netmap_disable_all_rings); 3531 EXPORT_SYMBOL(netmap_enable_all_rings); 3532 3533 3534 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); 3535 MODULE_DESCRIPTION("The netmap packet I/O framework"); 3536 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ 3537 3538 #else /* __FreeBSD__ */ 3539 3540 3541 static struct cdevsw netmap_cdevsw = { 3542 .d_version = D_VERSION, 3543 .d_name = "netmap", 3544 .d_open = netmap_open, 3545 .d_mmap_single = netmap_mmap_single, 3546 .d_ioctl = netmap_ioctl, 3547 .d_poll = netmap_poll, 3548 .d_close = netmap_close, 3549 }; 3550 #endif /* __FreeBSD__ */ 3551 3552 /* 3553 *---- support for virtual bridge ----- 3554 */ 3555 3556 /* ----- FreeBSD if_bridge hash function ------- */ 3557 3558 /* 3559 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 3560 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 3561 * 3562 * http://www.burtleburtle.net/bob/hash/spooky.html 3563 */ 3564 #define mix(a, b, c) \ 3565 do { \ 3566 a -= b; a -= c; a ^= (c >> 13); \ 3567 b -= c; b -= a; b ^= (a << 8); \ 3568 c -= a; c -= b; c ^= (b >> 13); \ 3569 a -= b; a -= c; a ^= (c >> 12); \ 3570 b -= c; b -= a; b ^= (a << 16); \ 3571 c -= a; c -= b; c ^= (b >> 5); \ 3572 a -= b; a -= c; a ^= (c >> 3); \ 3573 b -= c; b -= a; b ^= (a << 10); \ 3574 c -= a; c -= b; c ^= (b >> 15); \ 3575 } while (/*CONSTCOND*/0) 3576 3577 static __inline uint32_t 3578 nm_bridge_rthash(const uint8_t *addr) 3579 { 3580 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 3581 3582 b += addr[5] << 8; 3583 b += addr[4]; 3584 a += addr[3] << 24; 3585 a += addr[2] << 16; 3586 a += addr[1] << 8; 3587 a += addr[0]; 3588 3589 mix(a, b, c); 3590 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 3591 return (c & BRIDGE_RTHASH_MASK); 3592 } 3593 3594 #undef mix 3595 3596 3597 static int 3598 bdg_netmap_reg(struct ifnet *ifp, int onoff) 3599 { 3600 /* the interface is already attached to the bridge, 3601 * so we only need to toggle IFCAP_NETMAP. 3602 */ 3603 if (onoff) { 3604 ifp->if_capenable |= IFCAP_NETMAP; 3605 } else { 3606 ifp->if_capenable &= ~IFCAP_NETMAP; 3607 } 3608 return 0; 3609 } 3610 3611 3612 /* 3613 * Lookup function for a learning bridge. 
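 * (This is the default bdg_lookup_fn_t; a bridge client can install its
 * own function with the NETMAP_BDG_LOOKUP_REG command handled earlier.
 * As a hedged example, a lookup that simply floods every frame would do
 *
 *      *dst_ring = 0;
 *      return NM_BDG_BROADCAST;
 *
 * without looking at the buffer at all.)
 * The learning variant implemented here proceeds as follows: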
3614 * Update the hash table with the source address, 3615 * and then returns the destination port index, and the 3616 * ring in *dst_ring (at the moment, always use ring 0) 3617 */ 3618 u_int 3619 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, 3620 struct netmap_adapter *na) 3621 { 3622 struct nm_hash_ent *ht = na->na_bdg->ht; 3623 uint32_t sh, dh; 3624 u_int dst, mysrc = na->bdg_port; 3625 uint64_t smac, dmac; 3626 3627 if (buf_len < 14) { 3628 D("invalid buf length %d", buf_len); 3629 return NM_BDG_NOPORT; 3630 } 3631 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 3632 smac = le64toh(*(uint64_t *)(buf + 4)); 3633 smac >>= 16; 3634 3635 /* 3636 * The hash is somewhat expensive, there might be some 3637 * worthwhile optimizations here. 3638 */ 3639 if ((buf[6] & 1) == 0) { /* valid src */ 3640 uint8_t *s = buf+6; 3641 sh = nm_bridge_rthash(s); // XXX hash of source 3642 /* update source port forwarding entry */ 3643 ht[sh].mac = smac; /* XXX expire ? */ 3644 ht[sh].ports = mysrc; 3645 if (netmap_verbose) 3646 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 3647 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 3648 } 3649 dst = NM_BDG_BROADCAST; 3650 if ((buf[0] & 1) == 0) { /* unicast */ 3651 dh = nm_bridge_rthash(buf); // XXX hash of dst 3652 if (ht[dh].mac == dmac) { /* found dst */ 3653 dst = ht[dh].ports; 3654 } 3655 /* XXX otherwise return NM_BDG_UNKNOWN ? */ 3656 } 3657 *dst_ring = 0; 3658 return dst; 3659 } 3660 3661 3662 /* 3663 * This flush routine supports only unicast and broadcast but a large 3664 * number of ports, and lets us replace the learn and dispatch functions. 3665 */ 3666 int 3667 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, 3668 u_int ring_nr) 3669 { 3670 struct nm_bdg_q *dst_ents, *brddst; 3671 uint16_t num_dsts = 0, *dsts; 3672 struct nm_bridge *b = na->na_bdg; 3673 u_int i, j, me = na->bdg_port; 3674 3675 /* 3676 * The work area (pointed by ft) is followed by an array of 3677 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 3678 * queues per port plus one for the broadcast traffic. 3679 * Then we have an array of destination indexes. 
3680 */ 3681 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 3682 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 3683 3684 /* first pass: find a destination for each packet in the batch */ 3685 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 3686 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 3687 uint16_t dst_port, d_i; 3688 struct nm_bdg_q *d; 3689 3690 ND("slot %d frags %d", i, ft[i].ft_frags); 3691 dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, 3692 &dst_ring, na); 3693 if (netmap_verbose > 255) 3694 RD(5, "slot %d port %d -> %d", i, me, dst_port); 3695 if (dst_port == NM_BDG_NOPORT) 3696 continue; /* this packet is identified to be dropped */ 3697 else if (unlikely(dst_port > NM_BDG_MAXPORTS)) 3698 continue; 3699 else if (dst_port == NM_BDG_BROADCAST) 3700 dst_ring = 0; /* broadcasts always go to ring 0 */ 3701 else if (unlikely(dst_port == me || 3702 !b->bdg_ports[dst_port])) 3703 continue; 3704 3705 /* get a position in the scratch pad */ 3706 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 3707 d = dst_ents + d_i; 3708 3709 /* append the first fragment to the list */ 3710 if (d->bq_head == NM_FT_NULL) { /* new destination */ 3711 d->bq_head = d->bq_tail = i; 3712 /* remember this position to be scanned later */ 3713 if (dst_port != NM_BDG_BROADCAST) 3714 dsts[num_dsts++] = d_i; 3715 } else { 3716 ft[d->bq_tail].ft_next = i; 3717 d->bq_tail = i; 3718 } 3719 d->bq_len += ft[i].ft_frags; 3720 } 3721 3722 /* 3723 * Broadcast traffic goes to ring 0 on all destinations. 3724 * So we need to add these rings to the list of ports to scan. 3725 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 3726 * expensive. We should keep a compact list of active destinations 3727 * so we could shorten this loop. 3728 */ 3729 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 3730 if (brddst->bq_head != NM_FT_NULL) { 3731 for (j = 0; likely(j < b->bdg_active_ports); j++) { 3732 uint16_t d_i; 3733 i = b->bdg_port_index[j]; 3734 if (unlikely(i == me)) 3735 continue; 3736 d_i = i * NM_BDG_MAXRINGS; 3737 if (dst_ents[d_i].bq_head == NM_FT_NULL) 3738 dsts[num_dsts++] = d_i; 3739 } 3740 } 3741 3742 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 3743 /* second pass: scan destinations (XXX will be modular somehow) */ 3744 for (i = 0; i < num_dsts; i++) { 3745 struct ifnet *dst_ifp; 3746 struct netmap_adapter *dst_na; 3747 struct netmap_kring *kring; 3748 struct netmap_ring *ring; 3749 u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; 3750 u_int needed, howmany; 3751 int retry = netmap_txsync_retry; 3752 struct nm_bdg_q *d; 3753 uint32_t my_start = 0, lease_idx = 0; 3754 int nrings; 3755 3756 d_i = dsts[i]; 3757 ND("second pass %d port %d", i, d_i); 3758 d = dst_ents + d_i; 3759 // XXX fix the division 3760 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 3761 /* protect from the lookup function returning an inactive 3762 * destination port 3763 */ 3764 if (unlikely(dst_na == NULL)) 3765 goto cleanup; 3766 if (dst_na->na_flags & NAF_SW_ONLY) 3767 goto cleanup; 3768 dst_ifp = dst_na->ifp; 3769 /* 3770 * The interface may be in !netmap mode in two cases: 3771 * - when na is attached but not activated yet; 3772 * - when na is being deactivated but is still attached. 
	/* second pass: scan destinations (XXX will be modular somehow) */
	for (i = 0; i < num_dsts; i++) {
		struct ifnet *dst_ifp;
		struct netmap_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->na_flags & NAF_SW_ONLY)
			goto cleanup;
		dst_ifp = dst_na->ifp;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* We need to reserve this many slots; if fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a
		 * chance that we will not use all of the slots we have
		 * claimed, and we will have to handle the leftover ones
		 * when we regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		is_vp = nma_is_vp(dst_na);
		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		if (is_vp) { /* virtual port */
			nrings = dst_na->num_rx_rings;
		} else {
			nrings = dst_na->num_tx_rings;
		}
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = is_vp ? &dst_na->rx_rings[dst_nr] :
				&dst_na->tx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop the lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		/* on physical interfaces, do a txsync to recover
		 * slots for packets already transmitted.
		 * XXX maybe we could be optimistic and rely on a retry
		 * in case of failure.
		 */
		if (nma_is_hw(dst_na)) {
			dst_na->nm_txsync(dst_ifp, dst_nr, 0);
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, is_vp);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, is_vp);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;
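		/*
		 * Illustrative note (made-up numbers) on the lease
		 * mechanism used above: each concurrent sender atomically
		 * claims a contiguous window of slots starting at
		 * nkr_hwlease (my_start) plus an entry in nkr_leases[]
		 * (lease_idx) where it will later record how far it got.
		 * E.g. two senders may hold slots [100..149] and [150..199]
		 * at the same time and copy without the lock held; the
		 * visible ring pointers only advance, in the completion
		 * step further down, once the earliest outstanding lease
		 * has been reported done.
		 */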
		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick the next
			 * packet. NM_FT_NULL is always higher than valid
			 * indexes so we never dereference it if the other
			 * list has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
				break; /* no more space */
			howmany -= cnt;
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			do {
				void *dst, *src = ft_p->ft_buf;
				/* round the length to a multiple of 64 */
				size_t len = (ft_p->ft_len + 63) & ~63;

				slot = &ring->slot[j];
				dst = BDG_NMB(dst_na->nm_mem, slot);

				ND("send %d %d bytes at %s:%d",
					i, ft_p->ft_len, dst_ifp->if_xname, j);
				if (ft_p->ft_flags & NS_INDIRECT) {
					if (copyin(src, dst, len)) {
						// invalid user pointer, pretend len is 0
						ft_p->ft_len = 0;
					}
				} else {
					//memcpy(dst, src, len);
					pkt_copy(src, dst, (int)len);
				}
				slot->len = ft_p->ft_len;
				slot->flags = (cnt << 8) | NS_MOREFRAG;
				j = nm_next(j, lim);
				ft_p++;
				sent++;
			} while (ft_p != ft_end);
			slot->flags = (cnt << 8); /* clear NS_MOREFRAG on the last entry */
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
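		/*
		 * Example of the flag encoding produced above, for a
		 * packet made of 3 fragments occupying slots j, j+1, j+2:
		 * the first two slots carry (3 << 8) | NS_MOREFRAG and the
		 * last one carries just (3 << 8), i.e. the fragment count
		 * stays in the upper bits while NS_MOREFRAG is cleared on
		 * the final fragment.
		 */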
		{
			/* current position */
			uint32_t *p = kring->nkr_leases; /* shorthand */
			uint32_t update_pos;
			int still_locked = 1;

			mtx_lock(&kring->q_lock);
			if (unlikely(howmany > 0)) {
				/* We have not used all of the buffers.
				 * If I am the last one I can recover the
				 * slots, otherwise I must fill them with
				 * len 0 to mark empty packets.
				 */
				ND("leftover %d bufs", howmany);
				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
					/* yes, I am the last one */
					ND("roll back nkr_hwlease to %d", j);
					kring->nkr_hwlease = j;
				} else {
					while (howmany-- > 0) {
						ring->slot[j].len = 0;
						ring->slot[j].flags = 0;
						j = nm_next(j, lim);
					}
				}
			}
			p[lease_idx] = j; /* report I am done */

			update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur;

			if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other
				 * ranges have been completed, and do a
				 * selwakeup or txsync.
				 */
				while (lease_idx != kring->nkr_lease_idx &&
				    p[lease_idx] != NR_NOSLOT) {
					j = p[lease_idx];
					p[lease_idx] = NR_NOSLOT;
					lease_idx = nm_next(lease_idx, lim);
				}
				/* j is the new 'write' position. j != my_start
				 * means there are new buffers to report
				 */
				if (likely(j != my_start)) {
					if (is_vp) {
						uint32_t old_avail = kring->nr_hwavail;

						kring->nr_hwavail = (j >= kring->nr_hwcur) ?
							j - kring->nr_hwcur :
							j + lim + 1 - kring->nr_hwcur;
						if (kring->nr_hwavail < old_avail) {
							D("avail shrink %d -> %d",
								old_avail, kring->nr_hwavail);
						}
						still_locked = 0;
						mtx_unlock(&kring->q_lock);
						selwakeuppri(&kring->si, PI_NET);
					} else {
						ring->cur = j;
						/* XXX update avail ? */
						still_locked = 0;
						dst_na->nm_txsync(dst_ifp, dst_nr, 0);
						mtx_unlock(&kring->q_lock);

						/* retry to send more packets */
						if (nma_is_hw(dst_na) && retry--)
							goto retry;
					}
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}


/*
 * Main dispatch routine for the bridge.
 * We already know that only one thread is running this;
 * we must run nm_bdg_preflush() without holding the lock.
 */
static int
bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring = &na->tx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int j, k, lim = kring->nkr_num_slots - 1;

	k = ring->cur;
	if (k > lim)
		return netmap_ring_reinit(kring);

	if (bridge_batch <= 0) { /* testing only */
		j = k; // used all
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	j = nm_bdg_preflush(na, ring_nr, kring, k);
	if (j != k)
		D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
	/* k - j modulo the ring size is the number of slots not yet flushed */
	if (k < j)
		k += kring->nkr_num_slots;
	kring->nr_hwavail = lim - (k - j);

done:
	kring->nr_hwcur = j;
	ring->avail = kring->nr_hwavail;
	if (netmap_verbose)
		D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags);
	return 0;
}

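/*
 * Worked example (made-up numbers) for the accounting at the end of
 * bdg_netmap_txsync() above: with nkr_num_slots = 1024 (lim = 1023),
 * ring->cur = 4 and an early break at j = 1020, the adjusted k is
 * 4 + 1024 = 1028, so 1028 - 1020 = 8 slots are still pending and
 * nr_hwavail becomes 1023 - 8 = 1015 free slots for userspace.
 */
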
/*
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int j, lim = kring->nkr_num_slots - 1;
	u_int k = ring->cur, resvd = ring->reserved;
	int n;

	mtx_lock(&kring->q_lock);
	if (k > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* skip past packets that userspace has released */
	j = kring->nr_hwcur;	/* netmap ring index */
	if (resvd > 0) {
		if (resvd + ring->avail >= lim + 1) {
			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
			ring->reserved = resvd = 0; // XXX panic...
		}
		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
	}

	if (j != k) { /* userspace has released some packets. */
		n = k - j;
		if (n < 0)
			n += kring->nkr_num_slots;
		ND("userspace releases %d packets", n);
		for (n = 0; likely(j != k); n++) {
			struct netmap_slot *slot = &ring->slot[j];
			void *addr = BDG_NMB(na->nm_mem, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			j = nm_next(j, lim);
		}
		kring->nr_hwavail -= n;
		kring->nr_hwcur = k;
	}
	/* tell userspace that there are new packets */
	ring->avail = kring->nr_hwavail - resvd;
	n = 0;
done:
	mtx_unlock(&kring->q_lock);
	return n;
}


static void
bdg_netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_adapter na;

	ND("attaching virtual bridge");
	bzero(&na, sizeof(na));

	na.ifp = arg->ifp;
	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na.num_tx_rings = arg->num_tx_rings;
	na.num_rx_rings = arg->num_rx_rings;
	na.num_tx_desc = arg->num_tx_desc;
	na.num_rx_desc = arg->num_rx_desc;
	na.nm_txsync = bdg_netmap_txsync;
	na.nm_rxsync = bdg_netmap_rxsync;
	na.nm_register = bdg_netmap_reg;
	na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname,
			na.num_tx_rings, na.num_tx_desc,
			na.num_rx_rings, na.num_rx_desc);
	netmap_attach(&na, na.num_tx_rings);
}


static struct cdev *netmap_dev; /* /dev/netmap character device. */


/*
 * Module loader.
 *
 * Create the /dev/netmap device and initialize all global
 * variables.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_init(void)
{
	int i, error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0) {
		printf("netmap: unable to initialize the memory allocator.\n");
		return (error);
	}
	printf("netmap: loaded module\n");
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
			      "netmap");

	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
	for (i = 0; i < NM_BRIDGES; i++)
		BDG_RWINIT(&nm_bridges[i]);
	return (error);
}


/*
 * Module unloader.
 *
 * Free all the memory, and destroy the ``/dev/netmap'' device.
 */
static void
netmap_fini(void)
{
	destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	printf("netmap: unloaded module.\n");
}


#ifdef __FreeBSD__
/*
 * Kernel entry point.
 *
 * Initialize/finalize the module and return.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_loader(__unused struct module *module, int event, __unused void *arg)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		error = netmap_init();
		break;

	case MOD_UNLOAD:
		netmap_fini();
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}


DEV_MODULE(netmap, netmap_loader, NULL);
#endif /* __FreeBSD__ */
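/*
 * Illustrative usage on FreeBSD once this file is built as a module
 * (exact paths and build setup may differ):
 *
 *	kldload ./netmap.ko	# netmap_loader(MOD_LOAD) -> netmap_init()
 *	ls -l /dev/netmap	# character device created by netmap_init()
 *	kldunload netmap	# netmap_loader(MOD_UNLOAD) -> netmap_fini()
 */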