1 /* 2 * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 #define NM_BRIDGE 27 28 /* 29 * This module supports memory mapped access to network devices, 30 * see netmap(4). 31 * 32 * The module uses a large, memory pool allocated by the kernel 33 * and accessible as mmapped memory by multiple userspace threads/processes. 34 * The memory pool contains packet buffers and "netmap rings", 35 * i.e. user-accessible copies of the interface's queues. 36 * 37 * Access to the network card works like this: 38 * 1. a process/thread issues one or more open() on /dev/netmap, to create 39 * select()able file descriptor on which events are reported. 40 * 2. on each descriptor, the process issues an ioctl() to identify 41 * the interface that should report events to the file descriptor. 42 * 3. on each descriptor, the process issues an mmap() request to 43 * map the shared memory region within the process' address space. 44 * The list of interesting queues is indicated by a location in 45 * the shared memory region. 46 * 4. using the functions in the netmap(4) userspace API, a process 47 * can look up the occupation state of a queue, access memory buffers, 48 * and retrieve received packets or enqueue packets to transmit. 49 * 5. using some ioctl()s the process can synchronize the userspace view 50 * of the queue with the actual status in the kernel. This includes both 51 * receiving the notification of new packets, and transmitting new 52 * packets on the output interface. 53 * 6. select() or poll() can be used to wait for events on individual 54 * transmit or receive queues (or all queues for a given interface). 
55 */ 56 57 #ifdef linux 58 #include "bsd_glue.h" 59 static netdev_tx_t netmap_start_linux(struct sk_buff *skb, struct net_device *dev); 60 #endif /* linux */ 61 62 #ifdef __APPLE__ 63 #include "osx_glue.h" 64 #endif /* __APPLE__ */ 65 66 #ifdef __FreeBSD__ 67 #include <sys/cdefs.h> /* prerequisite */ 68 __FBSDID("$FreeBSD$"); 69 70 #include <sys/types.h> 71 #include <sys/module.h> 72 #include <sys/errno.h> 73 #include <sys/param.h> /* defines used in kernel.h */ 74 #include <sys/jail.h> 75 #include <sys/kernel.h> /* types used in module initialization */ 76 #include <sys/conf.h> /* cdevsw struct */ 77 #include <sys/uio.h> /* uio struct */ 78 #include <sys/sockio.h> 79 #include <sys/socketvar.h> /* struct socket */ 80 #include <sys/malloc.h> 81 #include <sys/mman.h> /* PROT_EXEC */ 82 #include <sys/poll.h> 83 #include <sys/proc.h> 84 #include <vm/vm.h> /* vtophys */ 85 #include <vm/pmap.h> /* vtophys */ 86 #include <sys/socket.h> /* sockaddrs */ 87 #include <machine/bus.h> 88 #include <sys/selinfo.h> 89 #include <sys/sysctl.h> 90 #include <net/if.h> 91 #include <net/bpf.h> /* BIOCIMMEDIATE */ 92 #include <net/vnet.h> 93 #include <net/netmap.h> 94 #include <dev/netmap/netmap_kern.h> 95 #include <machine/bus.h> /* bus_dmamap_* */ 96 97 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 98 #endif /* __FreeBSD__ */ 99 100 /* 101 * lock and unlock for the netmap memory allocator 102 */ 103 #define NMA_LOCK() mtx_lock(&nm_mem->nm_mtx); 104 #define NMA_UNLOCK() mtx_unlock(&nm_mem->nm_mtx); 105 struct netmap_mem_d; 106 static struct netmap_mem_d *nm_mem; /* Our memory allocator. */ 107 108 u_int netmap_total_buffers; 109 char *netmap_buffer_base; /* address of an invalid buffer */ 110 111 /* user-controlled variables */ 112 int netmap_verbose; 113 114 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 115 116 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 117 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 118 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 119 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 120 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 121 int netmap_buf_size = 2048; 122 TUNABLE_INT("hw.netmap.buf_size", &netmap_buf_size); 123 SYSCTL_INT(_dev_netmap, OID_AUTO, buf_size, 124 CTLFLAG_RD, &netmap_buf_size, 0, "Size of packet buffers"); 125 int netmap_mitigate = 1; 126 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 127 int netmap_no_pendintr = 1; 128 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 129 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 130 131 int netmap_drop = 0; /* debugging */ 132 int netmap_flags = 0; /* debug flags */ 133 int netmap_copy = 0; /* debugging, copy content */ 134 135 SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); 136 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 137 SYSCTL_INT(_dev_netmap, OID_AUTO, copy, CTLFLAG_RW, &netmap_copy, 0 , ""); 138 139 #ifdef NM_BRIDGE /* support for netmap bridge */ 140 141 /* 142 * system parameters. 143 * 144 * All switched ports have prefix NM_NAME. 145 * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap, 146 * so a practical upper bound is 64). 147 * Each tx ring is read-write, whereas rx rings are readonly (XXX not done yet). 148 * The virtual interfaces use per-queue lock instead of core lock. 149 * In the tx loop, we aggregate traffic in batches to make all operations 150 * faster. 
The batch size is NM_BDG_BATCH 151 */ 152 #define NM_NAME "vale" /* prefix for the interface */ 153 #define NM_BDG_MAXPORTS 16 /* up to 64 ? */ 154 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 155 #define NM_BDG_HASH 1024 /* forwarding table entries */ 156 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 157 #define NM_BRIDGES 4 /* number of bridges */ 158 int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */ 159 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , ""); 160 161 #ifdef linux 162 #define ADD_BDG_REF(ifp) (NA(ifp)->if_refcount++) 163 #define DROP_BDG_REF(ifp) (NA(ifp)->if_refcount-- <= 1) 164 #else /* !linux */ 165 #define ADD_BDG_REF(ifp) (ifp)->if_refcount++ 166 #define DROP_BDG_REF(ifp) refcount_release(&(ifp)->if_refcount) 167 #ifdef __FreeBSD__ 168 #include <sys/endian.h> 169 #include <sys/refcount.h> 170 #endif /* __FreeBSD__ */ 171 #define prefetch(x) __builtin_prefetch(x) 172 #endif /* !linux */ 173 174 static void bdg_netmap_attach(struct ifnet *ifp); 175 static int bdg_netmap_reg(struct ifnet *ifp, int onoff); 176 /* per-tx-queue entry */ 177 struct nm_bdg_fwd { /* forwarding entry for a bridge */ 178 void *buf; 179 uint64_t dst; /* dst mask */ 180 uint32_t src; /* src index ? */ 181 uint16_t len; /* src len */ 182 }; 183 184 struct nm_hash_ent { 185 uint64_t mac; /* the top 2 bytes are the epoch */ 186 uint64_t ports; 187 }; 188 189 /* 190 * Interfaces for a bridge are all in ports[]. 191 * The array has fixed size, an empty entry does not terminate 192 * the search. 193 */ 194 struct nm_bridge { 195 struct ifnet *bdg_ports[NM_BDG_MAXPORTS]; 196 int n_ports; 197 uint64_t act_ports; 198 int freelist; /* first buffer index */ 199 NM_SELINFO_T si; /* poll/select wait queue */ 200 NM_LOCK_T bdg_lock; /* protect the selinfo ? */ 201 202 /* the forwarding table, MAC+ports */ 203 struct nm_hash_ent ht[NM_BDG_HASH]; 204 205 int namelen; /* 0 means free */ 206 char basename[IFNAMSIZ]; 207 }; 208 209 struct nm_bridge nm_bridges[NM_BRIDGES]; 210 211 #define BDG_LOCK(b) mtx_lock(&(b)->bdg_lock) 212 #define BDG_UNLOCK(b) mtx_unlock(&(b)->bdg_lock) 213 214 /* 215 * NA(ifp)->bdg_port port index 216 */ 217 218 // XXX only for multiples of 64 bytes, non overlapped. 219 static inline void 220 pkt_copy(void *_src, void *_dst, int l) 221 { 222 uint64_t *src = _src; 223 uint64_t *dst = _dst; 224 if (unlikely(l >= 1024)) { 225 bcopy(src, dst, l); 226 return; 227 } 228 for (; likely(l > 0); l-=64) { 229 *dst++ = *src++; 230 *dst++ = *src++; 231 *dst++ = *src++; 232 *dst++ = *src++; 233 *dst++ = *src++; 234 *dst++ = *src++; 235 *dst++ = *src++; 236 *dst++ = *src++; 237 } 238 } 239 240 /* 241 * locate a bridge among the existing ones. 242 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. 243 * We assume that this is called with a name of at least NM_NAME chars. 
244 */ 245 static struct nm_bridge * 246 nm_find_bridge(const char *name) 247 { 248 int i, l, namelen, e; 249 struct nm_bridge *b = NULL; 250 251 namelen = strlen(NM_NAME); /* base length */ 252 l = strlen(name); /* actual length */ 253 for (i = namelen + 1; i < l; i++) { 254 if (name[i] == ':') { 255 namelen = i; 256 break; 257 } 258 } 259 if (namelen >= IFNAMSIZ) 260 namelen = IFNAMSIZ; 261 ND("--- prefix is '%.*s' ---", namelen, name); 262 263 /* use the first entry for locking */ 264 BDG_LOCK(nm_bridges); // XXX do better 265 for (e = -1, i = 1; i < NM_BRIDGES; i++) { 266 b = nm_bridges + i; 267 if (b->namelen == 0) 268 e = i; /* record empty slot */ 269 else if (strncmp(name, b->basename, namelen) == 0) { 270 ND("found '%.*s' at %d", namelen, name, i); 271 break; 272 } 273 } 274 if (i == NM_BRIDGES) { /* all full */ 275 if (e == -1) { /* no empty slot */ 276 b = NULL; 277 } else { 278 b = nm_bridges + e; 279 strncpy(b->basename, name, namelen); 280 b->namelen = namelen; 281 } 282 } 283 BDG_UNLOCK(nm_bridges); 284 return b; 285 } 286 #endif /* NM_BRIDGE */ 287 288 /*------------- memory allocator -----------------*/ 289 #ifdef NETMAP_MEM2 290 #include "netmap_mem2.c" 291 #else /* !NETMAP_MEM2 */ 292 #include "netmap_mem1.c" 293 #endif /* !NETMAP_MEM2 */ 294 /*------------ end of memory allocator ----------*/ 295 296 /* Structure associated to each thread which registered an interface. */ 297 struct netmap_priv_d { 298 struct netmap_if *np_nifp; /* netmap interface descriptor. */ 299 300 struct ifnet *np_ifp; /* device for which we hold a reference */ 301 int np_ringid; /* from the ioctl */ 302 u_int np_qfirst, np_qlast; /* range of rings to scan */ 303 uint16_t np_txpoll; 304 }; 305 306 307 /* 308 * File descriptor's private data destructor. 309 * 310 * Call nm_register(ifp,0) to stop netmap mode on the interface and 311 * revert to normal operation. We expect that np_ifp has not gone. 312 */ 313 static void 314 netmap_dtor_locked(void *data) 315 { 316 struct netmap_priv_d *priv = data; 317 struct ifnet *ifp = priv->np_ifp; 318 struct netmap_adapter *na = NA(ifp); 319 struct netmap_if *nifp = priv->np_nifp; 320 321 na->refcount--; 322 if (na->refcount <= 0) { /* last instance */ 323 u_int i, j, lim; 324 325 D("deleting last netmap instance for %s", ifp->if_xname); 326 /* 327 * there is a race here with *_netmap_task() and 328 * netmap_poll(), which don't run under NETMAP_REG_LOCK. 329 * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP 330 * (aka NETMAP_DELETING(na)) are a unique marker that the 331 * device is dying. 332 * Before destroying stuff we sleep a bit, and then complete 333 * the job. NIOCREG should realize the condition and 334 * loop until they can continue; the other routines 335 * should check the condition at entry and quit if 336 * they cannot run. 337 */ 338 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); 339 tsleep(na, 0, "NIOCUNREG", 4); 340 na->nm_lock(ifp, NETMAP_REG_LOCK, 0); 341 na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ 342 /* Wake up any sleeping threads. 
netmap_poll will 343 * then return POLLERR 344 */ 345 for (i = 0; i < na->num_tx_rings + 1; i++) 346 selwakeuppri(&na->tx_rings[i].si, PI_NET); 347 for (i = 0; i < na->num_rx_rings + 1; i++) 348 selwakeuppri(&na->rx_rings[i].si, PI_NET); 349 selwakeuppri(&na->tx_si, PI_NET); 350 selwakeuppri(&na->rx_si, PI_NET); 351 /* release all buffers */ 352 NMA_LOCK(); 353 for (i = 0; i < na->num_tx_rings + 1; i++) { 354 struct netmap_ring *ring = na->tx_rings[i].ring; 355 lim = na->tx_rings[i].nkr_num_slots; 356 for (j = 0; j < lim; j++) 357 netmap_free_buf(nifp, ring->slot[j].buf_idx); 358 } 359 for (i = 0; i < na->num_rx_rings + 1; i++) { 360 struct netmap_ring *ring = na->rx_rings[i].ring; 361 lim = na->rx_rings[i].nkr_num_slots; 362 for (j = 0; j < lim; j++) 363 netmap_free_buf(nifp, ring->slot[j].buf_idx); 364 } 365 NMA_UNLOCK(); 366 netmap_free_rings(na); 367 wakeup(na); 368 } 369 netmap_if_free(nifp); 370 } 371 372 static void 373 nm_if_rele(struct ifnet *ifp) 374 { 375 #ifndef NM_BRIDGE 376 if_rele(ifp); 377 #else /* NM_BRIDGE */ 378 int i, full; 379 struct nm_bridge *b; 380 381 if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) { 382 if_rele(ifp); 383 return; 384 } 385 if (!DROP_BDG_REF(ifp)) 386 return; 387 b = ifp->if_bridge; 388 BDG_LOCK(nm_bridges); 389 BDG_LOCK(b); 390 ND("want to disconnect %s from the bridge", ifp->if_xname); 391 full = 0; 392 for (i = 0; i < NM_BDG_MAXPORTS; i++) { 393 if (b->bdg_ports[i] == ifp) { 394 b->bdg_ports[i] = NULL; 395 bzero(ifp, sizeof(*ifp)); 396 free(ifp, M_DEVBUF); 397 break; 398 } 399 else if (b->bdg_ports[i] != NULL) 400 full = 1; 401 } 402 BDG_UNLOCK(b); 403 if (full == 0) { 404 ND("freeing bridge %d", b - nm_bridges); 405 b->namelen = 0; 406 } 407 BDG_UNLOCK(nm_bridges); 408 if (i == NM_BDG_MAXPORTS) 409 D("ouch, cannot find ifp to remove"); 410 #endif /* NM_BRIDGE */ 411 } 412 413 static void 414 netmap_dtor(void *data) 415 { 416 struct netmap_priv_d *priv = data; 417 struct ifnet *ifp = priv->np_ifp; 418 struct netmap_adapter *na = NA(ifp); 419 420 na->nm_lock(ifp, NETMAP_REG_LOCK, 0); 421 netmap_dtor_locked(data); 422 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); 423 424 nm_if_rele(ifp); 425 bzero(priv, sizeof(*priv)); /* XXX for safety */ 426 free(priv, M_DEVBUF); 427 } 428 429 430 /* 431 * mmap(2) support for the "netmap" device. 432 * 433 * Expose all the memory previously allocated by our custom memory 434 * allocator: this way the user has only to issue a single mmap(2), and 435 * can work on all the data structures flawlessly. 436 * 437 * Return 0 on success, -1 otherwise. 438 */ 439 440 #ifdef __FreeBSD__ 441 static int 442 netmap_mmap(__unused struct cdev *dev, 443 #if __FreeBSD_version < 900000 444 vm_offset_t offset, vm_paddr_t *paddr, int nprot 445 #else 446 vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, 447 __unused vm_memattr_t *memattr 448 #endif 449 ) 450 { 451 if (nprot & PROT_EXEC) 452 return (-1); // XXX -1 or EINVAL ? 453 454 ND("request for offset 0x%x", (uint32_t)offset); 455 *paddr = netmap_ofstophys(offset); 456 457 return (0); 458 } 459 #endif /* __FreeBSD__ */ 460 461 462 /* 463 * Handlers for synchronization of the queues from/to the host. 464 * 465 * netmap_sync_to_host() passes packets up. We are called from a 466 * system call in user process context, and the only contention 467 * can be among multiple user threads erroneously calling 468 * this routine concurrently. In principle we should not even 469 * need to lock. 
470 */ 471 static void 472 netmap_sync_to_host(struct netmap_adapter *na) 473 { 474 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 475 struct netmap_ring *ring = kring->ring; 476 struct mbuf *head = NULL, *tail = NULL, *m; 477 u_int k, n, lim = kring->nkr_num_slots - 1; 478 479 k = ring->cur; 480 if (k > lim) { 481 netmap_ring_reinit(kring); 482 return; 483 } 484 // na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); 485 486 /* Take packets from hwcur to cur and pass them up. 487 * In case of no buffers we give up. At the end of the loop, 488 * the queue is drained in all cases. 489 */ 490 for (n = kring->nr_hwcur; n != k;) { 491 struct netmap_slot *slot = &ring->slot[n]; 492 493 n = (n == lim) ? 0 : n + 1; 494 if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) { 495 D("bad pkt at %d len %d", n, slot->len); 496 continue; 497 } 498 m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL); 499 500 if (m == NULL) 501 break; 502 if (tail) 503 tail->m_nextpkt = m; 504 else 505 head = m; 506 tail = m; 507 m->m_nextpkt = NULL; 508 } 509 kring->nr_hwcur = k; 510 kring->nr_hwavail = ring->avail = lim; 511 // na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0); 512 513 /* send packets up, outside the lock */ 514 while ((m = head) != NULL) { 515 head = head->m_nextpkt; 516 m->m_nextpkt = NULL; 517 if (netmap_verbose & NM_VERB_HOST) 518 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 519 NM_SEND_UP(na->ifp, m); 520 } 521 } 522 523 /* 524 * rxsync backend for packets coming from the host stack. 525 * They have been put in the queue by netmap_start() so we 526 * need to protect access to the kring using a lock. 527 * 528 * This routine also does the selrecord if called from the poll handler 529 * (we know because td != NULL). 530 * 531 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 532 * as an additional hidden argument. 533 */ 534 static void 535 netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 536 { 537 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 538 struct netmap_ring *ring = kring->ring; 539 u_int j, n, lim = kring->nkr_num_slots; 540 u_int k = ring->cur, resvd = ring->reserved; 541 542 (void)pwait; /* disable unused warnings */ 543 na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); 544 if (k >= lim) { 545 netmap_ring_reinit(kring); 546 return; 547 } 548 /* new packets are already set in nr_hwavail */ 549 /* skip past packets that userspace has released */ 550 j = kring->nr_hwcur; 551 if (resvd > 0) { 552 if (resvd + ring->avail >= lim + 1) { 553 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 554 ring->reserved = resvd = 0; // XXX panic... 555 } 556 k = (k >= resvd) ? k - resvd : k + lim - resvd; 557 } 558 if (j != k) { 559 n = k >= j ? k - j : k + lim - j; 560 kring->nr_hwavail -= n; 561 kring->nr_hwcur = k; 562 } 563 k = ring->avail = kring->nr_hwavail - resvd; 564 if (k == 0 && td) 565 selrecord(td, &kring->si); 566 if (k && (netmap_verbose & NM_VERB_HOST)) 567 D("%d pkts from stack", k); 568 na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0); 569 } 570 571 572 /* 573 * get a refcounted reference to an interface. 574 * Return ENXIO if the interface does not exist, EINVAL if netmap 575 * is not supported by the interface. 576 * If successful, hold a reference. 
577 */ 578 static int 579 get_ifp(const char *name, struct ifnet **ifp) 580 { 581 #ifdef NM_BRIDGE 582 struct ifnet *iter = NULL; 583 584 do { 585 struct nm_bridge *b; 586 int i, l, cand = -1; 587 588 if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) 589 break; 590 b = nm_find_bridge(name); 591 if (b == NULL) { 592 D("no bridges available for '%s'", name); 593 return (ENXIO); 594 } 595 /* XXX locking */ 596 BDG_LOCK(b); 597 /* lookup in the local list of ports */ 598 for (i = 0; i < NM_BDG_MAXPORTS; i++) { 599 iter = b->bdg_ports[i]; 600 if (iter == NULL) { 601 if (cand == -1) 602 cand = i; /* potential insert point */ 603 continue; 604 } 605 if (!strcmp(iter->if_xname, name)) { 606 ADD_BDG_REF(iter); 607 ND("found existing interface"); 608 BDG_UNLOCK(b); 609 break; 610 } 611 } 612 if (i < NM_BDG_MAXPORTS) /* already unlocked */ 613 break; 614 if (cand == -1) { 615 D("bridge full, cannot create new port"); 616 no_port: 617 BDG_UNLOCK(b); 618 *ifp = NULL; 619 return EINVAL; 620 } 621 ND("create new bridge port %s", name); 622 /* space for forwarding list after the ifnet */ 623 l = sizeof(*iter) + 624 sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ; 625 iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); 626 if (!iter) 627 goto no_port; 628 strcpy(iter->if_xname, name); 629 bdg_netmap_attach(iter); 630 b->bdg_ports[cand] = iter; 631 iter->if_bridge = b; 632 ADD_BDG_REF(iter); 633 BDG_UNLOCK(b); 634 ND("attaching virtual bridge %p", b); 635 } while (0); 636 *ifp = iter; 637 if (! *ifp) 638 #endif /* NM_BRIDGE */ 639 *ifp = ifunit_ref(name); 640 if (*ifp == NULL) 641 return (ENXIO); 642 /* can do this if the capability exists and if_pspare[0] 643 * points to the netmap descriptor. 644 */ 645 if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp)) 646 return 0; /* valid pointer, we hold the refcount */ 647 nm_if_rele(*ifp); 648 return EINVAL; // not NETMAP capable 649 } 650 651 652 /* 653 * Error routine called when txsync/rxsync detects an error. 654 * Can't do much more than resetting cur = hwcur, avail = hwavail. 655 * Return 1 on reinit. 656 * 657 * This routine is only called by the upper half of the kernel. 658 * It only reads hwcur (which is changed only by the upper half, too) 659 * and hwavail (which may be changed by the lower half, but only on 660 * a tx ring and only to increase it, so any error will be recovered 661 * on the next call). For the above, we don't strictly need to call 662 * it under lock. 663 */ 664 int 665 netmap_ring_reinit(struct netmap_kring *kring) 666 { 667 struct netmap_ring *ring = kring->ring; 668 u_int i, lim = kring->nkr_num_slots - 1; 669 int errors = 0; 670 671 D("called for %s", kring->na->ifp->if_xname); 672 if (ring->cur > lim) 673 errors++; 674 for (i = 0; i <= lim; i++) { 675 u_int idx = ring->slot[i].buf_idx; 676 u_int len = ring->slot[i].len; 677 if (idx < 2 || idx >= netmap_total_buffers) { 678 if (!errors++) 679 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 680 ring->slot[i].buf_idx = 0; 681 ring->slot[i].len = 0; 682 } else if (len > NETMAP_BUF_SIZE) { 683 ring->slot[i].len = 0; 684 if (!errors++) 685 D("bad len %d at slot %d idx %d", 686 len, i, idx); 687 } 688 } 689 if (errors) { 690 int pos = kring - kring->na->tx_rings; 691 int n = kring->na->num_tx_rings + 1; 692 693 D("total %d errors", errors); 694 errors++; 695 D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 696 kring->na->ifp->if_xname, 697 pos < n ? "TX" : "RX", pos < n ? 
pos : pos - n, 698 ring->cur, kring->nr_hwcur, 699 ring->avail, kring->nr_hwavail); 700 ring->cur = kring->nr_hwcur; 701 ring->avail = kring->nr_hwavail; 702 } 703 return (errors ? 1 : 0); 704 } 705 706 707 /* 708 * Set the ring ID. For devices with a single queue, a request 709 * for all rings is the same as a single ring. 710 */ 711 static int 712 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 713 { 714 struct ifnet *ifp = priv->np_ifp; 715 struct netmap_adapter *na = NA(ifp); 716 u_int i = ringid & NETMAP_RING_MASK; 717 /* initially (np_qfirst == np_qlast) we don't want to lock */ 718 int need_lock = (priv->np_qfirst != priv->np_qlast); 719 int lim = na->num_rx_rings; 720 721 if (na->num_tx_rings > lim) 722 lim = na->num_tx_rings; 723 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 724 D("invalid ring id %d", i); 725 return (EINVAL); 726 } 727 if (need_lock) 728 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); 729 priv->np_ringid = ringid; 730 if (ringid & NETMAP_SW_RING) { 731 priv->np_qfirst = NETMAP_SW_RING; 732 priv->np_qlast = 0; 733 } else if (ringid & NETMAP_HW_RING) { 734 priv->np_qfirst = i; 735 priv->np_qlast = i + 1; 736 } else { 737 priv->np_qfirst = 0; 738 priv->np_qlast = NETMAP_HW_RING ; 739 } 740 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 741 if (need_lock) 742 na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); 743 if (ringid & NETMAP_SW_RING) 744 D("ringid %s set to SW RING", ifp->if_xname); 745 else if (ringid & NETMAP_HW_RING) 746 D("ringid %s set to HW RING %d", ifp->if_xname, 747 priv->np_qfirst); 748 else 749 D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); 750 return 0; 751 } 752 753 /* 754 * ioctl(2) support for the "netmap" device. 755 * 756 * Following a list of accepted commands: 757 * - NIOCGINFO 758 * - SIOCGIFADDR just for convenience 759 * - NIOCREGIF 760 * - NIOCUNREGIF 761 * - NIOCTXSYNC 762 * - NIOCRXSYNC 763 * 764 * Return 0 on success, errno otherwise. 765 */ 766 static int 767 netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, 768 __unused int fflag, struct thread *td) 769 { 770 struct netmap_priv_d *priv = NULL; 771 struct ifnet *ifp; 772 struct nmreq *nmr = (struct nmreq *) data; 773 struct netmap_adapter *na; 774 int error; 775 u_int i, lim; 776 struct netmap_if *nifp; 777 778 #ifdef linux 779 #define devfs_get_cdevpriv(pp) \ 780 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 781 (*pp ? 0 : ENOENT); }) 782 783 /* devfs_set_cdevpriv cannot fail on linux */ 784 #define devfs_set_cdevpriv(p, fn) \ 785 ({ ((struct file *)td)->private_data = p; (p ? 
0 : EINVAL); }) 786 787 788 #define devfs_clear_cdevpriv() do { \ 789 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 790 } while (0) 791 #endif /* linux */ 792 793 CURVNET_SET(TD_TO_VNET(td)); 794 795 error = devfs_get_cdevpriv((void **)&priv); 796 if (error != ENOENT && error != 0) { 797 CURVNET_RESTORE(); 798 return (error); 799 } 800 801 error = 0; /* Could be ENOENT */ 802 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 803 switch (cmd) { 804 case NIOCGINFO: /* return capabilities etc */ 805 /* memsize is always valid */ 806 nmr->nr_memsize = nm_mem->nm_totalsize; 807 nmr->nr_offset = 0; 808 nmr->nr_rx_rings = nmr->nr_tx_rings = 0; 809 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 810 if (nmr->nr_version != NETMAP_API) { 811 D("API mismatch got %d have %d", 812 nmr->nr_version, NETMAP_API); 813 nmr->nr_version = NETMAP_API; 814 error = EINVAL; 815 break; 816 } 817 if (nmr->nr_name[0] == '\0') /* just get memory info */ 818 break; 819 error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */ 820 if (error) 821 break; 822 na = NA(ifp); /* retrieve netmap_adapter */ 823 nmr->nr_rx_rings = na->num_rx_rings; 824 nmr->nr_tx_rings = na->num_tx_rings; 825 nmr->nr_rx_slots = na->num_rx_desc; 826 nmr->nr_tx_slots = na->num_tx_desc; 827 nm_if_rele(ifp); /* return the refcount */ 828 break; 829 830 case NIOCREGIF: 831 if (nmr->nr_version != NETMAP_API) { 832 nmr->nr_version = NETMAP_API; 833 error = EINVAL; 834 break; 835 } 836 if (priv != NULL) { /* thread already registered */ 837 error = netmap_set_ringid(priv, nmr->nr_ringid); 838 break; 839 } 840 /* find the interface and a reference */ 841 error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ 842 if (error) 843 break; 844 na = NA(ifp); /* retrieve netmap adapter */ 845 /* 846 * Allocate the private per-thread structure. 847 * XXX perhaps we can use a blocking malloc ? 848 */ 849 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 850 M_NOWAIT | M_ZERO); 851 if (priv == NULL) { 852 error = ENOMEM; 853 nm_if_rele(ifp); /* return the refcount */ 854 break; 855 } 856 857 for (i = 10; i > 0; i--) { 858 na->nm_lock(ifp, NETMAP_REG_LOCK, 0); 859 if (!NETMAP_DELETING(na)) 860 break; 861 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); 862 tsleep(na, 0, "NIOCREGIF", hz/10); 863 } 864 if (i == 0) { 865 D("too many NIOCREGIF attempts, give up"); 866 error = EINVAL; 867 free(priv, M_DEVBUF); 868 nm_if_rele(ifp); /* return the refcount */ 869 break; 870 } 871 872 priv->np_ifp = ifp; /* store the reference */ 873 error = netmap_set_ringid(priv, nmr->nr_ringid); 874 if (error) 875 goto error; 876 priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na); 877 if (nifp == NULL) { /* allocation failed */ 878 error = ENOMEM; 879 } else if (ifp->if_capenable & IFCAP_NETMAP) { 880 /* was already set */ 881 } else { 882 /* Otherwise set the card in netmap mode 883 * and make it use the shared buffers. 884 */ 885 for (i = 0 ; i < na->num_tx_rings + 1; i++) 886 mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", MTX_NETWORK_LOCK, MTX_DEF); 887 for (i = 0 ; i < na->num_rx_rings + 1; i++) { 888 mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", MTX_NETWORK_LOCK, MTX_DEF); 889 } 890 error = na->nm_register(ifp, 1); /* mode on */ 891 if (error) 892 netmap_dtor_locked(priv); 893 } 894 895 if (error) { /* reg. 
failed, release priv and ref */ 896 error: 897 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); 898 nm_if_rele(ifp); /* return the refcount */ 899 bzero(priv, sizeof(*priv)); 900 free(priv, M_DEVBUF); 901 break; 902 } 903 904 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); 905 error = devfs_set_cdevpriv(priv, netmap_dtor); 906 907 if (error != 0) { 908 /* could not assign the private storage for the 909 * thread, call the destructor explicitly. 910 */ 911 netmap_dtor(priv); 912 break; 913 } 914 915 /* return the offset of the netmap_if object */ 916 nmr->nr_rx_rings = na->num_rx_rings; 917 nmr->nr_tx_rings = na->num_tx_rings; 918 nmr->nr_rx_slots = na->num_rx_desc; 919 nmr->nr_tx_slots = na->num_tx_desc; 920 nmr->nr_memsize = nm_mem->nm_totalsize; 921 nmr->nr_offset = netmap_if_offset(nifp); 922 break; 923 924 case NIOCUNREGIF: 925 if (priv == NULL) { 926 error = ENXIO; 927 break; 928 } 929 930 /* the interface is unregistered inside the 931 destructor of the private data. */ 932 devfs_clear_cdevpriv(); 933 break; 934 935 case NIOCTXSYNC: 936 case NIOCRXSYNC: 937 if (priv == NULL) { 938 error = ENXIO; 939 break; 940 } 941 ifp = priv->np_ifp; /* we have a reference */ 942 na = NA(ifp); /* retrieve netmap adapter */ 943 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 944 if (cmd == NIOCTXSYNC) 945 netmap_sync_to_host(na); 946 else 947 netmap_sync_from_host(na, NULL, NULL); 948 break; 949 } 950 /* find the last ring to scan */ 951 lim = priv->np_qlast; 952 if (lim == NETMAP_HW_RING) 953 lim = (cmd == NIOCTXSYNC) ? 954 na->num_tx_rings : na->num_rx_rings; 955 956 for (i = priv->np_qfirst; i < lim; i++) { 957 if (cmd == NIOCTXSYNC) { 958 struct netmap_kring *kring = &na->tx_rings[i]; 959 if (netmap_verbose & NM_VERB_TXSYNC) 960 D("pre txsync ring %d cur %d hwcur %d", 961 i, kring->ring->cur, 962 kring->nr_hwcur); 963 na->nm_txsync(ifp, i, 1 /* do lock */); 964 if (netmap_verbose & NM_VERB_TXSYNC) 965 D("post txsync ring %d cur %d hwcur %d", 966 i, kring->ring->cur, 967 kring->nr_hwcur); 968 } else { 969 na->nm_rxsync(ifp, i, 1 /* do lock */); 970 microtime(&na->rx_rings[i].ring->ts); 971 } 972 } 973 974 break; 975 976 #ifdef __FreeBSD__ 977 case BIOCIMMEDIATE: 978 case BIOCGHDRCMPLT: 979 case BIOCSHDRCMPLT: 980 case BIOCSSEESENT: 981 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 982 break; 983 984 default: /* allow device-specific ioctls */ 985 { 986 struct socket so; 987 bzero(&so, sizeof(so)); 988 error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ 989 if (error) 990 break; 991 so.so_vnet = ifp->if_vnet; 992 // so->so_proto not null. 993 error = ifioctl(&so, cmd, data, td); 994 nm_if_rele(ifp); 995 break; 996 } 997 998 #else /* linux */ 999 default: 1000 error = EOPNOTSUPP; 1001 #endif /* linux */ 1002 } 1003 1004 CURVNET_RESTORE(); 1005 return (error); 1006 } 1007 1008 1009 /* 1010 * select(2) and poll(2) handlers for the "netmap" device. 1011 * 1012 * Can be called for one or more queues. 1013 * Return true the event mask corresponding to ready events. 1014 * If there are no ready events, do a selrecord on either individual 1015 * selfd or on the global one. 1016 * Device-dependent parts (locking and sync of tx/rx rings) 1017 * are done through callbacks. 1018 * 1019 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1020 * The first one is remapped to pwait as selrecord() uses the name as an 1021 * hidden argument. 
1022 */ 1023 static int 1024 netmap_poll(struct cdev *dev, int events, struct thread *td) 1025 { 1026 struct netmap_priv_d *priv = NULL; 1027 struct netmap_adapter *na; 1028 struct ifnet *ifp; 1029 struct netmap_kring *kring; 1030 u_int core_lock, i, check_all, want_tx, want_rx, revents = 0; 1031 u_int lim_tx, lim_rx; 1032 enum {NO_CL, NEED_CL, LOCKED_CL }; /* see below */ 1033 void *pwait = dev; /* linux compatibility */ 1034 1035 (void)pwait; 1036 1037 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 1038 return POLLERR; 1039 1040 ifp = priv->np_ifp; 1041 // XXX check for deleting() ? 1042 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1043 return POLLERR; 1044 1045 if (netmap_verbose & 0x8000) 1046 D("device %s events 0x%x", ifp->if_xname, events); 1047 want_tx = events & (POLLOUT | POLLWRNORM); 1048 want_rx = events & (POLLIN | POLLRDNORM); 1049 1050 na = NA(ifp); /* retrieve netmap adapter */ 1051 1052 lim_tx = na->num_tx_rings; 1053 lim_rx = na->num_rx_rings; 1054 /* how many queues we are scanning */ 1055 if (priv->np_qfirst == NETMAP_SW_RING) { 1056 if (priv->np_txpoll || want_tx) { 1057 /* push any packets up, then we are always ready */ 1058 kring = &na->tx_rings[lim_tx]; 1059 netmap_sync_to_host(na); 1060 revents |= want_tx; 1061 } 1062 if (want_rx) { 1063 kring = &na->rx_rings[lim_rx]; 1064 if (kring->ring->avail == 0) 1065 netmap_sync_from_host(na, td, dev); 1066 if (kring->ring->avail > 0) { 1067 revents |= want_rx; 1068 } 1069 } 1070 return (revents); 1071 } 1072 1073 /* 1074 * check_all is set if the card has more than one queue and 1075 * the client is polling all of them. If true, we sleep on 1076 * the "global" selfd, otherwise we sleep on individual selfd 1077 * (we can only sleep on one of them per direction). 1078 * The interrupt routine in the driver should always wake on 1079 * the individual selfd, and also on the global one if the card 1080 * has more than one ring. 1081 * 1082 * If the card has only one lock, we just use that. 1083 * If the card has separate ring locks, we just use those 1084 * unless we are doing check_all, in which case the whole 1085 * loop is wrapped by the global lock. 1086 * We acquire locks only when necessary: if poll is called 1087 * when buffers are available, we can just return without locks. 1088 * 1089 * rxsync() is only called if we run out of buffers on a POLLIN. 1090 * txsync() is called if we run out of buffers on POLLOUT, or 1091 * there are pending packets to send. The latter can be disabled 1092 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 1093 */ 1094 check_all = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1 || lim_rx > 1); 1095 1096 /* 1097 * core_lock indicates what to do with the core lock. 1098 * The core lock is used when either the card has no individual 1099 * locks, or it has individual locks but we are cheking all 1100 * rings so we need the core lock to avoid missing wakeup events. 1101 * 1102 * It has three possible states: 1103 * NO_CL we don't need to use the core lock, e.g. 1104 * because we are protected by individual locks. 1105 * NEED_CL we need the core lock. In this case, when we 1106 * call the lock routine, move to LOCKED_CL 1107 * to remember to release the lock once done. 1108 * LOCKED_CL core lock is set, so we need to release it. 1109 */ 1110 core_lock = (check_all || !na->separate_locks) ? 
NEED_CL : NO_CL; 1111 #ifdef NM_BRIDGE 1112 /* the bridge uses separate locks */ 1113 if (na->nm_register == bdg_netmap_reg) { 1114 ND("not using core lock for %s", ifp->if_xname); 1115 core_lock = NO_CL; 1116 } 1117 #endif /* NM_BRIDGE */ 1118 if (priv->np_qlast != NETMAP_HW_RING) { 1119 lim_tx = lim_rx = priv->np_qlast; 1120 } 1121 1122 /* 1123 * We start with a lock free round which is good if we have 1124 * data available. If this fails, then lock and call the sync 1125 * routines. 1126 */ 1127 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 1128 kring = &na->rx_rings[i]; 1129 if (kring->ring->avail > 0) { 1130 revents |= want_rx; 1131 want_rx = 0; /* also breaks the loop */ 1132 } 1133 } 1134 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 1135 kring = &na->tx_rings[i]; 1136 if (kring->ring->avail > 0) { 1137 revents |= want_tx; 1138 want_tx = 0; /* also breaks the loop */ 1139 } 1140 } 1141 1142 /* 1143 * If we to push packets out (priv->np_txpoll) or want_tx is 1144 * still set, we do need to run the txsync calls (on all rings, 1145 * to avoid that the tx rings stall). 1146 */ 1147 if (priv->np_txpoll || want_tx) { 1148 for (i = priv->np_qfirst; i < lim_tx; i++) { 1149 kring = &na->tx_rings[i]; 1150 /* 1151 * Skip the current ring if want_tx == 0 1152 * (we have already done a successful sync on 1153 * a previous ring) AND kring->cur == kring->hwcur 1154 * (there are no pending transmissions for this ring). 1155 */ 1156 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 1157 continue; 1158 if (core_lock == NEED_CL) { 1159 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); 1160 core_lock = LOCKED_CL; 1161 } 1162 if (na->separate_locks) 1163 na->nm_lock(ifp, NETMAP_TX_LOCK, i); 1164 if (netmap_verbose & NM_VERB_TXSYNC) 1165 D("send %d on %s %d", 1166 kring->ring->cur, 1167 ifp->if_xname, i); 1168 if (na->nm_txsync(ifp, i, 0 /* no lock */)) 1169 revents |= POLLERR; 1170 1171 /* Check avail/call selrecord only if called with POLLOUT */ 1172 if (want_tx) { 1173 if (kring->ring->avail > 0) { 1174 /* stop at the first ring. We don't risk 1175 * starvation. 1176 */ 1177 revents |= want_tx; 1178 want_tx = 0; 1179 } else if (!check_all) 1180 selrecord(td, &kring->si); 1181 } 1182 if (na->separate_locks) 1183 na->nm_lock(ifp, NETMAP_TX_UNLOCK, i); 1184 } 1185 } 1186 1187 /* 1188 * now if want_rx is still set we need to lock and rxsync. 1189 * Do it on all rings because otherwise we starve. 1190 */ 1191 if (want_rx) { 1192 for (i = priv->np_qfirst; i < lim_rx; i++) { 1193 kring = &na->rx_rings[i]; 1194 if (core_lock == NEED_CL) { 1195 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); 1196 core_lock = LOCKED_CL; 1197 } 1198 if (na->separate_locks) 1199 na->nm_lock(ifp, NETMAP_RX_LOCK, i); 1200 1201 if (na->nm_rxsync(ifp, i, 0 /* no lock */)) 1202 revents |= POLLERR; 1203 if (netmap_no_timestamp == 0 || 1204 kring->ring->flags & NR_TIMESTAMP) { 1205 microtime(&kring->ring->ts); 1206 } 1207 1208 if (kring->ring->avail > 0) 1209 revents |= want_rx; 1210 else if (!check_all) 1211 selrecord(td, &kring->si); 1212 if (na->separate_locks) 1213 na->nm_lock(ifp, NETMAP_RX_UNLOCK, i); 1214 } 1215 } 1216 if (check_all && revents == 0) { /* signal on the global queue */ 1217 if (want_tx) 1218 selrecord(td, &na->tx_si); 1219 if (want_rx) 1220 selrecord(td, &na->rx_si); 1221 } 1222 if (core_lock == LOCKED_CL) 1223 na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); 1224 1225 return (revents); 1226 } 1227 1228 /*------- driver support routines ------*/ 1229 1230 /* 1231 * default lock wrapper. 
1232 */ 1233 static void 1234 netmap_lock_wrapper(struct ifnet *dev, int what, u_int queueid) 1235 { 1236 struct netmap_adapter *na = NA(dev); 1237 1238 switch (what) { 1239 #ifdef linux /* some system do not need lock on register */ 1240 case NETMAP_REG_LOCK: 1241 case NETMAP_REG_UNLOCK: 1242 break; 1243 #endif /* linux */ 1244 1245 case NETMAP_CORE_LOCK: 1246 mtx_lock(&na->core_lock); 1247 break; 1248 1249 case NETMAP_CORE_UNLOCK: 1250 mtx_unlock(&na->core_lock); 1251 break; 1252 1253 case NETMAP_TX_LOCK: 1254 mtx_lock(&na->tx_rings[queueid].q_lock); 1255 break; 1256 1257 case NETMAP_TX_UNLOCK: 1258 mtx_unlock(&na->tx_rings[queueid].q_lock); 1259 break; 1260 1261 case NETMAP_RX_LOCK: 1262 mtx_lock(&na->rx_rings[queueid].q_lock); 1263 break; 1264 1265 case NETMAP_RX_UNLOCK: 1266 mtx_unlock(&na->rx_rings[queueid].q_lock); 1267 break; 1268 } 1269 } 1270 1271 1272 /* 1273 * Initialize a ``netmap_adapter`` object created by driver on attach. 1274 * We allocate a block of memory with room for a struct netmap_adapter 1275 * plus two sets of N+2 struct netmap_kring (where N is the number 1276 * of hardware rings): 1277 * krings 0..N-1 are for the hardware queues. 1278 * kring N is for the host stack queue 1279 * kring N+1 is only used for the selinfo for all queues. 1280 * Return 0 on success, ENOMEM otherwise. 1281 * 1282 * na->num_tx_rings can be set for cards with different tx/rx setups 1283 */ 1284 int 1285 netmap_attach(struct netmap_adapter *na, int num_queues) 1286 { 1287 int n, size; 1288 void *buf; 1289 struct ifnet *ifp = na->ifp; 1290 1291 if (ifp == NULL) { 1292 D("ifp not set, giving up"); 1293 return EINVAL; 1294 } 1295 /* clear other fields ? */ 1296 na->refcount = 0; 1297 if (na->num_tx_rings == 0) 1298 na->num_tx_rings = num_queues; 1299 na->num_rx_rings = num_queues; 1300 /* on each direction we have N+1 resources 1301 * 0..n-1 are the hardware rings 1302 * n is the ring attached to the stack. 1303 */ 1304 n = na->num_rx_rings + na->num_tx_rings + 2; 1305 size = sizeof(*na) + n * sizeof(struct netmap_kring); 1306 1307 buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); 1308 if (buf) { 1309 WNA(ifp) = buf; 1310 na->tx_rings = (void *)((char *)buf + sizeof(*na)); 1311 na->rx_rings = na->tx_rings + na->num_tx_rings + 1; 1312 bcopy(na, buf, sizeof(*na)); 1313 ifp->if_capabilities |= IFCAP_NETMAP; 1314 1315 na = buf; 1316 if (na->nm_lock == NULL) { 1317 ND("using default locks for %s", ifp->if_xname); 1318 na->nm_lock = netmap_lock_wrapper; 1319 /* core lock initialized here. 1320 * others initialized after netmap_if_new 1321 */ 1322 mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); 1323 } 1324 } 1325 #ifdef linux 1326 if (ifp->netdev_ops) { 1327 D("netdev_ops %p", ifp->netdev_ops); 1328 /* prepare a clone of the netdev ops */ 1329 na->nm_ndo = *ifp->netdev_ops; 1330 } 1331 na->nm_ndo.ndo_start_xmit = netmap_start_linux; 1332 #endif 1333 D("%s for %s", buf ? "ok" : "failed", ifp->if_xname); 1334 1335 return (buf ? 0 : ENOMEM); 1336 } 1337 1338 1339 /* 1340 * Free the allocated memory linked to the given ``netmap_adapter`` 1341 * object. 
1342 */ 1343 void 1344 netmap_detach(struct ifnet *ifp) 1345 { 1346 u_int i; 1347 struct netmap_adapter *na = NA(ifp); 1348 1349 if (!na) 1350 return; 1351 1352 for (i = 0; i < na->num_tx_rings + 1; i++) { 1353 knlist_destroy(&na->tx_rings[i].si.si_note); 1354 mtx_destroy(&na->tx_rings[i].q_lock); 1355 } 1356 for (i = 0; i < na->num_rx_rings + 1; i++) { 1357 knlist_destroy(&na->rx_rings[i].si.si_note); 1358 mtx_destroy(&na->rx_rings[i].q_lock); 1359 } 1360 knlist_destroy(&na->tx_si.si_note); 1361 knlist_destroy(&na->rx_si.si_note); 1362 bzero(na, sizeof(*na)); 1363 WNA(ifp) = NULL; 1364 free(na, M_DEVBUF); 1365 } 1366 1367 1368 /* 1369 * Intercept packets from the network stack and pass them 1370 * to netmap as incoming packets on the 'software' ring. 1371 * We are not locked when called. 1372 */ 1373 int 1374 netmap_start(struct ifnet *ifp, struct mbuf *m) 1375 { 1376 struct netmap_adapter *na = NA(ifp); 1377 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1378 u_int i, len = MBUF_LEN(m); 1379 int error = EBUSY, lim = kring->nkr_num_slots - 1; 1380 struct netmap_slot *slot; 1381 1382 if (netmap_verbose & NM_VERB_HOST) 1383 D("%s packet %d len %d from the stack", ifp->if_xname, 1384 kring->nr_hwcur + kring->nr_hwavail, len); 1385 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); 1386 if (kring->nr_hwavail >= lim) { 1387 if (netmap_verbose) 1388 D("stack ring %s full\n", ifp->if_xname); 1389 goto done; /* no space */ 1390 } 1391 if (len > NETMAP_BUF_SIZE) { 1392 D("drop packet size %d > %d", len, NETMAP_BUF_SIZE); 1393 goto done; /* too long for us */ 1394 } 1395 1396 /* compute the insert position */ 1397 i = kring->nr_hwcur + kring->nr_hwavail; 1398 if (i > lim) 1399 i -= lim + 1; 1400 slot = &kring->ring->slot[i]; 1401 m_copydata(m, 0, len, NMB(slot)); 1402 slot->len = len; 1403 kring->nr_hwavail++; 1404 if (netmap_verbose & NM_VERB_HOST) 1405 D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); 1406 selwakeuppri(&kring->si, PI_NET); 1407 error = 0; 1408 done: 1409 na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); 1410 1411 /* release the mbuf in either cases of success or failure. As an 1412 * alternative, put the mbuf in a free list and free the list 1413 * only when really necessary. 1414 */ 1415 m_freem(m); 1416 1417 return (error); 1418 } 1419 1420 1421 /* 1422 * netmap_reset() is called by the driver routines when reinitializing 1423 * a ring. The driver is in charge of locking to protect the kring. 1424 * If netmap mode is not set just return NULL. 1425 */ 1426 struct netmap_slot * 1427 netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, 1428 u_int new_cur) 1429 { 1430 struct netmap_kring *kring; 1431 int new_hwofs, lim; 1432 1433 if (na == NULL) 1434 return NULL; /* no netmap support here */ 1435 if (!(na->ifp->if_capenable & IFCAP_NETMAP)) 1436 return NULL; /* nothing to reinitialize */ 1437 1438 if (tx == NR_TX) { 1439 kring = na->tx_rings + n; 1440 new_hwofs = kring->nr_hwcur - new_cur; 1441 } else { 1442 kring = na->rx_rings + n; 1443 new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; 1444 } 1445 lim = kring->nkr_num_slots - 1; 1446 if (new_hwofs > lim) 1447 new_hwofs -= lim + 1; 1448 1449 /* Alwayws set the new offset value and realign the ring. */ 1450 kring->nkr_hwofs = new_hwofs; 1451 if (tx == NR_TX) 1452 kring->nr_hwavail = kring->nkr_num_slots - 1; 1453 D("new hwofs %d on %s %s[%d]", 1454 kring->nkr_hwofs, na->ifp->if_xname, 1455 tx == NR_TX ? 
"TX" : "RX", n); 1456 1457 #if 0 // def linux 1458 /* XXX check that the mappings are correct */ 1459 /* need ring_nr, adapter->pdev, direction */ 1460 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 1461 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 1462 D("error mapping rx netmap buffer %d", i); 1463 // XXX fix error handling 1464 } 1465 1466 #endif /* linux */ 1467 /* 1468 * Wakeup on the individual and global lock 1469 * We do the wakeup here, but the ring is not yet reconfigured. 1470 * However, we are under lock so there are no races. 1471 */ 1472 selwakeuppri(&kring->si, PI_NET); 1473 selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET); 1474 return kring->ring->slot; 1475 } 1476 1477 1478 /* 1479 * Default functions to handle rx/tx interrupts 1480 * we have 4 cases: 1481 * 1 ring, single lock: 1482 * lock(core); wake(i=0); unlock(core) 1483 * N rings, single lock: 1484 * lock(core); wake(i); wake(N+1) unlock(core) 1485 * 1 ring, separate locks: (i=0) 1486 * lock(i); wake(i); unlock(i) 1487 * N rings, separate locks: 1488 * lock(i); wake(i); unlock(i); lock(core) wake(N+1) unlock(core) 1489 * work_done is non-null on the RX path. 1490 */ 1491 int 1492 netmap_rx_irq(struct ifnet *ifp, int q, int *work_done) 1493 { 1494 struct netmap_adapter *na; 1495 struct netmap_kring *r; 1496 NM_SELINFO_T *main_wq; 1497 1498 if (!(ifp->if_capenable & IFCAP_NETMAP)) 1499 return 0; 1500 na = NA(ifp); 1501 if (work_done) { /* RX path */ 1502 r = na->rx_rings + q; 1503 r->nr_kflags |= NKR_PENDINTR; 1504 main_wq = (na->num_rx_rings > 1) ? &na->rx_si : NULL; 1505 } else { /* tx path */ 1506 r = na->tx_rings + q; 1507 main_wq = (na->num_tx_rings > 1) ? &na->tx_si : NULL; 1508 work_done = &q; /* dummy */ 1509 } 1510 if (na->separate_locks) { 1511 mtx_lock(&r->q_lock); 1512 selwakeuppri(&r->si, PI_NET); 1513 mtx_unlock(&r->q_lock); 1514 if (main_wq) { 1515 mtx_lock(&na->core_lock); 1516 selwakeuppri(main_wq, PI_NET); 1517 mtx_unlock(&na->core_lock); 1518 } 1519 } else { 1520 mtx_lock(&na->core_lock); 1521 selwakeuppri(&r->si, PI_NET); 1522 if (main_wq) 1523 selwakeuppri(main_wq, PI_NET); 1524 mtx_unlock(&na->core_lock); 1525 } 1526 *work_done = 1; /* do not fire napi again */ 1527 return 1; 1528 } 1529 1530 1531 #ifdef linux /* linux-specific routines */ 1532 1533 /* 1534 * Remap linux arguments into the FreeBSD call. 1535 * - pwait is the poll table, passed as 'dev'; 1536 * If pwait == NULL someone else already woke up before. We can report 1537 * events but they are filtered upstream. 1538 * If pwait != NULL, then pwait->key contains the list of events. 1539 * - events is computed from pwait as above. 1540 * - file is passed as 'td'; 1541 */ 1542 static u_int 1543 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) 1544 { 1545 #if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) 1546 int events = pwait ? pwait->key : POLLIN | POLLOUT; 1547 #else /* in 3.4.0 field 'key' was renamed to '_key' */ 1548 int events = pwait ? 
pwait->_key : POLLIN | POLLOUT; 1549 #endif 1550 return netmap_poll((void *)pwait, events, (void *)file); 1551 } 1552 1553 static int 1554 netmap_mmap(__unused struct file *f, struct vm_area_struct *vma) 1555 { 1556 int lut_skip, i, j; 1557 int user_skip = 0; 1558 struct lut_entry *l_entry; 1559 const struct netmap_obj_pool *p[] = { 1560 nm_mem->nm_if_pool, 1561 nm_mem->nm_ring_pool, 1562 nm_mem->nm_buf_pool }; 1563 /* 1564 * vma->vm_start: start of mapping user address space 1565 * vma->vm_end: end of the mapping user address space 1566 */ 1567 1568 // XXX security checks 1569 1570 for (i = 0; i < 3; i++) { /* loop through obj_pools */ 1571 /* 1572 * In each pool memory is allocated in clusters 1573 * of size _clustsize , each containing clustentries 1574 * entries. For each object k we already store the 1575 * vtophys malling in lut[k] so we use that, scanning 1576 * the lut[] array in steps of clustentries, 1577 * and we map each cluster (not individual pages, 1578 * it would be overkill). 1579 */ 1580 for (lut_skip = 0, j = 0; j < p[i]->_numclusters; j++) { 1581 l_entry = &p[i]->lut[lut_skip]; 1582 if (remap_pfn_range(vma, vma->vm_start + user_skip, 1583 l_entry->paddr >> PAGE_SHIFT, p[i]->_clustsize, 1584 vma->vm_page_prot)) 1585 return -EAGAIN; // XXX check return value 1586 lut_skip += p[i]->clustentries; 1587 user_skip += p[i]->_clustsize; 1588 } 1589 } 1590 1591 return 0; 1592 } 1593 1594 static netdev_tx_t 1595 netmap_start_linux(struct sk_buff *skb, struct net_device *dev) 1596 { 1597 netmap_start(dev, skb); 1598 return (NETDEV_TX_OK); 1599 } 1600 1601 1602 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38) 1603 #define LIN_IOCTL_NAME .ioctl 1604 int 1605 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) 1606 #else 1607 #define LIN_IOCTL_NAME .unlocked_ioctl 1608 long 1609 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) 1610 #endif 1611 { 1612 int ret; 1613 struct nmreq nmr; 1614 bzero(&nmr, sizeof(nmr)); 1615 1616 if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) 1617 return -EFAULT; 1618 ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); 1619 if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) 1620 return -EFAULT; 1621 return -ret; 1622 } 1623 1624 1625 static int 1626 netmap_release(__unused struct inode *inode, struct file *file) 1627 { 1628 if (file->private_data) 1629 netmap_dtor(file->private_data); 1630 return (0); 1631 } 1632 1633 1634 static struct file_operations netmap_fops = { 1635 .mmap = netmap_mmap, 1636 LIN_IOCTL_NAME = linux_netmap_ioctl, 1637 .poll = linux_netmap_poll, 1638 .release = netmap_release, 1639 }; 1640 1641 static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ 1642 MISC_DYNAMIC_MINOR, 1643 "netmap", 1644 &netmap_fops, 1645 }; 1646 1647 static int netmap_init(void); 1648 static void netmap_fini(void); 1649 1650 module_init(netmap_init); 1651 module_exit(netmap_fini); 1652 /* export certain symbols to other modules */ 1653 EXPORT_SYMBOL(netmap_attach); // driver attach routines 1654 EXPORT_SYMBOL(netmap_detach); // driver detach routines 1655 EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error 1656 EXPORT_SYMBOL(netmap_buffer_lut); 1657 EXPORT_SYMBOL(netmap_total_buffers); // index check 1658 EXPORT_SYMBOL(netmap_buffer_base); 1659 EXPORT_SYMBOL(netmap_reset); // ring init routines 1660 EXPORT_SYMBOL(netmap_buf_size); 1661 EXPORT_SYMBOL(netmap_rx_irq); // default irq handler 1662 EXPORT_SYMBOL(netmap_no_pendintr); // XXX 
mitigation - should go away 1663 1664 1665 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); 1666 MODULE_DESCRIPTION("The netmap packet I/O framework"); 1667 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ 1668 1669 #else /* __FreeBSD__ */ 1670 1671 static struct cdevsw netmap_cdevsw = { 1672 .d_version = D_VERSION, 1673 .d_name = "netmap", 1674 .d_mmap = netmap_mmap, 1675 .d_ioctl = netmap_ioctl, 1676 .d_poll = netmap_poll, 1677 }; 1678 #endif /* __FreeBSD__ */ 1679 1680 #ifdef NM_BRIDGE 1681 /* 1682 *---- support for virtual bridge ----- 1683 */ 1684 1685 /* ----- FreeBSD if_bridge hash function ------- */ 1686 1687 /* 1688 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 1689 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 1690 * 1691 * http://www.burtleburtle.net/bob/hash/spooky.html 1692 */ 1693 #define mix(a, b, c) \ 1694 do { \ 1695 a -= b; a -= c; a ^= (c >> 13); \ 1696 b -= c; b -= a; b ^= (a << 8); \ 1697 c -= a; c -= b; c ^= (b >> 13); \ 1698 a -= b; a -= c; a ^= (c >> 12); \ 1699 b -= c; b -= a; b ^= (a << 16); \ 1700 c -= a; c -= b; c ^= (b >> 5); \ 1701 a -= b; a -= c; a ^= (c >> 3); \ 1702 b -= c; b -= a; b ^= (a << 10); \ 1703 c -= a; c -= b; c ^= (b >> 15); \ 1704 } while (/*CONSTCOND*/0) 1705 1706 static __inline uint32_t 1707 nm_bridge_rthash(const uint8_t *addr) 1708 { 1709 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 1710 1711 b += addr[5] << 8; 1712 b += addr[4]; 1713 a += addr[3] << 24; 1714 a += addr[2] << 16; 1715 a += addr[1] << 8; 1716 a += addr[0]; 1717 1718 mix(a, b, c); 1719 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 1720 return (c & BRIDGE_RTHASH_MASK); 1721 } 1722 1723 #undef mix 1724 1725 1726 static int 1727 bdg_netmap_reg(struct ifnet *ifp, int onoff) 1728 { 1729 int i, err = 0; 1730 struct nm_bridge *b = ifp->if_bridge; 1731 1732 BDG_LOCK(b); 1733 if (onoff) { 1734 /* the interface must be already in the list. 
1735 * only need to mark the port as active 1736 */ 1737 ND("should attach %s to the bridge", ifp->if_xname); 1738 for (i=0; i < NM_BDG_MAXPORTS; i++) 1739 if (b->bdg_ports[i] == ifp) 1740 break; 1741 if (i == NM_BDG_MAXPORTS) { 1742 D("no more ports available"); 1743 err = EINVAL; 1744 goto done; 1745 } 1746 ND("setting %s in netmap mode", ifp->if_xname); 1747 ifp->if_capenable |= IFCAP_NETMAP; 1748 NA(ifp)->bdg_port = i; 1749 b->act_ports |= (1<<i); 1750 b->bdg_ports[i] = ifp; 1751 } else { 1752 /* should be in the list, too -- remove from the mask */ 1753 ND("removing %s from netmap mode", ifp->if_xname); 1754 ifp->if_capenable &= ~IFCAP_NETMAP; 1755 i = NA(ifp)->bdg_port; 1756 b->act_ports &= ~(1<<i); 1757 } 1758 done: 1759 BDG_UNLOCK(b); 1760 return err; 1761 } 1762 1763 1764 static int 1765 nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct ifnet *ifp) 1766 { 1767 int i, ifn; 1768 uint64_t all_dst, dst; 1769 uint32_t sh, dh; 1770 uint64_t mysrc = 1 << NA(ifp)->bdg_port; 1771 uint64_t smac, dmac; 1772 struct netmap_slot *slot; 1773 struct nm_bridge *b = ifp->if_bridge; 1774 1775 ND("prepare to send %d packets, act_ports 0x%x", n, b->act_ports); 1776 /* only consider valid destinations */ 1777 all_dst = (b->act_ports & ~mysrc); 1778 /* first pass: hash and find destinations */ 1779 for (i = 0; likely(i < n); i++) { 1780 uint8_t *buf = ft[i].buf; 1781 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 1782 smac = le64toh(*(uint64_t *)(buf + 4)); 1783 smac >>= 16; 1784 if (unlikely(netmap_verbose)) { 1785 uint8_t *s = buf+6, *d = buf; 1786 D("%d len %4d %02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x", 1787 i, 1788 ft[i].len, 1789 s[0], s[1], s[2], s[3], s[4], s[5], 1790 d[0], d[1], d[2], d[3], d[4], d[5]); 1791 } 1792 /* 1793 * The hash is somewhat expensive, there might be some 1794 * worthwhile optimizations here. 1795 */ 1796 if ((buf[6] & 1) == 0) { /* valid src */ 1797 uint8_t *s = buf+6; 1798 sh = nm_bridge_rthash(buf+6); // XXX hash of source 1799 /* update source port forwarding entry */ 1800 b->ht[sh].mac = smac; /* XXX expire ? 
*/ 1801 b->ht[sh].ports = mysrc; 1802 if (netmap_verbose) 1803 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 1804 s[0], s[1], s[2], s[3], s[4], s[5], NA(ifp)->bdg_port); 1805 } 1806 dst = 0; 1807 if ( (buf[0] & 1) == 0) { /* unicast */ 1808 uint8_t *d = buf; 1809 dh = nm_bridge_rthash(buf); // XXX hash of dst 1810 if (b->ht[dh].mac == dmac) { /* found dst */ 1811 dst = b->ht[dh].ports; 1812 if (netmap_verbose) 1813 D("dst %02x:%02x:%02x:%02x:%02x:%02x to port %x", 1814 d[0], d[1], d[2], d[3], d[4], d[5], (uint32_t)(dst >> 16)); 1815 } 1816 } 1817 if (dst == 0) 1818 dst = all_dst; 1819 dst &= all_dst; /* only consider valid ports */ 1820 if (unlikely(netmap_verbose)) 1821 D("pkt goes to ports 0x%x", (uint32_t)dst); 1822 ft[i].dst = dst; 1823 } 1824 1825 /* second pass, scan interfaces and forward */ 1826 all_dst = (b->act_ports & ~mysrc); 1827 for (ifn = 0; all_dst; ifn++) { 1828 struct ifnet *dst_ifp = b->bdg_ports[ifn]; 1829 struct netmap_adapter *na; 1830 struct netmap_kring *kring; 1831 struct netmap_ring *ring; 1832 int j, lim, sent, locked; 1833 1834 if (!dst_ifp) 1835 continue; 1836 ND("scan port %d %s", ifn, dst_ifp->if_xname); 1837 dst = 1 << ifn; 1838 if ((dst & all_dst) == 0) /* skip if not set */ 1839 continue; 1840 all_dst &= ~dst; /* clear current node */ 1841 na = NA(dst_ifp); 1842 1843 ring = NULL; 1844 kring = NULL; 1845 lim = sent = locked = 0; 1846 /* inside, scan slots */ 1847 for (i = 0; likely(i < n); i++) { 1848 if ((ft[i].dst & dst) == 0) 1849 continue; /* not here */ 1850 if (!locked) { 1851 kring = &na->rx_rings[0]; 1852 ring = kring->ring; 1853 lim = kring->nkr_num_slots - 1; 1854 na->nm_lock(dst_ifp, NETMAP_RX_LOCK, 0); 1855 locked = 1; 1856 } 1857 if (unlikely(kring->nr_hwavail >= lim)) { 1858 if (netmap_verbose) 1859 D("rx ring full on %s", ifp->if_xname); 1860 break; 1861 } 1862 j = kring->nr_hwcur + kring->nr_hwavail; 1863 if (j > lim) 1864 j -= kring->nkr_num_slots; 1865 slot = &ring->slot[j]; 1866 ND("send %d %d bytes at %s:%d", i, ft[i].len, dst_ifp->if_xname, j); 1867 pkt_copy(ft[i].buf, NMB(slot), ft[i].len); 1868 slot->len = ft[i].len; 1869 kring->nr_hwavail++; 1870 sent++; 1871 } 1872 if (locked) { 1873 ND("sent %d on %s", sent, dst_ifp->if_xname); 1874 if (sent) 1875 selwakeuppri(&kring->si, PI_NET); 1876 na->nm_lock(dst_ifp, NETMAP_RX_UNLOCK, 0); 1877 } 1878 } 1879 return 0; 1880 } 1881 1882 /* 1883 * main dispatch routine 1884 */ 1885 static int 1886 bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) 1887 { 1888 struct netmap_adapter *na = NA(ifp); 1889 struct netmap_kring *kring = &na->tx_rings[ring_nr]; 1890 struct netmap_ring *ring = kring->ring; 1891 int i, j, k, lim = kring->nkr_num_slots - 1; 1892 struct nm_bdg_fwd *ft = (struct nm_bdg_fwd *)(ifp + 1); 1893 int ft_i; /* position in the forwarding table */ 1894 1895 k = ring->cur; 1896 if (k > lim) 1897 return netmap_ring_reinit(kring); 1898 if (do_lock) 1899 na->nm_lock(ifp, NETMAP_TX_LOCK, ring_nr); 1900 1901 if (netmap_bridge <= 0) { /* testing only */ 1902 j = k; // used all 1903 goto done; 1904 } 1905 if (netmap_bridge > NM_BDG_BATCH) 1906 netmap_bridge = NM_BDG_BATCH; 1907 1908 ft_i = 0; /* start from 0 */ 1909 for (j = kring->nr_hwcur; likely(j != k); j = unlikely(j == lim) ? 
static int
bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	int j, n, lim = kring->nkr_num_slots - 1;
	u_int k = ring->cur, resvd = ring->reserved;

	ND("%s ring %d lock %d avail %d",
		ifp->if_xname, ring_nr, do_lock, kring->nr_hwavail);

	if (k > lim)
		return netmap_ring_reinit(kring);
	if (do_lock)
		na->nm_lock(ifp, NETMAP_RX_LOCK, ring_nr);

	/* skip past packets that userspace has released */
	j = kring->nr_hwcur;	/* netmap ring index */
	if (resvd > 0) {
		if (resvd + ring->avail >= lim + 1) {
			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
			ring->reserved = resvd = 0; // XXX panic...
		}
		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
	}

	if (j != k) { /* userspace has released some packets. */
		n = k - j;
		if (n < 0)
			n += kring->nkr_num_slots;
		ND("userspace releases %d packets", n);
		for (n = 0; likely(j != k); n++) {
			struct netmap_slot *slot = &ring->slot[j];
			void *addr = NMB(slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				if (do_lock)
					na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr);
				return netmap_ring_reinit(kring);
			}
			/* decrease refcount for buffer */

			slot->flags &= ~NS_BUF_CHANGED;
			j = unlikely(j == lim) ? 0 : j + 1;
		}
		kring->nr_hwavail -= n;
		kring->nr_hwcur = k;
	}
	/* tell userspace that there are new packets */
	ring->avail = kring->nr_hwavail - resvd;

	if (do_lock)
		na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr);
	return 0;
}

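
/*
 * bdg_netmap_attach() fills in a netmap_adapter template for a bridge
 * port (NM_BRIDGE_RINGSIZE slots per ring, per-queue locks, and the
 * bdg_netmap_* callbacks above) and registers it with netmap_attach().
 */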
static void
bdg_netmap_attach(struct ifnet *ifp)
{
	struct netmap_adapter na;

	ND("attaching virtual bridge");
	bzero(&na, sizeof(na));

	na.ifp = ifp;
	na.separate_locks = 1;
	na.num_tx_desc = NM_BRIDGE_RINGSIZE;
	na.num_rx_desc = NM_BRIDGE_RINGSIZE;
	na.nm_txsync = bdg_netmap_txsync;
	na.nm_rxsync = bdg_netmap_rxsync;
	na.nm_register = bdg_netmap_reg;
	netmap_attach(&na, 1);
}

#endif /* NM_BRIDGE */

static struct cdev *netmap_dev; /* /dev/netmap character device. */


/*
 * Module loader.
 *
 * Create the /dev/netmap device and initialize all global
 * variables.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_init(void)
{
	int error;

	error = netmap_memory_init();
	if (error != 0) {
		printf("netmap: unable to initialize the memory allocator.\n");
		return (error);
	}
	printf("netmap: loaded module with %d Mbytes\n",
		(int)(nm_mem->nm_totalsize >> 20));
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
			      "netmap");

#ifdef NM_BRIDGE
	{
	/* initialize the locks protecting the bridge instances */
	int i;
	for (i = 0; i < NM_BRIDGES; i++)
		mtx_init(&nm_bridges[i].bdg_lock, "bdg lock", "bdg_lock", MTX_DEF);
	}
#endif
	return (error);
}


/*
 * Module unloader.
 *
 * Free all the memory, and destroy the ``/dev/netmap'' device.
 */
static void
netmap_fini(void)
{
	destroy_dev(netmap_dev);
	netmap_memory_fini();
	printf("netmap: unloaded module.\n");
}


#ifdef __FreeBSD__
/*
 * Kernel entry point.
 *
 * Initialize/finalize the module and return.
 *
 * Return 0 on success, errno on failure.
 */
static int
netmap_loader(__unused struct module *module, int event, __unused void *arg)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		error = netmap_init();
		break;

	case MOD_UNLOAD:
		netmap_fini();
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}


DEV_MODULE(netmap, netmap_loader, NULL);
#endif /* __FreeBSD__ */
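
/*
 * Usage sketch (illustrative, not part of the driver): on FreeBSD the
 * module built from this file is typically loaded with
 *
 *	kldload ./netmap.ko
 *
 * after which the console reports "netmap: loaded module with N Mbytes"
 * and /dev/netmap can be opened, mmap()ed and driven through the ioctls
 * handled earlier in this file.
 */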