/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016-2017 Intel Corporation
 */

#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_bus_vdev.h>
#include <rte_kvargs.h>
#include <rte_net.h>
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
#include <rte_ethdev.h>
#include <rte_errno.h>
#include <rte_cycles.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/mman.h>
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <linux/if_ether.h>
#include <fcntl.h>
#include <ctype.h>

#include <tap_rss.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_netlink.h>
#include <tap_tcmsgs.h>

/* Linux based path to the TUN device */
#define TUN_TAP_DEV_PATH "/dev/net/tun"
#define DEFAULT_TAP_NAME "dtap"
#define DEFAULT_TUN_NAME "dtun"

#define ETH_TAP_IFACE_ARG "iface"
#define ETH_TAP_REMOTE_ARG "remote"
#define ETH_TAP_MAC_ARG "mac"
#define ETH_TAP_MAC_FIXED "fixed"

#define ETH_TAP_USR_MAC_FMT "xx:xx:xx:xx:xx:xx"
#define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT

#define TAP_GSO_MBUFS_PER_CORE	128
#define TAP_GSO_MBUF_SEG_SIZE	128
#define TAP_GSO_MBUF_CACHE_SIZE	4
#define TAP_GSO_MBUFS_NUM \
	(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)

/* IPC key for queue fds sync */
#define TAP_MP_KEY "tap_mp_sync_queues"
#define TAP_MP_REQ_START_RXTX "tap_mp_req_start_rxtx"

#define TAP_IOV_DEFAULT_MAX 1024

#define TAP_RX_OFFLOAD (RTE_ETH_RX_OFFLOAD_SCATTER |	\
			RTE_ETH_RX_OFFLOAD_IPV4_CKSUM |	\
			RTE_ETH_RX_OFFLOAD_UDP_CKSUM |	\
			RTE_ETH_RX_OFFLOAD_TCP_CKSUM)

#define TAP_TX_OFFLOAD (RTE_ETH_TX_OFFLOAD_MULTI_SEGS |	\
			RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |	\
			RTE_ETH_TX_OFFLOAD_UDP_CKSUM |	\
			RTE_ETH_TX_OFFLOAD_TCP_CKSUM |	\
			RTE_ETH_TX_OFFLOAD_TCP_TSO)

static int tap_devices_count;

static const char *tuntap_types[ETH_TUNTAP_TYPE_MAX] = {
	"UNKNOWN", "TUN", "TAP"
};

static const char *valid_arguments[] = {
	ETH_TAP_IFACE_ARG,
	ETH_TAP_REMOTE_ARG,
	ETH_TAP_MAC_ARG,
	NULL
};

static volatile uint32_t tap_trigger;	/* Rx trigger */

static struct rte_eth_link pmd_link = {
	.link_speed = RTE_ETH_SPEED_NUM_10G,
	.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
	.link_status = RTE_ETH_LINK_DOWN,
	.link_autoneg = RTE_ETH_LINK_FIXED,
};

static void
tap_trigger_cb(int sig __rte_unused)
{
	/* Valid trigger values are nonzero */
	tap_trigger = (tap_trigger + 1) | 0x80000000;
}

/* Specifies on what netdevices the ioctl should be applied */
enum ioctl_mode {
	LOCAL_AND_REMOTE,
	LOCAL_ONLY,
	REMOTE_ONLY,
};

/* Message header to synchronize queues via IPC */
struct ipc_queues {
	char port_name[RTE_DEV_NAME_MAX_LEN];
	int rxq_count;
	int txq_count;
	/*
	 * The file descriptors are in the dedicated part
	 * of the Unix message to be translated by the kernel.
	 */
};

static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
 * Tun/Tap allocation routine
 *
 * @param[in] pmd
 *   Pointer to private structure.
 *
 * @param[in] is_keepalive
 *   Keepalive flag
 *
 * @return
 *   -1 on failure, fd on success
 */
static int
tun_alloc(struct pmd_internals *pmd, int is_keepalive)
{
	struct ifreq ifr;
#ifdef IFF_MULTI_QUEUE
	unsigned int features;
#endif
	int fd, signo, flags;

	memset(&ifr, 0, sizeof(struct ifreq));

	/*
	 * Do not set IFF_NO_PI as packet information header will be needed
	 * to check if a received packet has been truncated.
	 */
	ifr.ifr_flags = (pmd->type == ETH_TUNTAP_TYPE_TAP) ?
		IFF_TAP : IFF_TUN | IFF_POINTOPOINT;
	strlcpy(ifr.ifr_name, pmd->name, IFNAMSIZ);

	fd = open(TUN_TAP_DEV_PATH, O_RDWR);
	if (fd < 0) {
		TAP_LOG(ERR, "Unable to open %s interface", TUN_TAP_DEV_PATH);
		goto error;
	}

#ifdef IFF_MULTI_QUEUE
	/* Grab the TUN features to verify we can work multi-queue */
	if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
		TAP_LOG(ERR, "unable to get TUN/TAP features");
		goto error;
	}
	TAP_LOG(DEBUG, "%s Features %08x", TUN_TAP_DEV_PATH, features);

	if (features & IFF_MULTI_QUEUE) {
		TAP_LOG(DEBUG, " Multi-queue support for %d queues",
			RTE_PMD_TAP_MAX_QUEUES);
		ifr.ifr_flags |= IFF_MULTI_QUEUE;
	} else
#endif
	{
		ifr.ifr_flags |= IFF_ONE_QUEUE;
		TAP_LOG(DEBUG, " Single queue only support");
	}

	/* Set the TUN/TAP configuration and set the name if needed */
	if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
		TAP_LOG(WARNING, "Unable to set TUNSETIFF for %s: %s",
			ifr.ifr_name, strerror(errno));
		goto error;
	}

	/*
	 * Name passed to kernel might be a wildcard like dtun%d,
	 * and we need to find the resulting device.
	 */
	TAP_LOG(DEBUG, "Device name is '%s'", ifr.ifr_name);
	strlcpy(pmd->name, ifr.ifr_name, RTE_ETH_NAME_MAX_LEN);

	if (is_keepalive) {
		/*
		 * Detach the TUN/TAP keep-alive queue
		 * to avoid traffic through it
		 */
		ifr.ifr_flags = IFF_DETACH_QUEUE;
		if (ioctl(fd, TUNSETQUEUE, (void *)&ifr) < 0) {
			TAP_LOG(WARNING,
				"Unable to detach keep-alive queue for %s: %s",
				ifr.ifr_name, strerror(errno));
			goto error;
		}
	}

	flags = fcntl(fd, F_GETFL);
	if (flags == -1) {
		TAP_LOG(WARNING,
			"Unable to get %s current flags\n",
			ifr.ifr_name);
		goto error;
	}

	/* Always set the file descriptor to non-blocking */
	flags |= O_NONBLOCK;
	if (fcntl(fd, F_SETFL, flags) < 0) {
		TAP_LOG(WARNING,
			"Unable to set %s to nonblocking: %s",
			ifr.ifr_name, strerror(errno));
		goto error;
	}

	/* Find a free realtime signal */
	for (signo = SIGRTMIN + 1; signo < SIGRTMAX; signo++) {
		struct sigaction sa;

		if (sigaction(signo, NULL, &sa) == -1) {
			TAP_LOG(WARNING,
				"Unable to get current rt-signal %d handler",
				signo);
			goto error;
		}

		/* Already have the handler we want on this signal */
		if (sa.sa_handler == tap_trigger_cb)
			break;

		/* Is handler in use by application */
		if (sa.sa_handler != SIG_DFL) {
			TAP_LOG(DEBUG,
				"Skipping used rt-signal %d", signo);
			continue;
		}

		sa = (struct sigaction) {
			.sa_flags = SA_RESTART,
			.sa_handler = tap_trigger_cb,
		};

		if (sigaction(signo, &sa, NULL) == -1) {
			TAP_LOG(WARNING,
				"Unable to set rt-signal %d handler\n", signo);
			goto error;
		}

		/* Found a good signal to use */
		TAP_LOG(DEBUG,
			"Using rt-signal %d", signo);
		break;
	}

	if (signo == SIGRTMAX) {
		TAP_LOG(WARNING, "All rt-signals are in use\n");

		/* Disable trigger globally in case of error */
		tap_trigger = 0;
		TAP_LOG(NOTICE, "No Rx trigger signal available\n");
	} else {
		/* Enable signal on file descriptor */
		if (fcntl(fd, F_SETSIG, signo) < 0) {
			TAP_LOG(WARNING, "Unable to set signo %d for fd %d: %s",
				signo, fd, strerror(errno));
			goto error;
		}
		if (fcntl(fd, F_SETFL, flags | O_ASYNC) < 0) {
			TAP_LOG(WARNING, "Unable to set fcntl flags: %s",
				strerror(errno));
			goto error;
		}

		if (fcntl(fd, F_SETOWN, getpid()) < 0) {
			TAP_LOG(WARNING, "Unable to set fcntl owner: %s",
				strerror(errno));
			goto error;
		}
	}
	return fd;

error:
	if (fd >= 0)
		close(fd);
	return -1;
}

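/*
 * Verify the IPv4 and L4 (TCP/UDP) checksums of a received packet and set
 * the corresponding RTE_MBUF_F_RX_*_CKSUM_* flags in the mbuf.
 */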
static void
tap_verify_csum(struct rte_mbuf *mbuf)
{
	uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
	uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
	uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
	unsigned int l2_len = sizeof(struct rte_ether_hdr);
	unsigned int l3_len;
	uint16_t cksum = 0;
	void *l3_hdr;
	void *l4_hdr;
	struct rte_udp_hdr *udp_hdr;

	if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
		l2_len += 4;
	else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
		l2_len += 8;
	/* Don't verify checksum for packets with discontinuous L2 header */
	if (unlikely(l2_len + sizeof(struct rte_ipv4_hdr) >
		     rte_pktmbuf_data_len(mbuf)))
		return;
	l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
	if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
		struct rte_ipv4_hdr *iph = l3_hdr;

		l3_len = rte_ipv4_hdr_len(iph);
		if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
			return;
		/* check that the total length reported by header is not
		 * greater than the total received size
		 */
		if (l2_len + rte_be_to_cpu_16(iph->total_length) >
		    rte_pktmbuf_data_len(mbuf))
			return;

		cksum = ~rte_raw_cksum(iph, l3_len);
		mbuf->ol_flags |= cksum ?
			RTE_MBUF_F_RX_IP_CKSUM_BAD :
			RTE_MBUF_F_RX_IP_CKSUM_GOOD;
	} else if (l3 == RTE_PTYPE_L3_IPV6) {
		struct rte_ipv6_hdr *iph = l3_hdr;

		l3_len = sizeof(struct rte_ipv6_hdr);
		/* check that the total length reported by header is not
		 * greater than the total received size
		 */
		if (l2_len + l3_len + rte_be_to_cpu_16(iph->payload_len) >
		    rte_pktmbuf_data_len(mbuf))
			return;
	} else {
		/* - RTE_PTYPE_L3_IPV4_EXT_UNKNOWN cannot happen because
		 *   mbuf->packet_type is filled by rte_net_get_ptype() which
		 *   never returns this value.
		 * - IPv6 extensions are not supported.
		 */
		return;
	}
	if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
		int cksum_ok;

		l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
		/* Don't verify checksum for multi-segment packets. */
		if (mbuf->nb_segs > 1)
			return;
		if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
			if (l4 == RTE_PTYPE_L4_UDP) {
				udp_hdr = (struct rte_udp_hdr *)l4_hdr;
				if (udp_hdr->dgram_cksum == 0) {
					/*
					 * For IPv4, a zero UDP checksum
					 * indicates that the sender did not
					 * generate one [RFC 768].
					 */
					mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_NONE;
					return;
				}
			}
			cksum_ok = !rte_ipv4_udptcp_cksum_verify(l3_hdr,
								 l4_hdr);
		} else { /* l3 == RTE_PTYPE_L3_IPV6, checked above */
			cksum_ok = !rte_ipv6_udptcp_cksum_verify(l3_hdr,
								 l4_hdr);
		}
		mbuf->ol_flags |= cksum_ok ?
			RTE_MBUF_F_RX_L4_CKSUM_GOOD : RTE_MBUF_F_RX_L4_CKSUM_BAD;
	}
}

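/*
 * Free the pre-allocated mbuf chain used by an Rx queue as the readv()
 * scatter list, restoring nb_segs on the head mbuf before freeing the chain.
 */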
static void
tap_rxq_pool_free(struct rte_mbuf *pool)
{
	struct rte_mbuf *mbuf = pool;
	uint16_t nb_segs = 1;

	if (mbuf == NULL)
		return;

	while (mbuf->next) {
		mbuf = mbuf->next;
		nb_segs++;
	}
	pool->nb_segs = nb_segs;
	rte_pktmbuf_free(pool);
}

/* Callback to handle the rx burst of packets to the correct interface and
 * file descriptor(s) in a multi-queue setup.
 */
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct rx_queue *rxq = queue;
	struct pmd_process_private *process_private;
	uint16_t num_rx;
	unsigned long num_rx_bytes = 0;
	uint32_t trigger = tap_trigger;

	if (trigger == rxq->trigger_seen)
		return 0;

	process_private = rte_eth_devices[rxq->in_port].process_private;
	for (num_rx = 0; num_rx < nb_pkts; ) {
		struct rte_mbuf *mbuf = rxq->pool;
		struct rte_mbuf *seg = NULL;
		struct rte_mbuf *new_tail = NULL;
		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
		int len;

		len = readv(process_private->rxq_fds[rxq->queue_id],
			*rxq->iovecs,
			1 + (rxq->rxmode->offloads & RTE_ETH_RX_OFFLOAD_SCATTER ?
			     rxq->nb_rx_desc : 1));
		if (len < (int)sizeof(struct tun_pi))
			break;

		/* Packet couldn't fit in the provided mbuf */
		if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
			rxq->stats.ierrors++;
			continue;
		}

		len -= sizeof(struct tun_pi);

		mbuf->pkt_len = len;
		mbuf->port = rxq->in_port;
		while (1) {
			struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

			if (unlikely(!buf)) {
				rxq->stats.rx_nombuf++;
				/* No new buf has been allocated: do nothing */
				if (!new_tail || !seg)
					goto end;

				seg->next = NULL;
				tap_rxq_pool_free(mbuf);

				goto end;
			}
			seg = seg ? seg->next : mbuf;
			if (rxq->pool == mbuf)
				rxq->pool = buf;
			if (new_tail)
				new_tail->next = buf;
			new_tail = buf;
			new_tail->next = seg->next;

			/* iovecs[0] is reserved for packet info (pi) */
			(*rxq->iovecs)[mbuf->nb_segs].iov_len =
				buf->buf_len - data_off;
			(*rxq->iovecs)[mbuf->nb_segs].iov_base =
				(char *)buf->buf_addr + data_off;

			seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
			seg->data_off = data_off;

			len -= seg->data_len;
			if (len <= 0)
				break;
			mbuf->nb_segs++;
			/* First segment has headroom, not the others */
			data_off = 0;
		}
		seg->next = NULL;
		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
						      RTE_PTYPE_ALL_MASK);
		if (rxq->rxmode->offloads & RTE_ETH_RX_OFFLOAD_CHECKSUM)
			tap_verify_csum(mbuf);

		/* account for the receive frame */
		bufs[num_rx++] = mbuf;
		num_rx_bytes += mbuf->pkt_len;
	}
end:
	rxq->stats.ipackets += num_rx;
	rxq->stats.ibytes += num_rx_bytes;

	if (trigger && num_rx < nb_pkts)
		rxq->trigger_seen = trigger;

	return num_rx;
}

/* Finalize l4 checksum calculation */
static void
tap_tx_l4_cksum(uint16_t *l4_cksum, uint16_t l4_phdr_cksum,
		uint32_t l4_raw_cksum)
{
	if (l4_cksum) {
		uint32_t cksum;

		cksum = __rte_raw_cksum_reduce(l4_raw_cksum);
		cksum += l4_phdr_cksum;

		cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
		cksum = (~cksum) & 0xffff;
		if (cksum == 0)
			cksum = 0xffff;
		*l4_cksum = cksum;
	}
}

/* Accumulate L4 raw checksums */
static void
tap_tx_l4_add_rcksum(char *l4_data, unsigned int l4_len, uint16_t *l4_cksum,
		     uint32_t *l4_raw_cksum)
{
	if (l4_cksum == NULL)
		return;

	*l4_raw_cksum = __rte_raw_cksum(l4_data, l4_len, *l4_raw_cksum);
}

/* L3 and L4 pseudo headers checksum offloads */
static void
tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
		unsigned int l3_len, unsigned int l4_len, uint16_t **l4_cksum,
		uint16_t *l4_phdr_cksum, uint32_t *l4_raw_cksum)
{
	void *l3_hdr = packet + l2_len;

	if (ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_IPV4)) {
		struct rte_ipv4_hdr *iph = l3_hdr;
		uint16_t cksum;

		iph->hdr_checksum = 0;
		cksum = rte_raw_cksum(iph, l3_len);
		iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
	}
	if (ol_flags & RTE_MBUF_F_TX_L4_MASK) {
		void *l4_hdr;

		l4_hdr = packet + l2_len + l3_len;
		if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_UDP_CKSUM)
			*l4_cksum = &((struct rte_udp_hdr *)l4_hdr)->dgram_cksum;
		else if ((ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_TCP_CKSUM)
			*l4_cksum = &((struct rte_tcp_hdr *)l4_hdr)->cksum;
		else
			return;
		**l4_cksum = 0;
		if (ol_flags & RTE_MBUF_F_TX_IPV4)
			*l4_phdr_cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
		else
			*l4_phdr_cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
		*l4_raw_cksum = __rte_raw_cksum(l4_hdr, l4_len, 0);
	}
}

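/*
 * Build an iovec array (packet info header, possibly patched L2/L3/L4
 * headers, then the payload segments) for each mbuf and write it to the
 * queue file descriptor with writev().
 */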
static inline int
tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
		struct rte_mbuf **pmbufs,
		uint16_t *num_packets, unsigned long *num_tx_bytes)
{
	int i;
	uint16_t l234_hlen;
	struct pmd_process_private *process_private;

	process_private = rte_eth_devices[txq->out_port].process_private;

	for (i = 0; i < num_mbufs; i++) {
		struct rte_mbuf *mbuf = pmbufs[i];
		struct iovec iovecs[mbuf->nb_segs + 2];
		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
		struct rte_mbuf *seg = mbuf;
		char m_copy[mbuf->data_len];
		int proto;
		int n;
		int j;
		int k; /* current index in iovecs for copying segments */
		uint16_t seg_len; /* length of first segment */
		uint16_t nb_segs;
		uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
		uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
		uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
		uint16_t is_cksum = 0; /* in case cksum should be offloaded */

		l4_cksum = NULL;
		if (txq->type == ETH_TUNTAP_TYPE_TUN) {
			/*
			 * TUN and TAP are created with IFF_NO_PI disabled.
			 * For the TUN PMD this is mandatory, as the packet
			 * info fields are used by the kernel tun.c to
			 * determine whether the packet is IP or non-IP.
			 *
			 * The logic fetches the first byte of data from the
			 * mbuf and checks whether it is v4 or v6. If the
			 * first nibble is 4 or 6, the protocol field is
			 * updated accordingly.
			 */
			char *buff_data = rte_pktmbuf_mtod(seg, void *);
			proto = (*buff_data & 0xf0);
			pi.proto = (proto == 0x40) ?
				rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4) :
				((proto == 0x60) ?
					rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6) :
					0x00);
		}

		k = 0;
		iovecs[k].iov_base = &pi;
		iovecs[k].iov_len = sizeof(pi);
		k++;

		nb_segs = mbuf->nb_segs;
		if (txq->csum &&
		    ((mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_IPV4) ||
		     (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_UDP_CKSUM ||
		     (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_TCP_CKSUM))) {
			is_cksum = 1;

			/* Support only packets with at least layer 4
			 * header included in the first segment
			 */
			seg_len = rte_pktmbuf_data_len(mbuf);
			l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
			if (seg_len < l234_hlen)
				return -1;

			/* To change checksums, work on a
			 * copy of l2, l3 headers + l4 pseudo header
			 */
			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
				   l234_hlen);
			tap_tx_l3_cksum(m_copy, mbuf->ol_flags,
					mbuf->l2_len, mbuf->l3_len, mbuf->l4_len,
					&l4_cksum, &l4_phdr_cksum,
					&l4_raw_cksum);
			iovecs[k].iov_base = m_copy;
			iovecs[k].iov_len = l234_hlen;
			k++;

			/* Update next iovecs[] beyond l2, l3, l4 headers */
			if (seg_len > l234_hlen) {
				iovecs[k].iov_len = seg_len - l234_hlen;
				iovecs[k].iov_base =
					rte_pktmbuf_mtod(seg, char *) +
					l234_hlen;
				tap_tx_l4_add_rcksum(iovecs[k].iov_base,
					iovecs[k].iov_len, l4_cksum,
					&l4_raw_cksum);
				k++;
				nb_segs++;
			}
			seg = seg->next;
		}

		for (j = k; j <= nb_segs; j++) {
			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
			iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
			if (is_cksum)
				tap_tx_l4_add_rcksum(iovecs[j].iov_base,
					iovecs[j].iov_len, l4_cksum,
					&l4_raw_cksum);
			seg = seg->next;
		}

		if (is_cksum)
			tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);

		/* copy the tx frame data */
		n = writev(process_private->txq_fds[txq->queue_id], iovecs, j);
		if (n <= 0)
			return -1;

		(*num_packets)++;
		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
	}
	return 0;
}

/* Callback to handle sending packets from the tap interface
 */
static uint16_t
pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tx_queue *txq = queue;
	uint16_t num_tx = 0;
	uint16_t num_packets = 0;
	unsigned long num_tx_bytes = 0;
	uint32_t max_size;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
	max_size = *txq->mtu + (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + 4);
	for (i = 0; i < nb_pkts; i++) {
		struct rte_mbuf *mbuf_in = bufs[num_tx];
		struct rte_mbuf **mbuf;
		uint16_t num_mbufs = 0;
		uint16_t tso_segsz = 0;
		int ret;
		int num_tso_mbufs;
		uint16_t hdrs_len;
		uint64_t tso;

		tso = mbuf_in->ol_flags & RTE_MBUF_F_TX_TCP_SEG;
		if (tso) {
			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;

			/* TCP segmentation implies TCP checksum offload */
			mbuf_in->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;

			/* gso size is calculated without RTE_ETHER_CRC_LEN */
			hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
					mbuf_in->l4_len;
			tso_segsz = mbuf_in->tso_segsz + hdrs_len;
			if (unlikely(tso_segsz == hdrs_len) ||
				tso_segsz > *txq->mtu) {
				txq->stats.errs++;
				break;
			}
			gso_ctx->gso_size = tso_segsz;
			/* 'mbuf_in' packet to segment */
			num_tso_mbufs = rte_gso_segment(mbuf_in,
				gso_ctx, /* gso control block */
				(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
				RTE_DIM(gso_mbufs)); /* max tso mbufs */

			/* num_tso_mbufs contains the number of newly created mbufs */
			if (num_tso_mbufs < 0)
				break;

			if (num_tso_mbufs >= 1) {
				mbuf = gso_mbufs;
				num_mbufs = num_tso_mbufs;
			} else {
				/* 0 means it can be transmitted directly
				 * without gso.
				 */
				mbuf = &mbuf_in;
				num_mbufs = 1;
			}
		} else {
			/* stats.errs will be incremented */
			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
				break;

			/* num_tso_mbufs == 0 indicates no new mbufs were created */
			num_tso_mbufs = 0;
			mbuf = &mbuf_in;
			num_mbufs = 1;
		}

		ret = tap_write_mbufs(txq, num_mbufs, mbuf,
				&num_packets, &num_tx_bytes);
		if (ret == -1) {
			txq->stats.errs++;
			/* free tso mbufs */
			if (num_tso_mbufs > 0)
				rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
			break;
		}
		num_tx++;
		/* free original mbuf */
		rte_pktmbuf_free(mbuf_in);
		/* free tso mbufs */
		if (num_tso_mbufs > 0)
			rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
	}

	txq->stats.opackets += num_packets;
	txq->stats.errs += nb_pkts - num_tx;
	txq->stats.obytes += num_tx_bytes;

	return num_tx;
}

static const char *
tap_ioctl_req2str(unsigned long request)
{
	switch (request) {
	case SIOCSIFFLAGS:
		return "SIOCSIFFLAGS";
	case SIOCGIFFLAGS:
		return "SIOCGIFFLAGS";
	case SIOCGIFHWADDR:
		return "SIOCGIFHWADDR";
	case SIOCSIFHWADDR:
		return "SIOCSIFHWADDR";
	case SIOCSIFMTU:
		return "SIOCSIFMTU";
	}
	return "UNKNOWN";
}

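/*
 * Run an ioctl on the tap netdevice, the remote netdevice, or both,
 * depending on the requested mode. For SIOCSIFFLAGS the current flags are
 * fetched first so that only the requested bits are set or cleared.
 */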
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
	  struct ifreq *ifr, int set, enum ioctl_mode mode)
{
	short req_flags = ifr->ifr_flags;
	int remote = pmd->remote_if_index &&
		(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

	if (!pmd->remote_if_index && mode == REMOTE_ONLY)
		return 0;
	/*
	 * If there is a remote netdevice, apply ioctl on it, then apply it on
	 * the tap netdevice.
	 */
apply:
	if (remote)
		strlcpy(ifr->ifr_name, pmd->remote_iface, IFNAMSIZ);
	else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
		strlcpy(ifr->ifr_name, pmd->name, IFNAMSIZ);
	switch (request) {
	case SIOCSIFFLAGS:
		/* fetch current flags to leave other flags untouched */
		if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
			goto error;
		if (set)
			ifr->ifr_flags |= req_flags;
		else
			ifr->ifr_flags &= ~req_flags;
		break;
	case SIOCGIFFLAGS:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
	case SIOCSIFMTU:
		break;
	default:
		TAP_LOG(WARNING, "%s: ioctl() called with wrong arg",
			pmd->name);
		return -EINVAL;
	}
	if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
		goto error;
	if (remote-- && mode == LOCAL_AND_REMOTE)
		goto apply;
	return 0;

error:
	TAP_LOG(DEBUG, "%s(%s) failed: %s(%d)", ifr->ifr_name,
		tap_ioctl_req2str(request), strerror(errno), errno);
	return -errno;
}

static int
tap_link_set_down(struct rte_eth_dev *dev)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = IFF_UP };

	dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
	return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
}

static int
tap_link_set_up(struct rte_eth_dev *dev)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = IFF_UP };

	dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
	return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
}

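/*
 * Queue file descriptors only exist in the primary process; send them to
 * secondary processes over the multi-process channel so that both can do
 * Rx/Tx on the same TUN/TAP queues.
 */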
static int
tap_mp_req_on_rxtx(struct rte_eth_dev *dev)
{
	struct rte_mp_msg msg;
	struct ipc_queues *request_param = (struct ipc_queues *)msg.param;
	int err;
	int fd_iterator = 0;
	struct pmd_process_private *process_private = dev->process_private;
	int i;

	memset(&msg, 0, sizeof(msg));
	strlcpy(msg.name, TAP_MP_REQ_START_RXTX, sizeof(msg.name));
	strlcpy(request_param->port_name, dev->data->name, sizeof(request_param->port_name));
	msg.len_param = sizeof(*request_param);
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		msg.fds[fd_iterator++] = process_private->txq_fds[i];
		msg.num_fds++;
		request_param->txq_count++;
	}
	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		msg.fds[fd_iterator++] = process_private->rxq_fds[i];
		msg.num_fds++;
		request_param->rxq_count++;
	}

	err = rte_mp_sendmsg(&msg);
	if (err < 0) {
		TAP_LOG(ERR, "Failed to send start req to secondary %d",
			rte_errno);
		return -1;
	}

	return 0;
}

static int
tap_dev_start(struct rte_eth_dev *dev)
{
	int err, i;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		tap_mp_req_on_rxtx(dev);

	err = tap_intr_handle_set(dev, 1);
	if (err)
		return err;

	err = tap_link_set_up(dev);
	if (err)
		return err;

	for (i = 0; i < dev->data->nb_tx_queues; i++)
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
	for (i = 0; i < dev->data->nb_rx_queues; i++)
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;

	return err;
}

static int
tap_mp_req_start_rxtx(const struct rte_mp_msg *request, __rte_unused const void *peer)
{
	struct rte_eth_dev *dev;
	const struct ipc_queues *request_param =
		(const struct ipc_queues *)request->param;
	int fd_iterator;
	int queue;
	struct pmd_process_private *process_private;

	dev = rte_eth_dev_get_by_name(request_param->port_name);
	if (!dev) {
		TAP_LOG(ERR, "Failed to get dev for %s",
			request_param->port_name);
		return -1;
	}
	process_private = dev->process_private;
	fd_iterator = 0;
	TAP_LOG(DEBUG, "tap_attach rx_q:%d tx_q:%d\n", request_param->rxq_count,
		request_param->txq_count);
	for (queue = 0; queue < request_param->txq_count; queue++)
		process_private->txq_fds[queue] = request->fds[fd_iterator++];
	for (queue = 0; queue < request_param->rxq_count; queue++)
		process_private->rxq_fds[queue] = request->fds[fd_iterator++];

	return 0;
}

/* This function gets called when the current port gets stopped.
 */
static int
tap_dev_stop(struct rte_eth_dev *dev)
{
	int i;

	for (i = 0; i < dev->data->nb_tx_queues; i++)
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
	for (i = 0; i < dev->data->nb_rx_queues; i++)
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;

	tap_intr_handle_set(dev, 0);
	tap_link_set_down(dev);

	return 0;
}

static int
tap_dev_configure(struct rte_eth_dev *dev)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
		TAP_LOG(ERR,
			"%s: number of rx queues %d exceeds max num of queues %d",
			dev->device->name,
			dev->data->nb_rx_queues,
			RTE_PMD_TAP_MAX_QUEUES);
		return -1;
	}
	if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
		TAP_LOG(ERR,
			"%s: number of tx queues %d exceeds max num of queues %d",
			dev->device->name,
			dev->data->nb_tx_queues,
			RTE_PMD_TAP_MAX_QUEUES);
		return -1;
	}
	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
		TAP_LOG(ERR,
			"%s: number of rx queues %d must be equal to number of tx queues %d",
			dev->device->name,
			dev->data->nb_rx_queues,
			dev->data->nb_tx_queues);
		return -1;
	}

	TAP_LOG(INFO, "%s: %s: TX configured queues number: %u",
		dev->device->name, pmd->name, dev->data->nb_tx_queues);

	TAP_LOG(INFO, "%s: %s: RX configured queues number: %u",
		dev->device->name, pmd->name, dev->data->nb_rx_queues);

	return 0;
}

static uint32_t
tap_dev_speed_capa(void)
{
	uint32_t speed = pmd_link.link_speed;
	uint32_t capa = 0;

	if (speed >= RTE_ETH_SPEED_NUM_10M)
		capa |= RTE_ETH_LINK_SPEED_10M;
	if (speed >= RTE_ETH_SPEED_NUM_100M)
		capa |= RTE_ETH_LINK_SPEED_100M;
	if (speed >= RTE_ETH_SPEED_NUM_1G)
		capa |= RTE_ETH_LINK_SPEED_1G;
	if (speed >= RTE_ETH_SPEED_NUM_2_5G)
		capa |= RTE_ETH_LINK_SPEED_2_5G;
	if (speed >= RTE_ETH_SPEED_NUM_5G)
		capa |= RTE_ETH_LINK_SPEED_5G;
	if (speed >= RTE_ETH_SPEED_NUM_10G)
		capa |= RTE_ETH_LINK_SPEED_10G;
	if (speed >= RTE_ETH_SPEED_NUM_20G)
		capa |= RTE_ETH_LINK_SPEED_20G;
	if (speed >= RTE_ETH_SPEED_NUM_25G)
		capa |= RTE_ETH_LINK_SPEED_25G;
	if (speed >= RTE_ETH_SPEED_NUM_40G)
		capa |= RTE_ETH_LINK_SPEED_40G;
	if (speed >= RTE_ETH_SPEED_NUM_50G)
		capa |= RTE_ETH_LINK_SPEED_50G;
	if (speed >= RTE_ETH_SPEED_NUM_56G)
		capa |= RTE_ETH_LINK_SPEED_56G;
	if (speed >= RTE_ETH_SPEED_NUM_100G)
		capa |= RTE_ETH_LINK_SPEED_100G;

	return capa;
}

static int
tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev_info->if_index = internals->if_index;
	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = (uint32_t)RTE_ETHER_MAX_VLAN_FRAME_LEN;
	dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
	dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
	dev_info->min_rx_bufsize = 0;
	dev_info->speed_capa = tap_dev_speed_capa();
	dev_info->rx_queue_offload_capa = TAP_RX_OFFLOAD;
	dev_info->rx_offload_capa = dev_info->rx_queue_offload_capa;
	dev_info->tx_queue_offload_capa = TAP_TX_OFFLOAD;
	dev_info->tx_offload_capa = dev_info->tx_queue_offload_capa;
	dev_info->hash_key_size = TAP_RSS_HASH_KEY_SIZE;
	/*
	 * limitation: TAP supports all of IP, UDP and TCP hash
	 * functions together and not in partial combinations
	 */
	dev_info->flow_type_rss_offloads = ~TAP_RSS_HF_MASK;
	dev_info->dev_capa &= ~RTE_ETH_DEV_CAPA_FLOW_RULE_KEEP;

	return 0;
}

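/*
 * Aggregate the software per-queue counters into the ethdev statistics.
 * Per-queue entries are only reported for the first
 * RTE_ETHDEV_QUEUE_STAT_CNTRS queues.
 */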
static int
tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
{
	unsigned int i, imax;
	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
	unsigned long rx_nombuf = 0, ierrors = 0;
	const struct pmd_internals *pmd = dev->data->dev_private;

	/* rx queue statistics */
	imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
		dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
	for (i = 0; i < imax; i++) {
		tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
		tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
		rx_total += tap_stats->q_ipackets[i];
		rx_bytes_total += tap_stats->q_ibytes[i];
		rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
		ierrors += pmd->rxq[i].stats.ierrors;
	}

	/* tx queue statistics */
	imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
		dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;

	for (i = 0; i < imax; i++) {
		tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
		tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
		tx_total += tap_stats->q_opackets[i];
		tx_err_total += pmd->txq[i].stats.errs;
		tx_bytes_total += tap_stats->q_obytes[i];
	}

	tap_stats->ipackets = rx_total;
	tap_stats->ibytes = rx_bytes_total;
	tap_stats->ierrors = ierrors;
	tap_stats->rx_nombuf = rx_nombuf;
	tap_stats->opackets = tx_total;
	tap_stats->oerrors = tx_err_total;
	tap_stats->obytes = tx_bytes_total;
	return 0;
}

static int
tap_stats_reset(struct rte_eth_dev *dev)
{
	int i;
	struct pmd_internals *pmd = dev->data->dev_private;

	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
		pmd->rxq[i].stats.ipackets = 0;
		pmd->rxq[i].stats.ibytes = 0;
		pmd->rxq[i].stats.ierrors = 0;
		pmd->rxq[i].stats.rx_nombuf = 0;

		pmd->txq[i].stats.opackets = 0;
		pmd->txq[i].stats.errs = 0;
		pmd->txq[i].stats.obytes = 0;
	}

	return 0;
}

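/*
 * Release all device resources: flow rules, netlink socket, queue file
 * descriptors, the GSO mbuf pool and the keep-alive fd. In a secondary
 * process only the process-private data is freed.
 */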
static int
tap_dev_close(struct rte_eth_dev *dev)
{
	int i;
	struct pmd_internals *internals = dev->data->dev_private;
	struct pmd_process_private *process_private = dev->process_private;
	struct rx_queue *rxq;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
		rte_free(dev->process_private);
		if (tap_devices_count == 1)
			rte_mp_action_unregister(TAP_MP_REQ_START_RXTX);
		tap_devices_count--;
		return 0;
	}

	tap_link_set_down(dev);
	if (internals->nlsk_fd != -1) {
		tap_flow_flush(dev, NULL);
		tap_flow_implicit_flush(internals, NULL);
		tap_nl_final(internals->nlsk_fd);
		internals->nlsk_fd = -1;
	}

	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
		if (process_private->rxq_fds[i] != -1) {
			rxq = &internals->rxq[i];
			close(process_private->rxq_fds[i]);
			process_private->rxq_fds[i] = -1;
			tap_rxq_pool_free(rxq->pool);
			rte_free(rxq->iovecs);
			rxq->pool = NULL;
			rxq->iovecs = NULL;
		}
		if (process_private->txq_fds[i] != -1) {
			close(process_private->txq_fds[i]);
			process_private->txq_fds[i] = -1;
		}
	}

	if (internals->remote_if_index) {
		/* Restore initial remote state */
		int ret = ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
				&internals->remote_initial_flags);
		if (ret)
			TAP_LOG(ERR, "restore remote state failed: %d", ret);

	}

	rte_mempool_free(internals->gso_ctx_mp);
	internals->gso_ctx_mp = NULL;

	if (internals->ka_fd != -1) {
		close(internals->ka_fd);
		internals->ka_fd = -1;
	}

	/* mac_addrs must not be freed alone because part of dev_private */
	dev->data->mac_addrs = NULL;

	internals = dev->data->dev_private;
	TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
		tuntap_types[internals->type], rte_socket_id());

	if (internals->ioctl_sock != -1) {
		close(internals->ioctl_sock);
		internals->ioctl_sock = -1;
	}
	rte_free(dev->process_private);
	if (tap_devices_count == 1)
		rte_mp_action_unregister(TAP_MP_KEY);
	tap_devices_count--;
	/*
	 * Since TUN device has no more opened file descriptors
	 * it will be removed from kernel
	 */

	return 0;
}

static void
tap_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
	struct rx_queue *rxq = dev->data->rx_queues[qid];
	struct pmd_process_private *process_private;

	if (!rxq)
		return;
	process_private = rte_eth_devices[rxq->in_port].process_private;
	if (process_private->rxq_fds[rxq->queue_id] != -1) {
		close(process_private->rxq_fds[rxq->queue_id]);
		process_private->rxq_fds[rxq->queue_id] = -1;
		tap_rxq_pool_free(rxq->pool);
		rte_free(rxq->iovecs);
		rxq->pool = NULL;
		rxq->iovecs = NULL;
	}
}

static void
tap_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
	struct tx_queue *txq = dev->data->tx_queues[qid];
	struct pmd_process_private *process_private;

	if (!txq)
		return;
	process_private = rte_eth_devices[txq->out_port].process_private;

	if (process_private->txq_fds[txq->queue_id] != -1) {
		close(process_private->txq_fds[txq->queue_id]);
		process_private->txq_fds[txq->queue_id] = -1;
	}
}

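/*
 * Report link status from the kernel interface flags: the link is up only
 * when both IFF_UP and IFF_RUNNING are set (checked on the remote netdevice
 * first, when one is configured).
 */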
static int
tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
{
	struct rte_eth_link *dev_link = &dev->data->dev_link;
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = 0 };

	if (pmd->remote_if_index) {
		tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
		if (!(ifr.ifr_flags & IFF_UP) ||
		    !(ifr.ifr_flags & IFF_RUNNING)) {
			dev_link->link_status = RTE_ETH_LINK_DOWN;
			return 0;
		}
	}
	tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
	dev_link->link_status =
		((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
		 RTE_ETH_LINK_UP :
		 RTE_ETH_LINK_DOWN);
	return 0;
}

static int
tap_promisc_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
	int ret;

	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
	if (ret != 0)
		return ret;

	if (pmd->remote_if_index && !pmd->flow_isolate) {
		dev->data->promiscuous = 1;
		ret = tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
		if (ret != 0) {
			/* Rollback promisc flag */
			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
			/*
			 * rte_eth_dev_promiscuous_enable() rolls back
			 * dev->data->promiscuous in the case of failure.
			 */
			return ret;
		}
	}

	return 0;
}

static int
tap_promisc_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
	int ret;

	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
	if (ret != 0)
		return ret;

	if (pmd->remote_if_index && !pmd->flow_isolate) {
		dev->data->promiscuous = 0;
		ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
		if (ret != 0) {
			/* Rollback promisc flag */
			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
			/*
			 * rte_eth_dev_promiscuous_disable() rolls back
			 * dev->data->promiscuous in the case of failure.
			 */
			return ret;
		}
	}

	return 0;
}

static int
tap_allmulti_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
	int ret;

	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
	if (ret != 0)
		return ret;

	if (pmd->remote_if_index && !pmd->flow_isolate) {
		dev->data->all_multicast = 1;
		ret = tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
		if (ret != 0) {
			/* Rollback allmulti flag */
			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
			/*
			 * rte_eth_dev_allmulticast_enable() rolls back
			 * dev->data->all_multicast in the case of failure.
			 */
			return ret;
		}
	}

	return 0;
}

static int
tap_allmulti_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
	int ret;

	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
	if (ret != 0)
		return ret;

	if (pmd->remote_if_index && !pmd->flow_isolate) {
		dev->data->all_multicast = 0;
		ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
		if (ret != 0) {
			/* Rollback allmulti flag */
			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
			/*
			 * rte_eth_dev_allmulticast_disable() rolls back
			 * dev->data->all_multicast in the case of failure.
			 */
			return ret;
		}
	}

	return 0;
}

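/*
 * Set the MAC address on the tap netdevice and, when a remote netdevice is
 * configured, keep both interfaces and the implicit MAC redirection flow
 * rule in sync with the new address.
 */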
static int
tap_mac_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	enum ioctl_mode mode = LOCAL_ONLY;
	struct ifreq ifr;
	int ret;

	if (pmd->type == ETH_TUNTAP_TYPE_TUN) {
		TAP_LOG(ERR, "%s: can't set MAC address for TUN",
			dev->device->name);
		return -ENOTSUP;
	}

	if (rte_is_zero_ether_addr(mac_addr)) {
		TAP_LOG(ERR, "%s: can't set an empty MAC address",
			dev->device->name);
		return -EINVAL;
	}
	/* Check the actual current MAC address on the tap netdevice */
	ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY);
	if (ret < 0)
		return ret;
	if (rte_is_same_ether_addr(
			(struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
			mac_addr))
		return 0;
	/* Check the current MAC address on the remote */
	ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY);
	if (ret < 0)
		return ret;
	if (!rte_is_same_ether_addr(
			(struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
			mac_addr))
		mode = LOCAL_AND_REMOTE;
	ifr.ifr_hwaddr.sa_family = AF_LOCAL;
	rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, RTE_ETHER_ADDR_LEN);
	ret = tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode);
	if (ret < 0)
		return ret;
	rte_memcpy(&pmd->eth_addr, mac_addr, RTE_ETHER_ADDR_LEN);
	if (pmd->remote_if_index && !pmd->flow_isolate) {
		/* Replace MAC redirection rule after a MAC change */
		ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC);
		if (ret < 0) {
			TAP_LOG(ERR,
				"%s: Couldn't delete MAC redirection rule",
				dev->device->name);
			return ret;
		}
		ret = tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC);
		if (ret < 0) {
			TAP_LOG(ERR,
				"%s: Couldn't add MAC redirection rule",
				dev->device->name);
			return ret;
		}
	}

	return 0;
}

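/*
 * Initialize the GSO context of a Tx queue, creating the per-device mbuf
 * pool used for software segmentation on first use.
 */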
static int
tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
{
	uint32_t gso_types;
	char pool_name[64];
	struct pmd_internals *pmd = dev->data->dev_private;
	int ret;

	/* initialize GSO context */
	gso_types = RTE_ETH_TX_OFFLOAD_TCP_TSO;
	if (!pmd->gso_ctx_mp) {
		/*
		 * Create a private mbuf pool with TAP_GSO_MBUF_SEG_SIZE
		 * bytes per mbuf; use this pool for both direct and
		 * indirect mbufs
		 */
		ret = snprintf(pool_name, sizeof(pool_name), "mp_%s",
				dev->device->name);
		if (ret < 0 || ret >= (int)sizeof(pool_name)) {
			TAP_LOG(ERR,
				"%s: failed to create mbuf pool name for device %s,"
				"device name too long or output error, ret: %d\n",
				pmd->name, dev->device->name, ret);
			return -ENAMETOOLONG;
		}
		pmd->gso_ctx_mp = rte_pktmbuf_pool_create(pool_name,
			TAP_GSO_MBUFS_NUM, TAP_GSO_MBUF_CACHE_SIZE, 0,
			RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
			SOCKET_ID_ANY);
		if (!pmd->gso_ctx_mp) {
			TAP_LOG(ERR,
				"%s: failed to create mbuf pool for device %s\n",
				pmd->name, dev->device->name);
			return -1;
		}
	}

	gso_ctx->direct_pool = pmd->gso_ctx_mp;
	gso_ctx->indirect_pool = pmd->gso_ctx_mp;
	gso_ctx->gso_types = gso_types;
	gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
	gso_ctx->flag = 0;

	return 0;
}

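/*
 * Obtain a file descriptor for a queue: reuse the existing fd for this
 * queue, duplicate the peer queue's fd, or allocate a new one from
 * /dev/net/tun.
 */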
static int
tap_setup_queue(struct rte_eth_dev *dev,
		struct pmd_internals *internals,
		uint16_t qid,
		int is_rx)
{
	int ret;
	int *fd;
	int *other_fd;
	const char *dir;
	struct pmd_internals *pmd = dev->data->dev_private;
	struct pmd_process_private *process_private = dev->process_private;
	struct rx_queue *rx = &internals->rxq[qid];
	struct tx_queue *tx = &internals->txq[qid];
	struct rte_gso_ctx *gso_ctx;

	if (is_rx) {
		fd = &process_private->rxq_fds[qid];
		other_fd = &process_private->txq_fds[qid];
		dir = "rx";
		gso_ctx = NULL;
	} else {
		fd = &process_private->txq_fds[qid];
		other_fd = &process_private->rxq_fds[qid];
		dir = "tx";
		gso_ctx = &tx->gso_ctx;
	}
	if (*fd != -1) {
		/* fd for this queue already exists */
		TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
			pmd->name, *fd, dir, qid);
		gso_ctx = NULL;
	} else if (*other_fd != -1) {
		/* Only other_fd exists. dup it */
		*fd = dup(*other_fd);
		if (*fd < 0) {
			*fd = -1;
			TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
			return -1;
		}
		TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
			pmd->name, *other_fd, dir, qid, *fd);
	} else {
		/* Both RX and TX fds do not exist (equal -1). Create fd */
		*fd = tun_alloc(pmd, 0);
		if (*fd < 0) {
			*fd = -1; /* restore original value */
			TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
			return -1;
		}
		TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
			pmd->name, dir, qid, *fd);
	}

	tx->mtu = &dev->data->mtu;
	rx->rxmode = &dev->data->dev_conf.rxmode;
	if (gso_ctx) {
		ret = tap_gso_ctx_setup(gso_ctx, dev);
		if (ret)
			return -1;
	}

	tx->type = pmd->type;

	return *fd;
}

static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc,
		   unsigned int socket_id,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mp)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pmd_process_private *process_private = dev->process_private;
	struct rx_queue *rxq = &internals->rxq[rx_queue_id];
	struct rte_mbuf **tmp = &rxq->pool;
	long iov_max = sysconf(_SC_IOV_MAX);

	if (iov_max <= 0) {
		TAP_LOG(WARNING,
			"_SC_IOV_MAX is not defined. Using %d as default",
			TAP_IOV_DEFAULT_MAX);
		iov_max = TAP_IOV_DEFAULT_MAX;
	}
	uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
	struct iovec (*iovecs)[nb_desc + 1];
	int data_off = RTE_PKTMBUF_HEADROOM;
	int ret = 0;
	int fd;
	int i;

	if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
		TAP_LOG(WARNING,
			"nb_rx_queues %d too small or mempool NULL",
			dev->data->nb_rx_queues);
		return -1;
	}

	rxq->mp = mp;
	rxq->trigger_seen = 1; /* force initial burst */
	rxq->in_port = dev->data->port_id;
	rxq->queue_id = rx_queue_id;
	rxq->nb_rx_desc = nb_desc;
	iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
				    socket_id);
	if (!iovecs) {
		TAP_LOG(WARNING,
			"%s: Couldn't allocate %d RX descriptors",
			dev->device->name, nb_desc);
		return -ENOMEM;
	}
	rxq->iovecs = iovecs;

	dev->data->rx_queues[rx_queue_id] = rxq;
	fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
	if (fd == -1) {
		ret = fd;
		goto error;
	}

	(*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
	(*rxq->iovecs)[0].iov_base = &rxq->pi;

	for (i = 1; i <= nb_desc; i++) {
		*tmp = rte_pktmbuf_alloc(rxq->mp);
		if (!*tmp) {
			TAP_LOG(WARNING,
				"%s: couldn't allocate memory for queue %d",
				dev->device->name, rx_queue_id);
			ret = -ENOMEM;
			goto error;
		}
		(*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
		(*rxq->iovecs)[i].iov_base =
			(char *)(*tmp)->buf_addr + data_off;
		data_off = 0;
		tmp = &(*tmp)->next;
	}

	TAP_LOG(DEBUG, "  RX TUNTAP device name %s, qid %d on fd %d",
		internals->name, rx_queue_id,
		process_private->rxq_fds[rx_queue_id]);

	return 0;

error:
	tap_rxq_pool_free(rxq->pool);
	rxq->pool = NULL;
	rte_free(rxq->iovecs);
	rxq->iovecs = NULL;
	return ret;
}

static int
tap_tx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t tx_queue_id,
		   uint16_t nb_tx_desc __rte_unused,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_txconf *tx_conf)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pmd_process_private *process_private = dev->process_private;
	struct tx_queue *txq;
	int ret;
	uint64_t offloads;

	if (tx_queue_id >= dev->data->nb_tx_queues)
		return -1;
	dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
	txq = dev->data->tx_queues[tx_queue_id];
	txq->out_port = dev->data->port_id;
	txq->queue_id = tx_queue_id;

	offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
	txq->csum = !!(offloads &
			(RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			 RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
			 RTE_ETH_TX_OFFLOAD_TCP_CKSUM));

	ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
	if (ret == -1)
		return -1;
	TAP_LOG(DEBUG,
		"  TX TUNTAP device name %s, qid %d on fd %d csum %s",
		internals->name, tx_queue_id,
		process_private->txq_fds[tx_queue_id],
		txq->csum ? "on" : "off");

	return 0;
}

static int
tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_mtu = mtu };

	return tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
}

static int
tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
		     struct rte_ether_addr *mc_addr_set __rte_unused,
		     uint32_t nb_mc_addr __rte_unused)
{
	/*
	 * Nothing to do actually: the tap has no filtering whatsoever, every
	 * packet is received.
	 */
	return 0;
}

static int
tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
{
	struct rte_eth_dev *dev = arg;
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifinfomsg *info = NLMSG_DATA(nh);

	if (nh->nlmsg_type != RTM_NEWLINK ||
	    (info->ifi_index != pmd->if_index &&
	     info->ifi_index != pmd->remote_if_index))
		return 0;
	return tap_link_update(dev, 0);
}

static void
tap_dev_intr_handler(void *cb_arg)
{
	struct rte_eth_dev *dev = cb_arg;
	struct pmd_internals *pmd = dev->data->dev_private;

	if (rte_intr_fd_get(pmd->intr_handle) >= 0)
		tap_nl_recv(rte_intr_fd_get(pmd->intr_handle),
			    tap_nl_msg_handler, dev);
}

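/*
 * Register or unregister the netlink-based link state change interrupt
 * handler, depending on 'set' and on whether LSC interrupts are enabled in
 * the device configuration.
 */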
static int
tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	int ret;

	/* In any case, disable interrupt if the conf is no longer there. */
	if (!dev->data->dev_conf.intr_conf.lsc) {
		if (rte_intr_fd_get(pmd->intr_handle) != -1)
			goto clean;

		return 0;
	}
	if (set) {
		rte_intr_fd_set(pmd->intr_handle, tap_nl_init(RTMGRP_LINK));
		if (unlikely(rte_intr_fd_get(pmd->intr_handle) == -1))
			return -EBADF;
		return rte_intr_callback_register(
			pmd->intr_handle, tap_dev_intr_handler, dev);
	}

clean:
	do {
		ret = rte_intr_callback_unregister(pmd->intr_handle,
			tap_dev_intr_handler, dev);
		if (ret >= 0) {
			break;
		} else if (ret == -EAGAIN) {
			rte_delay_ms(100);
		} else {
			TAP_LOG(ERR, "intr callback unregister failed: %d",
				ret);
			break;
		}
	} while (true);

	if (rte_intr_fd_get(pmd->intr_handle) >= 0) {
		tap_nl_final(rte_intr_fd_get(pmd->intr_handle));
		rte_intr_fd_set(pmd->intr_handle, -1);
	}

	return 0;
}

static int
tap_intr_handle_set(struct rte_eth_dev *dev, int set)
{
	int err;

	err = tap_lsc_intr_handle_set(dev, set);
	if (err < 0) {
		if (!set)
			tap_rx_intr_vec_set(dev, 0);
		return err;
	}
	err = tap_rx_intr_vec_set(dev, set);
	if (err && set)
		tap_lsc_intr_handle_set(dev, 0);
	return err;
}

static const uint32_t*
tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
{
	static const uint32_t ptypes[] = {
		RTE_PTYPE_INNER_L2_ETHER,
		RTE_PTYPE_INNER_L2_ETHER_VLAN,
		RTE_PTYPE_INNER_L2_ETHER_QINQ,
		RTE_PTYPE_INNER_L3_IPV4,
		RTE_PTYPE_INNER_L3_IPV4_EXT,
		RTE_PTYPE_INNER_L3_IPV6,
		RTE_PTYPE_INNER_L3_IPV6_EXT,
		RTE_PTYPE_INNER_L4_FRAG,
		RTE_PTYPE_INNER_L4_UDP,
		RTE_PTYPE_INNER_L4_TCP,
		RTE_PTYPE_INNER_L4_SCTP,
		RTE_PTYPE_L2_ETHER,
		RTE_PTYPE_L2_ETHER_VLAN,
		RTE_PTYPE_L2_ETHER_QINQ,
		RTE_PTYPE_L3_IPV4,
		RTE_PTYPE_L3_IPV4_EXT,
		RTE_PTYPE_L3_IPV6_EXT,
		RTE_PTYPE_L3_IPV6,
		RTE_PTYPE_L4_FRAG,
		RTE_PTYPE_L4_UDP,
		RTE_PTYPE_L4_TCP,
		RTE_PTYPE_L4_SCTP,
	};

	return ptypes;
}

static int
tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
		  struct rte_eth_fc_conf *fc_conf)
{
	fc_conf->mode = RTE_ETH_FC_NONE;
	return 0;
}

static int
tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
		  struct rte_eth_fc_conf *fc_conf)
{
	if (fc_conf->mode != RTE_ETH_FC_NONE)
		return -ENOTSUP;
	return 0;
}

/**
 * DPDK callback to update the RSS hash configuration.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[in] rss_conf
 *   RSS configuration data.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
tap_rss_hash_update(struct rte_eth_dev *dev,
		struct rte_eth_rss_conf *rss_conf)
{
	if (rss_conf->rss_hf & TAP_RSS_HF_MASK) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (rss_conf->rss_key && rss_conf->rss_key_len) {
		/*
		 * Currently TAP RSS key is hard coded
		 * and cannot be updated
		 */
		TAP_LOG(ERR,
			"port %u RSS key cannot be updated",
			dev->data->port_id);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return 0;
}

static int
tap_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;

	return 0;
}

static int
tap_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id)
{
	dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;

	return 0;
}

static int
tap_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;

	return 0;
}

static int
tap_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
{
	dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;

	return 0;
}

static const struct eth_dev_ops ops = {
	.dev_start = tap_dev_start,
	.dev_stop = tap_dev_stop,
	.dev_close = tap_dev_close,
	.dev_configure = tap_dev_configure,
	.dev_infos_get = tap_dev_info,
	.rx_queue_setup = tap_rx_queue_setup,
	.tx_queue_setup = tap_tx_queue_setup,
	.rx_queue_start = tap_rx_queue_start,
	.tx_queue_start = tap_tx_queue_start,
	.rx_queue_stop = tap_rx_queue_stop,
	.tx_queue_stop = tap_tx_queue_stop,
	.rx_queue_release = tap_rx_queue_release,
	.tx_queue_release = tap_tx_queue_release,
	.flow_ctrl_get = tap_flow_ctrl_get,
	.flow_ctrl_set = tap_flow_ctrl_set,
	.link_update = tap_link_update,
	.dev_set_link_up = tap_link_set_up,
	.dev_set_link_down = tap_link_set_down,
	.promiscuous_enable = tap_promisc_enable,
	.promiscuous_disable = tap_promisc_disable,
	.allmulticast_enable = tap_allmulti_enable,
	.allmulticast_disable = tap_allmulti_disable,
	.mac_addr_set = tap_mac_set,
	.mtu_set = tap_mtu_set,
	.set_mc_addr_list = tap_set_mc_addr_list,
	.stats_get = tap_stats_get,
	.stats_reset = tap_stats_reset,
	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
	.rss_hash_update = tap_rss_hash_update,
	.flow_ops_get = tap_dev_flow_ops_get,
};

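/*
 * Allocate and initialize an ethdev for a TUN/TAP interface: private data,
 * management socket, interrupt handle, keep-alive queue and, when a remote
 * netdevice is used, the netlink/rte_flow plumbing.
 */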
static int
eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name,
		   char *remote_iface, struct rte_ether_addr *mac_addr,
		   enum rte_tuntap_type type)
{
	int numa_node = rte_socket_id();
	struct rte_eth_dev *dev;
	struct pmd_internals *pmd;
	struct pmd_process_private *process_private;
	const char *tuntap_name = tuntap_types[type];
	struct rte_eth_dev_data *data;
	struct ifreq ifr;
	int i;

	TAP_LOG(DEBUG, "%s device on numa %u", tuntap_name, rte_socket_id());

	dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
	if (!dev) {
		TAP_LOG(ERR, "%s Unable to allocate device struct",
			tuntap_name);
		goto error_exit_nodev;
	}

	process_private = (struct pmd_process_private *)
		rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
			RTE_CACHE_LINE_SIZE, dev->device->numa_node);

	if (process_private == NULL) {
		TAP_LOG(ERR, "Failed to alloc memory for process private");
		return -1;
	}
	pmd = dev->data->dev_private;
	dev->process_private = process_private;
	pmd->dev = dev;
	strlcpy(pmd->name, tap_name, sizeof(pmd->name));
	pmd->type = type;
	pmd->ka_fd = -1;
	pmd->nlsk_fd = -1;
	pmd->gso_ctx_mp = NULL;

	pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (pmd->ioctl_sock == -1) {
		TAP_LOG(ERR,
			"%s Unable to get a socket for management: %s",
			tuntap_name, strerror(errno));
		goto error_exit;
	}

	/* Allocate interrupt instance */
	pmd->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
	if (pmd->intr_handle == NULL) {
		TAP_LOG(ERR, "Failed to allocate intr handle");
		goto error_exit;
	}

1982 1983 if (process_private == NULL) { 1984 TAP_LOG(ERR, "Failed to alloc memory for process private"); 1985 return -1; 1986 } 1987 pmd = dev->data->dev_private; 1988 dev->process_private = process_private; 1989 pmd->dev = dev; 1990 strlcpy(pmd->name, tap_name, sizeof(pmd->name)); 1991 pmd->type = type; 1992 pmd->ka_fd = -1; 1993 pmd->nlsk_fd = -1; 1994 pmd->gso_ctx_mp = NULL; 1995 1996 pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0); 1997 if (pmd->ioctl_sock == -1) { 1998 TAP_LOG(ERR, 1999 "%s Unable to get a socket for management: %s", 2000 tuntap_name, strerror(errno)); 2001 goto error_exit; 2002 } 2003 2004 /* Allocate interrupt instance */ 2005 pmd->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED); 2006 if (pmd->intr_handle == NULL) { 2007 TAP_LOG(ERR, "Failed to allocate intr handle"); 2008 goto error_exit; 2009 } 2010 2011 /* Setup some default values */ 2012 data = dev->data; 2013 data->dev_private = pmd; 2014 data->dev_flags = RTE_ETH_DEV_INTR_LSC | 2015 RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS; 2016 data->numa_node = numa_node; 2017 2018 data->dev_link = pmd_link; 2019 data->mac_addrs = &pmd->eth_addr; 2020 /* Set the number of RX and TX queues */ 2021 data->nb_rx_queues = 0; 2022 data->nb_tx_queues = 0; 2023 2024 dev->dev_ops = &ops; 2025 dev->rx_pkt_burst = pmd_rx_burst; 2026 dev->tx_pkt_burst = pmd_tx_burst; 2027 2028 rte_intr_type_set(pmd->intr_handle, RTE_INTR_HANDLE_EXT); 2029 rte_intr_fd_set(pmd->intr_handle, -1); 2030 dev->intr_handle = pmd->intr_handle; 2031 2032 /* Presetup the fds to -1 as being not valid */ 2033 for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) { 2034 process_private->rxq_fds[i] = -1; 2035 process_private->txq_fds[i] = -1; 2036 } 2037 2038 if (pmd->type == ETH_TUNTAP_TYPE_TAP) { 2039 if (rte_is_zero_ether_addr(mac_addr)) 2040 rte_eth_random_addr((uint8_t *)&pmd->eth_addr); 2041 else 2042 rte_memcpy(&pmd->eth_addr, mac_addr, sizeof(*mac_addr)); 2043 } 2044 2045 /* 2046 * Allocate a TUN device keep-alive file descriptor that will only be 2047 * closed when the TUN device itself is closed or removed. 
2048 * This keep-alive file descriptor will guarantee that the TUN device 2049 exists even when all of its queues are closed 2050 */
2051 pmd->ka_fd = tun_alloc(pmd, 1); 2052 if (pmd->ka_fd == -1) { 2053 TAP_LOG(ERR, "Unable to create %s interface", tuntap_name); 2054 goto error_exit; 2055 } 2056 TAP_LOG(DEBUG, "allocated %s", pmd->name); 2057
2058 ifr.ifr_mtu = dev->data->mtu; 2059 if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0) 2060 goto error_exit; 2061
2062 if (pmd->type == ETH_TUNTAP_TYPE_TAP) { 2063 memset(&ifr, 0, sizeof(struct ifreq)); 2064 ifr.ifr_hwaddr.sa_family = AF_LOCAL; 2065 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr, 2066 RTE_ETHER_ADDR_LEN); 2067 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) 2068 goto error_exit; 2069 } 2070
2071 /* 2072 * Set up everything related to rte_flow: 2073 * - netlink socket 2074 * - tap / remote if_index 2075 * - mandatory QDISCs 2076 * - rte_flow actual/implicit lists 2077 * - implicit rules 2078 */
2079 pmd->nlsk_fd = tap_nl_init(0); 2080 if (pmd->nlsk_fd == -1) { 2081 TAP_LOG(WARNING, "%s: failed to create netlink socket.", 2082 pmd->name); 2083 goto disable_rte_flow; 2084 }
2085 pmd->if_index = if_nametoindex(pmd->name); 2086 if (!pmd->if_index) { 2087 TAP_LOG(ERR, "%s: failed to get if_index.", pmd->name); 2088 goto disable_rte_flow; 2089 }
2090 if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) { 2091 TAP_LOG(ERR, "%s: failed to create multiq qdisc.", 2092 pmd->name); 2093 goto disable_rte_flow; 2094 }
2095 if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) { 2096 TAP_LOG(ERR, "%s: failed to create ingress qdisc.", 2097 pmd->name); 2098 goto disable_rte_flow; 2099 }
2100 LIST_INIT(&pmd->flows); 2101
2102 if (strlen(remote_iface)) { 2103 pmd->remote_if_index = if_nametoindex(remote_iface); 2104 if (!pmd->remote_if_index) { 2105 TAP_LOG(ERR, "%s: failed to get %s if_index.", 2106 pmd->name, remote_iface); 2107 goto error_remote; 2108 }
2109 strlcpy(pmd->remote_iface, remote_iface, RTE_ETH_NAME_MAX_LEN); 2110
2111 /* Save state of remote device */ 2112 tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags, 0, REMOTE_ONLY); 2113
2114 /* Replicate remote MAC address */ 2115 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) { 2116 TAP_LOG(ERR, "%s: failed to get %s MAC address.", 2117 pmd->name, pmd->remote_iface); 2118 goto error_remote; 2119 }
2120 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data, 2121 RTE_ETHER_ADDR_LEN);
2122 /* Apply the remote MAC address (already in ifreq after SIOCGIFHWADDR) to the local tap interface. */ 2123 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) { 2124 TAP_LOG(ERR, "%s: failed to set %s MAC address.", 2125 pmd->name, remote_iface); 2126 goto error_remote; 2127 } 2128
2129 /* 2130 * Flush usually returns negative value because it tries to 2131 * delete every QDISC (and on a running device, one QDISC at 2132 least is needed). Ignore negative return value.
2133 */ 2134 qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index); 2135 if (qdisc_create_ingress(pmd->nlsk_fd, 2136 pmd->remote_if_index) < 0) { 2137 TAP_LOG(ERR, "%s: failed to create ingress qdisc.", 2138 pmd->remote_iface); 2139 goto error_remote; 2140 } 2141 LIST_INIT(&pmd->implicit_flows); 2142 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 || 2143 tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 || 2144 tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 || 2145 tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) { 2146 TAP_LOG(ERR, 2147 "%s: failed to create implicit rules.", 2148 pmd->name); 2149 goto error_remote; 2150 } 2151 } 2152 2153 rte_eth_dev_probing_finish(dev); 2154 return 0; 2155 2156 disable_rte_flow: 2157 TAP_LOG(ERR, " Disabling rte flow support: %s(%d)", 2158 strerror(errno), errno); 2159 if (strlen(remote_iface)) { 2160 TAP_LOG(ERR, "Remote feature requires flow support."); 2161 goto error_exit; 2162 } 2163 rte_eth_dev_probing_finish(dev); 2164 return 0; 2165 2166 error_remote: 2167 TAP_LOG(ERR, " Can't set up remote feature: %s(%d)", 2168 strerror(errno), errno); 2169 tap_flow_implicit_flush(pmd, NULL); 2170 2171 error_exit: 2172 if (pmd->nlsk_fd != -1) 2173 close(pmd->nlsk_fd); 2174 if (pmd->ka_fd != -1) 2175 close(pmd->ka_fd); 2176 if (pmd->ioctl_sock != -1) 2177 close(pmd->ioctl_sock); 2178 /* mac_addrs must not be freed alone because part of dev_private */ 2179 dev->data->mac_addrs = NULL; 2180 rte_eth_dev_release_port(dev); 2181 rte_intr_instance_free(pmd->intr_handle); 2182 2183 error_exit_nodev: 2184 TAP_LOG(ERR, "%s Unable to initialize %s", 2185 tuntap_name, rte_vdev_device_name(vdev)); 2186 2187 return -EINVAL; 2188 } 2189 2190 /* make sure name is a possible Linux network device name */ 2191 static bool 2192 is_valid_iface(const char *name) 2193 { 2194 if (*name == '\0') 2195 return false; 2196 2197 if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) 2198 return false; 2199 2200 while (*name) { 2201 if (*name == '/' || *name == ':' || isspace(*name)) 2202 return false; 2203 name++; 2204 } 2205 return true; 2206 } 2207 2208 static int 2209 set_interface_name(const char *key __rte_unused, 2210 const char *value, 2211 void *extra_args) 2212 { 2213 char *name = (char *)extra_args; 2214 2215 if (value) { 2216 if (!is_valid_iface(value)) { 2217 TAP_LOG(ERR, "TAP invalid remote interface name (%s)", 2218 value); 2219 return -1; 2220 } 2221 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN); 2222 } else { 2223 /* use tap%d which causes kernel to choose next available */ 2224 strlcpy(name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN); 2225 } 2226 return 0; 2227 } 2228 2229 static int 2230 set_remote_iface(const char *key __rte_unused, 2231 const char *value, 2232 void *extra_args) 2233 { 2234 char *name = (char *)extra_args; 2235 2236 if (value) { 2237 if (!is_valid_iface(value)) { 2238 TAP_LOG(ERR, "TAP invalid remote interface name (%s)", 2239 value); 2240 return -1; 2241 } 2242 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN); 2243 } 2244 2245 return 0; 2246 } 2247 2248 static int parse_user_mac(struct rte_ether_addr *user_mac, 2249 const char *value) 2250 { 2251 unsigned int index = 0; 2252 char mac_temp[strlen(ETH_TAP_USR_MAC_FMT) + 1], *mac_byte = NULL; 2253 2254 if (user_mac == NULL || value == NULL) 2255 return 0; 2256 2257 strlcpy(mac_temp, value, sizeof(mac_temp)); 2258 mac_byte = strtok(mac_temp, ":"); 2259 2260 while ((mac_byte != NULL) && 2261 (strlen(mac_byte) <= 2) && 2262 (strlen(mac_byte) == strspn(mac_byte, 2263 ETH_TAP_CMP_MAC_FMT))) { 2264 
if (index < RTE_ETHER_ADDR_LEN) user_mac->addr_bytes[index] = strtoul(mac_byte, NULL, 16); index++; 2265 mac_byte = strtok(NULL, ":"); 2266 } 2267
2268 return index; 2269 } 2270
2271 static int 2272 set_mac_type(const char *key __rte_unused, 2273 const char *value, 2274 void *extra_args) 2275 { 2276 struct rte_ether_addr *user_mac = extra_args; 2277
2278 if (!value) 2279 return 0; 2280
2281 if (!strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED))) { 2282 static int iface_idx; 2283
2284 /* fixed mac = 00:64:74:61:70:<iface_idx> */ 2285 memcpy((char *)user_mac->addr_bytes, "\0dtap", 2286 RTE_ETHER_ADDR_LEN); 2287 user_mac->addr_bytes[RTE_ETHER_ADDR_LEN - 1] = 2288 iface_idx++ + '0'; 2289 goto success; 2290 } 2291
2292 if (parse_user_mac(user_mac, value) != 6) 2293 goto error; 2294 success: 2295 TAP_LOG(DEBUG, "TAP user MAC param (%s)", value); 2296 return 0; 2297
2298 error: 2299 TAP_LOG(ERR, "TAP user MAC (%s) is not in format (%s|%s)", 2300 value, ETH_TAP_MAC_FIXED, ETH_TAP_USR_MAC_FMT); 2301 return -1; 2302 } 2303
2304 /* 2305 * Open a TUN interface device. The TUN PMD 2306 * 1) creates the interface as type TUN (no Ethernet header), 2307 * 2) takes the interface name from the optional "iface" argument, 2308 * 3) reports a fixed 10G link speed, as the interface is virtual. 2309 */
2310 static int 2311 rte_pmd_tun_probe(struct rte_vdev_device *dev) 2312 { 2313 const char *name, *params; 2314 int ret; 2315 struct rte_kvargs *kvlist = NULL; 2316 char tun_name[RTE_ETH_NAME_MAX_LEN]; 2317 char remote_iface[RTE_ETH_NAME_MAX_LEN]; 2318 struct rte_eth_dev *eth_dev; 2319
2320 name = rte_vdev_device_name(dev); 2321 params = rte_vdev_device_args(dev); 2322 memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN); 2323
2324 if (rte_eal_process_type() == RTE_PROC_SECONDARY && 2325 strlen(params) == 0) { 2326 eth_dev = rte_eth_dev_attach_secondary(name); 2327 if (!eth_dev) { 2328 TAP_LOG(ERR, "Failed to probe %s", name); 2329 return -1; 2330 } 2331 eth_dev->dev_ops = &ops; 2332 eth_dev->device = &dev->device; 2333 rte_eth_dev_probing_finish(eth_dev); 2334 return 0; 2335 } 2336
2337 /* use dtun%d which causes kernel to choose next available */ 2338 strlcpy(tun_name, DEFAULT_TUN_NAME "%d", RTE_ETH_NAME_MAX_LEN); 2339
2340 if (params && (params[0] != '\0')) { 2341 TAP_LOG(DEBUG, "parameters (%s)", params); 2342
2343 kvlist = rte_kvargs_parse(params, valid_arguments); 2344 if (kvlist) { 2345 if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) { 2346 ret = rte_kvargs_process(kvlist, 2347 ETH_TAP_IFACE_ARG, 2348 &set_interface_name, 2349 tun_name); 2350
2351 if (ret == -1) 2352 goto leave; 2353 } 2354 } 2355 } 2356 pmd_link.link_speed = RTE_ETH_SPEED_NUM_10G; 2357
2358 TAP_LOG(DEBUG, "Initializing pmd_tun for %s", name); 2359
2360 ret = eth_dev_tap_create(dev, tun_name, remote_iface, NULL, 2361 ETH_TUNTAP_TYPE_TUN); 2362
2363 leave: 2364 if (ret == -1) { 2365 TAP_LOG(ERR, "Failed to create pmd for %s as %s", 2366 name, tun_name); 2367 } 2368 rte_kvargs_free(kvlist); 2369
2370 return ret; 2371 } 2372
2373 /* Request queue file descriptors from secondary to primary.
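 * The secondary sends an ipc_queues request carrying its port name over the
 * TAP_MP_KEY channel; the primary's reply holds rxq_count/txq_count and one
 * file descriptor per queue, which are stored in this process's
 * process_private fd arrays.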
*/ 2374 static int 2375 tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev) 2376 { 2377 int ret; 2378 struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0}; 2379 struct rte_mp_msg request, *reply; 2380 struct rte_mp_reply replies; 2381 struct ipc_queues *request_param = (struct ipc_queues *)request.param; 2382 struct ipc_queues *reply_param; 2383 struct pmd_process_private *process_private = dev->process_private; 2384 int queue, fd_iterator; 2385 2386 /* Prepare the request */ 2387 memset(&request, 0, sizeof(request)); 2388 strlcpy(request.name, TAP_MP_KEY, sizeof(request.name)); 2389 strlcpy(request_param->port_name, port_name, 2390 sizeof(request_param->port_name)); 2391 request.len_param = sizeof(*request_param); 2392 /* Send request and receive reply */ 2393 ret = rte_mp_request_sync(&request, &replies, &timeout); 2394 if (ret < 0 || replies.nb_received != 1) { 2395 TAP_LOG(ERR, "Failed to request queues from primary: %d", 2396 rte_errno); 2397 return -1; 2398 } 2399 reply = &replies.msgs[0]; 2400 reply_param = (struct ipc_queues *)reply->param; 2401 TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name); 2402 2403 /* Attach the queues from received file descriptors */ 2404 if (reply_param->rxq_count + reply_param->txq_count != reply->num_fds) { 2405 TAP_LOG(ERR, "Unexpected number of fds received"); 2406 return -1; 2407 } 2408 2409 dev->data->nb_rx_queues = reply_param->rxq_count; 2410 dev->data->nb_tx_queues = reply_param->txq_count; 2411 fd_iterator = 0; 2412 for (queue = 0; queue < reply_param->rxq_count; queue++) 2413 process_private->rxq_fds[queue] = reply->fds[fd_iterator++]; 2414 for (queue = 0; queue < reply_param->txq_count; queue++) 2415 process_private->txq_fds[queue] = reply->fds[fd_iterator++]; 2416 free(reply); 2417 return 0; 2418 } 2419 2420 /* Send the queue file descriptors from the primary process to secondary. 
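 * This runs in the primary as the rte_mp handler registered for TAP_MP_KEY
 * in rte_pmd_tap_probe(); the reply carries one fd per configured Rx and Tx
 * queue of the requested port.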
*/ 2421 static int 2422 tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer) 2423 { 2424 struct rte_eth_dev *dev; 2425 struct pmd_process_private *process_private; 2426 struct rte_mp_msg reply; 2427 const struct ipc_queues *request_param = 2428 (const struct ipc_queues *)request->param; 2429 struct ipc_queues *reply_param = 2430 (struct ipc_queues *)reply.param; 2431 int queue; 2432 2433 /* Get requested port */ 2434 TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name); 2435 dev = rte_eth_dev_get_by_name(request_param->port_name); 2436 if (!dev) { 2437 TAP_LOG(ERR, "Failed to get port id for %s", 2438 request_param->port_name); 2439 return -1; 2440 } 2441 process_private = dev->process_private; 2442 2443 /* Fill file descriptors for all queues */ 2444 reply.num_fds = 0; 2445 reply_param->rxq_count = 0; 2446 if (dev->data->nb_rx_queues + dev->data->nb_tx_queues > 2447 RTE_MP_MAX_FD_NUM){ 2448 TAP_LOG(ERR, "Number of rx/tx queues exceeds max number of fds"); 2449 return -1; 2450 } 2451 2452 for (queue = 0; queue < dev->data->nb_rx_queues; queue++) { 2453 reply.fds[reply.num_fds++] = process_private->rxq_fds[queue]; 2454 reply_param->rxq_count++; 2455 } 2456 RTE_ASSERT(reply_param->rxq_count == dev->data->nb_rx_queues); 2457 2458 reply_param->txq_count = 0; 2459 for (queue = 0; queue < dev->data->nb_tx_queues; queue++) { 2460 reply.fds[reply.num_fds++] = process_private->txq_fds[queue]; 2461 reply_param->txq_count++; 2462 } 2463 RTE_ASSERT(reply_param->txq_count == dev->data->nb_tx_queues); 2464 2465 /* Send reply */ 2466 strlcpy(reply.name, request->name, sizeof(reply.name)); 2467 strlcpy(reply_param->port_name, request_param->port_name, 2468 sizeof(reply_param->port_name)); 2469 reply.len_param = sizeof(*reply_param); 2470 if (rte_mp_reply(&reply, peer) < 0) { 2471 TAP_LOG(ERR, "Failed to reply an IPC request to sync queues"); 2472 return -1; 2473 } 2474 return 0; 2475 } 2476 2477 /* Open a TAP interface device. 
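 * In a primary process this creates the tap netdevice and its rte_flow
 * plumbing; in a secondary process it only attaches to the existing port and
 * fetches the queue fds over IPC. Example devargs (interface names are
 * illustrative):
 *   --vdev=net_tap0,iface=tap0,mac=fixed
 *   --vdev=net_tap1,iface=tap1,remote=eth0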
2478 */ 2479 static int 2480 rte_pmd_tap_probe(struct rte_vdev_device *dev) 2481 { 2482 const char *name, *params; 2483 int ret; 2484 struct rte_kvargs *kvlist = NULL; 2485 int speed; 2486 char tap_name[RTE_ETH_NAME_MAX_LEN]; 2487 char remote_iface[RTE_ETH_NAME_MAX_LEN]; 2488 struct rte_ether_addr user_mac = { .addr_bytes = {0} }; 2489 struct rte_eth_dev *eth_dev; 2490 int tap_devices_count_increased = 0; 2491 2492 name = rte_vdev_device_name(dev); 2493 params = rte_vdev_device_args(dev); 2494 2495 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 2496 eth_dev = rte_eth_dev_attach_secondary(name); 2497 if (!eth_dev) { 2498 TAP_LOG(ERR, "Failed to probe %s", name); 2499 return -1; 2500 } 2501 eth_dev->dev_ops = &ops; 2502 eth_dev->device = &dev->device; 2503 eth_dev->rx_pkt_burst = pmd_rx_burst; 2504 eth_dev->tx_pkt_burst = pmd_tx_burst; 2505 if (!rte_eal_primary_proc_alive(NULL)) { 2506 TAP_LOG(ERR, "Primary process is missing"); 2507 return -1; 2508 } 2509 eth_dev->process_private = (struct pmd_process_private *) 2510 rte_zmalloc_socket(name, 2511 sizeof(struct pmd_process_private), 2512 RTE_CACHE_LINE_SIZE, 2513 eth_dev->device->numa_node); 2514 if (eth_dev->process_private == NULL) { 2515 TAP_LOG(ERR, 2516 "Failed to alloc memory for process private"); 2517 return -1; 2518 } 2519 2520 ret = tap_mp_attach_queues(name, eth_dev); 2521 if (ret != 0) 2522 return -1; 2523 2524 if (!tap_devices_count) { 2525 ret = rte_mp_action_register(TAP_MP_REQ_START_RXTX, tap_mp_req_start_rxtx); 2526 if (ret < 0 && rte_errno != ENOTSUP) { 2527 TAP_LOG(ERR, "tap: Failed to register IPC callback: %s", 2528 strerror(rte_errno)); 2529 return -1; 2530 } 2531 } 2532 tap_devices_count++; 2533 rte_eth_dev_probing_finish(eth_dev); 2534 return 0; 2535 } 2536 2537 speed = RTE_ETH_SPEED_NUM_10G; 2538 2539 /* use tap%d which causes kernel to choose next available */ 2540 strlcpy(tap_name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN); 2541 memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN); 2542 2543 if (params && (params[0] != '\0')) { 2544 TAP_LOG(DEBUG, "parameters (%s)", params); 2545 2546 kvlist = rte_kvargs_parse(params, valid_arguments); 2547 if (kvlist) { 2548 if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) { 2549 ret = rte_kvargs_process(kvlist, 2550 ETH_TAP_IFACE_ARG, 2551 &set_interface_name, 2552 tap_name); 2553 if (ret == -1) 2554 goto leave; 2555 } 2556 2557 if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) { 2558 ret = rte_kvargs_process(kvlist, 2559 ETH_TAP_REMOTE_ARG, 2560 &set_remote_iface, 2561 remote_iface); 2562 if (ret == -1) 2563 goto leave; 2564 } 2565 2566 if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) { 2567 ret = rte_kvargs_process(kvlist, 2568 ETH_TAP_MAC_ARG, 2569 &set_mac_type, 2570 &user_mac); 2571 if (ret == -1) 2572 goto leave; 2573 } 2574 } 2575 } 2576 pmd_link.link_speed = speed; 2577 2578 TAP_LOG(DEBUG, "Initializing pmd_tap for %s", name); 2579 2580 /* Register IPC feed callback */ 2581 if (!tap_devices_count) { 2582 ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues); 2583 if (ret < 0 && rte_errno != ENOTSUP) { 2584 TAP_LOG(ERR, "tap: Failed to register IPC callback: %s", 2585 strerror(rte_errno)); 2586 goto leave; 2587 } 2588 } 2589 tap_devices_count++; 2590 tap_devices_count_increased = 1; 2591 ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac, 2592 ETH_TUNTAP_TYPE_TAP); 2593 2594 leave: 2595 if (ret == -1) { 2596 TAP_LOG(ERR, "Failed to create pmd for %s as %s", 2597 name, tap_name); 2598 if (tap_devices_count_increased == 1) { 2599 if 
(tap_devices_count == 1) 2600 rte_mp_action_unregister(TAP_MP_KEY); 2601 tap_devices_count--; 2602 } 2603 } 2604 rte_kvargs_free(kvlist); 2605 2606 return ret; 2607 } 2608 2609 /* detach a TUNTAP device. 2610 */ 2611 static int 2612 rte_pmd_tap_remove(struct rte_vdev_device *dev) 2613 { 2614 struct rte_eth_dev *eth_dev = NULL; 2615 2616 /* find the ethdev entry */ 2617 eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev)); 2618 if (!eth_dev) 2619 return 0; 2620 2621 tap_dev_close(eth_dev); 2622 rte_eth_dev_release_port(eth_dev); 2623 2624 return 0; 2625 } 2626 2627 static struct rte_vdev_driver pmd_tun_drv = { 2628 .probe = rte_pmd_tun_probe, 2629 .remove = rte_pmd_tap_remove, 2630 }; 2631 2632 static struct rte_vdev_driver pmd_tap_drv = { 2633 .probe = rte_pmd_tap_probe, 2634 .remove = rte_pmd_tap_remove, 2635 }; 2636 2637 RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv); 2638 RTE_PMD_REGISTER_VDEV(net_tun, pmd_tun_drv); 2639 RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap); 2640 RTE_PMD_REGISTER_PARAM_STRING(net_tun, 2641 ETH_TAP_IFACE_ARG "=<string> "); 2642 RTE_PMD_REGISTER_PARAM_STRING(net_tap, 2643 ETH_TAP_IFACE_ARG "=<string> " 2644 ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_ARG_FMT " " 2645 ETH_TAP_REMOTE_ARG "=<string>"); 2646 RTE_LOG_REGISTER_DEFAULT(tap_logtype, NOTICE); 2647
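/*
 * Example devargs matching the parameter strings registered above
 * (interface name is illustrative):
 *   --vdev=net_tun0,iface=tun0
 * "eth_tap" remains available as a legacy alias for "net_tap".
 */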