/*
 * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#define PAGE_SIZE   4096
#define PAGE_SHIFT  12
#define PAGE_MASK   (PAGE_SIZE - 1)
#define trunc_page(x)   ((x) & ~PAGE_MASK)
#define round_page(x)   (((x) + PAGE_MASK) & ~PAGE_MASK)

extern struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
extern struct lcore_conf lcore_conf;

//struct ff_tx_offload;

/*
 * ff_ref_pool allocates rte_mbufs without data space of their own; each mbuf's
 * data pointer is set to the data address of a bsd mbuf.
 */
static struct rte_mempool *ff_ref_pool[NB_SOCKETS];

/* Advance the tx-ring head by one slot, wrapping at TX_QUEUE_SIZE. */
#define Head_INC(h) do { \
    if (++(h) >= TX_QUEUE_SIZE) \
        (h) = 0; \
} while (0)

/* Move the tx-ring head back by one slot, wrapping below zero. */
#define Head_DEC(h) do { \
    if (--(h) < 0) \
        (h) = TX_QUEUE_SIZE - 1; \
} while (0)

/*
 * bsd mbufs are moved from the tmp tables into nic_tx_ring after
 * rte_eth_tx_burst() succeeds.
 */
static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS];
static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num);
static inline void ff_txring_init(struct mbuf_txring *r, uint32_t len);

typedef struct _list_manager_s
{
    uint64_t    *ele;
    int         size;
    //int       FreeNum;
    int         top;
} StackList_t;

static StackList_t ff_mpage_ctl = {0};
static uint64_t ff_page_start = 0, ff_page_end = 0;
static phys_addr_t *ff_mpage_phy = NULL;

static inline void *stklist_pop(StackList_t *p);
static inline int stklist_push(StackList_t *p, uint64_t val);

static int stklist_init(StackList_t *p, int size)
{
    if (p == NULL || size <= 0) {
        return -1;
    }
    p->size = size;
    p->top = 0;
    if (posix_memalign((void **)&p->ele, sizeof(uint64_t), sizeof(uint64_t) * size) != 0)
        return -2;

    return 0;
}

static inline void *stklist_pop(StackList_t *p)
{
    if (p == NULL)
        return NULL;

    if (p->top > 0) {
        return (void *)p->ele[--p->top];
    } else {
        return NULL;
    }
}

// val: the element to push back onto the free stack.
// return code: -1: failed (p is NULL or the stack is full); 0: OK.
static inline int stklist_push(StackList_t *p, const uint64_t val)
{
    if (p == NULL)
        return -1;
    if (p->top < p->size) {
        p->ele[p->top++] = val;
        return 0;
    } else {
        return -1;
    }
}

static inline int stklist_size(StackList_t *p)
{
    return p->size;
}

// Store a 64-bit value (e.g. a pointer) in the rte_mbuf's private data area.
static inline int ff_mbuf_set_uint64(struct rte_mbuf *p, uint64_t data)
{
    if (rte_pktmbuf_priv_size(p->pool) >= sizeof(uint64_t))
        *((uint64_t *)(p + 1)) = data;
    return 0;
}

/*************************
 * If an mbuf chain has num segments in all, the device's sw_ring consumes num
 * descriptors; ff_txring likewise uses num slots, laid out as below:
 *      <--- num-1 ---->|ptr| head |
 * ----------------------------------------------
 * | 0 | 0 | ..............| 0 | p | XXX |
 * ----------------------------------------------
 *************************/
static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num)
{
    int i = 0;

    for (i = 0; i < seg_num - 1; i++) {
        if (q->m_table[q->head]) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
        Head_INC(q->head);
    }
    if (q->m_table[q->head])
        ff_mbuf_free(q->m_table[q->head]);
    q->m_table[q->head] = p;
    Head_INC(q->head);

    return 0;
}

/*
 * Pop num slots backwards from the head, undoing the most recent enqueue: the
 * first slot popped must hold the stored bsd mbuf pointer and the remaining
 * num-1 slots must be empty, otherwise the ring is corrupted.
 */
static inline int ff_txring_pop(struct mbuf_txring *q, int num)
{
    int i = 0;

    for (i = 0; i < num; i++) {
        Head_DEC(q->head);
        if ((i == 0 && q->m_table[q->head] == NULL) || (i > 0 && q->m_table[q->head] != NULL)) {
            rte_panic("ff_txring_pop fatal error!");
        }
        if (q->m_table[q->head] != NULL) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
    }

    return 0;
}

// Zero the whole tx-ring array (num entries).
static inline void ff_txring_init(struct mbuf_txring *q, uint32_t num)
{
    memset(q, 0, sizeof(struct mbuf_txring) * num);
}
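/*
 * Illustrative sketch (editor's addition, not built): what one enqueue does to
 * the ring. For a packet whose rte_mbuf chain has 3 segments, ff_txring_enqueue()
 * walks 3 slots, freeing anything still parked there, and stores the bsd mbuf
 * pointer only in the last slot, mirroring the 3 descriptors the NIC consumed:
 *
 *     before:  ... | a | b | c | ...      head at 'a'
 *     after:   ... | 0 | 0 | p | ...      head just past 'p'
 *
 * ff_txring_pop() walks the same slots backwards and frees 'p', which appears
 * intended for rolling back the most recent enqueue. The caller below is
 * hypothetical.
 */
#if 0
static void ff_txring_usage_example(uint8_t portid, void *bsd_mbuf, struct rte_mbuf *pkt)
{
    int nb_segs = pkt->nb_segs;

    /* park the bsd mbuf once the NIC has taken the packet's descriptors */
    ff_txring_enqueue(&nic_tx_ring[portid], bsd_mbuf, nb_segs);

    /* ...and, if the send later has to be undone, roll the ring back */
    ff_txring_pop(&nic_tx_ring[portid], nb_segs);
}
#endif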
void ff_init_ref_pool(int nb_mbuf, int socketid)
{
    char s[64] = {0};

    if (ff_ref_pool[socketid] != NULL) {
        return;
    }
    snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid);
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf,
            MEMPOOL_CACHE_SIZE, 0, 0, socketid);
    } else {
        ff_ref_pool[socketid] = rte_mempool_lookup(s);
    }
}

int ff_mmap_init()
{
    int i = 0;
    uint64_t virt_addr = 0;
    uint64_t bsd_memsz = ((uint64_t)ff_global_cfg.freebsd.mem_size << 20);
    unsigned int bsd_pagesz = 0;

    ff_page_start = (uint64_t)mmap(NULL, bsd_memsz, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
    if (ff_page_start == (uint64_t)-1) {
        rte_panic("ff_mmap_init get ff_page_start failed, err=%d.\n", errno);
        return -1;
    }

    if (mlock((void *)ff_page_start, bsd_memsz) < 0) {
        rte_panic("mlock failed, err=%d.\n", errno);
        return -1;
    }
    ff_page_end = ff_page_start + bsd_memsz;
    bsd_pagesz = (bsd_memsz >> PAGE_SHIFT);
    rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %u pages, %d MB.\n",
        bsd_pagesz, ff_global_cfg.freebsd.mem_size);
    printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end);

    if (posix_memalign((void **)&ff_mpage_phy, sizeof(phys_addr_t),
            bsd_pagesz * sizeof(phys_addr_t)) != 0) {
        rte_panic("posix_memalign get ff_mpage_phy failed, err=%d.\n", errno);
        return -1;
    }

    stklist_init(&ff_mpage_ctl, bsd_pagesz);

    for (i = 0; i < bsd_pagesz; i++) {
        virt_addr = ff_page_start + PAGE_SIZE * i;
        memset((void *)virt_addr, 0, PAGE_SIZE);

        stklist_push(&ff_mpage_ctl, virt_addr);
        ff_mpage_phy[i] = rte_mem_virt2phy((const void *)virt_addr);
        if (ff_mpage_phy[i] == RTE_BAD_IOVA) {
            rte_panic("rte_mem_virt2phy return invalid address.");
            return -1;
        }
    }

    ff_txring_init(&nic_tx_ring[0], RTE_MAX_ETHPORTS);

    return 0;
}

// Returns 1 if the address lies inside the f-stack page region; 0 if it lies
// outside, i.e. the data was allocated from the DPDK mbuf pool.
static inline int ff_chk_vma(const uint64_t virtaddr)
{
    return !!(virtaddr > ff_page_start && virtaddr < ff_page_end);
}

/*
 * Get the physical address of a virtual address that lies inside the f-stack
 * page region mapped by ff_mmap_init().
 */
static inline uint64_t ff_mem_virt2phy(const void *virtaddr)
{
    uint64_t addr = 0;
    uint32_t pages = 0;

    pages = (((uint64_t)virtaddr - (uint64_t)ff_page_start) >> PAGE_SHIFT);
    if (pages >= stklist_size(&ff_mpage_ctl)) {
        rte_panic("ff_mem_virt2phy get invalid page index %u.", pages);
        return -1;
    }

    addr = ff_mpage_phy[pages] + ((const uint64_t)virtaddr & PAGE_MASK);
    return addr;
}

void *ff_mem_get_page()
{
    return (void *)stklist_pop(&ff_mpage_ctl);
}

int ff_mem_free_addr(void *p)
{
    stklist_push(&ff_mpage_ctl, (const uint64_t)p);
    return 0;
}
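/*
 * Illustrative sketch (editor's addition, not built): how the page pool set up
 * by ff_mmap_init() is expected to be consumed. The bsd stack obtains 4 KB
 * pages with ff_mem_get_page(), addresses inside those pages are translated
 * with ff_mem_virt2phy(), and pages go back onto the free stack with
 * ff_mem_free_addr(). The caller below is hypothetical.
 */
#if 0
static void ff_page_pool_example(void)
{
    void *page = ff_mem_get_page();     /* pop a 4 KB page from the mlock'ed region */
    if (page == NULL)
        return;                         /* pool exhausted */

    /* physical address of some offset inside the page */
    uint64_t phys = ff_mem_virt2phy((char *)page + 128);
    (void)phys;

    ff_mem_free_addr(page);             /* push the page back when it is released */
}
#endif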
static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void *m, struct rte_mbuf *head)
{
    void *data = NULL;
    struct ff_tx_offload offload = {0};

    ff_mbuf_tx_offload(m, &offload);
    data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }
}

// Create an rte_mbuf referring to data transmitted from the bsd stack via an
// EXT_CLUSTER, i.e. data whose backing buffer is already an rte_mbuf owned by DPDK.
static inline struct rte_mbuf *ff_extcl_to_rte(void *m)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *src_mbuf = NULL;
    struct rte_mbuf *p_head = NULL;

    src_mbuf = (struct rte_mbuf *)ff_rte_frm_extcl(m);
    if (NULL == src_mbuf) {
        return NULL;
    }
    p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }

    return p_head;
}
// Create an rte_mbuf chain whose segments refer to the data held in the bsd mbuf chain.
static inline struct rte_mbuf *ff_bsd_to_rte(void *m, int total)
{
    struct rte_mempool *mbuf_pool = ff_ref_pool[lcore_conf.socket_id];
    struct rte_mbuf *p_head = NULL;
    struct rte_mbuf *cur = NULL, *prev = NULL;
    void *data = NULL;
    void *p_bsdbuf = NULL;
    unsigned len = 0;

    p_head = rte_pktmbuf_alloc(mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }
    p_head->pkt_len = total;
    p_head->nb_segs = 0;
    cur = p_head;
    p_bsdbuf = m;
    while (p_bsdbuf) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(p_head);
                return NULL;
            }
        }
        ff_next_mbuf(&p_bsdbuf, &data, &len);    // advance p_bsdbuf to the next bsd mbuf
        cur->buf_addr = data;
        cur->buf_physaddr = ff_mem_virt2phy((const void *)(cur->buf_addr));
        cur->data_off = 0;
        cur->data_len = len;

        p_head->nb_segs++;
        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;
        cur = NULL;
    }

    return p_head;
}

int ff_if_send_onepkt(struct ff_dpdk_if_context *ctx, void *m, int total)
{
    struct rte_mbuf *head = NULL;
    void *p_data = NULL;
    struct lcore_conf *qconf = NULL;
    unsigned len = 0;

    if (!m) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt input invalid NULL address.");
        return 0;
    }
    p_data = ff_mbuf_mtod(m);
    if (ff_chk_vma((uint64_t)p_data)) {
        head = ff_bsd_to_rte(m, total);
    } else if ((head = ff_extcl_to_rte(m)) == NULL) {
        rte_panic("data address 0x%lx is out of the page bound and was not allocated by the DPDK receiver.", (uint64_t)p_data);
        return 0;
    }

    if (head == NULL) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt call ff_bsd_to_rte failed.");
        ff_mbuf_free(m);
        return 0;
    }

    ff_offload_set(ctx, m, head);
    qconf = &lcore_conf;
    len = qconf->tx_mbufs[ctx->port_id].len;
    qconf->tx_mbufs[ctx->port_id].m_table[len] = head;
    qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m;
    len++;

    return len;
}

int ff_enq_tx_bsdmbuf(uint8_t portid, void *p_mbuf, int nb_segs)
{
    return ff_txring_enqueue(&nic_tx_ring[portid], p_mbuf, nb_segs);
}
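/*
 * Illustrative sketch (editor's addition, not built): the expected shape of the
 * transmit path around this file. ff_if_send_onepkt() stages the rte_mbuf and
 * the originating bsd mbuf in the per-lcore tx tables; after rte_eth_tx_burst()
 * accepts the packet, the bsd mbuf is parked in nic_tx_ring through
 * ff_enq_tx_bsdmbuf() so it is freed only when its descriptor slots are
 * recycled. The caller and its bookkeeping below are hypothetical; the real
 * burst/flush logic lives outside this file.
 */
#if 0
static void ff_tx_path_example(struct ff_dpdk_if_context *ctx, void *bsd_m, int total)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t port = ctx->port_id;
    int len;

    len = ff_if_send_onepkt(ctx, bsd_m, total);     /* stage one packet */
    qconf->tx_mbufs[port].len = len;

    if (len > 0) {
        struct rte_mbuf *pkt = qconf->tx_mbufs[port].m_table[len - 1];
        int nb_segs = pkt->nb_segs;                 /* record before the NIC may free it */

        if (rte_eth_tx_burst(port, 0, &pkt, 1) == 1) {
            /* hand the bsd mbuf over to the ring for deferred free */
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[len - 1], nb_segs);
            qconf->tx_mbufs[port].len = 0;
        }
    }
}
#endif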