/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#define PAGE_SIZE   4096
#define PAGE_SHIFT  12
#define PAGE_MASK   (PAGE_SIZE - 1)
#define trunc_page(x)   ((x) & ~PAGE_MASK)
#define round_page(x)   (((x) + PAGE_MASK) & ~PAGE_MASK)

extern struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
extern struct lcore_conf lcore_conf;

// ff_ref_pool allocates rte_mbufs with no data room of their own; each mbuf's data
// pointer is later pointed at the data address of a bsd mbuf (see the sketch below).
static struct rte_mempool *ff_ref_pool[NB_SOCKETS];

/* Ring-index helpers; wrapped in do { } while (0) so each expands to a single
 * statement and stays safe inside an unbraced if/else. */
#define Head_INC(h) do { \
    if (++(h) >= TX_QUEUE_SIZE) \
        (h) = 0; \
} while (0)

#define Head_DEC(h) do { \
    if (--(h) < 0) \
        (h) = TX_QUEUE_SIZE - 1; \
} while (0)
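/*
 * Illustrative sketch only (assumed usage, mirroring what ff_bsd_to_rte()
 * further below actually does): an mbuf taken from ff_ref_pool owns no data
 * room, so its buffer pointers are redirected to memory owned by a bsd mbuf
 * instead of being filled by copying. The names bsd_data/bsd_data_len and
 * socket_id are hypothetical.
 *
 *   struct rte_mbuf *ref = rte_pktmbuf_alloc(ff_ref_pool[socket_id]);
 *   ref->buf_addr     = bsd_data;                    // borrowed, not copied
 *   ref->buf_physaddr = ff_mem_virt2phy(bsd_data);   // NIC-visible address
 *   ref->data_off     = 0;
 *   ref->data_len     = bsd_data_len;
 */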
// bsd mbufs are moved from the temporary tx tables into nic_tx_ring after
// rte_eth_tx_burst() succeeds.
static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS];

static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num);
static inline void ff_txring_init(struct mbuf_txring *r, uint32_t len);

typedef struct _list_manager_s
{
    uint64_t    *ele;
    int         size;
    int         top;
} StackList_t;

static StackList_t ff_mpage_ctl = {0};
static uint64_t ff_page_start = 0, ff_page_end = 0;
static phys_addr_t *ff_mpage_phy = NULL;

static inline void *stklist_pop(StackList_t *p);
static inline int stklist_push(StackList_t *p, uint64_t val);

static int stklist_init(StackList_t *p, int size)
{
    if (p == NULL || size <= 0) {
        return -1;
    }
    p->size = size;
    p->top = 0;
    if (posix_memalign((void **)&p->ele, sizeof(uint64_t), sizeof(uint64_t) * size) != 0)
        return -2;

    return 0;
}

static inline void *stklist_pop(StackList_t *p)
{
    if (p == NULL)
        return NULL;

    if (p->top > 0)
        return (void *)p->ele[--p->top];

    return NULL;
}

// val: the value to push onto the free stack.
// return code: -1: failed; 0: OK.
static inline int stklist_push(StackList_t *p, const uint64_t val)
{
    if (p == NULL)
        return -1;

    if (p->top < p->size) {
        p->ele[p->top++] = val;
        return 0;
    }

    return -1;
}

static inline int stklist_size(StackList_t *p)
{
    return p->size;
}

// Store a 64-bit value (typically a pointer) in the rte_mbuf's private data area.
static inline int ff_mbuf_set_uint64(struct rte_mbuf *p, uint64_t data)
{
    if (rte_pktmbuf_priv_size(p->pool) >= sizeof(uint64_t))
        *((uint64_t *)(p + 1)) = data;
    return 0;
}

/*************************
 * If an mbuf chain has num segments in all, the device's sw_ring uses num
 * descriptors, and ff_txring likewise uses num slots, laid out as below:
 *      <---   num-1   ---->|ptr| head |
 * -----------------------------------------------
 * |  0  |  0  | ..............|  0  |  p  | XXX |
 * -----------------------------------------------
 *************************/
static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num)
{
    int i = 0;

    for (i = 0; i < seg_num - 1; i++) {
        if (q->m_table[q->head]) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
        Head_INC(q->head);
    }
    if (q->m_table[q->head])
        ff_mbuf_free(q->m_table[q->head]);
    q->m_table[q->head] = p;
    Head_INC(q->head);

    return 0;
}
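/*
 * A sketch of how the ring above is meant to be used (assumed flow; the real
 * transmit loop lives outside this file): after rte_eth_tx_burst() accepts a
 * packet, its bsd mbuf is parked here via ff_enq_tx_bsdmbuf(), occupying as
 * many slots as the packet had segments. The bsd mbuf is freed lazily, only
 * when a later enqueue wraps around and reuses its slot. The names tmp_table,
 * bsd_table and nb_pkts are hypothetical.
 *
 *   nb_tx = rte_eth_tx_burst(port, queue, tmp_table, nb_pkts);
 *   for (i = 0; i < nb_tx; i++)
 *       ff_enq_tx_bsdmbuf(port, bsd_table[i], tmp_table[i]->nb_segs);
 */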
// Pop entries back out, starting from head-1.
static inline int ff_txring_pop(struct mbuf_txring *q, int num)
{
    int i = 0;

    for (i = 0; i < num; i++) {
        Head_DEC(q->head);
        if ((i == 0 && q->m_table[q->head] == NULL) ||
            (i > 0 && q->m_table[q->head] != NULL)) {
            rte_panic("ff_txring_pop fatal error!");
        }
        if (q->m_table[q->head] != NULL) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
    }

    return 0;
}

static inline void ff_txring_init(struct mbuf_txring *q, uint32_t num)
{
    memset(q, 0, sizeof(struct mbuf_txring) * num);
}

void ff_init_ref_pool(int nb_mbuf, int socketid)
{
    char s[64] = {0};

    if (ff_ref_pool[socketid] != NULL) {
        return;
    }
    snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid);
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf, MEMPOOL_CACHE_SIZE, 0, 0, socketid);
    } else {
        ff_ref_pool[socketid] = rte_mempool_lookup(s);
    }
}

int ff_mmap_init()
{
    int i = 0;
    uint64_t virt_addr = 0;
    uint64_t bsd_memsz = ((uint64_t)ff_global_cfg.freebsd.mem_size << 20);
    unsigned int bsd_pagesz = 0;

    ff_page_start = (uint64_t)mmap(NULL, bsd_memsz, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
    if (ff_page_start == (uint64_t)-1) {
        rte_panic("ff_mmap_init get ff_page_start failed, err=%d.\n", errno);
        return -1;
    }

    if (mlock((void *)ff_page_start, bsd_memsz) < 0) {
        rte_panic("mlock failed, err=%d.\n", errno);
        return -1;
    }
    ff_page_end = ff_page_start + bsd_memsz;
    bsd_pagesz = (bsd_memsz >> PAGE_SHIFT);
    rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %d pages, %d MB.\n",
        bsd_pagesz, ff_global_cfg.freebsd.mem_size);
    printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end);

    if (posix_memalign((void **)&ff_mpage_phy, sizeof(phys_addr_t),
            bsd_pagesz * sizeof(phys_addr_t)) != 0) {
        rte_panic("posix_memalign get ff_mpage_phy failed, err=%d.\n", errno);
        return -1;
    }

    stklist_init(&ff_mpage_ctl, bsd_pagesz);

    for (i = 0; i < bsd_pagesz; i++) {
        virt_addr = ff_page_start + (uint64_t)PAGE_SIZE * i;
        memset((void *)virt_addr, 0, PAGE_SIZE);

        stklist_push(&ff_mpage_ctl, virt_addr);
        ff_mpage_phy[i] = rte_mem_virt2phy((const void *)virt_addr);
        if (ff_mpage_phy[i] == RTE_BAD_IOVA) {
            rte_panic("rte_mem_virt2phy return invalid address.");
            return -1;
        }
    }

    ff_txring_init(&nic_tx_ring[0], RTE_MAX_ETHPORTS);

    return 0;
}

// 1: the address lies inside the f-stack page region; 0: it does not (e.g. it lives in a DPDK pool).
static inline int ff_chk_vma(const uint64_t virtaddr)
{
    return !!(virtaddr > ff_page_start && virtaddr < ff_page_end);
}
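/*
 * Worked example of the sizing above (illustrative numbers): with
 * freebsd.mem_size = 256 in the config, ff_mmap_init() maps 256 MB, i.e.
 * (256 << 20) >> PAGE_SHIFT = 65536 pages of 4 KB. ff_mpage_phy then holds
 * 65536 physical addresses (512 KB on a 64-bit build), the free-page stack
 * starts out full with 65536 entries, and ff_chk_vma() treats any pointer
 * inside that 256 MB window as f-stack-owned.
 */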
/*
 * Get physical address of any mapped virtual address in the current process.
 */
static inline uint64_t ff_mem_virt2phy(const void *virtaddr)
{
    uint64_t addr = 0;
    uint32_t pages = 0;

    pages = (((uint64_t)virtaddr - ff_page_start) >> PAGE_SHIFT);
    if (pages >= stklist_size(&ff_mpage_ctl)) {
        rte_panic("ff_mem_virt2phy got invalid page index %d.", pages);
        return -1;
    }

    addr = ff_mpage_phy[pages] + ((const uint64_t)virtaddr & PAGE_MASK);
    return addr;
}

void *ff_mem_get_page()
{
    return (void *)stklist_pop(&ff_mpage_ctl);
}

int ff_mem_free_addr(void *p)
{
    stklist_push(&ff_mpage_ctl, (const uint64_t)p);
    return 0;
}

static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void *m, struct rte_mbuf *head)
{
    void *data = NULL;
    struct ff_tx_offload offload = {0};

    ff_mbuf_tx_offload(m, &offload);
    data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   into account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }
}

// Create an rte_mbuf that references data handed down from the bsd stack as an EXT_CLUSTER.
static inline struct rte_mbuf *ff_extcl_to_rte(void *m)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *src_mbuf = NULL;
    struct rte_mbuf *p_head = NULL;

    src_mbuf = (struct rte_mbuf *)ff_rte_frm_extcl(m);
    if (NULL == src_mbuf) {
        return NULL;
    }
    p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }

    return p_head;
}
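/*
 * Design note: rte_pktmbuf_clone() above allocates indirect mbufs that attach
 * to src_mbuf's buffers and bump its reference count, so the EXT_CLUSTER path
 * hands the original receive buffer back to the NIC without copying the
 * payload.
 */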
// Create an rte_mbuf chain that references the data held in a bsd mbuf chain.
static inline struct rte_mbuf *ff_bsd_to_rte(void *m, int total)
{
    struct rte_mempool *mbuf_pool = ff_ref_pool[lcore_conf.socket_id];
    struct rte_mbuf *p_head = NULL;
    struct rte_mbuf *cur = NULL, *prev = NULL;
    void *data = NULL;
    void *p_bsdbuf = NULL;
    unsigned len = 0;

    p_head = rte_pktmbuf_alloc(mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }
    p_head->pkt_len = total;
    p_head->nb_segs = 0;
    cur = p_head;
    p_bsdbuf = m;
    while (p_bsdbuf) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(p_head);
                return NULL;
            }
        }
        ff_next_mbuf(&p_bsdbuf, &data, &len);    // p_bsdbuf advances to the next bsd mbuf.
        cur->buf_addr = data;
        cur->buf_physaddr = ff_mem_virt2phy((const void *)(cur->buf_addr));
        cur->data_off = 0;
        cur->data_len = len;

        p_head->nb_segs++;
        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;
        cur = NULL;
    }

    return p_head;
}

int ff_if_send_onepkt(struct ff_dpdk_if_context *ctx, void *m, int total)
{
    struct rte_mbuf *head = NULL;
    void *p_data = NULL;
    struct lcore_conf *qconf = NULL;
    unsigned len = 0;

    if (!m) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt was passed a NULL bsd mbuf.");
        return 0;
    }
    p_data = ff_mbuf_mtod(m);
    if (ff_chk_vma((uint64_t)p_data)) {
        head = ff_bsd_to_rte(m, total);
    } else if ((head = ff_extcl_to_rte(m)) == NULL) {
        rte_panic("data address 0x%lx is out of page bounds and was not allocated by the DPDK receiver.", (uint64_t)p_data);
        return 0;
    }

    if (head == NULL) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt: ff_bsd_to_rte failed.");
        ff_mbuf_free(m);
        return 0;
    }

    ff_offload_set(ctx, m, head);
    qconf = &lcore_conf;
    len = qconf->tx_mbufs[ctx->port_id].len;
    qconf->tx_mbufs[ctx->port_id].m_table[len] = head;
    qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m;
    len++;

    return len;
}

int ff_enq_tx_bsdmbuf(uint8_t portid, void *p_mbuf, int nb_segs)
{
    return ff_txring_enqueue(&nic_tx_ring[portid], p_mbuf, nb_segs);
}
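/*
 * Plausible caller-side flow (a sketch under assumptions, not code from this
 * file): the transmit path fills the per-port staging tables through
 * ff_if_send_onepkt(), flushes them with rte_eth_tx_burst(), and only then
 * parks the corresponding bsd mbufs in nic_tx_ring via ff_enq_tx_bsdmbuf().
 * The variable names tx, queue and i are illustrative.
 *
 *   qconf->tx_mbufs[port].len = ff_if_send_onepkt(ctx, bsd_m, total);
 *   ...
 *   sent = rte_eth_tx_burst(port, queue, tx->m_table, tx->len);
 *   for (i = 0; i < sent; i++)
 *       ff_enq_tx_bsdmbuf(port, tx->bsd_m_table[i], tx->m_table[i]->nb_segs);
 */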