/*
 * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#define PAGE_SIZE 4096
#define PAGE_SHIFT 12
#define PAGE_MASK (PAGE_SIZE - 1)
#define trunc_page(x) ((x) & ~PAGE_MASK)
#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK)
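/*
 * Worked example: with 4 KB pages, trunc_page(0x1234) == 0x1000 and
 * round_page(0x1234) == 0x2000; addresses that are already page-aligned
 * are left unchanged by both macros.
 */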

extern struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
extern struct lcore_conf lcore_conf;

//struct ff_tx_offload;

// ff_ref_pool allocates rte_mbufs without their own data space; each mbuf's
// data pointer references the corresponding bsd mbuf's data address.
static struct rte_mempool *ff_ref_pool[NB_SOCKETS];

#define Head_INC(h) do { \
    if (++(h) >= TX_QUEUE_SIZE) \
        (h) = 0; \
} while (0)

#define Head_DEC(h) do { \
    if (--(h) < 0) \
        (h) = TX_QUEUE_SIZE - 1; \
} while (0)
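/*
 * Illustrative sketch (assuming TX_QUEUE_SIZE == 4, an arbitrary value):
 * from h = 3, Head_INC(h) wraps h to 0; from h = 0, Head_DEC(h) wraps h
 * back to 3. Both macros treat h as an index into a circular buffer of
 * TX_QUEUE_SIZE slots.
 */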

// The bsd mbuf is moved from the temporary tables into nic_tx_ring after
// rte_eth_tx_burst() succeeds.
static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS];
static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num);
static inline void ff_txring_init(struct mbuf_txring *r, uint32_t len);

typedef struct _list_manager_s
{
    uint64_t *ele;
    int size;
    //int FreeNum;
    int top;
} StackList_t;

static StackList_t ff_mpage_ctl = {0};
static uint64_t ff_page_start = 0, ff_page_end = 0;
static phys_addr_t *ff_mpage_phy = NULL;

static inline void *stklist_pop(StackList_t *p);
static inline int stklist_push(StackList_t *p, uint64_t val);

static int stklist_init(StackList_t *p, int size)
{
    if (p == NULL || size <= 0) {
        return -1;
    }
    p->size = size;
    p->top = 0;
    if (posix_memalign((void **)&p->ele, sizeof(uint64_t), sizeof(uint64_t) * size) != 0)
        return -2;

    return 0;
}

static inline void *stklist_pop(StackList_t *p)
{
    if (p == NULL)
        return NULL;

    if (p->top > 0)
        return (void *)p->ele[--p->top];

    return NULL;
}

// val: the element to push.
// return: 0 on success; -1 on failure (NULL list or stack full).
static inline int stklist_push(StackList_t *p, const uint64_t val)
{
    if (p == NULL)
        return -1;
    if (p->top < p->size) {
        p->ele[p->top++] = val;
        return 0;
    }

    return -1;
}

static inline int stklist_size(StackList_t *p)
{
    return p->size;
}
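/*
 * Usage sketch for StackList_t (illustrative only; the size and the pushed
 * value are arbitrary):
 *
 *   StackList_t s;
 *   if (stklist_init(&s, 128) == 0) {
 *       stklist_push(&s, (uint64_t)some_addr);  // store an address
 *       void *v = stklist_pop(&s);              // LIFO: returns some_addr
 *   }
 *
 * stklist_pop() returns NULL when the stack is empty, so callers must
 * treat NULL as "no element available".
 */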

// Store a uint64_t in the rte_mbuf's private data area (located just
// behind the rte_mbuf structure itself).
static inline int ff_mbuf_set_uint64(struct rte_mbuf *p, uint64_t data)
{
    if (rte_pktmbuf_priv_size(p->pool) >= sizeof(uint64_t))
        *((uint64_t *)(p + 1)) = data;
    return 0;
}

/*************************
 * If an mbuf has num segments in total, the device's sw_ring consumes num
 * descriptors, and ff_txring likewise uses num slots, laid out as below:
 * <--- num-1 ---->|ptr| head |
 * ----------------------------------------------
 * |  0 | 0 | ..............| 0 |  p  |   XXX  |
 *-----------------------------------------------
 *************************/
static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num)
{
    int i = 0;

    for (i = 0; i < seg_num - 1; i++) {
        if (q->m_table[q->head]) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
        Head_INC(q->head);
    }
    if (q->m_table[q->head])
        ff_mbuf_free(q->m_table[q->head]);
    q->m_table[q->head] = p;
    Head_INC(q->head);

    return 0;
}

// Pop num entries backwards, starting from head-1.
static inline void ff_txring_pop(struct mbuf_txring *q, int num)
{
    int i = 0;

    for (i = 0; i < num; i++) {
        Head_DEC(q->head);
        if ((i == 0 && q->m_table[q->head] == NULL) ||
            (i > 0 && q->m_table[q->head] != NULL)) {
            rte_panic("ff_txring_pop fatal error!");
        }
        if (q->m_table[q->head] != NULL) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
    }
}

static inline void ff_txring_init(struct mbuf_txring *q, uint32_t num)
{
    memset(q, 0, sizeof(struct mbuf_txring) * num);
}

void ff_init_ref_pool(int nb_mbuf, int socketid)
{
    char s[64] = {0};

    if (ff_ref_pool[socketid] != NULL) {
        return;
    }
    snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid);
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf,
            MEMPOOL_CACHE_SIZE, 0, 0, socketid);
    } else {
        ff_ref_pool[socketid] = rte_mempool_lookup(s);
    }
}

int ff_mmap_init()
{
    unsigned int i = 0;
    uint64_t virt_addr = 0;
    uint64_t bsd_memsz = ((uint64_t)ff_global_cfg.freebsd.mem_size << 20);
    unsigned int bsd_pagesz = 0;

    ff_page_start = (uint64_t)mmap(NULL, bsd_memsz, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
    if (ff_page_start == (uint64_t)MAP_FAILED) {
        rte_panic("ff_mmap_init mmap failed, err=%d.\n", errno);
        return -1;
    }

    if (mlock((void *)ff_page_start, bsd_memsz) < 0) {
        rte_panic("mlock failed, err=%d.\n", errno);
        return -1;
    }
    ff_page_end = ff_page_start + bsd_memsz;
    bsd_pagesz = (bsd_memsz >> PAGE_SHIFT);
    rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %u pages, %d MB.\n",
        bsd_pagesz, ff_global_cfg.freebsd.mem_size);
    printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end);

    if (posix_memalign((void **)&ff_mpage_phy, sizeof(phys_addr_t),
        bsd_pagesz * sizeof(phys_addr_t)) != 0) {
        rte_panic("posix_memalign for ff_mpage_phy failed, err=%d.\n", errno);
        return -1;
    }

    stklist_init(&ff_mpage_ctl, bsd_pagesz);

    for (i = 0; i < bsd_pagesz; i++) {
        virt_addr = ff_page_start + PAGE_SIZE * (uint64_t)i;
        memset((void *)virt_addr, 0, PAGE_SIZE);

        stklist_push(&ff_mpage_ctl, virt_addr);
        ff_mpage_phy[i] = rte_mem_virt2phy((const void *)virt_addr);
        if (ff_mpage_phy[i] == RTE_BAD_IOVA) {
            rte_panic("rte_mem_virt2phy returned an invalid address.");
            return -1;
        }
    }

    ff_txring_init(&nic_tx_ring[0], RTE_MAX_ETHPORTS);

    return 0;
}
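/*
 * Note: ff_mmap_init() must run once at startup, before any call to
 * ff_mem_get_page()/ff_mem_free_addr() below, since those pop from and
 * push to the page stack that this function fills.
 */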

// Returns 1 if the address lies in the f-stack page region; 0 if it does
// not (i.e. the data lives in a DPDK mempool instead).
static inline int ff_chk_vma(const uint64_t virtaddr)
{
    return !!(virtaddr >= ff_page_start && virtaddr < ff_page_end);
}

/*
 * Get the physical address of a virtual address that lies inside the
 * f-stack page region mapped by ff_mmap_init().
 */
static inline uint64_t ff_mem_virt2phy(const void *virtaddr)
{
    uint64_t addr = 0;
    uint32_t pages = 0;

    pages = (((uint64_t)virtaddr - ff_page_start) >> PAGE_SHIFT);
    if (pages >= stklist_size(&ff_mpage_ctl)) {
        rte_panic("ff_mem_virt2phy got invalid page index %u.", pages);
        return -1;
    }

    addr = ff_mpage_phy[pages] + ((uint64_t)virtaddr & PAGE_MASK);
    return addr;
}
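/*
 * Worked example: for virtaddr == ff_page_start + 0x1234, the page index
 * is 0x1234 >> PAGE_SHIFT == 1 and the in-page offset is
 * 0x1234 & PAGE_MASK == 0x234, so the result is ff_mpage_phy[1] + 0x234.
 */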

void *ff_mem_get_page()
{
    return (void *)stklist_pop(&ff_mpage_ctl);
}

int ff_mem_free_addr(void *p)
{
    stklist_push(&ff_mpage_ctl, (uint64_t)p);
    return 0;
}
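/*
 * Usage sketch for the page-allocator pair (a minimal sketch; real callers
 * also handle allocation failure):
 *
 *   void *pg = ff_mem_get_page();   // pop one 4 KB page off the free stack
 *   if (pg != NULL) {
 *       // ... use the page as backing storage for bsd mbuf data ...
 *       ff_mem_free_addr(pg);       // push it back when done
 *   }
 */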

static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void *m,
    struct rte_mbuf *head)
{
    void *data = NULL;
    struct ff_tx_offload offload = {0};

    ff_mbuf_tx_offload(m, &offload);
    data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = RTE_ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct rte_tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }
}

// Create an rte_mbuf that references data handed down from the bsd stack
// via an EXT_CLUSTER mbuf.
static inline struct rte_mbuf *ff_extcl_to_rte(void *m)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *src_mbuf = NULL;
    struct rte_mbuf *p_head = NULL;

    src_mbuf = (struct rte_mbuf *)ff_rte_frm_extcl(m);
    if (src_mbuf == NULL) {
        return NULL;
    }
    p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }

    return p_head;
}

// Create an rte_mbuf chain whose segments reference the data held in the
// bsd mbuf chain.
static inline struct rte_mbuf *ff_bsd_to_rte(void *m, int total)
{
    struct rte_mempool *mbuf_pool = ff_ref_pool[lcore_conf.socket_id];
    struct rte_mbuf *p_head = NULL;
    struct rte_mbuf *cur = NULL, *prev = NULL;
    void *data = NULL;
    void *p_bsdbuf = NULL;
    unsigned len = 0;

    p_head = rte_pktmbuf_alloc(mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }
    p_head->pkt_len = total;
    p_head->nb_segs = 0;
    cur = p_head;
    p_bsdbuf = m;
    while (p_bsdbuf) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(p_head);
                return NULL;
            }
        }
        ff_next_mbuf(&p_bsdbuf, &data, &len); // advance p_bsdbuf to the next bsd mbuf.
        cur->buf_addr = data;
        cur->buf_iova = ff_mem_virt2phy((const void *)(cur->buf_addr));
        cur->data_off = 0;
        cur->data_len = len;

        p_head->nb_segs++;
        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;
        cur = NULL;
    }

    return p_head;
}

int ff_if_send_onepkt(struct ff_dpdk_if_context *ctx, void *m, int total)
{
    struct rte_mbuf *head = NULL;
    void *p_data = NULL;
    struct lcore_conf *qconf = NULL;
    unsigned len = 0;

    if (!m) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1,
            "ff_if_send_onepkt was passed an invalid NULL address.");
        return 0;
    }
    p_data = ff_mbuf_mtod(m);
    if (ff_chk_vma((uint64_t)p_data)) {
        head = ff_bsd_to_rte(m, total);
    } else if ((head = ff_extcl_to_rte(m)) == NULL) {
        rte_panic("data address 0x%lx is out of page bounds or was not "
            "allocated by the DPDK receiver.", (uint64_t)p_data);
        return 0;
    }

    if (head == NULL) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1,
            "ff_if_send_onepkt: ff_bsd_to_rte failed.");
        ff_mbuf_free(m);
        return 0;
    }

    ff_offload_set(ctx, m, head);
    qconf = &lcore_conf;
    len = qconf->tx_mbufs[ctx->port_id].len;
    qconf->tx_mbufs[ctx->port_id].m_table[len] = head;
    qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m;
    len++;

    return len;
}

int ff_enq_tx_bsdmbuf(uint8_t portid, void *p_mbuf, int nb_segs)
{
    return ff_txring_enqueue(&nic_tx_ring[portid], p_mbuf, nb_segs);
}