xref: /f-stack/lib/ff_memory.c (revision a5c480ea)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#define PAGE_SIZE           4096
#define PAGE_SHIFT          12
#define PAGE_MASK           (PAGE_SIZE - 1)
#define trunc_page(x)       ((x) & ~PAGE_MASK)
#define round_page(x)       (((x) + PAGE_MASK) & ~PAGE_MASK)

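/*
 * Illustrative values (not from the original source): with the 4 KB pages
 * defined above, trunc_page(0x1234) == 0x1000 and round_page(0x1234) == 0x2000,
 * i.e. an address is cut down to, or padded up to, the next page boundary.
 */
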
extern struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
extern struct lcore_conf lcore_conf;

//struct ff_tx_offload;

// ff_ref_pool allocates rte_mbufs without any data room; each mbuf's data pointer is set to a bsd mbuf's data address.
static struct rte_mempool *ff_ref_pool[NB_SOCKETS];

#define Head_INC(h) do { \
    if (++(h) >= TX_QUEUE_SIZE) \
        (h) = 0; \
} while (0)

#define Head_DEC(h) do { \
    if (--(h) < 0) \
        (h) = TX_QUEUE_SIZE - 1; \
} while (0)
// bsd mbufs are moved from the tmp tables into nic_tx_ring after rte_eth_tx_burst() succeeds.
static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS];
static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num);
static inline void ff_txring_init(struct mbuf_txring *r, uint32_t len);

typedef struct _list_manager_s
{
    uint64_t    *ele;
    int         size;
    //int       FreeNum;
    int         top;
} StackList_t;

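/*
 * ff_mpage_ctl is a simple LIFO free list of the 4 KB pages carved out of the
 * mmap'ed region below: stklist_push() returns a page's virtual address to the
 * pool, stklist_pop() hands one out, and top counts the entries currently free.
 */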
static StackList_t      ff_mpage_ctl = {0};
static uint64_t         ff_page_start = 0, ff_page_end = 0;
static phys_addr_t     *ff_mpage_phy = NULL;

static inline void     *stklist_pop(StackList_t *p);
static inline int       stklist_push(StackList_t *p, uint64_t val);

static int stklist_init(StackList_t *p, int size)
{
    if (p == NULL || size <= 0) {
        return -1;
    }
    p->size = size;
    p->top = 0;
    if (posix_memalign((void **)&p->ele, sizeof(uint64_t), sizeof(uint64_t) * size) != 0)
        return -2;

    return 0;
}

static inline void *stklist_pop(StackList_t *p)
{
    if (p == NULL)
        return NULL;

    if (p->top > 0) {
        return (void *)p->ele[--p->top];
    } else {
        return NULL;
    }
}

// val: the value (a page's virtual address) to be returned to the free list.
// return code: -1: failed; 0: OK.
static inline int stklist_push(StackList_t *p, const uint64_t val)
{
    if (p == NULL)
        return -1;
    if (p->top < p->size) {
        p->ele[p->top++] = val;
        return 0;
    } else {
        return -1;
    }
}

static inline int stklist_size(StackList_t *p)
{
    return p->size;
}

// store a 64-bit value (e.g. a pointer) in the rte_mbuf's private data area,
// which starts right after the rte_mbuf structure itself.
static inline int ff_mbuf_set_uint64(struct rte_mbuf *p, uint64_t data)
{
    if (rte_pktmbuf_priv_size(p->pool) >= sizeof(uint64_t))
        *((uint64_t *)(p + 1)) = data;
    return 0;
}

/*************************
 * If a packet's mbuf chain has num segments in all, the device's sw_ring uses
 * num descriptors, and ff_txring uses num slots in the same way: num-1 empty
 * slots followed by one slot holding the bsd mbuf pointer, as below:
 * <---     num-1          ---->|ptr| head |
 * ----------------------------------------------
 * | 0 | 0 | ..............| 0  | p | XXX  |
 *-----------------------------------------------
 *************************/
static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num)
{
    int i = 0;

    /* occupy seg_num-1 empty slots, freeing any stale bsd mbuf still parked there */
    for (i = 0; i < seg_num - 1; i++) {
        if (q->m_table[q->head]) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
        Head_INC(q->head);
    }

    /* the last slot keeps the bsd mbuf pointer until its descriptors are recycled */
    if (q->m_table[q->head])
        ff_mbuf_free(q->m_table[q->head]);
    q->m_table[q->head] = p;
    Head_INC(q->head);

    return 0;
}
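
/*
 * Illustrative walk-through (not in the original source): enqueueing a
 * 3-segment packet when head == 10 clears slots 10 and 11, stores the bsd
 * mbuf pointer in slot 12 and leaves head == 13, matching the layout sketched
 * above ff_txring_enqueue().
 */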

// pop num slots, walking backwards from head-1: the first slot popped must hold
// a bsd mbuf pointer and the remaining num-1 slots must be empty.
static inline int ff_txring_pop(struct mbuf_txring *q, int num)
{
    int i = 0;

    for (i = 0; i < num; i++) {
        Head_DEC(q->head);
        if ((i == 0 && q->m_table[q->head] == NULL) || (i > 0 && q->m_table[q->head] != NULL)) {
            rte_panic("ff_txring_pop fatal error!");
        }
        if (q->m_table[q->head] != NULL) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
    }

    return 0;
}

static inline void ff_txring_init(struct mbuf_txring *q, uint32_t num)
{
    memset(q, 0, sizeof(struct mbuf_txring) * num);
}

void ff_init_ref_pool(int nb_mbuf, int socketid)
{
    char s[64] = {0};

    if (ff_ref_pool[socketid] != NULL) {
        return;
    }
    snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid);
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        /* priv_size and data_room_size are both 0: these mbufs never carry their own data */
        ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf, MEMPOOL_CACHE_SIZE, 0, 0, socketid);
    } else {
        ff_ref_pool[socketid] = rte_mempool_lookup(s);
    }
}

int ff_mmap_init()
{
    unsigned int i = 0;
    uint64_t     virt_addr = 0;
    uint64_t     bsd_memsz = (ff_global_cfg.freebsd.mem_size << 20);
    unsigned int bsd_pagesz = 0;

    ff_page_start = (uint64_t)mmap(NULL, bsd_memsz, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
    if (ff_page_start == (uint64_t)-1) {
        rte_panic("ff_mmap_init get ff_page_start failed, err=%d.\n", errno);
        return -1;
    }

    if (mlock((void *)ff_page_start, bsd_memsz) < 0) {
        rte_panic("mlock failed, err=%d.\n", errno);
        return -1;
    }
    ff_page_end = ff_page_start + bsd_memsz;
    bsd_pagesz = (bsd_memsz >> PAGE_SHIFT);
    rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %u pages, %d MB.\n", bsd_pagesz, ff_global_cfg.freebsd.mem_size);
    printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end);

    if (posix_memalign((void **)&ff_mpage_phy, sizeof(phys_addr_t), bsd_pagesz * sizeof(phys_addr_t)) != 0) {
        rte_panic("posix_memalign get ff_mpage_phy failed, err=%d.\n", errno);
        return -1;
    }

    stklist_init(&ff_mpage_ctl, bsd_pagesz);

    for (i = 0; i < bsd_pagesz; i++) {
        virt_addr = ff_page_start + (uint64_t)PAGE_SIZE * i;
        memset((void *)virt_addr, 0, PAGE_SIZE);

        stklist_push(&ff_mpage_ctl, virt_addr);
        ff_mpage_phy[i] = rte_mem_virt2phy((const void *)virt_addr);
        if (ff_mpage_phy[i] == RTE_BAD_IOVA) {
            rte_panic("rte_mem_virt2phy return invalid address.");
            return -1;
        }
    }

    ff_txring_init(&nic_tx_ring[0], RTE_MAX_ETHPORTS);

    return 0;
}
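
/*
 * After ff_mmap_init() returns, [ff_page_start, ff_page_end) covers mem_size MB
 * of locked, zeroed anonymous memory, ff_mpage_ctl holds every page's virtual
 * address, and ff_mpage_phy[i] caches the physical address of page i, so
 * ff_mem_virt2phy() below is a simple table lookup on the transmit path.
 */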

// 1: address lies in the f-stack page region; 0: it does not (e.g. it belongs to a DPDK mempool).
static inline int ff_chk_vma(const uint64_t virtaddr)
{
    return !!(virtaddr >= ff_page_start && virtaddr < ff_page_end);
}

/*
 * Get the physical address of a virtual address inside the f-stack page
 * region, using the ff_mpage_phy table filled in by ff_mmap_init().
 */
static inline uint64_t ff_mem_virt2phy(const void *virtaddr)
{
    uint64_t    addr = 0;
    uint32_t    pages = 0;

    pages = (((uint64_t)virtaddr - (uint64_t)ff_page_start) >> PAGE_SHIFT);
    if (pages >= stklist_size(&ff_mpage_ctl)) {
        rte_panic("ff_mem_virt2phy got invalid page index %u.", pages);
        return -1;
    }

    addr = ff_mpage_phy[pages] + ((uint64_t)virtaddr & PAGE_MASK);
    return addr;
}
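
/*
 * Worked example (illustrative only): for virtaddr == ff_page_start + 2 * PAGE_SIZE + 0x80
 * the page index is 2, and since mmap() returns page-aligned memory the low 12
 * bits of virtaddr are the in-page offset, so the result is ff_mpage_phy[2] + 0x80.
 */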

void *ff_mem_get_page()
{
    return (void *)stklist_pop(&ff_mpage_ctl);
}

int ff_mem_free_addr(void *p)
{
    stklist_push(&ff_mpage_ctl, (const uint64_t)p);
    return 0;
}
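
/*
 * Minimal usage sketch (hypothetical, not part of the original file): the bsd
 * side of the stack is expected to take whole 4 KB pages from this allocator
 * and hand the same addresses back when it is done with them.
 */
#if 0
static void example_page_cycle(void)
{
    void *page = ff_mem_get_page();      /* NULL once the pool is exhausted */
    if (page == NULL)
        return;

    memset(page, 0, PAGE_SIZE);          /* the caller owns the whole page */
    ff_mem_free_addr(page);              /* return it to ff_mpage_ctl */
}
#endif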

static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void *m, struct rte_mbuf *head)
{
    void                    *data = NULL;
    struct ff_tx_offload    offload = {0};

    ff_mbuf_tx_offload(m, &offload);
    data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }
}

// create an rte_mbuf that references data handed to the bsd stack as an
// EXT_CLUSTER, i.e. data that already lives in an rte_mbuf owned by DPDK.
static inline struct rte_mbuf *ff_extcl_to_rte(void *m)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *src_mbuf = NULL;
    struct rte_mbuf *p_head = NULL;

    src_mbuf = (struct rte_mbuf *)ff_rte_frm_extcl(m);
    if (NULL == src_mbuf) {
        return NULL;
    }
    p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }

    return p_head;
}

// create an rte_mbuf chain whose segments reference the data held in a bsd mbuf chain (zero copy).
static inline struct rte_mbuf *ff_bsd_to_rte(void *m, int total)
{
    struct rte_mempool *mbuf_pool = ff_ref_pool[lcore_conf.socket_id];
    struct rte_mbuf *p_head = NULL;
    struct rte_mbuf *cur = NULL, *prev = NULL;
    void        *data = NULL;
    void        *p_bsdbuf = NULL;
    unsigned    len = 0;

    p_head = rte_pktmbuf_alloc(mbuf_pool);
    if (p_head == NULL) {
        return NULL;
    }
    p_head->pkt_len = total;
    p_head->nb_segs = 0;
    cur = p_head;
    p_bsdbuf = m;
    while (p_bsdbuf) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(p_head);
                return NULL;
            }
        }
        ff_next_mbuf(&p_bsdbuf, &data, &len);    // advance p_bsdbuf to the next bsd mbuf
        cur->buf_addr = data;
        cur->buf_physaddr = ff_mem_virt2phy((const void *)(cur->buf_addr));
        cur->data_off = 0;
        cur->data_len = len;

        p_head->nb_segs++;
        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;
        cur = NULL;
    }

    return p_head;
}
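
/*
 * The chain built above is zero-copy: each segment's buf_addr points straight
 * into bsd mbuf data inside the mmap'ed f-stack region and buf_physaddr comes
 * from the ff_mpage_phy table, so the NIC can DMA the data in place. The bsd
 * mbuf itself is kept alive (see nic_tx_ring) until its descriptors are recycled.
 */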

int ff_if_send_onepkt(struct ff_dpdk_if_context *ctx, void *m, int total)
{
    struct rte_mbuf     *head = NULL;
    void                *p_data = NULL;
    struct lcore_conf   *qconf = NULL;
    unsigned            len = 0;

    if (!m) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt got an invalid NULL address.");
        return 0;
    }

    p_data = ff_mbuf_mtod(m);
    if (ff_chk_vma((uint64_t)p_data)) {
        head = ff_bsd_to_rte(m, total);
    } else if ((head = ff_extcl_to_rte(m)) == NULL) {
        rte_panic("data address 0x%lx is out of page bound or was not allocated by the DPDK receiver.", (uint64_t)p_data);
        return 0;
    }

    if (head == NULL) {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt call ff_bsd_to_rte failed.");
        ff_mbuf_free(m);
        return 0;
    }

    ff_offload_set(ctx, m, head);
    qconf = &lcore_conf;
    len = qconf->tx_mbufs[ctx->port_id].len;
    qconf->tx_mbufs[ctx->port_id].m_table[len] = head;
    qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m;
    len++;

    return len;
}

int ff_enq_tx_bsdmbuf(uint8_t portid, void *p_mbuf, int nb_segs)
{
    return ff_txring_enqueue(&nic_tx_ring[portid], p_mbuf, nb_segs);
}
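
/*
 * Intended flow (per the nic_tx_ring comment near the top of this file; the
 * transmit path itself lives outside this file): ff_if_send_onepkt() parks each
 * head rte_mbuf together with its bsd mbuf in lcore_conf.tx_mbufs; once
 * rte_eth_tx_burst() accepts a packet, the sender calls
 * ff_enq_tx_bsdmbuf(port, bsd_mbuf, nb_segs) so the bsd mbuf is freed only when
 * ff_txring_enqueue()/ff_txring_pop() later recycle its slots.
 */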
481