xref: /f-stack/dpdk/lib/librte_vhost/virtio_net.c (revision 2d9fd380)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8 
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
12 #include <rte_ip.h>
13 #include <rte_vhost.h>
14 #include <rte_tcp.h>
15 #include <rte_udp.h>
16 #include <rte_sctp.h>
17 #include <rte_arp.h>
18 #include <rte_spinlock.h>
19 #include <rte_malloc.h>
20 #include <rte_vhost_async.h>
21 
22 #include "iotlb.h"
23 #include "vhost.h"
24 
25 #define MAX_BATCH_LEN 256
26 
27 #define VHOST_ASYNC_BATCH_THRESHOLD 32
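/*
 * MAX_BATCH_LEN bounds the size of copies that are deferred into the
 * per-virtqueue batch_copy_elems array instead of being copied inline;
 * VHOST_ASYNC_BATCH_THRESHOLD is the number of buffered packets after
 * which pending async (DMA) transfer descriptors are submitted.
 */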
28 
29 static  __rte_always_inline bool
30 rxvq_is_mergeable(struct virtio_net *dev)
31 {
32 	return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
33 }
34 
35 static  __rte_always_inline bool
36 virtio_net_is_inorder(struct virtio_net *dev)
37 {
38 	return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
39 }
40 
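/*
 * With the standard virtio-net queue layout, guest RX queues use even
 * indices and guest TX queues odd indices, so a queue index is valid
 * for the requested direction only when its parity matches is_tx.
 */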
41 static bool
42 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
43 {
44 	return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
45 }
46 
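/*
 * Flush the small copies that were deferred into batch_copy_elems.
 * The enqueue variant also marks the written guest pages dirty for
 * live migration; the dequeue variant below only performs the copies.
 */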
47 static inline void
48 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
49 {
50 	struct batch_copy_elem *elem = vq->batch_copy_elems;
51 	uint16_t count = vq->batch_copy_nb_elems;
52 	int i;
53 
54 	for (i = 0; i < count; i++) {
55 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
56 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
57 					   elem[i].len);
58 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
59 	}
60 
61 	vq->batch_copy_nb_elems = 0;
62 }
63 
64 static inline void
65 do_data_copy_dequeue(struct vhost_virtqueue *vq)
66 {
67 	struct batch_copy_elem *elem = vq->batch_copy_elems;
68 	uint16_t count = vq->batch_copy_nb_elems;
69 	int i;
70 
71 	for (i = 0; i < count; i++)
72 		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
73 
74 	vq->batch_copy_nb_elems = 0;
75 }
76 
77 static __rte_always_inline void
78 do_flush_shadow_used_ring_split(struct virtio_net *dev,
79 			struct vhost_virtqueue *vq,
80 			uint16_t to, uint16_t from, uint16_t size)
81 {
82 	rte_memcpy(&vq->used->ring[to],
83 			&vq->shadow_used_split[from],
84 			size * sizeof(struct vring_used_elem));
85 	vhost_log_cache_used_vring(dev, vq,
86 			offsetof(struct vring_used, ring[to]),
87 			size * sizeof(struct vring_used_elem));
88 }
89 
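/*
 * Shadow used-ring flush for split rings: used-ring updates accumulated
 * in shadow_used_split are copied out (in two chunks when they wrap),
 * and used->idx is then advanced with a release store so the guest only
 * sees the new index after the ring entries and the dirty log are in place.
 */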
90 static __rte_always_inline void
91 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
92 {
93 	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
94 
95 	if (used_idx + vq->shadow_used_idx <= vq->size) {
96 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
97 					  vq->shadow_used_idx);
98 	} else {
99 		uint16_t size;
100 
101 		/* update used ring interval [used_idx, vq->size] */
102 		size = vq->size - used_idx;
103 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
104 
105 		/* update the remaining used ring interval [0, shadow_used_idx - size] */
106 		do_flush_shadow_used_ring_split(dev, vq, 0, size,
107 					  vq->shadow_used_idx - size);
108 	}
109 	vq->last_used_idx += vq->shadow_used_idx;
110 
111 	vhost_log_cache_sync(dev, vq);
112 
113 	__atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
114 			   __ATOMIC_RELEASE);
115 	vq->shadow_used_idx = 0;
116 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
117 		sizeof(vq->used->idx));
118 }
119 
120 static __rte_always_inline void
121 async_flush_shadow_used_ring_split(struct virtio_net *dev,
122 	struct vhost_virtqueue *vq)
123 {
124 	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
125 
126 	if (used_idx + vq->shadow_used_idx <= vq->size) {
127 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
128 					  vq->shadow_used_idx);
129 	} else {
130 		uint16_t size;
131 
132 		/* update used ring interval [used_idx, vq->size] */
133 		size = vq->size - used_idx;
134 		do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
135 
136 		/* update the remaining used ring interval [0, shadow_used_idx - size] */
137 		do_flush_shadow_used_ring_split(dev, vq, 0, size,
138 					  vq->shadow_used_idx - size);
139 	}
140 
141 	vq->last_used_idx += vq->shadow_used_idx;
142 	vq->shadow_used_idx = 0;
143 }
144 
145 static __rte_always_inline void
146 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
147 			 uint16_t desc_idx, uint32_t len)
148 {
149 	uint16_t i = vq->shadow_used_idx++;
150 
151 	vq->shadow_used_split[i].id  = desc_idx;
152 	vq->shadow_used_split[i].len = len;
153 }
154 
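/*
 * Flush shadowed used entries to a packed ring. The loop is split in
 * two: ids/lens are written first, then, after a write barrier, the
 * flags. The head descriptor's flags are written last so the guest
 * never observes a partially updated descriptor chain.
 */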
155 static __rte_always_inline void
156 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
157 				  struct vhost_virtqueue *vq)
158 {
159 	int i;
160 	uint16_t used_idx = vq->last_used_idx;
161 	uint16_t head_idx = vq->last_used_idx;
162 	uint16_t head_flags = 0;
163 
164 	/* Split loop in two to save memory barriers */
165 	for (i = 0; i < vq->shadow_used_idx; i++) {
166 		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
167 		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
168 
169 		used_idx += vq->shadow_used_packed[i].count;
170 		if (used_idx >= vq->size)
171 			used_idx -= vq->size;
172 	}
173 
174 	rte_smp_wmb();
175 
176 	for (i = 0; i < vq->shadow_used_idx; i++) {
177 		uint16_t flags;
178 
179 		if (vq->shadow_used_packed[i].len)
180 			flags = VRING_DESC_F_WRITE;
181 		else
182 			flags = 0;
183 
184 		if (vq->used_wrap_counter) {
185 			flags |= VRING_DESC_F_USED;
186 			flags |= VRING_DESC_F_AVAIL;
187 		} else {
188 			flags &= ~VRING_DESC_F_USED;
189 			flags &= ~VRING_DESC_F_AVAIL;
190 		}
191 
192 		if (i > 0) {
193 			vq->desc_packed[vq->last_used_idx].flags = flags;
194 
195 			vhost_log_cache_used_vring(dev, vq,
196 					vq->last_used_idx *
197 					sizeof(struct vring_packed_desc),
198 					sizeof(struct vring_packed_desc));
199 		} else {
200 			head_idx = vq->last_used_idx;
201 			head_flags = flags;
202 		}
203 
204 		vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
205 	}
206 
207 	vq->desc_packed[head_idx].flags = head_flags;
208 
209 	vhost_log_cache_used_vring(dev, vq,
210 				head_idx *
211 				sizeof(struct vring_packed_desc),
212 				sizeof(struct vring_packed_desc));
213 
214 	vq->shadow_used_idx = 0;
215 	vhost_log_cache_sync(dev, vq);
216 }
217 
218 static __rte_always_inline void
219 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
220 				  struct vhost_virtqueue *vq)
221 {
222 	struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
223 
224 	vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
225 	rte_smp_wmb();
226 	vq->desc_packed[vq->shadow_last_used_idx].flags = used_elem->flags;
227 
228 	vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
229 				   sizeof(struct vring_packed_desc),
230 				   sizeof(struct vring_packed_desc));
231 	vq->shadow_used_idx = 0;
232 	vhost_log_cache_sync(dev, vq);
233 }
234 
235 static __rte_always_inline void
236 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
237 				 struct vhost_virtqueue *vq,
238 				 uint64_t *lens,
239 				 uint16_t *ids)
240 {
241 	uint16_t i;
242 	uint16_t flags;
243 
244 	if (vq->shadow_used_idx) {
245 		do_data_copy_enqueue(dev, vq);
246 		vhost_flush_enqueue_shadow_packed(dev, vq);
247 	}
248 
249 	flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
250 
251 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
252 		vq->desc_packed[vq->last_used_idx + i].id = ids[i];
253 		vq->desc_packed[vq->last_used_idx + i].len = lens[i];
254 	}
255 
256 	rte_smp_wmb();
257 
258 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
259 		vq->desc_packed[vq->last_used_idx + i].flags = flags;
260 
261 	vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
262 				   sizeof(struct vring_packed_desc),
263 				   sizeof(struct vring_packed_desc) *
264 				   PACKED_BATCH_SIZE);
265 	vhost_log_cache_sync(dev, vq);
266 
267 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
268 }
269 
270 static __rte_always_inline void
271 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
272 					  uint16_t id)
273 {
274 	vq->shadow_used_packed[0].id = id;
275 
276 	if (!vq->shadow_used_idx) {
277 		vq->shadow_last_used_idx = vq->last_used_idx;
278 		vq->shadow_used_packed[0].flags =
279 			PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
280 		vq->shadow_used_packed[0].len = 0;
281 		vq->shadow_used_packed[0].count = 1;
282 		vq->shadow_used_idx++;
283 	}
284 
285 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
286 }
287 
288 static __rte_always_inline void
289 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
290 				  struct vhost_virtqueue *vq,
291 				  uint16_t *ids)
292 {
293 	uint16_t flags;
294 	uint16_t i;
295 	uint16_t begin;
296 
297 	flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
298 
299 	if (!vq->shadow_used_idx) {
300 		vq->shadow_last_used_idx = vq->last_used_idx;
301 		vq->shadow_used_packed[0].id  = ids[0];
302 		vq->shadow_used_packed[0].len = 0;
303 		vq->shadow_used_packed[0].count = 1;
304 		vq->shadow_used_packed[0].flags = flags;
305 		vq->shadow_used_idx++;
306 		begin = 1;
307 	} else
308 		begin = 0;
309 
310 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
311 		vq->desc_packed[vq->last_used_idx + i].id = ids[i];
312 		vq->desc_packed[vq->last_used_idx + i].len = 0;
313 	}
314 
315 	rte_smp_wmb();
316 	vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
317 		vq->desc_packed[vq->last_used_idx + i].flags = flags;
318 
319 	vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
320 				   sizeof(struct vring_packed_desc),
321 				   sizeof(struct vring_packed_desc) *
322 				   PACKED_BATCH_SIZE);
323 	vhost_log_cache_sync(dev, vq);
324 
325 	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
326 }
327 
328 static __rte_always_inline void
329 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
330 				   uint16_t buf_id,
331 				   uint16_t count)
332 {
333 	uint16_t flags;
334 
335 	flags = vq->desc_packed[vq->last_used_idx].flags;
336 	if (vq->used_wrap_counter) {
337 		flags |= VRING_DESC_F_USED;
338 		flags |= VRING_DESC_F_AVAIL;
339 	} else {
340 		flags &= ~VRING_DESC_F_USED;
341 		flags &= ~VRING_DESC_F_AVAIL;
342 	}
343 
344 	if (!vq->shadow_used_idx) {
345 		vq->shadow_last_used_idx = vq->last_used_idx;
346 
347 		vq->shadow_used_packed[0].id  = buf_id;
348 		vq->shadow_used_packed[0].len = 0;
349 		vq->shadow_used_packed[0].flags = flags;
350 		vq->shadow_used_idx++;
351 	} else {
352 		vq->desc_packed[vq->last_used_idx].id = buf_id;
353 		vq->desc_packed[vq->last_used_idx].len = 0;
354 		vq->desc_packed[vq->last_used_idx].flags = flags;
355 	}
356 
357 	vq_inc_last_used_packed(vq, count);
358 }
359 
360 static __rte_always_inline void
361 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
362 					   uint16_t buf_id,
363 					   uint16_t count)
364 {
365 	uint16_t flags;
366 
367 	vq->shadow_used_packed[0].id = buf_id;
368 
369 	flags = vq->desc_packed[vq->last_used_idx].flags;
370 	if (vq->used_wrap_counter) {
371 		flags |= VRING_DESC_F_USED;
372 		flags |= VRING_DESC_F_AVAIL;
373 	} else {
374 		flags &= ~VRING_DESC_F_USED;
375 		flags &= ~VRING_DESC_F_AVAIL;
376 	}
377 
378 	if (!vq->shadow_used_idx) {
379 		vq->shadow_last_used_idx = vq->last_used_idx;
380 		vq->shadow_used_packed[0].len = 0;
381 		vq->shadow_used_packed[0].flags = flags;
382 		vq->shadow_used_idx++;
383 	}
384 
385 	vq_inc_last_used_packed(vq, count);
386 }
387 
388 static __rte_always_inline void
389 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
390 				   struct vhost_virtqueue *vq,
391 				   uint32_t len[],
392 				   uint16_t id[],
393 				   uint16_t count[],
394 				   uint16_t num_buffers)
395 {
396 	uint16_t i;
397 	for (i = 0; i < num_buffers; i++) {
398 		/* enqueue shadow flush action aligned with batch num */
399 		if (!vq->shadow_used_idx)
400 			vq->shadow_aligned_idx = vq->last_used_idx &
401 				PACKED_BATCH_MASK;
402 		vq->shadow_used_packed[vq->shadow_used_idx].id  = id[i];
403 		vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
404 		vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
405 		vq->shadow_aligned_idx += count[i];
406 		vq->shadow_used_idx++;
407 	}
408 
409 	if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
410 		do_data_copy_enqueue(dev, vq);
411 		vhost_flush_enqueue_shadow_packed(dev, vq);
412 	}
413 }
414 
415 /* avoid the write operation if it is not necessary, to lessen cache issues */
416 #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
417 	if ((var) != (val))			\
418 		(var) = (val);			\
419 } while (0)
420 
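/*
 * Translate the mbuf TX offload requests (ol_flags, l2/l3/l4 lengths,
 * tso_segsz) into the virtio_net_hdr the guest will see: checksum
 * offload becomes csum_start/csum_offset with NEEDS_CSUM, TSO/UFO
 * become gso_type/gso_size/hdr_len.
 *
 * Illustrative sketch only (not part of this file): an application
 * requesting TCP checksum offload on an IPv4 packet would prepare the
 * mbuf roughly as follows before handing it to the enqueue path:
 *
 *	m->l2_len = sizeof(struct rte_ether_hdr);
 *	m->l3_len = sizeof(struct rte_ipv4_hdr);
 *	m->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM;
 *
 * which this function turns into csum_start = l2_len + l3_len and
 * csum_offset = offsetof(struct rte_tcp_hdr, cksum).
 */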
421 static __rte_always_inline void
422 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
423 {
424 	uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
425 
426 	if (m_buf->ol_flags & PKT_TX_TCP_SEG)
427 		csum_l4 |= PKT_TX_TCP_CKSUM;
428 
429 	if (csum_l4) {
430 		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
431 		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
432 
433 		switch (csum_l4) {
434 		case PKT_TX_TCP_CKSUM:
435 			net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
436 						cksum));
437 			break;
438 		case PKT_TX_UDP_CKSUM:
439 			net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
440 						dgram_cksum));
441 			break;
442 		case PKT_TX_SCTP_CKSUM:
443 			net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
444 						cksum));
445 			break;
446 		}
447 	} else {
448 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
449 		ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
450 		ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
451 	}
452 
453 	/* IP cksum verification cannot be bypassed, so calculate it here */
454 	if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
455 		struct rte_ipv4_hdr *ipv4_hdr;
456 
457 		ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
458 						   m_buf->l2_len);
459 		ipv4_hdr->hdr_checksum = 0;
460 		ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
461 	}
462 
463 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
464 		if (m_buf->ol_flags & PKT_TX_IPV4)
465 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
466 		else
467 			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
468 		net_hdr->gso_size = m_buf->tso_segsz;
469 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
470 					+ m_buf->l4_len;
471 	} else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
472 		net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
473 		net_hdr->gso_size = m_buf->tso_segsz;
474 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
475 			m_buf->l4_len;
476 	} else {
477 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
478 		ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
479 		ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
480 	}
481 }
482 
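/*
 * Map a single guest descriptor into buf_vec. One descriptor that is
 * contiguous in guest IOVA space may translate into several chunks of
 * host virtual address space, so the loop keeps translating until the
 * whole descriptor length is covered or BUF_VECTOR_MAX is exceeded.
 */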
483 static __rte_always_inline int
484 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
485 		struct buf_vector *buf_vec, uint16_t *vec_idx,
486 		uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
487 {
488 	uint16_t vec_id = *vec_idx;
489 
490 	while (desc_len) {
491 		uint64_t desc_addr;
492 		uint64_t desc_chunck_len = desc_len;
493 
494 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
495 			return -1;
496 
497 		desc_addr = vhost_iova_to_vva(dev, vq,
498 				desc_iova,
499 				&desc_chunck_len,
500 				perm);
501 		if (unlikely(!desc_addr))
502 			return -1;
503 
504 		rte_prefetch0((void *)(uintptr_t)desc_addr);
505 
506 		buf_vec[vec_id].buf_iova = desc_iova;
507 		buf_vec[vec_id].buf_addr = desc_addr;
508 		buf_vec[vec_id].buf_len  = desc_chunck_len;
509 
510 		desc_len -= desc_chunck_len;
511 		desc_iova += desc_chunck_len;
512 		vec_id++;
513 	}
514 	*vec_idx = vec_id;
515 
516 	return 0;
517 }
518 
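/*
 * Collect one available descriptor chain of a split ring into buf_vec:
 * indirect tables are resolved (and copied when they are not contiguous
 * in the process address space), and the chain is followed through
 * VRING_DESC_F_NEXT with a loop guard against malformed rings.
 */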
519 static __rte_always_inline int
520 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
521 			 uint32_t avail_idx, uint16_t *vec_idx,
522 			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
523 			 uint32_t *desc_chain_len, uint8_t perm)
524 {
525 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
526 	uint16_t vec_id = *vec_idx;
527 	uint32_t len    = 0;
528 	uint64_t dlen;
529 	uint32_t nr_descs = vq->size;
530 	uint32_t cnt    = 0;
531 	struct vring_desc *descs = vq->desc;
532 	struct vring_desc *idesc = NULL;
533 
534 	if (unlikely(idx >= vq->size))
535 		return -1;
536 
537 	*desc_chain_head = idx;
538 
539 	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
540 		dlen = vq->desc[idx].len;
541 		nr_descs = dlen / sizeof(struct vring_desc);
542 		if (unlikely(nr_descs > vq->size))
543 			return -1;
544 
545 		descs = (struct vring_desc *)(uintptr_t)
546 			vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
547 						&dlen,
548 						VHOST_ACCESS_RO);
549 		if (unlikely(!descs))
550 			return -1;
551 
552 		if (unlikely(dlen < vq->desc[idx].len)) {
553 			/*
554 			 * The indirect desc table is not contiguous
555 			 * in process VA space, we have to copy it.
556 			 */
557 			idesc = vhost_alloc_copy_ind_table(dev, vq,
558 					vq->desc[idx].addr, vq->desc[idx].len);
559 			if (unlikely(!idesc))
560 				return -1;
561 
562 			descs = idesc;
563 		}
564 
565 		idx = 0;
566 	}
567 
568 	while (1) {
569 		if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
570 			free_ind_table(idesc);
571 			return -1;
572 		}
573 
574 		len += descs[idx].len;
575 
576 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
577 						descs[idx].addr, descs[idx].len,
578 						perm))) {
579 			free_ind_table(idesc);
580 			return -1;
581 		}
582 
583 		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
584 			break;
585 
586 		idx = descs[idx].next;
587 	}
588 
589 	*desc_chain_len = len;
590 	*vec_idx = vec_id;
591 
592 	if (unlikely(!!idesc))
593 		free_ind_table(idesc);
594 
595 	return 0;
596 }
597 
598 /*
599  * Returns -1 on fail, 0 on success
600  */
601 static inline int
602 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
603 				uint32_t size, struct buf_vector *buf_vec,
604 				uint16_t *num_buffers, uint16_t avail_head,
605 				uint16_t *nr_vec)
606 {
607 	uint16_t cur_idx;
608 	uint16_t vec_idx = 0;
609 	uint16_t max_tries, tries = 0;
610 
611 	uint16_t head_idx = 0;
612 	uint32_t len = 0;
613 
614 	*num_buffers = 0;
615 	cur_idx  = vq->last_avail_idx;
616 
617 	if (rxvq_is_mergeable(dev))
618 		max_tries = vq->size - 1;
619 	else
620 		max_tries = 1;
621 
622 	while (size > 0) {
623 		if (unlikely(cur_idx == avail_head))
624 			return -1;
625 		/*
626 		 * If we have tried all available ring items and still
627 		 * cannot get enough buffers, something abnormal has
628 		 * happened.
629 		 */
630 		if (unlikely(++tries > max_tries))
631 			return -1;
632 
633 		if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
634 						&vec_idx, buf_vec,
635 						&head_idx, &len,
636 						VHOST_ACCESS_RW) < 0))
637 			return -1;
638 		len = RTE_MIN(len, size);
639 		update_shadow_used_ring_split(vq, head_idx, len);
640 		size -= len;
641 
642 		cur_idx++;
643 		*num_buffers += 1;
644 	}
645 
646 	*nr_vec = vec_idx;
647 
648 	return 0;
649 }
650 
651 static __rte_always_inline int
652 fill_vec_buf_packed_indirect(struct virtio_net *dev,
653 			struct vhost_virtqueue *vq,
654 			struct vring_packed_desc *desc, uint16_t *vec_idx,
655 			struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
656 {
657 	uint16_t i;
658 	uint32_t nr_descs;
659 	uint16_t vec_id = *vec_idx;
660 	uint64_t dlen;
661 	struct vring_packed_desc *descs, *idescs = NULL;
662 
663 	dlen = desc->len;
664 	descs = (struct vring_packed_desc *)(uintptr_t)
665 		vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
666 	if (unlikely(!descs))
667 		return -1;
668 
669 	if (unlikely(dlen < desc->len)) {
670 		/*
671 		 * The indirect desc table is not contiguous
672 		 * in process VA space, we have to copy it.
673 		 */
674 		idescs = vhost_alloc_copy_ind_table(dev,
675 				vq, desc->addr, desc->len);
676 		if (unlikely(!idescs))
677 			return -1;
678 
679 		descs = idescs;
680 	}
681 
682 	nr_descs =  desc->len / sizeof(struct vring_packed_desc);
683 	if (unlikely(nr_descs >= vq->size)) {
684 		free_ind_table(idescs);
685 		return -1;
686 	}
687 
688 	for (i = 0; i < nr_descs; i++) {
689 		if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
690 			free_ind_table(idescs);
691 			return -1;
692 		}
693 
694 		*len += descs[i].len;
695 		if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
696 						descs[i].addr, descs[i].len,
697 						perm)))
698 			return -1;
699 	}
700 	*vec_idx = vec_id;
701 
702 	if (unlikely(!!idescs))
703 		free_ind_table(idescs);
704 
705 	return 0;
706 }
707 
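/*
 * Collect one available descriptor chain of a packed ring. When
 * avail_idx is below last_avail_idx the chain belongs to the next ring
 * wrap, so the expected wrap counter is flipped before checking
 * descriptor availability (which uses load-acquire semantics).
 */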
708 static __rte_always_inline int
709 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
710 				uint16_t avail_idx, uint16_t *desc_count,
711 				struct buf_vector *buf_vec, uint16_t *vec_idx,
712 				uint16_t *buf_id, uint32_t *len, uint8_t perm)
713 {
714 	bool wrap_counter = vq->avail_wrap_counter;
715 	struct vring_packed_desc *descs = vq->desc_packed;
716 	uint16_t vec_id = *vec_idx;
717 
718 	if (avail_idx < vq->last_avail_idx)
719 		wrap_counter ^= 1;
720 
721 	/*
722 	 * Perform a load-acquire barrier in desc_is_avail to
723 	 * enforce the ordering between desc flags and desc
724 	 * content.
725 	 */
726 	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
727 		return -1;
728 
729 	*desc_count = 0;
730 	*len = 0;
731 
732 	while (1) {
733 		if (unlikely(vec_id >= BUF_VECTOR_MAX))
734 			return -1;
735 
736 		if (unlikely(*desc_count >= vq->size))
737 			return -1;
738 
739 		*desc_count += 1;
740 		*buf_id = descs[avail_idx].id;
741 
742 		if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
743 			if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
744 							&descs[avail_idx],
745 							&vec_id, buf_vec,
746 							len, perm) < 0))
747 				return -1;
748 		} else {
749 			*len += descs[avail_idx].len;
750 
751 			if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
752 							descs[avail_idx].addr,
753 							descs[avail_idx].len,
754 							perm)))
755 				return -1;
756 		}
757 
758 		if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
759 			break;
760 
761 		if (++avail_idx >= vq->size) {
762 			avail_idx -= vq->size;
763 			wrap_counter ^= 1;
764 		}
765 	}
766 
767 	*vec_idx = vec_id;
768 
769 	return 0;
770 }
771 
772 static __rte_noinline void
773 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
774 		struct buf_vector *buf_vec,
775 		struct virtio_net_hdr_mrg_rxbuf *hdr)
776 {
777 	uint64_t len;
778 	uint64_t remain = dev->vhost_hlen;
779 	uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
780 	uint64_t iova = buf_vec->buf_iova;
781 
782 	while (remain) {
783 		len = RTE_MIN(remain,
784 				buf_vec->buf_len);
785 		dst = buf_vec->buf_addr;
786 		rte_memcpy((void *)(uintptr_t)dst,
787 				(void *)(uintptr_t)src,
788 				len);
789 
790 		PRINT_PACKET(dev, (uintptr_t)dst,
791 				(uint32_t)len, 0);
792 		vhost_log_cache_write_iova(dev, vq,
793 				iova, len);
794 
795 		remain -= len;
796 		iova += len;
797 		src += len;
798 		buf_vec++;
799 	}
800 }
801 
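/*
 * Synchronous (CPU copy) enqueue of one mbuf chain into the descriptor
 * buffers gathered in buf_vec: the virtio-net header is written first
 * (possibly scattered across descriptors via copy_vnet_hdr_to_desc),
 * then the payload is copied segment by segment; copies no larger than
 * MAX_BATCH_LEN are deferred to the batch-copy array.
 */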
802 static __rte_always_inline int
803 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
804 			    struct rte_mbuf *m, struct buf_vector *buf_vec,
805 			    uint16_t nr_vec, uint16_t num_buffers)
806 {
807 	uint32_t vec_idx = 0;
808 	uint32_t mbuf_offset, mbuf_avail;
809 	uint32_t buf_offset, buf_avail;
810 	uint64_t buf_addr, buf_iova, buf_len;
811 	uint32_t cpy_len;
812 	uint64_t hdr_addr;
813 	struct rte_mbuf *hdr_mbuf;
814 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
815 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
816 	int error = 0;
817 
818 	if (unlikely(m == NULL)) {
819 		error = -1;
820 		goto out;
821 	}
822 
823 	buf_addr = buf_vec[vec_idx].buf_addr;
824 	buf_iova = buf_vec[vec_idx].buf_iova;
825 	buf_len = buf_vec[vec_idx].buf_len;
826 
827 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
828 		error = -1;
829 		goto out;
830 	}
831 
832 	hdr_mbuf = m;
833 	hdr_addr = buf_addr;
834 	if (unlikely(buf_len < dev->vhost_hlen))
835 		hdr = &tmp_hdr;
836 	else
837 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
838 
839 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
840 		dev->vid, num_buffers);
841 
842 	if (unlikely(buf_len < dev->vhost_hlen)) {
843 		buf_offset = dev->vhost_hlen - buf_len;
844 		vec_idx++;
845 		buf_addr = buf_vec[vec_idx].buf_addr;
846 		buf_iova = buf_vec[vec_idx].buf_iova;
847 		buf_len = buf_vec[vec_idx].buf_len;
848 		buf_avail = buf_len - buf_offset;
849 	} else {
850 		buf_offset = dev->vhost_hlen;
851 		buf_avail = buf_len - dev->vhost_hlen;
852 	}
853 
854 	mbuf_avail  = rte_pktmbuf_data_len(m);
855 	mbuf_offset = 0;
856 	while (mbuf_avail != 0 || m->next != NULL) {
857 		/* done with current buf, get the next one */
858 		if (buf_avail == 0) {
859 			vec_idx++;
860 			if (unlikely(vec_idx >= nr_vec)) {
861 				error = -1;
862 				goto out;
863 			}
864 
865 			buf_addr = buf_vec[vec_idx].buf_addr;
866 			buf_iova = buf_vec[vec_idx].buf_iova;
867 			buf_len = buf_vec[vec_idx].buf_len;
868 
869 			buf_offset = 0;
870 			buf_avail  = buf_len;
871 		}
872 
873 		/* done with current mbuf, get the next one */
874 		if (mbuf_avail == 0) {
875 			m = m->next;
876 
877 			mbuf_offset = 0;
878 			mbuf_avail  = rte_pktmbuf_data_len(m);
879 		}
880 
881 		if (hdr_addr) {
882 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
883 			if (rxvq_is_mergeable(dev))
884 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
885 						num_buffers);
886 
887 			if (unlikely(hdr == &tmp_hdr)) {
888 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
889 			} else {
890 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
891 						dev->vhost_hlen, 0);
892 				vhost_log_cache_write_iova(dev, vq,
893 						buf_vec[0].buf_iova,
894 						dev->vhost_hlen);
895 			}
896 
897 			hdr_addr = 0;
898 		}
899 
900 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
901 
902 		if (likely(cpy_len > MAX_BATCH_LEN ||
903 					vq->batch_copy_nb_elems >= vq->size)) {
904 			rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
905 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
906 				cpy_len);
907 			vhost_log_cache_write_iova(dev, vq,
908 						   buf_iova + buf_offset,
909 						   cpy_len);
910 			PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
911 				cpy_len, 0);
912 		} else {
913 			batch_copy[vq->batch_copy_nb_elems].dst =
914 				(void *)((uintptr_t)(buf_addr + buf_offset));
915 			batch_copy[vq->batch_copy_nb_elems].src =
916 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
917 			batch_copy[vq->batch_copy_nb_elems].log_addr =
918 				buf_iova + buf_offset;
919 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
920 			vq->batch_copy_nb_elems++;
921 		}
922 
923 		mbuf_avail  -= cpy_len;
924 		mbuf_offset += cpy_len;
925 		buf_avail  -= cpy_len;
926 		buf_offset += cpy_len;
927 	}
928 
929 out:
930 
931 	return error;
932 }
933 
934 static __rte_always_inline void
935 async_fill_vec(struct iovec *v, void *base, size_t len)
936 {
937 	v->iov_base = base;
938 	v->iov_len = len;
939 }
940 
941 static __rte_always_inline void
942 async_fill_iter(struct rte_vhost_iov_iter *it, size_t count,
943 	struct iovec *vec, unsigned long nr_seg)
944 {
945 	it->offset = 0;
946 	it->count = count;
947 
948 	if (count) {
949 		it->iov = vec;
950 		it->nr_segs = nr_seg;
951 	} else {
952 		it->iov = 0;
953 		it->nr_segs = 0;
954 	}
955 }
956 
957 static __rte_always_inline void
958 async_fill_desc(struct rte_vhost_async_desc *desc,
959 	struct rte_vhost_iov_iter *src, struct rte_vhost_iov_iter *dst)
960 {
961 	desc->src = src;
962 	desc->dst = dst;
963 }
964 
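/*
 * Async variant of copy_mbuf_to_desc: chunks of at least
 * vq->async_threshold bytes whose guest pages map to contiguous host
 * physical memory are described as src/dst iovec pairs for the DMA
 * channel, while headers and small remainders fall back to CPU copy.
 */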
965 static __rte_always_inline int
966 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
967 			struct rte_mbuf *m, struct buf_vector *buf_vec,
968 			uint16_t nr_vec, uint16_t num_buffers,
969 			struct iovec *src_iovec, struct iovec *dst_iovec,
970 			struct rte_vhost_iov_iter *src_it,
971 			struct rte_vhost_iov_iter *dst_it)
972 {
973 	uint32_t vec_idx = 0;
974 	uint32_t mbuf_offset, mbuf_avail;
975 	uint32_t buf_offset, buf_avail;
976 	uint64_t buf_addr, buf_iova, buf_len;
977 	uint32_t cpy_len, cpy_threshold;
978 	uint64_t hdr_addr;
979 	struct rte_mbuf *hdr_mbuf;
980 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
981 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
982 	int error = 0;
983 	uint64_t mapped_len;
984 
985 	uint32_t tlen = 0;
986 	int tvec_idx = 0;
987 	void *hpa;
988 
989 	if (unlikely(m == NULL)) {
990 		error = -1;
991 		goto out;
992 	}
993 
994 	cpy_threshold = vq->async_threshold;
995 
996 	buf_addr = buf_vec[vec_idx].buf_addr;
997 	buf_iova = buf_vec[vec_idx].buf_iova;
998 	buf_len = buf_vec[vec_idx].buf_len;
999 
1000 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
1001 		error = -1;
1002 		goto out;
1003 	}
1004 
1005 	hdr_mbuf = m;
1006 	hdr_addr = buf_addr;
1007 	if (unlikely(buf_len < dev->vhost_hlen))
1008 		hdr = &tmp_hdr;
1009 	else
1010 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1011 
1012 	VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
1013 		dev->vid, num_buffers);
1014 
1015 	if (unlikely(buf_len < dev->vhost_hlen)) {
1016 		buf_offset = dev->vhost_hlen - buf_len;
1017 		vec_idx++;
1018 		buf_addr = buf_vec[vec_idx].buf_addr;
1019 		buf_iova = buf_vec[vec_idx].buf_iova;
1020 		buf_len = buf_vec[vec_idx].buf_len;
1021 		buf_avail = buf_len - buf_offset;
1022 	} else {
1023 		buf_offset = dev->vhost_hlen;
1024 		buf_avail = buf_len - dev->vhost_hlen;
1025 	}
1026 
1027 	mbuf_avail  = rte_pktmbuf_data_len(m);
1028 	mbuf_offset = 0;
1029 
1030 	while (mbuf_avail != 0 || m->next != NULL) {
1031 		/* done with current buf, get the next one */
1032 		if (buf_avail == 0) {
1033 			vec_idx++;
1034 			if (unlikely(vec_idx >= nr_vec)) {
1035 				error = -1;
1036 				goto out;
1037 			}
1038 
1039 			buf_addr = buf_vec[vec_idx].buf_addr;
1040 			buf_iova = buf_vec[vec_idx].buf_iova;
1041 			buf_len = buf_vec[vec_idx].buf_len;
1042 
1043 			buf_offset = 0;
1044 			buf_avail  = buf_len;
1045 		}
1046 
1047 		/* done with current mbuf, get the next one */
1048 		if (mbuf_avail == 0) {
1049 			m = m->next;
1050 
1051 			mbuf_offset = 0;
1052 			mbuf_avail  = rte_pktmbuf_data_len(m);
1053 		}
1054 
1055 		if (hdr_addr) {
1056 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1057 			if (rxvq_is_mergeable(dev))
1058 				ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1059 						num_buffers);
1060 
1061 			if (unlikely(hdr == &tmp_hdr)) {
1062 				copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1063 			} else {
1064 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1065 						dev->vhost_hlen, 0);
1066 				vhost_log_cache_write_iova(dev, vq,
1067 						buf_vec[0].buf_iova,
1068 						dev->vhost_hlen);
1069 			}
1070 
1071 			hdr_addr = 0;
1072 		}
1073 
1074 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1075 
1076 		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
1077 			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1078 					buf_iova + buf_offset,
1079 					cpy_len, &mapped_len);
1080 
1081 			if (unlikely(!hpa || mapped_len < cpy_threshold))
1082 				break;
1083 
1084 			async_fill_vec(src_iovec + tvec_idx,
1085 				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
1086 				mbuf_offset), (size_t)mapped_len);
1087 
1088 			async_fill_vec(dst_iovec + tvec_idx,
1089 					hpa, (size_t)mapped_len);
1090 
1091 			tlen += (uint32_t)mapped_len;
1092 			cpy_len -= (uint32_t)mapped_len;
1093 			mbuf_avail  -= (uint32_t)mapped_len;
1094 			mbuf_offset += (uint32_t)mapped_len;
1095 			buf_avail  -= (uint32_t)mapped_len;
1096 			buf_offset += (uint32_t)mapped_len;
1097 			tvec_idx++;
1098 		}
1099 
1100 		if (likely(cpy_len)) {
1101 			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
1102 				rte_memcpy(
1103 				(void *)((uintptr_t)(buf_addr + buf_offset)),
1104 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
1105 				cpy_len);
1106 
1107 				PRINT_PACKET(dev,
1108 					(uintptr_t)(buf_addr + buf_offset),
1109 					cpy_len, 0);
1110 			} else {
1111 				batch_copy[vq->batch_copy_nb_elems].dst =
1112 				(void *)((uintptr_t)(buf_addr + buf_offset));
1113 				batch_copy[vq->batch_copy_nb_elems].src =
1114 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
1115 				batch_copy[vq->batch_copy_nb_elems].log_addr =
1116 					buf_iova + buf_offset;
1117 				batch_copy[vq->batch_copy_nb_elems].len =
1118 					cpy_len;
1119 				vq->batch_copy_nb_elems++;
1120 			}
1121 
1122 			mbuf_avail  -= cpy_len;
1123 			mbuf_offset += cpy_len;
1124 			buf_avail  -= cpy_len;
1125 			buf_offset += cpy_len;
1126 		}
1127 
1128 	}
1129 
1130 out:
1131 	async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
1132 	async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
1133 
1134 	return error;
1135 }
1136 
1137 static __rte_always_inline int
1138 vhost_enqueue_single_packed(struct virtio_net *dev,
1139 			    struct vhost_virtqueue *vq,
1140 			    struct rte_mbuf *pkt,
1141 			    struct buf_vector *buf_vec,
1142 			    uint16_t *nr_descs)
1143 {
1144 	uint16_t nr_vec = 0;
1145 	uint16_t avail_idx = vq->last_avail_idx;
1146 	uint16_t max_tries, tries = 0;
1147 	uint16_t buf_id = 0;
1148 	uint32_t len = 0;
1149 	uint16_t desc_count;
1150 	uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1151 	uint16_t num_buffers = 0;
1152 	uint32_t buffer_len[vq->size];
1153 	uint16_t buffer_buf_id[vq->size];
1154 	uint16_t buffer_desc_count[vq->size];
1155 
1156 	if (rxvq_is_mergeable(dev))
1157 		max_tries = vq->size - 1;
1158 	else
1159 		max_tries = 1;
1160 
1161 	while (size > 0) {
1162 		/*
1163 		 * If we have tried all available ring items and still
1164 		 * cannot get enough buffers, something abnormal has
1165 		 * happened.
1166 		 */
1167 		if (unlikely(++tries > max_tries))
1168 			return -1;
1169 
1170 		if (unlikely(fill_vec_buf_packed(dev, vq,
1171 						avail_idx, &desc_count,
1172 						buf_vec, &nr_vec,
1173 						&buf_id, &len,
1174 						VHOST_ACCESS_RW) < 0))
1175 			return -1;
1176 
1177 		len = RTE_MIN(len, size);
1178 		size -= len;
1179 
1180 		buffer_len[num_buffers] = len;
1181 		buffer_buf_id[num_buffers] = buf_id;
1182 		buffer_desc_count[num_buffers] = desc_count;
1183 		num_buffers += 1;
1184 
1185 		*nr_descs += desc_count;
1186 		avail_idx += desc_count;
1187 		if (avail_idx >= vq->size)
1188 			avail_idx -= vq->size;
1189 	}
1190 
1191 	if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
1192 		return -1;
1193 
1194 	vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1195 					   buffer_desc_count, num_buffers);
1196 
1197 	return 0;
1198 }
1199 
1200 static __rte_noinline uint32_t
1201 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1202 	struct rte_mbuf **pkts, uint32_t count)
1203 {
1204 	uint32_t pkt_idx = 0;
1205 	uint16_t num_buffers;
1206 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1207 	uint16_t avail_head;
1208 
1209 	/*
1210 	 * The ordering between avail index and
1211 	 * desc reads needs to be enforced.
1212 	 */
1213 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1214 
1215 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1216 
1217 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1218 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1219 		uint16_t nr_vec = 0;
1220 
1221 		if (unlikely(reserve_avail_buf_split(dev, vq,
1222 						pkt_len, buf_vec, &num_buffers,
1223 						avail_head, &nr_vec) < 0)) {
1224 			VHOST_LOG_DATA(DEBUG,
1225 				"(%d) failed to get enough desc from vring\n",
1226 				dev->vid);
1227 			vq->shadow_used_idx -= num_buffers;
1228 			break;
1229 		}
1230 
1231 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1232 			dev->vid, vq->last_avail_idx,
1233 			vq->last_avail_idx + num_buffers);
1234 
1235 		if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1236 						buf_vec, nr_vec,
1237 						num_buffers) < 0) {
1238 			vq->shadow_used_idx -= num_buffers;
1239 			break;
1240 		}
1241 
1242 		vq->last_avail_idx += num_buffers;
1243 	}
1244 
1245 	do_data_copy_enqueue(dev, vq);
1246 
1247 	if (likely(vq->shadow_used_idx)) {
1248 		flush_shadow_used_ring_split(dev, vq);
1249 		vhost_vring_call_split(dev, vq);
1250 	}
1251 
1252 	return pkt_idx;
1253 }
1254 
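/*
 * Fast path: enqueue PACKED_BATCH_SIZE packets at once. It requires an
 * aligned avail index, single-segment mbufs and descriptors that are
 * all available and large enough; on any mismatch it returns -1 and the
 * caller falls back to the single-packet path.
 */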
1255 static __rte_always_inline int
1256 virtio_dev_rx_batch_packed(struct virtio_net *dev,
1257 			   struct vhost_virtqueue *vq,
1258 			   struct rte_mbuf **pkts)
1259 {
1260 	bool wrap_counter = vq->avail_wrap_counter;
1261 	struct vring_packed_desc *descs = vq->desc_packed;
1262 	uint16_t avail_idx = vq->last_avail_idx;
1263 	uint64_t desc_addrs[PACKED_BATCH_SIZE];
1264 	struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1265 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1266 	uint64_t lens[PACKED_BATCH_SIZE];
1267 	uint16_t ids[PACKED_BATCH_SIZE];
1268 	uint16_t i;
1269 
1270 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
1271 		return -1;
1272 
1273 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1274 		return -1;
1275 
1276 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1277 		if (unlikely(pkts[i]->next != NULL))
1278 			return -1;
1279 		if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1280 					    wrap_counter)))
1281 			return -1;
1282 	}
1283 
1284 	rte_smp_rmb();
1285 
1286 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1287 		lens[i] = descs[avail_idx + i].len;
1288 
1289 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1290 		if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1291 			return -1;
1292 	}
1293 
1294 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1295 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1296 						  descs[avail_idx + i].addr,
1297 						  &lens[i],
1298 						  VHOST_ACCESS_RW);
1299 
1300 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1301 		if (unlikely(!desc_addrs[i]))
1302 			return -1;
1303 		if (unlikely(lens[i] != descs[avail_idx + i].len))
1304 			return -1;
1305 	}
1306 
1307 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1308 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1309 		hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1310 					(uintptr_t)desc_addrs[i];
1311 		lens[i] = pkts[i]->pkt_len +
1312 			sizeof(struct virtio_net_hdr_mrg_rxbuf);
1313 	}
1314 
1315 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1316 		virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1317 
1318 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1319 
1320 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1321 		rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1322 			   rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1323 			   pkts[i]->pkt_len);
1324 	}
1325 
1326 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1327 		vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1328 					   lens[i]);
1329 
1330 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1331 		ids[i] = descs[avail_idx + i].id;
1332 
1333 	vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1334 
1335 	return 0;
1336 }
1337 
1338 static __rte_always_inline int16_t
1339 virtio_dev_rx_single_packed(struct virtio_net *dev,
1340 			    struct vhost_virtqueue *vq,
1341 			    struct rte_mbuf *pkt)
1342 {
1343 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1344 	uint16_t nr_descs = 0;
1345 
1346 	rte_smp_rmb();
1347 	if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1348 						 &nr_descs) < 0)) {
1349 		VHOST_LOG_DATA(DEBUG,
1350 				"(%d) failed to get enough desc from vring\n",
1351 				dev->vid);
1352 		return -1;
1353 	}
1354 
1355 	VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1356 			dev->vid, vq->last_avail_idx,
1357 			vq->last_avail_idx + nr_descs);
1358 
1359 	vq_inc_last_avail_packed(vq, nr_descs);
1360 
1361 	return 0;
1362 }
1363 
1364 static __rte_noinline uint32_t
1365 virtio_dev_rx_packed(struct virtio_net *dev,
1366 		     struct vhost_virtqueue *__rte_restrict vq,
1367 		     struct rte_mbuf **__rte_restrict pkts,
1368 		     uint32_t count)
1369 {
1370 	uint32_t pkt_idx = 0;
1371 	uint32_t remained = count;
1372 
1373 	do {
1374 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1375 
1376 		if (remained >= PACKED_BATCH_SIZE) {
1377 			if (!virtio_dev_rx_batch_packed(dev, vq,
1378 							&pkts[pkt_idx])) {
1379 				pkt_idx += PACKED_BATCH_SIZE;
1380 				remained -= PACKED_BATCH_SIZE;
1381 				continue;
1382 			}
1383 		}
1384 
1385 		if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1386 			break;
1387 		pkt_idx++;
1388 		remained--;
1389 
1390 	} while (pkt_idx < count);
1391 
1392 	if (vq->shadow_used_idx) {
1393 		do_data_copy_enqueue(dev, vq);
1394 		vhost_flush_enqueue_shadow_packed(dev, vq);
1395 	}
1396 
1397 	if (pkt_idx)
1398 		vhost_vring_call_packed(dev, vq);
1399 
1400 	return pkt_idx;
1401 }
1402 
1403 static __rte_always_inline uint32_t
1404 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1405 	struct rte_mbuf **pkts, uint32_t count)
1406 {
1407 	struct vhost_virtqueue *vq;
1408 	uint32_t nb_tx = 0;
1409 
1410 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1411 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1412 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1413 			dev->vid, __func__, queue_id);
1414 		return 0;
1415 	}
1416 
1417 	vq = dev->virtqueue[queue_id];
1418 
1419 	rte_spinlock_lock(&vq->access_lock);
1420 
1421 	if (unlikely(vq->enabled == 0))
1422 		goto out_access_unlock;
1423 
1424 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1425 		vhost_user_iotlb_rd_lock(vq);
1426 
1427 	if (unlikely(vq->access_ok == 0))
1428 		if (unlikely(vring_translate(dev, vq) < 0))
1429 			goto out;
1430 
1431 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1432 	if (count == 0)
1433 		goto out;
1434 
1435 	if (vq_is_packed(dev))
1436 		nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1437 	else
1438 		nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1439 
1440 out:
1441 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1442 		vhost_user_iotlb_rd_unlock(vq);
1443 
1444 out_access_unlock:
1445 	rte_spinlock_unlock(&vq->access_lock);
1446 
1447 	return nb_tx;
1448 }
1449 
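/*
 * Public enqueue API. Illustrative usage sketch (not from this file;
 * the port id, queue ids and burst size are placeholders): forward a
 * burst received from a NIC into the guest RX queue (queue index 0)
 * and drop whatever the vring could not absorb:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_rx = rte_eth_rx_burst(0, 0, pkts, 32);
 *	uint16_t nb_enq = rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);
 *	while (nb_enq < nb_rx)
 *		rte_pktmbuf_free(pkts[nb_enq++]);
 */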
1450 uint16_t
1451 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1452 	struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1453 {
1454 	struct virtio_net *dev = get_device(vid);
1455 
1456 	if (!dev)
1457 		return 0;
1458 
1459 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1460 		VHOST_LOG_DATA(ERR,
1461 			"(%d) %s: built-in vhost net backend is disabled.\n",
1462 			dev->vid, __func__);
1463 		return 0;
1464 	}
1465 
1466 	return virtio_dev_rx(dev, queue_id, pkts, count);
1467 }
1468 
1469 static __rte_always_inline uint16_t
1470 virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
1471 	uint16_t vq_size, uint16_t n_inflight)
1472 {
1473 	return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
1474 		(vq_size - n_inflight + pkts_idx) & (vq_size - 1);
1475 }
1476 
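/*
 * Split-ring enqueue with async (DMA) copy offload: descriptors are
 * reserved as in the sync path, large copies are packed into iovec
 * descriptors and handed to async_ops.transfer_data once
 * VHOST_ASYNC_BATCH_THRESHOLD packets (or the end of the burst) is
 * reached. Packets the engine refuses are rolled back here; completed
 * copies are harvested later by rte_vhost_poll_enqueue_completed().
 */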
1477 static __rte_noinline uint32_t
1478 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
1479 	struct vhost_virtqueue *vq, uint16_t queue_id,
1480 	struct rte_mbuf **pkts, uint32_t count)
1481 {
1482 	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1483 	uint16_t num_buffers;
1484 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
1485 	uint16_t avail_head;
1486 
1487 	struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1488 	struct iovec *vec_pool = vq->vec_pool;
1489 	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1490 	struct iovec *src_iovec = vec_pool;
1491 	struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1492 	struct rte_vhost_iov_iter *src_it = it_pool;
1493 	struct rte_vhost_iov_iter *dst_it = it_pool + 1;
1494 	uint16_t n_free_slot, slot_idx = 0;
1495 	uint16_t pkt_err = 0;
1496 	uint16_t segs_await = 0;
1497 	struct async_inflight_info *pkts_info = vq->async_pkts_info;
1498 	int n_pkts = 0;
1499 
1500 	avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1501 
1502 	/*
1503 	 * The ordering between avail index and
1504 	 * desc reads needs to be enforced.
1505 	 */
1506 	rte_smp_rmb();
1507 
1508 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1509 
1510 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1511 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1512 		uint16_t nr_vec = 0;
1513 
1514 		if (unlikely(reserve_avail_buf_split(dev, vq,
1515 						pkt_len, buf_vec, &num_buffers,
1516 						avail_head, &nr_vec) < 0)) {
1517 			VHOST_LOG_DATA(DEBUG,
1518 				"(%d) failed to get enough desc from vring\n",
1519 				dev->vid);
1520 			vq->shadow_used_idx -= num_buffers;
1521 			break;
1522 		}
1523 
1524 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1525 			dev->vid, vq->last_avail_idx,
1526 			vq->last_avail_idx + num_buffers);
1527 
1528 		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1529 				buf_vec, nr_vec, num_buffers,
1530 				src_iovec, dst_iovec, src_it, dst_it) < 0) {
1531 			vq->shadow_used_idx -= num_buffers;
1532 			break;
1533 		}
1534 
1535 		slot_idx = (vq->async_pkts_idx + pkt_idx) & (vq->size - 1);
1536 		if (src_it->count) {
1537 			async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
1538 			pkt_burst_idx++;
1539 			pkts_info[slot_idx].descs = num_buffers;
1540 			pkts_info[slot_idx].segs = src_it->nr_segs;
1541 			src_iovec += src_it->nr_segs;
1542 			dst_iovec += dst_it->nr_segs;
1543 			src_it += 2;
1544 			dst_it += 2;
1545 			segs_await += src_it->nr_segs;
1546 		} else {
1547 			pkts_info[slot_idx].info = num_buffers;
1548 			vq->async_pkts_inflight_n++;
1549 		}
1550 
1551 		vq->last_avail_idx += num_buffers;
1552 
1553 		/*
1554 		 * conditions to trigger async device transfer:
1555 		 * - buffered packet number reaches transfer threshold
1556 		 * - this is the last packet in the burst enqueue
1557 		 * - unused async iov number is less than max vhost vector
1558 		 */
1559 		if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1560 			(pkt_idx == count - 1 && pkt_burst_idx) ||
1561 			(VHOST_MAX_ASYNC_VEC / 2 - segs_await <
1562 			BUF_VECTOR_MAX)) {
1563 			n_pkts = vq->async_ops.transfer_data(dev->vid,
1564 					queue_id, tdes, 0, pkt_burst_idx);
1565 			src_iovec = vec_pool;
1566 			dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1567 			src_it = it_pool;
1568 			dst_it = it_pool + 1;
1569 			segs_await = 0;
1570 			vq->async_pkts_inflight_n += pkt_burst_idx;
1571 
1572 			if (unlikely(n_pkts < (int)pkt_burst_idx)) {
1573 				/*
1574 				 * log error packets number here and do actual
1575 				 * error processing when applications poll
1576 				 * completion
1577 				 */
1578 				pkt_err = pkt_burst_idx - n_pkts;
1579 				pkt_burst_idx = 0;
1580 				break;
1581 			}
1582 
1583 			pkt_burst_idx = 0;
1584 		}
1585 	}
1586 
1587 	if (pkt_burst_idx) {
1588 		n_pkts = vq->async_ops.transfer_data(dev->vid,
1589 				queue_id, tdes, 0, pkt_burst_idx);
1590 		vq->async_pkts_inflight_n += pkt_burst_idx;
1591 
1592 		if (unlikely(n_pkts < (int)pkt_burst_idx))
1593 			pkt_err = pkt_burst_idx - n_pkts;
1594 	}
1595 
1596 	do_data_copy_enqueue(dev, vq);
1597 
1598 	while (unlikely(pkt_err && pkt_idx)) {
1599 		if (pkts_info[slot_idx].segs)
1600 			pkt_err--;
1601 		vq->last_avail_idx -= pkts_info[slot_idx].descs;
1602 		vq->shadow_used_idx -= pkts_info[slot_idx].descs;
1603 		vq->async_pkts_inflight_n--;
1604 		slot_idx = (slot_idx - 1) & (vq->size - 1);
1605 		pkt_idx--;
1606 	}
1607 
1608 	n_free_slot = vq->size - vq->async_pkts_idx;
1609 	if (n_free_slot > pkt_idx) {
1610 		rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
1611 			pkts, pkt_idx * sizeof(uintptr_t));
1612 		vq->async_pkts_idx += pkt_idx;
1613 	} else {
1614 		rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
1615 			pkts, n_free_slot * sizeof(uintptr_t));
1616 		rte_memcpy(&vq->async_pkts_pending[0],
1617 			&pkts[n_free_slot],
1618 			(pkt_idx - n_free_slot) * sizeof(uintptr_t));
1619 		vq->async_pkts_idx = pkt_idx - n_free_slot;
1620 	}
1621 
1622 	if (likely(vq->shadow_used_idx))
1623 		async_flush_shadow_used_ring_split(dev, vq);
1624 
1625 	return pkt_idx;
1626 }
1627 
1628 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
1629 		struct rte_mbuf **pkts, uint16_t count)
1630 {
1631 	struct virtio_net *dev = get_device(vid);
1632 	struct vhost_virtqueue *vq;
1633 	uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
1634 	uint16_t start_idx, pkts_idx, vq_size;
1635 	uint16_t n_inflight;
1636 	struct async_inflight_info *pkts_info;
1637 
1638 	if (!dev)
1639 		return 0;
1640 
1641 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1642 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1643 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1644 			dev->vid, __func__, queue_id);
1645 		return 0;
1646 	}
1647 
1648 	vq = dev->virtqueue[queue_id];
1649 
1650 	if (unlikely(!vq->async_registered)) {
1651 		VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
1652 			dev->vid, __func__, queue_id);
1653 		return 0;
1654 	}
1655 
1656 	rte_spinlock_lock(&vq->access_lock);
1657 
1658 	n_inflight = vq->async_pkts_inflight_n;
1659 	pkts_idx = vq->async_pkts_idx;
1660 	pkts_info = vq->async_pkts_info;
1661 	vq_size = vq->size;
1662 	start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
1663 		vq_size, vq->async_pkts_inflight_n);
1664 
1665 	if (count > vq->async_last_pkts_n)
1666 		n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
1667 			queue_id, 0, count - vq->async_last_pkts_n);
1668 	n_pkts_cpl += vq->async_last_pkts_n;
1669 
1670 	rte_smp_wmb();
1671 
1672 	while (likely((n_pkts_put < count) && n_inflight)) {
1673 		uint16_t info_idx = (start_idx + n_pkts_put) & (vq_size - 1);
1674 		if (n_pkts_cpl && pkts_info[info_idx].segs)
1675 			n_pkts_cpl--;
1676 		else if (!n_pkts_cpl && pkts_info[info_idx].segs)
1677 			break;
1678 		n_pkts_put++;
1679 		n_inflight--;
1680 		n_descs += pkts_info[info_idx].descs;
1681 	}
1682 
1683 	vq->async_last_pkts_n = n_pkts_cpl;
1684 
1685 	if (n_pkts_put) {
1686 		vq->async_pkts_inflight_n = n_inflight;
1687 		if (likely(vq->enabled && vq->access_ok)) {
1688 			__atomic_add_fetch(&vq->used->idx,
1689 					n_descs, __ATOMIC_RELEASE);
1690 			vhost_vring_call_split(dev, vq);
1691 		}
1692 
1693 		if (start_idx + n_pkts_put <= vq_size) {
1694 			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
1695 				n_pkts_put * sizeof(uintptr_t));
1696 		} else {
1697 			rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
1698 				(vq_size - start_idx) * sizeof(uintptr_t));
1699 			rte_memcpy(&pkts[vq_size - start_idx],
1700 				vq->async_pkts_pending,
1701 				(n_pkts_put + start_idx - vq_size) *
1702 				sizeof(uintptr_t));
1703 		}
1704 	}
1705 
1706 	rte_spinlock_unlock(&vq->access_lock);
1707 
1708 	return n_pkts_put;
1709 }
1710 
1711 static __rte_always_inline uint32_t
1712 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
1713 	struct rte_mbuf **pkts, uint32_t count)
1714 {
1715 	struct vhost_virtqueue *vq;
1716 	uint32_t nb_tx = 0;
1717 
1718 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1719 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1720 		VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1721 			dev->vid, __func__, queue_id);
1722 		return 0;
1723 	}
1724 
1725 	vq = dev->virtqueue[queue_id];
1726 
1727 	rte_spinlock_lock(&vq->access_lock);
1728 
1729 	if (unlikely(vq->enabled == 0 || !vq->async_registered))
1730 		goto out_access_unlock;
1731 
1732 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1733 		vhost_user_iotlb_rd_lock(vq);
1734 
1735 	if (unlikely(vq->access_ok == 0))
1736 		if (unlikely(vring_translate(dev, vq) < 0))
1737 			goto out;
1738 
1739 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1740 	if (count == 0)
1741 		goto out;
1742 
1743 	/* TODO: packed queue not implemented */
1744 	if (vq_is_packed(dev))
1745 		nb_tx = 0;
1746 	else
1747 		nb_tx = virtio_dev_rx_async_submit_split(dev,
1748 				vq, queue_id, pkts, count);
1749 
1750 out:
1751 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1752 		vhost_user_iotlb_rd_unlock(vq);
1753 
1754 out_access_unlock:
1755 	rte_spinlock_unlock(&vq->access_lock);
1756 
1757 	return nb_tx;
1758 }
1759 
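/*
 * Public entry point of the async enqueue path. A minimal usage sketch
 * (an illustration only, not taken from this file; it assumes an async
 * copy channel was registered for the queue beforehand, e.g. with
 * rte_vhost_async_channel_register(), and "done" / "nb_pkts" are the
 * application's own variables):
 *
 *	uint16_t sent = rte_vhost_submit_enqueue_burst(vid, queue_id,
 *						       pkts, nb_pkts);
 *	...
 *	struct rte_mbuf *done[32];
 *	uint16_t n = rte_vhost_poll_enqueue_completed(vid, queue_id,
 *						      done, 32);
 *	rte_pktmbuf_free_bulk(done, n);
 *
 * Completed mbufs are returned by rte_vhost_poll_enqueue_completed() and
 * must be freed by the caller.
 */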
1760 uint16_t
1761 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
1762 		struct rte_mbuf **pkts, uint16_t count)
1763 {
1764 	struct virtio_net *dev = get_device(vid);
1765 
1766 	if (!dev)
1767 		return 0;
1768 
1769 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1770 		VHOST_LOG_DATA(ERR,
1771 			"(%d) %s: built-in vhost net backend is disabled.\n",
1772 			dev->vid, __func__);
1773 		return 0;
1774 	}
1775 
1776 	return virtio_dev_rx_async_submit(dev, queue_id, pkts, count);
1777 }
1778 
1779 static inline bool
1780 virtio_net_with_host_offload(struct virtio_net *dev)
1781 {
1782 	if (dev->features &
1783 			((1ULL << VIRTIO_NET_F_CSUM) |
1784 			 (1ULL << VIRTIO_NET_F_HOST_ECN) |
1785 			 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
1786 			 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
1787 			 (1ULL << VIRTIO_NET_F_HOST_UFO)))
1788 		return true;
1789 
1790 	return false;
1791 }
1792 
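/*
 * Parse the Ethernet (and optional single VLAN) header plus the L3 header
 * of an mbuf coming from the guest: fill in m->l2_len/l3_len, set the
 * PKT_TX_IPV4/IPV6 flag, and return the L4 protocol id and a pointer to
 * the L4 header so the caller can translate checksum/GSO hints.
 */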
1793 static void
1794 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
1795 {
1796 	struct rte_ipv4_hdr *ipv4_hdr;
1797 	struct rte_ipv6_hdr *ipv6_hdr;
1798 	void *l3_hdr = NULL;
1799 	struct rte_ether_hdr *eth_hdr;
1800 	uint16_t ethertype;
1801 
1802 	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1803 
1804 	m->l2_len = sizeof(struct rte_ether_hdr);
1805 	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
1806 
1807 	if (ethertype == RTE_ETHER_TYPE_VLAN) {
1808 		struct rte_vlan_hdr *vlan_hdr =
1809 			(struct rte_vlan_hdr *)(eth_hdr + 1);
1810 
1811 		m->l2_len += sizeof(struct rte_vlan_hdr);
1812 		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
1813 	}
1814 
1815 	l3_hdr = (char *)eth_hdr + m->l2_len;
1816 
1817 	switch (ethertype) {
1818 	case RTE_ETHER_TYPE_IPV4:
1819 		ipv4_hdr = l3_hdr;
1820 		*l4_proto = ipv4_hdr->next_proto_id;
1821 		m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
1822 		*l4_hdr = (char *)l3_hdr + m->l3_len;
1823 		m->ol_flags |= PKT_TX_IPV4;
1824 		break;
1825 	case RTE_ETHER_TYPE_IPV6:
1826 		ipv6_hdr = l3_hdr;
1827 		*l4_proto = ipv6_hdr->proto;
1828 		m->l3_len = sizeof(struct rte_ipv6_hdr);
1829 		*l4_hdr = (char *)l3_hdr + m->l3_len;
1830 		m->ol_flags |= PKT_TX_IPV6;
1831 		break;
1832 	default:
1833 		m->l3_len = 0;
1834 		*l4_proto = 0;
1835 		*l4_hdr = NULL;
1836 		break;
1837 	}
1838 }
1839 
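/*
 * Translate the offload hints carried in the virtio-net header of a packet
 * sent by the guest into mbuf offload flags: NEEDS_CSUM becomes the
 * matching PKT_TX_*_CKSUM flag, and a GSO type becomes PKT_TX_TCP_SEG or
 * PKT_TX_UDP_SEG together with tso_segsz and l4_len.
 */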
1840 static __rte_always_inline void
1841 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
1842 {
1843 	uint16_t l4_proto = 0;
1844 	void *l4_hdr = NULL;
1845 	struct rte_tcp_hdr *tcp_hdr = NULL;
1846 
1847 	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
1848 		return;
1849 
1850 	parse_ethernet(m, &l4_proto, &l4_hdr);
1851 	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1852 		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
1853 			switch (hdr->csum_offset) {
1854 			case (offsetof(struct rte_tcp_hdr, cksum)):
1855 				if (l4_proto == IPPROTO_TCP)
1856 					m->ol_flags |= PKT_TX_TCP_CKSUM;
1857 				break;
1858 			case (offsetof(struct rte_udp_hdr, dgram_cksum)):
1859 				if (l4_proto == IPPROTO_UDP)
1860 					m->ol_flags |= PKT_TX_UDP_CKSUM;
1861 				break;
1862 			case (offsetof(struct rte_sctp_hdr, cksum)):
1863 				if (l4_proto == IPPROTO_SCTP)
1864 					m->ol_flags |= PKT_TX_SCTP_CKSUM;
1865 				break;
1866 			default:
1867 				break;
1868 			}
1869 		}
1870 	}
1871 
1872 	if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1873 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1874 		case VIRTIO_NET_HDR_GSO_TCPV4:
1875 		case VIRTIO_NET_HDR_GSO_TCPV6:
1876 			tcp_hdr = l4_hdr;
1877 			m->ol_flags |= PKT_TX_TCP_SEG;
1878 			m->tso_segsz = hdr->gso_size;
1879 			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1880 			break;
1881 		case VIRTIO_NET_HDR_GSO_UDP:
1882 			m->ol_flags |= PKT_TX_UDP_SEG;
1883 			m->tso_segsz = hdr->gso_size;
1884 			m->l4_len = sizeof(struct rte_udp_hdr);
1885 			break;
1886 		default:
1887 			VHOST_LOG_DATA(WARNING,
1888 				"unsupported gso type %u.\n", hdr->gso_type);
1889 			break;
1890 		}
1891 	}
1892 }
1893 
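/*
 * The virtio-net header may be scattered across several descriptor
 * buffers; gather it piece by piece into a contiguous local copy.
 */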
1894 static __rte_noinline void
1895 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
1896 		struct buf_vector *buf_vec)
1897 {
1898 	uint64_t len;
1899 	uint64_t remain = sizeof(struct virtio_net_hdr);
1900 	uint64_t src;
1901 	uint64_t dst = (uint64_t)(uintptr_t)hdr;
1902 
1903 	while (remain) {
1904 		len = RTE_MIN(remain, buf_vec->buf_len);
1905 		src = buf_vec->buf_addr;
1906 		rte_memcpy((void *)(uintptr_t)dst,
1907 				(void *)(uintptr_t)src, len);
1908 
1909 		remain -= len;
1910 		dst += len;
1911 		buf_vec++;
1912 	}
1913 }
1914 
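/*
 * Copy one descriptor chain from the guest into an mbuf (chaining extra
 * mbufs from mbuf_pool when the data does not fit), skipping the
 * virtio-net header and finally applying its offload hints to the mbuf.
 * Small copies are deferred to the batch_copy array and flushed later by
 * do_data_copy_dequeue().
 */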
1915 static __rte_always_inline int
1916 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
1917 		  struct buf_vector *buf_vec, uint16_t nr_vec,
1918 		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
1919 {
1920 	uint32_t buf_avail, buf_offset;
1921 	uint64_t buf_addr, buf_len;
1922 	uint32_t mbuf_avail, mbuf_offset;
1923 	uint32_t cpy_len;
1924 	struct rte_mbuf *cur = m, *prev = m;
1925 	struct virtio_net_hdr tmp_hdr;
1926 	struct virtio_net_hdr *hdr = NULL;
1927 	/* A counter to avoid a dead loop in the desc chain */
1928 	uint16_t vec_idx = 0;
1929 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1930 	int error = 0;
1931 
1932 	buf_addr = buf_vec[vec_idx].buf_addr;
1933 	buf_len = buf_vec[vec_idx].buf_len;
1934 
1935 	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
1936 		error = -1;
1937 		goto out;
1938 	}
1939 
1940 	if (virtio_net_with_host_offload(dev)) {
1941 		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
1942 			/*
1943 			 * No luck, the virtio-net header doesn't fit
1944 			 * in a contiguous virtual area.
1945 			 */
1946 			copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
1947 			hdr = &tmp_hdr;
1948 		} else {
1949 			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
1950 		}
1951 	}
1952 
1953 	/*
1954 	 * A virtio driver normally uses at least 2 desc buffers
1955 	 * for Tx: the first for storing the header, and the others
1956 	 * for storing the data.
1957 	 */
1958 	if (unlikely(buf_len < dev->vhost_hlen)) {
1959 		buf_offset = dev->vhost_hlen - buf_len;
1960 		vec_idx++;
1961 		buf_addr = buf_vec[vec_idx].buf_addr;
1962 		buf_len = buf_vec[vec_idx].buf_len;
1963 		buf_avail  = buf_len - buf_offset;
1964 	} else if (buf_len == dev->vhost_hlen) {
1965 		if (unlikely(++vec_idx >= nr_vec))
1966 			goto out;
1967 		buf_addr = buf_vec[vec_idx].buf_addr;
1968 		buf_len = buf_vec[vec_idx].buf_len;
1969 
1970 		buf_offset = 0;
1971 		buf_avail = buf_len;
1972 	} else {
1973 		buf_offset = dev->vhost_hlen;
1974 		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
1975 	}
1976 
1977 	PRINT_PACKET(dev,
1978 			(uintptr_t)(buf_addr + buf_offset),
1979 			(uint32_t)buf_avail, 0);
1980 
1981 	mbuf_offset = 0;
1982 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1983 	while (1) {
1984 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1985 
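		/*
		 * Copy immediately when the segment is large, when the batch
		 * array is full, or when this is the head mbuf of a packet
		 * that carries a virtio-net header: the offload parsing at
		 * the end of this function reads the packet headers before
		 * the batched copies are flushed. Everything else is
		 * deferred to the batch_copy array.
		 */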
1986 		if (likely(cpy_len > MAX_BATCH_LEN ||
1987 					vq->batch_copy_nb_elems >= vq->size ||
1988 					(hdr && cur == m))) {
1989 			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1990 						mbuf_offset),
1991 					(void *)((uintptr_t)(buf_addr +
1992 							buf_offset)), cpy_len);
1993 		} else {
1994 			batch_copy[vq->batch_copy_nb_elems].dst =
1995 				rte_pktmbuf_mtod_offset(cur, void *,
1996 						mbuf_offset);
1997 			batch_copy[vq->batch_copy_nb_elems].src =
1998 				(void *)((uintptr_t)(buf_addr + buf_offset));
1999 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
2000 			vq->batch_copy_nb_elems++;
2001 		}
2002 
2003 		mbuf_avail  -= cpy_len;
2004 		mbuf_offset += cpy_len;
2005 		buf_avail -= cpy_len;
2006 		buf_offset += cpy_len;
2007 
2008 		/* This buf reaches its end, get the next one */
2009 		if (buf_avail == 0) {
2010 			if (++vec_idx >= nr_vec)
2011 				break;
2012 
2013 			buf_addr = buf_vec[vec_idx].buf_addr;
2014 			buf_len = buf_vec[vec_idx].buf_len;
2015 
2016 			buf_offset = 0;
2017 			buf_avail  = buf_len;
2018 
2019 			PRINT_PACKET(dev, (uintptr_t)buf_addr,
2020 					(uint32_t)buf_avail, 0);
2021 		}
2022 
2023 		/*
2024 		 * This mbuf reaches its end, allocate a new one
2025 		 * to hold more data.
2026 		 */
2027 		if (mbuf_avail == 0) {
2028 			cur = rte_pktmbuf_alloc(mbuf_pool);
2029 			if (unlikely(cur == NULL)) {
2030 				VHOST_LOG_DATA(ERR, "Failed to "
2031 					"allocate memory for mbuf.\n");
2032 				error = -1;
2033 				goto out;
2034 			}
2035 
2036 			prev->next = cur;
2037 			prev->data_len = mbuf_offset;
2038 			m->nb_segs += 1;
2039 			m->pkt_len += mbuf_offset;
2040 			prev = cur;
2041 
2042 			mbuf_offset = 0;
2043 			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2044 		}
2045 	}
2046 
2047 	prev->data_len = mbuf_offset;
2048 	m->pkt_len    += mbuf_offset;
2049 
2050 	if (hdr)
2051 		vhost_dequeue_offload(hdr, m);
2052 
2053 out:
2054 
2055 	return error;
2056 }
2057 
2058 static void
2059 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2060 {
2061 	rte_free(opaque);
2062 }
2063 
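/*
 * Attach an rte_malloc'ed external buffer (headroom + data + shared info)
 * to the mbuf so that a packet larger than the mempool's data room can
 * still be received into a single, linear buffer.
 */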
2064 static int
2065 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
2066 {
2067 	struct rte_mbuf_ext_shared_info *shinfo = NULL;
2068 	uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2069 	uint16_t buf_len;
2070 	rte_iova_t iova;
2071 	void *buf;
2072 
2073 	total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2074 	total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2075 
2076 	if (unlikely(total_len > UINT16_MAX))
2077 		return -ENOSPC;
2078 
2079 	buf_len = total_len;
2080 	buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2081 	if (unlikely(buf == NULL))
2082 		return -ENOMEM;
2083 
2084 	/* Initialize shinfo */
2085 	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2086 						virtio_dev_extbuf_free, buf);
2087 	if (unlikely(shinfo == NULL)) {
2088 		rte_free(buf);
2089 		VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
2090 		return -1;
2091 	}
2092 
2093 	iova = rte_malloc_virt2iova(buf);
2094 	rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2095 	rte_pktmbuf_reset_headroom(pkt);
2096 
2097 	return 0;
2098 }
2099 
2100 /*
2101  * Allocate a pktmbuf large enough to hold data_len bytes of packet data.
2102  */
2103 static __rte_always_inline struct rte_mbuf *
2104 virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
2105 			 uint32_t data_len)
2106 {
2107 	struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
2108 
2109 	if (unlikely(pkt == NULL)) {
2110 		VHOST_LOG_DATA(ERR,
2111 			"Failed to allocate memory for mbuf.\n");
2112 		return NULL;
2113 	}
2114 
2115 	if (rte_pktmbuf_tailroom(pkt) >= data_len)
2116 		return pkt;
2117 
2118 	/* attach an external buffer if supported */
2119 	if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
2120 		return pkt;
2121 
2122 	/* check if chained buffers are allowed */
2123 	if (!dev->linearbuf)
2124 		return pkt;
2125 
2126 	/* Data doesn't fit into the buffer and the host supports
2127 	 * only linear buffers
2128 	 */
2129 	rte_pktmbuf_free(pkt);
2130 
2131 	return NULL;
2132 }
2133 
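/*
 * Dequeue (guest Tx) path for split rings: read up to "count" available
 * descriptor chains, copy each one into a freshly allocated mbuf, record
 * the used entries in the shadow ring, then flush the shadow ring and
 * kick the guest in one go.
 */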
2134 static __rte_noinline uint16_t
2135 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2136 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
2137 {
2138 	uint16_t i;
2139 	uint16_t free_entries;
2140 	uint16_t dropped = 0;
2141 	static bool allocerr_warned;
2142 
2143 	/*
2144 	 * The ordering between avail index and
2145 	 * desc reads needs to be enforced.
2146 	 */
2147 	free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2148 			vq->last_avail_idx;
2149 	if (free_entries == 0)
2150 		return 0;
2151 
2152 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2153 
2154 	VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2155 
2156 	count = RTE_MIN(count, MAX_PKT_BURST);
2157 	count = RTE_MIN(count, free_entries);
2158 	VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
2159 			dev->vid, count);
2160 
2161 	for (i = 0; i < count; i++) {
2162 		struct buf_vector buf_vec[BUF_VECTOR_MAX];
2163 		uint16_t head_idx;
2164 		uint32_t buf_len;
2165 		uint16_t nr_vec = 0;
2166 		int err;
2167 
2168 		if (unlikely(fill_vec_buf_split(dev, vq,
2169 						vq->last_avail_idx + i,
2170 						&nr_vec, buf_vec,
2171 						&head_idx, &buf_len,
2172 						VHOST_ACCESS_RO) < 0))
2173 			break;
2174 
2175 		update_shadow_used_ring_split(vq, head_idx, 0);
2176 
2177 		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
2178 		if (unlikely(pkts[i] == NULL)) {
2179 			/*
2180 			 * mbuf allocation fails for jumbo packets when an
2181 			 * external buffer is not allowed and a linear buffer
2182 			 * is required. Drop this packet.
2183 			 */
2184 			if (!allocerr_warned) {
2185 				VHOST_LOG_DATA(ERR,
2186 					"Failed mbuf alloc of size %d from %s on %s.\n",
2187 					buf_len, mbuf_pool->name, dev->ifname);
2188 				allocerr_warned = true;
2189 			}
2190 			dropped += 1;
2191 			i++;
2192 			break;
2193 		}
2194 
2195 		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2196 				mbuf_pool);
2197 		if (unlikely(err)) {
2198 			rte_pktmbuf_free(pkts[i]);
2199 			if (!allocerr_warned) {
2200 				VHOST_LOG_DATA(ERR,
2201 					"Failed to copy desc to mbuf on %s.\n",
2202 					dev->ifname);
2203 				allocerr_warned = true;
2204 			}
2205 			dropped += 1;
2206 			i++;
2207 			break;
2208 		}
2209 	}
2210 
2211 	vq->last_avail_idx += i;
2212 
2213 	do_data_copy_dequeue(vq);
2214 	if (unlikely(i < count))
2215 		vq->shadow_used_idx = i;
2216 	if (likely(vq->shadow_used_idx)) {
2217 		flush_shadow_used_ring_split(dev, vq);
2218 		vhost_vring_call_split(dev, vq);
2219 	}
2220 
2221 	return (i - dropped);
2222 }
2223 
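/*
 * Check whether a full batch of PACKED_BATCH_SIZE descriptors can be
 * dequeued at once: all of them must be available (and not used) for the
 * current wrap counter, single-descriptor, translatable to host virtual
 * addresses, and small enough to fit the allocated mbufs. Any failure
 * makes the caller fall back to the single-packet path.
 */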
2224 static __rte_always_inline int
2225 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
2226 				 struct vhost_virtqueue *vq,
2227 				 struct rte_mempool *mbuf_pool,
2228 				 struct rte_mbuf **pkts,
2229 				 uint16_t avail_idx,
2230 				 uintptr_t *desc_addrs,
2231 				 uint16_t *ids)
2232 {
2233 	bool wrap = vq->avail_wrap_counter;
2234 	struct vring_packed_desc *descs = vq->desc_packed;
2235 	struct virtio_net_hdr *hdr;
2236 	uint64_t lens[PACKED_BATCH_SIZE];
2237 	uint64_t buf_lens[PACKED_BATCH_SIZE];
2238 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2239 	uint16_t flags, i;
2240 
2241 	if (unlikely(avail_idx & PACKED_BATCH_MASK))
2242 		return -1;
2243 	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
2244 		return -1;
2245 
2246 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2247 		flags = descs[avail_idx + i].flags;
2248 		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
2249 			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
2250 			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
2251 			return -1;
2252 	}
2253 
2254 	rte_smp_rmb();
2255 
2256 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2257 		lens[i] = descs[avail_idx + i].len;
2258 
2259 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2260 		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
2261 						  descs[avail_idx + i].addr,
2262 						  &lens[i], VHOST_ACCESS_RW);
2263 	}
2264 
2265 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2266 		if (unlikely(!desc_addrs[i]))
2267 			return -1;
2268 		if (unlikely((lens[i] != descs[avail_idx + i].len)))
2269 			return -1;
2270 	}
2271 
2272 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2273 		pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, lens[i]);
2274 		if (!pkts[i])
2275 			goto free_buf;
2276 	}
2277 
2278 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2279 		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
2280 
2281 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2282 		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
2283 			goto free_buf;
2284 	}
2285 
2286 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2287 		pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset;
2288 		pkts[i]->data_len = pkts[i]->pkt_len;
2289 		ids[i] = descs[avail_idx + i].id;
2290 	}
2291 
2292 	if (virtio_net_with_host_offload(dev)) {
2293 		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2294 			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
2295 			vhost_dequeue_offload(hdr, pkts[i]);
2296 		}
2297 	}
2298 
2299 	return 0;
2300 
2301 free_buf:
2302 	for (i = 0; i < PACKED_BATCH_SIZE; i++)
2303 		rte_pktmbuf_free(pkts[i]);
2304 
2305 	return -1;
2306 }
2307 
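/*
 * Fast path: dequeue PACKED_BATCH_SIZE packets at once with unrolled
 * copies, then record them in the shadow used ring and advance the avail
 * index by a whole batch.
 */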
2308 static __rte_always_inline int
2309 virtio_dev_tx_batch_packed(struct virtio_net *dev,
2310 			   struct vhost_virtqueue *vq,
2311 			   struct rte_mempool *mbuf_pool,
2312 			   struct rte_mbuf **pkts)
2313 {
2314 	uint16_t avail_idx = vq->last_avail_idx;
2315 	uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2316 	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
2317 	uint16_t ids[PACKED_BATCH_SIZE];
2318 	uint16_t i;
2319 
2320 	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
2321 					     avail_idx, desc_addrs, ids))
2322 		return -1;
2323 
2324 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2325 		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
2326 
2327 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2328 		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
2329 			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
2330 			   pkts[i]->pkt_len);
2331 
2332 	if (virtio_net_is_inorder(dev))
2333 		vhost_shadow_dequeue_batch_packed_inorder(vq,
2334 			ids[PACKED_BATCH_SIZE - 1]);
2335 	else
2336 		vhost_shadow_dequeue_batch_packed(dev, vq, ids);
2337 
2338 	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
2339 
2340 	return 0;
2341 }
2342 
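/*
 * Dequeue a single descriptor chain from a packed ring into one mbuf,
 * returning its buffer id and the number of descriptors it used.
 */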
2343 static __rte_always_inline int
2344 vhost_dequeue_single_packed(struct virtio_net *dev,
2345 			    struct vhost_virtqueue *vq,
2346 			    struct rte_mempool *mbuf_pool,
2347 			    struct rte_mbuf **pkts,
2348 			    uint16_t *buf_id,
2349 			    uint16_t *desc_count)
2350 {
2351 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
2352 	uint32_t buf_len;
2353 	uint16_t nr_vec = 0;
2354 	int err;
2355 	static bool allocerr_warned;
2356 
2357 	if (unlikely(fill_vec_buf_packed(dev, vq,
2358 					 vq->last_avail_idx, desc_count,
2359 					 buf_vec, &nr_vec,
2360 					 buf_id, &buf_len,
2361 					 VHOST_ACCESS_RO) < 0))
2362 		return -1;
2363 
2364 	*pkts = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
2365 	if (unlikely(*pkts == NULL)) {
2366 		if (!allocerr_warned) {
2367 			VHOST_LOG_DATA(ERR,
2368 				"Failed mbuf alloc of size %d from %s on %s.\n",
2369 				buf_len, mbuf_pool->name, dev->ifname);
2370 			allocerr_warned = true;
2371 		}
2372 		return -1;
2373 	}
2374 
2375 	err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts,
2376 				mbuf_pool);
2377 	if (unlikely(err)) {
2378 		if (!allocerr_warned) {
2379 			VHOST_LOG_DATA(ERR,
2380 				"Failed to copy desc to mbuf on %s.\n",
2381 				dev->ifname);
2382 			allocerr_warned = true;
2383 		}
2384 		rte_pktmbuf_free(*pkts);
2385 		return -1;
2386 	}
2387 
2388 	return 0;
2389 }
2390 
2391 static __rte_always_inline int
2392 virtio_dev_tx_single_packed(struct virtio_net *dev,
2393 			    struct vhost_virtqueue *vq,
2394 			    struct rte_mempool *mbuf_pool,
2395 			    struct rte_mbuf **pkts)
2396 {
2397 
2398 	uint16_t buf_id, desc_count = 0;
2399 	int ret;
2400 
2401 	ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
2402 					&desc_count);
2403 
2404 	if (likely(desc_count > 0)) {
2405 		if (virtio_net_is_inorder(dev))
2406 			vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
2407 								   desc_count);
2408 		else
2409 			vhost_shadow_dequeue_single_packed(vq, buf_id,
2410 					desc_count);
2411 
2412 		vq_inc_last_avail_packed(vq, desc_count);
2413 	}
2414 
2415 	return ret;
2416 }
2417 
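/*
 * Dequeue (guest Tx) path for packed rings: try the batched fast path
 * first and fall back to single-packet dequeue, then flush the shadow
 * used ring and kick the guest if anything was dequeued.
 */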
2418 static __rte_noinline uint16_t
2419 virtio_dev_tx_packed(struct virtio_net *dev,
2420 		     struct vhost_virtqueue *__rte_restrict vq,
2421 		     struct rte_mempool *mbuf_pool,
2422 		     struct rte_mbuf **__rte_restrict pkts,
2423 		     uint32_t count)
2424 {
2425 	uint32_t pkt_idx = 0;
2426 	uint32_t remained = count;
2427 
2428 	do {
2429 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
2430 
2431 		if (remained >= PACKED_BATCH_SIZE) {
2432 			if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool,
2433 							&pkts[pkt_idx])) {
2434 				pkt_idx += PACKED_BATCH_SIZE;
2435 				remained -= PACKED_BATCH_SIZE;
2436 				continue;
2437 			}
2438 		}
2439 
2440 		if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
2441 						&pkts[pkt_idx]))
2442 			break;
2443 		pkt_idx++;
2444 		remained--;
2445 
2446 	} while (remained);
2447 
2448 	if (vq->shadow_used_idx) {
2449 		do_data_copy_dequeue(vq);
2450 
2451 		vhost_flush_dequeue_shadow_packed(dev, vq);
2452 		vhost_vring_call_packed(dev, vq);
2453 	}
2454 
2455 	return pkt_idx;
2456 }
2457 
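/*
 * Public dequeue entry point. A minimal polling-loop sketch (an
 * illustration only, not taken from this file; "process_or_free" stands
 * for the application's own handling of the mbufs it now owns):
 *
 *	struct rte_mbuf *bufs[32];
 *	uint16_t n = rte_vhost_dequeue_burst(vid, queue_id, mbuf_pool,
 *					     bufs, 32);
 *	for (uint16_t i = 0; i < n; i++)
 *		process_or_free(bufs[i]);
 */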
2458 uint16_t
2459 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
2460 	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
2461 {
2462 	struct virtio_net *dev;
2463 	struct rte_mbuf *rarp_mbuf = NULL;
2464 	struct vhost_virtqueue *vq;
2465 	int16_t success = 1;
2466 
2467 	dev = get_device(vid);
2468 	if (!dev)
2469 		return 0;
2470 
2471 	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2472 		VHOST_LOG_DATA(ERR,
2473 			"(%d) %s: built-in vhost net backend is disabled.\n",
2474 			dev->vid, __func__);
2475 		return 0;
2476 	}
2477 
2478 	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
2479 		VHOST_LOG_DATA(ERR,
2480 			"(%d) %s: invalid virtqueue idx %d.\n",
2481 			dev->vid, __func__, queue_id);
2482 		return 0;
2483 	}
2484 
2485 	vq = dev->virtqueue[queue_id];
2486 
2487 	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
2488 		return 0;
2489 
2490 	if (unlikely(vq->enabled == 0)) {
2491 		count = 0;
2492 		goto out_access_unlock;
2493 	}
2494 
2495 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2496 		vhost_user_iotlb_rd_lock(vq);
2497 
2498 	if (unlikely(vq->access_ok == 0))
2499 		if (unlikely(vring_translate(dev, vq) < 0)) {
2500 			count = 0;
2501 			goto out;
2502 		}
2503 
2504 	/*
2505 	 * Construct a RARP broadcast packet, and inject it into the "pkts"
2506 	 * array so it looks like the guest actually sent such a packet.
2507 	 *
2508 	 * Check user_send_rarp() for more information.
2509 	 *
2510 	 * broadcast_rarp shares a cacheline in the virtio_net structure
2511 	 * with some fields that are accessed during enqueue, and
2512 	 * __atomic_compare_exchange_n writes to that cacheline whenever the
2513 	 * compare and exchange is performed. This could result in false
2514 	 * sharing between enqueue and dequeue.
2515 	 *
2516 	 * Prevent unnecessary false sharing by reading broadcast_rarp first
2517 	 * and only performing compare and exchange if the read indicates it
2518 	 * is likely to be set.
2519 	 */
2520 	if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
2521 			__atomic_compare_exchange_n(&dev->broadcast_rarp,
2522 			&success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
2523 
2524 		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
2525 		if (rarp_mbuf == NULL) {
2526 			VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
2527 			count = 0;
2528 			goto out;
2529 		}
2530 		count -= 1;
2531 	}
2532 
2533 	if (vq_is_packed(dev))
2534 		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
2535 	else
2536 		count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
2537 
2538 out:
2539 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2540 		vhost_user_iotlb_rd_unlock(vq);
2541 
2542 out_access_unlock:
2543 	rte_spinlock_unlock(&vq->access_lock);
2544 
2545 	if (unlikely(rarp_mbuf != NULL)) {
2546 		/*
2547 		 * Inject it at the head of the "pkts" array, so that the
2548 		 * switch's MAC learning table gets updated first.
2549 		 */
2550 		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
2551 		pkts[0] = rarp_mbuf;
2552 		count += 1;
2553 	}
2554 
2555 	return count;
2556 }
2557