xref: /dpdk/lib/vhost/vhost_user.c (revision ae2c2cb6)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2018 Intel Corporation
3  */
4 
5 /* Security model
6  * --------------
7  * The vhost-user protocol connection is an external interface, so it must be
8  * robust against invalid inputs.
9  *
10  * This is important because the vhost-user master is only one step removed
11  * from the guest.  Malicious guests that have escaped will then launch further
12  * attacks from the vhost-user master.
13  *
14  * Even in deployments where guests are trusted, a bug in the vhost-user master
15  * can still cause invalid messages to be sent.  Such messages must not
16  * compromise the stability of the DPDK application by causing crashes, memory
17  * corruption, or other problematic behavior.
18  *
19  * Do not assume received VhostUserMsg fields contain sensible values!
20  */
21 
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
27 #include <fcntl.h>
28 #include <sys/ioctl.h>
29 #include <sys/mman.h>
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <sys/syscall.h>
33 #include <assert.h>
34 #ifdef RTE_LIBRTE_VHOST_NUMA
35 #include <numaif.h>
36 #endif
37 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
38 #include <linux/userfaultfd.h>
39 #endif
40 #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
41 #include <linux/memfd.h>
42 #define MEMFD_SUPPORTED
43 #endif
44 
45 #include <rte_common.h>
46 #include <rte_malloc.h>
47 #include <rte_log.h>
48 #include <rte_vfio.h>
49 #include <rte_errno.h>
50 
51 #include "iotlb.h"
52 #include "vhost.h"
53 #include "vhost_user.h"
54 
55 #define VIRTIO_MIN_MTU 68
56 #define VIRTIO_MAX_MTU 65535
57 
58 #define INFLIGHT_ALIGNMENT	64
59 #define INFLIGHT_VERSION	0x1
60 
61 static const char *vhost_message_str[VHOST_USER_MAX] = {
62 	[VHOST_USER_NONE] = "VHOST_USER_NONE",
63 	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
64 	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
65 	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
66 	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
67 	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
68 	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
69 	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
70 	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
71 	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
72 	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
73 	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
74 	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
75 	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
76 	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
77 	[VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
78 	[VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
79 	[VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
80 	[VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
81 	[VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
82 	[VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
83 	[VHOST_USER_SET_SLAVE_REQ_FD]  = "VHOST_USER_SET_SLAVE_REQ_FD",
84 	[VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
85 	[VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS",
86 	[VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS",
87 	[VHOST_USER_POSTCOPY_ADVISE]  = "VHOST_USER_POSTCOPY_ADVISE",
88 	[VHOST_USER_POSTCOPY_LISTEN]  = "VHOST_USER_POSTCOPY_LISTEN",
89 	[VHOST_USER_POSTCOPY_END]  = "VHOST_USER_POSTCOPY_END",
90 	[VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD",
91 	[VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD",
92 	[VHOST_USER_SET_STATUS] = "VHOST_USER_SET_STATUS",
93 	[VHOST_USER_GET_STATUS] = "VHOST_USER_GET_STATUS",
94 };
95 
/* Forward declarations of the message send/receive helpers. */
static int send_vhost_reply(struct virtio_net *dev, int sockfd,
		struct vhu_msg_context *ctx);
static int read_vhost_message(struct virtio_net *dev, int sockfd,
		struct vhu_msg_context *ctx);
98 
99 static void
100 close_msg_fds(struct vhu_msg_context *ctx)
101 {
102 	int i;
103 
104 	for (i = 0; i < ctx->fd_num; i++) {
105 		int fd = ctx->fds[i];
106 
107 		if (fd == -1)
108 			continue;
109 
110 		ctx->fds[i] = -1;
111 		close(fd);
112 	}
113 }
114 
115 /*
116  * Ensure the expected number of FDs is received,
117  * close all FDs and return an error if this is not the case.
118  */
119 static int
120 validate_msg_fds(struct virtio_net *dev, struct vhu_msg_context *ctx, int expected_fds)
121 {
122 	if (ctx->fd_num == expected_fds)
123 		return 0;
124 
125 	VHOST_LOG_CONFIG(ERR, "(%s) expect %d FDs for request %s, received %d\n",
126 		dev->ifname, expected_fds,
127 		vhost_message_str[ctx->msg.request.master],
128 		ctx->fd_num);
129 
130 	close_msg_fds(ctx);
131 
132 	return -1;
133 }
134 
/*
 * Return the st_blksize reported by fstat() for fd,
 * or (uint64_t)-1 when fstat() fails.
 */
static uint64_t
get_blk_size(int fd)
{
	struct stat st;

	if (fstat(fd, &st) == -1)
		return (uint64_t)-1;

	return (uint64_t)st.st_blksize;
}
144 
145 static int
146 async_dma_map(struct virtio_net *dev, struct rte_vhost_mem_region *region, bool do_map)
147 {
148 	uint64_t host_iova;
149 	int ret = 0;
150 
151 	host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
152 	if (do_map) {
153 		/* Add mapped region into the default container of DPDK. */
154 		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
155 						 region->host_user_addr,
156 						 host_iova,
157 						 region->size);
158 		if (ret) {
159 			/*
160 			 * DMA device may bind with kernel driver, in this case,
161 			 * we don't need to program IOMMU manually. However, if no
162 			 * device is bound with vfio/uio in DPDK, and vfio kernel
163 			 * module is loaded, the API will still be called and return
164 			 * with ENODEV/ENOSUP.
165 			 *
166 			 * DPDK vfio only returns ENODEV/ENOSUP in very similar
167 			 * situations(vfio either unsupported, or supported
168 			 * but no devices found). Either way, no mappings could be
169 			 * performed. We treat it as normal case in async path.
170 			 */
171 			if (rte_errno == ENODEV || rte_errno == ENOTSUP)
172 				return 0;
173 
174 			VHOST_LOG_CONFIG(ERR, "(%s) DMA engine map failed\n", dev->ifname);
175 			/* DMA mapping errors won't stop VHST_USER_SET_MEM_TABLE. */
176 			return 0;
177 		}
178 
179 	} else {
180 		/* Remove mapped region from the default container of DPDK. */
181 		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
182 						   region->host_user_addr,
183 						   host_iova,
184 						   region->size);
185 		if (ret) {
186 			/* like DMA map, ignore the kernel driver case when unmap. */
187 			if (rte_errno == EINVAL)
188 				return 0;
189 
190 			VHOST_LOG_CONFIG(ERR, "(%s) DMA engine unmap failed\n", dev->ifname);
191 			return ret;
192 		}
193 	}
194 
195 	return ret;
196 }
197 
198 static void
199 free_mem_region(struct virtio_net *dev)
200 {
201 	uint32_t i;
202 	struct rte_vhost_mem_region *reg;
203 
204 	if (!dev || !dev->mem)
205 		return;
206 
207 	for (i = 0; i < dev->mem->nregions; i++) {
208 		reg = &dev->mem->regions[i];
209 		if (reg->host_user_addr) {
210 			if (dev->async_copy && rte_vfio_is_enabled("vfio"))
211 				async_dma_map(dev, reg, false);
212 
213 			munmap(reg->mmap_addr, reg->mmap_size);
214 			close(reg->fd);
215 		}
216 	}
217 }
218 
219 void
220 vhost_backend_cleanup(struct virtio_net *dev)
221 {
222 	struct rte_vdpa_device *vdpa_dev;
223 
224 	vdpa_dev = dev->vdpa_dev;
225 	if (vdpa_dev && vdpa_dev->ops->dev_cleanup != NULL)
226 		vdpa_dev->ops->dev_cleanup(dev->vid);
227 
228 	if (dev->mem) {
229 		free_mem_region(dev);
230 		rte_free(dev->mem);
231 		dev->mem = NULL;
232 	}
233 
234 	rte_free(dev->guest_pages);
235 	dev->guest_pages = NULL;
236 
237 	if (dev->log_addr) {
238 		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
239 		dev->log_addr = 0;
240 	}
241 
242 	if (dev->inflight_info) {
243 		if (dev->inflight_info->addr) {
244 			munmap(dev->inflight_info->addr,
245 			       dev->inflight_info->size);
246 			dev->inflight_info->addr = NULL;
247 		}
248 
249 		if (dev->inflight_info->fd >= 0) {
250 			close(dev->inflight_info->fd);
251 			dev->inflight_info->fd = -1;
252 		}
253 
254 		rte_free(dev->inflight_info);
255 		dev->inflight_info = NULL;
256 	}
257 
258 	if (dev->slave_req_fd >= 0) {
259 		close(dev->slave_req_fd);
260 		dev->slave_req_fd = -1;
261 	}
262 
263 	if (dev->postcopy_ufd >= 0) {
264 		close(dev->postcopy_ufd);
265 		dev->postcopy_ufd = -1;
266 	}
267 
268 	dev->postcopy_listening = 0;
269 }
270 
271 static void
272 vhost_user_notify_queue_state(struct virtio_net *dev, uint16_t index,
273 			      int enable)
274 {
275 	struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
276 	struct vhost_virtqueue *vq = dev->virtqueue[index];
277 
278 	/* Configure guest notifications on enable */
279 	if (enable && vq->notif_enable != VIRTIO_UNINITIALIZED_NOTIF)
280 		vhost_enable_guest_notification(dev, vq, vq->notif_enable);
281 
282 	if (vdpa_dev && vdpa_dev->ops->set_vring_state)
283 		vdpa_dev->ops->set_vring_state(dev->vid, index, enable);
284 
285 	if (dev->notify_ops->vring_state_changed)
286 		dev->notify_ops->vring_state_changed(dev->vid,
287 				index, enable);
288 }
289 
290 /*
291  * This function just returns success at the moment unless
292  * the device hasn't been initialised.
293  */
294 static int
295 vhost_user_set_owner(struct virtio_net **pdev,
296 			struct vhu_msg_context *ctx,
297 			int main_fd __rte_unused)
298 {
299 	struct virtio_net *dev = *pdev;
300 
301 	if (validate_msg_fds(dev, ctx, 0) != 0)
302 		return RTE_VHOST_MSG_RESULT_ERR;
303 
304 	return RTE_VHOST_MSG_RESULT_OK;
305 }
306 
307 static int
308 vhost_user_reset_owner(struct virtio_net **pdev,
309 			struct vhu_msg_context *ctx,
310 			int main_fd __rte_unused)
311 {
312 	struct virtio_net *dev = *pdev;
313 
314 	if (validate_msg_fds(dev, ctx, 0) != 0)
315 		return RTE_VHOST_MSG_RESULT_ERR;
316 
317 	vhost_destroy_device_notify(dev);
318 
319 	cleanup_device(dev, 0);
320 	reset_device(dev);
321 	return RTE_VHOST_MSG_RESULT_OK;
322 }
323 
324 /*
325  * The features that we support are requested.
326  */
327 static int
328 vhost_user_get_features(struct virtio_net **pdev,
329 			struct vhu_msg_context *ctx,
330 			int main_fd __rte_unused)
331 {
332 	struct virtio_net *dev = *pdev;
333 	uint64_t features = 0;
334 
335 	if (validate_msg_fds(dev, ctx, 0) != 0)
336 		return RTE_VHOST_MSG_RESULT_ERR;
337 
338 	rte_vhost_driver_get_features(dev->ifname, &features);
339 
340 	ctx->msg.payload.u64 = features;
341 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
342 	ctx->fd_num = 0;
343 
344 	return RTE_VHOST_MSG_RESULT_REPLY;
345 }
346 
347 /*
348  * The queue number that we support are requested.
349  */
350 static int
351 vhost_user_get_queue_num(struct virtio_net **pdev,
352 			struct vhu_msg_context *ctx,
353 			int main_fd __rte_unused)
354 {
355 	struct virtio_net *dev = *pdev;
356 	uint32_t queue_num = 0;
357 
358 	if (validate_msg_fds(dev, ctx, 0) != 0)
359 		return RTE_VHOST_MSG_RESULT_ERR;
360 
361 	rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);
362 
363 	ctx->msg.payload.u64 = (uint64_t)queue_num;
364 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
365 	ctx->fd_num = 0;
366 
367 	return RTE_VHOST_MSG_RESULT_REPLY;
368 }
369 
370 /*
371  * We receive the negotiated features supported by us and the virtio device.
372  */
373 static int
374 vhost_user_set_features(struct virtio_net **pdev,
375 			struct vhu_msg_context *ctx,
376 			int main_fd __rte_unused)
377 {
378 	struct virtio_net *dev = *pdev;
379 	uint64_t features = ctx->msg.payload.u64;
380 	uint64_t vhost_features = 0;
381 	struct rte_vdpa_device *vdpa_dev;
382 
383 	if (validate_msg_fds(dev, ctx, 0) != 0)
384 		return RTE_VHOST_MSG_RESULT_ERR;
385 
386 	rte_vhost_driver_get_features(dev->ifname, &vhost_features);
387 	if (features & ~vhost_features) {
388 		VHOST_LOG_CONFIG(ERR, "(%s) received invalid negotiated features.\n",
389 			dev->ifname);
390 		dev->flags |= VIRTIO_DEV_FEATURES_FAILED;
391 		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;
392 
393 		return RTE_VHOST_MSG_RESULT_ERR;
394 	}
395 
396 	if (dev->flags & VIRTIO_DEV_RUNNING) {
397 		if (dev->features == features)
398 			return RTE_VHOST_MSG_RESULT_OK;
399 
400 		/*
401 		 * Error out if master tries to change features while device is
402 		 * in running state. The exception being VHOST_F_LOG_ALL, which
403 		 * is enabled when the live-migration starts.
404 		 */
405 		if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
406 			VHOST_LOG_CONFIG(ERR, "(%s) features changed while device is running.\n",
407 				dev->ifname);
408 			return RTE_VHOST_MSG_RESULT_ERR;
409 		}
410 
411 		if (dev->notify_ops->features_changed)
412 			dev->notify_ops->features_changed(dev->vid, features);
413 	}
414 
415 	dev->features = features;
416 	if (dev->features &
417 		((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
418 		 (1ULL << VIRTIO_F_VERSION_1) |
419 		 (1ULL << VIRTIO_F_RING_PACKED))) {
420 		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
421 	} else {
422 		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
423 	}
424 	VHOST_LOG_CONFIG(INFO, "(%s) negotiated Virtio features: 0x%" PRIx64 "\n",
425 			dev->ifname, dev->features);
426 	VHOST_LOG_CONFIG(DEBUG, "(%s) mergeable RX buffers %s, virtio 1 %s\n",
427 		dev->ifname,
428 		(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
429 		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
430 
431 	if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) &&
432 	    !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
433 		/*
434 		 * Remove all but first queue pair if MQ hasn't been
435 		 * negotiated. This is safe because the device is not
436 		 * running at this stage.
437 		 */
438 		while (dev->nr_vring > 2) {
439 			struct vhost_virtqueue *vq;
440 
441 			vq = dev->virtqueue[--dev->nr_vring];
442 			if (!vq)
443 				continue;
444 
445 			dev->virtqueue[dev->nr_vring] = NULL;
446 			cleanup_vq(vq, 1);
447 			cleanup_vq_inflight(dev, vq);
448 			free_vq(dev, vq);
449 		}
450 	}
451 
452 	vdpa_dev = dev->vdpa_dev;
453 	if (vdpa_dev)
454 		vdpa_dev->ops->set_features(dev->vid);
455 
456 	dev->flags &= ~VIRTIO_DEV_FEATURES_FAILED;
457 	return RTE_VHOST_MSG_RESULT_OK;
458 }
459 
460 /*
461  * The virtio device sends us the size of the descriptor ring.
462  */
463 static int
464 vhost_user_set_vring_num(struct virtio_net **pdev,
465 			struct vhu_msg_context *ctx,
466 			int main_fd __rte_unused)
467 {
468 	struct virtio_net *dev = *pdev;
469 	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
470 
471 	if (validate_msg_fds(dev, ctx, 0) != 0)
472 		return RTE_VHOST_MSG_RESULT_ERR;
473 
474 	if (ctx->msg.payload.state.num > 32768) {
475 		VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n",
476 				dev->ifname, ctx->msg.payload.state.num);
477 		return RTE_VHOST_MSG_RESULT_ERR;
478 	}
479 
480 	vq->size = ctx->msg.payload.state.num;
481 
482 	/* VIRTIO 1.0, 2.4 Virtqueues says:
483 	 *
484 	 *   Queue Size value is always a power of 2. The maximum Queue Size
485 	 *   value is 32768.
486 	 *
487 	 * VIRTIO 1.1 2.7 Virtqueues says:
488 	 *
489 	 *   Packed virtqueues support up to 2^15 entries each.
490 	 */
491 	if (!vq_is_packed(dev)) {
492 		if (vq->size & (vq->size - 1)) {
493 			VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n",
494 					dev->ifname, vq->size);
495 			return RTE_VHOST_MSG_RESULT_ERR;
496 		}
497 	}
498 
499 	if (vq_is_packed(dev)) {
500 		rte_free(vq->shadow_used_packed);
501 		vq->shadow_used_packed = rte_malloc_socket(NULL,
502 				vq->size *
503 				sizeof(struct vring_used_elem_packed),
504 				RTE_CACHE_LINE_SIZE, vq->numa_node);
505 		if (!vq->shadow_used_packed) {
506 			VHOST_LOG_CONFIG(ERR,
507 				"(%s) failed to allocate memory for shadow used ring.\n",
508 				dev->ifname);
509 			return RTE_VHOST_MSG_RESULT_ERR;
510 		}
511 
512 	} else {
513 		rte_free(vq->shadow_used_split);
514 
515 		vq->shadow_used_split = rte_malloc_socket(NULL,
516 				vq->size * sizeof(struct vring_used_elem),
517 				RTE_CACHE_LINE_SIZE, vq->numa_node);
518 
519 		if (!vq->shadow_used_split) {
520 			VHOST_LOG_CONFIG(ERR,
521 				"(%s) failed to allocate memory for vq internal data.\n",
522 				dev->ifname);
523 			return RTE_VHOST_MSG_RESULT_ERR;
524 		}
525 	}
526 
527 	rte_free(vq->batch_copy_elems);
528 	vq->batch_copy_elems = rte_malloc_socket(NULL,
529 				vq->size * sizeof(struct batch_copy_elem),
530 				RTE_CACHE_LINE_SIZE, vq->numa_node);
531 	if (!vq->batch_copy_elems) {
532 		VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate memory for batching copy.\n",
533 			dev->ifname);
534 		return RTE_VHOST_MSG_RESULT_ERR;
535 	}
536 
537 	return RTE_VHOST_MSG_RESULT_OK;
538 }
539 
540 /*
541  * Reallocate virtio_dev, vhost_virtqueue and related data structures to
542  * make them on the same numa node as the memory of vring descriptor.
543  */
544 #ifdef RTE_LIBRTE_VHOST_NUMA
545 static struct virtio_net*
546 numa_realloc(struct virtio_net *dev, int index)
547 {
548 	int node, dev_node;
549 	struct virtio_net *old_dev;
550 	struct vhost_virtqueue *vq;
551 	struct batch_copy_elem *bce;
552 	struct guest_page *gp;
553 	struct rte_vhost_memory *mem;
554 	size_t mem_size;
555 	int ret;
556 
557 	old_dev = dev;
558 	vq = dev->virtqueue[index];
559 
560 	/*
561 	 * If VQ is ready, it is too late to reallocate, it certainly already
562 	 * happened anyway on VHOST_USER_SET_VRING_ADRR.
563 	 */
564 	if (vq->ready)
565 		return dev;
566 
567 	ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR);
568 	if (ret) {
569 		VHOST_LOG_CONFIG(ERR, "(%s) unable to get virtqueue %d numa information.\n",
570 				dev->ifname, index);
571 		return dev;
572 	}
573 
574 	if (node == vq->numa_node)
575 		goto out_dev_realloc;
576 
577 	vq = rte_realloc_socket(vq, sizeof(*vq), 0, node);
578 	if (!vq) {
579 		VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc virtqueue %d on node %d\n",
580 				dev->ifname, index, node);
581 		return dev;
582 	}
583 
584 	if (vq != dev->virtqueue[index]) {
585 		VHOST_LOG_CONFIG(INFO, "(%s) reallocated virtqueue on node %d\n",
586 				dev->ifname, node);
587 		dev->virtqueue[index] = vq;
588 		vhost_user_iotlb_init(dev, index);
589 	}
590 
591 	if (vq_is_packed(dev)) {
592 		struct vring_used_elem_packed *sup;
593 
594 		sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * sizeof(*sup),
595 				RTE_CACHE_LINE_SIZE, node);
596 		if (!sup) {
597 			VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow packed on node %d\n",
598 					dev->ifname, node);
599 			return dev;
600 		}
601 		vq->shadow_used_packed = sup;
602 	} else {
603 		struct vring_used_elem *sus;
604 
605 		sus = rte_realloc_socket(vq->shadow_used_split, vq->size * sizeof(*sus),
606 				RTE_CACHE_LINE_SIZE, node);
607 		if (!sus) {
608 			VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow split on node %d\n",
609 					dev->ifname, node);
610 			return dev;
611 		}
612 		vq->shadow_used_split = sus;
613 	}
614 
615 	bce = rte_realloc_socket(vq->batch_copy_elems, vq->size * sizeof(*bce),
616 			RTE_CACHE_LINE_SIZE, node);
617 	if (!bce) {
618 		VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc batch copy elem on node %d\n",
619 				dev->ifname, node);
620 		return dev;
621 	}
622 	vq->batch_copy_elems = bce;
623 
624 	if (vq->log_cache) {
625 		struct log_cache_entry *lc;
626 
627 		lc = rte_realloc_socket(vq->log_cache, sizeof(*lc) * VHOST_LOG_CACHE_NR, 0, node);
628 		if (!lc) {
629 			VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc log cache on node %d\n",
630 					dev->ifname, node);
631 			return dev;
632 		}
633 		vq->log_cache = lc;
634 	}
635 
636 	if (vq->resubmit_inflight) {
637 		struct rte_vhost_resubmit_info *ri;
638 
639 		ri = rte_realloc_socket(vq->resubmit_inflight, sizeof(*ri), 0, node);
640 		if (!ri) {
641 			VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit inflight on node %d\n",
642 					dev->ifname, node);
643 			return dev;
644 		}
645 		vq->resubmit_inflight = ri;
646 
647 		if (ri->resubmit_list) {
648 			struct rte_vhost_resubmit_desc *rd;
649 
650 			rd = rte_realloc_socket(ri->resubmit_list, sizeof(*rd) * ri->resubmit_num,
651 					0, node);
652 			if (!rd) {
653 				VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit list on node %d\n",
654 						dev->ifname, node);
655 				return dev;
656 			}
657 			ri->resubmit_list = rd;
658 		}
659 	}
660 
661 	vq->numa_node = node;
662 
663 out_dev_realloc:
664 
665 	if (dev->flags & VIRTIO_DEV_RUNNING)
666 		return dev;
667 
668 	ret = get_mempolicy(&dev_node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR);
669 	if (ret) {
670 		VHOST_LOG_CONFIG(ERR, "(%s) unable to get numa information.\n", dev->ifname);
671 		return dev;
672 	}
673 
674 	if (dev_node == node)
675 		return dev;
676 
677 	dev = rte_realloc_socket(old_dev, sizeof(*dev), 0, node);
678 	if (!dev) {
679 		VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc dev on node %d\n",
680 				old_dev->ifname, node);
681 		return old_dev;
682 	}
683 
684 	VHOST_LOG_CONFIG(INFO, "(%s) reallocated device on node %d\n", dev->ifname, node);
685 	vhost_devices[dev->vid] = dev;
686 
687 	mem_size = sizeof(struct rte_vhost_memory) +
688 		sizeof(struct rte_vhost_mem_region) * dev->mem->nregions;
689 	mem = rte_realloc_socket(dev->mem, mem_size, 0, node);
690 	if (!mem) {
691 		VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc mem table on node %d\n",
692 				dev->ifname, node);
693 		return dev;
694 	}
695 	dev->mem = mem;
696 
697 	gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
698 			RTE_CACHE_LINE_SIZE, node);
699 	if (!gp) {
700 		VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc guest pages on node %d\n",
701 				dev->ifname, node);
702 		return dev;
703 	}
704 	dev->guest_pages = gp;
705 
706 	return dev;
707 }
708 #else
709 static struct virtio_net*
710 numa_realloc(struct virtio_net *dev, int index __rte_unused)
711 {
712 	return dev;
713 }
714 #endif
715 
716 /* Converts QEMU virtual address to Vhost virtual address. */
717 static uint64_t
718 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
719 {
720 	struct rte_vhost_mem_region *r;
721 	uint32_t i;
722 
723 	if (unlikely(!dev || !dev->mem))
724 		goto out_error;
725 
726 	/* Find the region where the address lives. */
727 	for (i = 0; i < dev->mem->nregions; i++) {
728 		r = &dev->mem->regions[i];
729 
730 		if (qva >= r->guest_user_addr &&
731 		    qva <  r->guest_user_addr + r->size) {
732 
733 			if (unlikely(*len > r->guest_user_addr + r->size - qva))
734 				*len = r->guest_user_addr + r->size - qva;
735 
736 			return qva - r->guest_user_addr +
737 			       r->host_user_addr;
738 		}
739 	}
740 out_error:
741 	*len = 0;
742 
743 	return 0;
744 }
745 
746 
747 /*
748  * Converts ring address to Vhost virtual address.
749  * If IOMMU is enabled, the ring address is a guest IO virtual address,
750  * else it is a QEMU virtual address.
751  */
752 static uint64_t
753 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
754 		uint64_t ra, uint64_t *size)
755 {
756 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
757 		uint64_t vva;
758 
759 		vhost_user_iotlb_rd_lock(vq);
760 		vva = vhost_iova_to_vva(dev, vq, ra,
761 					size, VHOST_ACCESS_RW);
762 		vhost_user_iotlb_rd_unlock(vq);
763 
764 		return vva;
765 	}
766 
767 	return qva_to_vva(dev, ra, size);
768 }
769 
770 static uint64_t
771 log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq)
772 {
773 	uint64_t log_gpa;
774 
775 	vhost_user_iotlb_rd_lock(vq);
776 	log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr);
777 	vhost_user_iotlb_rd_unlock(vq);
778 
779 	return log_gpa;
780 }
781 
782 static struct virtio_net *
783 translate_ring_addresses(struct virtio_net *dev, int vq_index)
784 {
785 	struct vhost_virtqueue *vq = dev->virtqueue[vq_index];
786 	struct vhost_vring_addr *addr = &vq->ring_addrs;
787 	uint64_t len, expected_len;
788 
789 	if (addr->flags & (1 << VHOST_VRING_F_LOG)) {
790 		vq->log_guest_addr =
791 			log_addr_to_gpa(dev, vq);
792 		if (vq->log_guest_addr == 0) {
793 			VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map log_guest_addr.\n",
794 				dev->ifname);
795 			return dev;
796 		}
797 	}
798 
799 	if (vq_is_packed(dev)) {
800 		len = sizeof(struct vring_packed_desc) * vq->size;
801 		vq->desc_packed = (struct vring_packed_desc *)(uintptr_t)
802 			ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len);
803 		if (vq->desc_packed == NULL ||
804 				len != sizeof(struct vring_packed_desc) *
805 				vq->size) {
806 			VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc_packed ring.\n",
807 				dev->ifname);
808 			return dev;
809 		}
810 
811 		dev = numa_realloc(dev, vq_index);
812 		vq = dev->virtqueue[vq_index];
813 		addr = &vq->ring_addrs;
814 
815 		len = sizeof(struct vring_packed_desc_event);
816 		vq->driver_event = (struct vring_packed_desc_event *)
817 					(uintptr_t)ring_addr_to_vva(dev,
818 					vq, addr->avail_user_addr, &len);
819 		if (vq->driver_event == NULL ||
820 				len != sizeof(struct vring_packed_desc_event)) {
821 			VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find driver area address.\n",
822 				dev->ifname);
823 			return dev;
824 		}
825 
826 		len = sizeof(struct vring_packed_desc_event);
827 		vq->device_event = (struct vring_packed_desc_event *)
828 					(uintptr_t)ring_addr_to_vva(dev,
829 					vq, addr->used_user_addr, &len);
830 		if (vq->device_event == NULL ||
831 				len != sizeof(struct vring_packed_desc_event)) {
832 			VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find device area address.\n",
833 				dev->ifname);
834 			return dev;
835 		}
836 
837 		vq->access_ok = true;
838 		return dev;
839 	}
840 
841 	/* The addresses are converted from QEMU virtual to Vhost virtual. */
842 	if (vq->desc && vq->avail && vq->used)
843 		return dev;
844 
845 	len = sizeof(struct vring_desc) * vq->size;
846 	vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
847 			vq, addr->desc_user_addr, &len);
848 	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
849 		VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc ring.\n", dev->ifname);
850 		return dev;
851 	}
852 
853 	dev = numa_realloc(dev, vq_index);
854 	vq = dev->virtqueue[vq_index];
855 	addr = &vq->ring_addrs;
856 
857 	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
858 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
859 		len += sizeof(uint16_t);
860 	expected_len = len;
861 	vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
862 			vq, addr->avail_user_addr, &len);
863 	if (vq->avail == 0 || len != expected_len) {
864 		VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map avail ring.\n", dev->ifname);
865 		return dev;
866 	}
867 
868 	len = sizeof(struct vring_used) +
869 		sizeof(struct vring_used_elem) * vq->size;
870 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
871 		len += sizeof(uint16_t);
872 	expected_len = len;
873 	vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
874 			vq, addr->used_user_addr, &len);
875 	if (vq->used == 0 || len != expected_len) {
876 		VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map used ring.\n", dev->ifname);
877 		return dev;
878 	}
879 
880 	if (vq->last_used_idx != vq->used->idx) {
881 		VHOST_LOG_CONFIG(WARNING, "(%s) last_used_idx (%u) and vq->used->idx (%u) mismatches;\n",
882 			dev->ifname,
883 			vq->last_used_idx, vq->used->idx);
884 		vq->last_used_idx  = vq->used->idx;
885 		vq->last_avail_idx = vq->used->idx;
886 		VHOST_LOG_CONFIG(WARNING, "(%s) some packets maybe resent for Tx and dropped for Rx\n",
887 			dev->ifname);
888 	}
889 
890 	vq->access_ok = true;
891 
892 	VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address desc: %p\n", dev->ifname, vq->desc);
893 	VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address avail: %p\n", dev->ifname, vq->avail);
894 	VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address used: %p\n", dev->ifname, vq->used);
895 	VHOST_LOG_CONFIG(DEBUG, "(%s) log_guest_addr: %" PRIx64 "\n",
896 			dev->ifname, vq->log_guest_addr);
897 
898 	return dev;
899 }
900 
901 /*
902  * The virtio device sends us the desc, used and avail ring addresses.
903  * This function then converts these to our address space.
904  */
905 static int
906 vhost_user_set_vring_addr(struct virtio_net **pdev,
907 			struct vhu_msg_context *ctx,
908 			int main_fd __rte_unused)
909 {
910 	struct virtio_net *dev = *pdev;
911 	struct vhost_virtqueue *vq;
912 	struct vhost_vring_addr *addr = &ctx->msg.payload.addr;
913 	bool access_ok;
914 
915 	if (validate_msg_fds(dev, ctx, 0) != 0)
916 		return RTE_VHOST_MSG_RESULT_ERR;
917 
918 	if (dev->mem == NULL)
919 		return RTE_VHOST_MSG_RESULT_ERR;
920 
921 	/* addr->index refers to the queue index. The txq 1, rxq is 0. */
922 	vq = dev->virtqueue[ctx->msg.payload.addr.index];
923 
924 	access_ok = vq->access_ok;
925 
926 	/*
927 	 * Rings addresses should not be interpreted as long as the ring is not
928 	 * started and enabled
929 	 */
930 	memcpy(&vq->ring_addrs, addr, sizeof(*addr));
931 
932 	vring_invalidate(dev, vq);
933 
934 	if ((vq->enabled && (dev->features &
935 				(1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) ||
936 			access_ok) {
937 		dev = translate_ring_addresses(dev, ctx->msg.payload.addr.index);
938 		if (!dev)
939 			return RTE_VHOST_MSG_RESULT_ERR;
940 
941 		*pdev = dev;
942 	}
943 
944 	return RTE_VHOST_MSG_RESULT_OK;
945 }
946 
947 /*
948  * The virtio device sends us the available ring last used index.
949  */
950 static int
951 vhost_user_set_vring_base(struct virtio_net **pdev,
952 			struct vhu_msg_context *ctx,
953 			int main_fd __rte_unused)
954 {
955 	struct virtio_net *dev = *pdev;
956 	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
957 	uint64_t val = ctx->msg.payload.state.num;
958 
959 	if (validate_msg_fds(dev, ctx, 0) != 0)
960 		return RTE_VHOST_MSG_RESULT_ERR;
961 
962 	if (vq_is_packed(dev)) {
963 		/*
964 		 * Bit[0:14]: avail index
965 		 * Bit[15]: avail wrap counter
966 		 */
967 		vq->last_avail_idx = val & 0x7fff;
968 		vq->avail_wrap_counter = !!(val & (0x1 << 15));
969 		/*
970 		 * Set used index to same value as available one, as
971 		 * their values should be the same since ring processing
972 		 * was stopped at get time.
973 		 */
974 		vq->last_used_idx = vq->last_avail_idx;
975 		vq->used_wrap_counter = vq->avail_wrap_counter;
976 	} else {
977 		vq->last_used_idx = ctx->msg.payload.state.num;
978 		vq->last_avail_idx = ctx->msg.payload.state.num;
979 	}
980 
981 	VHOST_LOG_CONFIG(INFO,
982 		"(%s) vring base idx:%u last_used_idx:%u last_avail_idx:%u.\n",
983 		dev->ifname, ctx->msg.payload.state.index, vq->last_used_idx,
984 		vq->last_avail_idx);
985 
986 	return RTE_VHOST_MSG_RESULT_OK;
987 }
988 
989 static int
990 add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
991 		   uint64_t host_phys_addr, uint64_t size)
992 {
993 	struct guest_page *page, *last_page;
994 	struct guest_page *old_pages;
995 
996 	if (dev->nr_guest_pages == dev->max_guest_pages) {
997 		dev->max_guest_pages *= 2;
998 		old_pages = dev->guest_pages;
999 		dev->guest_pages = rte_realloc(dev->guest_pages,
1000 					dev->max_guest_pages * sizeof(*page),
1001 					RTE_CACHE_LINE_SIZE);
1002 		if (dev->guest_pages == NULL) {
1003 			VHOST_LOG_CONFIG(ERR, "(%s) cannot realloc guest_pages\n", dev->ifname);
1004 			rte_free(old_pages);
1005 			return -1;
1006 		}
1007 	}
1008 
1009 	if (dev->nr_guest_pages > 0) {
1010 		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
1011 		/* merge if the two pages are continuous */
1012 		if (host_phys_addr == last_page->host_phys_addr +
1013 				      last_page->size) {
1014 			last_page->size += size;
1015 			return 0;
1016 		}
1017 	}
1018 
1019 	page = &dev->guest_pages[dev->nr_guest_pages++];
1020 	page->guest_phys_addr = guest_phys_addr;
1021 	page->host_phys_addr  = host_phys_addr;
1022 	page->size = size;
1023 
1024 	return 0;
1025 }
1026 
1027 static int
1028 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
1029 		uint64_t page_size)
1030 {
1031 	uint64_t reg_size = reg->size;
1032 	uint64_t host_user_addr  = reg->host_user_addr;
1033 	uint64_t guest_phys_addr = reg->guest_phys_addr;
1034 	uint64_t host_phys_addr;
1035 	uint64_t size;
1036 
1037 	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
1038 	size = page_size - (guest_phys_addr & (page_size - 1));
1039 	size = RTE_MIN(size, reg_size);
1040 
1041 	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
1042 		return -1;
1043 
1044 	host_user_addr  += size;
1045 	guest_phys_addr += size;
1046 	reg_size -= size;
1047 
1048 	while (reg_size > 0) {
1049 		size = RTE_MIN(reg_size, page_size);
1050 		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
1051 						  host_user_addr);
1052 		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
1053 				size) < 0)
1054 			return -1;
1055 
1056 		host_user_addr  += size;
1057 		guest_phys_addr += size;
1058 		reg_size -= size;
1059 	}
1060 
1061 	/* sort guest page array if over binary search threshold */
1062 	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
1063 		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
1064 			sizeof(struct guest_page), guest_page_addrcmp);
1065 	}
1066 
1067 	return 0;
1068 }
1069 
#ifdef RTE_LIBRTE_VHOST_DEBUG
/*
 * Log the guest physical address, host physical address and size of every
 * entry in dev->guest_pages. Only compiled when RTE_LIBRTE_VHOST_DEBUG is
 * defined; otherwise the call expands to nothing.
 */
static void
dump_guest_pages(struct virtio_net *dev)
{
	uint32_t idx = 0;

	while (idx < dev->nr_guest_pages) {
		const struct guest_page *entry = &dev->guest_pages[idx];

		VHOST_LOG_CONFIG(INFO, "(%s) guest physical page region %u\n",
				dev->ifname, idx);
		VHOST_LOG_CONFIG(INFO, "(%s)\tguest_phys_addr: %" PRIx64 "\n",
				dev->ifname, entry->guest_phys_addr);
		VHOST_LOG_CONFIG(INFO, "(%s)\thost_phys_addr : %" PRIx64 "\n",
				dev->ifname, entry->host_phys_addr);
		VHOST_LOG_CONFIG(INFO, "(%s)\tsize           : %" PRIx64 "\n",
				dev->ifname, entry->size);
		idx++;
	}
}
#else
#define dump_guest_pages(dev)
#endif
1094 
1095 static bool
1096 vhost_memory_changed(struct VhostUserMemory *new,
1097 		     struct rte_vhost_memory *old)
1098 {
1099 	uint32_t i;
1100 
1101 	if (new->nregions != old->nregions)
1102 		return true;
1103 
1104 	for (i = 0; i < new->nregions; ++i) {
1105 		VhostUserMemoryRegion *new_r = &new->regions[i];
1106 		struct rte_vhost_mem_region *old_r = &old->regions[i];
1107 
1108 		if (new_r->guest_phys_addr != old_r->guest_phys_addr)
1109 			return true;
1110 		if (new_r->memory_size != old_r->size)
1111 			return true;
1112 		if (new_r->userspace_addr != old_r->guest_user_addr)
1113 			return true;
1114 	}
1115 
1116 	return false;
1117 }
1118 
1119 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
1120 static int
1121 vhost_user_postcopy_region_register(struct virtio_net *dev,
1122 		struct rte_vhost_mem_region *reg)
1123 {
1124 	struct uffdio_register reg_struct;
1125 
1126 	/*
1127 	 * Let's register all the mmapped area to ensure
1128 	 * alignment on page boundary.
1129 	 */
1130 	reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr;
1131 	reg_struct.range.len = reg->mmap_size;
1132 	reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
1133 
1134 	if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
1135 				&reg_struct)) {
1136 		VHOST_LOG_CONFIG(ERR, "(%s) failed to register ufd for region "
1137 				"%" PRIx64 " - %" PRIx64 " (ufd = %d) %s\n",
1138 				dev->ifname,
1139 				(uint64_t)reg_struct.range.start,
1140 				(uint64_t)reg_struct.range.start +
1141 				(uint64_t)reg_struct.range.len - 1,
1142 				dev->postcopy_ufd,
1143 				strerror(errno));
1144 		return -1;
1145 	}
1146 
1147 	VHOST_LOG_CONFIG(INFO,
1148 			"(%s)\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64 "\n",
1149 			dev->ifname,
1150 			(uint64_t)reg_struct.range.start,
1151 			(uint64_t)reg_struct.range.start +
1152 			(uint64_t)reg_struct.range.len - 1);
1153 
1154 	return 0;
1155 }
1156 #else
1157 static int
1158 vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused,
1159 		struct rte_vhost_mem_region *reg __rte_unused)
1160 {
1161 	return -1;
1162 }
1163 #endif
1164 
1165 static int
1166 vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
1167 		struct vhu_msg_context *ctx)
1168 {
1169 	struct VhostUserMemory *memory;
1170 	struct rte_vhost_mem_region *reg;
1171 	struct vhu_msg_context ack_ctx;
1172 	uint32_t i;
1173 
1174 	if (!dev->postcopy_listening)
1175 		return 0;
1176 
1177 	/*
1178 	 * We haven't a better way right now than sharing
1179 	 * DPDK's virtual address with Qemu, so that Qemu can
1180 	 * retrieve the region offset when handling userfaults.
1181 	 */
1182 	memory = &ctx->msg.payload.memory;
1183 	for (i = 0; i < memory->nregions; i++) {
1184 		reg = &dev->mem->regions[i];
1185 		memory->regions[i].userspace_addr = reg->host_user_addr;
1186 	}
1187 
1188 	/* Send the addresses back to qemu */
1189 	ctx->fd_num = 0;
1190 	send_vhost_reply(dev, main_fd, ctx);
1191 
1192 	/* Wait for qemu to acknowledge it got the addresses
1193 	 * we've got to wait before we're allowed to generate faults.
1194 	 */
1195 	if (read_vhost_message(dev, main_fd, &ack_ctx) <= 0) {
1196 		VHOST_LOG_CONFIG(ERR, "(%s) failed to read qemu ack on postcopy set-mem-table\n",
1197 				dev->ifname);
1198 		return -1;
1199 	}
1200 
1201 	if (validate_msg_fds(dev, &ack_ctx, 0) != 0)
1202 		return -1;
1203 
1204 	if (ack_ctx.msg.request.master != VHOST_USER_SET_MEM_TABLE) {
1205 		VHOST_LOG_CONFIG(ERR, "(%s) bad qemu ack on postcopy set-mem-table (%d)\n",
1206 				dev->ifname, ack_ctx.msg.request.master);
1207 		return -1;
1208 	}
1209 
1210 	/* Now userfault register and we can use the memory */
1211 	for (i = 0; i < memory->nregions; i++) {
1212 		reg = &dev->mem->regions[i];
1213 		if (vhost_user_postcopy_region_register(dev, reg) < 0)
1214 			return -1;
1215 	}
1216 
1217 	return 0;
1218 }
1219 
1220 static int
1221 vhost_user_mmap_region(struct virtio_net *dev,
1222 		struct rte_vhost_mem_region *region,
1223 		uint64_t mmap_offset)
1224 {
1225 	void *mmap_addr;
1226 	uint64_t mmap_size;
1227 	uint64_t alignment;
1228 	int populate;
1229 	int ret;
1230 
1231 	/* Check for memory_size + mmap_offset overflow */
1232 	if (mmap_offset >= -region->size) {
1233 		VHOST_LOG_CONFIG(ERR, "(%s) mmap_offset (%#"PRIx64") and memory_size (%#"PRIx64") overflow\n",
1234 				dev->ifname, mmap_offset, region->size);
1235 		return -1;
1236 	}
1237 
1238 	mmap_size = region->size + mmap_offset;
1239 
1240 	/* mmap() without flag of MAP_ANONYMOUS, should be called with length
1241 	 * argument aligned with hugepagesz at older longterm version Linux,
1242 	 * like 2.6.32 and 3.2.72, or mmap() will fail with EINVAL.
1243 	 *
1244 	 * To avoid failure, make sure in caller to keep length aligned.
1245 	 */
1246 	alignment = get_blk_size(region->fd);
1247 	if (alignment == (uint64_t)-1) {
1248 		VHOST_LOG_CONFIG(ERR, "(%s) couldn't get hugepage size through fstat\n",
1249 				dev->ifname);
1250 		return -1;
1251 	}
1252 	mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
1253 	if (mmap_size == 0) {
1254 		/*
1255 		 * It could happen if initial mmap_size + alignment overflows
1256 		 * the sizeof uint64, which could happen if either mmap_size or
1257 		 * alignment value is wrong.
1258 		 *
1259 		 * mmap() kernel implementation would return an error, but
1260 		 * better catch it before and provide useful info in the logs.
1261 		 */
1262 		VHOST_LOG_CONFIG(ERR, "(%s) mmap size (0x%" PRIx64 ") or alignment (0x%" PRIx64 ") is invalid\n",
1263 				dev->ifname, region->size + mmap_offset, alignment);
1264 		return -1;
1265 	}
1266 
1267 	populate = dev->async_copy ? MAP_POPULATE : 0;
1268 	mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1269 			MAP_SHARED | populate, region->fd, 0);
1270 
1271 	if (mmap_addr == MAP_FAILED) {
1272 		VHOST_LOG_CONFIG(ERR, "(%s) mmap failed (%s).\n", dev->ifname, strerror(errno));
1273 		return -1;
1274 	}
1275 
1276 	region->mmap_addr = mmap_addr;
1277 	region->mmap_size = mmap_size;
1278 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
1279 
1280 	if (dev->async_copy) {
1281 		if (add_guest_pages(dev, region, alignment) < 0) {
1282 			VHOST_LOG_CONFIG(ERR, "(%s) adding guest pages to region failed.\n",
1283 					dev->ifname);
1284 			return -1;
1285 		}
1286 
1287 		if (rte_vfio_is_enabled("vfio")) {
1288 			ret = async_dma_map(dev, region, true);
1289 			if (ret) {
1290 				VHOST_LOG_CONFIG(ERR,
1291 					"(%s) configure IOMMU for DMA engine failed\n",
1292 					dev->ifname);
1293 				return -1;
1294 			}
1295 		}
1296 	}
1297 
1298 	VHOST_LOG_CONFIG(INFO, "(%s) guest memory region size: 0x%" PRIx64 "\n",
1299 			dev->ifname, region->size);
1300 	VHOST_LOG_CONFIG(INFO, "(%s)\t guest physical addr: 0x%" PRIx64 "\n",
1301 			dev->ifname, region->guest_phys_addr);
1302 	VHOST_LOG_CONFIG(INFO, "(%s)\t guest virtual  addr: 0x%" PRIx64 "\n",
1303 			dev->ifname, region->guest_user_addr);
1304 	VHOST_LOG_CONFIG(INFO, "(%s)\t host  virtual  addr: 0x%" PRIx64 "\n",
1305 			dev->ifname, region->host_user_addr);
1306 	VHOST_LOG_CONFIG(INFO, "(%s)\t mmap addr : 0x%" PRIx64 "\n",
1307 			dev->ifname, (uint64_t)(uintptr_t)mmap_addr);
1308 	VHOST_LOG_CONFIG(INFO, "(%s)\t mmap size : 0x%" PRIx64 "\n",
1309 			dev->ifname, mmap_size);
1310 	VHOST_LOG_CONFIG(INFO, "(%s)\t mmap align: 0x%" PRIx64 "\n",
1311 			dev->ifname, alignment);
1312 	VHOST_LOG_CONFIG(INFO, "(%s)\t mmap off  : 0x%" PRIx64 "\n",
1313 			dev->ifname, mmap_offset);
1314 
1315 	return 0;
1316 }
1317 
1318 static int
1319 vhost_user_set_mem_table(struct virtio_net **pdev,
1320 			struct vhu_msg_context *ctx,
1321 			int main_fd)
1322 {
1323 	struct virtio_net *dev = *pdev;
1324 	struct VhostUserMemory *memory = &ctx->msg.payload.memory;
1325 	struct rte_vhost_mem_region *reg;
1326 	int numa_node = SOCKET_ID_ANY;
1327 	uint64_t mmap_offset;
1328 	uint32_t i;
1329 	bool async_notify = false;
1330 
1331 	if (validate_msg_fds(dev, ctx, memory->nregions) != 0)
1332 		return RTE_VHOST_MSG_RESULT_ERR;
1333 
1334 	if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
1335 		VHOST_LOG_CONFIG(ERR, "(%s) too many memory regions (%u)\n",
1336 				dev->ifname, memory->nregions);
1337 		goto close_msg_fds;
1338 	}
1339 
1340 	if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
1341 		VHOST_LOG_CONFIG(INFO, "(%s) memory regions not changed\n", dev->ifname);
1342 
1343 		close_msg_fds(ctx);
1344 
1345 		return RTE_VHOST_MSG_RESULT_OK;
1346 	}
1347 
1348 	if (dev->mem) {
1349 		if (dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) {
1350 			struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
1351 
1352 			if (vdpa_dev && vdpa_dev->ops->dev_close)
1353 				vdpa_dev->ops->dev_close(dev->vid);
1354 			dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
1355 		}
1356 
1357 		/* notify the vhost application to stop DMA transfers */
1358 		if (dev->async_copy && dev->notify_ops->vring_state_changed) {
1359 			for (i = 0; i < dev->nr_vring; i++) {
1360 				dev->notify_ops->vring_state_changed(dev->vid,
1361 						i, 0);
1362 			}
1363 			async_notify = true;
1364 		}
1365 
1366 		free_mem_region(dev);
1367 		rte_free(dev->mem);
1368 		dev->mem = NULL;
1369 	}
1370 
1371 	/* Flush IOTLB cache as previous HVAs are now invalid */
1372 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1373 		for (i = 0; i < dev->nr_vring; i++)
1374 			vhost_user_iotlb_flush_all(dev->virtqueue[i]);
1375 
1376 	/*
1377 	 * If VQ 0 has already been allocated, try to allocate on the same
1378 	 * NUMA node. It can be reallocated later in numa_realloc().
1379 	 */
1380 	if (dev->nr_vring > 0)
1381 		numa_node = dev->virtqueue[0]->numa_node;
1382 
1383 	dev->nr_guest_pages = 0;
1384 	if (dev->guest_pages == NULL) {
1385 		dev->max_guest_pages = 8;
1386 		dev->guest_pages = rte_zmalloc_socket(NULL,
1387 					dev->max_guest_pages *
1388 					sizeof(struct guest_page),
1389 					RTE_CACHE_LINE_SIZE,
1390 					numa_node);
1391 		if (dev->guest_pages == NULL) {
1392 			VHOST_LOG_CONFIG(ERR,
1393 				"(%s) failed to allocate memory for dev->guest_pages\n",
1394 				dev->ifname);
1395 			goto close_msg_fds;
1396 		}
1397 	}
1398 
1399 	dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) +
1400 		sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node);
1401 	if (dev->mem == NULL) {
1402 		VHOST_LOG_CONFIG(ERR,
1403 			"(%s) failed to allocate memory for dev->mem\n",
1404 			dev->ifname);
1405 		goto free_guest_pages;
1406 	}
1407 
1408 	for (i = 0; i < memory->nregions; i++) {
1409 		reg = &dev->mem->regions[i];
1410 
1411 		reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
1412 		reg->guest_user_addr = memory->regions[i].userspace_addr;
1413 		reg->size            = memory->regions[i].memory_size;
1414 		reg->fd              = ctx->fds[i];
1415 
1416 		/*
1417 		 * Assign invalid file descriptor value to avoid double
1418 		 * closing on error path.
1419 		 */
1420 		ctx->fds[i] = -1;
1421 
1422 		mmap_offset = memory->regions[i].mmap_offset;
1423 
1424 		if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
1425 			VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap region %u\n", dev->ifname, i);
1426 			goto free_mem_table;
1427 		}
1428 
1429 		dev->mem->nregions++;
1430 	}
1431 
1432 	if (vhost_user_postcopy_register(dev, main_fd, ctx) < 0)
1433 		goto free_mem_table;
1434 
1435 	for (i = 0; i < dev->nr_vring; i++) {
1436 		struct vhost_virtqueue *vq = dev->virtqueue[i];
1437 
1438 		if (!vq)
1439 			continue;
1440 
1441 		if (vq->desc || vq->avail || vq->used) {
1442 			/*
1443 			 * If the memory table got updated, the ring addresses
1444 			 * need to be translated again as virtual addresses have
1445 			 * changed.
1446 			 */
1447 			vring_invalidate(dev, vq);
1448 
1449 			dev = translate_ring_addresses(dev, i);
1450 			if (!dev) {
1451 				dev = *pdev;
1452 				goto free_mem_table;
1453 			}
1454 
1455 			*pdev = dev;
1456 		}
1457 	}
1458 
1459 	dump_guest_pages(dev);
1460 
1461 	if (async_notify) {
1462 		for (i = 0; i < dev->nr_vring; i++)
1463 			dev->notify_ops->vring_state_changed(dev->vid, i, 1);
1464 	}
1465 
1466 	return RTE_VHOST_MSG_RESULT_OK;
1467 
1468 free_mem_table:
1469 	free_mem_region(dev);
1470 	rte_free(dev->mem);
1471 	dev->mem = NULL;
1472 
1473 free_guest_pages:
1474 	rte_free(dev->guest_pages);
1475 	dev->guest_pages = NULL;
1476 close_msg_fds:
1477 	close_msg_fds(ctx);
1478 	return RTE_VHOST_MSG_RESULT_ERR;
1479 }
1480 
1481 static bool
1482 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
1483 {
1484 	bool rings_ok;
1485 
1486 	if (!vq)
1487 		return false;
1488 
1489 	if (vq_is_packed(dev))
1490 		rings_ok = vq->desc_packed && vq->driver_event &&
1491 			vq->device_event;
1492 	else
1493 		rings_ok = vq->desc && vq->avail && vq->used;
1494 
1495 	return rings_ok &&
1496 	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
1497 	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
1498 	       vq->enabled;
1499 }
1500 
1501 #define VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY 2u
1502 
1503 static int
1504 virtio_is_ready(struct virtio_net *dev)
1505 {
1506 	struct vhost_virtqueue *vq;
1507 	uint32_t i, nr_vring = dev->nr_vring;
1508 
1509 	if (dev->flags & VIRTIO_DEV_READY)
1510 		return 1;
1511 
1512 	if (!dev->nr_vring)
1513 		return 0;
1514 
1515 	if (dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) {
1516 		nr_vring = VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY;
1517 
1518 		if (dev->nr_vring < nr_vring)
1519 			return 0;
1520 	}
1521 
1522 	for (i = 0; i < nr_vring; i++) {
1523 		vq = dev->virtqueue[i];
1524 
1525 		if (!vq_is_ready(dev, vq))
1526 			return 0;
1527 	}
1528 
1529 	/* If supported, ensure the frontend is really done with config */
1530 	if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_STATUS))
1531 		if (!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK))
1532 			return 0;
1533 
1534 	dev->flags |= VIRTIO_DEV_READY;
1535 
1536 	if (!(dev->flags & VIRTIO_DEV_RUNNING))
1537 		VHOST_LOG_CONFIG(INFO, "(%s) virtio is now ready for processing.\n", dev->ifname);
1538 	return 1;
1539 }
1540 
1541 static void *
1542 inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *fd)
1543 {
1544 	void *ptr;
1545 	int mfd = -1;
1546 	char fname[20] = "/tmp/memfd-XXXXXX";
1547 
1548 	*fd = -1;
1549 #ifdef MEMFD_SUPPORTED
1550 	mfd = memfd_create(name, MFD_CLOEXEC);
1551 #else
1552 	RTE_SET_USED(name);
1553 #endif
1554 	if (mfd == -1) {
1555 		mfd = mkstemp(fname);
1556 		if (mfd == -1) {
1557 			VHOST_LOG_CONFIG(ERR, "(%s) failed to get inflight buffer fd\n",
1558 					dev->ifname);
1559 			return NULL;
1560 		}
1561 
1562 		unlink(fname);
1563 	}
1564 
1565 	if (ftruncate(mfd, size) == -1) {
1566 		VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc inflight buffer\n", dev->ifname);
1567 		close(mfd);
1568 		return NULL;
1569 	}
1570 
1571 	ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
1572 	if (ptr == MAP_FAILED) {
1573 		VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap inflight buffer\n", dev->ifname);
1574 		close(mfd);
1575 		return NULL;
1576 	}
1577 
1578 	*fd = mfd;
1579 	return ptr;
1580 }
1581 
1582 static uint32_t
1583 get_pervq_shm_size_split(uint16_t queue_size)
1584 {
1585 	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_split) *
1586 				  queue_size + sizeof(uint64_t) +
1587 				  sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT);
1588 }
1589 
1590 static uint32_t
1591 get_pervq_shm_size_packed(uint16_t queue_size)
1592 {
1593 	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_packed)
1594 				  * queue_size + sizeof(uint64_t) +
1595 				  sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9,
1596 				  INFLIGHT_ALIGNMENT);
1597 }
1598 
1599 static int
1600 vhost_user_get_inflight_fd(struct virtio_net **pdev,
1601 			   struct vhu_msg_context *ctx,
1602 			   int main_fd __rte_unused)
1603 {
1604 	struct rte_vhost_inflight_info_packed *inflight_packed;
1605 	uint64_t pervq_inflight_size, mmap_size;
1606 	uint16_t num_queues, queue_size;
1607 	struct virtio_net *dev = *pdev;
1608 	int fd, i, j;
1609 	int numa_node = SOCKET_ID_ANY;
1610 	void *addr;
1611 
1612 	if (ctx->msg.size != sizeof(ctx->msg.payload.inflight)) {
1613 		VHOST_LOG_CONFIG(ERR, "(%s) invalid get_inflight_fd message size is %d\n",
1614 			dev->ifname, ctx->msg.size);
1615 		return RTE_VHOST_MSG_RESULT_ERR;
1616 	}
1617 
1618 	/*
1619 	 * If VQ 0 has already been allocated, try to allocate on the same
1620 	 * NUMA node. It can be reallocated later in numa_realloc().
1621 	 */
1622 	if (dev->nr_vring > 0)
1623 		numa_node = dev->virtqueue[0]->numa_node;
1624 
1625 	if (dev->inflight_info == NULL) {
1626 		dev->inflight_info = rte_zmalloc_socket("inflight_info",
1627 				sizeof(struct inflight_mem_info), 0, numa_node);
1628 		if (!dev->inflight_info) {
1629 			VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n",
1630 					dev->ifname);
1631 			return RTE_VHOST_MSG_RESULT_ERR;
1632 		}
1633 		dev->inflight_info->fd = -1;
1634 	}
1635 
1636 	num_queues = ctx->msg.payload.inflight.num_queues;
1637 	queue_size = ctx->msg.payload.inflight.queue_size;
1638 
1639 	VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd num_queues: %u\n",
1640 		dev->ifname, ctx->msg.payload.inflight.num_queues);
1641 	VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd queue_size: %u\n",
1642 		dev->ifname, ctx->msg.payload.inflight.queue_size);
1643 
1644 	if (vq_is_packed(dev))
1645 		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
1646 	else
1647 		pervq_inflight_size = get_pervq_shm_size_split(queue_size);
1648 
1649 	mmap_size = num_queues * pervq_inflight_size;
1650 	addr = inflight_mem_alloc(dev, "vhost-inflight", mmap_size, &fd);
1651 	if (!addr) {
1652 		VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc vhost inflight area\n", dev->ifname);
1653 			ctx->msg.payload.inflight.mmap_size = 0;
1654 		return RTE_VHOST_MSG_RESULT_ERR;
1655 	}
1656 	memset(addr, 0, mmap_size);
1657 
1658 	if (dev->inflight_info->addr) {
1659 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
1660 		dev->inflight_info->addr = NULL;
1661 	}
1662 
1663 	if (dev->inflight_info->fd >= 0) {
1664 		close(dev->inflight_info->fd);
1665 		dev->inflight_info->fd = -1;
1666 	}
1667 
1668 	dev->inflight_info->addr = addr;
1669 	dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size;
1670 	dev->inflight_info->fd = ctx->fds[0] = fd;
1671 	ctx->msg.payload.inflight.mmap_offset = 0;
1672 	ctx->fd_num = 1;
1673 
1674 	if (vq_is_packed(dev)) {
1675 		for (i = 0; i < num_queues; i++) {
1676 			inflight_packed =
1677 				(struct rte_vhost_inflight_info_packed *)addr;
1678 			inflight_packed->used_wrap_counter = 1;
1679 			inflight_packed->old_used_wrap_counter = 1;
1680 			for (j = 0; j < queue_size; j++)
1681 				inflight_packed->desc[j].next = j + 1;
1682 			addr = (void *)((char *)addr + pervq_inflight_size);
1683 		}
1684 	}
1685 
1686 	VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_size: %"PRIu64"\n",
1687 			dev->ifname, ctx->msg.payload.inflight.mmap_size);
1688 	VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_offset: %"PRIu64"\n",
1689 			dev->ifname, ctx->msg.payload.inflight.mmap_offset);
1690 	VHOST_LOG_CONFIG(INFO, "(%s) send inflight fd: %d\n", dev->ifname, ctx->fds[0]);
1691 
1692 	return RTE_VHOST_MSG_RESULT_REPLY;
1693 }
1694 
1695 static int
1696 vhost_user_set_inflight_fd(struct virtio_net **pdev,
1697 			   struct vhu_msg_context *ctx,
1698 			   int main_fd __rte_unused)
1699 {
1700 	uint64_t mmap_size, mmap_offset;
1701 	uint16_t num_queues, queue_size;
1702 	struct virtio_net *dev = *pdev;
1703 	uint32_t pervq_inflight_size;
1704 	struct vhost_virtqueue *vq;
1705 	void *addr;
1706 	int fd, i;
1707 	int numa_node = SOCKET_ID_ANY;
1708 
1709 	fd = ctx->fds[0];
1710 	if (ctx->msg.size != sizeof(ctx->msg.payload.inflight) || fd < 0) {
1711 		VHOST_LOG_CONFIG(ERR, "(%s) invalid set_inflight_fd message size is %d,fd is %d\n",
1712 			dev->ifname, ctx->msg.size, fd);
1713 		return RTE_VHOST_MSG_RESULT_ERR;
1714 	}
1715 
1716 	mmap_size = ctx->msg.payload.inflight.mmap_size;
1717 	mmap_offset = ctx->msg.payload.inflight.mmap_offset;
1718 	num_queues = ctx->msg.payload.inflight.num_queues;
1719 	queue_size = ctx->msg.payload.inflight.queue_size;
1720 
1721 	if (vq_is_packed(dev))
1722 		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
1723 	else
1724 		pervq_inflight_size = get_pervq_shm_size_split(queue_size);
1725 
1726 	VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_size: %"PRIu64"\n",
1727 			dev->ifname, mmap_size);
1728 	VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_offset: %"PRIu64"\n",
1729 			dev->ifname, mmap_offset);
1730 	VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd num_queues: %u\n", dev->ifname, num_queues);
1731 	VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd queue_size: %u\n", dev->ifname, queue_size);
1732 	VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd fd: %d\n", dev->ifname, fd);
1733 	VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd pervq_inflight_size: %d\n",
1734 			dev->ifname, pervq_inflight_size);
1735 
1736 	/*
1737 	 * If VQ 0 has already been allocated, try to allocate on the same
1738 	 * NUMA node. It can be reallocated later in numa_realloc().
1739 	 */
1740 	if (dev->nr_vring > 0)
1741 		numa_node = dev->virtqueue[0]->numa_node;
1742 
1743 	if (!dev->inflight_info) {
1744 		dev->inflight_info = rte_zmalloc_socket("inflight_info",
1745 				sizeof(struct inflight_mem_info), 0, numa_node);
1746 		if (dev->inflight_info == NULL) {
1747 			VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n",
1748 					dev->ifname);
1749 			return RTE_VHOST_MSG_RESULT_ERR;
1750 		}
1751 		dev->inflight_info->fd = -1;
1752 	}
1753 
1754 	if (dev->inflight_info->addr) {
1755 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
1756 		dev->inflight_info->addr = NULL;
1757 	}
1758 
1759 	addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
1760 		    fd, mmap_offset);
1761 	if (addr == MAP_FAILED) {
1762 		VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap share memory.\n", dev->ifname);
1763 		return RTE_VHOST_MSG_RESULT_ERR;
1764 	}
1765 
1766 	if (dev->inflight_info->fd >= 0) {
1767 		close(dev->inflight_info->fd);
1768 		dev->inflight_info->fd = -1;
1769 	}
1770 
1771 	dev->inflight_info->fd = fd;
1772 	dev->inflight_info->addr = addr;
1773 	dev->inflight_info->size = mmap_size;
1774 
1775 	for (i = 0; i < num_queues; i++) {
1776 		vq = dev->virtqueue[i];
1777 		if (!vq)
1778 			continue;
1779 
1780 		if (vq_is_packed(dev)) {
1781 			vq->inflight_packed = addr;
1782 			vq->inflight_packed->desc_num = queue_size;
1783 		} else {
1784 			vq->inflight_split = addr;
1785 			vq->inflight_split->desc_num = queue_size;
1786 		}
1787 		addr = (void *)((char *)addr + pervq_inflight_size);
1788 	}
1789 
1790 	return RTE_VHOST_MSG_RESULT_OK;
1791 }
1792 
1793 static int
1794 vhost_user_set_vring_call(struct virtio_net **pdev,
1795 			struct vhu_msg_context *ctx,
1796 			int main_fd __rte_unused)
1797 {
1798 	struct virtio_net *dev = *pdev;
1799 	struct vhost_vring_file file;
1800 	struct vhost_virtqueue *vq;
1801 	int expected_fds;
1802 
1803 	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
1804 	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
1805 		return RTE_VHOST_MSG_RESULT_ERR;
1806 
1807 	file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
1808 	if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)
1809 		file.fd = VIRTIO_INVALID_EVENTFD;
1810 	else
1811 		file.fd = ctx->fds[0];
1812 	VHOST_LOG_CONFIG(INFO, "(%s) vring call idx:%d file:%d\n",
1813 			dev->ifname, file.index, file.fd);
1814 
1815 	vq = dev->virtqueue[file.index];
1816 
1817 	if (vq->ready) {
1818 		vq->ready = false;
1819 		vhost_user_notify_queue_state(dev, file.index, 0);
1820 	}
1821 
1822 	if (vq->callfd >= 0)
1823 		close(vq->callfd);
1824 
1825 	vq->callfd = file.fd;
1826 
1827 	return RTE_VHOST_MSG_RESULT_OK;
1828 }
1829 
1830 static int vhost_user_set_vring_err(struct virtio_net **pdev,
1831 			struct vhu_msg_context *ctx,
1832 			int main_fd __rte_unused)
1833 {
1834 	struct virtio_net *dev = *pdev;
1835 	int expected_fds;
1836 
1837 	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
1838 	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
1839 		return RTE_VHOST_MSG_RESULT_ERR;
1840 
1841 	if (!(ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
1842 		close(ctx->fds[0]);
1843 	VHOST_LOG_CONFIG(INFO, "(%s) not implemented\n", dev->ifname);
1844 
1845 	return RTE_VHOST_MSG_RESULT_OK;
1846 }
1847 
1848 static int
1849 resubmit_desc_compare(const void *a, const void *b)
1850 {
1851 	const struct rte_vhost_resubmit_desc *desc0 = a;
1852 	const struct rte_vhost_resubmit_desc *desc1 = b;
1853 
1854 	if (desc1->counter > desc0->counter)
1855 		return 1;
1856 
1857 	return -1;
1858 }
1859 
1860 static int
1861 vhost_check_queue_inflights_split(struct virtio_net *dev,
1862 				  struct vhost_virtqueue *vq)
1863 {
1864 	uint16_t i;
1865 	uint16_t resubmit_num = 0, last_io, num;
1866 	struct vring_used *used = vq->used;
1867 	struct rte_vhost_resubmit_info *resubmit;
1868 	struct rte_vhost_inflight_info_split *inflight_split;
1869 
1870 	if (!(dev->protocol_features &
1871 	    (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
1872 		return RTE_VHOST_MSG_RESULT_OK;
1873 
1874 	/* The frontend may still not support the inflight feature
1875 	 * although we negotiate the protocol feature.
1876 	 */
1877 	if ((!vq->inflight_split))
1878 		return RTE_VHOST_MSG_RESULT_OK;
1879 
1880 	if (!vq->inflight_split->version) {
1881 		vq->inflight_split->version = INFLIGHT_VERSION;
1882 		return RTE_VHOST_MSG_RESULT_OK;
1883 	}
1884 
1885 	if (vq->resubmit_inflight)
1886 		return RTE_VHOST_MSG_RESULT_OK;
1887 
1888 	inflight_split = vq->inflight_split;
1889 	vq->global_counter = 0;
1890 	last_io = inflight_split->last_inflight_io;
1891 
1892 	if (inflight_split->used_idx != used->idx) {
1893 		inflight_split->desc[last_io].inflight = 0;
1894 		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
1895 		inflight_split->used_idx = used->idx;
1896 	}
1897 
1898 	for (i = 0; i < inflight_split->desc_num; i++) {
1899 		if (inflight_split->desc[i].inflight == 1)
1900 			resubmit_num++;
1901 	}
1902 
1903 	vq->last_avail_idx += resubmit_num;
1904 
1905 	if (resubmit_num) {
1906 		resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info),
1907 				0, vq->numa_node);
1908 		if (!resubmit) {
1909 			VHOST_LOG_CONFIG(ERR,
1910 					"(%s) failed to allocate memory for resubmit info.\n",
1911 					dev->ifname);
1912 			return RTE_VHOST_MSG_RESULT_ERR;
1913 		}
1914 
1915 		resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
1916 				resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
1917 				0, vq->numa_node);
1918 		if (!resubmit->resubmit_list) {
1919 			VHOST_LOG_CONFIG(ERR,
1920 					"(%s) failed to allocate memory for inflight desc.\n",
1921 					dev->ifname);
1922 			rte_free(resubmit);
1923 			return RTE_VHOST_MSG_RESULT_ERR;
1924 		}
1925 
1926 		num = 0;
1927 		for (i = 0; i < vq->inflight_split->desc_num; i++) {
1928 			if (vq->inflight_split->desc[i].inflight == 1) {
1929 				resubmit->resubmit_list[num].index = i;
1930 				resubmit->resubmit_list[num].counter =
1931 					inflight_split->desc[i].counter;
1932 				num++;
1933 			}
1934 		}
1935 		resubmit->resubmit_num = num;
1936 
1937 		if (resubmit->resubmit_num > 1)
1938 			qsort(resubmit->resubmit_list, resubmit->resubmit_num,
1939 			      sizeof(struct rte_vhost_resubmit_desc),
1940 			      resubmit_desc_compare);
1941 
1942 		vq->global_counter = resubmit->resubmit_list[0].counter + 1;
1943 		vq->resubmit_inflight = resubmit;
1944 	}
1945 
1946 	return RTE_VHOST_MSG_RESULT_OK;
1947 }
1948 
1949 static int
1950 vhost_check_queue_inflights_packed(struct virtio_net *dev,
1951 				   struct vhost_virtqueue *vq)
1952 {
1953 	uint16_t i;
1954 	uint16_t resubmit_num = 0, old_used_idx, num;
1955 	struct rte_vhost_resubmit_info *resubmit;
1956 	struct rte_vhost_inflight_info_packed *inflight_packed;
1957 
1958 	if (!(dev->protocol_features &
1959 	    (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
1960 		return RTE_VHOST_MSG_RESULT_OK;
1961 
1962 	/* The frontend may still not support the inflight feature
1963 	 * although we negotiate the protocol feature.
1964 	 */
1965 	if ((!vq->inflight_packed))
1966 		return RTE_VHOST_MSG_RESULT_OK;
1967 
1968 	if (!vq->inflight_packed->version) {
1969 		vq->inflight_packed->version = INFLIGHT_VERSION;
1970 		return RTE_VHOST_MSG_RESULT_OK;
1971 	}
1972 
1973 	if (vq->resubmit_inflight)
1974 		return RTE_VHOST_MSG_RESULT_OK;
1975 
1976 	inflight_packed = vq->inflight_packed;
1977 	vq->global_counter = 0;
1978 	old_used_idx = inflight_packed->old_used_idx;
1979 
1980 	if (inflight_packed->used_idx != old_used_idx) {
1981 		if (inflight_packed->desc[old_used_idx].inflight == 0) {
1982 			inflight_packed->old_used_idx =
1983 				inflight_packed->used_idx;
1984 			inflight_packed->old_used_wrap_counter =
1985 				inflight_packed->used_wrap_counter;
1986 			inflight_packed->old_free_head =
1987 				inflight_packed->free_head;
1988 		} else {
1989 			inflight_packed->used_idx =
1990 				inflight_packed->old_used_idx;
1991 			inflight_packed->used_wrap_counter =
1992 				inflight_packed->old_used_wrap_counter;
1993 			inflight_packed->free_head =
1994 				inflight_packed->old_free_head;
1995 		}
1996 	}
1997 
1998 	for (i = 0; i < inflight_packed->desc_num; i++) {
1999 		if (inflight_packed->desc[i].inflight == 1)
2000 			resubmit_num++;
2001 	}
2002 
2003 	if (resubmit_num) {
2004 		resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info),
2005 				0, vq->numa_node);
2006 		if (resubmit == NULL) {
2007 			VHOST_LOG_CONFIG(ERR,
2008 					"(%s) failed to allocate memory for resubmit info.\n",
2009 					dev->ifname);
2010 			return RTE_VHOST_MSG_RESULT_ERR;
2011 		}
2012 
2013 		resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
2014 				resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
2015 				0, vq->numa_node);
2016 		if (resubmit->resubmit_list == NULL) {
2017 			VHOST_LOG_CONFIG(ERR,
2018 					"(%s) failed to allocate memory for resubmit desc.\n",
2019 					dev->ifname);
2020 			rte_free(resubmit);
2021 			return RTE_VHOST_MSG_RESULT_ERR;
2022 		}
2023 
2024 		num = 0;
2025 		for (i = 0; i < inflight_packed->desc_num; i++) {
2026 			if (vq->inflight_packed->desc[i].inflight == 1) {
2027 				resubmit->resubmit_list[num].index = i;
2028 				resubmit->resubmit_list[num].counter =
2029 					inflight_packed->desc[i].counter;
2030 				num++;
2031 			}
2032 		}
2033 		resubmit->resubmit_num = num;
2034 
2035 		if (resubmit->resubmit_num > 1)
2036 			qsort(resubmit->resubmit_list, resubmit->resubmit_num,
2037 			      sizeof(struct rte_vhost_resubmit_desc),
2038 			      resubmit_desc_compare);
2039 
2040 		vq->global_counter = resubmit->resubmit_list[0].counter + 1;
2041 		vq->resubmit_inflight = resubmit;
2042 	}
2043 
2044 	return RTE_VHOST_MSG_RESULT_OK;
2045 }
2046 
2047 static int
2048 vhost_user_set_vring_kick(struct virtio_net **pdev,
2049 			struct vhu_msg_context *ctx,
2050 			int main_fd __rte_unused)
2051 {
2052 	struct virtio_net *dev = *pdev;
2053 	struct vhost_vring_file file;
2054 	struct vhost_virtqueue *vq;
2055 	int expected_fds;
2056 
2057 	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
2058 	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
2059 		return RTE_VHOST_MSG_RESULT_ERR;
2060 
2061 	file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
2062 	if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)
2063 		file.fd = VIRTIO_INVALID_EVENTFD;
2064 	else
2065 		file.fd = ctx->fds[0];
2066 	VHOST_LOG_CONFIG(INFO, "(%s) vring kick idx:%d file:%d\n",
2067 			dev->ifname, file.index, file.fd);
2068 
2069 	/* Interpret ring addresses only when ring is started. */
2070 	dev = translate_ring_addresses(dev, file.index);
2071 	if (!dev) {
2072 		if (file.fd != VIRTIO_INVALID_EVENTFD)
2073 			close(file.fd);
2074 
2075 		return RTE_VHOST_MSG_RESULT_ERR;
2076 	}
2077 
2078 	*pdev = dev;
2079 
2080 	vq = dev->virtqueue[file.index];
2081 
2082 	/*
2083 	 * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
2084 	 * the ring starts already enabled. Otherwise, it is enabled via
2085 	 * the SET_VRING_ENABLE message.
2086 	 */
2087 	if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
2088 		vq->enabled = true;
2089 	}
2090 
2091 	if (vq->ready) {
2092 		vq->ready = false;
2093 		vhost_user_notify_queue_state(dev, file.index, 0);
2094 	}
2095 
2096 	if (vq->kickfd >= 0)
2097 		close(vq->kickfd);
2098 	vq->kickfd = file.fd;
2099 
2100 	if (vq_is_packed(dev)) {
2101 		if (vhost_check_queue_inflights_packed(dev, vq)) {
2102 			VHOST_LOG_CONFIG(ERR, "(%s) failed to inflights for vq: %d\n",
2103 					dev->ifname, file.index);
2104 			return RTE_VHOST_MSG_RESULT_ERR;
2105 		}
2106 	} else {
2107 		if (vhost_check_queue_inflights_split(dev, vq)) {
2108 			VHOST_LOG_CONFIG(ERR, "(%s) failed to inflights for vq: %d\n",
2109 					dev->ifname, file.index);
2110 			return RTE_VHOST_MSG_RESULT_ERR;
2111 		}
2112 	}
2113 
2114 	return RTE_VHOST_MSG_RESULT_OK;
2115 }
2116 
2117 /*
2118  * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
2119  */
2120 static int
2121 vhost_user_get_vring_base(struct virtio_net **pdev,
2122 			struct vhu_msg_context *ctx,
2123 			int main_fd __rte_unused)
2124 {
2125 	struct virtio_net *dev = *pdev;
2126 	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
2127 	uint64_t val;
2128 
2129 	if (validate_msg_fds(dev, ctx, 0) != 0)
2130 		return RTE_VHOST_MSG_RESULT_ERR;
2131 
2132 	/* We have to stop the queue (virtio) if it is running. */
2133 	vhost_destroy_device_notify(dev);
2134 
2135 	dev->flags &= ~VIRTIO_DEV_READY;
2136 	dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
2137 
2138 	/* Here we are safe to get the indexes */
2139 	if (vq_is_packed(dev)) {
2140 		/*
2141 		 * Bit[0:14]: avail index
2142 		 * Bit[15]: avail wrap counter
2143 		 */
2144 		val = vq->last_avail_idx & 0x7fff;
2145 		val |= vq->avail_wrap_counter << 15;
2146 		ctx->msg.payload.state.num = val;
2147 	} else {
2148 		ctx->msg.payload.state.num = vq->last_avail_idx;
2149 	}
2150 
2151 	VHOST_LOG_CONFIG(INFO, "(%s) vring base idx:%d file:%d\n",
2152 			dev->ifname, ctx->msg.payload.state.index,
2153 			ctx->msg.payload.state.num);
2154 	/*
2155 	 * Based on current qemu vhost-user implementation, this message is
2156 	 * sent and only sent in vhost_vring_stop.
2157 	 * TODO: cleanup the vring, it isn't usable since here.
2158 	 */
2159 	if (vq->kickfd >= 0)
2160 		close(vq->kickfd);
2161 
2162 	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
2163 
2164 	if (vq->callfd >= 0)
2165 		close(vq->callfd);
2166 
2167 	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
2168 
2169 	vq->signalled_used_valid = false;
2170 
2171 	if (vq_is_packed(dev)) {
2172 		rte_free(vq->shadow_used_packed);
2173 		vq->shadow_used_packed = NULL;
2174 	} else {
2175 		rte_free(vq->shadow_used_split);
2176 		vq->shadow_used_split = NULL;
2177 	}
2178 
2179 	rte_free(vq->batch_copy_elems);
2180 	vq->batch_copy_elems = NULL;
2181 
2182 	rte_free(vq->log_cache);
2183 	vq->log_cache = NULL;
2184 
2185 	ctx->msg.size = sizeof(ctx->msg.payload.state);
2186 	ctx->fd_num = 0;
2187 
2188 	vhost_user_iotlb_flush_all(vq);
2189 
2190 	vring_invalidate(dev, vq);
2191 
2192 	return RTE_VHOST_MSG_RESULT_REPLY;
2193 }
2194 
2195 /*
2196  * when virtio queues are ready to work, qemu will send us to
2197  * enable the virtio queue pair.
2198  */
2199 static int
2200 vhost_user_set_vring_enable(struct virtio_net **pdev,
2201 			struct vhu_msg_context *ctx,
2202 			int main_fd __rte_unused)
2203 {
2204 	struct virtio_net *dev = *pdev;
2205 	bool enable = !!ctx->msg.payload.state.num;
2206 	int index = (int)ctx->msg.payload.state.index;
2207 
2208 	if (validate_msg_fds(dev, ctx, 0) != 0)
2209 		return RTE_VHOST_MSG_RESULT_ERR;
2210 
2211 	VHOST_LOG_CONFIG(INFO, "(%s) set queue enable: %d to qp idx: %d\n",
2212 			dev->ifname, enable, index);
2213 
2214 	if (enable && dev->virtqueue[index]->async) {
2215 		if (dev->virtqueue[index]->async->pkts_inflight_n) {
2216 			VHOST_LOG_CONFIG(ERR,
2217 				"(%s) failed to enable vring. Inflight packets must be completed first\n",
2218 				dev->ifname);
2219 			return RTE_VHOST_MSG_RESULT_ERR;
2220 		}
2221 	}
2222 
2223 	dev->virtqueue[index]->enabled = enable;
2224 
2225 	return RTE_VHOST_MSG_RESULT_OK;
2226 }
2227 
2228 static int
2229 vhost_user_get_protocol_features(struct virtio_net **pdev,
2230 			struct vhu_msg_context *ctx,
2231 			int main_fd __rte_unused)
2232 {
2233 	struct virtio_net *dev = *pdev;
2234 	uint64_t features, protocol_features;
2235 
2236 	if (validate_msg_fds(dev, ctx, 0) != 0)
2237 		return RTE_VHOST_MSG_RESULT_ERR;
2238 
2239 	rte_vhost_driver_get_features(dev->ifname, &features);
2240 	rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features);
2241 
2242 	ctx->msg.payload.u64 = protocol_features;
2243 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
2244 	ctx->fd_num = 0;
2245 
2246 	return RTE_VHOST_MSG_RESULT_REPLY;
2247 }
2248 
2249 static int
2250 vhost_user_set_protocol_features(struct virtio_net **pdev,
2251 			struct vhu_msg_context *ctx,
2252 			int main_fd __rte_unused)
2253 {
2254 	struct virtio_net *dev = *pdev;
2255 	uint64_t protocol_features = ctx->msg.payload.u64;
2256 	uint64_t slave_protocol_features = 0;
2257 
2258 	if (validate_msg_fds(dev, ctx, 0) != 0)
2259 		return RTE_VHOST_MSG_RESULT_ERR;
2260 
2261 	rte_vhost_driver_get_protocol_features(dev->ifname,
2262 			&slave_protocol_features);
2263 	if (protocol_features & ~slave_protocol_features) {
2264 		VHOST_LOG_CONFIG(ERR, "(%s) received invalid protocol features.\n", dev->ifname);
2265 		return RTE_VHOST_MSG_RESULT_ERR;
2266 	}
2267 
2268 	dev->protocol_features = protocol_features;
2269 	VHOST_LOG_CONFIG(INFO, "(%s) negotiated Vhost-user protocol features: 0x%" PRIx64 "\n",
2270 		dev->ifname, dev->protocol_features);
2271 
2272 	return RTE_VHOST_MSG_RESULT_OK;
2273 }
2274 
2275 static int
2276 vhost_user_set_log_base(struct virtio_net **pdev,
2277 			struct vhu_msg_context *ctx,
2278 			int main_fd __rte_unused)
2279 {
2280 	struct virtio_net *dev = *pdev;
2281 	int fd = ctx->fds[0];
2282 	uint64_t size, off;
2283 	void *addr;
2284 	uint32_t i;
2285 
2286 	if (validate_msg_fds(dev, ctx, 1) != 0)
2287 		return RTE_VHOST_MSG_RESULT_ERR;
2288 
2289 	if (fd < 0) {
2290 		VHOST_LOG_CONFIG(ERR, "(%s) invalid log fd: %d\n", dev->ifname, fd);
2291 		return RTE_VHOST_MSG_RESULT_ERR;
2292 	}
2293 
2294 	if (ctx->msg.size != sizeof(VhostUserLog)) {
2295 		VHOST_LOG_CONFIG(ERR, "(%s) invalid log base msg size: %"PRId32" != %d\n",
2296 			dev->ifname, ctx->msg.size, (int)sizeof(VhostUserLog));
2297 		goto close_msg_fds;
2298 	}
2299 
2300 	size = ctx->msg.payload.log.mmap_size;
2301 	off  = ctx->msg.payload.log.mmap_offset;
2302 
2303 	/* Check for mmap size and offset overflow. */
2304 	if (off >= -size) {
2305 		VHOST_LOG_CONFIG(ERR,
2306 				"(%s) log offset %#"PRIx64" and log size %#"PRIx64" overflow\n",
2307 				dev->ifname, off, size);
2308 		goto close_msg_fds;
2309 	}
2310 
2311 	VHOST_LOG_CONFIG(INFO, "(%s) log mmap size: %"PRId64", offset: %"PRId64"\n",
2312 			dev->ifname, size, off);
2313 
2314 	/*
2315 	 * mmap from 0 to workaround a hugepage mmap bug: mmap will
2316 	 * fail when offset is not page size aligned.
2317 	 */
2318 	addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2319 	close(fd);
2320 	if (addr == MAP_FAILED) {
2321 		VHOST_LOG_CONFIG(ERR, "(%s) mmap log base failed!\n", dev->ifname);
2322 		return RTE_VHOST_MSG_RESULT_ERR;
2323 	}
2324 
2325 	/*
2326 	 * Free previously mapped log memory on occasionally
2327 	 * multiple VHOST_USER_SET_LOG_BASE.
2328 	 */
2329 	if (dev->log_addr) {
2330 		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
2331 	}
2332 	dev->log_addr = (uint64_t)(uintptr_t)addr;
2333 	dev->log_base = dev->log_addr + off;
2334 	dev->log_size = size;
2335 
2336 	for (i = 0; i < dev->nr_vring; i++) {
2337 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2338 
2339 		rte_free(vq->log_cache);
2340 		vq->log_cache = NULL;
2341 		vq->log_cache_nb_elem = 0;
2342 		vq->log_cache = rte_malloc_socket("vq log cache",
2343 				sizeof(struct log_cache_entry) * VHOST_LOG_CACHE_NR,
2344 				0, vq->numa_node);
2345 		/*
2346 		 * If log cache alloc fail, don't fail migration, but no
2347 		 * caching will be done, which will impact performance
2348 		 */
2349 		if (!vq->log_cache)
2350 			VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate VQ logging cache\n",
2351 					dev->ifname);
2352 	}
2353 
2354 	/*
2355 	 * The spec is not clear about it (yet), but QEMU doesn't expect
2356 	 * any payload in the reply.
2357 	 */
2358 	ctx->msg.size = 0;
2359 	ctx->fd_num = 0;
2360 
2361 	return RTE_VHOST_MSG_RESULT_REPLY;
2362 
2363 close_msg_fds:
2364 	close_msg_fds(ctx);
2365 	return RTE_VHOST_MSG_RESULT_ERR;
2366 }
2367 
2368 static int vhost_user_set_log_fd(struct virtio_net **pdev,
2369 			struct vhu_msg_context *ctx,
2370 			int main_fd __rte_unused)
2371 {
2372 	struct virtio_net *dev = *pdev;
2373 
2374 	if (validate_msg_fds(dev, ctx, 1) != 0)
2375 		return RTE_VHOST_MSG_RESULT_ERR;
2376 
2377 	close(ctx->fds[0]);
2378 	VHOST_LOG_CONFIG(INFO, "(%s) not implemented.\n", dev->ifname);
2379 
2380 	return RTE_VHOST_MSG_RESULT_OK;
2381 }
2382 
2383 /*
2384  * An rarp packet is constructed and broadcasted to notify switches about
2385  * the new location of the migrated VM, so that packets from outside will
2386  * not be lost after migration.
2387  *
2388  * However, we don't actually "send" a rarp packet here, instead, we set
2389  * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
2390  */
2391 static int
2392 vhost_user_send_rarp(struct virtio_net **pdev,
2393 			struct vhu_msg_context *ctx,
2394 			int main_fd __rte_unused)
2395 {
2396 	struct virtio_net *dev = *pdev;
2397 	uint8_t *mac = (uint8_t *)&ctx->msg.payload.u64;
2398 	struct rte_vdpa_device *vdpa_dev;
2399 
2400 	if (validate_msg_fds(dev, ctx, 0) != 0)
2401 		return RTE_VHOST_MSG_RESULT_ERR;
2402 
2403 	VHOST_LOG_CONFIG(DEBUG, "(%s) MAC: " RTE_ETHER_ADDR_PRT_FMT "\n",
2404 		dev->ifname, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
2405 	memcpy(dev->mac.addr_bytes, mac, 6);
2406 
2407 	/*
2408 	 * Set the flag to inject a RARP broadcast packet at
2409 	 * rte_vhost_dequeue_burst().
2410 	 *
2411 	 * __ATOMIC_RELEASE ordering is for making sure the mac is
2412 	 * copied before the flag is set.
2413 	 */
2414 	__atomic_store_n(&dev->broadcast_rarp, 1, __ATOMIC_RELEASE);
2415 	vdpa_dev = dev->vdpa_dev;
2416 	if (vdpa_dev && vdpa_dev->ops->migration_done)
2417 		vdpa_dev->ops->migration_done(dev->vid);
2418 
2419 	return RTE_VHOST_MSG_RESULT_OK;
2420 }
2421 
2422 static int
2423 vhost_user_net_set_mtu(struct virtio_net **pdev,
2424 			struct vhu_msg_context *ctx,
2425 			int main_fd __rte_unused)
2426 {
2427 	struct virtio_net *dev = *pdev;
2428 
2429 	if (validate_msg_fds(dev, ctx, 0) != 0)
2430 		return RTE_VHOST_MSG_RESULT_ERR;
2431 
2432 	if (ctx->msg.payload.u64 < VIRTIO_MIN_MTU ||
2433 			ctx->msg.payload.u64 > VIRTIO_MAX_MTU) {
2434 		VHOST_LOG_CONFIG(ERR, "(%s) invalid MTU size (%"PRIu64")\n",
2435 				dev->ifname, ctx->msg.payload.u64);
2436 
2437 		return RTE_VHOST_MSG_RESULT_ERR;
2438 	}
2439 
2440 	dev->mtu = ctx->msg.payload.u64;
2441 
2442 	return RTE_VHOST_MSG_RESULT_OK;
2443 }
2444 
2445 static int
2446 vhost_user_set_req_fd(struct virtio_net **pdev,
2447 			struct vhu_msg_context *ctx,
2448 			int main_fd __rte_unused)
2449 {
2450 	struct virtio_net *dev = *pdev;
2451 	int fd = ctx->fds[0];
2452 
2453 	if (validate_msg_fds(dev, ctx, 1) != 0)
2454 		return RTE_VHOST_MSG_RESULT_ERR;
2455 
2456 	if (fd < 0) {
2457 		VHOST_LOG_CONFIG(ERR, "(%s) invalid file descriptor for slave channel (%d)\n",
2458 				dev->ifname, fd);
2459 		return RTE_VHOST_MSG_RESULT_ERR;
2460 	}
2461 
2462 	if (dev->slave_req_fd >= 0)
2463 		close(dev->slave_req_fd);
2464 
2465 	dev->slave_req_fd = fd;
2466 
2467 	return RTE_VHOST_MSG_RESULT_OK;
2468 }
2469 
2470 static int
2471 is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
2472 {
2473 	struct vhost_vring_addr *ra;
2474 	uint64_t start, end, len;
2475 
2476 	start = imsg->iova;
2477 	end = start + imsg->size;
2478 
2479 	ra = &vq->ring_addrs;
2480 	len = sizeof(struct vring_desc) * vq->size;
2481 	if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start)
2482 		return 1;
2483 
2484 	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
2485 	if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start)
2486 		return 1;
2487 
2488 	len = sizeof(struct vring_used) +
2489 	       sizeof(struct vring_used_elem) * vq->size;
2490 	if (ra->used_user_addr < end && (ra->used_user_addr + len) > start)
2491 		return 1;
2492 
2493 	if (ra->flags & (1 << VHOST_VRING_F_LOG)) {
2494 		len = sizeof(uint64_t);
2495 		if (ra->log_guest_addr < end &&
2496 		    (ra->log_guest_addr + len) > start)
2497 			return 1;
2498 	}
2499 
2500 	return 0;
2501 }
2502 
2503 static int
2504 is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
2505 {
2506 	struct vhost_vring_addr *ra;
2507 	uint64_t start, end, len;
2508 
2509 	start = imsg->iova;
2510 	end = start + imsg->size;
2511 
2512 	ra = &vq->ring_addrs;
2513 	len = sizeof(struct vring_packed_desc) * vq->size;
2514 	if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start)
2515 		return 1;
2516 
2517 	len = sizeof(struct vring_packed_desc_event);
2518 	if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start)
2519 		return 1;
2520 
2521 	len = sizeof(struct vring_packed_desc_event);
2522 	if (ra->used_user_addr < end && (ra->used_user_addr + len) > start)
2523 		return 1;
2524 
2525 	if (ra->flags & (1 << VHOST_VRING_F_LOG)) {
2526 		len = sizeof(uint64_t);
2527 		if (ra->log_guest_addr < end &&
2528 		    (ra->log_guest_addr + len) > start)
2529 			return 1;
2530 	}
2531 
2532 	return 0;
2533 }
2534 
/*
 * Tell whether an IOTLB message touches the ring areas of a virtqueue,
 * dispatching on the ring layout (packed or split) used by the device.
 */
static int is_vring_iotlb(struct virtio_net *dev,
			  struct vhost_virtqueue *vq,
			  struct vhost_iotlb_msg *imsg)
{
	return vq_is_packed(dev) ?
		is_vring_iotlb_packed(vq, imsg) :
		is_vring_iotlb_split(vq, imsg);
}
2544 
2545 static int
2546 vhost_user_iotlb_msg(struct virtio_net **pdev,
2547 			struct vhu_msg_context *ctx,
2548 			int main_fd __rte_unused)
2549 {
2550 	struct virtio_net *dev = *pdev;
2551 	struct vhost_iotlb_msg *imsg = &ctx->msg.payload.iotlb;
2552 	uint16_t i;
2553 	uint64_t vva, len;
2554 
2555 	if (validate_msg_fds(dev, ctx, 0) != 0)
2556 		return RTE_VHOST_MSG_RESULT_ERR;
2557 
2558 	switch (imsg->type) {
2559 	case VHOST_IOTLB_UPDATE:
2560 		len = imsg->size;
2561 		vva = qva_to_vva(dev, imsg->uaddr, &len);
2562 		if (!vva)
2563 			return RTE_VHOST_MSG_RESULT_ERR;
2564 
2565 		for (i = 0; i < dev->nr_vring; i++) {
2566 			struct vhost_virtqueue *vq = dev->virtqueue[i];
2567 
2568 			if (!vq)
2569 				continue;
2570 
2571 			vhost_user_iotlb_cache_insert(dev, vq, imsg->iova, vva,
2572 					len, imsg->perm);
2573 
2574 			if (is_vring_iotlb(dev, vq, imsg))
2575 				*pdev = dev = translate_ring_addresses(dev, i);
2576 		}
2577 		break;
2578 	case VHOST_IOTLB_INVALIDATE:
2579 		for (i = 0; i < dev->nr_vring; i++) {
2580 			struct vhost_virtqueue *vq = dev->virtqueue[i];
2581 
2582 			if (!vq)
2583 				continue;
2584 
2585 			vhost_user_iotlb_cache_remove(vq, imsg->iova,
2586 					imsg->size);
2587 
2588 			if (is_vring_iotlb(dev, vq, imsg))
2589 				vring_invalidate(dev, vq);
2590 		}
2591 		break;
2592 	default:
2593 		VHOST_LOG_CONFIG(ERR, "(%s) invalid IOTLB message type (%d)\n",
2594 				dev->ifname, imsg->type);
2595 		return RTE_VHOST_MSG_RESULT_ERR;
2596 	}
2597 
2598 	return RTE_VHOST_MSG_RESULT_OK;
2599 }
2600 
2601 static int
2602 vhost_user_set_postcopy_advise(struct virtio_net **pdev,
2603 			struct vhu_msg_context *ctx,
2604 			int main_fd __rte_unused)
2605 {
2606 	struct virtio_net *dev = *pdev;
2607 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
2608 	struct uffdio_api api_struct;
2609 
2610 	if (validate_msg_fds(dev, ctx, 0) != 0)
2611 		return RTE_VHOST_MSG_RESULT_ERR;
2612 
2613 	dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
2614 
2615 	if (dev->postcopy_ufd == -1) {
2616 		VHOST_LOG_CONFIG(ERR, "(%s) userfaultfd not available: %s\n",
2617 			dev->ifname, strerror(errno));
2618 		return RTE_VHOST_MSG_RESULT_ERR;
2619 	}
2620 	api_struct.api = UFFD_API;
2621 	api_struct.features = 0;
2622 	if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
2623 		VHOST_LOG_CONFIG(ERR, "(%s) UFFDIO_API ioctl failure: %s\n",
2624 			dev->ifname, strerror(errno));
2625 		close(dev->postcopy_ufd);
2626 		dev->postcopy_ufd = -1;
2627 		return RTE_VHOST_MSG_RESULT_ERR;
2628 	}
2629 	ctx->fds[0] = dev->postcopy_ufd;
2630 	ctx->fd_num = 1;
2631 
2632 	return RTE_VHOST_MSG_RESULT_REPLY;
2633 #else
2634 	dev->postcopy_ufd = -1;
2635 	ctx->fd_num = 0;
2636 
2637 	return RTE_VHOST_MSG_RESULT_ERR;
2638 #endif
2639 }
2640 
2641 static int
2642 vhost_user_set_postcopy_listen(struct virtio_net **pdev,
2643 			struct vhu_msg_context *ctx __rte_unused,
2644 			int main_fd __rte_unused)
2645 {
2646 	struct virtio_net *dev = *pdev;
2647 
2648 	if (validate_msg_fds(dev, ctx, 0) != 0)
2649 		return RTE_VHOST_MSG_RESULT_ERR;
2650 
2651 	if (dev->mem && dev->mem->nregions) {
2652 		VHOST_LOG_CONFIG(ERR, "(%s) regions already registered at postcopy-listen\n",
2653 				dev->ifname);
2654 		return RTE_VHOST_MSG_RESULT_ERR;
2655 	}
2656 	dev->postcopy_listening = 1;
2657 
2658 	return RTE_VHOST_MSG_RESULT_OK;
2659 }
2660 
2661 static int
2662 vhost_user_postcopy_end(struct virtio_net **pdev,
2663 			struct vhu_msg_context *ctx,
2664 			int main_fd __rte_unused)
2665 {
2666 	struct virtio_net *dev = *pdev;
2667 
2668 	if (validate_msg_fds(dev, ctx, 0) != 0)
2669 		return RTE_VHOST_MSG_RESULT_ERR;
2670 
2671 	dev->postcopy_listening = 0;
2672 	if (dev->postcopy_ufd >= 0) {
2673 		close(dev->postcopy_ufd);
2674 		dev->postcopy_ufd = -1;
2675 	}
2676 
2677 	ctx->msg.payload.u64 = 0;
2678 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
2679 	ctx->fd_num = 0;
2680 
2681 	return RTE_VHOST_MSG_RESULT_REPLY;
2682 }
2683 
2684 static int
2685 vhost_user_get_status(struct virtio_net **pdev,
2686 		      struct vhu_msg_context *ctx,
2687 		      int main_fd __rte_unused)
2688 {
2689 	struct virtio_net *dev = *pdev;
2690 
2691 	if (validate_msg_fds(dev, ctx, 0) != 0)
2692 		return RTE_VHOST_MSG_RESULT_ERR;
2693 
2694 	ctx->msg.payload.u64 = dev->status;
2695 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
2696 	ctx->fd_num = 0;
2697 
2698 	return RTE_VHOST_MSG_RESULT_REPLY;
2699 }
2700 
2701 static int
2702 vhost_user_set_status(struct virtio_net **pdev,
2703 			struct vhu_msg_context *ctx,
2704 			int main_fd __rte_unused)
2705 {
2706 	struct virtio_net *dev = *pdev;
2707 
2708 	if (validate_msg_fds(dev, ctx, 0) != 0)
2709 		return RTE_VHOST_MSG_RESULT_ERR;
2710 
2711 	/* As per Virtio specification, the device status is 8bits long */
2712 	if (ctx->msg.payload.u64 > UINT8_MAX) {
2713 		VHOST_LOG_CONFIG(ERR, "(%s) invalid VHOST_USER_SET_STATUS payload 0x%" PRIx64 "\n",
2714 				dev->ifname, ctx->msg.payload.u64);
2715 		return RTE_VHOST_MSG_RESULT_ERR;
2716 	}
2717 
2718 	dev->status = ctx->msg.payload.u64;
2719 
2720 	if ((dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK) &&
2721 	    (dev->flags & VIRTIO_DEV_FEATURES_FAILED)) {
2722 		VHOST_LOG_CONFIG(ERR,
2723 				"(%s) FEATURES_OK bit is set but feature negotiation failed\n",
2724 				dev->ifname);
2725 		/*
2726 		 * Clear the bit to let the driver know about the feature
2727 		 * negotiation failure
2728 		 */
2729 		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;
2730 	}
2731 
2732 	VHOST_LOG_CONFIG(INFO, "(%s) new device status(0x%08x):\n", dev->ifname,
2733 			dev->status);
2734 	VHOST_LOG_CONFIG(INFO, "(%s)\t-RESET: %u\n", dev->ifname,
2735 			(dev->status == VIRTIO_DEVICE_STATUS_RESET));
2736 	VHOST_LOG_CONFIG(INFO, "(%s)\t-ACKNOWLEDGE: %u\n", dev->ifname,
2737 			!!(dev->status & VIRTIO_DEVICE_STATUS_ACK));
2738 	VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER: %u\n", dev->ifname,
2739 			!!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER));
2740 	VHOST_LOG_CONFIG(INFO, "(%s)\t-FEATURES_OK: %u\n", dev->ifname,
2741 			!!(dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK));
2742 	VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER_OK: %u\n", dev->ifname,
2743 			!!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK));
2744 	VHOST_LOG_CONFIG(INFO, "(%s)\t-DEVICE_NEED_RESET: %u\n", dev->ifname,
2745 			!!(dev->status & VIRTIO_DEVICE_STATUS_DEV_NEED_RESET));
2746 	VHOST_LOG_CONFIG(INFO, "(%s)\t-FAILED: %u\n", dev->ifname,
2747 			!!(dev->status & VIRTIO_DEVICE_STATUS_FAILED));
2748 
2749 	return RTE_VHOST_MSG_RESULT_OK;
2750 }
2751 
2752 typedef int (*vhost_message_handler_t)(struct virtio_net **pdev,
2753 					struct vhu_msg_context *ctx,
2754 					int main_fd);
2755 
2756 static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
2757 	[VHOST_USER_NONE] = NULL,
2758 	[VHOST_USER_GET_FEATURES] = vhost_user_get_features,
2759 	[VHOST_USER_SET_FEATURES] = vhost_user_set_features,
2760 	[VHOST_USER_SET_OWNER] = vhost_user_set_owner,
2761 	[VHOST_USER_RESET_OWNER] = vhost_user_reset_owner,
2762 	[VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table,
2763 	[VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base,
2764 	[VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd,
2765 	[VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num,
2766 	[VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr,
2767 	[VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base,
2768 	[VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base,
2769 	[VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick,
2770 	[VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call,
2771 	[VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err,
2772 	[VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features,
2773 	[VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features,
2774 	[VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num,
2775 	[VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable,
2776 	[VHOST_USER_SEND_RARP] = vhost_user_send_rarp,
2777 	[VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu,
2778 	[VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd,
2779 	[VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg,
2780 	[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
2781 	[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
2782 	[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
2783 	[VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
2784 	[VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
2785 	[VHOST_USER_SET_STATUS] = vhost_user_set_status,
2786 	[VHOST_USER_GET_STATUS] = vhost_user_get_status,
2787 };
2788 
2789 /* return bytes# of read on success or negative val on failure. */
2790 static int
2791 read_vhost_message(struct virtio_net *dev, int sockfd, struct  vhu_msg_context *ctx)
2792 {
2793 	int ret;
2794 
2795 	ret = read_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, VHOST_USER_HDR_SIZE,
2796 		ctx->fds, VHOST_MEMORY_MAX_NREGIONS, &ctx->fd_num);
2797 	if (ret <= 0) {
2798 		return ret;
2799 	} else if (ret != VHOST_USER_HDR_SIZE) {
2800 		VHOST_LOG_CONFIG(ERR, "(%s) Unexpected header size read\n", dev->ifname);
2801 		close_msg_fds(ctx);
2802 		return -1;
2803 	}
2804 
2805 	if (ctx->msg.size) {
2806 		if (ctx->msg.size > sizeof(ctx->msg.payload)) {
2807 			VHOST_LOG_CONFIG(ERR, "(%s) invalid msg size: %d\n",
2808 					dev->ifname, ctx->msg.size);
2809 			return -1;
2810 		}
2811 		ret = read(sockfd, &ctx->msg.payload, ctx->msg.size);
2812 		if (ret <= 0)
2813 			return ret;
2814 		if (ret != (int)ctx->msg.size) {
2815 			VHOST_LOG_CONFIG(ERR, "(%s) read control message failed\n", dev->ifname);
2816 			return -1;
2817 		}
2818 	}
2819 
2820 	return ret;
2821 }
2822 
2823 static int
2824 send_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx)
2825 {
2826 	if (!ctx)
2827 		return 0;
2828 
2829 	return send_fd_message(dev->ifname, sockfd, (char *)&ctx->msg,
2830 		VHOST_USER_HDR_SIZE + ctx->msg.size, ctx->fds, ctx->fd_num);
2831 }
2832 
2833 static int
2834 send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx)
2835 {
2836 	if (!ctx)
2837 		return 0;
2838 
2839 	ctx->msg.flags &= ~VHOST_USER_VERSION_MASK;
2840 	ctx->msg.flags &= ~VHOST_USER_NEED_REPLY;
2841 	ctx->msg.flags |= VHOST_USER_VERSION;
2842 	ctx->msg.flags |= VHOST_USER_REPLY_MASK;
2843 
2844 	return send_vhost_message(dev, sockfd, ctx);
2845 }
2846 
2847 static int
2848 send_vhost_slave_message(struct virtio_net *dev,
2849 		struct vhu_msg_context *ctx)
2850 {
2851 	int ret;
2852 
2853 	if (ctx->msg.flags & VHOST_USER_NEED_REPLY)
2854 		rte_spinlock_lock(&dev->slave_req_lock);
2855 
2856 	ret = send_vhost_message(dev, dev->slave_req_fd, ctx);
2857 	if (ret < 0 && (ctx->msg.flags & VHOST_USER_NEED_REPLY))
2858 		rte_spinlock_unlock(&dev->slave_req_lock);
2859 
2860 	return ret;
2861 }
2862 
2863 /*
2864  * Allocate a queue pair if it hasn't been allocated yet
2865  */
2866 static int
2867 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
2868 			struct vhu_msg_context *ctx)
2869 {
2870 	uint32_t vring_idx;
2871 
2872 	switch (ctx->msg.request.master) {
2873 	case VHOST_USER_SET_VRING_KICK:
2874 	case VHOST_USER_SET_VRING_CALL:
2875 	case VHOST_USER_SET_VRING_ERR:
2876 		vring_idx = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
2877 		break;
2878 	case VHOST_USER_SET_VRING_NUM:
2879 	case VHOST_USER_SET_VRING_BASE:
2880 	case VHOST_USER_GET_VRING_BASE:
2881 	case VHOST_USER_SET_VRING_ENABLE:
2882 		vring_idx = ctx->msg.payload.state.index;
2883 		break;
2884 	case VHOST_USER_SET_VRING_ADDR:
2885 		vring_idx = ctx->msg.payload.addr.index;
2886 		break;
2887 	default:
2888 		return 0;
2889 	}
2890 
2891 	if (vring_idx >= VHOST_MAX_VRING) {
2892 		VHOST_LOG_CONFIG(ERR, "(%s) invalid vring index: %u\n", dev->ifname, vring_idx);
2893 		return -1;
2894 	}
2895 
2896 	if (dev->virtqueue[vring_idx])
2897 		return 0;
2898 
2899 	return alloc_vring_queue(dev, vring_idx);
2900 }
2901 
2902 static void
2903 vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
2904 {
2905 	unsigned int i = 0;
2906 	unsigned int vq_num = 0;
2907 
2908 	while (vq_num < dev->nr_vring) {
2909 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2910 
2911 		if (vq) {
2912 			rte_spinlock_lock(&vq->access_lock);
2913 			vq_num++;
2914 		}
2915 		i++;
2916 	}
2917 }
2918 
2919 static void
2920 vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
2921 {
2922 	unsigned int i = 0;
2923 	unsigned int vq_num = 0;
2924 
2925 	while (vq_num < dev->nr_vring) {
2926 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2927 
2928 		if (vq) {
2929 			rte_spinlock_unlock(&vq->access_lock);
2930 			vq_num++;
2931 		}
2932 		i++;
2933 	}
2934 }
2935 
2936 int
2937 vhost_user_msg_handler(int vid, int fd)
2938 {
2939 	struct virtio_net *dev;
2940 	struct vhu_msg_context ctx;
2941 	struct rte_vdpa_device *vdpa_dev;
2942 	int ret;
2943 	int unlock_required = 0;
2944 	bool handled;
2945 	int request;
2946 	uint32_t i;
2947 
2948 	dev = get_device(vid);
2949 	if (dev == NULL)
2950 		return -1;
2951 
2952 	if (!dev->notify_ops) {
2953 		dev->notify_ops = vhost_driver_callback_get(dev->ifname);
2954 		if (!dev->notify_ops) {
2955 			VHOST_LOG_CONFIG(ERR, "(%s) failed to get callback ops for driver\n",
2956 				dev->ifname);
2957 			return -1;
2958 		}
2959 	}
2960 
2961 	ret = read_vhost_message(dev, fd, &ctx);
2962 	if (ret <= 0) {
2963 		if (ret < 0)
2964 			VHOST_LOG_CONFIG(ERR, "(%s) vhost read message failed\n", dev->ifname);
2965 		else
2966 			VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname);
2967 
2968 		return -1;
2969 	}
2970 
2971 	ret = 0;
2972 	request = ctx.msg.request.master;
2973 	if (request > VHOST_USER_NONE && request < VHOST_USER_MAX &&
2974 			vhost_message_str[request]) {
2975 		if (request != VHOST_USER_IOTLB_MSG)
2976 			VHOST_LOG_CONFIG(INFO, "(%s) read message %s\n",
2977 				dev->ifname, vhost_message_str[request]);
2978 		else
2979 			VHOST_LOG_CONFIG(DEBUG, "(%s) read message %s\n",
2980 				dev->ifname, vhost_message_str[request]);
2981 	} else {
2982 		VHOST_LOG_CONFIG(DEBUG, "(%s) external request %d\n", dev->ifname, request);
2983 	}
2984 
2985 	ret = vhost_user_check_and_alloc_queue_pair(dev, &ctx);
2986 	if (ret < 0) {
2987 		VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc queue\n", dev->ifname);
2988 		return -1;
2989 	}
2990 
2991 	/*
2992 	 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
2993 	 * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops
2994 	 * and device is destroyed. destroy_device waits for queues to be
2995 	 * inactive, so it is safe. Otherwise taking the access_lock
2996 	 * would cause a dead lock.
2997 	 */
2998 	switch (request) {
2999 	case VHOST_USER_SET_FEATURES:
3000 	case VHOST_USER_SET_PROTOCOL_FEATURES:
3001 	case VHOST_USER_SET_OWNER:
3002 	case VHOST_USER_SET_MEM_TABLE:
3003 	case VHOST_USER_SET_LOG_BASE:
3004 	case VHOST_USER_SET_LOG_FD:
3005 	case VHOST_USER_SET_VRING_NUM:
3006 	case VHOST_USER_SET_VRING_ADDR:
3007 	case VHOST_USER_SET_VRING_BASE:
3008 	case VHOST_USER_SET_VRING_KICK:
3009 	case VHOST_USER_SET_VRING_CALL:
3010 	case VHOST_USER_SET_VRING_ERR:
3011 	case VHOST_USER_SET_VRING_ENABLE:
3012 	case VHOST_USER_SEND_RARP:
3013 	case VHOST_USER_NET_SET_MTU:
3014 	case VHOST_USER_SET_SLAVE_REQ_FD:
3015 		if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
3016 			vhost_user_lock_all_queue_pairs(dev);
3017 			unlock_required = 1;
3018 		}
3019 		break;
3020 	default:
3021 		break;
3022 
3023 	}
3024 
3025 	handled = false;
3026 	if (dev->extern_ops.pre_msg_handle) {
3027 		ret = (*dev->extern_ops.pre_msg_handle)(dev->vid,
3028 				(void *)&ctx.msg);
3029 		switch (ret) {
3030 		case RTE_VHOST_MSG_RESULT_REPLY:
3031 			send_vhost_reply(dev, fd, &ctx);
3032 			/* Fall-through */
3033 		case RTE_VHOST_MSG_RESULT_ERR:
3034 		case RTE_VHOST_MSG_RESULT_OK:
3035 			handled = true;
3036 			goto skip_to_post_handle;
3037 		case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
3038 		default:
3039 			break;
3040 		}
3041 	}
3042 
3043 	if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) {
3044 		if (!vhost_message_handlers[request])
3045 			goto skip_to_post_handle;
3046 		ret = vhost_message_handlers[request](&dev, &ctx, fd);
3047 
3048 		switch (ret) {
3049 		case RTE_VHOST_MSG_RESULT_ERR:
3050 			VHOST_LOG_CONFIG(ERR, "(%s) processing %s failed.\n",
3051 					dev->ifname, vhost_message_str[request]);
3052 			handled = true;
3053 			break;
3054 		case RTE_VHOST_MSG_RESULT_OK:
3055 			VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded.\n",
3056 					dev->ifname, vhost_message_str[request]);
3057 			handled = true;
3058 			break;
3059 		case RTE_VHOST_MSG_RESULT_REPLY:
3060 			VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded and needs reply.\n",
3061 					dev->ifname, vhost_message_str[request]);
3062 			send_vhost_reply(dev, fd, &ctx);
3063 			handled = true;
3064 			break;
3065 		default:
3066 			break;
3067 		}
3068 	}
3069 
3070 skip_to_post_handle:
3071 	if (ret != RTE_VHOST_MSG_RESULT_ERR &&
3072 			dev->extern_ops.post_msg_handle) {
3073 		ret = (*dev->extern_ops.post_msg_handle)(dev->vid,
3074 				(void *)&ctx.msg);
3075 		switch (ret) {
3076 		case RTE_VHOST_MSG_RESULT_REPLY:
3077 			send_vhost_reply(dev, fd, &ctx);
3078 			/* Fall-through */
3079 		case RTE_VHOST_MSG_RESULT_ERR:
3080 		case RTE_VHOST_MSG_RESULT_OK:
3081 			handled = true;
3082 		case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
3083 		default:
3084 			break;
3085 		}
3086 	}
3087 
3088 	/* If message was not handled at this stage, treat it as an error */
3089 	if (!handled) {
3090 		VHOST_LOG_CONFIG(ERR, "(%s) vhost message (req: %d) was not handled.\n",
3091 				dev->ifname, request);
3092 		close_msg_fds(&ctx);
3093 		ret = RTE_VHOST_MSG_RESULT_ERR;
3094 	}
3095 
3096 	/*
3097 	 * If the request required a reply that was already sent,
3098 	 * this optional reply-ack won't be sent as the
3099 	 * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply().
3100 	 */
3101 	if (ctx.msg.flags & VHOST_USER_NEED_REPLY) {
3102 		ctx.msg.payload.u64 = ret == RTE_VHOST_MSG_RESULT_ERR;
3103 		ctx.msg.size = sizeof(ctx.msg.payload.u64);
3104 		ctx.fd_num = 0;
3105 		send_vhost_reply(dev, fd, &ctx);
3106 	} else if (ret == RTE_VHOST_MSG_RESULT_ERR) {
3107 		VHOST_LOG_CONFIG(ERR, "(%s) vhost message handling failed.\n", dev->ifname);
3108 		return -1;
3109 	}
3110 
3111 	for (i = 0; i < dev->nr_vring; i++) {
3112 		struct vhost_virtqueue *vq = dev->virtqueue[i];
3113 		bool cur_ready = vq_is_ready(dev, vq);
3114 
3115 		if (cur_ready != (vq && vq->ready)) {
3116 			vq->ready = cur_ready;
3117 			vhost_user_notify_queue_state(dev, i, cur_ready);
3118 		}
3119 	}
3120 
3121 	if (unlock_required)
3122 		vhost_user_unlock_all_queue_pairs(dev);
3123 
3124 	if (!virtio_is_ready(dev))
3125 		goto out;
3126 
3127 	/*
3128 	 * Virtio is now ready. If not done already, it is time
3129 	 * to notify the application it can process the rings and
3130 	 * configure the vDPA device if present.
3131 	 */
3132 
3133 	if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
3134 		if (dev->notify_ops->new_device(dev->vid) == 0)
3135 			dev->flags |= VIRTIO_DEV_RUNNING;
3136 	}
3137 
3138 	vdpa_dev = dev->vdpa_dev;
3139 	if (!vdpa_dev)
3140 		goto out;
3141 
3142 	if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
3143 		if (vdpa_dev->ops->dev_conf(dev->vid))
3144 			VHOST_LOG_CONFIG(ERR, "(%s) failed to configure vDPA device\n",
3145 					dev->ifname);
3146 		else
3147 			dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED;
3148 	}
3149 
3150 out:
3151 	return 0;
3152 }
3153 
3154 static int process_slave_message_reply(struct virtio_net *dev,
3155 				       const struct vhu_msg_context *ctx)
3156 {
3157 	struct vhu_msg_context msg_reply;
3158 	int ret;
3159 
3160 	if ((ctx->msg.flags & VHOST_USER_NEED_REPLY) == 0)
3161 		return 0;
3162 
3163 	ret = read_vhost_message(dev, dev->slave_req_fd, &msg_reply);
3164 	if (ret <= 0) {
3165 		if (ret < 0)
3166 			VHOST_LOG_CONFIG(ERR, "(%s) vhost read slave message reply failed\n",
3167 					dev->ifname);
3168 		else
3169 			VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname);
3170 		ret = -1;
3171 		goto out;
3172 	}
3173 
3174 	ret = 0;
3175 	if (msg_reply.msg.request.slave != ctx->msg.request.slave) {
3176 		VHOST_LOG_CONFIG(ERR, "(%s) received unexpected msg type (%u), expected %u\n",
3177 				dev->ifname, msg_reply.msg.request.slave, ctx->msg.request.slave);
3178 		ret = -1;
3179 		goto out;
3180 	}
3181 
3182 	ret = msg_reply.msg.payload.u64 ? -1 : 0;
3183 
3184 out:
3185 	rte_spinlock_unlock(&dev->slave_req_lock);
3186 	return ret;
3187 }
3188 
3189 int
3190 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
3191 {
3192 	int ret;
3193 	struct vhu_msg_context ctx = {
3194 		.msg = {
3195 			.request.slave = VHOST_USER_SLAVE_IOTLB_MSG,
3196 			.flags = VHOST_USER_VERSION,
3197 			.size = sizeof(ctx.msg.payload.iotlb),
3198 			.payload.iotlb = {
3199 				.iova = iova,
3200 				.perm = perm,
3201 				.type = VHOST_IOTLB_MISS,
3202 			},
3203 		},
3204 	};
3205 
3206 	ret = send_vhost_message(dev, dev->slave_req_fd, &ctx);
3207 	if (ret < 0) {
3208 		VHOST_LOG_CONFIG(ERR, "(%s) failed to send IOTLB miss message (%d)\n",
3209 				dev->ifname, ret);
3210 		return ret;
3211 	}
3212 
3213 	return 0;
3214 }
3215 
3216 static int
3217 vhost_user_slave_config_change(struct virtio_net *dev, bool need_reply)
3218 {
3219 	int ret;
3220 	struct vhu_msg_context ctx = {
3221 		.msg = {
3222 			.request.slave = VHOST_USER_SLAVE_CONFIG_CHANGE_MSG,
3223 			.flags = VHOST_USER_VERSION,
3224 			.size = 0,
3225 		}
3226 	};
3227 
3228 	if (need_reply)
3229 		ctx.msg.flags |= VHOST_USER_NEED_REPLY;
3230 
3231 	ret = send_vhost_slave_message(dev, &ctx);
3232 	if (ret < 0) {
3233 		VHOST_LOG_CONFIG(ERR, "(%s) failed to send config change (%d)\n",
3234 				dev->ifname, ret);
3235 		return ret;
3236 	}
3237 
3238 	return process_slave_message_reply(dev, &ctx);
3239 }
3240 
/*
 * Public entry point: look up the device for vid and forward a
 * config-change notification to it.
 *
 * Returns -ENODEV when get_device() finds no device, otherwise the result
 * of vhost_user_slave_config_change().
 */
int
rte_vhost_slave_config_change(int vid, bool need_reply)
{
	struct virtio_net *dev = get_device(vid);

	if (dev == NULL)
		return -ENODEV;

	return vhost_user_slave_config_change(dev, need_reply);
}
3252 
3253 static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
3254 						    int index, int fd,
3255 						    uint64_t offset,
3256 						    uint64_t size)
3257 {
3258 	int ret;
3259 	struct vhu_msg_context ctx = {
3260 		.msg = {
3261 			.request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
3262 			.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
3263 			.size = sizeof(ctx.msg.payload.area),
3264 			.payload.area = {
3265 				.u64 = index & VHOST_USER_VRING_IDX_MASK,
3266 				.size = size,
3267 				.offset = offset,
3268 			},
3269 		},
3270 	};
3271 
3272 	if (fd < 0)
3273 		ctx.msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
3274 	else {
3275 		ctx.fds[0] = fd;
3276 		ctx.fd_num = 1;
3277 	}
3278 
3279 	ret = send_vhost_slave_message(dev, &ctx);
3280 	if (ret < 0) {
3281 		VHOST_LOG_CONFIG(ERR, "(%s) failed to set host notifier (%d)\n",
3282 				dev->ifname, ret);
3283 		return ret;
3284 	}
3285 
3286 	return process_slave_message_reply(dev, &ctx);
3287 }
3288 
3289 int rte_vhost_host_notifier_ctrl(int vid, uint16_t qid, bool enable)
3290 {
3291 	struct virtio_net *dev;
3292 	struct rte_vdpa_device *vdpa_dev;
3293 	int vfio_device_fd, ret = 0;
3294 	uint64_t offset, size;
3295 	unsigned int i, q_start, q_last;
3296 
3297 	dev = get_device(vid);
3298 	if (!dev)
3299 		return -ENODEV;
3300 
3301 	vdpa_dev = dev->vdpa_dev;
3302 	if (vdpa_dev == NULL)
3303 		return -ENODEV;
3304 
3305 	if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ||
3306 	    !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) ||
3307 	    !(dev->protocol_features &
3308 			(1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) ||
3309 	    !(dev->protocol_features &
3310 			(1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) ||
3311 	    !(dev->protocol_features &
3312 			(1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)))
3313 		return -ENOTSUP;
3314 
3315 	if (qid == RTE_VHOST_QUEUE_ALL) {
3316 		q_start = 0;
3317 		q_last = dev->nr_vring - 1;
3318 	} else {
3319 		if (qid >= dev->nr_vring)
3320 			return -EINVAL;
3321 		q_start = qid;
3322 		q_last = qid;
3323 	}
3324 
3325 	RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP);
3326 	RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP);
3327 
3328 	vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid);
3329 	if (vfio_device_fd < 0)
3330 		return -ENOTSUP;
3331 
3332 	if (enable) {
3333 		for (i = q_start; i <= q_last; i++) {
3334 			if (vdpa_dev->ops->get_notify_area(vid, i, &offset,
3335 					&size) < 0) {
3336 				ret = -ENOTSUP;
3337 				goto disable;
3338 			}
3339 
3340 			if (vhost_user_slave_set_vring_host_notifier(dev, i,
3341 					vfio_device_fd, offset, size) < 0) {
3342 				ret = -EFAULT;
3343 				goto disable;
3344 			}
3345 		}
3346 	} else {
3347 disable:
3348 		for (i = q_start; i <= q_last; i++) {
3349 			vhost_user_slave_set_vring_host_notifier(dev, i, -1,
3350 					0, 0);
3351 		}
3352 	}
3353 
3354 	return ret;
3355 }
3356