1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2018 Intel Corporation 3 */ 4 5 /* Security model 6 * -------------- 7 * The vhost-user protocol connection is an external interface, so it must be 8 * robust against invalid inputs. 9 * 10 * This is important because the vhost-user master is only one step removed 11 * from the guest. Malicious guests that have escaped will then launch further 12 * attacks from the vhost-user master. 13 * 14 * Even in deployments where guests are trusted, a bug in the vhost-user master 15 * can still cause invalid messages to be sent. Such messages must not 16 * compromise the stability of the DPDK application by causing crashes, memory 17 * corruption, or other problematic behavior. 18 * 19 * Do not assume received VhostUserMsg fields contain sensible values! 20 */ 21 22 #include <stdint.h> 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <string.h> 26 #include <unistd.h> 27 #include <fcntl.h> 28 #include <sys/ioctl.h> 29 #include <sys/mman.h> 30 #include <sys/types.h> 31 #include <sys/stat.h> 32 #include <sys/syscall.h> 33 #include <assert.h> 34 #ifdef RTE_LIBRTE_VHOST_NUMA 35 #include <numaif.h> 36 #endif 37 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 38 #include <linux/userfaultfd.h> 39 #endif 40 #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ 41 #include <linux/memfd.h> 42 #define MEMFD_SUPPORTED 43 #endif 44 45 #include <rte_common.h> 46 #include <rte_malloc.h> 47 #include <rte_log.h> 48 #include <rte_vfio.h> 49 #include <rte_errno.h> 50 51 #include "iotlb.h" 52 #include "vhost.h" 53 #include "vhost_user.h" 54 55 #define VIRTIO_MIN_MTU 68 56 #define VIRTIO_MAX_MTU 65535 57 58 #define INFLIGHT_ALIGNMENT 64 59 #define INFLIGHT_VERSION 0x1 60 61 static const char *vhost_message_str[VHOST_USER_MAX] = { 62 [VHOST_USER_NONE] = "VHOST_USER_NONE", 63 [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", 64 [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", 65 [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", 66 [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", 67 [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", 68 [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", 69 [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", 70 [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", 71 [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", 72 [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", 73 [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", 74 [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", 75 [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", 76 [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", 77 [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", 78 [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", 79 [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", 80 [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", 81 [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", 82 [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", 83 [VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD", 84 [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG", 85 [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS", 86 [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS", 87 [VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE", 88 [VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN", 89 [VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END", 90 [VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD", 91 [VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD", 92 [VHOST_USER_SET_STATUS] = "VHOST_USER_SET_STATUS", 93 [VHOST_USER_GET_STATUS] = "VHOST_USER_GET_STATUS", 94 }; 95 96 static int send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx); 97 static int read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx); 98 99 static void 100 close_msg_fds(struct vhu_msg_context *ctx) 101 { 102 int i; 103 104 for (i = 0; i < ctx->fd_num; i++) { 105 int fd = ctx->fds[i]; 106 107 if (fd == -1) 108 continue; 109 110 ctx->fds[i] = -1; 111 close(fd); 112 } 113 } 114 115 /* 116 * Ensure the expected number of FDs is received, 117 * close all FDs and return an error if this is not the case. 118 */ 119 static int 120 validate_msg_fds(struct virtio_net *dev, struct vhu_msg_context *ctx, int expected_fds) 121 { 122 if (ctx->fd_num == expected_fds) 123 return 0; 124 125 VHOST_LOG_CONFIG(ERR, "(%s) expect %d FDs for request %s, received %d\n", 126 dev->ifname, expected_fds, 127 vhost_message_str[ctx->msg.request.master], 128 ctx->fd_num); 129 130 close_msg_fds(ctx); 131 132 return -1; 133 } 134 135 static uint64_t 136 get_blk_size(int fd) 137 { 138 struct stat stat; 139 int ret; 140 141 ret = fstat(fd, &stat); 142 return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; 143 } 144 145 static int 146 async_dma_map(struct virtio_net *dev, struct rte_vhost_mem_region *region, bool do_map) 147 { 148 uint64_t host_iova; 149 int ret = 0; 150 151 host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr); 152 if (do_map) { 153 /* Add mapped region into the default container of DPDK. */ 154 ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD, 155 region->host_user_addr, 156 host_iova, 157 region->size); 158 if (ret) { 159 /* 160 * DMA device may bind with kernel driver, in this case, 161 * we don't need to program IOMMU manually. However, if no 162 * device is bound with vfio/uio in DPDK, and vfio kernel 163 * module is loaded, the API will still be called and return 164 * with ENODEV/ENOSUP. 165 * 166 * DPDK vfio only returns ENODEV/ENOSUP in very similar 167 * situations(vfio either unsupported, or supported 168 * but no devices found). Either way, no mappings could be 169 * performed. We treat it as normal case in async path. 170 */ 171 if (rte_errno == ENODEV || rte_errno == ENOTSUP) 172 return 0; 173 174 VHOST_LOG_CONFIG(ERR, "(%s) DMA engine map failed\n", dev->ifname); 175 /* DMA mapping errors won't stop VHST_USER_SET_MEM_TABLE. */ 176 return 0; 177 } 178 179 } else { 180 /* Remove mapped region from the default container of DPDK. */ 181 ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD, 182 region->host_user_addr, 183 host_iova, 184 region->size); 185 if (ret) { 186 /* like DMA map, ignore the kernel driver case when unmap. */ 187 if (rte_errno == EINVAL) 188 return 0; 189 190 VHOST_LOG_CONFIG(ERR, "(%s) DMA engine unmap failed\n", dev->ifname); 191 return ret; 192 } 193 } 194 195 return ret; 196 } 197 198 static void 199 free_mem_region(struct virtio_net *dev) 200 { 201 uint32_t i; 202 struct rte_vhost_mem_region *reg; 203 204 if (!dev || !dev->mem) 205 return; 206 207 for (i = 0; i < dev->mem->nregions; i++) { 208 reg = &dev->mem->regions[i]; 209 if (reg->host_user_addr) { 210 if (dev->async_copy && rte_vfio_is_enabled("vfio")) 211 async_dma_map(dev, reg, false); 212 213 munmap(reg->mmap_addr, reg->mmap_size); 214 close(reg->fd); 215 } 216 } 217 } 218 219 void 220 vhost_backend_cleanup(struct virtio_net *dev) 221 { 222 if (dev->mem) { 223 free_mem_region(dev); 224 rte_free(dev->mem); 225 dev->mem = NULL; 226 } 227 228 rte_free(dev->guest_pages); 229 dev->guest_pages = NULL; 230 231 if (dev->log_addr) { 232 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); 233 dev->log_addr = 0; 234 } 235 236 if (dev->inflight_info) { 237 if (dev->inflight_info->addr) { 238 munmap(dev->inflight_info->addr, 239 dev->inflight_info->size); 240 dev->inflight_info->addr = NULL; 241 } 242 243 if (dev->inflight_info->fd >= 0) { 244 close(dev->inflight_info->fd); 245 dev->inflight_info->fd = -1; 246 } 247 248 rte_free(dev->inflight_info); 249 dev->inflight_info = NULL; 250 } 251 252 if (dev->slave_req_fd >= 0) { 253 close(dev->slave_req_fd); 254 dev->slave_req_fd = -1; 255 } 256 257 if (dev->postcopy_ufd >= 0) { 258 close(dev->postcopy_ufd); 259 dev->postcopy_ufd = -1; 260 } 261 262 dev->postcopy_listening = 0; 263 } 264 265 static void 266 vhost_user_notify_queue_state(struct virtio_net *dev, uint16_t index, 267 int enable) 268 { 269 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 270 struct vhost_virtqueue *vq = dev->virtqueue[index]; 271 272 /* Configure guest notifications on enable */ 273 if (enable && vq->notif_enable != VIRTIO_UNINITIALIZED_NOTIF) 274 vhost_enable_guest_notification(dev, vq, vq->notif_enable); 275 276 if (vdpa_dev && vdpa_dev->ops->set_vring_state) 277 vdpa_dev->ops->set_vring_state(dev->vid, index, enable); 278 279 if (dev->notify_ops->vring_state_changed) 280 dev->notify_ops->vring_state_changed(dev->vid, 281 index, enable); 282 } 283 284 /* 285 * This function just returns success at the moment unless 286 * the device hasn't been initialised. 287 */ 288 static int 289 vhost_user_set_owner(struct virtio_net **pdev, 290 struct vhu_msg_context *ctx, 291 int main_fd __rte_unused) 292 { 293 struct virtio_net *dev = *pdev; 294 295 if (validate_msg_fds(dev, ctx, 0) != 0) 296 return RTE_VHOST_MSG_RESULT_ERR; 297 298 return RTE_VHOST_MSG_RESULT_OK; 299 } 300 301 static int 302 vhost_user_reset_owner(struct virtio_net **pdev, 303 struct vhu_msg_context *ctx, 304 int main_fd __rte_unused) 305 { 306 struct virtio_net *dev = *pdev; 307 308 if (validate_msg_fds(dev, ctx, 0) != 0) 309 return RTE_VHOST_MSG_RESULT_ERR; 310 311 vhost_destroy_device_notify(dev); 312 313 cleanup_device(dev, 0); 314 reset_device(dev); 315 return RTE_VHOST_MSG_RESULT_OK; 316 } 317 318 /* 319 * The features that we support are requested. 320 */ 321 static int 322 vhost_user_get_features(struct virtio_net **pdev, 323 struct vhu_msg_context *ctx, 324 int main_fd __rte_unused) 325 { 326 struct virtio_net *dev = *pdev; 327 uint64_t features = 0; 328 329 if (validate_msg_fds(dev, ctx, 0) != 0) 330 return RTE_VHOST_MSG_RESULT_ERR; 331 332 rte_vhost_driver_get_features(dev->ifname, &features); 333 334 ctx->msg.payload.u64 = features; 335 ctx->msg.size = sizeof(ctx->msg.payload.u64); 336 ctx->fd_num = 0; 337 338 return RTE_VHOST_MSG_RESULT_REPLY; 339 } 340 341 /* 342 * The queue number that we support are requested. 343 */ 344 static int 345 vhost_user_get_queue_num(struct virtio_net **pdev, 346 struct vhu_msg_context *ctx, 347 int main_fd __rte_unused) 348 { 349 struct virtio_net *dev = *pdev; 350 uint32_t queue_num = 0; 351 352 if (validate_msg_fds(dev, ctx, 0) != 0) 353 return RTE_VHOST_MSG_RESULT_ERR; 354 355 rte_vhost_driver_get_queue_num(dev->ifname, &queue_num); 356 357 ctx->msg.payload.u64 = (uint64_t)queue_num; 358 ctx->msg.size = sizeof(ctx->msg.payload.u64); 359 ctx->fd_num = 0; 360 361 return RTE_VHOST_MSG_RESULT_REPLY; 362 } 363 364 /* 365 * We receive the negotiated features supported by us and the virtio device. 366 */ 367 static int 368 vhost_user_set_features(struct virtio_net **pdev, 369 struct vhu_msg_context *ctx, 370 int main_fd __rte_unused) 371 { 372 struct virtio_net *dev = *pdev; 373 uint64_t features = ctx->msg.payload.u64; 374 uint64_t vhost_features = 0; 375 struct rte_vdpa_device *vdpa_dev; 376 377 if (validate_msg_fds(dev, ctx, 0) != 0) 378 return RTE_VHOST_MSG_RESULT_ERR; 379 380 rte_vhost_driver_get_features(dev->ifname, &vhost_features); 381 if (features & ~vhost_features) { 382 VHOST_LOG_CONFIG(ERR, "(%s) received invalid negotiated features.\n", 383 dev->ifname); 384 dev->flags |= VIRTIO_DEV_FEATURES_FAILED; 385 dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK; 386 387 return RTE_VHOST_MSG_RESULT_ERR; 388 } 389 390 if (dev->flags & VIRTIO_DEV_RUNNING) { 391 if (dev->features == features) 392 return RTE_VHOST_MSG_RESULT_OK; 393 394 /* 395 * Error out if master tries to change features while device is 396 * in running state. The exception being VHOST_F_LOG_ALL, which 397 * is enabled when the live-migration starts. 398 */ 399 if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) { 400 VHOST_LOG_CONFIG(ERR, "(%s) features changed while device is running.\n", 401 dev->ifname); 402 return RTE_VHOST_MSG_RESULT_ERR; 403 } 404 405 if (dev->notify_ops->features_changed) 406 dev->notify_ops->features_changed(dev->vid, features); 407 } 408 409 dev->features = features; 410 if (dev->features & 411 ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | 412 (1ULL << VIRTIO_F_VERSION_1) | 413 (1ULL << VIRTIO_F_RING_PACKED))) { 414 dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); 415 } else { 416 dev->vhost_hlen = sizeof(struct virtio_net_hdr); 417 } 418 VHOST_LOG_CONFIG(INFO, "(%s) negotiated Virtio features: 0x%" PRIx64 "\n", 419 dev->ifname, dev->features); 420 VHOST_LOG_CONFIG(DEBUG, "(%s) mergeable RX buffers %s, virtio 1 %s\n", 421 dev->ifname, 422 (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", 423 (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); 424 425 if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) && 426 !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) { 427 /* 428 * Remove all but first queue pair if MQ hasn't been 429 * negotiated. This is safe because the device is not 430 * running at this stage. 431 */ 432 while (dev->nr_vring > 2) { 433 struct vhost_virtqueue *vq; 434 435 vq = dev->virtqueue[--dev->nr_vring]; 436 if (!vq) 437 continue; 438 439 dev->virtqueue[dev->nr_vring] = NULL; 440 cleanup_vq(vq, 1); 441 cleanup_vq_inflight(dev, vq); 442 free_vq(dev, vq); 443 } 444 } 445 446 vdpa_dev = dev->vdpa_dev; 447 if (vdpa_dev) 448 vdpa_dev->ops->set_features(dev->vid); 449 450 dev->flags &= ~VIRTIO_DEV_FEATURES_FAILED; 451 return RTE_VHOST_MSG_RESULT_OK; 452 } 453 454 /* 455 * The virtio device sends us the size of the descriptor ring. 456 */ 457 static int 458 vhost_user_set_vring_num(struct virtio_net **pdev, 459 struct vhu_msg_context *ctx, 460 int main_fd __rte_unused) 461 { 462 struct virtio_net *dev = *pdev; 463 struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index]; 464 465 if (validate_msg_fds(dev, ctx, 0) != 0) 466 return RTE_VHOST_MSG_RESULT_ERR; 467 468 if (ctx->msg.payload.state.num > 32768) { 469 VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n", 470 dev->ifname, ctx->msg.payload.state.num); 471 return RTE_VHOST_MSG_RESULT_ERR; 472 } 473 474 vq->size = ctx->msg.payload.state.num; 475 476 /* VIRTIO 1.0, 2.4 Virtqueues says: 477 * 478 * Queue Size value is always a power of 2. The maximum Queue Size 479 * value is 32768. 480 * 481 * VIRTIO 1.1 2.7 Virtqueues says: 482 * 483 * Packed virtqueues support up to 2^15 entries each. 484 */ 485 if (!vq_is_packed(dev)) { 486 if (vq->size & (vq->size - 1)) { 487 VHOST_LOG_CONFIG(ERR, "(%s) invalid virtqueue size %u\n", 488 dev->ifname, vq->size); 489 return RTE_VHOST_MSG_RESULT_ERR; 490 } 491 } 492 493 if (vq_is_packed(dev)) { 494 if (vq->shadow_used_packed) 495 rte_free(vq->shadow_used_packed); 496 vq->shadow_used_packed = rte_malloc_socket(NULL, 497 vq->size * 498 sizeof(struct vring_used_elem_packed), 499 RTE_CACHE_LINE_SIZE, vq->numa_node); 500 if (!vq->shadow_used_packed) { 501 VHOST_LOG_CONFIG(ERR, 502 "(%s) failed to allocate memory for shadow used ring.\n", 503 dev->ifname); 504 return RTE_VHOST_MSG_RESULT_ERR; 505 } 506 507 } else { 508 if (vq->shadow_used_split) 509 rte_free(vq->shadow_used_split); 510 511 vq->shadow_used_split = rte_malloc_socket(NULL, 512 vq->size * sizeof(struct vring_used_elem), 513 RTE_CACHE_LINE_SIZE, vq->numa_node); 514 515 if (!vq->shadow_used_split) { 516 VHOST_LOG_CONFIG(ERR, 517 "(%s) failed to allocate memory for vq internal data.\n", 518 dev->ifname); 519 return RTE_VHOST_MSG_RESULT_ERR; 520 } 521 } 522 523 if (vq->batch_copy_elems) 524 rte_free(vq->batch_copy_elems); 525 vq->batch_copy_elems = rte_malloc_socket(NULL, 526 vq->size * sizeof(struct batch_copy_elem), 527 RTE_CACHE_LINE_SIZE, vq->numa_node); 528 if (!vq->batch_copy_elems) { 529 VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate memory for batching copy.\n", 530 dev->ifname); 531 return RTE_VHOST_MSG_RESULT_ERR; 532 } 533 534 return RTE_VHOST_MSG_RESULT_OK; 535 } 536 537 /* 538 * Reallocate virtio_dev, vhost_virtqueue and related data structures to 539 * make them on the same numa node as the memory of vring descriptor. 540 */ 541 #ifdef RTE_LIBRTE_VHOST_NUMA 542 static struct virtio_net* 543 numa_realloc(struct virtio_net *dev, int index) 544 { 545 int node, dev_node; 546 struct virtio_net *old_dev; 547 struct vhost_virtqueue *vq; 548 struct batch_copy_elem *bce; 549 struct guest_page *gp; 550 struct rte_vhost_memory *mem; 551 size_t mem_size; 552 int ret; 553 554 old_dev = dev; 555 vq = dev->virtqueue[index]; 556 557 /* 558 * If VQ is ready, it is too late to reallocate, it certainly already 559 * happened anyway on VHOST_USER_SET_VRING_ADRR. 560 */ 561 if (vq->ready) 562 return dev; 563 564 ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR); 565 if (ret) { 566 VHOST_LOG_CONFIG(ERR, "(%s) unable to get virtqueue %d numa information.\n", 567 dev->ifname, index); 568 return dev; 569 } 570 571 if (node == vq->numa_node) 572 goto out_dev_realloc; 573 574 vq = rte_realloc_socket(vq, sizeof(*vq), 0, node); 575 if (!vq) { 576 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc virtqueue %d on node %d\n", 577 dev->ifname, index, node); 578 return dev; 579 } 580 581 if (vq != dev->virtqueue[index]) { 582 VHOST_LOG_CONFIG(INFO, "(%s) reallocated virtqueue on node %d\n", 583 dev->ifname, node); 584 dev->virtqueue[index] = vq; 585 vhost_user_iotlb_init(dev, index); 586 } 587 588 if (vq_is_packed(dev)) { 589 struct vring_used_elem_packed *sup; 590 591 sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * sizeof(*sup), 592 RTE_CACHE_LINE_SIZE, node); 593 if (!sup) { 594 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow packed on node %d\n", 595 dev->ifname, node); 596 return dev; 597 } 598 vq->shadow_used_packed = sup; 599 } else { 600 struct vring_used_elem *sus; 601 602 sus = rte_realloc_socket(vq->shadow_used_split, vq->size * sizeof(*sus), 603 RTE_CACHE_LINE_SIZE, node); 604 if (!sus) { 605 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc shadow split on node %d\n", 606 dev->ifname, node); 607 return dev; 608 } 609 vq->shadow_used_split = sus; 610 } 611 612 bce = rte_realloc_socket(vq->batch_copy_elems, vq->size * sizeof(*bce), 613 RTE_CACHE_LINE_SIZE, node); 614 if (!bce) { 615 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc batch copy elem on node %d\n", 616 dev->ifname, node); 617 return dev; 618 } 619 vq->batch_copy_elems = bce; 620 621 if (vq->log_cache) { 622 struct log_cache_entry *lc; 623 624 lc = rte_realloc_socket(vq->log_cache, sizeof(*lc) * VHOST_LOG_CACHE_NR, 0, node); 625 if (!lc) { 626 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc log cache on node %d\n", 627 dev->ifname, node); 628 return dev; 629 } 630 vq->log_cache = lc; 631 } 632 633 if (vq->resubmit_inflight) { 634 struct rte_vhost_resubmit_info *ri; 635 636 ri = rte_realloc_socket(vq->resubmit_inflight, sizeof(*ri), 0, node); 637 if (!ri) { 638 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit inflight on node %d\n", 639 dev->ifname, node); 640 return dev; 641 } 642 vq->resubmit_inflight = ri; 643 644 if (ri->resubmit_list) { 645 struct rte_vhost_resubmit_desc *rd; 646 647 rd = rte_realloc_socket(ri->resubmit_list, sizeof(*rd) * ri->resubmit_num, 648 0, node); 649 if (!rd) { 650 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc resubmit list on node %d\n", 651 dev->ifname, node); 652 return dev; 653 } 654 ri->resubmit_list = rd; 655 } 656 } 657 658 vq->numa_node = node; 659 660 out_dev_realloc: 661 662 if (dev->flags & VIRTIO_DEV_RUNNING) 663 return dev; 664 665 ret = get_mempolicy(&dev_node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR); 666 if (ret) { 667 VHOST_LOG_CONFIG(ERR, "(%s) unable to get numa information.\n", dev->ifname); 668 return dev; 669 } 670 671 if (dev_node == node) 672 return dev; 673 674 dev = rte_realloc_socket(old_dev, sizeof(*dev), 0, node); 675 if (!dev) { 676 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc dev on node %d\n", 677 dev->ifname, node); 678 return old_dev; 679 } 680 681 VHOST_LOG_CONFIG(INFO, "(%s) reallocated device on node %d\n", dev->ifname, node); 682 vhost_devices[dev->vid] = dev; 683 684 mem_size = sizeof(struct rte_vhost_memory) + 685 sizeof(struct rte_vhost_mem_region) * dev->mem->nregions; 686 mem = rte_realloc_socket(dev->mem, mem_size, 0, node); 687 if (!mem) { 688 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc mem table on node %d\n", 689 dev->ifname, node); 690 return dev; 691 } 692 dev->mem = mem; 693 694 gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp), 695 RTE_CACHE_LINE_SIZE, node); 696 if (!gp) { 697 VHOST_LOG_CONFIG(ERR, "(%s) failed to realloc guest pages on node %d\n", 698 dev->ifname, node); 699 return dev; 700 } 701 dev->guest_pages = gp; 702 703 return dev; 704 } 705 #else 706 static struct virtio_net* 707 numa_realloc(struct virtio_net *dev, int index __rte_unused) 708 { 709 return dev; 710 } 711 #endif 712 713 /* Converts QEMU virtual address to Vhost virtual address. */ 714 static uint64_t 715 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) 716 { 717 struct rte_vhost_mem_region *r; 718 uint32_t i; 719 720 if (unlikely(!dev || !dev->mem)) 721 goto out_error; 722 723 /* Find the region where the address lives. */ 724 for (i = 0; i < dev->mem->nregions; i++) { 725 r = &dev->mem->regions[i]; 726 727 if (qva >= r->guest_user_addr && 728 qva < r->guest_user_addr + r->size) { 729 730 if (unlikely(*len > r->guest_user_addr + r->size - qva)) 731 *len = r->guest_user_addr + r->size - qva; 732 733 return qva - r->guest_user_addr + 734 r->host_user_addr; 735 } 736 } 737 out_error: 738 *len = 0; 739 740 return 0; 741 } 742 743 744 /* 745 * Converts ring address to Vhost virtual address. 746 * If IOMMU is enabled, the ring address is a guest IO virtual address, 747 * else it is a QEMU virtual address. 748 */ 749 static uint64_t 750 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, 751 uint64_t ra, uint64_t *size) 752 { 753 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { 754 uint64_t vva; 755 756 vhost_user_iotlb_rd_lock(vq); 757 vva = vhost_iova_to_vva(dev, vq, ra, 758 size, VHOST_ACCESS_RW); 759 vhost_user_iotlb_rd_unlock(vq); 760 761 return vva; 762 } 763 764 return qva_to_vva(dev, ra, size); 765 } 766 767 static uint64_t 768 log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) 769 { 770 uint64_t log_gpa; 771 772 vhost_user_iotlb_rd_lock(vq); 773 log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr); 774 vhost_user_iotlb_rd_unlock(vq); 775 776 return log_gpa; 777 } 778 779 static struct virtio_net * 780 translate_ring_addresses(struct virtio_net *dev, int vq_index) 781 { 782 struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; 783 struct vhost_vring_addr *addr = &vq->ring_addrs; 784 uint64_t len, expected_len; 785 786 if (addr->flags & (1 << VHOST_VRING_F_LOG)) { 787 vq->log_guest_addr = 788 log_addr_to_gpa(dev, vq); 789 if (vq->log_guest_addr == 0) { 790 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map log_guest_addr.\n", 791 dev->ifname); 792 return dev; 793 } 794 } 795 796 if (vq_is_packed(dev)) { 797 len = sizeof(struct vring_packed_desc) * vq->size; 798 vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) 799 ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len); 800 if (vq->desc_packed == NULL || 801 len != sizeof(struct vring_packed_desc) * 802 vq->size) { 803 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc_packed ring.\n", 804 dev->ifname); 805 return dev; 806 } 807 808 dev = numa_realloc(dev, vq_index); 809 vq = dev->virtqueue[vq_index]; 810 addr = &vq->ring_addrs; 811 812 len = sizeof(struct vring_packed_desc_event); 813 vq->driver_event = (struct vring_packed_desc_event *) 814 (uintptr_t)ring_addr_to_vva(dev, 815 vq, addr->avail_user_addr, &len); 816 if (vq->driver_event == NULL || 817 len != sizeof(struct vring_packed_desc_event)) { 818 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find driver area address.\n", 819 dev->ifname); 820 return dev; 821 } 822 823 len = sizeof(struct vring_packed_desc_event); 824 vq->device_event = (struct vring_packed_desc_event *) 825 (uintptr_t)ring_addr_to_vva(dev, 826 vq, addr->used_user_addr, &len); 827 if (vq->device_event == NULL || 828 len != sizeof(struct vring_packed_desc_event)) { 829 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to find device area address.\n", 830 dev->ifname); 831 return dev; 832 } 833 834 vq->access_ok = true; 835 return dev; 836 } 837 838 /* The addresses are converted from QEMU virtual to Vhost virtual. */ 839 if (vq->desc && vq->avail && vq->used) 840 return dev; 841 842 len = sizeof(struct vring_desc) * vq->size; 843 vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev, 844 vq, addr->desc_user_addr, &len); 845 if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { 846 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map desc ring.\n", dev->ifname); 847 return dev; 848 } 849 850 dev = numa_realloc(dev, vq_index); 851 vq = dev->virtqueue[vq_index]; 852 addr = &vq->ring_addrs; 853 854 len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; 855 if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) 856 len += sizeof(uint16_t); 857 expected_len = len; 858 vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev, 859 vq, addr->avail_user_addr, &len); 860 if (vq->avail == 0 || len != expected_len) { 861 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map avail ring.\n", dev->ifname); 862 return dev; 863 } 864 865 len = sizeof(struct vring_used) + 866 sizeof(struct vring_used_elem) * vq->size; 867 if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) 868 len += sizeof(uint16_t); 869 expected_len = len; 870 vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev, 871 vq, addr->used_user_addr, &len); 872 if (vq->used == 0 || len != expected_len) { 873 VHOST_LOG_CONFIG(DEBUG, "(%s) failed to map used ring.\n", dev->ifname); 874 return dev; 875 } 876 877 if (vq->last_used_idx != vq->used->idx) { 878 VHOST_LOG_CONFIG(WARNING, "(%s) last_used_idx (%u) and vq->used->idx (%u) mismatches;\n", 879 dev->ifname, 880 vq->last_used_idx, vq->used->idx); 881 vq->last_used_idx = vq->used->idx; 882 vq->last_avail_idx = vq->used->idx; 883 VHOST_LOG_CONFIG(WARNING, "(%s) some packets maybe resent for Tx and dropped for Rx\n", 884 dev->ifname); 885 } 886 887 vq->access_ok = true; 888 889 VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address desc: %p\n", dev->ifname, vq->desc); 890 VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address avail: %p\n", dev->ifname, vq->avail); 891 VHOST_LOG_CONFIG(DEBUG, "(%s) mapped address used: %p\n", dev->ifname, vq->used); 892 VHOST_LOG_CONFIG(DEBUG, "(%s) log_guest_addr: %" PRIx64 "\n", 893 dev->ifname, vq->log_guest_addr); 894 895 return dev; 896 } 897 898 /* 899 * The virtio device sends us the desc, used and avail ring addresses. 900 * This function then converts these to our address space. 901 */ 902 static int 903 vhost_user_set_vring_addr(struct virtio_net **pdev, 904 struct vhu_msg_context *ctx, 905 int main_fd __rte_unused) 906 { 907 struct virtio_net *dev = *pdev; 908 struct vhost_virtqueue *vq; 909 struct vhost_vring_addr *addr = &ctx->msg.payload.addr; 910 bool access_ok; 911 912 if (validate_msg_fds(dev, ctx, 0) != 0) 913 return RTE_VHOST_MSG_RESULT_ERR; 914 915 if (dev->mem == NULL) 916 return RTE_VHOST_MSG_RESULT_ERR; 917 918 /* addr->index refers to the queue index. The txq 1, rxq is 0. */ 919 vq = dev->virtqueue[ctx->msg.payload.addr.index]; 920 921 access_ok = vq->access_ok; 922 923 /* 924 * Rings addresses should not be interpreted as long as the ring is not 925 * started and enabled 926 */ 927 memcpy(&vq->ring_addrs, addr, sizeof(*addr)); 928 929 vring_invalidate(dev, vq); 930 931 if ((vq->enabled && (dev->features & 932 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) || 933 access_ok) { 934 dev = translate_ring_addresses(dev, ctx->msg.payload.addr.index); 935 if (!dev) 936 return RTE_VHOST_MSG_RESULT_ERR; 937 938 *pdev = dev; 939 } 940 941 return RTE_VHOST_MSG_RESULT_OK; 942 } 943 944 /* 945 * The virtio device sends us the available ring last used index. 946 */ 947 static int 948 vhost_user_set_vring_base(struct virtio_net **pdev, 949 struct vhu_msg_context *ctx, 950 int main_fd __rte_unused) 951 { 952 struct virtio_net *dev = *pdev; 953 struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index]; 954 uint64_t val = ctx->msg.payload.state.num; 955 956 if (validate_msg_fds(dev, ctx, 0) != 0) 957 return RTE_VHOST_MSG_RESULT_ERR; 958 959 if (vq_is_packed(dev)) { 960 /* 961 * Bit[0:14]: avail index 962 * Bit[15]: avail wrap counter 963 */ 964 vq->last_avail_idx = val & 0x7fff; 965 vq->avail_wrap_counter = !!(val & (0x1 << 15)); 966 /* 967 * Set used index to same value as available one, as 968 * their values should be the same since ring processing 969 * was stopped at get time. 970 */ 971 vq->last_used_idx = vq->last_avail_idx; 972 vq->used_wrap_counter = vq->avail_wrap_counter; 973 } else { 974 vq->last_used_idx = ctx->msg.payload.state.num; 975 vq->last_avail_idx = ctx->msg.payload.state.num; 976 } 977 978 VHOST_LOG_CONFIG(INFO, 979 "(%s) vring base idx:%u last_used_idx:%u last_avail_idx:%u.\n", 980 dev->ifname, ctx->msg.payload.state.index, vq->last_used_idx, 981 vq->last_avail_idx); 982 983 return RTE_VHOST_MSG_RESULT_OK; 984 } 985 986 static int 987 add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, 988 uint64_t host_phys_addr, uint64_t size) 989 { 990 struct guest_page *page, *last_page; 991 struct guest_page *old_pages; 992 993 if (dev->nr_guest_pages == dev->max_guest_pages) { 994 dev->max_guest_pages *= 2; 995 old_pages = dev->guest_pages; 996 dev->guest_pages = rte_realloc(dev->guest_pages, 997 dev->max_guest_pages * sizeof(*page), 998 RTE_CACHE_LINE_SIZE); 999 if (dev->guest_pages == NULL) { 1000 VHOST_LOG_CONFIG(ERR, "(%s) cannot realloc guest_pages\n", dev->ifname); 1001 rte_free(old_pages); 1002 return -1; 1003 } 1004 } 1005 1006 if (dev->nr_guest_pages > 0) { 1007 last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; 1008 /* merge if the two pages are continuous */ 1009 if (host_phys_addr == last_page->host_phys_addr + 1010 last_page->size) { 1011 last_page->size += size; 1012 return 0; 1013 } 1014 } 1015 1016 page = &dev->guest_pages[dev->nr_guest_pages++]; 1017 page->guest_phys_addr = guest_phys_addr; 1018 page->host_phys_addr = host_phys_addr; 1019 page->size = size; 1020 1021 return 0; 1022 } 1023 1024 static int 1025 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, 1026 uint64_t page_size) 1027 { 1028 uint64_t reg_size = reg->size; 1029 uint64_t host_user_addr = reg->host_user_addr; 1030 uint64_t guest_phys_addr = reg->guest_phys_addr; 1031 uint64_t host_phys_addr; 1032 uint64_t size; 1033 1034 host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr); 1035 size = page_size - (guest_phys_addr & (page_size - 1)); 1036 size = RTE_MIN(size, reg_size); 1037 1038 if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0) 1039 return -1; 1040 1041 host_user_addr += size; 1042 guest_phys_addr += size; 1043 reg_size -= size; 1044 1045 while (reg_size > 0) { 1046 size = RTE_MIN(reg_size, page_size); 1047 host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t) 1048 host_user_addr); 1049 if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, 1050 size) < 0) 1051 return -1; 1052 1053 host_user_addr += size; 1054 guest_phys_addr += size; 1055 reg_size -= size; 1056 } 1057 1058 /* sort guest page array if over binary search threshold */ 1059 if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) { 1060 qsort((void *)dev->guest_pages, dev->nr_guest_pages, 1061 sizeof(struct guest_page), guest_page_addrcmp); 1062 } 1063 1064 return 0; 1065 } 1066 1067 #ifdef RTE_LIBRTE_VHOST_DEBUG 1068 /* TODO: enable it only in debug mode? */ 1069 static void 1070 dump_guest_pages(struct virtio_net *dev) 1071 { 1072 uint32_t i; 1073 struct guest_page *page; 1074 1075 for (i = 0; i < dev->nr_guest_pages; i++) { 1076 page = &dev->guest_pages[i]; 1077 1078 VHOST_LOG_CONFIG(INFO, "(%s) guest physical page region %u\n", 1079 dev->ifname, i); 1080 VHOST_LOG_CONFIG(INFO, "(%s)\tguest_phys_addr: %" PRIx64 "\n", 1081 dev->ifname, page->guest_phys_addr); 1082 VHOST_LOG_CONFIG(INFO, "(%s)\thost_phys_addr : %" PRIx64 "\n", 1083 dev->ifname, page->host_phys_addr); 1084 VHOST_LOG_CONFIG(INFO, "(%s)\tsize : %" PRIx64 "\n", 1085 dev->ifname, page->size); 1086 } 1087 } 1088 #else 1089 #define dump_guest_pages(dev) 1090 #endif 1091 1092 static bool 1093 vhost_memory_changed(struct VhostUserMemory *new, 1094 struct rte_vhost_memory *old) 1095 { 1096 uint32_t i; 1097 1098 if (new->nregions != old->nregions) 1099 return true; 1100 1101 for (i = 0; i < new->nregions; ++i) { 1102 VhostUserMemoryRegion *new_r = &new->regions[i]; 1103 struct rte_vhost_mem_region *old_r = &old->regions[i]; 1104 1105 if (new_r->guest_phys_addr != old_r->guest_phys_addr) 1106 return true; 1107 if (new_r->memory_size != old_r->size) 1108 return true; 1109 if (new_r->userspace_addr != old_r->guest_user_addr) 1110 return true; 1111 } 1112 1113 return false; 1114 } 1115 1116 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 1117 static int 1118 vhost_user_postcopy_region_register(struct virtio_net *dev, 1119 struct rte_vhost_mem_region *reg) 1120 { 1121 struct uffdio_register reg_struct; 1122 1123 /* 1124 * Let's register all the mmapped area to ensure 1125 * alignment on page boundary. 1126 */ 1127 reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr; 1128 reg_struct.range.len = reg->mmap_size; 1129 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; 1130 1131 if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, 1132 ®_struct)) { 1133 VHOST_LOG_CONFIG(ERR, "(%s) failed to register ufd for region " 1134 "%" PRIx64 " - %" PRIx64 " (ufd = %d) %s\n", 1135 dev->ifname, 1136 (uint64_t)reg_struct.range.start, 1137 (uint64_t)reg_struct.range.start + 1138 (uint64_t)reg_struct.range.len - 1, 1139 dev->postcopy_ufd, 1140 strerror(errno)); 1141 return -1; 1142 } 1143 1144 VHOST_LOG_CONFIG(INFO, 1145 "(%s)\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64 "\n", 1146 dev->ifname, 1147 (uint64_t)reg_struct.range.start, 1148 (uint64_t)reg_struct.range.start + 1149 (uint64_t)reg_struct.range.len - 1); 1150 1151 return 0; 1152 } 1153 #else 1154 static int 1155 vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused, 1156 struct rte_vhost_mem_region *reg __rte_unused) 1157 { 1158 return -1; 1159 } 1160 #endif 1161 1162 static int 1163 vhost_user_postcopy_register(struct virtio_net *dev, int main_fd, 1164 struct vhu_msg_context *ctx) 1165 { 1166 struct VhostUserMemory *memory; 1167 struct rte_vhost_mem_region *reg; 1168 struct vhu_msg_context ack_ctx; 1169 uint32_t i; 1170 1171 if (!dev->postcopy_listening) 1172 return 0; 1173 1174 /* 1175 * We haven't a better way right now than sharing 1176 * DPDK's virtual address with Qemu, so that Qemu can 1177 * retrieve the region offset when handling userfaults. 1178 */ 1179 memory = &ctx->msg.payload.memory; 1180 for (i = 0; i < memory->nregions; i++) { 1181 reg = &dev->mem->regions[i]; 1182 memory->regions[i].userspace_addr = reg->host_user_addr; 1183 } 1184 1185 /* Send the addresses back to qemu */ 1186 ctx->fd_num = 0; 1187 send_vhost_reply(dev, main_fd, ctx); 1188 1189 /* Wait for qemu to acknowledge it got the addresses 1190 * we've got to wait before we're allowed to generate faults. 1191 */ 1192 if (read_vhost_message(dev, main_fd, &ack_ctx) <= 0) { 1193 VHOST_LOG_CONFIG(ERR, "(%s) failed to read qemu ack on postcopy set-mem-table\n", 1194 dev->ifname); 1195 return -1; 1196 } 1197 1198 if (validate_msg_fds(dev, &ack_ctx, 0) != 0) 1199 return -1; 1200 1201 if (ack_ctx.msg.request.master != VHOST_USER_SET_MEM_TABLE) { 1202 VHOST_LOG_CONFIG(ERR, "(%s) bad qemu ack on postcopy set-mem-table (%d)\n", 1203 dev->ifname, ack_ctx.msg.request.master); 1204 return -1; 1205 } 1206 1207 /* Now userfault register and we can use the memory */ 1208 for (i = 0; i < memory->nregions; i++) { 1209 reg = &dev->mem->regions[i]; 1210 if (vhost_user_postcopy_region_register(dev, reg) < 0) 1211 return -1; 1212 } 1213 1214 return 0; 1215 } 1216 1217 static int 1218 vhost_user_mmap_region(struct virtio_net *dev, 1219 struct rte_vhost_mem_region *region, 1220 uint64_t mmap_offset) 1221 { 1222 void *mmap_addr; 1223 uint64_t mmap_size; 1224 uint64_t alignment; 1225 int populate; 1226 int ret; 1227 1228 /* Check for memory_size + mmap_offset overflow */ 1229 if (mmap_offset >= -region->size) { 1230 VHOST_LOG_CONFIG(ERR, "(%s) mmap_offset (%#"PRIx64") and memory_size (%#"PRIx64") overflow\n", 1231 dev->ifname, mmap_offset, region->size); 1232 return -1; 1233 } 1234 1235 mmap_size = region->size + mmap_offset; 1236 1237 /* mmap() without flag of MAP_ANONYMOUS, should be called with length 1238 * argument aligned with hugepagesz at older longterm version Linux, 1239 * like 2.6.32 and 3.2.72, or mmap() will fail with EINVAL. 1240 * 1241 * To avoid failure, make sure in caller to keep length aligned. 1242 */ 1243 alignment = get_blk_size(region->fd); 1244 if (alignment == (uint64_t)-1) { 1245 VHOST_LOG_CONFIG(ERR, "(%s) couldn't get hugepage size through fstat\n", 1246 dev->ifname); 1247 return -1; 1248 } 1249 mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); 1250 if (mmap_size == 0) { 1251 /* 1252 * It could happen if initial mmap_size + alignment overflows 1253 * the sizeof uint64, which could happen if either mmap_size or 1254 * alignment value is wrong. 1255 * 1256 * mmap() kernel implementation would return an error, but 1257 * better catch it before and provide useful info in the logs. 1258 */ 1259 VHOST_LOG_CONFIG(ERR, "(%s) mmap size (0x%" PRIx64 ") or alignment (0x%" PRIx64 ") is invalid\n", 1260 dev->ifname, region->size + mmap_offset, alignment); 1261 return -1; 1262 } 1263 1264 populate = dev->async_copy ? MAP_POPULATE : 0; 1265 mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 1266 MAP_SHARED | populate, region->fd, 0); 1267 1268 if (mmap_addr == MAP_FAILED) { 1269 VHOST_LOG_CONFIG(ERR, "(%s) mmap failed (%s).\n", dev->ifname, strerror(errno)); 1270 return -1; 1271 } 1272 1273 region->mmap_addr = mmap_addr; 1274 region->mmap_size = mmap_size; 1275 region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; 1276 1277 if (dev->async_copy) { 1278 if (add_guest_pages(dev, region, alignment) < 0) { 1279 VHOST_LOG_CONFIG(ERR, "(%s) adding guest pages to region failed.\n", 1280 dev->ifname); 1281 return -1; 1282 } 1283 1284 if (rte_vfio_is_enabled("vfio")) { 1285 ret = async_dma_map(dev, region, true); 1286 if (ret) { 1287 VHOST_LOG_CONFIG(ERR, 1288 "(%s) configure IOMMU for DMA engine failed\n", 1289 dev->ifname); 1290 return -1; 1291 } 1292 } 1293 } 1294 1295 VHOST_LOG_CONFIG(INFO, "(%s) guest memory region size: 0x%" PRIx64 "\n", 1296 dev->ifname, region->size); 1297 VHOST_LOG_CONFIG(INFO, "(%s)\t guest physical addr: 0x%" PRIx64 "\n", 1298 dev->ifname, region->guest_phys_addr); 1299 VHOST_LOG_CONFIG(INFO, "(%s)\t guest virtual addr: 0x%" PRIx64 "\n", 1300 dev->ifname, region->guest_user_addr); 1301 VHOST_LOG_CONFIG(INFO, "(%s)\t host virtual addr: 0x%" PRIx64 "\n", 1302 dev->ifname, region->host_user_addr); 1303 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap addr : 0x%" PRIx64 "\n", 1304 dev->ifname, (uint64_t)(uintptr_t)mmap_addr); 1305 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap size : 0x%" PRIx64 "\n", 1306 dev->ifname, mmap_size); 1307 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap align: 0x%" PRIx64 "\n", 1308 dev->ifname, alignment); 1309 VHOST_LOG_CONFIG(INFO, "(%s)\t mmap off : 0x%" PRIx64 "\n", 1310 dev->ifname, mmap_offset); 1311 1312 return 0; 1313 } 1314 1315 static int 1316 vhost_user_set_mem_table(struct virtio_net **pdev, 1317 struct vhu_msg_context *ctx, 1318 int main_fd) 1319 { 1320 struct virtio_net *dev = *pdev; 1321 struct VhostUserMemory *memory = &ctx->msg.payload.memory; 1322 struct rte_vhost_mem_region *reg; 1323 int numa_node = SOCKET_ID_ANY; 1324 uint64_t mmap_offset; 1325 uint32_t i; 1326 bool async_notify = false; 1327 1328 if (validate_msg_fds(dev, ctx, memory->nregions) != 0) 1329 return RTE_VHOST_MSG_RESULT_ERR; 1330 1331 if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) { 1332 VHOST_LOG_CONFIG(ERR, "(%s) too many memory regions (%u)\n", 1333 dev->ifname, memory->nregions); 1334 goto close_msg_fds; 1335 } 1336 1337 if (dev->mem && !vhost_memory_changed(memory, dev->mem)) { 1338 VHOST_LOG_CONFIG(INFO, "(%s) memory regions not changed\n", dev->ifname); 1339 1340 close_msg_fds(ctx); 1341 1342 return RTE_VHOST_MSG_RESULT_OK; 1343 } 1344 1345 if (dev->mem) { 1346 if (dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) { 1347 struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev; 1348 1349 if (vdpa_dev && vdpa_dev->ops->dev_close) 1350 vdpa_dev->ops->dev_close(dev->vid); 1351 dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; 1352 } 1353 1354 /* notify the vhost application to stop DMA transfers */ 1355 if (dev->async_copy && dev->notify_ops->vring_state_changed) { 1356 for (i = 0; i < dev->nr_vring; i++) { 1357 dev->notify_ops->vring_state_changed(dev->vid, 1358 i, 0); 1359 } 1360 async_notify = true; 1361 } 1362 1363 free_mem_region(dev); 1364 rte_free(dev->mem); 1365 dev->mem = NULL; 1366 } 1367 1368 /* Flush IOTLB cache as previous HVAs are now invalid */ 1369 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) 1370 for (i = 0; i < dev->nr_vring; i++) 1371 vhost_user_iotlb_flush_all(dev->virtqueue[i]); 1372 1373 /* 1374 * If VQ 0 has already been allocated, try to allocate on the same 1375 * NUMA node. It can be reallocated later in numa_realloc(). 1376 */ 1377 if (dev->nr_vring > 0) 1378 numa_node = dev->virtqueue[0]->numa_node; 1379 1380 dev->nr_guest_pages = 0; 1381 if (dev->guest_pages == NULL) { 1382 dev->max_guest_pages = 8; 1383 dev->guest_pages = rte_zmalloc_socket(NULL, 1384 dev->max_guest_pages * 1385 sizeof(struct guest_page), 1386 RTE_CACHE_LINE_SIZE, 1387 numa_node); 1388 if (dev->guest_pages == NULL) { 1389 VHOST_LOG_CONFIG(ERR, 1390 "(%s) failed to allocate memory for dev->guest_pages\n", 1391 dev->ifname); 1392 goto close_msg_fds; 1393 } 1394 } 1395 1396 dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) + 1397 sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node); 1398 if (dev->mem == NULL) { 1399 VHOST_LOG_CONFIG(ERR, 1400 "(%s) failed to allocate memory for dev->mem\n", 1401 dev->ifname); 1402 goto free_guest_pages; 1403 } 1404 1405 for (i = 0; i < memory->nregions; i++) { 1406 reg = &dev->mem->regions[i]; 1407 1408 reg->guest_phys_addr = memory->regions[i].guest_phys_addr; 1409 reg->guest_user_addr = memory->regions[i].userspace_addr; 1410 reg->size = memory->regions[i].memory_size; 1411 reg->fd = ctx->fds[i]; 1412 1413 /* 1414 * Assign invalid file descriptor value to avoid double 1415 * closing on error path. 1416 */ 1417 ctx->fds[i] = -1; 1418 1419 mmap_offset = memory->regions[i].mmap_offset; 1420 1421 if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) { 1422 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap region %u\n", dev->ifname, i); 1423 goto free_mem_table; 1424 } 1425 1426 dev->mem->nregions++; 1427 } 1428 1429 if (vhost_user_postcopy_register(dev, main_fd, ctx) < 0) 1430 goto free_mem_table; 1431 1432 for (i = 0; i < dev->nr_vring; i++) { 1433 struct vhost_virtqueue *vq = dev->virtqueue[i]; 1434 1435 if (!vq) 1436 continue; 1437 1438 if (vq->desc || vq->avail || vq->used) { 1439 /* 1440 * If the memory table got updated, the ring addresses 1441 * need to be translated again as virtual addresses have 1442 * changed. 1443 */ 1444 vring_invalidate(dev, vq); 1445 1446 dev = translate_ring_addresses(dev, i); 1447 if (!dev) { 1448 dev = *pdev; 1449 goto free_mem_table; 1450 } 1451 1452 *pdev = dev; 1453 } 1454 } 1455 1456 dump_guest_pages(dev); 1457 1458 if (async_notify) { 1459 for (i = 0; i < dev->nr_vring; i++) 1460 dev->notify_ops->vring_state_changed(dev->vid, i, 1); 1461 } 1462 1463 return RTE_VHOST_MSG_RESULT_OK; 1464 1465 free_mem_table: 1466 free_mem_region(dev); 1467 rte_free(dev->mem); 1468 dev->mem = NULL; 1469 1470 free_guest_pages: 1471 rte_free(dev->guest_pages); 1472 dev->guest_pages = NULL; 1473 close_msg_fds: 1474 close_msg_fds(ctx); 1475 return RTE_VHOST_MSG_RESULT_ERR; 1476 } 1477 1478 static bool 1479 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) 1480 { 1481 bool rings_ok; 1482 1483 if (!vq) 1484 return false; 1485 1486 if (vq_is_packed(dev)) 1487 rings_ok = vq->desc_packed && vq->driver_event && 1488 vq->device_event; 1489 else 1490 rings_ok = vq->desc && vq->avail && vq->used; 1491 1492 return rings_ok && 1493 vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && 1494 vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD && 1495 vq->enabled; 1496 } 1497 1498 #define VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY 2u 1499 1500 static int 1501 virtio_is_ready(struct virtio_net *dev) 1502 { 1503 struct vhost_virtqueue *vq; 1504 uint32_t i, nr_vring = dev->nr_vring; 1505 1506 if (dev->flags & VIRTIO_DEV_READY) 1507 return 1; 1508 1509 if (!dev->nr_vring) 1510 return 0; 1511 1512 if (dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) { 1513 nr_vring = VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY; 1514 1515 if (dev->nr_vring < nr_vring) 1516 return 0; 1517 } 1518 1519 for (i = 0; i < nr_vring; i++) { 1520 vq = dev->virtqueue[i]; 1521 1522 if (!vq_is_ready(dev, vq)) 1523 return 0; 1524 } 1525 1526 /* If supported, ensure the frontend is really done with config */ 1527 if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_STATUS)) 1528 if (!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK)) 1529 return 0; 1530 1531 dev->flags |= VIRTIO_DEV_READY; 1532 1533 if (!(dev->flags & VIRTIO_DEV_RUNNING)) 1534 VHOST_LOG_CONFIG(INFO, "(%s) virtio is now ready for processing.\n", dev->ifname); 1535 return 1; 1536 } 1537 1538 static void * 1539 inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *fd) 1540 { 1541 void *ptr; 1542 int mfd = -1; 1543 char fname[20] = "/tmp/memfd-XXXXXX"; 1544 1545 *fd = -1; 1546 #ifdef MEMFD_SUPPORTED 1547 mfd = memfd_create(name, MFD_CLOEXEC); 1548 #else 1549 RTE_SET_USED(name); 1550 #endif 1551 if (mfd == -1) { 1552 mfd = mkstemp(fname); 1553 if (mfd == -1) { 1554 VHOST_LOG_CONFIG(ERR, "(%s) failed to get inflight buffer fd\n", 1555 dev->ifname); 1556 return NULL; 1557 } 1558 1559 unlink(fname); 1560 } 1561 1562 if (ftruncate(mfd, size) == -1) { 1563 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc inflight buffer\n", dev->ifname); 1564 close(mfd); 1565 return NULL; 1566 } 1567 1568 ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0); 1569 if (ptr == MAP_FAILED) { 1570 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap inflight buffer\n", dev->ifname); 1571 close(mfd); 1572 return NULL; 1573 } 1574 1575 *fd = mfd; 1576 return ptr; 1577 } 1578 1579 static uint32_t 1580 get_pervq_shm_size_split(uint16_t queue_size) 1581 { 1582 return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_split) * 1583 queue_size + sizeof(uint64_t) + 1584 sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT); 1585 } 1586 1587 static uint32_t 1588 get_pervq_shm_size_packed(uint16_t queue_size) 1589 { 1590 return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_packed) 1591 * queue_size + sizeof(uint64_t) + 1592 sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9, 1593 INFLIGHT_ALIGNMENT); 1594 } 1595 1596 static int 1597 vhost_user_get_inflight_fd(struct virtio_net **pdev, 1598 struct vhu_msg_context *ctx, 1599 int main_fd __rte_unused) 1600 { 1601 struct rte_vhost_inflight_info_packed *inflight_packed; 1602 uint64_t pervq_inflight_size, mmap_size; 1603 uint16_t num_queues, queue_size; 1604 struct virtio_net *dev = *pdev; 1605 int fd, i, j; 1606 int numa_node = SOCKET_ID_ANY; 1607 void *addr; 1608 1609 if (ctx->msg.size != sizeof(ctx->msg.payload.inflight)) { 1610 VHOST_LOG_CONFIG(ERR, "(%s) invalid get_inflight_fd message size is %d\n", 1611 dev->ifname, ctx->msg.size); 1612 return RTE_VHOST_MSG_RESULT_ERR; 1613 } 1614 1615 /* 1616 * If VQ 0 has already been allocated, try to allocate on the same 1617 * NUMA node. It can be reallocated later in numa_realloc(). 1618 */ 1619 if (dev->nr_vring > 0) 1620 numa_node = dev->virtqueue[0]->numa_node; 1621 1622 if (dev->inflight_info == NULL) { 1623 dev->inflight_info = rte_zmalloc_socket("inflight_info", 1624 sizeof(struct inflight_mem_info), 0, numa_node); 1625 if (!dev->inflight_info) { 1626 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n", 1627 dev->ifname); 1628 return RTE_VHOST_MSG_RESULT_ERR; 1629 } 1630 dev->inflight_info->fd = -1; 1631 } 1632 1633 num_queues = ctx->msg.payload.inflight.num_queues; 1634 queue_size = ctx->msg.payload.inflight.queue_size; 1635 1636 VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd num_queues: %u\n", 1637 dev->ifname, ctx->msg.payload.inflight.num_queues); 1638 VHOST_LOG_CONFIG(INFO, "(%s) get_inflight_fd queue_size: %u\n", 1639 dev->ifname, ctx->msg.payload.inflight.queue_size); 1640 1641 if (vq_is_packed(dev)) 1642 pervq_inflight_size = get_pervq_shm_size_packed(queue_size); 1643 else 1644 pervq_inflight_size = get_pervq_shm_size_split(queue_size); 1645 1646 mmap_size = num_queues * pervq_inflight_size; 1647 addr = inflight_mem_alloc(dev, "vhost-inflight", mmap_size, &fd); 1648 if (!addr) { 1649 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc vhost inflight area\n", dev->ifname); 1650 ctx->msg.payload.inflight.mmap_size = 0; 1651 return RTE_VHOST_MSG_RESULT_ERR; 1652 } 1653 memset(addr, 0, mmap_size); 1654 1655 if (dev->inflight_info->addr) { 1656 munmap(dev->inflight_info->addr, dev->inflight_info->size); 1657 dev->inflight_info->addr = NULL; 1658 } 1659 1660 if (dev->inflight_info->fd >= 0) { 1661 close(dev->inflight_info->fd); 1662 dev->inflight_info->fd = -1; 1663 } 1664 1665 dev->inflight_info->addr = addr; 1666 dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size; 1667 dev->inflight_info->fd = ctx->fds[0] = fd; 1668 ctx->msg.payload.inflight.mmap_offset = 0; 1669 ctx->fd_num = 1; 1670 1671 if (vq_is_packed(dev)) { 1672 for (i = 0; i < num_queues; i++) { 1673 inflight_packed = 1674 (struct rte_vhost_inflight_info_packed *)addr; 1675 inflight_packed->used_wrap_counter = 1; 1676 inflight_packed->old_used_wrap_counter = 1; 1677 for (j = 0; j < queue_size; j++) 1678 inflight_packed->desc[j].next = j + 1; 1679 addr = (void *)((char *)addr + pervq_inflight_size); 1680 } 1681 } 1682 1683 VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_size: %"PRIu64"\n", 1684 dev->ifname, ctx->msg.payload.inflight.mmap_size); 1685 VHOST_LOG_CONFIG(INFO, "(%s) send inflight mmap_offset: %"PRIu64"\n", 1686 dev->ifname, ctx->msg.payload.inflight.mmap_offset); 1687 VHOST_LOG_CONFIG(INFO, "(%s) send inflight fd: %d\n", dev->ifname, ctx->fds[0]); 1688 1689 return RTE_VHOST_MSG_RESULT_REPLY; 1690 } 1691 1692 static int 1693 vhost_user_set_inflight_fd(struct virtio_net **pdev, 1694 struct vhu_msg_context *ctx, 1695 int main_fd __rte_unused) 1696 { 1697 uint64_t mmap_size, mmap_offset; 1698 uint16_t num_queues, queue_size; 1699 struct virtio_net *dev = *pdev; 1700 uint32_t pervq_inflight_size; 1701 struct vhost_virtqueue *vq; 1702 void *addr; 1703 int fd, i; 1704 int numa_node = SOCKET_ID_ANY; 1705 1706 fd = ctx->fds[0]; 1707 if (ctx->msg.size != sizeof(ctx->msg.payload.inflight) || fd < 0) { 1708 VHOST_LOG_CONFIG(ERR, "(%s) invalid set_inflight_fd message size is %d,fd is %d\n", 1709 dev->ifname, ctx->msg.size, fd); 1710 return RTE_VHOST_MSG_RESULT_ERR; 1711 } 1712 1713 mmap_size = ctx->msg.payload.inflight.mmap_size; 1714 mmap_offset = ctx->msg.payload.inflight.mmap_offset; 1715 num_queues = ctx->msg.payload.inflight.num_queues; 1716 queue_size = ctx->msg.payload.inflight.queue_size; 1717 1718 if (vq_is_packed(dev)) 1719 pervq_inflight_size = get_pervq_shm_size_packed(queue_size); 1720 else 1721 pervq_inflight_size = get_pervq_shm_size_split(queue_size); 1722 1723 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_size: %"PRIu64"\n", 1724 dev->ifname, mmap_size); 1725 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd mmap_offset: %"PRIu64"\n", 1726 dev->ifname, mmap_offset); 1727 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd num_queues: %u\n", dev->ifname, num_queues); 1728 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd queue_size: %u\n", dev->ifname, queue_size); 1729 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd fd: %d\n", dev->ifname, fd); 1730 VHOST_LOG_CONFIG(INFO, "(%s) set_inflight_fd pervq_inflight_size: %d\n", 1731 dev->ifname, pervq_inflight_size); 1732 1733 /* 1734 * If VQ 0 has already been allocated, try to allocate on the same 1735 * NUMA node. It can be reallocated later in numa_realloc(). 1736 */ 1737 if (dev->nr_vring > 0) 1738 numa_node = dev->virtqueue[0]->numa_node; 1739 1740 if (!dev->inflight_info) { 1741 dev->inflight_info = rte_zmalloc_socket("inflight_info", 1742 sizeof(struct inflight_mem_info), 0, numa_node); 1743 if (dev->inflight_info == NULL) { 1744 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc dev inflight area\n", 1745 dev->ifname); 1746 return RTE_VHOST_MSG_RESULT_ERR; 1747 } 1748 dev->inflight_info->fd = -1; 1749 } 1750 1751 if (dev->inflight_info->addr) { 1752 munmap(dev->inflight_info->addr, dev->inflight_info->size); 1753 dev->inflight_info->addr = NULL; 1754 } 1755 1756 addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1757 fd, mmap_offset); 1758 if (addr == MAP_FAILED) { 1759 VHOST_LOG_CONFIG(ERR, "(%s) failed to mmap share memory.\n", dev->ifname); 1760 return RTE_VHOST_MSG_RESULT_ERR; 1761 } 1762 1763 if (dev->inflight_info->fd >= 0) { 1764 close(dev->inflight_info->fd); 1765 dev->inflight_info->fd = -1; 1766 } 1767 1768 dev->inflight_info->fd = fd; 1769 dev->inflight_info->addr = addr; 1770 dev->inflight_info->size = mmap_size; 1771 1772 for (i = 0; i < num_queues; i++) { 1773 vq = dev->virtqueue[i]; 1774 if (!vq) 1775 continue; 1776 1777 if (vq_is_packed(dev)) { 1778 vq->inflight_packed = addr; 1779 vq->inflight_packed->desc_num = queue_size; 1780 } else { 1781 vq->inflight_split = addr; 1782 vq->inflight_split->desc_num = queue_size; 1783 } 1784 addr = (void *)((char *)addr + pervq_inflight_size); 1785 } 1786 1787 return RTE_VHOST_MSG_RESULT_OK; 1788 } 1789 1790 static int 1791 vhost_user_set_vring_call(struct virtio_net **pdev, 1792 struct vhu_msg_context *ctx, 1793 int main_fd __rte_unused) 1794 { 1795 struct virtio_net *dev = *pdev; 1796 struct vhost_vring_file file; 1797 struct vhost_virtqueue *vq; 1798 int expected_fds; 1799 1800 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1; 1801 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 1802 return RTE_VHOST_MSG_RESULT_ERR; 1803 1804 file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 1805 if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) 1806 file.fd = VIRTIO_INVALID_EVENTFD; 1807 else 1808 file.fd = ctx->fds[0]; 1809 VHOST_LOG_CONFIG(INFO, "(%s) vring call idx:%d file:%d\n", 1810 dev->ifname, file.index, file.fd); 1811 1812 vq = dev->virtqueue[file.index]; 1813 1814 if (vq->ready) { 1815 vq->ready = false; 1816 vhost_user_notify_queue_state(dev, file.index, 0); 1817 } 1818 1819 if (vq->callfd >= 0) 1820 close(vq->callfd); 1821 1822 vq->callfd = file.fd; 1823 1824 return RTE_VHOST_MSG_RESULT_OK; 1825 } 1826 1827 static int vhost_user_set_vring_err(struct virtio_net **pdev, 1828 struct vhu_msg_context *ctx, 1829 int main_fd __rte_unused) 1830 { 1831 struct virtio_net *dev = *pdev; 1832 int expected_fds; 1833 1834 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1; 1835 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 1836 return RTE_VHOST_MSG_RESULT_ERR; 1837 1838 if (!(ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) 1839 close(ctx->fds[0]); 1840 VHOST_LOG_CONFIG(INFO, "(%s) not implemented\n", dev->ifname); 1841 1842 return RTE_VHOST_MSG_RESULT_OK; 1843 } 1844 1845 static int 1846 resubmit_desc_compare(const void *a, const void *b) 1847 { 1848 const struct rte_vhost_resubmit_desc *desc0 = a; 1849 const struct rte_vhost_resubmit_desc *desc1 = b; 1850 1851 if (desc1->counter > desc0->counter) 1852 return 1; 1853 1854 return -1; 1855 } 1856 1857 static int 1858 vhost_check_queue_inflights_split(struct virtio_net *dev, 1859 struct vhost_virtqueue *vq) 1860 { 1861 uint16_t i; 1862 uint16_t resubmit_num = 0, last_io, num; 1863 struct vring_used *used = vq->used; 1864 struct rte_vhost_resubmit_info *resubmit; 1865 struct rte_vhost_inflight_info_split *inflight_split; 1866 1867 if (!(dev->protocol_features & 1868 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) 1869 return RTE_VHOST_MSG_RESULT_OK; 1870 1871 /* The frontend may still not support the inflight feature 1872 * although we negotiate the protocol feature. 1873 */ 1874 if ((!vq->inflight_split)) 1875 return RTE_VHOST_MSG_RESULT_OK; 1876 1877 if (!vq->inflight_split->version) { 1878 vq->inflight_split->version = INFLIGHT_VERSION; 1879 return RTE_VHOST_MSG_RESULT_OK; 1880 } 1881 1882 if (vq->resubmit_inflight) 1883 return RTE_VHOST_MSG_RESULT_OK; 1884 1885 inflight_split = vq->inflight_split; 1886 vq->global_counter = 0; 1887 last_io = inflight_split->last_inflight_io; 1888 1889 if (inflight_split->used_idx != used->idx) { 1890 inflight_split->desc[last_io].inflight = 0; 1891 rte_atomic_thread_fence(__ATOMIC_SEQ_CST); 1892 inflight_split->used_idx = used->idx; 1893 } 1894 1895 for (i = 0; i < inflight_split->desc_num; i++) { 1896 if (inflight_split->desc[i].inflight == 1) 1897 resubmit_num++; 1898 } 1899 1900 vq->last_avail_idx += resubmit_num; 1901 1902 if (resubmit_num) { 1903 resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info), 1904 0, vq->numa_node); 1905 if (!resubmit) { 1906 VHOST_LOG_CONFIG(ERR, 1907 "(%s) failed to allocate memory for resubmit info.\n", 1908 dev->ifname); 1909 return RTE_VHOST_MSG_RESULT_ERR; 1910 } 1911 1912 resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", 1913 resubmit_num * sizeof(struct rte_vhost_resubmit_desc), 1914 0, vq->numa_node); 1915 if (!resubmit->resubmit_list) { 1916 VHOST_LOG_CONFIG(ERR, 1917 "(%s) failed to allocate memory for inflight desc.\n", 1918 dev->ifname); 1919 rte_free(resubmit); 1920 return RTE_VHOST_MSG_RESULT_ERR; 1921 } 1922 1923 num = 0; 1924 for (i = 0; i < vq->inflight_split->desc_num; i++) { 1925 if (vq->inflight_split->desc[i].inflight == 1) { 1926 resubmit->resubmit_list[num].index = i; 1927 resubmit->resubmit_list[num].counter = 1928 inflight_split->desc[i].counter; 1929 num++; 1930 } 1931 } 1932 resubmit->resubmit_num = num; 1933 1934 if (resubmit->resubmit_num > 1) 1935 qsort(resubmit->resubmit_list, resubmit->resubmit_num, 1936 sizeof(struct rte_vhost_resubmit_desc), 1937 resubmit_desc_compare); 1938 1939 vq->global_counter = resubmit->resubmit_list[0].counter + 1; 1940 vq->resubmit_inflight = resubmit; 1941 } 1942 1943 return RTE_VHOST_MSG_RESULT_OK; 1944 } 1945 1946 static int 1947 vhost_check_queue_inflights_packed(struct virtio_net *dev, 1948 struct vhost_virtqueue *vq) 1949 { 1950 uint16_t i; 1951 uint16_t resubmit_num = 0, old_used_idx, num; 1952 struct rte_vhost_resubmit_info *resubmit; 1953 struct rte_vhost_inflight_info_packed *inflight_packed; 1954 1955 if (!(dev->protocol_features & 1956 (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) 1957 return RTE_VHOST_MSG_RESULT_OK; 1958 1959 /* The frontend may still not support the inflight feature 1960 * although we negotiate the protocol feature. 1961 */ 1962 if ((!vq->inflight_packed)) 1963 return RTE_VHOST_MSG_RESULT_OK; 1964 1965 if (!vq->inflight_packed->version) { 1966 vq->inflight_packed->version = INFLIGHT_VERSION; 1967 return RTE_VHOST_MSG_RESULT_OK; 1968 } 1969 1970 if (vq->resubmit_inflight) 1971 return RTE_VHOST_MSG_RESULT_OK; 1972 1973 inflight_packed = vq->inflight_packed; 1974 vq->global_counter = 0; 1975 old_used_idx = inflight_packed->old_used_idx; 1976 1977 if (inflight_packed->used_idx != old_used_idx) { 1978 if (inflight_packed->desc[old_used_idx].inflight == 0) { 1979 inflight_packed->old_used_idx = 1980 inflight_packed->used_idx; 1981 inflight_packed->old_used_wrap_counter = 1982 inflight_packed->used_wrap_counter; 1983 inflight_packed->old_free_head = 1984 inflight_packed->free_head; 1985 } else { 1986 inflight_packed->used_idx = 1987 inflight_packed->old_used_idx; 1988 inflight_packed->used_wrap_counter = 1989 inflight_packed->old_used_wrap_counter; 1990 inflight_packed->free_head = 1991 inflight_packed->old_free_head; 1992 } 1993 } 1994 1995 for (i = 0; i < inflight_packed->desc_num; i++) { 1996 if (inflight_packed->desc[i].inflight == 1) 1997 resubmit_num++; 1998 } 1999 2000 if (resubmit_num) { 2001 resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info), 2002 0, vq->numa_node); 2003 if (resubmit == NULL) { 2004 VHOST_LOG_CONFIG(ERR, 2005 "(%s) failed to allocate memory for resubmit info.\n", 2006 dev->ifname); 2007 return RTE_VHOST_MSG_RESULT_ERR; 2008 } 2009 2010 resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", 2011 resubmit_num * sizeof(struct rte_vhost_resubmit_desc), 2012 0, vq->numa_node); 2013 if (resubmit->resubmit_list == NULL) { 2014 VHOST_LOG_CONFIG(ERR, 2015 "(%s) failed to allocate memory for resubmit desc.\n", 2016 dev->ifname); 2017 rte_free(resubmit); 2018 return RTE_VHOST_MSG_RESULT_ERR; 2019 } 2020 2021 num = 0; 2022 for (i = 0; i < inflight_packed->desc_num; i++) { 2023 if (vq->inflight_packed->desc[i].inflight == 1) { 2024 resubmit->resubmit_list[num].index = i; 2025 resubmit->resubmit_list[num].counter = 2026 inflight_packed->desc[i].counter; 2027 num++; 2028 } 2029 } 2030 resubmit->resubmit_num = num; 2031 2032 if (resubmit->resubmit_num > 1) 2033 qsort(resubmit->resubmit_list, resubmit->resubmit_num, 2034 sizeof(struct rte_vhost_resubmit_desc), 2035 resubmit_desc_compare); 2036 2037 vq->global_counter = resubmit->resubmit_list[0].counter + 1; 2038 vq->resubmit_inflight = resubmit; 2039 } 2040 2041 return RTE_VHOST_MSG_RESULT_OK; 2042 } 2043 2044 static int 2045 vhost_user_set_vring_kick(struct virtio_net **pdev, 2046 struct vhu_msg_context *ctx, 2047 int main_fd __rte_unused) 2048 { 2049 struct virtio_net *dev = *pdev; 2050 struct vhost_vring_file file; 2051 struct vhost_virtqueue *vq; 2052 int expected_fds; 2053 2054 expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1; 2055 if (validate_msg_fds(dev, ctx, expected_fds) != 0) 2056 return RTE_VHOST_MSG_RESULT_ERR; 2057 2058 file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 2059 if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) 2060 file.fd = VIRTIO_INVALID_EVENTFD; 2061 else 2062 file.fd = ctx->fds[0]; 2063 VHOST_LOG_CONFIG(INFO, "(%s) vring kick idx:%d file:%d\n", 2064 dev->ifname, file.index, file.fd); 2065 2066 /* Interpret ring addresses only when ring is started. */ 2067 dev = translate_ring_addresses(dev, file.index); 2068 if (!dev) { 2069 if (file.fd != VIRTIO_INVALID_EVENTFD) 2070 close(file.fd); 2071 2072 return RTE_VHOST_MSG_RESULT_ERR; 2073 } 2074 2075 *pdev = dev; 2076 2077 vq = dev->virtqueue[file.index]; 2078 2079 /* 2080 * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated, 2081 * the ring starts already enabled. Otherwise, it is enabled via 2082 * the SET_VRING_ENABLE message. 2083 */ 2084 if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { 2085 vq->enabled = true; 2086 } 2087 2088 if (vq->ready) { 2089 vq->ready = false; 2090 vhost_user_notify_queue_state(dev, file.index, 0); 2091 } 2092 2093 if (vq->kickfd >= 0) 2094 close(vq->kickfd); 2095 vq->kickfd = file.fd; 2096 2097 if (vq_is_packed(dev)) { 2098 if (vhost_check_queue_inflights_packed(dev, vq)) { 2099 VHOST_LOG_CONFIG(ERR, "(%s) failed to inflights for vq: %d\n", 2100 dev->ifname, file.index); 2101 return RTE_VHOST_MSG_RESULT_ERR; 2102 } 2103 } else { 2104 if (vhost_check_queue_inflights_split(dev, vq)) { 2105 VHOST_LOG_CONFIG(ERR, "(%s) failed to inflights for vq: %d\n", 2106 dev->ifname, file.index); 2107 return RTE_VHOST_MSG_RESULT_ERR; 2108 } 2109 } 2110 2111 return RTE_VHOST_MSG_RESULT_OK; 2112 } 2113 2114 /* 2115 * when virtio is stopped, qemu will send us the GET_VRING_BASE message. 2116 */ 2117 static int 2118 vhost_user_get_vring_base(struct virtio_net **pdev, 2119 struct vhu_msg_context *ctx, 2120 int main_fd __rte_unused) 2121 { 2122 struct virtio_net *dev = *pdev; 2123 struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index]; 2124 uint64_t val; 2125 2126 if (validate_msg_fds(dev, ctx, 0) != 0) 2127 return RTE_VHOST_MSG_RESULT_ERR; 2128 2129 /* We have to stop the queue (virtio) if it is running. */ 2130 vhost_destroy_device_notify(dev); 2131 2132 dev->flags &= ~VIRTIO_DEV_READY; 2133 dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; 2134 2135 /* Here we are safe to get the indexes */ 2136 if (vq_is_packed(dev)) { 2137 /* 2138 * Bit[0:14]: avail index 2139 * Bit[15]: avail wrap counter 2140 */ 2141 val = vq->last_avail_idx & 0x7fff; 2142 val |= vq->avail_wrap_counter << 15; 2143 ctx->msg.payload.state.num = val; 2144 } else { 2145 ctx->msg.payload.state.num = vq->last_avail_idx; 2146 } 2147 2148 VHOST_LOG_CONFIG(INFO, "(%s) vring base idx:%d file:%d\n", 2149 dev->ifname, ctx->msg.payload.state.index, 2150 ctx->msg.payload.state.num); 2151 /* 2152 * Based on current qemu vhost-user implementation, this message is 2153 * sent and only sent in vhost_vring_stop. 2154 * TODO: cleanup the vring, it isn't usable since here. 2155 */ 2156 if (vq->kickfd >= 0) 2157 close(vq->kickfd); 2158 2159 vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; 2160 2161 if (vq->callfd >= 0) 2162 close(vq->callfd); 2163 2164 vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; 2165 2166 vq->signalled_used_valid = false; 2167 2168 if (vq_is_packed(dev)) { 2169 rte_free(vq->shadow_used_packed); 2170 vq->shadow_used_packed = NULL; 2171 } else { 2172 rte_free(vq->shadow_used_split); 2173 vq->shadow_used_split = NULL; 2174 } 2175 2176 rte_free(vq->batch_copy_elems); 2177 vq->batch_copy_elems = NULL; 2178 2179 rte_free(vq->log_cache); 2180 vq->log_cache = NULL; 2181 2182 ctx->msg.size = sizeof(ctx->msg.payload.state); 2183 ctx->fd_num = 0; 2184 2185 vhost_user_iotlb_flush_all(vq); 2186 2187 vring_invalidate(dev, vq); 2188 2189 return RTE_VHOST_MSG_RESULT_REPLY; 2190 } 2191 2192 /* 2193 * when virtio queues are ready to work, qemu will send us to 2194 * enable the virtio queue pair. 2195 */ 2196 static int 2197 vhost_user_set_vring_enable(struct virtio_net **pdev, 2198 struct vhu_msg_context *ctx, 2199 int main_fd __rte_unused) 2200 { 2201 struct virtio_net *dev = *pdev; 2202 bool enable = !!ctx->msg.payload.state.num; 2203 int index = (int)ctx->msg.payload.state.index; 2204 2205 if (validate_msg_fds(dev, ctx, 0) != 0) 2206 return RTE_VHOST_MSG_RESULT_ERR; 2207 2208 VHOST_LOG_CONFIG(INFO, "(%s) set queue enable: %d to qp idx: %d\n", 2209 dev->ifname, enable, index); 2210 2211 if (enable && dev->virtqueue[index]->async) { 2212 if (dev->virtqueue[index]->async->pkts_inflight_n) { 2213 VHOST_LOG_CONFIG(ERR, 2214 "(%s) failed to enable vring. Inflight packets must be completed first\n", 2215 dev->ifname); 2216 return RTE_VHOST_MSG_RESULT_ERR; 2217 } 2218 } 2219 2220 dev->virtqueue[index]->enabled = enable; 2221 2222 return RTE_VHOST_MSG_RESULT_OK; 2223 } 2224 2225 static int 2226 vhost_user_get_protocol_features(struct virtio_net **pdev, 2227 struct vhu_msg_context *ctx, 2228 int main_fd __rte_unused) 2229 { 2230 struct virtio_net *dev = *pdev; 2231 uint64_t features, protocol_features; 2232 2233 if (validate_msg_fds(dev, ctx, 0) != 0) 2234 return RTE_VHOST_MSG_RESULT_ERR; 2235 2236 rte_vhost_driver_get_features(dev->ifname, &features); 2237 rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features); 2238 2239 ctx->msg.payload.u64 = protocol_features; 2240 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2241 ctx->fd_num = 0; 2242 2243 return RTE_VHOST_MSG_RESULT_REPLY; 2244 } 2245 2246 static int 2247 vhost_user_set_protocol_features(struct virtio_net **pdev, 2248 struct vhu_msg_context *ctx, 2249 int main_fd __rte_unused) 2250 { 2251 struct virtio_net *dev = *pdev; 2252 uint64_t protocol_features = ctx->msg.payload.u64; 2253 uint64_t slave_protocol_features = 0; 2254 2255 if (validate_msg_fds(dev, ctx, 0) != 0) 2256 return RTE_VHOST_MSG_RESULT_ERR; 2257 2258 rte_vhost_driver_get_protocol_features(dev->ifname, 2259 &slave_protocol_features); 2260 if (protocol_features & ~slave_protocol_features) { 2261 VHOST_LOG_CONFIG(ERR, "(%s) received invalid protocol features.\n", dev->ifname); 2262 return RTE_VHOST_MSG_RESULT_ERR; 2263 } 2264 2265 dev->protocol_features = protocol_features; 2266 VHOST_LOG_CONFIG(INFO, "(%s) negotiated Vhost-user protocol features: 0x%" PRIx64 "\n", 2267 dev->ifname, dev->protocol_features); 2268 2269 return RTE_VHOST_MSG_RESULT_OK; 2270 } 2271 2272 static int 2273 vhost_user_set_log_base(struct virtio_net **pdev, 2274 struct vhu_msg_context *ctx, 2275 int main_fd __rte_unused) 2276 { 2277 struct virtio_net *dev = *pdev; 2278 int fd = ctx->fds[0]; 2279 uint64_t size, off; 2280 void *addr; 2281 uint32_t i; 2282 2283 if (validate_msg_fds(dev, ctx, 1) != 0) 2284 return RTE_VHOST_MSG_RESULT_ERR; 2285 2286 if (fd < 0) { 2287 VHOST_LOG_CONFIG(ERR, "(%s) invalid log fd: %d\n", dev->ifname, fd); 2288 return RTE_VHOST_MSG_RESULT_ERR; 2289 } 2290 2291 if (ctx->msg.size != sizeof(VhostUserLog)) { 2292 VHOST_LOG_CONFIG(ERR, "(%s) invalid log base msg size: %"PRId32" != %d\n", 2293 dev->ifname, ctx->msg.size, (int)sizeof(VhostUserLog)); 2294 goto close_msg_fds; 2295 } 2296 2297 size = ctx->msg.payload.log.mmap_size; 2298 off = ctx->msg.payload.log.mmap_offset; 2299 2300 /* Check for mmap size and offset overflow. */ 2301 if (off >= -size) { 2302 VHOST_LOG_CONFIG(ERR, 2303 "(%s) log offset %#"PRIx64" and log size %#"PRIx64" overflow\n", 2304 dev->ifname, off, size); 2305 goto close_msg_fds; 2306 } 2307 2308 VHOST_LOG_CONFIG(INFO, "(%s) log mmap size: %"PRId64", offset: %"PRId64"\n", 2309 dev->ifname, size, off); 2310 2311 /* 2312 * mmap from 0 to workaround a hugepage mmap bug: mmap will 2313 * fail when offset is not page size aligned. 2314 */ 2315 addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 2316 close(fd); 2317 if (addr == MAP_FAILED) { 2318 VHOST_LOG_CONFIG(ERR, "(%s) mmap log base failed!\n", dev->ifname); 2319 return RTE_VHOST_MSG_RESULT_ERR; 2320 } 2321 2322 /* 2323 * Free previously mapped log memory on occasionally 2324 * multiple VHOST_USER_SET_LOG_BASE. 2325 */ 2326 if (dev->log_addr) { 2327 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); 2328 } 2329 dev->log_addr = (uint64_t)(uintptr_t)addr; 2330 dev->log_base = dev->log_addr + off; 2331 dev->log_size = size; 2332 2333 for (i = 0; i < dev->nr_vring; i++) { 2334 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2335 2336 rte_free(vq->log_cache); 2337 vq->log_cache = NULL; 2338 vq->log_cache_nb_elem = 0; 2339 vq->log_cache = rte_malloc_socket("vq log cache", 2340 sizeof(struct log_cache_entry) * VHOST_LOG_CACHE_NR, 2341 0, vq->numa_node); 2342 /* 2343 * If log cache alloc fail, don't fail migration, but no 2344 * caching will be done, which will impact performance 2345 */ 2346 if (!vq->log_cache) 2347 VHOST_LOG_CONFIG(ERR, "(%s) failed to allocate VQ logging cache\n", 2348 dev->ifname); 2349 } 2350 2351 /* 2352 * The spec is not clear about it (yet), but QEMU doesn't expect 2353 * any payload in the reply. 2354 */ 2355 ctx->msg.size = 0; 2356 ctx->fd_num = 0; 2357 2358 return RTE_VHOST_MSG_RESULT_REPLY; 2359 2360 close_msg_fds: 2361 close_msg_fds(ctx); 2362 return RTE_VHOST_MSG_RESULT_ERR; 2363 } 2364 2365 static int vhost_user_set_log_fd(struct virtio_net **pdev, 2366 struct vhu_msg_context *ctx, 2367 int main_fd __rte_unused) 2368 { 2369 struct virtio_net *dev = *pdev; 2370 2371 if (validate_msg_fds(dev, ctx, 1) != 0) 2372 return RTE_VHOST_MSG_RESULT_ERR; 2373 2374 close(ctx->fds[0]); 2375 VHOST_LOG_CONFIG(INFO, "(%s) not implemented.\n", dev->ifname); 2376 2377 return RTE_VHOST_MSG_RESULT_OK; 2378 } 2379 2380 /* 2381 * An rarp packet is constructed and broadcasted to notify switches about 2382 * the new location of the migrated VM, so that packets from outside will 2383 * not be lost after migration. 2384 * 2385 * However, we don't actually "send" a rarp packet here, instead, we set 2386 * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. 2387 */ 2388 static int 2389 vhost_user_send_rarp(struct virtio_net **pdev, 2390 struct vhu_msg_context *ctx, 2391 int main_fd __rte_unused) 2392 { 2393 struct virtio_net *dev = *pdev; 2394 uint8_t *mac = (uint8_t *)&ctx->msg.payload.u64; 2395 struct rte_vdpa_device *vdpa_dev; 2396 2397 if (validate_msg_fds(dev, ctx, 0) != 0) 2398 return RTE_VHOST_MSG_RESULT_ERR; 2399 2400 VHOST_LOG_CONFIG(DEBUG, "(%s) MAC: " RTE_ETHER_ADDR_PRT_FMT "\n", 2401 dev->ifname, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); 2402 memcpy(dev->mac.addr_bytes, mac, 6); 2403 2404 /* 2405 * Set the flag to inject a RARP broadcast packet at 2406 * rte_vhost_dequeue_burst(). 2407 * 2408 * __ATOMIC_RELEASE ordering is for making sure the mac is 2409 * copied before the flag is set. 2410 */ 2411 __atomic_store_n(&dev->broadcast_rarp, 1, __ATOMIC_RELEASE); 2412 vdpa_dev = dev->vdpa_dev; 2413 if (vdpa_dev && vdpa_dev->ops->migration_done) 2414 vdpa_dev->ops->migration_done(dev->vid); 2415 2416 return RTE_VHOST_MSG_RESULT_OK; 2417 } 2418 2419 static int 2420 vhost_user_net_set_mtu(struct virtio_net **pdev, 2421 struct vhu_msg_context *ctx, 2422 int main_fd __rte_unused) 2423 { 2424 struct virtio_net *dev = *pdev; 2425 2426 if (validate_msg_fds(dev, ctx, 0) != 0) 2427 return RTE_VHOST_MSG_RESULT_ERR; 2428 2429 if (ctx->msg.payload.u64 < VIRTIO_MIN_MTU || 2430 ctx->msg.payload.u64 > VIRTIO_MAX_MTU) { 2431 VHOST_LOG_CONFIG(ERR, "(%s) invalid MTU size (%"PRIu64")\n", 2432 dev->ifname, ctx->msg.payload.u64); 2433 2434 return RTE_VHOST_MSG_RESULT_ERR; 2435 } 2436 2437 dev->mtu = ctx->msg.payload.u64; 2438 2439 return RTE_VHOST_MSG_RESULT_OK; 2440 } 2441 2442 static int 2443 vhost_user_set_req_fd(struct virtio_net **pdev, 2444 struct vhu_msg_context *ctx, 2445 int main_fd __rte_unused) 2446 { 2447 struct virtio_net *dev = *pdev; 2448 int fd = ctx->fds[0]; 2449 2450 if (validate_msg_fds(dev, ctx, 1) != 0) 2451 return RTE_VHOST_MSG_RESULT_ERR; 2452 2453 if (fd < 0) { 2454 VHOST_LOG_CONFIG(ERR, "(%s) invalid file descriptor for slave channel (%d)\n", 2455 dev->ifname, fd); 2456 return RTE_VHOST_MSG_RESULT_ERR; 2457 } 2458 2459 if (dev->slave_req_fd >= 0) 2460 close(dev->slave_req_fd); 2461 2462 dev->slave_req_fd = fd; 2463 2464 return RTE_VHOST_MSG_RESULT_OK; 2465 } 2466 2467 static int 2468 is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2469 { 2470 struct vhost_vring_addr *ra; 2471 uint64_t start, end, len; 2472 2473 start = imsg->iova; 2474 end = start + imsg->size; 2475 2476 ra = &vq->ring_addrs; 2477 len = sizeof(struct vring_desc) * vq->size; 2478 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2479 return 1; 2480 2481 len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; 2482 if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2483 return 1; 2484 2485 len = sizeof(struct vring_used) + 2486 sizeof(struct vring_used_elem) * vq->size; 2487 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2488 return 1; 2489 2490 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2491 len = sizeof(uint64_t); 2492 if (ra->log_guest_addr < end && 2493 (ra->log_guest_addr + len) > start) 2494 return 1; 2495 } 2496 2497 return 0; 2498 } 2499 2500 static int 2501 is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) 2502 { 2503 struct vhost_vring_addr *ra; 2504 uint64_t start, end, len; 2505 2506 start = imsg->iova; 2507 end = start + imsg->size; 2508 2509 ra = &vq->ring_addrs; 2510 len = sizeof(struct vring_packed_desc) * vq->size; 2511 if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start) 2512 return 1; 2513 2514 len = sizeof(struct vring_packed_desc_event); 2515 if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start) 2516 return 1; 2517 2518 len = sizeof(struct vring_packed_desc_event); 2519 if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) 2520 return 1; 2521 2522 if (ra->flags & (1 << VHOST_VRING_F_LOG)) { 2523 len = sizeof(uint64_t); 2524 if (ra->log_guest_addr < end && 2525 (ra->log_guest_addr + len) > start) 2526 return 1; 2527 } 2528 2529 return 0; 2530 } 2531 2532 static int is_vring_iotlb(struct virtio_net *dev, 2533 struct vhost_virtqueue *vq, 2534 struct vhost_iotlb_msg *imsg) 2535 { 2536 if (vq_is_packed(dev)) 2537 return is_vring_iotlb_packed(vq, imsg); 2538 else 2539 return is_vring_iotlb_split(vq, imsg); 2540 } 2541 2542 static int 2543 vhost_user_iotlb_msg(struct virtio_net **pdev, 2544 struct vhu_msg_context *ctx, 2545 int main_fd __rte_unused) 2546 { 2547 struct virtio_net *dev = *pdev; 2548 struct vhost_iotlb_msg *imsg = &ctx->msg.payload.iotlb; 2549 uint16_t i; 2550 uint64_t vva, len; 2551 2552 if (validate_msg_fds(dev, ctx, 0) != 0) 2553 return RTE_VHOST_MSG_RESULT_ERR; 2554 2555 switch (imsg->type) { 2556 case VHOST_IOTLB_UPDATE: 2557 len = imsg->size; 2558 vva = qva_to_vva(dev, imsg->uaddr, &len); 2559 if (!vva) 2560 return RTE_VHOST_MSG_RESULT_ERR; 2561 2562 for (i = 0; i < dev->nr_vring; i++) { 2563 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2564 2565 if (!vq) 2566 continue; 2567 2568 vhost_user_iotlb_cache_insert(dev, vq, imsg->iova, vva, 2569 len, imsg->perm); 2570 2571 if (is_vring_iotlb(dev, vq, imsg)) 2572 *pdev = dev = translate_ring_addresses(dev, i); 2573 } 2574 break; 2575 case VHOST_IOTLB_INVALIDATE: 2576 for (i = 0; i < dev->nr_vring; i++) { 2577 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2578 2579 if (!vq) 2580 continue; 2581 2582 vhost_user_iotlb_cache_remove(vq, imsg->iova, 2583 imsg->size); 2584 2585 if (is_vring_iotlb(dev, vq, imsg)) 2586 vring_invalidate(dev, vq); 2587 } 2588 break; 2589 default: 2590 VHOST_LOG_CONFIG(ERR, "(%s) invalid IOTLB message type (%d)\n", 2591 dev->ifname, imsg->type); 2592 return RTE_VHOST_MSG_RESULT_ERR; 2593 } 2594 2595 return RTE_VHOST_MSG_RESULT_OK; 2596 } 2597 2598 static int 2599 vhost_user_set_postcopy_advise(struct virtio_net **pdev, 2600 struct vhu_msg_context *ctx, 2601 int main_fd __rte_unused) 2602 { 2603 struct virtio_net *dev = *pdev; 2604 #ifdef RTE_LIBRTE_VHOST_POSTCOPY 2605 struct uffdio_api api_struct; 2606 2607 if (validate_msg_fds(dev, ctx, 0) != 0) 2608 return RTE_VHOST_MSG_RESULT_ERR; 2609 2610 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 2611 2612 if (dev->postcopy_ufd == -1) { 2613 VHOST_LOG_CONFIG(ERR, "(%s) userfaultfd not available: %s\n", 2614 dev->ifname, strerror(errno)); 2615 return RTE_VHOST_MSG_RESULT_ERR; 2616 } 2617 api_struct.api = UFFD_API; 2618 api_struct.features = 0; 2619 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 2620 VHOST_LOG_CONFIG(ERR, "(%s) UFFDIO_API ioctl failure: %s\n", 2621 dev->ifname, strerror(errno)); 2622 close(dev->postcopy_ufd); 2623 dev->postcopy_ufd = -1; 2624 return RTE_VHOST_MSG_RESULT_ERR; 2625 } 2626 ctx->fds[0] = dev->postcopy_ufd; 2627 ctx->fd_num = 1; 2628 2629 return RTE_VHOST_MSG_RESULT_REPLY; 2630 #else 2631 dev->postcopy_ufd = -1; 2632 ctx->fd_num = 0; 2633 2634 return RTE_VHOST_MSG_RESULT_ERR; 2635 #endif 2636 } 2637 2638 static int 2639 vhost_user_set_postcopy_listen(struct virtio_net **pdev, 2640 struct vhu_msg_context *ctx __rte_unused, 2641 int main_fd __rte_unused) 2642 { 2643 struct virtio_net *dev = *pdev; 2644 2645 if (validate_msg_fds(dev, ctx, 0) != 0) 2646 return RTE_VHOST_MSG_RESULT_ERR; 2647 2648 if (dev->mem && dev->mem->nregions) { 2649 VHOST_LOG_CONFIG(ERR, "(%s) regions already registered at postcopy-listen\n", 2650 dev->ifname); 2651 return RTE_VHOST_MSG_RESULT_ERR; 2652 } 2653 dev->postcopy_listening = 1; 2654 2655 return RTE_VHOST_MSG_RESULT_OK; 2656 } 2657 2658 static int 2659 vhost_user_postcopy_end(struct virtio_net **pdev, 2660 struct vhu_msg_context *ctx, 2661 int main_fd __rte_unused) 2662 { 2663 struct virtio_net *dev = *pdev; 2664 2665 if (validate_msg_fds(dev, ctx, 0) != 0) 2666 return RTE_VHOST_MSG_RESULT_ERR; 2667 2668 dev->postcopy_listening = 0; 2669 if (dev->postcopy_ufd >= 0) { 2670 close(dev->postcopy_ufd); 2671 dev->postcopy_ufd = -1; 2672 } 2673 2674 ctx->msg.payload.u64 = 0; 2675 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2676 ctx->fd_num = 0; 2677 2678 return RTE_VHOST_MSG_RESULT_REPLY; 2679 } 2680 2681 static int 2682 vhost_user_get_status(struct virtio_net **pdev, 2683 struct vhu_msg_context *ctx, 2684 int main_fd __rte_unused) 2685 { 2686 struct virtio_net *dev = *pdev; 2687 2688 if (validate_msg_fds(dev, ctx, 0) != 0) 2689 return RTE_VHOST_MSG_RESULT_ERR; 2690 2691 ctx->msg.payload.u64 = dev->status; 2692 ctx->msg.size = sizeof(ctx->msg.payload.u64); 2693 ctx->fd_num = 0; 2694 2695 return RTE_VHOST_MSG_RESULT_REPLY; 2696 } 2697 2698 static int 2699 vhost_user_set_status(struct virtio_net **pdev, 2700 struct vhu_msg_context *ctx, 2701 int main_fd __rte_unused) 2702 { 2703 struct virtio_net *dev = *pdev; 2704 2705 if (validate_msg_fds(dev, ctx, 0) != 0) 2706 return RTE_VHOST_MSG_RESULT_ERR; 2707 2708 /* As per Virtio specification, the device status is 8bits long */ 2709 if (ctx->msg.payload.u64 > UINT8_MAX) { 2710 VHOST_LOG_CONFIG(ERR, "(%s) invalid VHOST_USER_SET_STATUS payload 0x%" PRIx64 "\n", 2711 dev->ifname, ctx->msg.payload.u64); 2712 return RTE_VHOST_MSG_RESULT_ERR; 2713 } 2714 2715 dev->status = ctx->msg.payload.u64; 2716 2717 if ((dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK) && 2718 (dev->flags & VIRTIO_DEV_FEATURES_FAILED)) { 2719 VHOST_LOG_CONFIG(ERR, 2720 "(%s) FEATURES_OK bit is set but feature negotiation failed\n", 2721 dev->ifname); 2722 /* 2723 * Clear the bit to let the driver know about the feature 2724 * negotiation failure 2725 */ 2726 dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK; 2727 } 2728 2729 VHOST_LOG_CONFIG(INFO, "(%s) new device status(0x%08x):\n", dev->ifname, 2730 dev->status); 2731 VHOST_LOG_CONFIG(INFO, "(%s)\t-RESET: %u\n", dev->ifname, 2732 (dev->status == VIRTIO_DEVICE_STATUS_RESET)); 2733 VHOST_LOG_CONFIG(INFO, "(%s)\t-ACKNOWLEDGE: %u\n", dev->ifname, 2734 !!(dev->status & VIRTIO_DEVICE_STATUS_ACK)); 2735 VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER: %u\n", dev->ifname, 2736 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER)); 2737 VHOST_LOG_CONFIG(INFO, "(%s)\t-FEATURES_OK: %u\n", dev->ifname, 2738 !!(dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK)); 2739 VHOST_LOG_CONFIG(INFO, "(%s)\t-DRIVER_OK: %u\n", dev->ifname, 2740 !!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK)); 2741 VHOST_LOG_CONFIG(INFO, "(%s)\t-DEVICE_NEED_RESET: %u\n", dev->ifname, 2742 !!(dev->status & VIRTIO_DEVICE_STATUS_DEV_NEED_RESET)); 2743 VHOST_LOG_CONFIG(INFO, "(%s)\t-FAILED: %u\n", dev->ifname, 2744 !!(dev->status & VIRTIO_DEVICE_STATUS_FAILED)); 2745 2746 return RTE_VHOST_MSG_RESULT_OK; 2747 } 2748 2749 typedef int (*vhost_message_handler_t)(struct virtio_net **pdev, 2750 struct vhu_msg_context *ctx, 2751 int main_fd); 2752 2753 static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = { 2754 [VHOST_USER_NONE] = NULL, 2755 [VHOST_USER_GET_FEATURES] = vhost_user_get_features, 2756 [VHOST_USER_SET_FEATURES] = vhost_user_set_features, 2757 [VHOST_USER_SET_OWNER] = vhost_user_set_owner, 2758 [VHOST_USER_RESET_OWNER] = vhost_user_reset_owner, 2759 [VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table, 2760 [VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base, 2761 [VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd, 2762 [VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num, 2763 [VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr, 2764 [VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base, 2765 [VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base, 2766 [VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick, 2767 [VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call, 2768 [VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err, 2769 [VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features, 2770 [VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features, 2771 [VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num, 2772 [VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable, 2773 [VHOST_USER_SEND_RARP] = vhost_user_send_rarp, 2774 [VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu, 2775 [VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd, 2776 [VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg, 2777 [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise, 2778 [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen, 2779 [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end, 2780 [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd, 2781 [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd, 2782 [VHOST_USER_SET_STATUS] = vhost_user_set_status, 2783 [VHOST_USER_GET_STATUS] = vhost_user_get_status, 2784 }; 2785 2786 /* return bytes# of read on success or negative val on failure. */ 2787 static int 2788 read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2789 { 2790 int ret; 2791 2792 ret = read_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, VHOST_USER_HDR_SIZE, 2793 ctx->fds, VHOST_MEMORY_MAX_NREGIONS, &ctx->fd_num); 2794 if (ret <= 0) { 2795 return ret; 2796 } else if (ret != VHOST_USER_HDR_SIZE) { 2797 VHOST_LOG_CONFIG(ERR, "(%s) Unexpected header size read\n", dev->ifname); 2798 close_msg_fds(ctx); 2799 return -1; 2800 } 2801 2802 if (ctx->msg.size) { 2803 if (ctx->msg.size > sizeof(ctx->msg.payload)) { 2804 VHOST_LOG_CONFIG(ERR, "(%s) invalid msg size: %d\n", 2805 dev->ifname, ctx->msg.size); 2806 return -1; 2807 } 2808 ret = read(sockfd, &ctx->msg.payload, ctx->msg.size); 2809 if (ret <= 0) 2810 return ret; 2811 if (ret != (int)ctx->msg.size) { 2812 VHOST_LOG_CONFIG(ERR, "(%s) read control message failed\n", dev->ifname); 2813 return -1; 2814 } 2815 } 2816 2817 return ret; 2818 } 2819 2820 static int 2821 send_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2822 { 2823 if (!ctx) 2824 return 0; 2825 2826 return send_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, 2827 VHOST_USER_HDR_SIZE + ctx->msg.size, ctx->fds, ctx->fd_num); 2828 } 2829 2830 static int 2831 send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx) 2832 { 2833 if (!ctx) 2834 return 0; 2835 2836 ctx->msg.flags &= ~VHOST_USER_VERSION_MASK; 2837 ctx->msg.flags &= ~VHOST_USER_NEED_REPLY; 2838 ctx->msg.flags |= VHOST_USER_VERSION; 2839 ctx->msg.flags |= VHOST_USER_REPLY_MASK; 2840 2841 return send_vhost_message(dev, sockfd, ctx); 2842 } 2843 2844 static int 2845 send_vhost_slave_message(struct virtio_net *dev, 2846 struct vhu_msg_context *ctx) 2847 { 2848 int ret; 2849 2850 if (ctx->msg.flags & VHOST_USER_NEED_REPLY) 2851 rte_spinlock_lock(&dev->slave_req_lock); 2852 2853 ret = send_vhost_message(dev, dev->slave_req_fd, ctx); 2854 if (ret < 0 && (ctx->msg.flags & VHOST_USER_NEED_REPLY)) 2855 rte_spinlock_unlock(&dev->slave_req_lock); 2856 2857 return ret; 2858 } 2859 2860 /* 2861 * Allocate a queue pair if it hasn't been allocated yet 2862 */ 2863 static int 2864 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, 2865 struct vhu_msg_context *ctx) 2866 { 2867 uint32_t vring_idx; 2868 2869 switch (ctx->msg.request.master) { 2870 case VHOST_USER_SET_VRING_KICK: 2871 case VHOST_USER_SET_VRING_CALL: 2872 case VHOST_USER_SET_VRING_ERR: 2873 vring_idx = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; 2874 break; 2875 case VHOST_USER_SET_VRING_NUM: 2876 case VHOST_USER_SET_VRING_BASE: 2877 case VHOST_USER_GET_VRING_BASE: 2878 case VHOST_USER_SET_VRING_ENABLE: 2879 vring_idx = ctx->msg.payload.state.index; 2880 break; 2881 case VHOST_USER_SET_VRING_ADDR: 2882 vring_idx = ctx->msg.payload.addr.index; 2883 break; 2884 default: 2885 return 0; 2886 } 2887 2888 if (vring_idx >= VHOST_MAX_VRING) { 2889 VHOST_LOG_CONFIG(ERR, "(%s) invalid vring index: %u\n", dev->ifname, vring_idx); 2890 return -1; 2891 } 2892 2893 if (dev->virtqueue[vring_idx]) 2894 return 0; 2895 2896 return alloc_vring_queue(dev, vring_idx); 2897 } 2898 2899 static void 2900 vhost_user_lock_all_queue_pairs(struct virtio_net *dev) 2901 { 2902 unsigned int i = 0; 2903 unsigned int vq_num = 0; 2904 2905 while (vq_num < dev->nr_vring) { 2906 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2907 2908 if (vq) { 2909 rte_spinlock_lock(&vq->access_lock); 2910 vq_num++; 2911 } 2912 i++; 2913 } 2914 } 2915 2916 static void 2917 vhost_user_unlock_all_queue_pairs(struct virtio_net *dev) 2918 { 2919 unsigned int i = 0; 2920 unsigned int vq_num = 0; 2921 2922 while (vq_num < dev->nr_vring) { 2923 struct vhost_virtqueue *vq = dev->virtqueue[i]; 2924 2925 if (vq) { 2926 rte_spinlock_unlock(&vq->access_lock); 2927 vq_num++; 2928 } 2929 i++; 2930 } 2931 } 2932 2933 int 2934 vhost_user_msg_handler(int vid, int fd) 2935 { 2936 struct virtio_net *dev; 2937 struct vhu_msg_context ctx; 2938 struct rte_vdpa_device *vdpa_dev; 2939 int ret; 2940 int unlock_required = 0; 2941 bool handled; 2942 int request; 2943 uint32_t i; 2944 2945 dev = get_device(vid); 2946 if (dev == NULL) 2947 return -1; 2948 2949 if (!dev->notify_ops) { 2950 dev->notify_ops = vhost_driver_callback_get(dev->ifname); 2951 if (!dev->notify_ops) { 2952 VHOST_LOG_CONFIG(ERR, "(%s) failed to get callback ops for driver\n", 2953 dev->ifname); 2954 return -1; 2955 } 2956 } 2957 2958 ret = read_vhost_message(dev, fd, &ctx); 2959 if (ret <= 0) { 2960 if (ret < 0) 2961 VHOST_LOG_CONFIG(ERR, "(%s) vhost read message failed\n", dev->ifname); 2962 else 2963 VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname); 2964 2965 return -1; 2966 } 2967 2968 ret = 0; 2969 request = ctx.msg.request.master; 2970 if (request > VHOST_USER_NONE && request < VHOST_USER_MAX && 2971 vhost_message_str[request]) { 2972 if (request != VHOST_USER_IOTLB_MSG) 2973 VHOST_LOG_CONFIG(INFO, "(%s) read message %s\n", 2974 dev->ifname, vhost_message_str[request]); 2975 else 2976 VHOST_LOG_CONFIG(DEBUG, "(%s) read message %s\n", 2977 dev->ifname, vhost_message_str[request]); 2978 } else { 2979 VHOST_LOG_CONFIG(DEBUG, "(%s) external request %d\n", dev->ifname, request); 2980 } 2981 2982 ret = vhost_user_check_and_alloc_queue_pair(dev, &ctx); 2983 if (ret < 0) { 2984 VHOST_LOG_CONFIG(ERR, "(%s) failed to alloc queue\n", dev->ifname); 2985 return -1; 2986 } 2987 2988 /* 2989 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE 2990 * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops 2991 * and device is destroyed. destroy_device waits for queues to be 2992 * inactive, so it is safe. Otherwise taking the access_lock 2993 * would cause a dead lock. 2994 */ 2995 switch (request) { 2996 case VHOST_USER_SET_FEATURES: 2997 case VHOST_USER_SET_PROTOCOL_FEATURES: 2998 case VHOST_USER_SET_OWNER: 2999 case VHOST_USER_SET_MEM_TABLE: 3000 case VHOST_USER_SET_LOG_BASE: 3001 case VHOST_USER_SET_LOG_FD: 3002 case VHOST_USER_SET_VRING_NUM: 3003 case VHOST_USER_SET_VRING_ADDR: 3004 case VHOST_USER_SET_VRING_BASE: 3005 case VHOST_USER_SET_VRING_KICK: 3006 case VHOST_USER_SET_VRING_CALL: 3007 case VHOST_USER_SET_VRING_ERR: 3008 case VHOST_USER_SET_VRING_ENABLE: 3009 case VHOST_USER_SEND_RARP: 3010 case VHOST_USER_NET_SET_MTU: 3011 case VHOST_USER_SET_SLAVE_REQ_FD: 3012 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3013 vhost_user_lock_all_queue_pairs(dev); 3014 unlock_required = 1; 3015 } 3016 break; 3017 default: 3018 break; 3019 3020 } 3021 3022 handled = false; 3023 if (dev->extern_ops.pre_msg_handle) { 3024 ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, 3025 (void *)&ctx.msg); 3026 switch (ret) { 3027 case RTE_VHOST_MSG_RESULT_REPLY: 3028 send_vhost_reply(dev, fd, &ctx); 3029 /* Fall-through */ 3030 case RTE_VHOST_MSG_RESULT_ERR: 3031 case RTE_VHOST_MSG_RESULT_OK: 3032 handled = true; 3033 goto skip_to_post_handle; 3034 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3035 default: 3036 break; 3037 } 3038 } 3039 3040 if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) { 3041 if (!vhost_message_handlers[request]) 3042 goto skip_to_post_handle; 3043 ret = vhost_message_handlers[request](&dev, &ctx, fd); 3044 3045 switch (ret) { 3046 case RTE_VHOST_MSG_RESULT_ERR: 3047 VHOST_LOG_CONFIG(ERR, "(%s) processing %s failed.\n", 3048 dev->ifname, vhost_message_str[request]); 3049 handled = true; 3050 break; 3051 case RTE_VHOST_MSG_RESULT_OK: 3052 VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded.\n", 3053 dev->ifname, vhost_message_str[request]); 3054 handled = true; 3055 break; 3056 case RTE_VHOST_MSG_RESULT_REPLY: 3057 VHOST_LOG_CONFIG(DEBUG, "(%s) processing %s succeeded and needs reply.\n", 3058 dev->ifname, vhost_message_str[request]); 3059 send_vhost_reply(dev, fd, &ctx); 3060 handled = true; 3061 break; 3062 default: 3063 break; 3064 } 3065 } 3066 3067 skip_to_post_handle: 3068 if (ret != RTE_VHOST_MSG_RESULT_ERR && 3069 dev->extern_ops.post_msg_handle) { 3070 ret = (*dev->extern_ops.post_msg_handle)(dev->vid, 3071 (void *)&ctx.msg); 3072 switch (ret) { 3073 case RTE_VHOST_MSG_RESULT_REPLY: 3074 send_vhost_reply(dev, fd, &ctx); 3075 /* Fall-through */ 3076 case RTE_VHOST_MSG_RESULT_ERR: 3077 case RTE_VHOST_MSG_RESULT_OK: 3078 handled = true; 3079 case RTE_VHOST_MSG_RESULT_NOT_HANDLED: 3080 default: 3081 break; 3082 } 3083 } 3084 3085 /* If message was not handled at this stage, treat it as an error */ 3086 if (!handled) { 3087 VHOST_LOG_CONFIG(ERR, "(%s) vhost message (req: %d) was not handled.\n", 3088 dev->ifname, request); 3089 close_msg_fds(&ctx); 3090 ret = RTE_VHOST_MSG_RESULT_ERR; 3091 } 3092 3093 /* 3094 * If the request required a reply that was already sent, 3095 * this optional reply-ack won't be sent as the 3096 * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply(). 3097 */ 3098 if (ctx.msg.flags & VHOST_USER_NEED_REPLY) { 3099 ctx.msg.payload.u64 = ret == RTE_VHOST_MSG_RESULT_ERR; 3100 ctx.msg.size = sizeof(ctx.msg.payload.u64); 3101 ctx.fd_num = 0; 3102 send_vhost_reply(dev, fd, &ctx); 3103 } else if (ret == RTE_VHOST_MSG_RESULT_ERR) { 3104 VHOST_LOG_CONFIG(ERR, "(%s) vhost message handling failed.\n", dev->ifname); 3105 return -1; 3106 } 3107 3108 for (i = 0; i < dev->nr_vring; i++) { 3109 struct vhost_virtqueue *vq = dev->virtqueue[i]; 3110 bool cur_ready = vq_is_ready(dev, vq); 3111 3112 if (cur_ready != (vq && vq->ready)) { 3113 vq->ready = cur_ready; 3114 vhost_user_notify_queue_state(dev, i, cur_ready); 3115 } 3116 } 3117 3118 if (unlock_required) 3119 vhost_user_unlock_all_queue_pairs(dev); 3120 3121 if (!virtio_is_ready(dev)) 3122 goto out; 3123 3124 /* 3125 * Virtio is now ready. If not done already, it is time 3126 * to notify the application it can process the rings and 3127 * configure the vDPA device if present. 3128 */ 3129 3130 if (!(dev->flags & VIRTIO_DEV_RUNNING)) { 3131 if (dev->notify_ops->new_device(dev->vid) == 0) 3132 dev->flags |= VIRTIO_DEV_RUNNING; 3133 } 3134 3135 vdpa_dev = dev->vdpa_dev; 3136 if (!vdpa_dev) 3137 goto out; 3138 3139 if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) { 3140 if (vdpa_dev->ops->dev_conf(dev->vid)) 3141 VHOST_LOG_CONFIG(ERR, "(%s) failed to configure vDPA device\n", 3142 dev->ifname); 3143 else 3144 dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; 3145 } 3146 3147 out: 3148 return 0; 3149 } 3150 3151 static int process_slave_message_reply(struct virtio_net *dev, 3152 const struct vhu_msg_context *ctx) 3153 { 3154 struct vhu_msg_context msg_reply; 3155 int ret; 3156 3157 if ((ctx->msg.flags & VHOST_USER_NEED_REPLY) == 0) 3158 return 0; 3159 3160 ret = read_vhost_message(dev, dev->slave_req_fd, &msg_reply); 3161 if (ret <= 0) { 3162 if (ret < 0) 3163 VHOST_LOG_CONFIG(ERR, "(%s) vhost read slave message reply failed\n", 3164 dev->ifname); 3165 else 3166 VHOST_LOG_CONFIG(INFO, "(%s) vhost peer closed\n", dev->ifname); 3167 ret = -1; 3168 goto out; 3169 } 3170 3171 ret = 0; 3172 if (msg_reply.msg.request.slave != ctx->msg.request.slave) { 3173 VHOST_LOG_CONFIG(ERR, "(%s) received unexpected msg type (%u), expected %u\n", 3174 dev->ifname, msg_reply.msg.request.slave, ctx->msg.request.slave); 3175 ret = -1; 3176 goto out; 3177 } 3178 3179 ret = msg_reply.msg.payload.u64 ? -1 : 0; 3180 3181 out: 3182 rte_spinlock_unlock(&dev->slave_req_lock); 3183 return ret; 3184 } 3185 3186 int 3187 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) 3188 { 3189 int ret; 3190 struct vhu_msg_context ctx = { 3191 .msg = { 3192 .request.slave = VHOST_USER_SLAVE_IOTLB_MSG, 3193 .flags = VHOST_USER_VERSION, 3194 .size = sizeof(ctx.msg.payload.iotlb), 3195 .payload.iotlb = { 3196 .iova = iova, 3197 .perm = perm, 3198 .type = VHOST_IOTLB_MISS, 3199 }, 3200 }, 3201 }; 3202 3203 ret = send_vhost_message(dev, dev->slave_req_fd, &ctx); 3204 if (ret < 0) { 3205 VHOST_LOG_CONFIG(ERR, "(%s) failed to send IOTLB miss message (%d)\n", 3206 dev->ifname, ret); 3207 return ret; 3208 } 3209 3210 return 0; 3211 } 3212 3213 static int 3214 vhost_user_slave_config_change(struct virtio_net *dev, bool need_reply) 3215 { 3216 int ret; 3217 struct vhu_msg_context ctx = { 3218 .msg = { 3219 .request.slave = VHOST_USER_SLAVE_CONFIG_CHANGE_MSG, 3220 .flags = VHOST_USER_VERSION, 3221 .size = 0, 3222 } 3223 }; 3224 3225 if (need_reply) 3226 ctx.msg.flags |= VHOST_USER_NEED_REPLY; 3227 3228 ret = send_vhost_slave_message(dev, &ctx); 3229 if (ret < 0) { 3230 VHOST_LOG_CONFIG(ERR, "(%s) failed to send config change (%d)\n", 3231 dev->ifname, ret); 3232 return ret; 3233 } 3234 3235 return process_slave_message_reply(dev, &ctx); 3236 } 3237 3238 int 3239 rte_vhost_slave_config_change(int vid, bool need_reply) 3240 { 3241 struct virtio_net *dev; 3242 3243 dev = get_device(vid); 3244 if (!dev) 3245 return -ENODEV; 3246 3247 return vhost_user_slave_config_change(dev, need_reply); 3248 } 3249 3250 static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev, 3251 int index, int fd, 3252 uint64_t offset, 3253 uint64_t size) 3254 { 3255 int ret; 3256 struct vhu_msg_context ctx = { 3257 .msg = { 3258 .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, 3259 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, 3260 .size = sizeof(ctx.msg.payload.area), 3261 .payload.area = { 3262 .u64 = index & VHOST_USER_VRING_IDX_MASK, 3263 .size = size, 3264 .offset = offset, 3265 }, 3266 }, 3267 }; 3268 3269 if (fd < 0) 3270 ctx.msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 3271 else { 3272 ctx.fds[0] = fd; 3273 ctx.fd_num = 1; 3274 } 3275 3276 ret = send_vhost_slave_message(dev, &ctx); 3277 if (ret < 0) { 3278 VHOST_LOG_CONFIG(ERR, "(%s) failed to set host notifier (%d)\n", 3279 dev->ifname, ret); 3280 return ret; 3281 } 3282 3283 return process_slave_message_reply(dev, &ctx); 3284 } 3285 3286 int rte_vhost_host_notifier_ctrl(int vid, uint16_t qid, bool enable) 3287 { 3288 struct virtio_net *dev; 3289 struct rte_vdpa_device *vdpa_dev; 3290 int vfio_device_fd, ret = 0; 3291 uint64_t offset, size; 3292 unsigned int i, q_start, q_last; 3293 3294 dev = get_device(vid); 3295 if (!dev) 3296 return -ENODEV; 3297 3298 vdpa_dev = dev->vdpa_dev; 3299 if (vdpa_dev == NULL) 3300 return -ENODEV; 3301 3302 if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || 3303 !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) || 3304 !(dev->protocol_features & 3305 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) || 3306 !(dev->protocol_features & 3307 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) || 3308 !(dev->protocol_features & 3309 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) 3310 return -ENOTSUP; 3311 3312 if (qid == RTE_VHOST_QUEUE_ALL) { 3313 q_start = 0; 3314 q_last = dev->nr_vring - 1; 3315 } else { 3316 if (qid >= dev->nr_vring) 3317 return -EINVAL; 3318 q_start = qid; 3319 q_last = qid; 3320 } 3321 3322 RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP); 3323 RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP); 3324 3325 vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); 3326 if (vfio_device_fd < 0) 3327 return -ENOTSUP; 3328 3329 if (enable) { 3330 for (i = q_start; i <= q_last; i++) { 3331 if (vdpa_dev->ops->get_notify_area(vid, i, &offset, 3332 &size) < 0) { 3333 ret = -ENOTSUP; 3334 goto disable; 3335 } 3336 3337 if (vhost_user_slave_set_vring_host_notifier(dev, i, 3338 vfio_device_fd, offset, size) < 0) { 3339 ret = -EFAULT; 3340 goto disable; 3341 } 3342 } 3343 } else { 3344 disable: 3345 for (i = q_start; i <= q_last; i++) { 3346 vhost_user_slave_set_vring_host_notifier(dev, i, -1, 3347 0, 0); 3348 } 3349 } 3350 3351 return ret; 3352 } 3353