xref: /f-stack/dpdk/lib/librte_vhost/socket.c (revision ebf5cedb)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <limits.h>
8 #include <stdlib.h>
9 #include <unistd.h>
10 #include <string.h>
11 #include <sys/types.h>
12 #include <sys/socket.h>
13 #include <sys/un.h>
14 #include <sys/queue.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <pthread.h>
18 
19 #include <rte_log.h>
20 
21 #include "fd_man.h"
22 #include "vhost.h"
23 #include "vhost_user.h"
24 
25 
26 TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);
27 
28 /*
29  * Every time rte_vhost_driver_register() is invoked, an associated
30  * vhost_user_socket struct will be created.
31  */
32 struct vhost_user_socket {
33 	struct vhost_user_connection_list conn_list;
34 	pthread_mutex_t conn_mutex;
35 	char *path;
36 	int socket_fd;
37 	struct sockaddr_un un;
38 	bool is_server;
39 	bool reconnect;
40 	bool dequeue_zero_copy;
41 	bool iommu_support;
42 	bool use_builtin_virtio_net;
43 	bool extbuf;
44 	bool linearbuf;
45 
46 	/*
47 	 * The "supported_features" indicates the feature bits the
48 	 * vhost driver supports. The "features" indicates the feature
49 	 * bits after the rte_vhost_driver_features_disable/enable().
50 	 * It is also the final feature bits used for vhost-user
51 	 * features negotiation.
52 	 */
53 	uint64_t supported_features;
54 	uint64_t features;
55 
56 	uint64_t protocol_features;
57 
58 	/*
59 	 * Device id to identify a specific backend device.
60 	 * It's set to -1 for the default software implementation.
61 	 * If valid, one socket can have 1 connection only.
62 	 */
63 	int vdpa_dev_id;
64 
65 	struct vhost_device_ops const *notify_ops;
66 };
67 
/* One established vhost-user connection (one vhost device) on a socket. */
struct vhost_user_connection {
	struct vhost_user_socket *vsocket;	/* owning socket */
	int connfd;				/* connected unix-socket fd */
	int vid;				/* device id from vhost_new_device() */

	TAILQ_ENTRY(vhost_user_connection) next; /* link in vsocket->conn_list */
};
75 
#define MAX_VHOST_SOCKET 1024
/* Process-wide registry of all vhost-user sockets managed by this library. */
struct vhost_user {
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	struct fdset fdset;	/* fds polled by the event dispatch thread */
	int vsocket_cnt;
	pthread_mutex_t mutex;	/* protects vsockets[] and vsocket_cnt */
};

#define MAX_VIRTIO_BACKLOG 128	/* listen() backlog for server sockets */

static void vhost_user_server_new_connection(int fd, void *data, int *remove);
static void vhost_user_read_cb(int fd, void *dat, int *remove);
static int create_unix_socket(struct vhost_user_socket *vsocket);
static int vhost_user_start_client(struct vhost_user_socket *vsocket);

/* The single instance of the registry; all entries start empty (-1 fds). */
static struct vhost_user vhost_user = {
	.fdset = {
		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
		.fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
		.num = 0
	},
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};
101 
102 /*
103  * return bytes# of read on success or negative val on failure. Update fdnum
104  * with number of fds read.
105  */
106 int
107 read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
108 		int *fd_num)
109 {
110 	struct iovec iov;
111 	struct msghdr msgh;
112 	char control[CMSG_SPACE(max_fds * sizeof(int))];
113 	struct cmsghdr *cmsg;
114 	int got_fds = 0;
115 	int ret;
116 
117 	*fd_num = 0;
118 
119 	memset(&msgh, 0, sizeof(msgh));
120 	iov.iov_base = buf;
121 	iov.iov_len  = buflen;
122 
123 	msgh.msg_iov = &iov;
124 	msgh.msg_iovlen = 1;
125 	msgh.msg_control = control;
126 	msgh.msg_controllen = sizeof(control);
127 
128 	ret = recvmsg(sockfd, &msgh, 0);
129 	if (ret <= 0) {
130 		if (ret)
131 			RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
132 		return ret;
133 	}
134 
135 	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
136 		RTE_LOG(ERR, VHOST_CONFIG, "truncated msg\n");
137 		return -1;
138 	}
139 
140 	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
141 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
142 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
143 			(cmsg->cmsg_type == SCM_RIGHTS)) {
144 			got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
145 			*fd_num = got_fds;
146 			memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
147 			break;
148 		}
149 	}
150 
151 	/* Clear out unused file descriptors */
152 	while (got_fds < max_fds)
153 		fds[got_fds++] = -1;
154 
155 	return ret;
156 }
157 
158 int
159 send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
160 {
161 
162 	struct iovec iov;
163 	struct msghdr msgh;
164 	size_t fdsize = fd_num * sizeof(int);
165 	char control[CMSG_SPACE(fdsize)];
166 	struct cmsghdr *cmsg;
167 	int ret;
168 
169 	memset(&msgh, 0, sizeof(msgh));
170 	iov.iov_base = buf;
171 	iov.iov_len = buflen;
172 
173 	msgh.msg_iov = &iov;
174 	msgh.msg_iovlen = 1;
175 
176 	if (fds && fd_num > 0) {
177 		msgh.msg_control = control;
178 		msgh.msg_controllen = sizeof(control);
179 		cmsg = CMSG_FIRSTHDR(&msgh);
180 		if (cmsg == NULL) {
181 			RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n");
182 			errno = EINVAL;
183 			return -1;
184 		}
185 		cmsg->cmsg_len = CMSG_LEN(fdsize);
186 		cmsg->cmsg_level = SOL_SOCKET;
187 		cmsg->cmsg_type = SCM_RIGHTS;
188 		memcpy(CMSG_DATA(cmsg), fds, fdsize);
189 	} else {
190 		msgh.msg_control = NULL;
191 		msgh.msg_controllen = 0;
192 	}
193 
194 	do {
195 		ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
196 	} while (ret < 0 && errno == EINTR);
197 
198 	if (ret < 0) {
199 		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
200 		return ret;
201 	}
202 
203 	return ret;
204 }
205 
/*
 * Bind an accepted/connected fd to a freshly created vhost device and
 * register it with the fdset so vhost_user_read_cb() handles subsequent
 * messages. Takes ownership of fd: it is closed on every failure path.
 */
static void
vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
{
	int vid;
	size_t size;
	struct vhost_user_connection *conn;
	int ret;

	if (vsocket == NULL)
		return;

	conn = malloc(sizeof(*conn));
	if (conn == NULL) {
		close(fd);
		return;
	}

	vid = vhost_new_device();
	if (vid == -1) {
		goto err;
	}

	/* Propagate per-socket options onto the new device. */
	size = strnlen(vsocket->path, PATH_MAX);
	vhost_set_ifname(vid, vsocket->path, size);

	vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);

	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev_id);

	if (vsocket->dequeue_zero_copy)
		vhost_enable_dequeue_zero_copy(vid);

	if (vsocket->extbuf)
		vhost_enable_extbuf(vid);

	if (vsocket->linearbuf)
		vhost_enable_linearbuf(vid);

	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);

	/* Give the application a chance to veto the connection. */
	if (vsocket->notify_ops->new_connection) {
		ret = vsocket->notify_ops->new_connection(vid);
		if (ret < 0) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"failed to add vhost user connection with fd %d\n",
				fd);
			goto err_cleanup;
		}
	}

	conn->connfd = fd;
	conn->vsocket = vsocket;
	conn->vid = vid;
	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
			NULL, conn);
	if (ret < 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to add fd %d into vhost server fdset\n",
			fd);

		/* Undo the new_connection() notification made above. */
		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		goto err_cleanup;
	}

	pthread_mutex_lock(&vsocket->conn_mutex);
	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
	pthread_mutex_unlock(&vsocket->conn_mutex);

	/* Wake the dispatch thread so it starts polling the new fd. */
	fdset_pipe_notify(&vhost_user.fdset);
	return;

err_cleanup:
	vhost_destroy_device(vid);
err:
	free(conn);
	close(fd);
}
285 
286 /* call back when there is new vhost-user connection from client  */
287 static void
288 vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
289 {
290 	struct vhost_user_socket *vsocket = dat;
291 
292 	fd = accept(fd, NULL, NULL);
293 	if (fd < 0)
294 		return;
295 
296 	RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
297 	vhost_user_add_connection(fd, vsocket);
298 }
299 
/*
 * fdset callback for established connections: handle one vhost-user
 * message. On failure, tear the connection down and — in client mode
 * with reconnect enabled — re-create the socket and queue a reconnect.
 */
static void
vhost_user_read_cb(int connfd, void *dat, int *remove)
{
	struct vhost_user_connection *conn = dat;
	struct vhost_user_socket *vsocket = conn->vsocket;
	int ret;

	ret = vhost_user_msg_handler(conn->vid, connfd);
	if (ret < 0) {
		struct virtio_net *dev = get_device(conn->vid);

		close(connfd);
		*remove = 1;	/* tell the fdset to drop this fd */

		if (dev)
			vhost_destroy_device_notify(dev);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		vhost_destroy_device(conn->vid);

		if (vsocket->reconnect) {
			create_unix_socket(vsocket);
			vhost_user_start_client(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
		pthread_mutex_unlock(&vsocket->conn_mutex);

		free(conn);
	}
}
334 
335 static int
336 create_unix_socket(struct vhost_user_socket *vsocket)
337 {
338 	int fd;
339 	struct sockaddr_un *un = &vsocket->un;
340 
341 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
342 	if (fd < 0)
343 		return -1;
344 	RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
345 		vsocket->is_server ? "server" : "client", fd);
346 
347 	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
348 		RTE_LOG(ERR, VHOST_CONFIG,
349 			"vhost-user: can't set nonblocking mode for socket, fd: "
350 			"%d (%s)\n", fd, strerror(errno));
351 		close(fd);
352 		return -1;
353 	}
354 
355 	memset(un, 0, sizeof(*un));
356 	un->sun_family = AF_UNIX;
357 	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
358 	un->sun_path[sizeof(un->sun_path) - 1] = '\0';
359 
360 	vsocket->socket_fd = fd;
361 	return 0;
362 }
363 
364 static int
365 vhost_user_start_server(struct vhost_user_socket *vsocket)
366 {
367 	int ret;
368 	int fd = vsocket->socket_fd;
369 	const char *path = vsocket->path;
370 
371 	/*
372 	 * bind () may fail if the socket file with the same name already
373 	 * exists. But the library obviously should not delete the file
374 	 * provided by the user, since we can not be sure that it is not
375 	 * being used by other applications. Moreover, many applications form
376 	 * socket names based on user input, which is prone to errors.
377 	 *
378 	 * The user must ensure that the socket does not exist before
379 	 * registering the vhost driver in server mode.
380 	 */
381 	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
382 	if (ret < 0) {
383 		RTE_LOG(ERR, VHOST_CONFIG,
384 			"failed to bind to %s: %s; remove it and try again\n",
385 			path, strerror(errno));
386 		goto err;
387 	}
388 	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
389 
390 	ret = listen(fd, MAX_VIRTIO_BACKLOG);
391 	if (ret < 0)
392 		goto err;
393 
394 	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
395 		  NULL, vsocket);
396 	if (ret < 0) {
397 		RTE_LOG(ERR, VHOST_CONFIG,
398 			"failed to add listen fd %d to vhost server fdset\n",
399 			fd);
400 		goto err;
401 	}
402 
403 	return 0;
404 
405 err:
406 	close(fd);
407 	return -1;
408 }
409 
/* A pending client reconnection attempt, serviced by the reconnect thread. */
struct vhost_user_reconnect {
	struct sockaddr_un un;	/* target address to connect to */
	int fd;			/* socket fd being (re)connected */
	struct vhost_user_socket *vsocket;

	TAILQ_ENTRY(vhost_user_reconnect) next;
};

TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;	/* protects head */
};

/* Shared between vhost_user_start_client() and the reconnect thread. */
static struct vhost_user_reconnect_list reconn_list;
static pthread_t reconn_tid;	/* lazily created by vhost_user_reconnect_init() */
426 
427 static int
428 vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
429 {
430 	int ret, flags;
431 
432 	ret = connect(fd, un, sz);
433 	if (ret < 0 && errno != EISCONN)
434 		return -1;
435 
436 	flags = fcntl(fd, F_GETFL, 0);
437 	if (flags < 0) {
438 		RTE_LOG(ERR, VHOST_CONFIG,
439 			"can't get flags for connfd %d\n", fd);
440 		return -2;
441 	}
442 	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
443 		RTE_LOG(ERR, VHOST_CONFIG,
444 				"can't disable nonblocking on fd %d\n", fd);
445 		return -2;
446 	}
447 	return 0;
448 }
449 
/*
 * Reconnect thread body: once a second, retry every pending entry in
 * reconn_list. Entries are removed on success or fatal (-2) error; a
 * plain connect failure (-1) keeps the entry for the next round.
 */
static void *
vhost_user_client_reconnect(void *arg __rte_unused)
{
	int ret;
	struct vhost_user_reconnect *reconn, *next;

	while (1) {
		pthread_mutex_lock(&reconn_list.mutex);

		/*
		 * An equal implementation of TAILQ_FOREACH_SAFE,
		 * which does not exist on all platforms.
		 */
		for (reconn = TAILQ_FIRST(&reconn_list.head);
		     reconn != NULL; reconn = next) {
			next = TAILQ_NEXT(reconn, next);

			ret = vhost_user_connect_nonblock(reconn->fd,
						(struct sockaddr *)&reconn->un,
						sizeof(reconn->un));
			if (ret == -2) {
				/* Unrecoverable: drop the fd and the entry. */
				close(reconn->fd);
				RTE_LOG(ERR, VHOST_CONFIG,
					"reconnection for fd %d failed\n",
					reconn->fd);
				goto remove_fd;
			}
			if (ret == -1)
				continue;

			RTE_LOG(INFO, VHOST_CONFIG,
				"%s: connected\n", reconn->vsocket->path);
			vhost_user_add_connection(reconn->fd, reconn->vsocket);
remove_fd:
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			free(reconn);
		}

		pthread_mutex_unlock(&reconn_list.mutex);
		sleep(1);
	}

	return NULL;
}
494 
495 static int
496 vhost_user_reconnect_init(void)
497 {
498 	int ret;
499 
500 	ret = pthread_mutex_init(&reconn_list.mutex, NULL);
501 	if (ret < 0) {
502 		RTE_LOG(ERR, VHOST_CONFIG, "failed to initialize mutex");
503 		return ret;
504 	}
505 	TAILQ_INIT(&reconn_list.head);
506 
507 	ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
508 			     vhost_user_client_reconnect, NULL);
509 	if (ret != 0) {
510 		RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
511 		if (pthread_mutex_destroy(&reconn_list.mutex)) {
512 			RTE_LOG(ERR, VHOST_CONFIG,
513 				"failed to destroy reconnect mutex");
514 		}
515 	}
516 
517 	return ret;
518 }
519 
/*
 * Client mode: try to connect socket_fd to the server. If the attempt
 * fails non-fatally and reconnect is enabled, queue the fd on
 * reconn_list so the reconnect thread keeps retrying in the background.
 * Returns 0 on success or when queued for reconnection, -1 otherwise.
 */
static int
vhost_user_start_client(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;
	struct vhost_user_reconnect *reconn;

	ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
					  sizeof(vsocket->un));
	if (ret == 0) {
		vhost_user_add_connection(fd, vsocket);
		return 0;
	}

	RTE_LOG(WARNING, VHOST_CONFIG,
		"failed to connect to %s: %s\n",
		path, strerror(errno));

	/* -2 is a fatal fcntl() error: retrying would not help. */
	if (ret == -2 || !vsocket->reconnect) {
		close(fd);
		return -1;
	}

	RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path);
	reconn = malloc(sizeof(*reconn));
	if (reconn == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to allocate memory for reconnect\n");
		close(fd);
		return -1;
	}
	reconn->un = vsocket->un;
	reconn->fd = fd;
	reconn->vsocket = vsocket;
	pthread_mutex_lock(&reconn_list.mutex);
	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
	pthread_mutex_unlock(&reconn_list.mutex);

	return 0;
}
561 
562 static struct vhost_user_socket *
563 find_vhost_user_socket(const char *path)
564 {
565 	int i;
566 
567 	if (path == NULL)
568 		return NULL;
569 
570 	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
571 		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
572 
573 		if (!strcmp(vsocket->path, path))
574 			return vsocket;
575 	}
576 
577 	return NULL;
578 }
579 
580 int
581 rte_vhost_driver_attach_vdpa_device(const char *path, int did)
582 {
583 	struct vhost_user_socket *vsocket;
584 
585 	if (rte_vdpa_get_device(did) == NULL || path == NULL)
586 		return -1;
587 
588 	pthread_mutex_lock(&vhost_user.mutex);
589 	vsocket = find_vhost_user_socket(path);
590 	if (vsocket)
591 		vsocket->vdpa_dev_id = did;
592 	pthread_mutex_unlock(&vhost_user.mutex);
593 
594 	return vsocket ? 0 : -1;
595 }
596 
597 int
598 rte_vhost_driver_detach_vdpa_device(const char *path)
599 {
600 	struct vhost_user_socket *vsocket;
601 
602 	pthread_mutex_lock(&vhost_user.mutex);
603 	vsocket = find_vhost_user_socket(path);
604 	if (vsocket)
605 		vsocket->vdpa_dev_id = -1;
606 	pthread_mutex_unlock(&vhost_user.mutex);
607 
608 	return vsocket ? 0 : -1;
609 }
610 
611 int
612 rte_vhost_driver_get_vdpa_device_id(const char *path)
613 {
614 	struct vhost_user_socket *vsocket;
615 	int did = -1;
616 
617 	pthread_mutex_lock(&vhost_user.mutex);
618 	vsocket = find_vhost_user_socket(path);
619 	if (vsocket)
620 		did = vsocket->vdpa_dev_id;
621 	pthread_mutex_unlock(&vhost_user.mutex);
622 
623 	return did;
624 }
625 
626 int
627 rte_vhost_driver_disable_features(const char *path, uint64_t features)
628 {
629 	struct vhost_user_socket *vsocket;
630 
631 	pthread_mutex_lock(&vhost_user.mutex);
632 	vsocket = find_vhost_user_socket(path);
633 
634 	/* Note that use_builtin_virtio_net is not affected by this function
635 	 * since callers may want to selectively disable features of the
636 	 * built-in vhost net device backend.
637 	 */
638 
639 	if (vsocket)
640 		vsocket->features &= ~features;
641 	pthread_mutex_unlock(&vhost_user.mutex);
642 
643 	return vsocket ? 0 : -1;
644 }
645 
646 int
647 rte_vhost_driver_enable_features(const char *path, uint64_t features)
648 {
649 	struct vhost_user_socket *vsocket;
650 
651 	pthread_mutex_lock(&vhost_user.mutex);
652 	vsocket = find_vhost_user_socket(path);
653 	if (vsocket) {
654 		if ((vsocket->supported_features & features) != features) {
655 			/*
656 			 * trying to enable features the driver doesn't
657 			 * support.
658 			 */
659 			pthread_mutex_unlock(&vhost_user.mutex);
660 			return -1;
661 		}
662 		vsocket->features |= features;
663 	}
664 	pthread_mutex_unlock(&vhost_user.mutex);
665 
666 	return vsocket ? 0 : -1;
667 }
668 
669 int
670 rte_vhost_driver_set_features(const char *path, uint64_t features)
671 {
672 	struct vhost_user_socket *vsocket;
673 
674 	pthread_mutex_lock(&vhost_user.mutex);
675 	vsocket = find_vhost_user_socket(path);
676 	if (vsocket) {
677 		vsocket->supported_features = features;
678 		vsocket->features = features;
679 
680 		/* Anyone setting feature bits is implementing their own vhost
681 		 * device backend.
682 		 */
683 		vsocket->use_builtin_virtio_net = false;
684 	}
685 	pthread_mutex_unlock(&vhost_user.mutex);
686 
687 	return vsocket ? 0 : -1;
688 }
689 
690 int
691 rte_vhost_driver_get_features(const char *path, uint64_t *features)
692 {
693 	struct vhost_user_socket *vsocket;
694 	uint64_t vdpa_features;
695 	struct rte_vdpa_device *vdpa_dev;
696 	int did = -1;
697 	int ret = 0;
698 
699 	pthread_mutex_lock(&vhost_user.mutex);
700 	vsocket = find_vhost_user_socket(path);
701 	if (!vsocket) {
702 		RTE_LOG(ERR, VHOST_CONFIG,
703 			"socket file %s is not registered yet.\n", path);
704 		ret = -1;
705 		goto unlock_exit;
706 	}
707 
708 	did = vsocket->vdpa_dev_id;
709 	vdpa_dev = rte_vdpa_get_device(did);
710 	if (!vdpa_dev || !vdpa_dev->ops->get_features) {
711 		*features = vsocket->features;
712 		goto unlock_exit;
713 	}
714 
715 	if (vdpa_dev->ops->get_features(did, &vdpa_features) < 0) {
716 		RTE_LOG(ERR, VHOST_CONFIG,
717 				"failed to get vdpa features "
718 				"for socket file %s.\n", path);
719 		ret = -1;
720 		goto unlock_exit;
721 	}
722 
723 	*features = vsocket->features & vdpa_features;
724 
725 unlock_exit:
726 	pthread_mutex_unlock(&vhost_user.mutex);
727 	return ret;
728 }
729 
730 int
731 rte_vhost_driver_set_protocol_features(const char *path,
732 		uint64_t protocol_features)
733 {
734 	struct vhost_user_socket *vsocket;
735 
736 	pthread_mutex_lock(&vhost_user.mutex);
737 	vsocket = find_vhost_user_socket(path);
738 	if (vsocket)
739 		vsocket->protocol_features = protocol_features;
740 	pthread_mutex_unlock(&vhost_user.mutex);
741 	return vsocket ? 0 : -1;
742 }
743 
744 int
745 rte_vhost_driver_get_protocol_features(const char *path,
746 		uint64_t *protocol_features)
747 {
748 	struct vhost_user_socket *vsocket;
749 	uint64_t vdpa_protocol_features;
750 	struct rte_vdpa_device *vdpa_dev;
751 	int did = -1;
752 	int ret = 0;
753 
754 	pthread_mutex_lock(&vhost_user.mutex);
755 	vsocket = find_vhost_user_socket(path);
756 	if (!vsocket) {
757 		RTE_LOG(ERR, VHOST_CONFIG,
758 			"socket file %s is not registered yet.\n", path);
759 		ret = -1;
760 		goto unlock_exit;
761 	}
762 
763 	did = vsocket->vdpa_dev_id;
764 	vdpa_dev = rte_vdpa_get_device(did);
765 	if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) {
766 		*protocol_features = vsocket->protocol_features;
767 		goto unlock_exit;
768 	}
769 
770 	if (vdpa_dev->ops->get_protocol_features(did,
771 				&vdpa_protocol_features) < 0) {
772 		RTE_LOG(ERR, VHOST_CONFIG,
773 				"failed to get vdpa protocol features "
774 				"for socket file %s.\n", path);
775 		ret = -1;
776 		goto unlock_exit;
777 	}
778 
779 	*protocol_features = vsocket->protocol_features
780 		& vdpa_protocol_features;
781 
782 unlock_exit:
783 	pthread_mutex_unlock(&vhost_user.mutex);
784 	return ret;
785 }
786 
787 int
788 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
789 {
790 	struct vhost_user_socket *vsocket;
791 	uint32_t vdpa_queue_num;
792 	struct rte_vdpa_device *vdpa_dev;
793 	int did = -1;
794 	int ret = 0;
795 
796 	pthread_mutex_lock(&vhost_user.mutex);
797 	vsocket = find_vhost_user_socket(path);
798 	if (!vsocket) {
799 		RTE_LOG(ERR, VHOST_CONFIG,
800 			"socket file %s is not registered yet.\n", path);
801 		ret = -1;
802 		goto unlock_exit;
803 	}
804 
805 	did = vsocket->vdpa_dev_id;
806 	vdpa_dev = rte_vdpa_get_device(did);
807 	if (!vdpa_dev || !vdpa_dev->ops->get_queue_num) {
808 		*queue_num = VHOST_MAX_QUEUE_PAIRS;
809 		goto unlock_exit;
810 	}
811 
812 	if (vdpa_dev->ops->get_queue_num(did, &vdpa_queue_num) < 0) {
813 		RTE_LOG(ERR, VHOST_CONFIG,
814 				"failed to get vdpa queue number "
815 				"for socket file %s.\n", path);
816 		ret = -1;
817 		goto unlock_exit;
818 	}
819 
820 	*queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num);
821 
822 unlock_exit:
823 	pthread_mutex_unlock(&vhost_user.mutex);
824 	return ret;
825 }
826 
827 static void
828 vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
829 {
830 	if (vsocket && vsocket->path) {
831 		free(vsocket->path);
832 		vsocket->path = NULL;
833 	}
834 
835 	if (vsocket) {
836 		free(vsocket);
837 		vsocket = NULL;
838 	}
839 }
840 
841 /*
842  * Register a new vhost-user socket; here we could act as server
843  * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
844  * is set.
845  */
846 int
847 rte_vhost_driver_register(const char *path, uint64_t flags)
848 {
849 	int ret = -1;
850 	struct vhost_user_socket *vsocket;
851 
852 	if (!path)
853 		return -1;
854 
855 	pthread_mutex_lock(&vhost_user.mutex);
856 
857 	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
858 		RTE_LOG(ERR, VHOST_CONFIG,
859 			"error: the number of vhost sockets reaches maximum\n");
860 		goto out;
861 	}
862 
863 	vsocket = malloc(sizeof(struct vhost_user_socket));
864 	if (!vsocket)
865 		goto out;
866 	memset(vsocket, 0, sizeof(struct vhost_user_socket));
867 	vsocket->path = strdup(path);
868 	if (vsocket->path == NULL) {
869 		RTE_LOG(ERR, VHOST_CONFIG,
870 			"error: failed to copy socket path string\n");
871 		vhost_user_socket_mem_free(vsocket);
872 		goto out;
873 	}
874 	TAILQ_INIT(&vsocket->conn_list);
875 	ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
876 	if (ret) {
877 		RTE_LOG(ERR, VHOST_CONFIG,
878 			"error: failed to init connection mutex\n");
879 		goto out_free;
880 	}
881 	vsocket->vdpa_dev_id = -1;
882 	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
883 	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
884 	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
885 
886 	if (vsocket->dequeue_zero_copy &&
887 	    (flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
888 		RTE_LOG(ERR, VHOST_CONFIG,
889 			"error: enabling dequeue zero copy and IOMMU features "
890 			"simultaneously is not supported\n");
891 		goto out_mutex;
892 	}
893 
894 	/*
895 	 * Set the supported features correctly for the builtin vhost-user
896 	 * net driver.
897 	 *
898 	 * Applications know nothing about features the builtin virtio net
899 	 * driver (virtio_net.c) supports, thus it's not possible for them
900 	 * to invoke rte_vhost_driver_set_features(). To workaround it, here
901 	 * we set it unconditionally. If the application want to implement
902 	 * another vhost-user driver (say SCSI), it should call the
903 	 * rte_vhost_driver_set_features(), which will overwrite following
904 	 * two values.
905 	 */
906 	vsocket->use_builtin_virtio_net = true;
907 	vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
908 	vsocket->features           = VIRTIO_NET_SUPPORTED_FEATURES;
909 	vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;
910 
911 	/*
912 	 * Dequeue zero copy can't assure descriptors returned in order.
913 	 * Also, it requires that the guest memory is populated, which is
914 	 * not compatible with postcopy.
915 	 */
916 	if (vsocket->dequeue_zero_copy) {
917 		if (vsocket->extbuf) {
918 			RTE_LOG(ERR, VHOST_CONFIG,
919 			"error: zero copy is incompatible with external buffers\n");
920 			ret = -1;
921 			goto out_mutex;
922 		}
923 		if (vsocket->linearbuf) {
924 			RTE_LOG(ERR, VHOST_CONFIG,
925 			"error: zero copy is incompatible with linear buffers\n");
926 			ret = -1;
927 			goto out_mutex;
928 		}
929 		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
930 		vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
931 
932 		RTE_LOG(INFO, VHOST_CONFIG,
933 			"Dequeue zero copy requested, disabling postcopy support\n");
934 		vsocket->protocol_features &=
935 			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
936 	}
937 
938 	/*
939 	 * We'll not be able to receive a buffer from guest in linear mode
940 	 * without external buffer if it will not fit in a single mbuf, which is
941 	 * likely if segmentation offloading enabled.
942 	 */
943 	if (vsocket->linearbuf && !vsocket->extbuf) {
944 		uint64_t seg_offload_features =
945 				(1ULL << VIRTIO_NET_F_HOST_TSO4) |
946 				(1ULL << VIRTIO_NET_F_HOST_TSO6) |
947 				(1ULL << VIRTIO_NET_F_HOST_UFO);
948 
949 		RTE_LOG(INFO, VHOST_CONFIG,
950 			"Linear buffers requested without external buffers, "
951 			"disabling host segmentation offloading support\n");
952 		vsocket->supported_features &= ~seg_offload_features;
953 		vsocket->features &= ~seg_offload_features;
954 	}
955 
956 	if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
957 		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
958 		vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
959 	}
960 
961 	if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
962 		vsocket->protocol_features &=
963 			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
964 	} else {
965 #ifndef RTE_LIBRTE_VHOST_POSTCOPY
966 		RTE_LOG(ERR, VHOST_CONFIG,
967 			"Postcopy requested but not compiled\n");
968 		ret = -1;
969 		goto out_mutex;
970 #endif
971 	}
972 
973 	if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
974 		vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
975 		if (vsocket->reconnect && reconn_tid == 0) {
976 			if (vhost_user_reconnect_init() != 0)
977 				goto out_mutex;
978 		}
979 	} else {
980 		vsocket->is_server = true;
981 	}
982 	ret = create_unix_socket(vsocket);
983 	if (ret < 0) {
984 		goto out_mutex;
985 	}
986 
987 	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
988 
989 	pthread_mutex_unlock(&vhost_user.mutex);
990 	return ret;
991 
992 out_mutex:
993 	if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
994 		RTE_LOG(ERR, VHOST_CONFIG,
995 			"error: failed to destroy connection mutex\n");
996 	}
997 out_free:
998 	vhost_user_socket_mem_free(vsocket);
999 out:
1000 	pthread_mutex_unlock(&vhost_user.mutex);
1001 
1002 	return ret;
1003 }
1004 
1005 static bool
1006 vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
1007 {
1008 	int found = false;
1009 	struct vhost_user_reconnect *reconn, *next;
1010 
1011 	pthread_mutex_lock(&reconn_list.mutex);
1012 
1013 	for (reconn = TAILQ_FIRST(&reconn_list.head);
1014 	     reconn != NULL; reconn = next) {
1015 		next = TAILQ_NEXT(reconn, next);
1016 
1017 		if (reconn->vsocket == vsocket) {
1018 			TAILQ_REMOVE(&reconn_list.head, reconn, next);
1019 			close(reconn->fd);
1020 			free(reconn);
1021 			found = true;
1022 			break;
1023 		}
1024 	}
1025 	pthread_mutex_unlock(&reconn_list.mutex);
1026 	return found;
1027 }
1028 
1029 /**
1030  * Unregister the specified vhost socket
1031  */
1032 int
1033 rte_vhost_driver_unregister(const char *path)
1034 {
1035 	int i;
1036 	int count;
1037 	struct vhost_user_connection *conn, *next;
1038 
1039 	if (path == NULL)
1040 		return -1;
1041 
1042 again:
1043 	pthread_mutex_lock(&vhost_user.mutex);
1044 
1045 	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
1046 		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
1047 
1048 		if (!strcmp(vsocket->path, path)) {
1049 			pthread_mutex_lock(&vsocket->conn_mutex);
1050 			for (conn = TAILQ_FIRST(&vsocket->conn_list);
1051 			     conn != NULL;
1052 			     conn = next) {
1053 				next = TAILQ_NEXT(conn, next);
1054 
1055 				/*
1056 				 * If r/wcb is executing, release vsocket's
1057 				 * conn_mutex and vhost_user's mutex locks, and
1058 				 * try again since the r/wcb may use the
1059 				 * conn_mutex and mutex locks.
1060 				 */
1061 				if (fdset_try_del(&vhost_user.fdset,
1062 						  conn->connfd) == -1) {
1063 					pthread_mutex_unlock(
1064 							&vsocket->conn_mutex);
1065 					pthread_mutex_unlock(&vhost_user.mutex);
1066 					goto again;
1067 				}
1068 
1069 				RTE_LOG(INFO, VHOST_CONFIG,
1070 					"free connfd = %d for device '%s'\n",
1071 					conn->connfd, path);
1072 				close(conn->connfd);
1073 				vhost_destroy_device(conn->vid);
1074 				TAILQ_REMOVE(&vsocket->conn_list, conn, next);
1075 				free(conn);
1076 			}
1077 			pthread_mutex_unlock(&vsocket->conn_mutex);
1078 
1079 			if (vsocket->is_server) {
1080 				/*
1081 				 * If r/wcb is executing, release vhost_user's
1082 				 * mutex lock, and try again since the r/wcb
1083 				 * may use the mutex lock.
1084 				 */
1085 				if (fdset_try_del(&vhost_user.fdset,
1086 						vsocket->socket_fd) == -1) {
1087 					pthread_mutex_unlock(&vhost_user.mutex);
1088 					goto again;
1089 				}
1090 
1091 				close(vsocket->socket_fd);
1092 				unlink(path);
1093 			} else if (vsocket->reconnect) {
1094 				vhost_user_remove_reconnect(vsocket);
1095 			}
1096 
1097 			pthread_mutex_destroy(&vsocket->conn_mutex);
1098 			vhost_user_socket_mem_free(vsocket);
1099 
1100 			count = --vhost_user.vsocket_cnt;
1101 			vhost_user.vsockets[i] = vhost_user.vsockets[count];
1102 			vhost_user.vsockets[count] = NULL;
1103 			pthread_mutex_unlock(&vhost_user.mutex);
1104 
1105 			return 0;
1106 		}
1107 	}
1108 	pthread_mutex_unlock(&vhost_user.mutex);
1109 
1110 	return -1;
1111 }
1112 
1113 /*
1114  * Register ops so that we can add/remove device to data core.
1115  */
1116 int
1117 rte_vhost_driver_callback_register(const char *path,
1118 	struct vhost_device_ops const * const ops)
1119 {
1120 	struct vhost_user_socket *vsocket;
1121 
1122 	pthread_mutex_lock(&vhost_user.mutex);
1123 	vsocket = find_vhost_user_socket(path);
1124 	if (vsocket)
1125 		vsocket->notify_ops = ops;
1126 	pthread_mutex_unlock(&vhost_user.mutex);
1127 
1128 	return vsocket ? 0 : -1;
1129 }
1130 
1131 struct vhost_device_ops const *
1132 vhost_driver_callback_get(const char *path)
1133 {
1134 	struct vhost_user_socket *vsocket;
1135 
1136 	pthread_mutex_lock(&vhost_user.mutex);
1137 	vsocket = find_vhost_user_socket(path);
1138 	pthread_mutex_unlock(&vhost_user.mutex);
1139 
1140 	return vsocket ? vsocket->notify_ops : NULL;
1141 }
1142 
1143 int
1144 rte_vhost_driver_start(const char *path)
1145 {
1146 	struct vhost_user_socket *vsocket;
1147 	static pthread_t fdset_tid;
1148 
1149 	pthread_mutex_lock(&vhost_user.mutex);
1150 	vsocket = find_vhost_user_socket(path);
1151 	pthread_mutex_unlock(&vhost_user.mutex);
1152 
1153 	if (!vsocket)
1154 		return -1;
1155 
1156 	if (fdset_tid == 0) {
1157 		/**
1158 		 * create a pipe which will be waited by poll and notified to
1159 		 * rebuild the wait list of poll.
1160 		 */
1161 		if (fdset_pipe_init(&vhost_user.fdset) < 0) {
1162 			RTE_LOG(ERR, VHOST_CONFIG,
1163 				"failed to create pipe for vhost fdset\n");
1164 			return -1;
1165 		}
1166 
1167 		int ret = rte_ctrl_thread_create(&fdset_tid,
1168 			"vhost-events", NULL, fdset_event_dispatch,
1169 			&vhost_user.fdset);
1170 		if (ret != 0) {
1171 			RTE_LOG(ERR, VHOST_CONFIG,
1172 				"failed to create fdset handling thread");
1173 
1174 			fdset_pipe_uninit(&vhost_user.fdset);
1175 			return -1;
1176 		}
1177 	}
1178 
1179 	if (vsocket->is_server)
1180 		return vhost_user_start_server(vsocket);
1181 	else
1182 		return vhost_user_start_client(vsocket);
1183 }
1184