xref: /f-stack/dpdk/lib/librte_vhost/socket.c (revision d30ea906)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <limits.h>
8 #include <stdlib.h>
9 #include <unistd.h>
10 #include <string.h>
11 #include <sys/types.h>
12 #include <sys/socket.h>
13 #include <sys/un.h>
14 #include <sys/queue.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <pthread.h>
18 
19 #include <rte_log.h>
20 
21 #include "fd_man.h"
22 #include "vhost.h"
23 #include "vhost_user.h"
24 
25 
26 TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);
27 
28 /*
29  * Every time rte_vhost_driver_register() is invoked, an associated
30  * vhost_user_socket struct will be created.
31  */
struct vhost_user_socket {
	struct vhost_user_connection_list conn_list; /* live connections */
	pthread_mutex_t conn_mutex;	/* guards conn_list */
	char *path;		/* strdup()'d unix domain socket path */
	int socket_fd;		/* listen fd (server) or connect fd (client) */
	struct sockaddr_un un;	/* address used for bind()/connect() */
	bool is_server;		/* true unless RTE_VHOST_USER_CLIENT was set */
	bool reconnect;		/* client mode: background thread keeps retrying */
	bool dequeue_zero_copy;	/* RTE_VHOST_USER_DEQUEUE_ZERO_COPY was set */
	bool iommu_support;
	bool use_builtin_virtio_net;	/* built-in virtio-net backend in use */

	/*
	 * The "supported_features" indicates the feature bits the
	 * vhost driver supports. The "features" indicates the feature
	 * bits after the rte_vhost_driver_features_disable/enable().
	 * It is also the final feature bits used for vhost-user
	 * features negotiation.
	 */
	uint64_t supported_features;
	uint64_t features;

	uint64_t protocol_features;

	/*
	 * Device id to identify a specific backend device.
	 * It's set to -1 for the default software implementation.
	 * If valid, one socket can have 1 connection only.
	 */
	int vdpa_dev_id;

	/* Application callbacks (new_connection, destroy_connection, ...). */
	struct vhost_device_ops const *notify_ops;
};
65 
/* One accepted/established connection, bound to one vhost device (vid). */
struct vhost_user_connection {
	struct vhost_user_socket *vsocket;	/* owning socket */
	int connfd;				/* connected unix socket fd */
	int vid;				/* vhost device handle */

	TAILQ_ENTRY(vhost_user_connection) next;	/* conn_list linkage */
};
73 
#define MAX_VHOST_SOCKET 1024
/*
 * Process-wide bookkeeping: every registered socket plus the shared fdset
 * polled by the event-dispatch thread. "mutex" guards vsockets/vsocket_cnt.
 */
struct vhost_user {
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	struct fdset fdset;
	int vsocket_cnt;
	pthread_mutex_t mutex;
};
81 
82 #define MAX_VIRTIO_BACKLOG 128
83 
84 static void vhost_user_server_new_connection(int fd, void *data, int *remove);
85 static void vhost_user_read_cb(int fd, void *dat, int *remove);
86 static int create_unix_socket(struct vhost_user_socket *vsocket);
87 static int vhost_user_start_client(struct vhost_user_socket *vsocket);
88 
/* The single global instance; all fd slots start out empty (fd == -1). */
static struct vhost_user vhost_user = {
	.fdset = {
		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
		.num = 0
	},
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};
98 
99 /*
100  * return bytes# of read on success or negative val on failure. Update fdnum
101  * with number of fds read.
102  */
103 int
104 read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
105 		int *fd_num)
106 {
107 	struct iovec iov;
108 	struct msghdr msgh;
109 	char control[CMSG_SPACE(max_fds * sizeof(int))];
110 	struct cmsghdr *cmsg;
111 	int got_fds = 0;
112 	int ret;
113 
114 	*fd_num = 0;
115 
116 	memset(&msgh, 0, sizeof(msgh));
117 	iov.iov_base = buf;
118 	iov.iov_len  = buflen;
119 
120 	msgh.msg_iov = &iov;
121 	msgh.msg_iovlen = 1;
122 	msgh.msg_control = control;
123 	msgh.msg_controllen = sizeof(control);
124 
125 	ret = recvmsg(sockfd, &msgh, 0);
126 	if (ret <= 0) {
127 		RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
128 		return ret;
129 	}
130 
131 	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
132 		RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
133 		return -1;
134 	}
135 
136 	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
137 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
138 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
139 			(cmsg->cmsg_type == SCM_RIGHTS)) {
140 			got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
141 			*fd_num = got_fds;
142 			memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
143 			break;
144 		}
145 	}
146 
147 	/* Clear out unused file descriptors */
148 	while (got_fds < max_fds)
149 		fds[got_fds++] = -1;
150 
151 	return ret;
152 }
153 
154 int
155 send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
156 {
157 
158 	struct iovec iov;
159 	struct msghdr msgh;
160 	size_t fdsize = fd_num * sizeof(int);
161 	char control[CMSG_SPACE(fdsize)];
162 	struct cmsghdr *cmsg;
163 	int ret;
164 
165 	memset(&msgh, 0, sizeof(msgh));
166 	iov.iov_base = buf;
167 	iov.iov_len = buflen;
168 
169 	msgh.msg_iov = &iov;
170 	msgh.msg_iovlen = 1;
171 
172 	if (fds && fd_num > 0) {
173 		msgh.msg_control = control;
174 		msgh.msg_controllen = sizeof(control);
175 		cmsg = CMSG_FIRSTHDR(&msgh);
176 		if (cmsg == NULL) {
177 			RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n");
178 			errno = EINVAL;
179 			return -1;
180 		}
181 		cmsg->cmsg_len = CMSG_LEN(fdsize);
182 		cmsg->cmsg_level = SOL_SOCKET;
183 		cmsg->cmsg_type = SCM_RIGHTS;
184 		memcpy(CMSG_DATA(cmsg), fds, fdsize);
185 	} else {
186 		msgh.msg_control = NULL;
187 		msgh.msg_controllen = 0;
188 	}
189 
190 	do {
191 		ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
192 	} while (ret < 0 && errno == EINTR);
193 
194 	if (ret < 0) {
195 		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
196 		return ret;
197 	}
198 
199 	return ret;
200 }
201 
202 static void
203 vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
204 {
205 	int vid;
206 	size_t size;
207 	struct vhost_user_connection *conn;
208 	int ret;
209 
210 	if (vsocket == NULL)
211 		return;
212 
213 	conn = malloc(sizeof(*conn));
214 	if (conn == NULL) {
215 		close(fd);
216 		return;
217 	}
218 
219 	vid = vhost_new_device();
220 	if (vid == -1) {
221 		goto err;
222 	}
223 
224 	size = strnlen(vsocket->path, PATH_MAX);
225 	vhost_set_ifname(vid, vsocket->path, size);
226 
227 	vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);
228 
229 	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev_id);
230 
231 	if (vsocket->dequeue_zero_copy)
232 		vhost_enable_dequeue_zero_copy(vid);
233 
234 	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
235 
236 	if (vsocket->notify_ops->new_connection) {
237 		ret = vsocket->notify_ops->new_connection(vid);
238 		if (ret < 0) {
239 			RTE_LOG(ERR, VHOST_CONFIG,
240 				"failed to add vhost user connection with fd %d\n",
241 				fd);
242 			goto err;
243 		}
244 	}
245 
246 	conn->connfd = fd;
247 	conn->vsocket = vsocket;
248 	conn->vid = vid;
249 	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
250 			NULL, conn);
251 	if (ret < 0) {
252 		RTE_LOG(ERR, VHOST_CONFIG,
253 			"failed to add fd %d into vhost server fdset\n",
254 			fd);
255 
256 		if (vsocket->notify_ops->destroy_connection)
257 			vsocket->notify_ops->destroy_connection(conn->vid);
258 
259 		goto err;
260 	}
261 
262 	pthread_mutex_lock(&vsocket->conn_mutex);
263 	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
264 	pthread_mutex_unlock(&vsocket->conn_mutex);
265 
266 	fdset_pipe_notify(&vhost_user.fdset);
267 	return;
268 
269 err:
270 	free(conn);
271 	close(fd);
272 }
273 
274 /* call back when there is new vhost-user connection from client  */
275 static void
276 vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
277 {
278 	struct vhost_user_socket *vsocket = dat;
279 
280 	fd = accept(fd, NULL, NULL);
281 	if (fd < 0)
282 		return;
283 
284 	RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
285 	vhost_user_add_connection(fd, vsocket);
286 }
287 
288 static void
289 vhost_user_read_cb(int connfd, void *dat, int *remove)
290 {
291 	struct vhost_user_connection *conn = dat;
292 	struct vhost_user_socket *vsocket = conn->vsocket;
293 	int ret;
294 
295 	ret = vhost_user_msg_handler(conn->vid, connfd);
296 	if (ret < 0) {
297 		close(connfd);
298 		*remove = 1;
299 		vhost_destroy_device(conn->vid);
300 
301 		if (vsocket->notify_ops->destroy_connection)
302 			vsocket->notify_ops->destroy_connection(conn->vid);
303 
304 		pthread_mutex_lock(&vsocket->conn_mutex);
305 		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
306 		pthread_mutex_unlock(&vsocket->conn_mutex);
307 
308 		free(conn);
309 
310 		if (vsocket->reconnect) {
311 			create_unix_socket(vsocket);
312 			vhost_user_start_client(vsocket);
313 		}
314 	}
315 }
316 
317 static int
318 create_unix_socket(struct vhost_user_socket *vsocket)
319 {
320 	int fd;
321 	struct sockaddr_un *un = &vsocket->un;
322 
323 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
324 	if (fd < 0)
325 		return -1;
326 	RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
327 		vsocket->is_server ? "server" : "client", fd);
328 
329 	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
330 		RTE_LOG(ERR, VHOST_CONFIG,
331 			"vhost-user: can't set nonblocking mode for socket, fd: "
332 			"%d (%s)\n", fd, strerror(errno));
333 		close(fd);
334 		return -1;
335 	}
336 
337 	memset(un, 0, sizeof(*un));
338 	un->sun_family = AF_UNIX;
339 	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
340 	un->sun_path[sizeof(un->sun_path) - 1] = '\0';
341 
342 	vsocket->socket_fd = fd;
343 	return 0;
344 }
345 
346 static int
347 vhost_user_start_server(struct vhost_user_socket *vsocket)
348 {
349 	int ret;
350 	int fd = vsocket->socket_fd;
351 	const char *path = vsocket->path;
352 
353 	/*
354 	 * bind () may fail if the socket file with the same name already
355 	 * exists. But the library obviously should not delete the file
356 	 * provided by the user, since we can not be sure that it is not
357 	 * being used by other applications. Moreover, many applications form
358 	 * socket names based on user input, which is prone to errors.
359 	 *
360 	 * The user must ensure that the socket does not exist before
361 	 * registering the vhost driver in server mode.
362 	 */
363 	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
364 	if (ret < 0) {
365 		RTE_LOG(ERR, VHOST_CONFIG,
366 			"failed to bind to %s: %s; remove it and try again\n",
367 			path, strerror(errno));
368 		goto err;
369 	}
370 	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
371 
372 	ret = listen(fd, MAX_VIRTIO_BACKLOG);
373 	if (ret < 0)
374 		goto err;
375 
376 	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
377 		  NULL, vsocket);
378 	if (ret < 0) {
379 		RTE_LOG(ERR, VHOST_CONFIG,
380 			"failed to add listen fd %d to vhost server fdset\n",
381 			fd);
382 		goto err;
383 	}
384 
385 	return 0;
386 
387 err:
388 	close(fd);
389 	return -1;
390 }
391 
/* A pending client connection that the reconnect thread keeps retrying. */
struct vhost_user_reconnect {
	struct sockaddr_un un;	/* target address for connect() */
	int fd;			/* socket fd being (re)connected */
	struct vhost_user_socket *vsocket;	/* owning socket */

	TAILQ_ENTRY(vhost_user_reconnect) next;
};

TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
/* List of pending reconnections, shared with the reconnect thread. */
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;	/* guards head */
};

static struct vhost_user_reconnect_list reconn_list;
/* Lazily created by vhost_user_reconnect_init(); 0 while not started. */
static pthread_t reconn_tid;
408 
409 static int
410 vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
411 {
412 	int ret, flags;
413 
414 	ret = connect(fd, un, sz);
415 	if (ret < 0 && errno != EISCONN)
416 		return -1;
417 
418 	flags = fcntl(fd, F_GETFL, 0);
419 	if (flags < 0) {
420 		RTE_LOG(ERR, VHOST_CONFIG,
421 			"can't get flags for connfd %d\n", fd);
422 		return -2;
423 	}
424 	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
425 		RTE_LOG(ERR, VHOST_CONFIG,
426 				"can't disable nonblocking on fd %d\n", fd);
427 		return -2;
428 	}
429 	return 0;
430 }
431 
/*
 * Body of the background reconnect thread: once a second, walk the
 * pending list and retry each connection. Entries that connect (or fail
 * fatally) are removed from the list; retryable failures stay queued.
 */
static void *
vhost_user_client_reconnect(void *arg __rte_unused)
{
	int ret;
	struct vhost_user_reconnect *reconn, *next;

	while (1) {
		pthread_mutex_lock(&reconn_list.mutex);

		/*
		 * An equal implementation of TAILQ_FOREACH_SAFE,
		 * which does not exist on all platforms.
		 */
		for (reconn = TAILQ_FIRST(&reconn_list.head);
		     reconn != NULL; reconn = next) {
			next = TAILQ_NEXT(reconn, next);

			ret = vhost_user_connect_nonblock(reconn->fd,
						(struct sockaddr *)&reconn->un,
						sizeof(reconn->un));
			if (ret == -2) {
				/* Unrecoverable fd error: drop the entry. */
				close(reconn->fd);
				RTE_LOG(ERR, VHOST_CONFIG,
					"reconnection for fd %d failed\n",
					reconn->fd);
				goto remove_fd;
			}
			if (ret == -1)
				/* Not connected yet; try again next round. */
				continue;

			RTE_LOG(INFO, VHOST_CONFIG,
				"%s: connected\n", reconn->vsocket->path);
			vhost_user_add_connection(reconn->fd, reconn->vsocket);
remove_fd:
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			free(reconn);
		}

		pthread_mutex_unlock(&reconn_list.mutex);
		sleep(1);
	}

	return NULL;
}
476 
477 static int
478 vhost_user_reconnect_init(void)
479 {
480 	int ret;
481 
482 	ret = pthread_mutex_init(&reconn_list.mutex, NULL);
483 	if (ret < 0) {
484 		RTE_LOG(ERR, VHOST_CONFIG, "failed to initialize mutex");
485 		return ret;
486 	}
487 	TAILQ_INIT(&reconn_list.head);
488 
489 	ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
490 			     vhost_user_client_reconnect, NULL);
491 	if (ret != 0) {
492 		RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
493 		if (pthread_mutex_destroy(&reconn_list.mutex)) {
494 			RTE_LOG(ERR, VHOST_CONFIG,
495 				"failed to destroy reconnect mutex");
496 		}
497 	}
498 
499 	return ret;
500 }
501 
502 static int
503 vhost_user_start_client(struct vhost_user_socket *vsocket)
504 {
505 	int ret;
506 	int fd = vsocket->socket_fd;
507 	const char *path = vsocket->path;
508 	struct vhost_user_reconnect *reconn;
509 
510 	ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
511 					  sizeof(vsocket->un));
512 	if (ret == 0) {
513 		vhost_user_add_connection(fd, vsocket);
514 		return 0;
515 	}
516 
517 	RTE_LOG(WARNING, VHOST_CONFIG,
518 		"failed to connect to %s: %s\n",
519 		path, strerror(errno));
520 
521 	if (ret == -2 || !vsocket->reconnect) {
522 		close(fd);
523 		return -1;
524 	}
525 
526 	RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path);
527 	reconn = malloc(sizeof(*reconn));
528 	if (reconn == NULL) {
529 		RTE_LOG(ERR, VHOST_CONFIG,
530 			"failed to allocate memory for reconnect\n");
531 		close(fd);
532 		return -1;
533 	}
534 	reconn->un = vsocket->un;
535 	reconn->fd = fd;
536 	reconn->vsocket = vsocket;
537 	pthread_mutex_lock(&reconn_list.mutex);
538 	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
539 	pthread_mutex_unlock(&reconn_list.mutex);
540 
541 	return 0;
542 }
543 
544 static struct vhost_user_socket *
545 find_vhost_user_socket(const char *path)
546 {
547 	int i;
548 
549 	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
550 		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
551 
552 		if (!strcmp(vsocket->path, path))
553 			return vsocket;
554 	}
555 
556 	return NULL;
557 }
558 
559 int
560 rte_vhost_driver_attach_vdpa_device(const char *path, int did)
561 {
562 	struct vhost_user_socket *vsocket;
563 
564 	if (rte_vdpa_get_device(did) == NULL)
565 		return -1;
566 
567 	pthread_mutex_lock(&vhost_user.mutex);
568 	vsocket = find_vhost_user_socket(path);
569 	if (vsocket)
570 		vsocket->vdpa_dev_id = did;
571 	pthread_mutex_unlock(&vhost_user.mutex);
572 
573 	return vsocket ? 0 : -1;
574 }
575 
576 int
577 rte_vhost_driver_detach_vdpa_device(const char *path)
578 {
579 	struct vhost_user_socket *vsocket;
580 
581 	pthread_mutex_lock(&vhost_user.mutex);
582 	vsocket = find_vhost_user_socket(path);
583 	if (vsocket)
584 		vsocket->vdpa_dev_id = -1;
585 	pthread_mutex_unlock(&vhost_user.mutex);
586 
587 	return vsocket ? 0 : -1;
588 }
589 
590 int
591 rte_vhost_driver_get_vdpa_device_id(const char *path)
592 {
593 	struct vhost_user_socket *vsocket;
594 	int did = -1;
595 
596 	pthread_mutex_lock(&vhost_user.mutex);
597 	vsocket = find_vhost_user_socket(path);
598 	if (vsocket)
599 		did = vsocket->vdpa_dev_id;
600 	pthread_mutex_unlock(&vhost_user.mutex);
601 
602 	return did;
603 }
604 
605 int
606 rte_vhost_driver_disable_features(const char *path, uint64_t features)
607 {
608 	struct vhost_user_socket *vsocket;
609 
610 	pthread_mutex_lock(&vhost_user.mutex);
611 	vsocket = find_vhost_user_socket(path);
612 
613 	/* Note that use_builtin_virtio_net is not affected by this function
614 	 * since callers may want to selectively disable features of the
615 	 * built-in vhost net device backend.
616 	 */
617 
618 	if (vsocket)
619 		vsocket->features &= ~features;
620 	pthread_mutex_unlock(&vhost_user.mutex);
621 
622 	return vsocket ? 0 : -1;
623 }
624 
625 int
626 rte_vhost_driver_enable_features(const char *path, uint64_t features)
627 {
628 	struct vhost_user_socket *vsocket;
629 
630 	pthread_mutex_lock(&vhost_user.mutex);
631 	vsocket = find_vhost_user_socket(path);
632 	if (vsocket) {
633 		if ((vsocket->supported_features & features) != features) {
634 			/*
635 			 * trying to enable features the driver doesn't
636 			 * support.
637 			 */
638 			pthread_mutex_unlock(&vhost_user.mutex);
639 			return -1;
640 		}
641 		vsocket->features |= features;
642 	}
643 	pthread_mutex_unlock(&vhost_user.mutex);
644 
645 	return vsocket ? 0 : -1;
646 }
647 
648 int
649 rte_vhost_driver_set_features(const char *path, uint64_t features)
650 {
651 	struct vhost_user_socket *vsocket;
652 
653 	pthread_mutex_lock(&vhost_user.mutex);
654 	vsocket = find_vhost_user_socket(path);
655 	if (vsocket) {
656 		vsocket->supported_features = features;
657 		vsocket->features = features;
658 
659 		/* Anyone setting feature bits is implementing their own vhost
660 		 * device backend.
661 		 */
662 		vsocket->use_builtin_virtio_net = false;
663 	}
664 	pthread_mutex_unlock(&vhost_user.mutex);
665 
666 	return vsocket ? 0 : -1;
667 }
668 
669 int
670 rte_vhost_driver_get_features(const char *path, uint64_t *features)
671 {
672 	struct vhost_user_socket *vsocket;
673 	uint64_t vdpa_features;
674 	struct rte_vdpa_device *vdpa_dev;
675 	int did = -1;
676 	int ret = 0;
677 
678 	pthread_mutex_lock(&vhost_user.mutex);
679 	vsocket = find_vhost_user_socket(path);
680 	if (!vsocket) {
681 		RTE_LOG(ERR, VHOST_CONFIG,
682 			"socket file %s is not registered yet.\n", path);
683 		ret = -1;
684 		goto unlock_exit;
685 	}
686 
687 	did = vsocket->vdpa_dev_id;
688 	vdpa_dev = rte_vdpa_get_device(did);
689 	if (!vdpa_dev || !vdpa_dev->ops->get_features) {
690 		*features = vsocket->features;
691 		goto unlock_exit;
692 	}
693 
694 	if (vdpa_dev->ops->get_features(did, &vdpa_features) < 0) {
695 		RTE_LOG(ERR, VHOST_CONFIG,
696 				"failed to get vdpa features "
697 				"for socket file %s.\n", path);
698 		ret = -1;
699 		goto unlock_exit;
700 	}
701 
702 	*features = vsocket->features & vdpa_features;
703 
704 unlock_exit:
705 	pthread_mutex_unlock(&vhost_user.mutex);
706 	return ret;
707 }
708 
709 int
710 rte_vhost_driver_get_protocol_features(const char *path,
711 		uint64_t *protocol_features)
712 {
713 	struct vhost_user_socket *vsocket;
714 	uint64_t vdpa_protocol_features;
715 	struct rte_vdpa_device *vdpa_dev;
716 	int did = -1;
717 	int ret = 0;
718 
719 	pthread_mutex_lock(&vhost_user.mutex);
720 	vsocket = find_vhost_user_socket(path);
721 	if (!vsocket) {
722 		RTE_LOG(ERR, VHOST_CONFIG,
723 			"socket file %s is not registered yet.\n", path);
724 		ret = -1;
725 		goto unlock_exit;
726 	}
727 
728 	did = vsocket->vdpa_dev_id;
729 	vdpa_dev = rte_vdpa_get_device(did);
730 	if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) {
731 		*protocol_features = vsocket->protocol_features;
732 		goto unlock_exit;
733 	}
734 
735 	if (vdpa_dev->ops->get_protocol_features(did,
736 				&vdpa_protocol_features) < 0) {
737 		RTE_LOG(ERR, VHOST_CONFIG,
738 				"failed to get vdpa protocol features "
739 				"for socket file %s.\n", path);
740 		ret = -1;
741 		goto unlock_exit;
742 	}
743 
744 	*protocol_features = vsocket->protocol_features
745 		& vdpa_protocol_features;
746 
747 unlock_exit:
748 	pthread_mutex_unlock(&vhost_user.mutex);
749 	return ret;
750 }
751 
752 int
753 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
754 {
755 	struct vhost_user_socket *vsocket;
756 	uint32_t vdpa_queue_num;
757 	struct rte_vdpa_device *vdpa_dev;
758 	int did = -1;
759 	int ret = 0;
760 
761 	pthread_mutex_lock(&vhost_user.mutex);
762 	vsocket = find_vhost_user_socket(path);
763 	if (!vsocket) {
764 		RTE_LOG(ERR, VHOST_CONFIG,
765 			"socket file %s is not registered yet.\n", path);
766 		ret = -1;
767 		goto unlock_exit;
768 	}
769 
770 	did = vsocket->vdpa_dev_id;
771 	vdpa_dev = rte_vdpa_get_device(did);
772 	if (!vdpa_dev || !vdpa_dev->ops->get_queue_num) {
773 		*queue_num = VHOST_MAX_QUEUE_PAIRS;
774 		goto unlock_exit;
775 	}
776 
777 	if (vdpa_dev->ops->get_queue_num(did, &vdpa_queue_num) < 0) {
778 		RTE_LOG(ERR, VHOST_CONFIG,
779 				"failed to get vdpa queue number "
780 				"for socket file %s.\n", path);
781 		ret = -1;
782 		goto unlock_exit;
783 	}
784 
785 	*queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num);
786 
787 unlock_exit:
788 	pthread_mutex_unlock(&vhost_user.mutex);
789 	return ret;
790 }
791 
792 static void
793 vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
794 {
795 	if (vsocket && vsocket->path) {
796 		free(vsocket->path);
797 		vsocket->path = NULL;
798 	}
799 
800 	if (vsocket) {
801 		free(vsocket);
802 		vsocket = NULL;
803 	}
804 }
805 
806 /*
807  * Register a new vhost-user socket; here we could act as server
808  * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
809  * is set.
810  */
int
rte_vhost_driver_register(const char *path, uint64_t flags)
{
	int ret = -1;
	struct vhost_user_socket *vsocket;

	if (!path)
		return -1;

	pthread_mutex_lock(&vhost_user.mutex);

	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"error: the number of vhost sockets reaches maximum\n");
		goto out;
	}

	vsocket = malloc(sizeof(struct vhost_user_socket));
	if (!vsocket)
		goto out;
	memset(vsocket, 0, sizeof(struct vhost_user_socket));
	/* Keep a private copy of the path; freed by unregister/mem_free. */
	vsocket->path = strdup(path);
	if (vsocket->path == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"error: failed to copy socket path string\n");
		vhost_user_socket_mem_free(vsocket);
		goto out;
	}
	TAILQ_INIT(&vsocket->conn_list);
	ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"error: failed to init connection mutex\n");
		goto out_free;
	}
	vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

	/*
	 * Set the supported features correctly for the builtin vhost-user
	 * net driver.
	 *
	 * Applications know nothing about features the builtin virtio net
	 * driver (virtio_net.c) supports, thus it's not possible for them
	 * to invoke rte_vhost_driver_set_features(). To workaround it, here
	 * we set it unconditionally. If the application want to implement
	 * another vhost-user driver (say SCSI), it should call the
	 * rte_vhost_driver_set_features(), which will overwrite following
	 * two values.
	 */
	vsocket->use_builtin_virtio_net = true;
	vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
	vsocket->features           = VIRTIO_NET_SUPPORTED_FEATURES;
	vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;

	/*
	 * Dequeue zero copy can't assure descriptors returned in order.
	 * Also, it requires that the guest memory is populated, which is
	 * not compatible with postcopy.
	 */
	if (vsocket->dequeue_zero_copy) {
		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
		vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);

		RTE_LOG(INFO, VHOST_CONFIG,
			"Dequeue zero copy requested, disabling postcopy support\n");
		vsocket->protocol_features &=
			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
	}

	/* Strip IOMMU support unless the application asked for it. */
	if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
		vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
	}

	if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
		vsocket->protocol_features &=
			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
	} else {
#ifndef RTE_LIBRTE_VHOST_POSTCOPY
		/* Postcopy needs build-time support; refuse if missing. */
		RTE_LOG(ERR, VHOST_CONFIG,
			"Postcopy requested but not compiled\n");
		ret = -1;
		goto out_mutex;
#endif
	}

	if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
		vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
		/* Start the reconnect thread lazily on first use. */
		if (vsocket->reconnect && reconn_tid == 0) {
			if (vhost_user_reconnect_init() != 0)
				goto out_mutex;
		}
	} else {
		vsocket->is_server = true;
	}
	ret = create_unix_socket(vsocket);
	if (ret < 0) {
		goto out_mutex;
	}

	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;

	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;

out_mutex:
	if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"error: failed to destroy connection mutex\n");
	}
out_free:
	vhost_user_socket_mem_free(vsocket);
out:
	pthread_mutex_unlock(&vhost_user.mutex);

	return ret;
}
928 
929 static bool
930 vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
931 {
932 	int found = false;
933 	struct vhost_user_reconnect *reconn, *next;
934 
935 	pthread_mutex_lock(&reconn_list.mutex);
936 
937 	for (reconn = TAILQ_FIRST(&reconn_list.head);
938 	     reconn != NULL; reconn = next) {
939 		next = TAILQ_NEXT(reconn, next);
940 
941 		if (reconn->vsocket == vsocket) {
942 			TAILQ_REMOVE(&reconn_list.head, reconn, next);
943 			close(reconn->fd);
944 			free(reconn);
945 			found = true;
946 			break;
947 		}
948 	}
949 	pthread_mutex_unlock(&reconn_list.mutex);
950 	return found;
951 }
952 
953 /**
954  * Unregister the specified vhost socket
955  */
int
rte_vhost_driver_unregister(const char *path)
{
	int i;
	int count;
	struct vhost_user_connection *conn, *next;

	pthread_mutex_lock(&vhost_user.mutex);

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];

		if (!strcmp(vsocket->path, path)) {
again:
			/* Tear down every live connection of this socket. */
			pthread_mutex_lock(&vsocket->conn_mutex);
			for (conn = TAILQ_FIRST(&vsocket->conn_list);
			     conn != NULL;
			     conn = next) {
				next = TAILQ_NEXT(conn, next);

				/*
				 * If r/wcb is executing, release the
				 * conn_mutex lock, and try again since
				 * the r/wcb may use the conn_mutex lock.
				 */
				if (fdset_try_del(&vhost_user.fdset,
						  conn->connfd) == -1) {
					pthread_mutex_unlock(
							&vsocket->conn_mutex);
					goto again;
				}

				RTE_LOG(INFO, VHOST_CONFIG,
					"free connfd = %d for device '%s'\n",
					conn->connfd, path);
				close(conn->connfd);
				vhost_destroy_device(conn->vid);
				TAILQ_REMOVE(&vsocket->conn_list, conn, next);
				free(conn);
			}
			pthread_mutex_unlock(&vsocket->conn_mutex);

			if (vsocket->is_server) {
				/* Stop listening and remove the socket file. */
				fdset_del(&vhost_user.fdset,
						vsocket->socket_fd);
				close(vsocket->socket_fd);
				unlink(path);
			} else if (vsocket->reconnect) {
				/* Cancel any pending reconnection attempt. */
				vhost_user_remove_reconnect(vsocket);
			}

			pthread_mutex_destroy(&vsocket->conn_mutex);
			vhost_user_socket_mem_free(vsocket);

			/* Compact the array: move the last slot into the gap. */
			count = --vhost_user.vsocket_cnt;
			vhost_user.vsockets[i] = vhost_user.vsockets[count];
			vhost_user.vsockets[count] = NULL;
			pthread_mutex_unlock(&vhost_user.mutex);

			return 0;
		}
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return -1;
}
1022 
1023 /*
1024  * Register ops so that we can add/remove device to data core.
1025  */
1026 int
1027 rte_vhost_driver_callback_register(const char *path,
1028 	struct vhost_device_ops const * const ops)
1029 {
1030 	struct vhost_user_socket *vsocket;
1031 
1032 	pthread_mutex_lock(&vhost_user.mutex);
1033 	vsocket = find_vhost_user_socket(path);
1034 	if (vsocket)
1035 		vsocket->notify_ops = ops;
1036 	pthread_mutex_unlock(&vhost_user.mutex);
1037 
1038 	return vsocket ? 0 : -1;
1039 }
1040 
1041 struct vhost_device_ops const *
1042 vhost_driver_callback_get(const char *path)
1043 {
1044 	struct vhost_user_socket *vsocket;
1045 
1046 	pthread_mutex_lock(&vhost_user.mutex);
1047 	vsocket = find_vhost_user_socket(path);
1048 	pthread_mutex_unlock(&vhost_user.mutex);
1049 
1050 	return vsocket ? vsocket->notify_ops : NULL;
1051 }
1052 
int
rte_vhost_driver_start(const char *path)
{
	struct vhost_user_socket *vsocket;
	static pthread_t fdset_tid;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	pthread_mutex_unlock(&vhost_user.mutex);

	if (!vsocket)
		return -1;

	/* First call only: spin up the fd event-dispatch thread.
	 * NOTE(review): this check-then-create on a static is not guarded
	 * by a lock; concurrent first calls could race — confirm callers
	 * serialize driver start.
	 */
	if (fdset_tid == 0) {
		/**
		 * create a pipe which will be waited by poll and notified to
		 * rebuild the wait list of poll.
		 */
		if (fdset_pipe_init(&vhost_user.fdset) < 0) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"failed to create pipe for vhost fdset\n");
			return -1;
		}

		int ret = rte_ctrl_thread_create(&fdset_tid,
			"vhost-events", NULL, fdset_event_dispatch,
			&vhost_user.fdset);
		if (ret != 0) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"failed to create fdset handling thread");

			fdset_pipe_uninit(&vhost_user.fdset);
			return -1;
		}
	}

	/* Server sockets start listening; clients start connecting. */
	if (vsocket->is_server)
		return vhost_user_start_server(vsocket);
	else
		return vhost_user_start_client(vsocket);
}
1094