1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993
5 * The Regents of the University of California.
6 * Copyright (c) 2004 The FreeBSD Foundation
7 * Copyright (c) 2004-2008 Robert N. M. Watson
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
35 */
36
37 /*
38 * Comments on the socket life cycle:
39 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
42 *
43 * sodealloc() tears down socket layer state for a socket, called only by
44 * sofree() and sonewconn(). Socket layer private.
45 *
46 * pru_attach() associates protocol layer state with an allocated socket;
47 * called only once, may fail, aborting socket allocation. This is called
48 * from socreate() and sonewconn(). Socket layer private.
49 *
50 * pru_detach() disassociates protocol layer state from an attached socket,
51 * and will be called exactly once for sockets in which pru_attach() has
52 * been successfully called. If pru_attach() returned an error,
53 * pru_detach() will not be called. Socket layer private.
54 *
55 * pru_abort() and pru_close() notify the protocol layer that the last
56 * consumer of a socket is starting to tear down the socket, and that the
57 * protocol should terminate the connection. Historically, pru_abort() also
58 * detached protocol state from the socket state, but this is no longer the
59 * case.
60 *
61 * socreate() creates a socket and attaches protocol state. This is a public
62 * interface that may be used by socket layer consumers to create new
63 * sockets.
64 *
65 * sonewconn() creates a socket and attaches protocol state. This is a
66 * public interface that may be used by protocols to create new sockets when
67 * a new connection is received and will be available for accept() on a
68 * listen socket.
69 *
70 * soclose() destroys a socket after possibly waiting for it to disconnect.
71 * This is a public interface that socket consumers should use to close and
72 * release a socket when done with it.
73 *
74 * soabort() destroys a socket without waiting for it to disconnect (used
75 * only for incoming connections that are already partially or fully
76 * connected). This is used internally by the socket layer when clearing
77 * listen socket queues (due to overflow or close on the listen socket), but
78 * is also a public interface protocols may use to abort connections in
79 * their incomplete listen queues should they no longer be required. Sockets
80 * placed in completed connection listen queues should not be aborted for
81 * reasons described in the comment above the soclose() implementation. This
82 * is not a general purpose close routine, and except in the specific
83 * circumstances described here, should not be used.
84 *
85 * sofree() will free a socket and its protocol state if all references on
86 * the socket have been released, and is the public interface to attempt to
87 * free a socket when a reference is removed. This is a socket layer private
88 * interface.
89 *
90 * NOTE: In addition to socreate() and soclose(), which provide a single
91 * socket reference to the consumer to be managed as required, there are two
92 * calls to explicitly manage socket references, soref(), and sorele().
93 * Currently, these are generally required only when transitioning a socket
94 * from a listen queue to a file descriptor, in order to prevent garbage
95 * collection of the socket at an untimely moment. For a number of reasons,
96 * these interfaces are not preferred, and should be avoided.
97 *
98 * NOTE: With regard to VNETs the general rule is that callers do not set
99 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
100 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
101 * and sorflush(), which are usually called from a pre-set VNET context.
102 * sopoll() currently does not need a VNET context to be set.
103 */
104
105 #include <sys/cdefs.h>
106 #include "opt_inet.h"
107 #include "opt_inet6.h"
108 #include "opt_kern_tls.h"
109 #include "opt_ktrace.h"
110 #include "opt_sctp.h"
111
112 #include <sys/param.h>
113 #include <sys/systm.h>
114 #include <sys/capsicum.h>
115 #include <sys/fcntl.h>
116 #include <sys/limits.h>
117 #include <sys/lock.h>
118 #include <sys/mac.h>
119 #include <sys/malloc.h>
120 #include <sys/mbuf.h>
121 #include <sys/mutex.h>
122 #include <sys/domain.h>
123 #include <sys/file.h> /* for struct knote */
124 #include <sys/hhook.h>
125 #include <sys/kernel.h>
126 #include <sys/khelp.h>
127 #include <sys/kthread.h>
128 #include <sys/ktls.h>
129 #include <sys/event.h>
130 #include <sys/eventhandler.h>
131 #include <sys/poll.h>
132 #include <sys/proc.h>
133 #include <sys/protosw.h>
134 #include <sys/sbuf.h>
135 #include <sys/socket.h>
136 #include <sys/socketvar.h>
137 #include <sys/resourcevar.h>
138 #include <net/route.h>
139 #include <sys/sched.h>
140 #include <sys/signalvar.h>
141 #include <sys/smp.h>
142 #include <sys/stat.h>
143 #include <sys/sx.h>
144 #include <sys/sysctl.h>
145 #include <sys/taskqueue.h>
146 #include <sys/uio.h>
147 #include <sys/un.h>
148 #include <sys/unpcb.h>
149 #include <sys/jail.h>
150 #include <sys/syslog.h>
151 #include <netinet/in.h>
152 #include <netinet/in_pcb.h>
153 #include <netinet/tcp.h>
154
155 #include <net/vnet.h>
156
157 #include <security/mac/mac_framework.h>
158 #include <security/mac/mac_internal.h>
159
160 #include <vm/uma.h>
161
162 #ifdef COMPAT_FREEBSD32
163 #include <sys/mount.h>
164 #include <sys/sysent.h>
165 #include <compat/freebsd32/freebsd32.h>
166 #endif
167
168 static int soreceive_generic_locked(struct socket *so,
169 struct sockaddr **psa, struct uio *uio, struct mbuf **mp,
170 struct mbuf **controlp, int *flagsp);
171 static int soreceive_rcvoob(struct socket *so, struct uio *uio,
172 int flags);
173 static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb,
174 struct sockaddr **psa, struct uio *uio, struct mbuf **mp,
175 struct mbuf **controlp, int flags);
176 static int sosend_generic_locked(struct socket *so, struct sockaddr *addr,
177 struct uio *uio, struct mbuf *top, struct mbuf *control,
178 int flags, struct thread *td);
179 static void so_rdknl_lock(void *);
180 static void so_rdknl_unlock(void *);
181 static void so_rdknl_assert_lock(void *, int);
182 static void so_wrknl_lock(void *);
183 static void so_wrknl_unlock(void *);
184 static void so_wrknl_assert_lock(void *, int);
185
186 static void filt_sordetach(struct knote *kn);
187 static int filt_soread(struct knote *kn, long hint);
188 static void filt_sowdetach(struct knote *kn);
189 static int filt_sowrite(struct knote *kn, long hint);
190 static int filt_soempty(struct knote *kn, long hint);
191 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
192 fo_kqfilter_t soo_kqfilter;
193
194 static struct filterops soread_filtops = {
195 .f_isfd = 1,
196 .f_detach = filt_sordetach,
197 .f_event = filt_soread,
198 };
199 static struct filterops sowrite_filtops = {
200 .f_isfd = 1,
201 .f_detach = filt_sowdetach,
202 .f_event = filt_sowrite,
203 };
204 static struct filterops soempty_filtops = {
205 .f_isfd = 1,
206 .f_detach = filt_sowdetach,
207 .f_event = filt_soempty,
208 };
209
210 so_gen_t so_gencnt; /* generation count for sockets */
211
212 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
213 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
214
215 #define VNET_SO_ASSERT(so) \
216 VNET_ASSERT(curvnet != NULL, \
217 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
218
219 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
220 #define V_socket_hhh VNET(socket_hhh)
221
222 #ifdef COMPAT_FREEBSD32
223 #ifdef __amd64__
224 /* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */
225 #define __splice32_packed __packed
226 #else
227 #define __splice32_packed
228 #endif
229 struct splice32 {
230 int32_t sp_fd;
231 int64_t sp_max;
232 struct timeval32 sp_idle;
233 } __splice32_packed;
234 #undef __splice32_packed
235 #endif
236
237 /*
238 * Limit on the number of connections in the listen queue waiting
239 * for accept(2).
240 * NB: The original sysctl somaxconn is still available but hidden
241 * to prevent confusion about the actual purpose of this number.
242 */
243 static u_int somaxconn = SOMAXCONN;
244
245 static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)246 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
247 {
248 int error;
249 int val;
250
251 val = somaxconn;
252 error = sysctl_handle_int(oidp, &val, 0, req);
253 if (error || !req->newptr )
254 return (error);
255
256 /*
257 * The purpose of the UINT_MAX / 3 limit, is so that the formula
258 * 3 * so_qlimit / 2
259 * below, will not overflow.
260 */
261
262 if (val < 1 || val > UINT_MAX / 3)
263 return (EINVAL);
264
265 somaxconn = val;
266 return (0);
267 }
268 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
269 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
270 sysctl_somaxconn, "I",
271 "Maximum listen socket pending connection accept queue size");
272 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
273 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
274 sizeof(int), sysctl_somaxconn, "I",
275 "Maximum listen socket pending connection accept queue size (compat)");
276
277 static int numopensockets;
278 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
279 &numopensockets, 0, "Number of open sockets");
280
281 /*
282 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
283 * so_gencnt field.
284 */
285 static struct mtx so_global_mtx;
286 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
287
288 /*
289 * General IPC sysctl name space, used by sockets and a variety of other IPC
290 * types.
291 */
292 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
293 "IPC");
294
295 /*
296 * Initialize the socket subsystem and set up the socket
297 * memory allocator.
298 */
299 static uma_zone_t socket_zone;
300 int maxsockets;
301
302 static void
socket_zone_change(void * tag)303 socket_zone_change(void *tag)
304 {
305
306 maxsockets = uma_zone_set_max(socket_zone, maxsockets);
307 }
308
309 static int splice_init_state;
310 static struct sx splice_init_lock;
311 SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init");
312
313 static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0,
314 "Settings relating to the SO_SPLICE socket option");
315
316 static bool splice_receive_stream = true;
317 SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN,
318 &splice_receive_stream, 0,
319 "Use soreceive_stream() for stream splices");
320
321 static uma_zone_t splice_zone;
322 static struct proc *splice_proc;
323 struct splice_wq {
324 struct mtx mtx;
325 STAILQ_HEAD(, so_splice) head;
326 bool running;
327 } __aligned(CACHE_LINE_SIZE);
328 static struct splice_wq *splice_wq;
329 static uint32_t splice_index = 0;
330
331 static void so_splice_timeout(void *arg, int pending);
332 static void so_splice_xfer(struct so_splice *s);
333 static int so_unsplice(struct socket *so, bool timeout);
334
335 static void
splice_work_thread(void * ctx)336 splice_work_thread(void *ctx)
337 {
338 struct splice_wq *wq = ctx;
339 struct so_splice *s, *s_temp;
340 STAILQ_HEAD(, so_splice) local_head;
341 int cpu;
342
343 cpu = wq - splice_wq;
344 if (bootverbose)
345 printf("starting so_splice worker thread for CPU %d\n", cpu);
346
347 for (;;) {
348 mtx_lock(&wq->mtx);
349 while (STAILQ_EMPTY(&wq->head)) {
350 wq->running = false;
351 mtx_sleep(wq, &wq->mtx, 0, "-", 0);
352 wq->running = true;
353 }
354 STAILQ_INIT(&local_head);
355 STAILQ_CONCAT(&local_head, &wq->head);
356 STAILQ_INIT(&wq->head);
357 mtx_unlock(&wq->mtx);
358 STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) {
359 mtx_lock(&s->mtx);
360 CURVNET_SET(s->src->so_vnet);
361 so_splice_xfer(s);
362 CURVNET_RESTORE();
363 }
364 }
365 }
366
367 static void
so_splice_dispatch_async(struct so_splice * sp)368 so_splice_dispatch_async(struct so_splice *sp)
369 {
370 struct splice_wq *wq;
371 bool running;
372
373 wq = &splice_wq[sp->wq_index];
374 mtx_lock(&wq->mtx);
375 STAILQ_INSERT_TAIL(&wq->head, sp, next);
376 running = wq->running;
377 mtx_unlock(&wq->mtx);
378 if (!running)
379 wakeup(wq);
380 }
381
382 void
so_splice_dispatch(struct so_splice * sp)383 so_splice_dispatch(struct so_splice *sp)
384 {
385 mtx_assert(&sp->mtx, MA_OWNED);
386
387 if (sp->state != SPLICE_IDLE) {
388 mtx_unlock(&sp->mtx);
389 } else {
390 sp->state = SPLICE_QUEUED;
391 mtx_unlock(&sp->mtx);
392 so_splice_dispatch_async(sp);
393 }
394 }
395
396 static int
splice_zinit(void * mem,int size __unused,int flags __unused)397 splice_zinit(void *mem, int size __unused, int flags __unused)
398 {
399 struct so_splice *s;
400
401 s = (struct so_splice *)mem;
402 mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF);
403 return (0);
404 }
405
406 static void
splice_zfini(void * mem,int size)407 splice_zfini(void *mem, int size)
408 {
409 struct so_splice *s;
410
411 s = (struct so_splice *)mem;
412 mtx_destroy(&s->mtx);
413 }
414
415 static int
splice_init(void)416 splice_init(void)
417 {
418 struct thread *td;
419 int error, i, state;
420
421 state = atomic_load_acq_int(&splice_init_state);
422 if (__predict_true(state > 0))
423 return (0);
424 if (state < 0)
425 return (ENXIO);
426 sx_xlock(&splice_init_lock);
427 if (splice_init_state != 0) {
428 sx_xunlock(&splice_init_lock);
429 return (0);
430 }
431
432 splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL,
433 NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0);
434
435 splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP,
436 M_WAITOK | M_ZERO);
437
438 /*
439 * Initialize the workqueues to run the splice work. We create a
440 * work queue for each CPU.
441 */
442 CPU_FOREACH(i) {
443 STAILQ_INIT(&splice_wq[i].head);
444 mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF);
445 }
446
447 /* Start kthreads for each workqueue. */
448 error = 0;
449 CPU_FOREACH(i) {
450 error = kproc_kthread_add(splice_work_thread, &splice_wq[i],
451 &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i);
452 if (error) {
453 printf("Can't add so_splice thread %d error %d\n",
454 i, error);
455 break;
456 }
457
458 /*
459 * It's possible to create loops with SO_SPLICE; ensure that
460 * worker threads aren't able to starve the system too easily.
461 */
462 thread_lock(td);
463 sched_prio(td, PUSER);
464 thread_unlock(td);
465 }
466
467 splice_init_state = error != 0 ? -1 : 1;
468 sx_xunlock(&splice_init_lock);
469
470 return (error);
471 }
472
473 /*
474 * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding
475 * one lock in order to avoid potential deadlocks in case there is some other
476 * code path which acquires more than one I/O lock at a time.
477 */
478 static void
splice_lock_pair(struct socket * so_src,struct socket * so_dst)479 splice_lock_pair(struct socket *so_src, struct socket *so_dst)
480 {
481 int error;
482
483 for (;;) {
484 error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR);
485 KASSERT(error == 0,
486 ("%s: failed to lock send I/O lock: %d", __func__, error));
487 error = SOCK_IO_RECV_LOCK(so_src, 0);
488 KASSERT(error == 0 || error == EWOULDBLOCK,
489 ("%s: failed to lock recv I/O lock: %d", __func__, error));
490 if (error == 0)
491 break;
492 SOCK_IO_SEND_UNLOCK(so_dst);
493
494 error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR);
495 KASSERT(error == 0,
496 ("%s: failed to lock recv I/O lock: %d", __func__, error));
497 error = SOCK_IO_SEND_LOCK(so_dst, 0);
498 KASSERT(error == 0 || error == EWOULDBLOCK,
499 ("%s: failed to lock send I/O lock: %d", __func__, error));
500 if (error == 0)
501 break;
502 SOCK_IO_RECV_UNLOCK(so_src);
503 }
504 }
505
506 static void
splice_unlock_pair(struct socket * so_src,struct socket * so_dst)507 splice_unlock_pair(struct socket *so_src, struct socket *so_dst)
508 {
509 SOCK_IO_RECV_UNLOCK(so_src);
510 SOCK_IO_SEND_UNLOCK(so_dst);
511 }
512
513 /*
514 * Move data from the source to the sink. Assumes that both of the relevant
515 * socket I/O locks are held.
516 */
517 static int
so_splice_xfer_data(struct socket * so_src,struct socket * so_dst,off_t max,ssize_t * lenp)518 so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max,
519 ssize_t *lenp)
520 {
521 struct uio uio;
522 struct mbuf *m;
523 struct sockbuf *sb_src, *sb_dst;
524 ssize_t len;
525 long space;
526 int error, flags;
527
528 SOCK_IO_RECV_ASSERT_LOCKED(so_src);
529 SOCK_IO_SEND_ASSERT_LOCKED(so_dst);
530
531 error = 0;
532 m = NULL;
533 memset(&uio, 0, sizeof(uio));
534
535 sb_src = &so_src->so_rcv;
536 sb_dst = &so_dst->so_snd;
537
538 space = sbspace(sb_dst);
539 if (space < 0)
540 space = 0;
541 len = MIN(max, MIN(space, sbavail(sb_src)));
542 if (len == 0) {
543 SOCK_RECVBUF_LOCK(so_src);
544 if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0)
545 error = EPIPE;
546 SOCK_RECVBUF_UNLOCK(so_src);
547 } else {
548 flags = MSG_DONTWAIT;
549 uio.uio_resid = len;
550 if (splice_receive_stream && sb_src->sb_tls_info == NULL) {
551 error = soreceive_stream_locked(so_src, sb_src, NULL,
552 &uio, &m, NULL, flags);
553 } else {
554 error = soreceive_generic_locked(so_src, NULL,
555 &uio, &m, NULL, &flags);
556 }
557 if (error != 0 && m != NULL) {
558 m_freem(m);
559 m = NULL;
560 }
561 }
562 if (m != NULL) {
563 len -= uio.uio_resid;
564 error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL,
565 MSG_DONTWAIT, curthread);
566 } else if (error == 0) {
567 len = 0;
568 SOCK_SENDBUF_LOCK(so_dst);
569 if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0)
570 error = EPIPE;
571 SOCK_SENDBUF_UNLOCK(so_dst);
572 }
573 if (error == 0)
574 *lenp = len;
575 return (error);
576 }
577
578 /*
579 * Transfer data from the source to the sink.
580 *
581 * If "direct" is true, the transfer is done in the context of whichever thread
582 * is operating on one of the socket buffers. We do not know which locks are
583 * held, so we can only trylock the socket buffers; if this fails, we fall back
584 * to the worker thread, which invokes this routine with "direct" set to false.
585 */
586 static void
so_splice_xfer(struct so_splice * sp)587 so_splice_xfer(struct so_splice *sp)
588 {
589 struct socket *so_src, *so_dst;
590 off_t max;
591 ssize_t len;
592 int error;
593
594 mtx_assert(&sp->mtx, MA_OWNED);
595 KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING,
596 ("so_splice_xfer: invalid state %d", sp->state));
597 KASSERT(sp->max != 0, ("so_splice_xfer: max == 0"));
598
599 if (sp->state == SPLICE_CLOSING) {
600 /* Userspace asked us to close the splice. */
601 goto closing;
602 }
603
604 sp->state = SPLICE_RUNNING;
605 so_src = sp->src;
606 so_dst = sp->dst;
607 max = sp->max > 0 ? sp->max - so_src->so_splice_sent : OFF_MAX;
608 if (max < 0)
609 max = 0;
610
611 /*
612 * Lock the sockets in order to block userspace from doing anything
613 * sneaky. If an error occurs or one of the sockets can no longer
614 * transfer data, we will automatically unsplice.
615 */
616 mtx_unlock(&sp->mtx);
617 splice_lock_pair(so_src, so_dst);
618
619 error = so_splice_xfer_data(so_src, so_dst, max, &len);
620
621 mtx_lock(&sp->mtx);
622
623 /*
624 * Update our stats while still holding the socket locks. This
625 * synchronizes with getsockopt(SO_SPLICE), see the comment there.
626 */
627 if (error == 0) {
628 KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len));
629 so_src->so_splice_sent += len;
630 }
631 splice_unlock_pair(so_src, so_dst);
632
633 switch (sp->state) {
634 case SPLICE_CLOSING:
635 closing:
636 sp->state = SPLICE_CLOSED;
637 wakeup(sp);
638 mtx_unlock(&sp->mtx);
639 break;
640 case SPLICE_RUNNING:
641 if (error != 0 ||
642 (sp->max > 0 && so_src->so_splice_sent >= sp->max)) {
643 sp->state = SPLICE_EXCEPTION;
644 soref(so_src);
645 mtx_unlock(&sp->mtx);
646 (void)so_unsplice(so_src, false);
647 sorele(so_src);
648 } else {
649 /*
650 * Locklessly check for additional bytes in the source's
651 * receive buffer and queue more work if possible. We
652 * may end up queuing needless work, but that's ok, and
653 * if we race with a thread inserting more data into the
654 * buffer and observe sbavail() == 0, the splice mutex
655 * ensures that splice_push() will queue more work for
656 * us.
657 */
658 if (sbavail(&so_src->so_rcv) > 0 &&
659 sbspace(&so_dst->so_snd) > 0) {
660 sp->state = SPLICE_QUEUED;
661 mtx_unlock(&sp->mtx);
662 so_splice_dispatch_async(sp);
663 } else {
664 sp->state = SPLICE_IDLE;
665 mtx_unlock(&sp->mtx);
666 }
667 }
668 break;
669 default:
670 __assert_unreachable();
671 }
672 }
673
674 static void
socket_hhook_register(int subtype)675 socket_hhook_register(int subtype)
676 {
677
678 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
679 &V_socket_hhh[subtype],
680 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
681 printf("%s: WARNING: unable to register hook\n", __func__);
682 }
683
684 static void
socket_hhook_deregister(int subtype)685 socket_hhook_deregister(int subtype)
686 {
687
688 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
689 printf("%s: WARNING: unable to deregister hook\n", __func__);
690 }
691
692 static void
socket_init(void * tag)693 socket_init(void *tag)
694 {
695
696 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
697 NULL, NULL, UMA_ALIGN_PTR, 0);
698 maxsockets = uma_zone_set_max(socket_zone, maxsockets);
699 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
700 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
701 EVENTHANDLER_PRI_FIRST);
702 }
703 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
704
705 static void
socket_vnet_init(const void * unused __unused)706 socket_vnet_init(const void *unused __unused)
707 {
708 int i;
709
710 /* We expect a contiguous range */
711 for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
712 socket_hhook_register(i);
713 }
714 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
715 socket_vnet_init, NULL);
716
717 static void
socket_vnet_uninit(const void * unused __unused)718 socket_vnet_uninit(const void *unused __unused)
719 {
720 int i;
721
722 for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
723 socket_hhook_deregister(i);
724 }
725 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
726 socket_vnet_uninit, NULL);
727
728 /*
729 * Initialise maxsockets. This SYSINIT must be run after
730 * tunable_mbinit().
731 */
732 static void
init_maxsockets(void * ignored)733 init_maxsockets(void *ignored)
734 {
735
736 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
737 maxsockets = imax(maxsockets, maxfiles);
738 }
739 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
740
741 /*
742 * Sysctl to get and set the maximum global sockets limit. Notify protocols
743 * of the change so that they can update their dependent limits as required.
744 */
745 static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)746 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
747 {
748 int error, newmaxsockets;
749
750 newmaxsockets = maxsockets;
751 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
752 if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
753 if (newmaxsockets > maxsockets &&
754 newmaxsockets <= maxfiles) {
755 maxsockets = newmaxsockets;
756 EVENTHANDLER_INVOKE(maxsockets_change);
757 } else
758 error = EINVAL;
759 }
760 return (error);
761 }
762 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
763 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
764 &maxsockets, 0, sysctl_maxsockets, "IU",
765 "Maximum number of sockets available");
766
767 /*
768 * Socket operation routines. These routines are called by the routines in
769 * sys_socket.c or from a system process, and implement the semantics of
770 * socket operations by switching out to the protocol specific routines.
771 */
772
773 /*
774 * Get a socket structure from our zone, and initialize it. Note that it
775 * would probably be better to allocate socket and PCB at the same time, but
776 * I'm not convinced that all the protocols can be easily modified to do
777 * this.
778 *
779 * soalloc() returns a socket with a ref count of 0.
780 */
781 static struct socket *
soalloc(struct vnet * vnet)782 soalloc(struct vnet *vnet)
783 {
784 struct socket *so;
785
786 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
787 if (so == NULL)
788 return (NULL);
789 #ifdef MAC
790 if (mac_socket_init(so, M_NOWAIT) != 0) {
791 uma_zfree(socket_zone, so);
792 return (NULL);
793 }
794 #endif
795 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
796 uma_zfree(socket_zone, so);
797 return (NULL);
798 }
799
800 /*
801 * The socket locking protocol allows to lock 2 sockets at a time,
802 * however, the first one must be a listening socket. WITNESS lacks
803 * a feature to change class of an existing lock, so we use DUPOK.
804 */
805 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
806 mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
807 mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
808 so->so_rcv.sb_sel = &so->so_rdsel;
809 so->so_snd.sb_sel = &so->so_wrsel;
810 sx_init(&so->so_snd_sx, "so_snd_sx");
811 sx_init(&so->so_rcv_sx, "so_rcv_sx");
812 TAILQ_INIT(&so->so_snd.sb_aiojobq);
813 TAILQ_INIT(&so->so_rcv.sb_aiojobq);
814 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
815 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
816 #ifdef VIMAGE
817 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
818 __func__, __LINE__, so));
819 so->so_vnet = vnet;
820 #endif
821 /* We shouldn't need the so_global_mtx */
822 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
823 /* Do we need more comprehensive error returns? */
824 uma_zfree(socket_zone, so);
825 return (NULL);
826 }
827 mtx_lock(&so_global_mtx);
828 so->so_gencnt = ++so_gencnt;
829 ++numopensockets;
830 #ifdef VIMAGE
831 vnet->vnet_sockcnt++;
832 #endif
833 mtx_unlock(&so_global_mtx);
834
835 return (so);
836 }
837
838 /*
839 * Free the storage associated with a socket at the socket layer, tear down
840 * locks, labels, etc. All protocol state is assumed already to have been
841 * torn down (and possibly never set up) by the caller.
842 */
843 void
sodealloc(struct socket * so)844 sodealloc(struct socket *so)
845 {
846
847 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
848 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
849
850 mtx_lock(&so_global_mtx);
851 so->so_gencnt = ++so_gencnt;
852 --numopensockets; /* Could be below, but faster here. */
853 #ifdef VIMAGE
854 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
855 __func__, __LINE__, so));
856 so->so_vnet->vnet_sockcnt--;
857 #endif
858 mtx_unlock(&so_global_mtx);
859 #ifdef MAC
860 mac_socket_destroy(so);
861 #endif
862 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
863
864 khelp_destroy_osd(&so->osd);
865 if (SOLISTENING(so)) {
866 if (so->sol_accept_filter != NULL)
867 accept_filt_setopt(so, NULL);
868 } else {
869 if (so->so_rcv.sb_hiwat)
870 (void)chgsbsize(so->so_cred->cr_uidinfo,
871 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
872 if (so->so_snd.sb_hiwat)
873 (void)chgsbsize(so->so_cred->cr_uidinfo,
874 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
875 sx_destroy(&so->so_snd_sx);
876 sx_destroy(&so->so_rcv_sx);
877 mtx_destroy(&so->so_snd_mtx);
878 mtx_destroy(&so->so_rcv_mtx);
879 }
880 crfree(so->so_cred);
881 mtx_destroy(&so->so_lock);
882 uma_zfree(socket_zone, so);
883 }
884
885 /*
886 * socreate returns a socket with a ref count of 1 and a file descriptor
887 * reference. The socket should be closed with soclose().
888 */
889 int
socreate(int dom,struct socket ** aso,int type,int proto,struct ucred * cred,struct thread * td)890 socreate(int dom, struct socket **aso, int type, int proto,
891 struct ucred *cred, struct thread *td)
892 {
893 struct protosw *prp;
894 struct socket *so;
895 int error;
896
897 /*
898 * XXX: divert(4) historically abused PF_INET. Keep this compatibility
899 * shim until all applications have been updated.
900 */
901 if (__predict_false(dom == PF_INET && type == SOCK_RAW &&
902 proto == IPPROTO_DIVERT)) {
903 dom = PF_DIVERT;
904 printf("%s uses obsolete way to create divert(4) socket\n",
905 td->td_proc->p_comm);
906 }
907
908 prp = pffindproto(dom, type, proto);
909 if (prp == NULL) {
910 /* No support for domain. */
911 if (pffinddomain(dom) == NULL)
912 return (EAFNOSUPPORT);
913 /* No support for socket type. */
914 if (proto == 0 && type != 0)
915 return (EPROTOTYPE);
916 return (EPROTONOSUPPORT);
917 }
918
919 MPASS(prp->pr_attach);
920
921 if ((prp->pr_flags & PR_CAPATTACH) == 0) {
922 if (CAP_TRACING(td))
923 ktrcapfail(CAPFAIL_PROTO, &proto);
924 if (IN_CAPABILITY_MODE(td))
925 return (ECAPMODE);
926 }
927
928 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
929 return (EPROTONOSUPPORT);
930
931 so = soalloc(CRED_TO_VNET(cred));
932 if (so == NULL)
933 return (ENOBUFS);
934
935 so->so_type = type;
936 so->so_cred = crhold(cred);
937 if ((prp->pr_domain->dom_family == PF_INET) ||
938 (prp->pr_domain->dom_family == PF_INET6) ||
939 (prp->pr_domain->dom_family == PF_ROUTE))
940 so->so_fibnum = td->td_proc->p_fibnum;
941 else
942 so->so_fibnum = 0;
943 so->so_proto = prp;
944 #ifdef MAC
945 mac_socket_create(cred, so);
946 #endif
947 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
948 so_rdknl_assert_lock);
949 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
950 so_wrknl_assert_lock);
951 if ((prp->pr_flags & PR_SOCKBUF) == 0) {
952 so->so_snd.sb_mtx = &so->so_snd_mtx;
953 so->so_rcv.sb_mtx = &so->so_rcv_mtx;
954 }
955 /*
956 * Auto-sizing of socket buffers is managed by the protocols and
957 * the appropriate flags must be set in the pru_attach function.
958 */
959 CURVNET_SET(so->so_vnet);
960 error = prp->pr_attach(so, proto, td);
961 CURVNET_RESTORE();
962 if (error) {
963 sodealloc(so);
964 return (error);
965 }
966 soref(so);
967 *aso = so;
968 return (0);
969 }
970
971 #ifdef REGRESSION
972 static int regression_sonewconn_earlytest = 1;
973 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
974 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
975 #endif
976
/*
 * syslog priority used when logging listen queue overflows; a value of -1
 * disables the log message entirely (checked in solisten_clone()).
 */
static int sooverprio = LOG_DEBUG;
SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
    &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable");

/* Rate limit for the overflow log message: at most one per this interval. */
static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");
985
/*
 * When an attempt at a new connection is noted on a socket which supports
 * accept(2), the protocol has two options:
 * 1) Call legacy sonewconn() function, which would call protocol attach
 *    method, same as used for socket(2).
 * 2) Call solisten_clone(), do attach that is specific to a cloned connection,
 *    and then call solisten_enqueue().
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
solisten_clone(struct socket *head)
{
	struct sbuf descrsb;
	struct socket *so;
	int len, overcount;
	u_int qlen;
	const char localprefix[] = "local:";
	char descrbuf[SUNPATHLEN + sizeof(localprefix)];
#if defined(INET6)
	char addrbuf[INET6_ADDRSTRLEN];
#elif defined(INET)
	char addrbuf[INET_ADDRSTRLEN];
#endif
	bool dolog, over;

	SOLISTEN_LOCK(head);
	/* Queue is considered overflowing at 1.5x the configured backlog. */
	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over) {
#else
	if (over) {
#endif
		head->sol_overcount++;
		/* Log only if enabled (sooverprio >= 0) and rate limit allows. */
		dolog = (sooverprio >= 0) &&
		    !!ratecheck(&head->sol_lastover, &overinterval);

		/*
		 * If we're going to log, copy the overflow count and queue
		 * length from the listen socket before dropping the lock.
		 * Also, reset the overflow count.
		 */
		if (dolog) {
			overcount = head->sol_overcount;
			head->sol_overcount = 0;
			qlen = head->sol_qlen;
		}
		SOLISTEN_UNLOCK(head);

		if (dolog) {
			/*
			 * Try to print something descriptive about the
			 * socket for the error message.
			 */
			sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
			    SBUF_FIXEDLEN);
			switch (head->so_proto->pr_domain->dom_family) {
#if defined(INET) || defined(INET6)
#ifdef INET
			case AF_INET:
#endif
#ifdef INET6
			case AF_INET6:
				if (head->so_proto->pr_domain->dom_family ==
				    AF_INET6 ||
				    (sotoinpcb(head)->inp_inc.inc_flags &
				    INC_ISIPV6)) {
					ip6_sprintf(addrbuf,
					    &sotoinpcb(head)->inp_inc.inc6_laddr);
					sbuf_printf(&descrsb, "[%s]", addrbuf);
				} else
#endif
				{
#ifdef INET
					inet_ntoa_r(
					    sotoinpcb(head)->inp_inc.inc_laddr,
					    addrbuf);
					sbuf_cat(&descrsb, addrbuf);
#endif
				}
				sbuf_printf(&descrsb, ":%hu (proto %u)",
				    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
				    head->so_proto->pr_protocol);
				break;
#endif /* INET || INET6 */
			case AF_UNIX:
				sbuf_cat(&descrsb, localprefix);
				if (sotounpcb(head)->unp_addr != NULL)
					len =
					    sotounpcb(head)->unp_addr->sun_len -
					    offsetof(struct sockaddr_un,
					    sun_path);
				else
					len = 0;
				if (len > 0)
					sbuf_bcat(&descrsb,
					    sotounpcb(head)->unp_addr->sun_path,
					    len);
				else
					sbuf_cat(&descrsb, "(unknown)");
				break;
			}

			/*
			 * If we can't print something more specific, at least
			 * print the domain name.
			 */
			if (sbuf_finish(&descrsb) != 0 ||
			    sbuf_len(&descrsb) <= 0) {
				sbuf_clear(&descrsb);
				sbuf_cat(&descrsb,
				    head->so_proto->pr_domain->dom_name ?:
				    "unknown");
				sbuf_finish(&descrsb);
			}
			KASSERT(sbuf_len(&descrsb) > 0,
			    ("%s: sbuf creation failed", __func__));
			/*
			 * Preserve the historic listen queue overflow log
			 * message, that starts with "sonewconn:". It has
			 * been known to sysadmins for years and also test
			 * sys/kern/sonewconn_overflow checks for it.
			 */
			/*
			 * NOTE(review): so_cred is a pointer; comparison with
			 * 0 works but NULL would be the clearer spelling.
			 */
			if (head->so_cred == 0) {
				log(LOG_PRI(sooverprio),
				    "sonewconn: pcb %p (%s): "
				    "Listen queue overflow: %i already in "
				    "queue awaiting acceptance (%d "
				    "occurrences)\n", head->so_pcb,
				    sbuf_data(&descrsb),
				    qlen, overcount);
			} else {
				log(LOG_PRI(sooverprio),
				    "sonewconn: pcb %p (%s): "
				    "Listen queue overflow: "
				    "%i already in queue awaiting acceptance "
				    "(%d occurrences), euid %d, rgid %d, jail %s\n",
				    head->so_pcb, sbuf_data(&descrsb), qlen,
				    overcount, head->so_cred->cr_uid,
				    head->so_cred->cr_rgid,
				    head->so_cred->cr_prison ?
					head->so_cred->cr_prison->pr_name :
					"not_jailed");
			}
			sbuf_delete(&descrsb);

			overcount = 0;
		}

		/* Overflow: refuse to clone a new socket. */
		return (NULL);
	}
	SOLISTEN_UNLOCK(head);
	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
	    __func__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_listen = head;
	so->so_type = head->so_type;
	/*
	 * POSIX is ambiguous on what options an accept(2)ed socket should
	 * inherit from the listener.  Words "create a new socket" may be
	 * interpreted as not inheriting anything.  Best programming practice
	 * for application developers is to not rely on such inheritance.
	 * FreeBSD had historically inherited all so_options excluding
	 * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options,
	 * including those completely irrelevant to a new born socket.  For
	 * compatibility with older versions we will inherit a list of
	 * meaningful options.
	 */
	so->so_options = head->so_options & (SO_KEEPALIVE | SO_DONTROUTE |
	    SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE);
	so->so_linger = head->so_linger;
	so->so_state = head->so_state;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	VNET_SO_ASSERT(head);
	/* Inherit the listener's socket buffer reservations and parameters. */
	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
	so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
	/* Protocols without private sockbufs use the socket's own mutexes. */
	if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
		so->so_snd.sb_mtx = &so->so_snd_mtx;
		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
	}

	return (so);
}
1194
1195 /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. */
1196 struct socket *
1197 sonewconn(struct socket *head, int connstatus)
1198 {
1199 struct socket *so;
1200
1201 if ((so = solisten_clone(head)) == NULL)
1202 return (NULL);
1203
1204 if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
1205 sodealloc(so);
1206 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
1207 __func__, head->so_pcb);
1208 return (NULL);
1209 }
1210
1211 (void)solisten_enqueue(so, connstatus);
1212
1213 return (so);
1214 }
1215
/*
 * Enqueue socket cloned by solisten_clone() to the listen queue of the
 * listener it has been cloned from.
 *
 * Return 'true' if socket landed on complete queue, otherwise 'false'.
 */
bool
solisten_enqueue(struct socket *so, int connstatus)
{
	struct socket *head = so->so_listen;

	/* The clone arrives with zero references; give it its first one. */
	MPASS(refcount_load(&so->so_count) == 0);
	refcount_init(&so->so_count, 1);

	SOLISTEN_LOCK(head);
	/*
	 * An installed accept filter holds connections on the incomplete
	 * queue; ignore the caller's connstatus in that case.
	 */
	if (head->sol_accept_filter != NULL)
		connstatus = 0;
	so->so_state |= connstatus;
	soref(head);	/* A socket on (in)complete queue refs head. */
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
		so->so_qstate = SQ_COMP;
		head->sol_qlen++;
		solisten_wakeup(head);	/* unlocks */
		return (true);
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->sol_incqlen > head->sol_qlimit) {
			struct socket *sp;

			sp = TAILQ_FIRST(&head->sol_incomp);
			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
			head->sol_incqlen--;
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			/* Drop the queue's ref on head taken by the victim. */
			sorele_locked(head);	/* does SOLISTEN_UNLOCK, head stays */
			soabort(sp);
			SOLISTEN_LOCK(head);
		}
		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
		so->so_qstate = SQ_INCOMP;
		head->sol_incqlen++;
		SOLISTEN_UNLOCK(head);
		return (false);
	}
}
1270
#if defined(SCTP) || defined(SCTP_SUPPORT)
/*
 * Socket part of sctp_peeloff().  Detach a new socket from an
 * association.  The new socket is returned with a reference.
 *
 * XXXGL: reduce copy-paste with solisten_clone().
 */
struct socket *
sopeeloff(struct socket *head)
{
	struct socket *so;

	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	/*
	 * Unlike solisten_clone(), inherit all options and mark the peeled
	 * socket connected (preserving only the non-blocking flag).
	 */
	so->so_type = head->so_type;
	so->so_options = head->so_options;
	so->so_linger = head->so_linger;
	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	/* Protocols without private sockbufs use the socket's own mutexes. */
	if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
		so->so_snd.sb_mtx = &so->so_snd_mtx;
		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
	}

	/* Returned with a single reference, unlike solisten_clone(). */
	soref(so);

	return (so);
}
#endif /* SCTP */
1335
1336 int
1337 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
1338 {
1339 int error;
1340
1341 CURVNET_SET(so->so_vnet);
1342 error = so->so_proto->pr_bind(so, nam, td);
1343 CURVNET_RESTORE();
1344 return (error);
1345 }
1346
1347 int
1348 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1349 {
1350 int error;
1351
1352 CURVNET_SET(so->so_vnet);
1353 error = so->so_proto->pr_bindat(fd, so, nam, td);
1354 CURVNET_RESTORE();
1355 return (error);
1356 }
1357
1358 /*
1359 * solisten() transitions a socket from a non-listening state to a listening
1360 * state, but can also be used to update the listen queue depth on an
1361 * existing listen socket. The protocol will call back into the sockets
1362 * layer using solisten_proto_check() and solisten_proto() to check and set
1363 * socket-layer listen state. Call backs are used so that the protocol can
1364 * acquire both protocol and socket layer locks in whatever order is required
1365 * by the protocol.
1366 *
1367 * Protocol implementors are advised to hold the socket lock across the
1368 * socket-layer test and set to avoid races at the socket layer.
1369 */
1370 int
1371 solisten(struct socket *so, int backlog, struct thread *td)
1372 {
1373 int error;
1374
1375 CURVNET_SET(so->so_vnet);
1376 error = so->so_proto->pr_listen(so, backlog, td);
1377 CURVNET_RESTORE();
1378 return (error);
1379 }
1380
/*
 * Prepare for a call to solisten_proto().  Acquire all socket buffer locks in
 * order to interlock with socket I/O.  On success the caller must later call
 * either solisten_proto() or solisten_proto_abort() to release them.
 */
int
solisten_proto_check(struct socket *so)
{
	SOCK_LOCK_ASSERT(so);

	/* A connected or transitioning socket cannot become a listener. */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) != 0)
		return (EINVAL);

	/*
	 * Sleeping is not permitted here, so simply fail if userspace is
	 * attempting to transmit or receive on the socket.  This kind of
	 * transient failure is not ideal, but it should occur only if userspace
	 * is misusing the socket interfaces.
	 */
	if (!sx_try_xlock(&so->so_snd_sx))
		return (EAGAIN);
	if (!sx_try_xlock(&so->so_rcv_sx)) {
		sx_xunlock(&so->so_snd_sx);
		return (EAGAIN);
	}
	/* Lock order: send side before receive side, sx before mtx. */
	mtx_lock(&so->so_snd_mtx);
	mtx_lock(&so->so_rcv_mtx);

	/* Interlock with soo_aio_queue() and KTLS. */
	if (!SOLISTENING(so)) {
		bool ktls;

#ifdef KERN_TLS
		ktls = so->so_snd.sb_tls_info != NULL ||
		    so->so_rcv.sb_tls_info != NULL;
#else
		ktls = false;
#endif
		/* Refuse if KTLS is configured or AIO is pending/running. */
		if (ktls ||
		    (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
		    (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) {
			solisten_proto_abort(so);
			return (EINVAL);
		}
	}

	return (0);
}
1429
/*
 * Undo the setup done by solisten_proto_check(): release the four socket
 * buffer locks it acquired, in the inverse lock-class order (mutexes first,
 * then the sx locks).
 */
void
solisten_proto_abort(struct socket *so)
{
	mtx_unlock(&so->so_snd_mtx);
	mtx_unlock(&so->so_rcv_mtx);
	sx_xunlock(&so->so_snd_sx);
	sx_xunlock(&so->so_rcv_sx);
}
1441
/*
 * Convert a socket to the listening state (or merely update the queue limit
 * of an existing listener).  Must be called with the socket lock held and
 * after a successful solisten_proto_check(); releases the four socket buffer
 * locks that solisten_proto_check() acquired.
 *
 * The socket's sockbuf fields are repurposed as listen-queue state, so the
 * relevant buffer parameters are snapshotted first and preserved in the
 * sol_* fields for inheritance by accepted sockets (see solisten_clone()).
 */
void
solisten_proto(struct socket *so, int backlog)
{
	int sbrcv_lowat, sbsnd_lowat;
	u_int sbrcv_hiwat, sbsnd_hiwat;
	short sbrcv_flags, sbsnd_flags;
	sbintime_t sbrcv_timeo, sbsnd_timeo;

	SOCK_LOCK_ASSERT(so);
	KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) == 0,
	    ("%s: bad socket state %p", __func__, so));

	/* Already listening: only the queue limit needs updating. */
	if (SOLISTENING(so))
		goto listening;

	/*
	 * Change this socket to listening state.
	 */
	sbrcv_lowat = so->so_rcv.sb_lowat;
	sbsnd_lowat = so->so_snd.sb_lowat;
	sbrcv_hiwat = so->so_rcv.sb_hiwat;
	sbsnd_hiwat = so->so_snd.sb_hiwat;
	sbrcv_flags = so->so_rcv.sb_flags;
	sbsnd_flags = so->so_snd.sb_flags;
	sbrcv_timeo = so->so_rcv.sb_timeo;
	sbsnd_timeo = so->so_snd.sb_timeo;

#ifdef MAC
	mac_socketpeer_label_free(so->so_peerlabel);
#endif

	sbdestroy(so, SO_SND);
	sbdestroy(so, SO_RCV);

#ifdef INVARIANTS
	/*
	 * Scrub the union from so_rcv onward so stale sockbuf state cannot
	 * be misread as listen-queue state.
	 */
	bzero(&so->so_rcv,
	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
#endif

	so->sol_sbrcv_lowat = sbrcv_lowat;
	so->sol_sbsnd_lowat = sbsnd_lowat;
	so->sol_sbrcv_hiwat = sbrcv_hiwat;
	so->sol_sbsnd_hiwat = sbsnd_hiwat;
	so->sol_sbrcv_flags = sbrcv_flags;
	so->sol_sbsnd_flags = sbsnd_flags;
	so->sol_sbrcv_timeo = sbrcv_timeo;
	so->sol_sbsnd_timeo = sbsnd_timeo;

	so->sol_qlen = so->sol_incqlen = 0;
	TAILQ_INIT(&so->sol_incomp);
	TAILQ_INIT(&so->sol_comp);

	so->sol_accept_filter = NULL;
	so->sol_accept_filter_arg = NULL;
	so->sol_accept_filter_str = NULL;

	so->sol_upcall = NULL;
	so->sol_upcallarg = NULL;

	so->so_options |= SO_ACCEPTCONN;

listening:
	/* Clamp the backlog to the administrative limit. */
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->sol_qlimit = backlog;

	/* Release the locks taken by solisten_proto_check(). */
	mtx_unlock(&so->so_snd_mtx);
	mtx_unlock(&so->so_rcv_mtx);
	sx_xunlock(&so->so_snd_sx);
	sx_xunlock(&so->so_rcv_sx);
}
1514
/*
 * Wakeup listeners/subsystems once we have a complete connection.
 * Enters with lock, returns unlocked.
 */
void
solisten_wakeup(struct socket *sol)
{

	/* Notify either the registered upcall or select/kqueue waiters. */
	if (sol->sol_upcall != NULL)
		(void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
	else {
		selwakeuppri(&sol->so_rdsel, PSOCK);
		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
	}
	SOLISTEN_UNLOCK(sol);
	/* Wake one thread sleeping in solisten_dequeue() / accept(2). */
	wakeup_one(&sol->sol_comp);
	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
		pgsigio(&sol->so_sigio, SIGIO, 0);
}
1534
/*
 * Return single connection off a listening socket queue.  Main consumer of
 * the function is kern_accept4().  Some modules, that do their own accept
 * management also use the function.  The socket reference held by the
 * listen queue is handed to the caller.
 *
 * Listening socket must be locked on entry and is returned unlocked on
 * return.
 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
 */
int
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
{
	struct socket *so;
	int error;

	SOLISTEN_LOCK_ASSERT(head);

	/* Blocking socket: sleep until a connection or error arrives. */
	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
	    head->so_error == 0) {
		error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			SOLISTEN_UNLOCK(head);
			return (error);
		}
	}
	/* Pending socket error takes precedence; it is consumed here. */
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
		error = EWOULDBLOCK;
	else
		error = 0;
	if (error) {
		SOLISTEN_UNLOCK(head);
		return (error);
	}
	/* Detach the first completed connection from the queue. */
	so = TAILQ_FIRST(&head->sol_comp);
	SOCK_LOCK(so);
	KASSERT(so->so_qstate == SQ_COMP,
	    ("%s: so %p not SQ_COMP", __func__, so));
	head->sol_qlen--;
	so->so_qstate = SQ_NONE;
	so->so_listen = NULL;
	TAILQ_REMOVE(&head->sol_comp, so, so_list);
	/* Non-blocking status: inherited from listener, or set per flags. */
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	SOCK_UNLOCK(so);
	/* Drop the queue's reference on head; unlocks the listener. */
	sorele_locked(head);

	*ret = so;
	return (0);
}
1591
1592 static struct so_splice *
1593 so_splice_alloc(off_t max)
1594 {
1595 struct so_splice *sp;
1596
1597 sp = uma_zalloc(splice_zone, M_WAITOK);
1598 sp->src = NULL;
1599 sp->dst = NULL;
1600 sp->max = max > 0 ? max : -1;
1601 do {
1602 sp->wq_index = atomic_fetchadd_32(&splice_index, 1) %
1603 (mp_maxid + 1);
1604 } while (CPU_ABSENT(sp->wq_index));
1605 sp->state = SPLICE_IDLE;
1606 TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout,
1607 sp);
1608 return (sp);
1609 }
1610
/*
 * Release a splice descriptor back to its zone.  The splice must have been
 * fully torn down (state SPLICE_CLOSED) before it may be freed.
 */
static void
so_splice_free(struct so_splice *sp)
{
	KASSERT(sp->state == SPLICE_CLOSED,
	    ("so_splice_free: sp %p not closed", sp));
	uma_zfree(splice_zone, sp);
}
1618
/*
 * Timeout task fired when a splice has been idle for the configured
 * interval: tear the splice down from its source socket's side.
 */
static void
so_splice_timeout(void *arg, int pending __unused)
{
	struct so_splice *sp;

	sp = arg;
	(void)so_unsplice(sp->src, true);
}
1627
/*
 * Splice the output from so to the input of so2.
 *
 * Each socket is locked in turn (never both at once); on failure of the
 * second leg all state installed on the first socket is rolled back.
 * Returns 0 on success or an errno value.
 */
static int
so_splice(struct socket *so, struct socket *so2, struct splice *splice)
{
	struct so_splice *sp;
	int error;

	if (splice->sp_max < 0)
		return (EINVAL);
	/* Handle only TCP for now; TODO: other streaming protos */
	if (so->so_proto->pr_protocol != IPPROTO_TCP ||
	    so2->so_proto->pr_protocol != IPPROTO_TCP)
		return (EPROTONOSUPPORT);
	if (so->so_vnet != so2->so_vnet)
		return (EINVAL);

	/* so_splice_xfer() assumes that we're using these implementations. */
	KASSERT(so->so_proto->pr_sosend == sosend_generic,
	    ("so_splice: sosend not sosend_generic"));
	KASSERT(so2->so_proto->pr_soreceive == soreceive_generic ||
	    so2->so_proto->pr_soreceive == soreceive_stream,
	    ("so_splice: soreceive not soreceive_generic/stream"));

	sp = so_splice_alloc(splice->sp_max);
	so->so_splice_sent = 0;
	sp->src = so;
	sp->dst = so2;

	/* First leg: attach the splice to the source socket. */
	error = 0;
	SOCK_LOCK(so);
	if (SOLISTENING(so))
		error = EINVAL;
	else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0)
		error = ENOTCONN;
	else if (so->so_splice != NULL)
		error = EBUSY;
	if (error != 0) {
		SOCK_UNLOCK(so);
		uma_zfree(splice_zone, sp);
		return (error);
	}
	soref(so);	/* The splice holds a reference on its source. */
	so->so_splice = sp;
	SOCK_RECVBUF_LOCK(so);
	so->so_rcv.sb_flags |= SB_SPLICED;
	SOCK_RECVBUF_UNLOCK(so);
	SOCK_UNLOCK(so);

	/* Second leg: attach to the destination socket. */
	error = 0;
	SOCK_LOCK(so2);
	if (SOLISTENING(so2))
		error = EINVAL;
	else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0)
		error = ENOTCONN;
	else if (so2->so_splice_back != NULL)
		error = EBUSY;
	if (error != 0) {
		/* Roll back the state installed on the source socket. */
		SOCK_UNLOCK(so2);
		SOCK_LOCK(so);
		so->so_splice = NULL;
		SOCK_RECVBUF_LOCK(so);
		so->so_rcv.sb_flags &= ~SB_SPLICED;
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_UNLOCK(so);
		sorele(so);
		uma_zfree(splice_zone, sp);
		return (error);
	}
	soref(so2);	/* The splice holds a reference on its destination. */
	so2->so_splice_back = sp;
	SOCK_SENDBUF_LOCK(so2);
	so2->so_snd.sb_flags |= SB_SPLICED;
	mtx_lock(&sp->mtx);
	SOCK_SENDBUF_UNLOCK(so2);
	SOCK_UNLOCK(so2);

	/* Arm the idle timeout, if one was requested. */
	if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) {
		taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout,
		    tvtosbt(splice->sp_idle), 0, C_PREL(4));
	}

	/*
	 * Transfer any data already present in the socket buffer.
	 */
	sp->state = SPLICE_QUEUED;
	so_splice_xfer(sp);
	return (0);
}
1718
/*
 * Tear down a splice, detaching it from both sockets.  'timeout' is true
 * when called from the idle-timeout task, in which case the timeout task
 * must not be cancelled (we are running inside it).  Returns 0, or EINVAL /
 * ENOTCONN when the socket is not actually spliced.
 */
static int
so_unsplice(struct socket *so, bool timeout)
{
	struct socket *so2;
	struct so_splice *sp;
	bool drain;

	/*
	 * First unset SB_SPLICED and hide the splice structure so that
	 * wakeup routines will stop enqueuing work.  This also ensures that
	 * only a single thread will proceed with the unsplice.
	 */
	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		return (EINVAL);
	}
	SOCK_RECVBUF_LOCK(so);
	if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) {
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	so->so_rcv.sb_flags &= ~SB_SPLICED;
	sp = so->so_splice;
	so->so_splice = NULL;
	SOCK_RECVBUF_UNLOCK(so);
	SOCK_UNLOCK(so);

	/* Detach the destination side symmetrically. */
	so2 = sp->dst;
	SOCK_LOCK(so2);
	KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__));
	SOCK_SENDBUF_LOCK(so2);
	KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0,
	    ("%s: so2 is not spliced", __func__));
	KASSERT(so2->so_splice_back == sp,
	    ("%s: so_splice_back != sp", __func__));
	so2->so_snd.sb_flags &= ~SB_SPLICED;
	so2->so_splice_back = NULL;
	SOCK_SENDBUF_UNLOCK(so2);
	SOCK_UNLOCK(so2);

	/*
	 * No new work is being enqueued.  The worker thread might be
	 * splicing data right now, in which case we want to wait for it to
	 * finish before proceeding.
	 */
	mtx_lock(&sp->mtx);
	switch (sp->state) {
	case SPLICE_QUEUED:
	case SPLICE_RUNNING:
		/* Ask the worker to stop and wait for it to acknowledge. */
		sp->state = SPLICE_CLOSING;
		while (sp->state == SPLICE_CLOSING)
			msleep(sp, &sp->mtx, PSOCK, "unsplice", 0);
		break;
	case SPLICE_IDLE:
	case SPLICE_EXCEPTION:
		sp->state = SPLICE_CLOSED;
		break;
	default:
		__assert_unreachable();
	}
	/*
	 * Cancel the idle timeout unless we are running from it; a non-zero
	 * cancel result means the task is in flight and must be drained.
	 */
	if (!timeout) {
		drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout,
		    NULL) != 0;
	} else {
		drain = false;
	}
	mtx_unlock(&sp->mtx);
	if (drain)
		taskqueue_drain_timeout(taskqueue_thread, &sp->timeout);

	/*
	 * Now we hold the sole reference to the splice structure.
	 * Clean up: signal userspace and release socket references.
	 */
	sorwakeup(so);
	CURVNET_SET(so->so_vnet);
	sorele(so);
	sowwakeup(so2);
	sorele(so2);
	CURVNET_RESTORE();
	so_splice_free(sp);
	return (0);
}
1804
/*
 * Free socket upon release of the very last reference.  Called from
 * sorele_locked() with the socket lock held; the lock is dropped before
 * calling into the protocol.
 */
static void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);
	KASSERT(refcount_load(&so->so_count) == 0,
	    ("%s: so %p has references", __func__, so));
	KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
	    ("%s: so %p is on listen queue", __func__, so));
	KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0,
	    ("%s: so %p rcvbuf is spliced", __func__, so));
	KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0,
	    ("%s: so %p sndbuf is spliced", __func__, so));
	KASSERT(so->so_splice == NULL && so->so_splice_back == NULL,
	    ("%s: so %p has spliced data", __func__, so));

	SOCK_UNLOCK(so);

	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	/* Dispose of queued data carrying rights (e.g. unix-domain fds). */
	if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) {
		MPASS(pr->pr_domain->dom_dispose != NULL);
		(*pr->pr_domain->dom_dispose)(so);
	}
	if (pr->pr_detach != NULL)
		pr->pr_detach(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 */
	if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
		sbdestroy(so, SO_SND);
		sbdestroy(so, SO_RCV);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}
1853
1854 /*
1855 * Release a reference on a socket while holding the socket lock.
1856 * Unlocks the socket lock before returning.
1857 */
1858 void
1859 sorele_locked(struct socket *so)
1860 {
1861 SOCK_LOCK_ASSERT(so);
1862 if (refcount_release(&so->so_count))
1863 sofree(so);
1864 else
1865 SOCK_UNLOCK(so);
1866 }
1867
/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	struct accept_queue lqueue;
	int error = 0;
	bool listening, last __diagused;

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				/* Already disconnected is not a close error. */
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}

		/* SO_LINGER: wait (up to so_linger) for disconnect to finish. */
		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_close != NULL)
		so->so_proto->pr_close(so);

	SOCK_LOCK(so);
	if ((listening = SOLISTENING(so))) {
		struct socket *sp;

		/*
		 * Move all queued connections onto a local list so they can
		 * be aborted after the listener's lock is dropped.
		 */
		TAILQ_INIT(&lqueue);
		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

		so->sol_qlen = so->sol_incqlen = 0;

		TAILQ_FOREACH(sp, &lqueue, so_list) {
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			/* Drop the ref each queued socket held on us. */
			last = refcount_release(&so->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, so));
		}
	}
	sorele_locked(so);
	if (listening) {
		struct socket *sp, *tsp;

		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
			soabort(sp);
	}
	CURVNET_RESTORE();
	return (error);
}
1943
/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on.  Likely this thread holds the last
 * reference on the socket and soabort() will proceed with sofree().  But
 * it might be not the last, as the sockets on the listen queues are seen
 * from the protocol side.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 *
 * Usually socket should have a single reference left, but this is not a
 * requirement.  In the past, when we have had named references for file
 * descriptor and protocol, we asserted that none of them are being held.
 */
void
soabort(struct socket *so)
{

	VNET_SO_ASSERT(so);

	if (so->so_proto->pr_abort != NULL)
		so->so_proto->pr_abort(so);
	SOCK_LOCK(so);
	/* Drop the caller's (typically last) reference; may free the socket. */
	sorele_locked(so);
}
1976
1977 int
1978 soaccept(struct socket *so, struct sockaddr **nam)
1979 {
1980 int error;
1981
1982 CURVNET_SET(so->so_vnet);
1983 error = so->so_proto->pr_accept(so, nam);
1984 CURVNET_RESTORE();
1985 return (error);
1986 }
1987
/*
 * Initiate a connection; convenience wrapper around soconnectat() with
 * AT_FDCWD (i.e. no directory descriptor for connectat(2) semantics).
 */
int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (soconnectat(AT_FDCWD, so, nam, td));
}
1994
/*
 * Initiate a connection, optionally relative to a directory descriptor
 * (connectat(2)).  Returns EISCONN for a second connect on a
 * connection-oriented socket, otherwise the protocol's error code.
 */
int
soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 *
	 * Note, this check is racy and may need to be re-evaluated at the
	 * protocol layer.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		/* A failed implicit disconnect is also reported as EISCONN. */
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		if (fd == AT_FDCWD) {
			error = so->so_proto->pr_connect(so, nam, td);
		} else {
			error = so->so_proto->pr_connectat(fd, so, nam, td);
		}
	}
	CURVNET_RESTORE();

	return (error);
}
2030
2031 int
2032 soconnect2(struct socket *so1, struct socket *so2)
2033 {
2034 int error;
2035
2036 CURVNET_SET(so1->so_vnet);
2037 error = so1->so_proto->pr_connect2(so1, so2);
2038 CURVNET_RESTORE();
2039 return (error);
2040 }
2041
2042 int
2043 sodisconnect(struct socket *so)
2044 {
2045 int error;
2046
2047 if ((so->so_state & SS_ISCONNECTED) == 0)
2048 return (ENOTCONN);
2049 if (so->so_state & SS_ISDISCONNECTING)
2050 return (EALREADY);
2051 VNET_SO_ASSERT(so);
2052 error = so->so_proto->pr_disconnect(so);
2053 return (error);
2054 }
2055
/*
 * Fast send path for atomic datagram (SOCK_DGRAM + PR_ATOMIC) sockets:
 * a single pass with no sblock, handing at most one chain to the
 * protocol.  Data comes from 'uio', or from a caller-built 'top' chain
 * when uio is NULL.  'top' and 'control' are always consumed: passed to
 * the protocol on success, freed at 'out' on error.
 */
int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' is allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag and
	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
		/* If there is more to send set PRUS_MORETOCOME */
		(flags & MSG_MORETOCOME) ||
		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/* Ownership of top and control passed to the protocol above. */
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
2206
2207 /*
2208 * Send on a socket. If send must go all at once and message is larger than
2209 * send buffering, then hard error. Lock against other senders. If must go
2210 * all at once and not enough room now, then inform user that this would
2211 * block and do nothing. Otherwise, if nonblocking, send as much as
2212 * possible. The data to be sent is described by "uio" if nonzero, otherwise
2213 * by the mbuf chain "top" (which must be null if uio is not). Data provided
2214 * in mbuf chain must be small enough to send all at once.
2215 *
2216 * Returns nonzero on error, timeout or signal; callers must check for short
2217 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
2218 * on return.
2219 */
/*
 * Worker for sosend_generic(); the caller must already hold the socket
 * send I/O lock (see SOCK_IO_SEND_ASSERT_LOCKED below).  'top' and
 * 'control' are consumed: handed to the protocol or freed at 'out'.
 */
static int
sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
	int pr_send_flag;
#ifdef KERN_TLS
	struct ktls_session *tls;
	int tls_enq_cnt, tls_send_flag;
	uint8_t tls_rtype;

	tls = NULL;
	tls_rtype = TLS_RLTYPE_APP;
#endif

	SOCK_IO_SEND_ASSERT_LOCKED(so);

	if (uio != NULL)
		resid = uio->uio_resid;
	else if ((top->m_flags & M_PKTHDR) != 0)
		resid = top->m_pkthdr.len;
	else
		resid = m_length(top, NULL);
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

#ifdef KERN_TLS
	tls_send_flag = 0;
	tls = ktls_hold(so->so_snd.sb_tls_info);
	if (tls != NULL) {
		if (tls->mode == TCP_TLS_MODE_SW)
			tls_send_flag = PRUS_NOTREADY;

		if (control != NULL) {
			struct cmsghdr *cm = mtod(control, struct cmsghdr *);

			if (clen >= sizeof(*cm) &&
			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
				clen = 0;
				m_freem(control);
				control = NULL;
				atomic = 1;
			}
		}

		if (resid == 0 && !ktls_permit_empty_frames(tls)) {
			error = EINVAL;
			goto out;
		}
	}
#endif

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto out;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto out;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto out;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto out;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto out;
			}
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
#ifdef KERN_TLS
				if (tls != NULL) {
					ktls_frame(top, tls, &tls_enq_cnt,
					    tls_rtype);
					tls_rtype = TLS_RLTYPE_APP;
				}
#endif
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If resid is 0, which can happen
				 * only if we have control to send, then
				 * a single empty mbuf is returned.  This
				 * is a workaround to prevent protocol send
				 * methods to panic.
				 */
#ifdef KERN_TLS
				if (tls != NULL) {
					top = m_uiotombuf(uio, M_WAITOK, space,
					    tls->params.max_frame_len,
					    M_EXTPG |
					    ((flags & MSG_EOR) ? M_EOR : 0));
					if (top != NULL) {
						ktls_frame(top, tls,
						    &tls_enq_cnt, tls_rtype);
					}
					tls_rtype = TLS_RLTYPE_APP;
				} else
#endif
					top = m_uiotombuf(uio, M_WAITOK, space,
					    (atomic ? max_hdr : 0),
					    (atomic ? M_PKTHDR : 0) |
					    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT;	/* only possible error */
					goto out;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);

			pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag and nothing left to send then use
			 * PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (flags & MSG_MORETOCOME) ||
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

#ifdef KERN_TLS
			pr_send_flag |= tls_send_flag;
#endif

			error = so->so_proto->pr_send(so, pr_send_flag, top,
			    addr, control, td);

			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}

#ifdef KERN_TLS
			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
				if (error != 0) {
					m_freem(top);
					top = NULL;
				} else {
					soref(so);
					ktls_enqueue(top, so, tls_enq_cnt);
				}
			}
#endif
			/* Protocol now owns top and control. */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto out;
		} while (resid && space > 0);
	} while (resid);

out:
#ifdef KERN_TLS
	if (tls != NULL)
		ktls_free(tls);
#endif
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
2478
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	/* Serialize with other senders, then hand off to the locked worker. */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error == 0) {
		error = sosend_generic_locked(so, addr, uio, top, control,
		    flags, td);
		SOCK_IO_SEND_UNLOCK(so);
	}
	return (error);
}
2492
2493 /*
2494 * Send to a socket from a kernel thread.
2495 *
2496 * XXXGL: in almost all cases uio is NULL and the mbuf is supplied.
2497 * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs
2498 * to be set at all. This function should just boil down to a static inline
2499 * calling the protocol method.
2500 */
2501 int
2502 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2503 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2504 {
2505 int error;
2506
2507 CURVNET_SET(so->so_vnet);
2508 error = so->so_proto->pr_sosend(so, addr, uio,
2509 top, control, flags, td);
2510 CURVNET_RESTORE();
2511 return (error);
2512 }
2513
2514 /*
2515 * send(2), write(2) or aio_write(2) on a socket.
2516 */
/*
 * User-initiated send: runs the protocol's sosend method and applies
 * userland policy to the result (transient-error clearing, SIGPIPE).
 * 'userproc' is non-NULL only for aio(4) jobs; it selects both the
 * error-clearing exclusion and the SIGPIPE target below.
 */
int
sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *control, int flags, struct proc *userproc)
{
	struct thread *td;
	ssize_t len;
	int error;

	td = uio->uio_td;
	len = uio->uio_resid;	/* original length, to detect partial progress */
	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags,
	    td);
	CURVNET_RESTORE();
	if (error != 0) {
		/*
		 * Clear transient errors for stream protocols if they made
		 * some progress.  Make exclusion for aio(4) that would
		 * schedule a new write in case of EWOULDBLOCK and clear
		 * error itself.  See soaio_process_job().
		 */
		if (uio->uio_resid != len &&
		    (so->so_proto->pr_flags & PR_ATOMIC) == 0 &&
		    userproc == NULL &&
		    (error == ERESTART || error == EINTR ||
		    error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket. */
		if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 &&
		    (flags & MSG_NOSIGNAL) == 0) {
			if (userproc != NULL) {
				/* aio(4) job */
				PROC_LOCK(userproc);
				kern_psignal(userproc, SIGPIPE);
				PROC_UNLOCK(userproc);
			} else {
				/* Signal only the sending thread. */
				PROC_LOCK(td->td_proc);
				tdsignal(td, SIGPIPE);
				PROC_UNLOCK(td->td_proc);
			}
		}
	}
	return (error);
}
2561
2562 /*
2563 * The part of soreceive() that implements reading non-inline out-of-band
2564 * data from a socket. For more complete comments, see soreceive(), from
2565 * which this code originated.
2566 *
2567 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
2568 * unable to return an mbuf chain to the caller.
2569 */
2570 static int
2571 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
2572 {
2573 struct protosw *pr = so->so_proto;
2574 struct mbuf *m;
2575 int error;
2576
2577 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
2578 VNET_SO_ASSERT(so);
2579
2580 m = m_get(M_WAITOK, MT_DATA);
2581 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
2582 if (error)
2583 goto bad;
2584 do {
2585 error = uiomove(mtod(m, void *),
2586 (int) min(uio->uio_resid, m->m_len), uio);
2587 m = m_free(m);
2588 } while (uio->uio_resid && error == 0 && m);
2589 bad:
2590 if (m != NULL)
2591 m_freem(m);
2592 return (error);
2593 }
2594
2595 /*
2596 * Following replacement or removal of the first mbuf on the first mbuf chain
2597 * of a socket buffer, push necessary state changes back into the socket
2598 * buffer so that other consumers see the values consistently. 'nextrecord'
2599 * is the callers locally stored value of the original value of
2600 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
2601 * NOTE: 'nextrecord' may be NULL.
2602 */
2603 static __inline void
2604 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
2605 {
2606
2607 SOCKBUF_LOCK_ASSERT(sb);
2608 /*
2609 * First, update for the new value of nextrecord. If necessary, make
2610 * it the first record.
2611 */
2612 if (sb->sb_mb != NULL)
2613 sb->sb_mb->m_nextpkt = nextrecord;
2614 else
2615 sb->sb_mb = nextrecord;
2616
2617 /*
2618 * Now update any dependent socket buffer fields to reflect the new
2619 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
2620 * addition of a second clause that takes care of the case where
2621 * sb_mb has been updated, but remains the last record.
2622 */
2623 if (sb->sb_mb == NULL) {
2624 sb->sb_mbtail = NULL;
2625 sb->sb_lastrecord = NULL;
2626 } else if (sb->sb_mb->m_nextpkt == NULL)
2627 sb->sb_lastrecord = sb->sb_mb;
2628 }
2629
2630 /*
2631 * Implement receive operations on a socket. We depend on the way that
2632 * records are added to the sockbuf by sbappend. In particular, each record
2633 * (mbufs linked through m_next) must begin with an address if the protocol
2634 * so specifies, followed by an optional mbuf or mbufs containing ancillary
2635 * data, and then zero or more mbufs of data. In order to allow parallelism
2636 * between network receive and copying to user space, as well as avoid
2637 * sleeping with a mutex held, we release the socket buffer mutex during the
2638 * user space copy. Although the sockbuf is locked, new data may still be
2639 * appended, and thus we must maintain consistency of the sockbuf during that
2640 * time.
2641 *
2642 * The caller may receive the data as a single mbuf chain by supplying an
2643 * mbuf **mp0 for use in returning the chain. The uio is then used only for
2644 * the count in uio_resid.
2645 */
2646 static int
2647 soreceive_generic_locked(struct socket *so, struct sockaddr **psa,
2648 struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp)
2649 {
2650 struct mbuf *m;
2651 int flags, error, offset;
2652 ssize_t len;
2653 struct protosw *pr = so->so_proto;
2654 struct mbuf *nextrecord;
2655 int moff, type = 0;
2656 ssize_t orig_resid = uio->uio_resid;
2657 bool report_real_len = false;
2658
2659 SOCK_IO_RECV_ASSERT_LOCKED(so);
2660
2661 error = 0;
2662 if (flagsp != NULL) {
2663 report_real_len = *flagsp & MSG_TRUNC;
2664 *flagsp &= ~MSG_TRUNC;
2665 flags = *flagsp &~ MSG_EOR;
2666 } else
2667 flags = 0;
2668
2669 restart:
2670 SOCKBUF_LOCK(&so->so_rcv);
2671 m = so->so_rcv.sb_mb;
2672 /*
2673 * If we have less data than requested, block awaiting more (subject
2674 * to any timeout) if:
2675 * 1. the current count is less than the low water mark, or
2676 * 2. MSG_DONTWAIT is not set
2677 */
2678 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2679 sbavail(&so->so_rcv) < uio->uio_resid) &&
2680 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
2681 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2682 KASSERT(m != NULL || !sbavail(&so->so_rcv),
2683 ("receive: m == %p sbavail == %u",
2684 m, sbavail(&so->so_rcv)));
2685 if (so->so_error || so->so_rerror) {
2686 if (m != NULL)
2687 goto dontblock;
2688 if (so->so_error)
2689 error = so->so_error;
2690 else
2691 error = so->so_rerror;
2692 if ((flags & MSG_PEEK) == 0) {
2693 if (so->so_error)
2694 so->so_error = 0;
2695 else
2696 so->so_rerror = 0;
2697 }
2698 SOCKBUF_UNLOCK(&so->so_rcv);
2699 goto release;
2700 }
2701 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2702 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2703 if (m != NULL)
2704 goto dontblock;
2705 #ifdef KERN_TLS
2706 else if (so->so_rcv.sb_tlsdcc == 0 &&
2707 so->so_rcv.sb_tlscc == 0) {
2708 #else
2709 else {
2710 #endif
2711 SOCKBUF_UNLOCK(&so->so_rcv);
2712 goto release;
2713 }
2714 }
2715 for (; m != NULL; m = m->m_next)
2716 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2717 m = so->so_rcv.sb_mb;
2718 goto dontblock;
2719 }
2720 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
2721 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
2722 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2723 SOCKBUF_UNLOCK(&so->so_rcv);
2724 error = ENOTCONN;
2725 goto release;
2726 }
2727 if (uio->uio_resid == 0 && !report_real_len) {
2728 SOCKBUF_UNLOCK(&so->so_rcv);
2729 goto release;
2730 }
2731 if ((so->so_state & SS_NBIO) ||
2732 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2733 SOCKBUF_UNLOCK(&so->so_rcv);
2734 error = EWOULDBLOCK;
2735 goto release;
2736 }
2737 SBLASTRECORDCHK(&so->so_rcv);
2738 SBLASTMBUFCHK(&so->so_rcv);
2739 error = sbwait(so, SO_RCV);
2740 SOCKBUF_UNLOCK(&so->so_rcv);
2741 if (error)
2742 goto release;
2743 goto restart;
2744 }
2745 dontblock:
2746 /*
2747 * From this point onward, we maintain 'nextrecord' as a cache of the
2748 * pointer to the next record in the socket buffer. We must keep the
2749 * various socket buffer pointers and local stack versions of the
2750 * pointers in sync, pushing out modifications before dropping the
2751 * socket buffer mutex, and re-reading them when picking it up.
2752 *
2753 * Otherwise, we will race with the network stack appending new data
2754 * or records onto the socket buffer by using inconsistent/stale
2755 * versions of the field, possibly resulting in socket buffer
2756 * corruption.
2757 *
2758 * By holding the high-level sblock(), we prevent simultaneous
2759 * readers from pulling off the front of the socket buffer.
2760 */
2761 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2762 if (uio->uio_td)
2763 uio->uio_td->td_ru.ru_msgrcv++;
2764 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
2765 SBLASTRECORDCHK(&so->so_rcv);
2766 SBLASTMBUFCHK(&so->so_rcv);
2767 nextrecord = m->m_nextpkt;
2768 if (pr->pr_flags & PR_ADDR) {
2769 KASSERT(m->m_type == MT_SONAME,
2770 ("m->m_type == %d", m->m_type));
2771 orig_resid = 0;
2772 if (psa != NULL)
2773 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2774 M_NOWAIT);
2775 if (flags & MSG_PEEK) {
2776 m = m->m_next;
2777 } else {
2778 sbfree(&so->so_rcv, m);
2779 so->so_rcv.sb_mb = m_free(m);
2780 m = so->so_rcv.sb_mb;
2781 sockbuf_pushsync(&so->so_rcv, nextrecord);
2782 }
2783 }
2784
2785 /*
2786 * Process one or more MT_CONTROL mbufs present before any data mbufs
2787 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2788 * just copy the data; if !MSG_PEEK, we call into the protocol to
2789 * perform externalization (or freeing if controlp == NULL).
2790 */
2791 if (m != NULL && m->m_type == MT_CONTROL) {
2792 struct mbuf *cm = NULL, *cmn;
2793 struct mbuf **cme = &cm;
2794 #ifdef KERN_TLS
2795 struct cmsghdr *cmsg;
2796 struct tls_get_record tgr;
2797
2798 /*
2799 * For MSG_TLSAPPDATA, check for an alert record.
2800 * If found, return ENXIO without removing
2801 * it from the receive queue. This allows a subsequent
2802 * call without MSG_TLSAPPDATA to receive it.
2803 * Note that, for TLS, there should only be a single
2804 * control mbuf with the TLS_GET_RECORD message in it.
2805 */
2806 if (flags & MSG_TLSAPPDATA) {
2807 cmsg = mtod(m, struct cmsghdr *);
2808 if (cmsg->cmsg_type == TLS_GET_RECORD &&
2809 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
2810 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
2811 if (__predict_false(tgr.tls_type ==
2812 TLS_RLTYPE_ALERT)) {
2813 SOCKBUF_UNLOCK(&so->so_rcv);
2814 error = ENXIO;
2815 goto release;
2816 }
2817 }
2818 }
2819 #endif
2820
2821 do {
2822 if (flags & MSG_PEEK) {
2823 if (controlp != NULL) {
2824 *controlp = m_copym(m, 0, m->m_len,
2825 M_NOWAIT);
2826 controlp = &(*controlp)->m_next;
2827 }
2828 m = m->m_next;
2829 } else {
2830 sbfree(&so->so_rcv, m);
2831 so->so_rcv.sb_mb = m->m_next;
2832 m->m_next = NULL;
2833 *cme = m;
2834 cme = &(*cme)->m_next;
2835 m = so->so_rcv.sb_mb;
2836 }
2837 } while (m != NULL && m->m_type == MT_CONTROL);
2838 if ((flags & MSG_PEEK) == 0)
2839 sockbuf_pushsync(&so->so_rcv, nextrecord);
2840 while (cm != NULL) {
2841 cmn = cm->m_next;
2842 cm->m_next = NULL;
2843 if (pr->pr_domain->dom_externalize != NULL) {
2844 SOCKBUF_UNLOCK(&so->so_rcv);
2845 VNET_SO_ASSERT(so);
2846 error = (*pr->pr_domain->dom_externalize)
2847 (cm, controlp, flags);
2848 SOCKBUF_LOCK(&so->so_rcv);
2849 } else if (controlp != NULL)
2850 *controlp = cm;
2851 else
2852 m_freem(cm);
2853 if (controlp != NULL) {
2854 while (*controlp != NULL)
2855 controlp = &(*controlp)->m_next;
2856 }
2857 cm = cmn;
2858 }
2859 if (m != NULL)
2860 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
2861 else
2862 nextrecord = so->so_rcv.sb_mb;
2863 orig_resid = 0;
2864 }
2865 if (m != NULL) {
2866 if ((flags & MSG_PEEK) == 0) {
2867 KASSERT(m->m_nextpkt == nextrecord,
2868 ("soreceive: post-control, nextrecord !sync"));
2869 if (nextrecord == NULL) {
2870 KASSERT(so->so_rcv.sb_mb == m,
2871 ("soreceive: post-control, sb_mb!=m"));
2872 KASSERT(so->so_rcv.sb_lastrecord == m,
2873 ("soreceive: post-control, lastrecord!=m"));
2874 }
2875 }
2876 type = m->m_type;
2877 if (type == MT_OOBDATA)
2878 flags |= MSG_OOB;
2879 } else {
2880 if ((flags & MSG_PEEK) == 0) {
2881 KASSERT(so->so_rcv.sb_mb == nextrecord,
2882 ("soreceive: sb_mb != nextrecord"));
2883 if (so->so_rcv.sb_mb == NULL) {
2884 KASSERT(so->so_rcv.sb_lastrecord == NULL,
2885 ("soreceive: sb_lastercord != NULL"));
2886 }
2887 }
2888 }
2889 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2890 SBLASTRECORDCHK(&so->so_rcv);
2891 SBLASTMBUFCHK(&so->so_rcv);
2892
2893 /*
2894 * Now continue to read any data mbufs off of the head of the socket
2895 * buffer until the read request is satisfied. Note that 'type' is
2896 * used to store the type of any mbuf reads that have happened so far
2897 * such that soreceive() can stop reading if the type changes, which
2898 * causes soreceive() to return only one of regular data and inline
2899 * out-of-band data in a single socket receive operation.
2900 */
2901 moff = 0;
2902 offset = 0;
2903 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2904 && error == 0) {
2905 /*
2906 * If the type of mbuf has changed since the last mbuf
2907 * examined ('type'), end the receive operation.
2908 */
2909 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2910 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2911 if (type != m->m_type)
2912 break;
2913 } else if (type == MT_OOBDATA)
2914 break;
2915 else
2916 KASSERT(m->m_type == MT_DATA,
2917 ("m->m_type == %d", m->m_type));
2918 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2919 len = uio->uio_resid;
2920 if (so->so_oobmark && len > so->so_oobmark - offset)
2921 len = so->so_oobmark - offset;
2922 if (len > m->m_len - moff)
2923 len = m->m_len - moff;
2924 /*
2925 * If mp is set, just pass back the mbufs. Otherwise copy
2926 * them out via the uio, then free. Sockbuf must be
2927 * consistent here (points to current mbuf, it points to next
2928 * record) when we drop priority; we must note any additions
2929 * to the sockbuf when we block interrupts again.
2930 */
2931 if (mp == NULL) {
2932 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2933 SBLASTRECORDCHK(&so->so_rcv);
2934 SBLASTMBUFCHK(&so->so_rcv);
2935 SOCKBUF_UNLOCK(&so->so_rcv);
2936 if ((m->m_flags & M_EXTPG) != 0)
2937 error = m_unmapped_uiomove(m, moff, uio,
2938 (int)len);
2939 else
2940 error = uiomove(mtod(m, char *) + moff,
2941 (int)len, uio);
2942 SOCKBUF_LOCK(&so->so_rcv);
2943 if (error) {
2944 /*
2945 * The MT_SONAME mbuf has already been removed
2946 * from the record, so it is necessary to
2947 * remove the data mbufs, if any, to preserve
2948 * the invariant in the case of PR_ADDR that
2949 * requires MT_SONAME mbufs at the head of
2950 * each record.
2951 */
2952 if (pr->pr_flags & PR_ATOMIC &&
2953 ((flags & MSG_PEEK) == 0))
2954 (void)sbdroprecord_locked(&so->so_rcv);
2955 SOCKBUF_UNLOCK(&so->so_rcv);
2956 goto release;
2957 }
2958 } else
2959 uio->uio_resid -= len;
2960 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2961 if (len == m->m_len - moff) {
2962 if (m->m_flags & M_EOR)
2963 flags |= MSG_EOR;
2964 if (flags & MSG_PEEK) {
2965 m = m->m_next;
2966 moff = 0;
2967 } else {
2968 nextrecord = m->m_nextpkt;
2969 sbfree(&so->so_rcv, m);
2970 if (mp != NULL) {
2971 m->m_nextpkt = NULL;
2972 *mp = m;
2973 mp = &m->m_next;
2974 so->so_rcv.sb_mb = m = m->m_next;
2975 *mp = NULL;
2976 } else {
2977 so->so_rcv.sb_mb = m_free(m);
2978 m = so->so_rcv.sb_mb;
2979 }
2980 sockbuf_pushsync(&so->so_rcv, nextrecord);
2981 SBLASTRECORDCHK(&so->so_rcv);
2982 SBLASTMBUFCHK(&so->so_rcv);
2983 }
2984 } else {
2985 if (flags & MSG_PEEK)
2986 moff += len;
2987 else {
2988 if (mp != NULL) {
2989 if (flags & MSG_DONTWAIT) {
2990 *mp = m_copym(m, 0, len,
2991 M_NOWAIT);
2992 if (*mp == NULL) {
2993 /*
2994 * m_copym() couldn't
2995 * allocate an mbuf.
2996 * Adjust uio_resid back
2997 * (it was adjusted
2998 * down by len bytes,
2999 * which we didn't end
3000 * up "copying" over).
3001 */
3002 uio->uio_resid += len;
3003 break;
3004 }
3005 } else {
3006 SOCKBUF_UNLOCK(&so->so_rcv);
3007 *mp = m_copym(m, 0, len,
3008 M_WAITOK);
3009 SOCKBUF_LOCK(&so->so_rcv);
3010 }
3011 }
3012 sbcut_locked(&so->so_rcv, len);
3013 }
3014 }
3015 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3016 if (so->so_oobmark) {
3017 if ((flags & MSG_PEEK) == 0) {
3018 so->so_oobmark -= len;
3019 if (so->so_oobmark == 0) {
3020 so->so_rcv.sb_state |= SBS_RCVATMARK;
3021 break;
3022 }
3023 } else {
3024 offset += len;
3025 if (offset == so->so_oobmark)
3026 break;
3027 }
3028 }
3029 if (flags & MSG_EOR)
3030 break;
3031 /*
3032 * If the MSG_WAITALL flag is set (for non-atomic socket), we
3033 * must not quit until "uio->uio_resid == 0" or an error
3034 * termination. If a signal/timeout occurs, return with a
3035 * short count but without error. Keep sockbuf locked
3036 * against other readers.
3037 */
3038 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
3039 !sosendallatonce(so) && nextrecord == NULL) {
3040 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3041 if (so->so_error || so->so_rerror ||
3042 so->so_rcv.sb_state & SBS_CANTRCVMORE)
3043 break;
3044 /*
3045 * Notify the protocol that some data has been
3046 * drained before blocking.
3047 */
3048 if (pr->pr_flags & PR_WANTRCVD) {
3049 SOCKBUF_UNLOCK(&so->so_rcv);
3050 VNET_SO_ASSERT(so);
3051 pr->pr_rcvd(so, flags);
3052 SOCKBUF_LOCK(&so->so_rcv);
3053 if (__predict_false(so->so_rcv.sb_mb == NULL &&
3054 (so->so_error || so->so_rerror ||
3055 so->so_rcv.sb_state & SBS_CANTRCVMORE)))
3056 break;
3057 }
3058 SBLASTRECORDCHK(&so->so_rcv);
3059 SBLASTMBUFCHK(&so->so_rcv);
3060 /*
3061 * We could receive some data while was notifying
3062 * the protocol. Skip blocking in this case.
3063 */
3064 if (so->so_rcv.sb_mb == NULL) {
3065 error = sbwait(so, SO_RCV);
3066 if (error) {
3067 SOCKBUF_UNLOCK(&so->so_rcv);
3068 goto release;
3069 }
3070 }
3071 m = so->so_rcv.sb_mb;
3072 if (m != NULL)
3073 nextrecord = m->m_nextpkt;
3074 }
3075 }
3076
3077 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3078 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3079 if (report_real_len)
3080 uio->uio_resid -= m_length(m, NULL) - moff;
3081 flags |= MSG_TRUNC;
3082 if ((flags & MSG_PEEK) == 0)
3083 (void) sbdroprecord_locked(&so->so_rcv);
3084 }
3085 if ((flags & MSG_PEEK) == 0) {
3086 if (m == NULL) {
3087 /*
3088 * First part is an inline SB_EMPTY_FIXUP(). Second
3089 * part makes sure sb_lastrecord is up-to-date if
3090 * there is still data in the socket buffer.
3091 */
3092 so->so_rcv.sb_mb = nextrecord;
3093 if (so->so_rcv.sb_mb == NULL) {
3094 so->so_rcv.sb_mbtail = NULL;
3095 so->so_rcv.sb_lastrecord = NULL;
3096 } else if (nextrecord->m_nextpkt == NULL)
3097 so->so_rcv.sb_lastrecord = nextrecord;
3098 }
3099 SBLASTRECORDCHK(&so->so_rcv);
3100 SBLASTMBUFCHK(&so->so_rcv);
3101 /*
3102 * If soreceive() is being done from the socket callback,
3103 * then don't need to generate ACK to peer to update window,
3104 * since ACK will be generated on return to TCP.
3105 */
3106 if (!(flags & MSG_SOCALLBCK) &&
3107 (pr->pr_flags & PR_WANTRCVD)) {
3108 SOCKBUF_UNLOCK(&so->so_rcv);
3109 VNET_SO_ASSERT(so);
3110 pr->pr_rcvd(so, flags);
3111 SOCKBUF_LOCK(&so->so_rcv);
3112 }
3113 }
3114 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3115 if (orig_resid == uio->uio_resid && orig_resid &&
3116 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
3117 SOCKBUF_UNLOCK(&so->so_rcv);
3118 goto restart;
3119 }
3120 SOCKBUF_UNLOCK(&so->so_rcv);
3121
3122 if (flagsp != NULL)
3123 *flagsp |= flags;
3124 release:
3125 return (error);
3126 }
3127
3128 int
3129 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
3130 struct mbuf **mp, struct mbuf **controlp, int *flagsp)
3131 {
3132 int error, flags;
3133
3134 if (psa != NULL)
3135 *psa = NULL;
3136 if (controlp != NULL)
3137 *controlp = NULL;
3138 if (flagsp != NULL) {
3139 flags = *flagsp;
3140 if ((flags & MSG_OOB) != 0)
3141 return (soreceive_rcvoob(so, uio, flags));
3142 } else {
3143 flags = 0;
3144 }
3145 if (mp != NULL)
3146 *mp = NULL;
3147 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
3148 (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) {
3149 VNET_SO_ASSERT(so);
3150 so->so_proto->pr_rcvd(so, 0);
3151 }
3152
3153 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
3154 if (error)
3155 return (error);
3156 error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp);
3157 SOCK_IO_RECV_UNLOCK(so);
3158 return (error);
3159 }
3160
3161 /*
3162 * Optimized version of soreceive() for stream (TCP) sockets.
3163 */
3164 static int
3165 soreceive_stream_locked(struct socket *so, struct sockbuf *sb,
3166 struct sockaddr **psa, struct uio *uio, struct mbuf **mp0,
3167 struct mbuf **controlp, int flags)
3168 {
3169 int len = 0, error = 0, oresid;
3170 struct mbuf *m, *n = NULL;
3171
3172 SOCK_IO_RECV_ASSERT_LOCKED(so);
3173
3174 /* Easy one, no space to copyout anything. */
3175 if (uio->uio_resid == 0)
3176 return (EINVAL);
3177 oresid = uio->uio_resid;
3178
3179 SOCKBUF_LOCK(sb);
3180 /* We will never ever get anything unless we are or were connected. */
3181 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
3182 error = ENOTCONN;
3183 goto out;
3184 }
3185
3186 restart:
3187 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3188
3189 /* Abort if socket has reported problems. */
3190 if (so->so_error) {
3191 if (sbavail(sb) > 0)
3192 goto deliver;
3193 if (oresid > uio->uio_resid)
3194 goto out;
3195 error = so->so_error;
3196 if (!(flags & MSG_PEEK))
3197 so->so_error = 0;
3198 goto out;
3199 }
3200
3201 /* Door is closed. Deliver what is left, if any. */
3202 if (sb->sb_state & SBS_CANTRCVMORE) {
3203 if (sbavail(sb) > 0)
3204 goto deliver;
3205 else
3206 goto out;
3207 }
3208
3209 /* Socket buffer is empty and we shall not block. */
3210 if (sbavail(sb) == 0 &&
3211 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
3212 error = EAGAIN;
3213 goto out;
3214 }
3215
3216 /* Socket buffer got some data that we shall deliver now. */
3217 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
3218 ((so->so_state & SS_NBIO) ||
3219 (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
3220 sbavail(sb) >= sb->sb_lowat ||
3221 sbavail(sb) >= uio->uio_resid ||
3222 sbavail(sb) >= sb->sb_hiwat) ) {
3223 goto deliver;
3224 }
3225
3226 /* On MSG_WAITALL we must wait until all data or error arrives. */
3227 if ((flags & MSG_WAITALL) &&
3228 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
3229 goto deliver;
3230
3231 /*
3232 * Wait and block until (more) data comes in.
3233 * NB: Drops the sockbuf lock during wait.
3234 */
3235 error = sbwait(so, SO_RCV);
3236 if (error)
3237 goto out;
3238 goto restart;
3239
3240 deliver:
3241 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3242 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
3243 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
3244
3245 /* Statistics. */
3246 if (uio->uio_td)
3247 uio->uio_td->td_ru.ru_msgrcv++;
3248
3249 /* Fill uio until full or current end of socket buffer is reached. */
3250 len = min(uio->uio_resid, sbavail(sb));
3251 if (mp0 != NULL) {
3252 /* Dequeue as many mbufs as possible. */
3253 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
3254 if (*mp0 == NULL)
3255 *mp0 = sb->sb_mb;
3256 else
3257 m_cat(*mp0, sb->sb_mb);
3258 for (m = sb->sb_mb;
3259 m != NULL && m->m_len <= len;
3260 m = m->m_next) {
3261 KASSERT(!(m->m_flags & M_NOTAVAIL),
3262 ("%s: m %p not available", __func__, m));
3263 len -= m->m_len;
3264 uio->uio_resid -= m->m_len;
3265 sbfree(sb, m);
3266 n = m;
3267 }
3268 n->m_next = NULL;
3269 sb->sb_mb = m;
3270 sb->sb_lastrecord = sb->sb_mb;
3271 if (sb->sb_mb == NULL)
3272 SB_EMPTY_FIXUP(sb);
3273 }
3274 /* Copy the remainder. */
3275 if (len > 0) {
3276 KASSERT(sb->sb_mb != NULL,
3277 ("%s: len > 0 && sb->sb_mb empty", __func__));
3278
3279 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
3280 if (m == NULL)
3281 len = 0; /* Don't flush data from sockbuf. */
3282 else
3283 uio->uio_resid -= len;
3284 if (*mp0 != NULL)
3285 m_cat(*mp0, m);
3286 else
3287 *mp0 = m;
3288 if (*mp0 == NULL) {
3289 error = ENOBUFS;
3290 goto out;
3291 }
3292 }
3293 } else {
3294 /* NB: Must unlock socket buffer as uiomove may sleep. */
3295 SOCKBUF_UNLOCK(sb);
3296 error = m_mbuftouio(uio, sb->sb_mb, len);
3297 SOCKBUF_LOCK(sb);
3298 if (error)
3299 goto out;
3300 }
3301 SBLASTRECORDCHK(sb);
3302 SBLASTMBUFCHK(sb);
3303
3304 /*
3305 * Remove the delivered data from the socket buffer unless we
3306 * were only peeking.
3307 */
3308 if (!(flags & MSG_PEEK)) {
3309 if (len > 0)
3310 sbdrop_locked(sb, len);
3311
3312 /* Notify protocol that we drained some data. */
3313 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
3314 (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
3315 !(flags & MSG_SOCALLBCK))) {
3316 SOCKBUF_UNLOCK(sb);
3317 VNET_SO_ASSERT(so);
3318 so->so_proto->pr_rcvd(so, flags);
3319 SOCKBUF_LOCK(sb);
3320 }
3321 }
3322
3323 /*
3324 * For MSG_WAITALL we may have to loop again and wait for
3325 * more data to come in.
3326 */
3327 if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
3328 goto restart;
3329 out:
3330 SBLASTRECORDCHK(sb);
3331 SBLASTMBUFCHK(sb);
3332 SOCKBUF_UNLOCK(sb);
3333 return (error);
3334 }
3335
3336 int
3337 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
3338 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3339 {
3340 struct sockbuf *sb;
3341 int error, flags;
3342
3343 sb = &so->so_rcv;
3344
3345 /* We only do stream sockets. */
3346 if (so->so_type != SOCK_STREAM)
3347 return (EINVAL);
3348 if (psa != NULL)
3349 *psa = NULL;
3350 if (flagsp != NULL)
3351 flags = *flagsp & ~MSG_EOR;
3352 else
3353 flags = 0;
3354 if (controlp != NULL)
3355 *controlp = NULL;
3356 if (flags & MSG_OOB)
3357 return (soreceive_rcvoob(so, uio, flags));
3358 if (mp0 != NULL)
3359 *mp0 = NULL;
3360
3361 #ifdef KERN_TLS
3362 /*
3363 * KTLS store TLS records as records with a control message to
3364 * describe the framing.
3365 *
3366 * We check once here before acquiring locks to optimize the
3367 * common case.
3368 */
3369 if (sb->sb_tls_info != NULL)
3370 return (soreceive_generic(so, psa, uio, mp0, controlp,
3371 flagsp));
3372 #endif
3373
3374 /*
3375 * Prevent other threads from reading from the socket. This lock may be
3376 * dropped in order to sleep waiting for data to arrive.
3377 */
3378 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
3379 if (error)
3380 return (error);
3381 #ifdef KERN_TLS
3382 if (__predict_false(sb->sb_tls_info != NULL)) {
3383 SOCK_IO_RECV_UNLOCK(so);
3384 return (soreceive_generic(so, psa, uio, mp0, controlp,
3385 flagsp));
3386 }
3387 #endif
3388 error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags);
3389 SOCK_IO_RECV_UNLOCK(so);
3390 return (error);
3391 }
3392
3393 /*
3394 * Optimized version of soreceive() for simple datagram cases from userspace.
3395 * Unlike in the stream case, we're able to drop a datagram if copyout()
3396 * fails, and because we handle datagrams atomically, we don't need to use a
3397 * sleep lock to prevent I/O interlacing.
3398 */
3399 int
3400 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
3401 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3402 {
3403 struct mbuf *m, *m2;
3404 int flags, error;
3405 ssize_t len;
3406 struct protosw *pr = so->so_proto;
3407 struct mbuf *nextrecord;
3408
3409 if (psa != NULL)
3410 *psa = NULL;
3411 if (controlp != NULL)
3412 *controlp = NULL;
3413 if (flagsp != NULL)
3414 flags = *flagsp &~ MSG_EOR;
3415 else
3416 flags = 0;
3417
3418 /*
3419 * For any complicated cases, fall back to the full
3420 * soreceive_generic().
3421 */
3422 if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
3423 return (soreceive_generic(so, psa, uio, mp0, controlp,
3424 flagsp));
3425
3426 /*
3427 * Enforce restrictions on use.
3428 */
3429 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
3430 ("soreceive_dgram: wantrcvd"));
3431 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
3432 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
3433 ("soreceive_dgram: SBS_RCVATMARK"));
3434 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
3435 ("soreceive_dgram: P_CONNREQUIRED"));
3436
3437 /*
3438 * Loop blocking while waiting for a datagram.
3439 */
3440 SOCKBUF_LOCK(&so->so_rcv);
3441 while ((m = so->so_rcv.sb_mb) == NULL) {
3442 KASSERT(sbavail(&so->so_rcv) == 0,
3443 ("soreceive_dgram: sb_mb NULL but sbavail %u",
3444 sbavail(&so->so_rcv)));
3445 if (so->so_error) {
3446 error = so->so_error;
3447 so->so_error = 0;
3448 SOCKBUF_UNLOCK(&so->so_rcv);
3449 return (error);
3450 }
3451 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
3452 uio->uio_resid == 0) {
3453 SOCKBUF_UNLOCK(&so->so_rcv);
3454 return (0);
3455 }
3456 if ((so->so_state & SS_NBIO) ||
3457 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3458 SOCKBUF_UNLOCK(&so->so_rcv);
3459 return (EWOULDBLOCK);
3460 }
3461 SBLASTRECORDCHK(&so->so_rcv);
3462 SBLASTMBUFCHK(&so->so_rcv);
3463 error = sbwait(so, SO_RCV);
3464 if (error) {
3465 SOCKBUF_UNLOCK(&so->so_rcv);
3466 return (error);
3467 }
3468 }
3469 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3470
3471 if (uio->uio_td)
3472 uio->uio_td->td_ru.ru_msgrcv++;
3473 SBLASTRECORDCHK(&so->so_rcv);
3474 SBLASTMBUFCHK(&so->so_rcv);
3475 nextrecord = m->m_nextpkt;
3476 if (nextrecord == NULL) {
3477 KASSERT(so->so_rcv.sb_lastrecord == m,
3478 ("soreceive_dgram: lastrecord != m"));
3479 }
3480
3481 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
3482 ("soreceive_dgram: m_nextpkt != nextrecord"));
3483
3484 /*
3485 * Pull 'm' and its chain off the front of the packet queue.
3486 */
3487 so->so_rcv.sb_mb = NULL;
3488 sockbuf_pushsync(&so->so_rcv, nextrecord);
3489
3490 /*
3491 * Walk 'm's chain and free that many bytes from the socket buffer.
3492 */
3493 for (m2 = m; m2 != NULL; m2 = m2->m_next)
3494 sbfree(&so->so_rcv, m2);
3495
3496 /*
3497 * Do a few last checks before we let go of the lock.
3498 */
3499 SBLASTRECORDCHK(&so->so_rcv);
3500 SBLASTMBUFCHK(&so->so_rcv);
3501 SOCKBUF_UNLOCK(&so->so_rcv);
3502
3503 if (pr->pr_flags & PR_ADDR) {
3504 KASSERT(m->m_type == MT_SONAME,
3505 ("m->m_type == %d", m->m_type));
3506 if (psa != NULL)
3507 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
3508 M_NOWAIT);
3509 m = m_free(m);
3510 }
3511 if (m == NULL) {
3512 /* XXXRW: Can this happen? */
3513 return (0);
3514 }
3515
3516 /*
3517 * Packet to copyout() is now in 'm' and it is disconnected from the
3518 * queue.
3519 *
3520 * Process one or more MT_CONTROL mbufs present before any data mbufs
3521 * in the first mbuf chain on the socket buffer. We call into the
3522 * protocol to perform externalization (or freeing if controlp ==
3523 * NULL). In some cases there can be only MT_CONTROL mbufs without
3524 * MT_DATA mbufs.
3525 */
3526 if (m->m_type == MT_CONTROL) {
3527 struct mbuf *cm = NULL, *cmn;
3528 struct mbuf **cme = &cm;
3529
3530 do {
3531 m2 = m->m_next;
3532 m->m_next = NULL;
3533 *cme = m;
3534 cme = &(*cme)->m_next;
3535 m = m2;
3536 } while (m != NULL && m->m_type == MT_CONTROL);
3537 while (cm != NULL) {
3538 cmn = cm->m_next;
3539 cm->m_next = NULL;
3540 if (pr->pr_domain->dom_externalize != NULL) {
3541 error = (*pr->pr_domain->dom_externalize)
3542 (cm, controlp, flags);
3543 } else if (controlp != NULL)
3544 *controlp = cm;
3545 else
3546 m_freem(cm);
3547 if (controlp != NULL) {
3548 while (*controlp != NULL)
3549 controlp = &(*controlp)->m_next;
3550 }
3551 cm = cmn;
3552 }
3553 }
3554 KASSERT(m == NULL || m->m_type == MT_DATA,
3555 ("soreceive_dgram: !data"));
3556 while (m != NULL && uio->uio_resid > 0) {
3557 len = uio->uio_resid;
3558 if (len > m->m_len)
3559 len = m->m_len;
3560 error = uiomove(mtod(m, char *), (int)len, uio);
3561 if (error) {
3562 m_freem(m);
3563 return (error);
3564 }
3565 if (len == m->m_len)
3566 m = m_free(m);
3567 else {
3568 m->m_data += len;
3569 m->m_len -= len;
3570 }
3571 }
3572 if (m != NULL) {
3573 flags |= MSG_TRUNC;
3574 m_freem(m);
3575 }
3576 if (flagsp != NULL)
3577 *flagsp |= flags;
3578 return (0);
3579 }
3580
3581 int
3582 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3583 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3584 {
3585 int error;
3586
3587 CURVNET_SET(so->so_vnet);
3588 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp);
3589 CURVNET_RESTORE();
3590 return (error);
3591 }
3592
/*
 * shutdown(2) implementation: disable receives and/or sends on a socket.
 *
 * 'how' is SHUT_RD, SHUT_WR or SHUT_RDWR.  Returns 0 or an errno value;
 * ENOTCONN may be forced for POSIX compliance even though the protocol
 * shutdown was still performed (see comment below).
 */
int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr;
	int error, soerror_enotconn;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	soerror_enotconn = 0;
	SOCK_LOCK(so);
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
		/*
		 * POSIX mandates that we return ENOTCONN when shutdown(2) is
		 * invoked on a datagram socket; however, historically we
		 * would actually tear the socket down.  This is known to be
		 * leveraged by some applications to unblock a process
		 * waiting in recvXXX(2) by another process that shares the
		 * socket.  Try to meet both the backward-compatibility and
		 * POSIX requirements by forcing ENOTCONN but still asking
		 * the protocol to perform pru_shutdown().
		 */
		if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) {
			SOCK_UNLOCK(so);
			return (ENOTCONN);
		}
		soerror_enotconn = 1;
	}

	if (SOLISTENING(so)) {
		if (how != SHUT_WR) {
			/* Error out pending accepts. */
			so->so_error = ECONNABORTED;
			solisten_wakeup(so);	/* unlocks so */
		} else {
			SOCK_UNLOCK(so);
		}
		goto done;
	}
	SOCK_UNLOCK(so);

	CURVNET_SET(so->so_vnet);
	pr = so->so_proto;
	if (pr->pr_flush != NULL)
		pr->pr_flush(so, how);
	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD) {
		error = pr->pr_shutdown(so);
		wakeup(&so->so_timeo);
		CURVNET_RESTORE();
		return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
	}
	wakeup(&so->so_timeo);
	CURVNET_RESTORE();

done:
	return (soerror_enotconn ? ENOTCONN : 0);
}
3651
/*
 * Flush and release the contents of a socket's receive buffer, e.g. on
 * shutdown of the read side.  Marks the socket unable to receive first
 * so that blocked readers are dislodged.
 */
void
sorflush(struct socket *so)
{
	struct protosw *pr;
	int error;

	VNET_SO_ASSERT(so);

	/*
	 * Dislodge threads currently blocked in receive and wait to acquire
	 * a lock against other simultaneous readers before clearing the
	 * socket buffer.  Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptable waiting.
	 */
	socantrcvmore(so);

	error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
	if (error != 0) {
		/*
		 * With SBL_WAIT | SBL_NOINTR the acquire can only fail on a
		 * listening socket, which has no receive buffer to flush.
		 */
		KASSERT(SOLISTENING(so),
		    ("%s: soiolock(%p) failed", __func__, so));
		return;
	}

	pr = so->so_proto;
	if (pr->pr_flags & PR_RIGHTS) {
		/*
		 * Buffered data may carry rights (e.g. descriptors); let
		 * the domain dispose of them.  NOTE(review): only the else
		 * branch drops the receive I/O lock here, so dom_dispose
		 * presumably takes ownership of it — confirm against the
		 * domain implementation (e.g. unp_dispose()).
		 */
		MPASS(pr->pr_domain->dom_dispose != NULL);
		(*pr->pr_domain->dom_dispose)(so);
	} else {
		sbrelease(so, SO_RCV);
		SOCK_IO_RECV_UNLOCK(so);
	}

}
3685
3686 /*
3687 * Wrapper for Socket established helper hook.
3688 * Parameters: socket, context of the hook point, hook id.
3689 */
3690 static int inline
3691 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
3692 {
3693 struct socket_hhook_data hhook_data = {
3694 .so = so,
3695 .hctx = hctx,
3696 .m = NULL,
3697 .status = 0
3698 };
3699
3700 CURVNET_SET(so->so_vnet);
3701 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
3702 CURVNET_RESTORE();
3703
3704 /* Ugly but needed, since hhooks return void for now */
3705 return (hhook_data.status);
3706 }
3707
3708 /*
3709 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
3710 * additional variant to handle the case where the option value needs to be
3711 * some kind of integer, but not a specific size. In addition to their use
3712 * here, these functions are also called by the protocol-level pr_ctloutput()
3713 * routines.
3714 */
3715 int
3716 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3717 {
3718 size_t valsize;
3719
3720 /*
3721 * If the user gives us more than we wanted, we ignore it, but if we
3722 * don't get the minimum length the caller wants, we return EINVAL.
3723 * On success, sopt->sopt_valsize is set to however much we actually
3724 * retrieved.
3725 */
3726 if ((valsize = sopt->sopt_valsize) < minlen)
3727 return EINVAL;
3728 if (valsize > len)
3729 sopt->sopt_valsize = valsize = len;
3730
3731 if (sopt->sopt_td != NULL)
3732 return (copyin(sopt->sopt_val, buf, valsize));
3733
3734 bcopy(sopt->sopt_val, buf, valsize);
3735 return (0);
3736 }
3737
3738 /*
3739 * Kernel version of setsockopt(2).
3740 *
3741 * XXX: optlen is size_t, not socklen_t
3742 */
3743 int
3744 so_setsockopt(struct socket *so, int level, int optname, void *optval,
3745 size_t optlen)
3746 {
3747 struct sockopt sopt;
3748
3749 sopt.sopt_level = level;
3750 sopt.sopt_name = optname;
3751 sopt.sopt_dir = SOPT_SET;
3752 sopt.sopt_val = optval;
3753 sopt.sopt_valsize = optlen;
3754 sopt.sopt_td = NULL;
3755 return (sosetopt(so, &sopt));
3756 }
3757
/*
 * Socket-layer setsockopt(2) back-end, used by both the system call and
 * the kernel wrapper so_setsockopt().  Options at a level other than
 * SOL_SOCKET are forwarded to the protocol's pr_ctloutput().  SOL_SOCKET
 * options are handled here; on success the protocol is also notified via
 * pr_ctloutput() for its information (that result is deliberately
 * ignored).  Returns 0 or an errno value.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	sbintime_t val, *valp;
	uint32_t val32;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL)
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		else
			error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_setopt(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;
			/* Reject linger values outside the usable range. */
			if (l.l_linger < 0 ||
			    l.l_linger > USHRT_MAX ||
			    l.l_linger > (INT_MAX / hz)) {
				error = EDOM;
				goto bad;
			}
			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_RERROR:
			/*
			 * Simple boolean options: the option name doubles
			 * as its bit in so_options.
			 */
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SETFIB:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			if (optval < 0 || optval >= rt_numfibs) {
				error = EINVAL;
				goto bad;
			}
			/* FIBs only apply to families that use routing. */
			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
			    (so->so_proto->pr_domain->dom_family == PF_INET6) ||
			    (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
				so->so_fibnum = optval;
			else
				so->so_fibnum = 0;
			break;

		case SO_USER_COOKIE:
			error = sooptcopyin(sopt, &val32, sizeof val32,
			    sizeof val32);
			if (error)
				goto bad;
			so->so_user_cookie = val32;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			/* Buffer sizing is delegated to the protocol. */
			error = so->so_proto->pr_setsbopt(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
				error = sooptcopyin(sopt, &tv, sizeof tv,
				    sizeof tv);
			if (error)
				goto bad;
			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
			    tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* Clamp overly large timeouts to the sbintime max. */
			if (tv.tv_sec > INT32_MAX)
				val = SBT_MAX;
			else
				val = tvtosbt(tv);
			SOCK_LOCK(so);
			valp = sopt->sopt_name == SO_SNDTIMEO ?
			    (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
			    &so->so_snd.sb_timeo) :
			    (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
			    &so->so_rcv.sb_timeo);
			*valp = val;
			SOCK_UNLOCK(so);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_TS_CLOCK:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_ts_clock = optval;
			break;

		case SO_MAX_PACING_RATE:
			error = sooptcopyin(sopt, &val32, sizeof(val32),
			    sizeof(val32));
			if (error)
				goto bad;
			so->so_max_pacing_rate = val32;
			break;

		case SO_SPLICE: {
			struct splice splice;

#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct splice32 splice32;

				error = sooptcopyin(sopt, &splice32,
				    sizeof(splice32), sizeof(splice32));
				if (error == 0) {
					splice.sp_fd = splice32.sp_fd;
					splice.sp_max = splice32.sp_max;
					CP(splice32.sp_idle, splice.sp_idle,
					    tv_sec);
					CP(splice32.sp_idle, splice.sp_idle,
					    tv_usec);
				}
			} else
#endif
			{
				error = sooptcopyin(sopt, &splice,
				    sizeof(splice), sizeof(splice));
			}
			if (error)
				goto bad;
#ifdef KTRACE
			if (KTRPOINT(curthread, KTR_STRUCT))
				ktrsplice(&splice);
#endif

			error = splice_init();
			if (error != 0)
				goto bad;

			/* A negative descriptor requests an unsplice. */
			if (splice.sp_fd >= 0) {
				struct file *fp;
				struct socket *so2;

				if (!cap_rights_contains(sopt->sopt_rights,
				    &cap_recv_rights)) {
					error = ENOTCAPABLE;
					goto bad;
				}
				error = getsock(sopt->sopt_td, splice.sp_fd,
				    &cap_send_rights, &fp);
				if (error != 0)
					goto bad;
				so2 = fp->f_data;

				error = so_splice(so, so2, &splice);
				fdrop(fp, sopt->sopt_td);
			} else {
				error = so_unsplice(so, false);
			}
			break;
		}
		default:
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
				error = ENOPROTOOPT;
			break;
		}
		/* Let the protocol see the option too; ignore its verdict. */
		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
	}
bad:
	CURVNET_RESTORE();
	return (error);
}
4005
4006 /*
4007 * Helper routine for getsockopt.
4008 */
4009 int
4010 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
4011 {
4012 int error;
4013 size_t valsize;
4014
4015 error = 0;
4016
4017 /*
4018 * Documented get behavior is that we always return a value, possibly
4019 * truncated to fit in the user's buffer. Traditional behavior is
4020 * that we always tell the user precisely how much we copied, rather
4021 * than something useful like the total amount we had available for
4022 * her. Note that this interface is not idempotent; the entire
4023 * answer must be generated ahead of time.
4024 */
4025 valsize = min(len, sopt->sopt_valsize);
4026 sopt->sopt_valsize = valsize;
4027 if (sopt->sopt_val != NULL) {
4028 if (sopt->sopt_td != NULL)
4029 error = copyout(buf, sopt->sopt_val, valsize);
4030 else
4031 bcopy(buf, sopt->sopt_val, valsize);
4032 }
4033 return (error);
4034 }
4035
/*
 * Socket-layer getsockopt(2) back-end.  Options at a level other than
 * SOL_SOCKET are forwarded to the protocol's pr_ctloutput(); SOL_SOCKET
 * options are answered here.  Returns 0 or an errno value.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL)
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		else
			error = ENOPROTOOPT;
		CURVNET_RESTORE();
		return (error);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_getopt(so, sopt);
			break;

		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_RERROR:
			/* Boolean options mirror their bit in so_options. */
			optval = so->so_options & sopt->sopt_name;
/* Shared exit for every case that answers with an int in 'optval'. */
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_DOMAIN:
			optval = so->so_proto->pr_domain->dom_family;
			goto integer;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_PROTOCOL:
			optval = so->so_proto->pr_protocol;
			goto integer;

		case SO_ERROR:
			/* Reading the error clears it (send errors first). */
			SOCK_LOCK(so);
			if (so->so_error) {
				optval = so->so_error;
				so->so_error = 0;
			} else {
				optval = so->so_rerror;
				so->so_rerror = 0;
			}
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
			    so->so_snd.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
			    so->so_rcv.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
			    so->so_snd.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
			    so->so_rcv.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			SOCK_LOCK(so);
			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
			    (SOLISTENING(so) ? so->sol_sbsnd_timeo :
			    so->so_snd.sb_timeo) :
			    (SOLISTENING(so) ? so->sol_sbrcv_timeo :
			    so->so_rcv.sb_timeo));
			SOCK_UNLOCK(so);
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				goto bad;
			/* Large answers are malloc()'d; this one fits. */
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_LISTENQLIMIT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENINCQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_TS_CLOCK:
			optval = so->so_ts_clock;
			goto integer;

		case SO_MAX_PACING_RATE:
			optval = so->so_max_pacing_rate;
			goto integer;

		case SO_SPLICE: {
			off_t n;

			/*
			 * Acquire the I/O lock to serialize with
			 * so_splice_xfer().  This is not required for
			 * correctness, but makes testing simpler: once a byte
			 * has been transmitted to the sink and observed (e.g.,
			 * by reading from the socket to which the sink is
			 * connected), a subsequent getsockopt(SO_SPLICE) will
			 * return an up-to-date value.
			 */
			error = SOCK_IO_RECV_LOCK(so, SBL_WAIT);
			if (error != 0)
				goto bad;
			SOCK_LOCK(so);
			if (SOLISTENING(so)) {
				n = 0;
			} else {
				n = so->so_splice_sent;
			}
			SOCK_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			error = sooptcopyout(sopt, &n, sizeof(n));
			break;
		}

		default:
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
				error = ENOPROTOOPT;
			break;
		}
	}
bad:
	CURVNET_RESTORE();
	return (error);
}
4261
4262 int
4263 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
4264 {
4265 struct mbuf *m, *m_prev;
4266 int sopt_size = sopt->sopt_valsize;
4267
4268 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
4269 if (m == NULL)
4270 return ENOBUFS;
4271 if (sopt_size > MLEN) {
4272 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
4273 if ((m->m_flags & M_EXT) == 0) {
4274 m_free(m);
4275 return ENOBUFS;
4276 }
4277 m->m_len = min(MCLBYTES, sopt_size);
4278 } else {
4279 m->m_len = min(MLEN, sopt_size);
4280 }
4281 sopt_size -= m->m_len;
4282 *mp = m;
4283 m_prev = m;
4284
4285 while (sopt_size) {
4286 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
4287 if (m == NULL) {
4288 m_freem(*mp);
4289 return ENOBUFS;
4290 }
4291 if (sopt_size > MLEN) {
4292 MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
4293 M_NOWAIT);
4294 if ((m->m_flags & M_EXT) == 0) {
4295 m_freem(m);
4296 m_freem(*mp);
4297 return ENOBUFS;
4298 }
4299 m->m_len = min(MCLBYTES, sopt_size);
4300 } else {
4301 m->m_len = min(MLEN, sopt_size);
4302 }
4303 sopt_size -= m->m_len;
4304 m_prev->m_next = m;
4305 m_prev = m;
4306 }
4307 return (0);
4308 }
4309
4310 int
4311 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
4312 {
4313 struct mbuf *m0 = m;
4314
4315 if (sopt->sopt_val == NULL)
4316 return (0);
4317 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
4318 if (sopt->sopt_td != NULL) {
4319 int error;
4320
4321 error = copyin(sopt->sopt_val, mtod(m, char *),
4322 m->m_len);
4323 if (error != 0) {
4324 m_freem(m0);
4325 return(error);
4326 }
4327 } else
4328 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
4329 sopt->sopt_valsize -= m->m_len;
4330 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
4331 m = m->m_next;
4332 }
4333 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
4334 panic("ip6_sooptmcopyin");
4335 return (0);
4336 }
4337
/*
 * Copy a socket option value out of the mbuf chain m into the buffer
 * described by sopt, one mbuf at a time, using copyout() for user-space
 * destinations (sopt_td non-NULL) or bcopy() for kernel buffers.  On
 * success sopt_valsize is set to the number of bytes actually copied.
 * If the destination buffer is too small to hold the whole chain, the
 * chain is freed and EINVAL returned; copyout() errors also free the
 * chain.  On success the chain is left intact for the caller.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;	/* chain head, kept for m_freem() on error */
	size_t valsize = 0;	/* running total of bytes copied out */

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
4371
4372 /*
4373 * sohasoutofband(): protocol notifies socket layer of the arrival of new
4374 * out-of-band data, which will then notify socket consumers.
4375 */
void
sohasoutofband(struct socket *so)
{

	/* Deliver SIGURG to the owner registered via so_sigio, if any. */
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	/* Wake select/poll waiters on the read side at socket priority. */
	selwakeuppri(&so->so_rdsel, PSOCK);
}
4384
/*
 * Poll entry point: dispatch to the protocol's pr_sopoll method
 * (normally sopoll_generic()).
 */
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{

	/*
	 * We do not need to set or assert curvnet as long as everyone uses
	 * sopoll_generic().
	 */
	return (so->so_proto->pr_sopoll(so, events, active_cred, td));
}
4396
/*
 * Generic poll implementation.  A listening socket reports readability
 * when its completed-connection queue is non-empty (or an error is
 * pending); a data socket is checked against its send and receive
 * buffers with both buffer locks held (send taken before receive).
 * Returns the subset of 'events' that are currently true; records the
 * thread for wakeup via selrecord() when nothing is ready.
 */
int
sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents;

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		if (!(events & (POLLIN | POLLRDNORM)))
			revents = 0;
		else if (!TAILQ_EMPTY(&so->sol_comp))
			revents = events & (POLLIN | POLLRDNORM);
		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
		else {
			selrecord(td, &so->so_rdsel);
			revents = 0;
		}
	} else {
		revents = 0;
		SOCK_SENDBUF_LOCK(so);
		SOCK_RECVBUF_LOCK(so);
		/* Spliced sockets are drained internally, not by readers. */
		if (events & (POLLIN | POLLRDNORM))
			if (soreadabledata(so) && !isspliced(so))
				revents |= events & (POLLIN | POLLRDNORM);
		if (events & (POLLOUT | POLLWRNORM))
			if (sowriteable(so) && !issplicedback(so))
				revents |= events & (POLLOUT | POLLWRNORM);
		if (events & (POLLPRI | POLLRDBAND))
			if (so->so_oobmark ||
			    (so->so_rcv.sb_state & SBS_RCVATMARK))
				revents |= events & (POLLPRI | POLLRDBAND);
		/* POLLINIGNEOF suppresses the EOF-implies-readable rule. */
		if ((events & POLLINIGNEOF) == 0) {
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
				revents |= events & (POLLIN | POLLRDNORM);
				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
					revents |= POLLHUP;
			}
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			revents |= events & POLLRDHUP;
		if (revents == 0) {
			/* Nothing ready: register for wakeup on each side. */
			if (events &
			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
				selrecord(td, &so->so_rdsel);
				so->so_rcv.sb_flags |= SB_SEL;
			}
			if (events & (POLLOUT | POLLWRNORM)) {
				selrecord(td, &so->so_wrsel);
				so->so_snd.sb_flags |= SB_SEL;
			}
		}
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_SENDBUF_UNLOCK(so);
	}
	SOCK_UNLOCK(so);
	return (revents);
}
4455
/*
 * Attach a knote to a socket.  Selects the filter ops, knlist and
 * socket buffer from the filter type (EVFILT_READ -> receive side,
 * EVFILT_WRITE/EVFILT_EMPTY -> send side), then registers the knote.
 * For a non-listening socket the matching buffer lock is held while
 * adding and SB_KNOTE is set so buffer events are propagated.
 * Returns EINVAL for unsupported filter types.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;
	sb_which which;
	struct knlist *knl;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		knl = &so->so_rdsel.si_note;
		sb = &so->so_rcv;
		which = SO_RCV;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		which = SO_SND;
		break;
	case EVFILT_EMPTY:
		kn->kn_fop = &soempty_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		which = SO_SND;
		break;
	default:
		return (EINVAL);
	}

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		knlist_add(knl, kn, 1);
	} else {
		SOCK_BUF_LOCK(so, which);
		knlist_add(knl, kn, 1);
		sb->sb_flags |= SB_KNOTE;
		SOCK_BUF_UNLOCK(so, which);
	}
	SOCK_UNLOCK(so);
	return (0);
}
4499
/*
 * Detach a read knote from the socket; clears SB_KNOTE on the receive
 * buffer once the last knote is gone (non-listening sockets only).
 */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	so_rdknl_lock(so);
	knlist_remove(&so->so_rdsel.si_note, kn, 1);
	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	so_rdknl_unlock(so);
}
4511
/*
 * EVFILT_READ filter.  For a listening socket, kn_data is the completed
 * queue length and the event fires when a connection is ready (or an
 * error is pending, reported via EV_EOF/kn_fflags).  For a data socket,
 * kn_data is the readable byte count (excluding control data); fires on
 * EOF, pending error, or when available data reaches the low-water mark
 * (or NOTE_LOWAT's kn_sdata, if requested).  Spliced sockets never
 * report readable here.  Returns non-zero when the event is active.
 */
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;

	if (SOLISTENING(so)) {
		SOCK_LOCK_ASSERT(so);
		kn->kn_data = so->sol_qlen;
		if (so->so_error) {
			kn->kn_flags |= EV_EOF;
			kn->kn_fflags = so->so_error;
			return (1);
		}
		return (!TAILQ_EMPTY(&so->sol_comp));
	}

	/* Spliced data is consumed internally; never readable to users. */
	if ((so->so_rcv.sb_flags & SB_SPLICED) != 0)
		return (0);

	SOCK_RECVBUF_LOCK_ASSERT(so);

	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error || so->so_rerror)
		return (1);

	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_data >= kn->kn_sdata)
			return (1);
	} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
		return (1);

	/* This hook returning non-zero indicates an event, not error */
	return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
}
4553
/*
 * Detach a write knote from the socket; clears SB_KNOTE on the send
 * buffer once the last knote is gone (non-listening sockets only).
 */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	so_wrknl_lock(so);
	knlist_remove(&so->so_wrsel.si_note, kn, 1);
	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	so_wrknl_unlock(so);
}
4565
/*
 * EVFILT_WRITE filter.  kn_data is the free space in the send buffer.
 * Fires on send-side EOF (EV_EOF, error in kn_fflags) or a pending
 * socket error; never fires on a listening socket or before a
 * connection-requiring protocol is connected.  Otherwise fires when
 * free space reaches NOTE_LOWAT's kn_sdata or the buffer's low-water
 * mark.
 */
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;

	if (SOLISTENING(so))
		return (0);

	SOCK_SENDBUF_LOCK_ASSERT(so);
	kn->kn_data = sbspace(&so->so_snd);

	hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);

	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}
4596
4597 static int
4598 filt_soempty(struct knote *kn, long hint)
4599 {
4600 struct socket *so;
4601
4602 so = kn->kn_fp->f_data;
4603
4604 if (SOLISTENING(so))
4605 return (1);
4606
4607 SOCK_SENDBUF_LOCK_ASSERT(so);
4608 kn->kn_data = sbused(&so->so_snd);
4609
4610 if (kn->kn_data == 0)
4611 return (1);
4612 else
4613 return (0);
4614 }
4615
4616 int
4617 socheckuid(struct socket *so, uid_t uid)
4618 {
4619
4620 if (so == NULL)
4621 return (EPERM);
4622 if (so->so_cred->cr_uid != uid)
4623 return (EPERM);
4624 return (0);
4625 }
4626
4627 /*
4628 * These functions are used by protocols to notify the socket layer (and its
4629 * consumers) of state changes in the sockets driven by protocol-side events.
4630 */
4631
4632 /*
4633 * Procedures to manipulate state flags of socket and do appropriate wakeups.
4634 *
4635 * Normal sequence from the active (originating) side is that
4636 * soisconnecting() is called during processing of connect() call, resulting
4637 * in an eventual call to soisconnected() if/when the connection is
4638 * established. When the connection is torn down soisdisconnecting() is
4639 * called during processing of disconnect() call, and soisdisconnected() is
4640 * called when the connection to the peer is totally severed. The semantics
4641 * of these routines are such that connectionless protocols can call
4642 * soisconnected() and soisdisconnected() only, bypassing the in-progress
4643 * calls when setting up a ``connection'' takes no time.
4644 *
4645 * From the passive side, a socket is created with two queues of sockets:
4646 * so_incomp for connections in progress and so_comp for connections already
4647 * made and awaiting user acceptance. As a protocol is preparing incoming
4648 * connections, it creates a socket structure queued on so_incomp by calling
4649 * sonewconn(). When the connection is established, soisconnected() is
4650 * called, and transfers the socket structure to so_comp, making it available
4651 * to accept().
4652 *
4653 * If a socket is closed with sockets on either so_incomp or so_comp, these
4654 * sockets are dropped.
4655 *
4656 * If higher-level protocols are implemented in the kernel, the wakeups done
4657 * here will sometimes cause software-interrupt process scheduling.
4658 */
/*
 * Mark the socket as having a connection in progress, clearing any
 * stale connected/disconnecting state.
 */
void
soisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
	SOCK_UNLOCK(so);
}
4668
/*
 * Mark the socket connected.  If it sits on a listener's incomplete
 * queue, promote it to the complete queue and wake accept(2) waiters --
 * unless an accept filter is installed, in which case the filter's
 * callback is armed first and promotion deferred until it reports
 * SU_ISCONNECTED.  For an ordinary socket, wake connect(2) sleepers and
 * both socket buffers.
 */
void
soisconnected(struct socket *so)
{
	bool last __diagused;

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	if (so->so_qstate == SQ_INCOMP) {
		struct socket *head = so->so_listen;
		int ret;

		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
		/*
		 * Promoting a socket from incomplete queue to complete, we
		 * need to go through reverse order of locking.  We first do
		 * trylock, and if that doesn't succeed, we go the hard way
		 * leaving a reference and rechecking consistency after proper
		 * locking.
		 */
		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
			soref(head);
			SOCK_UNLOCK(so);
			SOLISTEN_LOCK(head);
			SOCK_LOCK(so);
			if (__predict_false(head != so->so_listen)) {
				/*
				 * The socket went off the listen queue,
				 * should be lost race to close(2) of sol.
				 * The socket is about to soabort().
				 */
				SOCK_UNLOCK(so);
				sorele_locked(head);
				return;
			}
			/* Drop the temporary self-reference taken above. */
			last = refcount_release(&head->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, head));
		}
again:
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
			head->sol_incqlen--;
			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
			head->sol_qlen++;
			so->so_qstate = SQ_COMP;
			SOCK_UNLOCK(so);
			solisten_wakeup(head);	/* unlocks */
		} else {
			/* Arm the accept filter as the receive upcall. */
			SOCK_RECVBUF_LOCK(so);
			soupcall_set(so, SO_RCV,
			    head->sol_accept_filter->accf_callback,
			    head->sol_accept_filter_arg);
			so->so_options &= ~SO_ACCEPTFILTER;
			ret = head->sol_accept_filter->accf_callback(so,
			    head->sol_accept_filter_arg, M_NOWAIT);
			if (ret == SU_ISCONNECTED) {
				/* Filter is satisfied: promote immediately. */
				soupcall_clear(so, SO_RCV);
				SOCK_RECVBUF_UNLOCK(so);
				goto again;
			}
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_UNLOCK(so);
			SOLISTEN_UNLOCK(head);
		}
		return;
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}
4742
/*
 * Mark the socket as disconnecting: no further data may be sent or
 * received, and sleepers on the connection state are woken.
 */
void
soisdisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	if (!SOLISTENING(so)) {
		SOCK_RECVBUF_LOCK(so);
		socantrcvmore_locked(so);
		SOCK_SENDBUF_LOCK(so);
		socantsendmore_locked(so);
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}
4760
/*
 * Mark the socket fully disconnected: shut down both directions, drop
 * any unsent data from the send buffer, and wake connection sleepers.
 */
void
soisdisconnected(struct socket *so)
{

	SOCK_LOCK(so);

	/*
	 * There is at least one reader of so_state that does not
	 * acquire socket lock, namely soreceive_generic().  Ensure
	 * that it never sees all flags that track connection status
	 * cleared, by ordering the update with a barrier semantic of
	 * our release thread fence.
	 */
	so->so_state |= SS_ISDISCONNECTED;
	atomic_thread_fence_rel();
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

	if (!SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		SOCK_RECVBUF_LOCK(so);
		socantrcvmore_locked(so);
		SOCK_SENDBUF_LOCK(so);
		/* Discard everything still queued for transmission. */
		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
		socantsendmore_locked(so);
	} else
		SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}
4789
4790 int
4791 soiolock(struct socket *so, struct sx *sx, int flags)
4792 {
4793 int error;
4794
4795 KASSERT((flags & SBL_VALID) == flags,
4796 ("soiolock: invalid flags %#x", flags));
4797
4798 if ((flags & SBL_WAIT) != 0) {
4799 if ((flags & SBL_NOINTR) != 0) {
4800 sx_xlock(sx);
4801 } else {
4802 error = sx_xlock_sig(sx);
4803 if (error != 0)
4804 return (error);
4805 }
4806 } else if (!sx_try_xlock(sx)) {
4807 return (EWOULDBLOCK);
4808 }
4809
4810 if (__predict_false(SOLISTENING(so))) {
4811 sx_xunlock(sx);
4812 return (ENOTCONN);
4813 }
4814 return (0);
4815 }
4816
/* Release a socket I/O serialization lock taken by soiolock(). */
void
soiounlock(struct sx *sx)
{
	sx_xunlock(sx);
}
4822
4823 /*
4824 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
4825 */
4826 struct sockaddr *
4827 sodupsockaddr(const struct sockaddr *sa, int mflags)
4828 {
4829 struct sockaddr *sa2;
4830
4831 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
4832 if (sa2)
4833 bcopy(sa, sa2, sa->sa_len);
4834 return sa2;
4835 }
4836
4837 /*
4838 * Register per-socket destructor.
4839 */
void
sodtor_set(struct socket *so, so_dtor_t *func)
{

	/* Caller must hold the socket lock. */
	SOCK_LOCK_ASSERT(so);
	so->so_dtor = func;
}
4847
4848 /*
4849 * Register per-socket buffer upcalls.
4850 */
/*
 * Install an upcall on the given socket buffer (SO_RCV or SO_SND) and
 * set SB_UPCALL so buffer events invoke it.  The corresponding buffer
 * lock must be held; not valid on listening sockets.
 */
void
soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	/* No default: both sb_which values are covered. */
	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	}
	SOCK_BUF_LOCK_ASSERT(so, which);
	sb->sb_upcall = func;
	sb->sb_upcallarg = arg;
	sb->sb_flags |= SB_UPCALL;
}
4871
/*
 * Remove a previously installed buffer upcall and clear SB_UPCALL.
 * The corresponding buffer lock must be held; asserts that an upcall
 * is actually installed.  Not valid on listening sockets.
 */
void
soupcall_clear(struct socket *so, sb_which which)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	/* No default: both sb_which values are covered. */
	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	}
	SOCK_BUF_LOCK_ASSERT(so, which);
	KASSERT(sb->sb_upcall != NULL,
	    ("%s: so %p no upcall to clear", __func__, so));
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~SB_UPCALL;
}
4894
/*
 * Install the listen upcall on a listening socket; caller must hold
 * the listen lock.
 */
void
solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
{

	SOLISTEN_LOCK_ASSERT(so);
	so->sol_upcall = func;
	so->sol_upcallarg = arg;
}
4903
/*
 * knlist lock callback for the read side.  A listening socket uses the
 * listen lock; otherwise the receive buffer lock is taken.  Because
 * the socket may become listening between the unlocked check and
 * acquiring the buffer lock, the state is rechecked under the lock and
 * the whole sequence retried on a race.
 */
static void
so_rdknl_lock(void *arg)
{
	struct socket *so = arg;

retry:
	if (SOLISTENING(so)) {
		SOLISTEN_LOCK(so);
	} else {
		SOCK_RECVBUF_LOCK(so);
		if (__predict_false(SOLISTENING(so))) {
			SOCK_RECVBUF_UNLOCK(so);
			goto retry;
		}
	}
}
4920
/*
 * knlist unlock callback for the read side: release whichever lock
 * so_rdknl_lock() took.
 */
static void
so_rdknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (!SOLISTENING(so)) {
		SOCK_RECVBUF_UNLOCK(so);
		return;
	}
	SOLISTEN_UNLOCK(so);
}
4931
4932 static void
4933 so_rdknl_assert_lock(void *arg, int what)
4934 {
4935 struct socket *so = arg;
4936
4937 if (what == LA_LOCKED) {
4938 if (SOLISTENING(so))
4939 SOLISTEN_LOCK_ASSERT(so);
4940 else
4941 SOCK_RECVBUF_LOCK_ASSERT(so);
4942 } else {
4943 if (SOLISTENING(so))
4944 SOLISTEN_UNLOCK_ASSERT(so);
4945 else
4946 SOCK_RECVBUF_UNLOCK_ASSERT(so);
4947 }
4948 }
4949
/*
 * knlist lock callback for the write side; mirrors so_rdknl_lock()
 * using the send buffer lock, with the same recheck-and-retry against
 * the socket becoming a listening socket.
 */
static void
so_wrknl_lock(void *arg)
{
	struct socket *so = arg;

retry:
	if (SOLISTENING(so)) {
		SOLISTEN_LOCK(so);
	} else {
		SOCK_SENDBUF_LOCK(so);
		if (__predict_false(SOLISTENING(so))) {
			SOCK_SENDBUF_UNLOCK(so);
			goto retry;
		}
	}
}
4966
/*
 * knlist unlock callback for the write side: release whichever lock
 * so_wrknl_lock() took.
 */
static void
so_wrknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (!SOLISTENING(so)) {
		SOCK_SENDBUF_UNLOCK(so);
		return;
	}
	SOLISTEN_UNLOCK(so);
}
4977
4978 static void
4979 so_wrknl_assert_lock(void *arg, int what)
4980 {
4981 struct socket *so = arg;
4982
4983 if (what == LA_LOCKED) {
4984 if (SOLISTENING(so))
4985 SOLISTEN_LOCK_ASSERT(so);
4986 else
4987 SOCK_SENDBUF_LOCK_ASSERT(so);
4988 } else {
4989 if (SOLISTENING(so))
4990 SOLISTEN_UNLOCK_ASSERT(so);
4991 else
4992 SOCK_SENDBUF_UNLOCK_ASSERT(so);
4993 }
4994 }
4995
4996 /*
4997 * Create an external-format (``xsocket'') structure using the information in
4998 * the kernel-format socket structure pointed to by so. This is done to
4999 * reduce the spew of irrelevant information over this interface, to isolate
5000 * user code from changes in the kernel structure, and potentially to provide
5001 * information-hiding if we decide that some of this information should be
5002 * hidden from users.
5003 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{

	bzero(xso, sizeof(*xso));
	xso->xso_len = sizeof *xso;
	xso->xso_so = (uintptr_t)so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (uintptr_t)so->so_pcb;
	xso->xso_protocol = so->so_proto->pr_protocol;
	xso->xso_family = so->so_proto->pr_domain->dom_family;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_uid = so->so_cred->cr_uid;
	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
	/* Queue/buffer fields depend on whether the socket is listening. */
	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		xso->so_qlen = so->sol_qlen;
		xso->so_incqlen = so->sol_incqlen;
		xso->so_qlimit = so->sol_qlimit;
		xso->so_oobmark = 0;
	} else {
		xso->so_state |= so->so_qstate;
		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
		xso->so_oobmark = so->so_oobmark;
		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
		/* Report the splice destination socket, if spliced. */
		if ((so->so_rcv.sb_flags & SB_SPLICED) != 0)
			xso->so_splice_so = (uintptr_t)so->so_splice->dst;
	}
	SOCK_UNLOCK(so);
}
5039
/* Accessor: return the socket's receive buffer. */
struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

	return (&so->so_rcv);
}
5046
/* Accessor: return the socket's send buffer. */
struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

	return (&so->so_snd);
}
5053
/* Accessor: return so_state (unlocked read). */
int
so_state_get(const struct socket *so)
{

	return (so->so_state);
}
5060
/* Accessor: overwrite so_state with val. */
void
so_state_set(struct socket *so, int val)
{

	so->so_state = val;
}
5067
/* Accessor: return so_options. */
int
so_options_get(const struct socket *so)
{

	return (so->so_options);
}
5074
/* Accessor: overwrite so_options with val. */
void
so_options_set(struct socket *so, int val)
{

	so->so_options = val;
}
5081
/* Accessor: return the pending socket error. */
int
so_error_get(const struct socket *so)
{

	return (so->so_error);
}
5088
/* Accessor: set the pending socket error. */
void
so_error_set(struct socket *so, int val)
{

	so->so_error = val;
}
5095
/* Accessor: return the SO_LINGER interval. */
int
so_linger_get(const struct socket *so)
{

	return (so->so_linger);
}
5102
/*
 * Accessor: set the SO_LINGER interval; asserts the value is
 * non-negative and cannot overflow when scaled by hz.
 */
void
so_linger_set(struct socket *so, int val)
{

	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
	    ("%s: val %d out of range", __func__, val));

	so->so_linger = val;
}
5112
/* Accessor: return the socket's protocol switch. */
struct protosw *
so_protosw_get(const struct socket *so)
{

	return (so->so_proto);
}
5119
/* Accessor: replace the socket's protocol switch. */
void
so_protosw_set(struct socket *so, struct protosw *val)
{

	so->so_proto = val;
}
5126
/* Wrapper around the sorwakeup() macro for external callers. */
void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}
5133
/* Wrapper around the sowwakeup() macro for external callers. */
void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}
5140
/* Wrapper around sorwakeup_locked(); caller holds the recv buffer lock. */
void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}
5147
/* Wrapper around sowwakeup_locked(); caller holds the send buffer lock. */
void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}
5154
/* Wrapper around the SOCK_LOCK() macro for external callers. */
void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}
5161
/* Wrapper around the SOCK_UNLOCK() macro for external callers. */
void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}
5168