xref: /f-stack/freebsd/netinet/in_pcb.c (revision 5edfaa42)
1 /*-
2  * Copyright (c) 1982, 1986, 1991, 1993, 1995
3  *	The Regents of the University of California.
4  * Copyright (c) 2007-2009 Robert N. M. Watson
5  * Copyright (c) 2010-2011 Juniper Networks, Inc.
6  * All rights reserved.
7  *
8  * Portions of this software were developed by Robert N. M. Watson under
9  * contract to Juniper Networks, Inc.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
36  */
37 
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD$");
40 
41 #include "opt_ddb.h"
42 #include "opt_ipsec.h"
43 #include "opt_inet.h"
44 #include "opt_inet6.h"
45 #include "opt_pcbgroup.h"
46 #include "opt_rss.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/callout.h>
54 #include <sys/eventhandler.h>
55 #include <sys/domain.h>
56 #include <sys/protosw.h>
57 #include <sys/rmlock.h>
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/priv.h>
61 #include <sys/proc.h>
62 #include <sys/refcount.h>
63 #include <sys/jail.h>
64 #include <sys/kernel.h>
65 #include <sys/sysctl.h>
66 
67 #ifdef DDB
68 #include <ddb/ddb.h>
69 #endif
70 
71 #include <vm/uma.h>
72 
73 #include <net/if.h>
74 #include <net/if_var.h>
75 #include <net/if_types.h>
76 #include <net/if_llatbl.h>
77 #include <net/route.h>
78 #include <net/rss_config.h>
79 #include <net/vnet.h>
80 
81 #if defined(INET) || defined(INET6)
82 #include <netinet/in.h>
83 #include <netinet/in_pcb.h>
84 #include <netinet/ip_var.h>
85 #include <netinet/tcp_var.h>
86 #include <netinet/udp.h>
87 #include <netinet/udp_var.h>
88 #endif
89 #ifdef INET
90 #include <netinet/in_var.h>
91 #endif
92 #ifdef INET6
93 #include <netinet/ip6.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet6/in6_var.h>
96 #include <netinet6/ip6_var.h>
97 #endif /* INET6 */
98 
99 
100 #ifdef IPSEC
101 #include <netipsec/ipsec.h>
102 #include <netipsec/key.h>
103 #endif /* IPSEC */
104 
105 #include <security/mac/mac_framework.h>
106 
107 #ifdef FSTACK
108 #include "ff_host_interface.h"
109 #endif
110 
/*
 * Callout handle used for the periodic ipport_tick() processing.
 * NOTE(review): the code that arms this callout is not in this part of
 * the file -- confirm against ipport_tick()/its initialisation.
 */
static struct callout	ipport_tick_callout;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * Three (first, last) pairs exist: the "low" pair is used for inpcbs
 * flagged INP_LOWPORT, the "hi" pair for INP_HIGHPORT and the plain pair
 * for everything else (see in_pcb_lport()).  A pair may be stored with
 * first > last; in_pcb_lport() swaps the bounds before searching.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 *
 * in_pcbbind_setup() requires PRIV_NETINET_RESERVEDPORT for an explicit
 * bind to a port in [ipport_reservedlow, ipport_reservedhigh].
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
/* Allocation counter bumped by in_pcb_lport(). */
VNET_DEFINE(int, ipport_tcpallocs);
/*
 * NOTE(review): presumably the ipport_tcpallocs value seen at the previous
 * tick; its users are outside this part of the file -- confirm.
 */
static VNET_DEFINE(int, ipport_tcplastcount);

#define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)

/* Forward declarations for helpers defined later in this file. */
static void	in_pcbremlists(struct inpcb *inp);
#ifdef INET
static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
			    struct in_addr faddr, u_int fport_arg,
			    struct in_addr laddr, u_int lport_arg,
			    int lookupflags, struct ifnet *ifp);
148 
149 #define RANGECHK(var, min, max) \
150 	if ((var) < (min)) { (var) = (min); } \
151 	else if ((var) > (max)) { (var) = (max); }
152 
153 static int
154 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
155 {
156 	int error;
157 
158 	error = sysctl_handle_int(oidp, arg1, arg2, req);
159 	if (error == 0) {
160 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
161 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
162 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
163 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
164 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
165 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
166 	}
167 	return (error);
168 }
169 
170 #undef RANGECHK
171 
172 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
173     "IP Ports");
174 
175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
176 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
177 	&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
179 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
180 	&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
181 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
182 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
183 	&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
185 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
186 	&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
188 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
189 	&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
191 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
192 	&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
193 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
194 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
195 	&VNET_NAME(ipport_reservedhigh), 0, "");
196 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
197 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
199 	CTLFLAG_VNET | CTLFLAG_RW,
200 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
201 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
202 	CTLFLAG_VNET | CTLFLAG_RW,
203 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
204 	"allocations before switching to a sequental one");
205 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
206 	CTLFLAG_VNET | CTLFLAG_RW,
207 	&VNET_NAME(ipport_randomtime), 0,
208 	"Minimum time to keep sequental port "
209 	"allocation before switching to a random one");
210 #endif /* INET */
211 
212 /*
213  * in_pcb.c: manage the Protocol Control Blocks.
214  *
215  * NOTE: It is assumed that most of these functions will be called with
216  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
217  * functions often modify hash chains or addresses in pcbs.
218  */
219 
/*
 * Initialize an inpcbinfo -- we should be able to reduce the number of
 * arguments in time.
 *
 * pcbinfo            - structure to initialise.
 * name               - name given to the pcbinfo (INFO) lock.
 * listhead           - caller-provided list head; stored in ipi_listhead
 *                      and initialised empty here.
 * hash_nelements     - size argument for the connection hash table.
 * porthash_nelements - size argument for the port hash table.
 * inpcbzone_name, inpcbzone_init, inpcbzone_fini, inpcbzone_flags
 *                    - name, init/fini hooks and flags handed to
 *                      uma_zcreate() for the zone inpcbs are allocated
 *                      from.
 * hashfields         - only consumed when PCBGROUP is configured, where it
 *                      is forwarded to in_pcbgroup_init().
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
    struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
    char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
    uint32_t inpcbzone_flags, u_int hashfields)
{

	/* Three separate locks: global info, hash tables, and the pcb list. */
	INP_INFO_LOCK_INIT(pcbinfo, name);
	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
#ifdef VIMAGE
	/* Remember which vnet this pcbinfo was created in. */
	pcbinfo->ipi_vnet = curvnet;
#endif
	pcbinfo->ipi_listhead = listhead;
	LIST_INIT(pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	/* hashinit() also returns the mask to apply to hash values. */
	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
#ifdef PCBGROUP
	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
	    NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
	    inpcbzone_flags);
	/* Cap the zone at maxsockets and warn when the cap is hit. */
	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
	uma_zone_set_warning(pcbinfo->ipi_zone,
	    "kern.ipc.maxsockets limit reached");
}
254 
/*
 * Destroy an inpcbinfo.
 *
 * Releases what in_pcbinfo_init() set up: both hash tables, the pcbgroup
 * state (when PCBGROUP is configured), the inpcb zone and the three locks.
 * The caller must have removed every inpcb first; this is asserted via
 * ipi_count.  The list head itself was supplied by the caller of
 * in_pcbinfo_init() and is not freed here.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
#ifdef PCBGROUP
	in_pcbgroup_destroy(pcbinfo);
#endif
	uma_zdestroy(pcbinfo->ipi_zone);
	/* Locks are torn down in the reverse order of their creation. */
	INP_LIST_LOCK_DESTROY(pcbinfo);
	INP_HASH_LOCK_DESTROY(pcbinfo);
	INP_INFO_LOCK_DESTROY(pcbinfo);
}
276 
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * so      - socket the new inpcb belongs to; so->so_pcb is pointed at the
 *           new inpcb on success.
 * pcbinfo - protocol's inpcbinfo; the inpcb is taken from its zone and
 *           linked onto its list.
 *
 * Returns 0 on success, ENOBUFS if the zone allocation fails, or the error
 * reported by mac_inpcb_init() / ipsec_init_policy() when those are
 * compiled in.  On any failure the inpcb has been freed again and the
 * socket is left untouched.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
	int error;

#ifdef INVARIANTS
	/* For the TCP pcbinfo a read lock is accepted; others need write. */
	if (pcbinfo == &V_tcbinfo) {
		INP_INFO_RLOCK_ASSERT(pcbinfo);
	} else {
		INP_INFO_WLOCK_ASSERT(pcbinfo);
	}
#endif

	error = 0;
	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/*
	 * Only the first inp_zero_size bytes are cleared.
	 * NOTE(review): presumably the remainder holds state maintained by
	 * the zone init/fini hooks (e.g. the inpcb lock) -- confirm in
	 * in_pcb.h.
	 */
	bzero(inp, inp_zero_size);
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#ifdef IPSEC
	error = ipsec_init_policy(so, &inp->inp_sp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up just above before bailing out. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}
#endif
	/*
	 * Lock the new inpcb before it becomes visible on the list; the
	 * inpcb lock is intentionally still held on return.
	 */
	INP_WLOCK(inp);
	INP_LIST_WLOCK(pcbinfo);
	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	so->so_pcb = (caddr_t)inp;
#ifdef INET6
	if (V_ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
	INP_LIST_WUNLOCK(pcbinfo);
#if defined(IPSEC) || defined(MAC)
out:
	/*
	 * Reached both on success (error == 0, nothing to undo) and from the
	 * failure paths above, which jump here before the inpcb was locked
	 * or linked, so dropping the credential and freeing is sufficient.
	 */
	if (error != 0) {
		crfree(inp->inp_cred);
		uma_zfree(pcbinfo->ipi_zone, inp);
	}
#endif
	return (error);
}
347 
#ifdef INET
/*
 * Bind an inpcb to the local address/port described by nam (which may be
 * NULL to request a fully automatic binding) and insert it into the hash
 * tables.
 *
 * The caller holds the inpcb write lock and the pcbinfo hash write lock.
 * Returns EINVAL when the inpcb already has a local port or address, the
 * error from in_pcbbind_setup() when address/port selection fails, EAGAIN
 * when the hash insertion fails (the inpcb is left unbound again), and 0
 * on success.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
	int port_unspecified;
	int rc;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	/* Refuse to rebind: both port and address must still be unset. */
	if (!(inp->inp_lport == 0 && inp->inp_laddr.s_addr == INADDR_ANY))
		return (EINVAL);

	/*
	 * Sample the requested port before calling in_pcbbind_setup(),
	 * which may overwrite sin_port in the caller's sockaddr.
	 */
	if (nam == NULL)
		port_unspecified = 1;
	else
		port_unspecified =
		    (((struct sockaddr_in *)nam)->sin_port == 0);

	rc = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, cred);
	if (rc != 0)
		return (rc);

	if (in_pcbinshash(inp) == 0) {
		if (port_unspecified)
			inp->inp_flags |= INP_ANONPORT;
		return (0);
	}

	/* Hash insertion failed: roll the binding back. */
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_lport = 0;
	return (EAGAIN);
}
#endif
374 
375 /*
376  * Select a local port (number) to use.
377  */
378 #if defined(INET) || defined(INET6)
379 int
380 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
381     struct ucred *cred, int lookupflags)
382 {
383 	struct inpcbinfo *pcbinfo;
384 	struct inpcb *tmpinp;
385 	unsigned short *lastport;
386 	int count, dorandom, error;
387 	u_short aux, first, last, lport;
388 #ifdef INET
389 	struct in_addr laddr;
390 #endif
391 
392 	pcbinfo = inp->inp_pcbinfo;
393 
394 	/*
395 	 * Because no actual state changes occur here, a global write lock on
396 	 * the pcbinfo isn't required.
397 	 */
398 	INP_LOCK_ASSERT(inp);
399 	INP_HASH_LOCK_ASSERT(pcbinfo);
400 
401 	if (inp->inp_flags & INP_HIGHPORT) {
402 		first = V_ipport_hifirstauto;	/* sysctl */
403 		last  = V_ipport_hilastauto;
404 		lastport = &pcbinfo->ipi_lasthi;
405 	} else if (inp->inp_flags & INP_LOWPORT) {
406 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
407 		if (error)
408 			return (error);
409 		first = V_ipport_lowfirstauto;	/* 1023 */
410 		last  = V_ipport_lowlastauto;	/* 600 */
411 		lastport = &pcbinfo->ipi_lastlow;
412 	} else {
413 		first = V_ipport_firstauto;	/* sysctl */
414 		last  = V_ipport_lastauto;
415 		lastport = &pcbinfo->ipi_lastport;
416 	}
417 	/*
418 	 * For UDP(-Lite), use random port allocation as long as the user
419 	 * allows it.  For TCP (and as of yet unknown) connections,
420 	 * use random port allocation only if the user allows it AND
421 	 * ipport_tick() allows it.
422 	 */
423 	if (V_ipport_randomized &&
424 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
425 		pcbinfo == &V_ulitecbinfo))
426 		dorandom = 1;
427 	else
428 		dorandom = 0;
429 	/*
430 	 * It makes no sense to do random port allocation if
431 	 * we have the only port available.
432 	 */
433 	if (first == last)
434 		dorandom = 0;
435 	/* Make sure to not include UDP(-Lite) packets in the count. */
436 	if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
437 		V_ipport_tcpallocs++;
438 	/*
439 	 * Instead of having two loops further down counting up or down
440 	 * make sure that first is always <= last and go with only one
441 	 * code path implementing all logic.
442 	 */
443 	if (first > last) {
444 		aux = first;
445 		first = last;
446 		last = aux;
447 	}
448 
449 #ifdef INET
450 	/* Make the compiler happy. */
451 	laddr.s_addr = 0;
452 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
453 		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
454 		    __func__, inp));
455 		laddr = *laddrp;
456 	}
457 #endif
458 	tmpinp = NULL;	/* Make compiler happy. */
459 	lport = *lportp;
460 
461 	if (dorandom)
462 		*lastport = first + (arc4random() % (last - first));
463 
464 	count = last - first;
465 
466 	do {
467 		if (count-- < 0)	/* completely used? */
468 			return (EADDRNOTAVAIL);
469 		++*lastport;
470 		if (*lastport < first || *lastport > last)
471 			*lastport = first;
472 		lport = htons(*lastport);
473 
474 #ifdef INET6
475 		if ((inp->inp_vflag & INP_IPV6) != 0)
476 			tmpinp = in6_pcblookup_local(pcbinfo,
477 			    &inp->in6p_laddr, lport, lookupflags, cred);
478 #endif
479 #if defined(INET) && defined(INET6)
480 		else
481 #endif
482 #ifdef INET
483 			tmpinp = in_pcblookup_local(pcbinfo, laddr,
484 			    lport, lookupflags, cred);
485 #endif
486 	} while (tmpinp != NULL);
487 
488 #ifdef INET
489 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
490 		laddrp->s_addr = laddr.s_addr;
491 #endif
492 	*lportp = lport;
493 
494 	return (0);
495 }
496 
497 /*
498  * Return cached socket options.
499  */
500 short
501 inp_so_options(const struct inpcb *inp)
502 {
503    short so_options;
504 
505    so_options = 0;
506 
507    if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
508 	   so_options |= SO_REUSEPORT;
509    if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
510 	   so_options |= SO_REUSEADDR;
511    return (so_options);
512 }
513 #endif /* INET || INET6 */
514 
515 /*
516  * Check if a new BINDMULTI socket is allowed to be created.
517  *
518  * ni points to the new inp.
519  * oi points to the exisitng inp.
520  *
521  * This checks whether the existing inp also has BINDMULTI and
522  * whether the credentials match.
523  */
524 int
525 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
526 {
527 	/* Check permissions match */
528 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
529 	    (ni->inp_cred->cr_uid !=
530 	    oi->inp_cred->cr_uid))
531 		return (0);
532 
533 	/* Check the existing inp has BINDMULTI set */
534 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
535 	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
536 		return (0);
537 
538 	/*
539 	 * We're okay - either INP_BINDMULTI isn't set on ni, or
540 	 * it is and it matches the checks.
541 	 */
542 	return (1);
543 }
544 
#ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 *
 * inp    - inpcb being bound; read for flags, credential and socket
 *          options only.
 * nam    - requested address as a struct sockaddr_in, or NULL to let the
 *          address (subject to the jail) and port be chosen automatically.
 *          Note that for a specific non-multicast address this function
 *          zeroes sin_port and sin_zero in the caller's sockaddr.
 * laddrp - in: current local address; out: address to bind to.
 * lportp - in: current local port; out: port to bind to.  Ports are in
 *          network byte order.
 * cred   - credential used for jail and privilege checks.
 *
 * Returns 0 on success or an errno value: EADDRNOTAVAIL, EINVAL, EACCES,
 * EADDRINUSE, or whatever prison_local_ip4() / in_pcb_lport() report.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	/* An explicit address cannot be applied on top of an existing one. */
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* Without any reuse option, wildcard bindings count as conflicts. */
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (nam == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		sin = (struct sockaddr_in *)nam;
		if (nam->sa_len != sizeof (*sin))
			return (EINVAL);
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);
#endif
		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			/*
			 * The port has already been captured in lport above;
			 * clear it (and the padding) so that the sockaddr
			 * handed to ifa_ifwithaddr_check() carries only the
			 * address.
			 */
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		if (lport) {
			struct inpcb *t;
			struct tcptw *tw;

			/* GROSS */
			/* Ports inside the reserved window need privilege. */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
			    0))
				return (EACCES);
			/*
			 * Cross-user check, skipped for multicast addresses
			 * and for owners holding PRIV_NETINET_REUSEPORT:
			 * refuse to share a port with another uid's socket.
			 */
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred,
			    PRIV_NETINET_REUSEPORT, 0) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
	/*
	 * XXX
	 * This entire block sorely needs a rewrite.
	 */
				if (t &&
				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);

				/*
				 * If the socket is a BINDMULTI socket, then
				 * the credentials need to match and the
				 * original socket also has to have been bound
				 * with BINDMULTI.
				 */
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
			/*
			 * General conflict check, using the lookup flags
			 * derived from the socket's reuse options.
			 */
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t && (t->inp_flags & INP_TIMEWAIT)) {
				/*
				 * XXXRW: If an inpcb has had its timewait
				 * state recycled, we treat the address as
				 * being in use (for now).  This is better
				 * than a panic, but not desirable.
				 */
				tw = intotw(t);
				if (tw == NULL ||
				    (reuseport & tw->tw_so_options) == 0)
					return (EADDRINUSE);
			} else if (t &&
			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
			    (reuseport & inp_so_options(t)) == 0) {
				/*
				 * NB: with INET6 the `if' below guards the
				 * return that follows the #endif; without
				 * INET6 that return is unconditional and the
				 * BINDMULTI check after it is unreachable.
				 */
#ifdef INET6
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
		}
	}
	/* Keep an already-assigned port; otherwise pick one automatically. */
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);

	}
	/* All checks passed: only now publish the results to the caller. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}
710 
711 /*
712  * Connect from a socket to a specified address.
713  * Both address and port must be specified in argument sin.
714  * If don't have a local address for this socket yet,
715  * then pick one.
716  */
717 int
718 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
719     struct ucred *cred, struct mbuf *m)
720 {
721 	u_short lport, fport;
722 	in_addr_t laddr, faddr;
723 	int anonport, error;
724 
725 	INP_WLOCK_ASSERT(inp);
726 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
727 
728 	lport = inp->inp_lport;
729 	laddr = inp->inp_laddr.s_addr;
730 	anonport = (lport == 0);
731 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
732 	    NULL, cred);
733 	if (error)
734 		return (error);
735 
736 	/* Do the initial binding of the local address if required. */
737 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
738 		inp->inp_lport = lport;
739 		inp->inp_laddr.s_addr = laddr;
740 		if (in_pcbinshash(inp) != 0) {
741 			inp->inp_laddr.s_addr = INADDR_ANY;
742 			inp->inp_lport = 0;
743 			return (EAGAIN);
744 		}
745 	}
746 
747 	/* Commit the remaining changes. */
748 	inp->inp_lport = lport;
749 	inp->inp_laddr.s_addr = laddr;
750 	inp->inp_faddr.s_addr = faddr;
751 	inp->inp_fport = fport;
752 	in_pcbrehash_mbuf(inp, m);
753 
754 	if (anonport)
755 		inp->inp_flags |= INP_ANONPORT;
756 	return (0);
757 }
758 
759 int
760 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
761 {
762 
763 	return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
764 }
765 
/*
 * Do proper source address selection on an unbound socket in case
 * of connect. Take jails into account as well.
 *
 * inp   - inpcb being connected; supplies the FIB number and the socket
 *         (for SO_DONTROUTE and so_fibnum).
 * faddr - destination address.
 * laddr - out: selected source address; must not be NULL.
 * cred  - credential for jail checks; may be NULL, in which case all jail
 *         handling is skipped.
 *
 * Returns 0 on success, ENETUNREACH when no suitable interface address is
 * found, or the error from prison_get_ip4() when falling back to the
 * jail's default address.
 *
 * All exits after the route lookup go through `done' so that a route
 * obtained from in_rtalloc_ign() is released.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin;
	struct route sro;
	int error;

	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;
	bzero(&sro, sizeof(sro));

	/* Describe the destination in the route's ro_dst. */
	sin = (struct sockaddr_in *)&sro.ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* Try a point-to-point peer match first, then a net match. */
		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* Not jailed: take the address found and drop the ifa. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			ifa_free(&ia->ia_ifa);
			goto done;
		}

		/*
		 * Jailed: only the interface is of interest from here on, so
		 * release the ifa and scan that interface's addresses for
		 * one the jail is allowed to use.
		 */
		ifp = ia->ia_ifp;
		ifa_free(&ia->ia_ifa);
		ia = NULL;
		IF_ADDR_RLOCK(ifp);
		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {

			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			/* Copy the address out while the list is locked. */
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			IF_ADDR_RUNLOCK(ifp);
			goto done;
		}
		IF_ADDR_RUNLOCK(ifp);

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			/*
			 * rt_ifa is reached through the route; no separate
			 * ifa_free() is done here, the route itself is
			 * released at `done'.
			 */
			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = sro.ro_rt->rt_ifp;
		IF_ADDR_RLOCK(ifp);
		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			IF_ADDR_RUNLOCK(ifp);
			goto done;
		}
		IF_ADDR_RUNLOCK(ifp);

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct sockaddr_in sain;
		struct in_ifaddr *ia;

		/* Build a standalone sockaddr for the destination. */
		bzero(&sain, sizeof(struct sockaddr_in));
		sain.sin_family = AF_INET;
		sain.sin_len = sizeof(struct sockaddr_in);
		sain.sin_addr.s_addr = faddr->s_addr;

		/* Peer match, then net match, then exact address match. */
		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));

		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			ifa_free(&ia->ia_ifa);
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/*
			 * As in the no-route case: keep only the interface,
			 * release the ifa, and look for a jail-permitted
			 * address on that interface.
			 */
			ifp = ia->ia_ifp;
			ifa_free(&ia->ia_ifa);
			ia = NULL;
			IF_ADDR_RLOCK(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {

				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				IF_ADDR_RUNLOCK(ifp);
				goto done;
			}
			IF_ADDR_RUNLOCK(ifp);
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* Release the route obtained from in_rtalloc_ign(), if any. */
	if (sro.ro_rt != NULL)
		RTFREE(sro.ro_rt);
	return (error);
}
993 
994 /*
995  * Set up for a connect from a socket to the specified address.
996  * On entry, *laddrp and *lportp should contain the current local
997  * address and port for the PCB; these are updated to the values
998  * that should be placed in inp_laddr and inp_lport to complete
999  * the connect.
1000  *
1001  * On success, *faddrp and *fportp will be set to the remote address
1002  * and port. These are not updated in the error case.
1003  *
1004  * If the operation fails because the connection already exists,
1005  * *oinpp will be set to the PCB of that connection so that the
1006  * caller can decide to override it. In all other cases, *oinpp
1007  * is set to NULL.
1008  */
1009 int
1010 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1011     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1012     struct inpcb **oinpp, struct ucred *cred)
1013 {
1014 	struct rm_priotracker in_ifa_tracker;
1015 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1016 	struct in_ifaddr *ia;
1017 	struct inpcb *oinp;
1018 	struct in_addr laddr, faddr;
1019 	u_short lport, fport;
1020 	int error;
1021 
1022 	/*
1023 	 * Because a global state change doesn't actually occur here, a read
1024 	 * lock is sufficient.
1025 	 */
1026 	INP_LOCK_ASSERT(inp);
1027 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1028 
1029 	if (oinpp != NULL)
1030 		*oinpp = NULL;
1031 	if (nam->sa_len != sizeof (*sin))
1032 		return (EINVAL);
1033 	if (sin->sin_family != AF_INET)
1034 		return (EAFNOSUPPORT);
1035 	if (sin->sin_port == 0)
1036 		return (EADDRNOTAVAIL);
1037 	laddr.s_addr = *laddrp;
1038 	lport = *lportp;
1039 	faddr = sin->sin_addr;
1040 	fport = sin->sin_port;
1041 
1042 	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
1043 		/*
1044 		 * If the destination address is INADDR_ANY,
1045 		 * use the primary local address.
1046 		 * If the supplied address is INADDR_BROADCAST,
1047 		 * and the primary interface supports broadcast,
1048 		 * choose the broadcast address for that interface.
1049 		 */
1050 		if (faddr.s_addr == INADDR_ANY) {
1051 			IN_IFADDR_RLOCK(&in_ifa_tracker);
1052 			faddr =
1053 			    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1054 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1055 			if (cred != NULL &&
1056 			    (error = prison_get_ip4(cred, &faddr)) != 0)
1057 				return (error);
1058 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1059 			IN_IFADDR_RLOCK(&in_ifa_tracker);
1060 			if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1061 			    IFF_BROADCAST)
1062 				faddr = satosin(&TAILQ_FIRST(
1063 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1064 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1065 		}
1066 	}
1067 #ifdef FSTACK
1068     if (laddr.s_addr == INADDR_ANY) {
1069         ff_in_pcbladdr(AF_INET, &faddr, fport, &laddr);
1070     }
1071 #endif
1072 	if (laddr.s_addr == INADDR_ANY) {
1073 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
1074 		/*
1075 		 * If the destination address is multicast and an outgoing
1076 		 * interface has been set as a multicast option, prefer the
1077 		 * address of that interface as our source address.
1078 		 */
1079 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1080 		    inp->inp_moptions != NULL) {
1081 			struct ip_moptions *imo;
1082 			struct ifnet *ifp;
1083 
1084 			imo = inp->inp_moptions;
1085 			if (imo->imo_multicast_ifp != NULL) {
1086 				ifp = imo->imo_multicast_ifp;
1087 				IN_IFADDR_RLOCK(&in_ifa_tracker);
1088 				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1089 					if ((ia->ia_ifp == ifp) &&
1090 					    (cred == NULL ||
1091 					    prison_check_ip4(cred,
1092 					    &ia->ia_addr.sin_addr) == 0))
1093 						break;
1094 				}
1095 				if (ia == NULL)
1096 					error = EADDRNOTAVAIL;
1097 				else {
1098 					laddr = ia->ia_addr.sin_addr;
1099 					error = 0;
1100 				}
1101 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1102 			}
1103 		}
1104 		if (error)
1105 			return (error);
1106 	}
1107 	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
1108 	    laddr, lport, 0, NULL);
1109 	if (oinp != NULL) {
1110 		if (oinpp != NULL)
1111 			*oinpp = oinp;
1112 		return (EADDRINUSE);
1113 	}
1114 #ifndef FSTACK
1115 	if (lport == 0) {
1116 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
1117 		    cred);
1118 		if (error)
1119 			return (error);
1120 	}
1121 #else
1122 if (lport == 0)
1123 {
1124     struct ifaddr *ifa;
1125     struct ifnet *ifp;
1126     struct sockaddr_in ifp_sin;
1127     unsigned loop_count = 0;
1128     bzero(&ifp_sin, sizeof(ifp_sin));
1129     ifp_sin.sin_addr.s_addr = laddr.s_addr;
1130     ifp_sin.sin_family = AF_INET;
1131     ifp_sin.sin_len = sizeof(ifp_sin);
1132     ifa = ifa_ifwithnet((struct sockaddr *)&ifp_sin, 0, RT_ALL_FIBS);
1133     if (ifa == NULL) {
1134         ifp_sin.sin_addr.s_addr = faddr.s_addr;
1135         ifa = ifa_ifwithnet((struct sockaddr *)&ifp_sin, 0, RT_ALL_FIBS);
1136         if ( ifa == NULL )
1137             return (EADDRNOTAVAIL);
1138     }
1139     ifp = ifa->ifa_ifp;
1140     while (lport == 0) {
1141         int rss;
1142         error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
1143             cred);
1144         if (error)
1145             return (error);
1146         rss = ff_rss_check(ifp->if_softc, faddr.s_addr, laddr.s_addr,
1147             fport, lport);
1148         if (rss) {
1149             break;
1150         }
1151         lport = 0;
1152         /* Note:
1153          * if all ports are completely used, just return.
1154          * this ugly code is not a correct way, it just lets loop quit.
1155          * we will fix it as soon as possible.
1156          */
1157         if (++loop_count >= 65535) {
1158             return (EADDRNOTAVAIL);
1159         }
1160     }
1161 }
1162 #endif
1163 	*laddrp = laddr.s_addr;
1164 	*lportp = lport;
1165 	*faddrp = faddr.s_addr;
1166 	*fportp = fport;
1167 	return (0);
1168 }
1169 
1170 void
1171 in_pcbdisconnect(struct inpcb *inp)
1172 {
1173 
1174 	INP_WLOCK_ASSERT(inp);
1175 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1176 
1177 	inp->inp_faddr.s_addr = INADDR_ANY;
1178 	inp->inp_fport = 0;
1179 	in_pcbrehash(inp);
1180 }
1181 #endif /* INET */
1182 
1183 /*
1184  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1185  * For most protocols, this will be invoked immediately prior to calling
1186  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1187  * socket, in which case in_pcbfree() is deferred.
1188  */
1189 void
1190 in_pcbdetach(struct inpcb *inp)
1191 {
1192 
1193 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1194 
1195 	inp->inp_socket->so_pcb = NULL;
1196 	inp->inp_socket = NULL;
1197 }
1198 
1199 /*
1200  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1201  * stability of an inpcb pointer despite the inpcb lock being released.  This
1202  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1203  * but where the inpcb lock may already held, or when acquiring a reference
1204  * via a pcbgroup.
1205  *
1206  * in_pcbref() should be used only to provide brief memory stability, and
1207  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1208  * garbage collect the inpcb if it has been in_pcbfree()'d from another
1209  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
1210  * lock and rele are the *only* safe operations that may be performed on the
1211  * inpcb.
1212  *
1213  * While the inpcb will not be freed, releasing the inpcb lock means that the
1214  * connection's state may change, so the caller should be careful to
1215  * revalidate any cached state on reacquiring the lock.  Drop the reference
1216  * using in_pcbrele().
1217  */
1218 void
1219 in_pcbref(struct inpcb *inp)
1220 {
1221 
1222 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1223 
1224 	refcount_acquire(&inp->inp_refcount);
1225 }
1226 
1227 /*
1228  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1229  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1230  * return a flag indicating whether or not the inpcb remains valid.  If it is
1231  * valid, we return with the inpcb lock held.
1232  *
1233  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1234  * reference on an inpcb.  Historically more work was done here (actually, in
1235  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1236  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
1237  * about memory stability (and continued use of the write lock).
1238  */
1239 int
1240 in_pcbrele_rlocked(struct inpcb *inp)
1241 {
1242 	struct inpcbinfo *pcbinfo;
1243 
1244 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1245 
1246 	INP_RLOCK_ASSERT(inp);
1247 
1248 	if (refcount_release(&inp->inp_refcount) == 0) {
1249 		/*
1250 		 * If the inpcb has been freed, let the caller know, even if
1251 		 * this isn't the last reference.
1252 		 */
1253 		if (inp->inp_flags2 & INP_FREED) {
1254 			INP_RUNLOCK(inp);
1255 			return (1);
1256 		}
1257 		return (0);
1258 	}
1259 
1260 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1261 
1262 	INP_RUNLOCK(inp);
1263 	pcbinfo = inp->inp_pcbinfo;
1264 	uma_zfree(pcbinfo->ipi_zone, inp);
1265 	return (1);
1266 }
1267 
1268 int
1269 in_pcbrele_wlocked(struct inpcb *inp)
1270 {
1271 	struct inpcbinfo *pcbinfo;
1272 
1273 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1274 
1275 	INP_WLOCK_ASSERT(inp);
1276 
1277 	if (refcount_release(&inp->inp_refcount) == 0) {
1278 		/*
1279 		 * If the inpcb has been freed, let the caller know, even if
1280 		 * this isn't the last reference.
1281 		 */
1282 		if (inp->inp_flags2 & INP_FREED) {
1283 			INP_WUNLOCK(inp);
1284 			return (1);
1285 		}
1286 		return (0);
1287 	}
1288 
1289 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1290 
1291 	INP_WUNLOCK(inp);
1292 	pcbinfo = inp->inp_pcbinfo;
1293 	uma_zfree(pcbinfo->ipi_zone, inp);
1294 	return (1);
1295 }
1296 
/*
 * Temporary wrapper: forwards to in_pcbrele_wlocked() and hands back its
 * result unchanged, so the inpcb write lock must be held on entry.
 */
int
in_pcbrele(struct inpcb *inp)
{
	int released;

	released = in_pcbrele_wlocked(inp);
	return (released);
}
1306 
1307 /*
1308  * Unconditionally schedule an inpcb to be freed by decrementing its
1309  * reference count, which should occur only after the inpcb has been detached
1310  * from its socket.  If another thread holds a temporary reference (acquired
1311  * using in_pcbref()) then the free is deferred until that reference is
1312  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
1313  * work, including removal from global lists, is done in this context, where
1314  * the pcbinfo lock is held.
1315  */
1316 void
1317 in_pcbfree(struct inpcb *inp)
1318 {
1319 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1320 
1321 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1322 
1323 #ifdef INVARIANTS
1324 	if (pcbinfo == &V_tcbinfo) {
1325 		INP_INFO_LOCK_ASSERT(pcbinfo);
1326 	} else {
1327 		INP_INFO_WLOCK_ASSERT(pcbinfo);
1328 	}
1329 #endif
1330 	INP_WLOCK_ASSERT(inp);
1331 
1332 	/* XXXRW: Do as much as possible here. */
1333 #ifdef IPSEC
1334 	if (inp->inp_sp != NULL)
1335 		ipsec_delete_pcbpolicy(inp);
1336 #endif
1337 	INP_LIST_WLOCK(pcbinfo);
1338 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1339 	in_pcbremlists(inp);
1340 	INP_LIST_WUNLOCK(pcbinfo);
1341 #ifdef INET6
1342 	if (inp->inp_vflag & INP_IPV6PROTO) {
1343 		ip6_freepcbopts(inp->in6p_outputopts);
1344 		if (inp->in6p_moptions != NULL)
1345 			ip6_freemoptions(inp->in6p_moptions);
1346 	}
1347 #endif
1348 	if (inp->inp_options)
1349 		(void)m_free(inp->inp_options);
1350 #ifdef INET
1351 	if (inp->inp_moptions != NULL)
1352 		inp_freemoptions(inp->inp_moptions);
1353 #endif
1354 	if (inp->inp_route.ro_rt) {
1355 		RTFREE(inp->inp_route.ro_rt);
1356 		inp->inp_route.ro_rt = (struct rtentry *)NULL;
1357 	}
1358 	if (inp->inp_route.ro_lle)
1359 		LLE_FREE(inp->inp_route.ro_lle);	/* zeros ro_lle */
1360 
1361 	inp->inp_vflag = 0;
1362 	inp->inp_flags2 |= INP_FREED;
1363 	crfree(inp->inp_cred);
1364 #ifdef MAC
1365 	mac_inpcb_destroy(inp);
1366 #endif
1367 	if (!in_pcbrele_wlocked(inp))
1368 		INP_WUNLOCK(inp);
1369 }
1370 
1371 /*
1372  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1373  * port reservation, and preventing it from being returned by inpcb lookups.
1374  *
1375  * It is used by TCP to mark an inpcb as unused and avoid future packet
1376  * delivery or event notification when a socket remains open but TCP has
1377  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1378  * or a RST on the wire, and allows the port binding to be reused while still
1379  * maintaining the invariant that so_pcb always points to a valid inpcb until
1380  * in_pcbdetach().
1381  *
1382  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1383  * in_pcbnotifyall() and in_pcbpurgeif0()?
1384  */
1385 void
1386 in_pcbdrop(struct inpcb *inp)
1387 {
1388 
1389 	INP_WLOCK_ASSERT(inp);
1390 
1391 	/*
1392 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1393 	 * the hash lock...?
1394 	 */
1395 	inp->inp_flags |= INP_DROPPED;
1396 	if (inp->inp_flags & INP_INHASHLIST) {
1397 		struct inpcbport *phd = inp->inp_phd;
1398 
1399 		INP_HASH_WLOCK(inp->inp_pcbinfo);
1400 		LIST_REMOVE(inp, inp_hash);
1401 		LIST_REMOVE(inp, inp_portlist);
1402 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1403 			LIST_REMOVE(phd, phd_hash);
1404 			free(phd, M_PCB);
1405 		}
1406 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1407 		inp->inp_flags &= ~INP_INHASHLIST;
1408 #ifdef PCBGROUP
1409 		in_pcbgroup_remove(inp);
1410 #endif
1411 	}
1412 }
1413 
1414 #ifdef INET
1415 /*
1416  * Common routines to return the socket addresses associated with inpcbs.
1417  */
1418 struct sockaddr *
1419 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1420 {
1421 	struct sockaddr_in *sin;
1422 
1423 	sin = malloc(sizeof *sin, M_SONAME,
1424 		M_WAITOK | M_ZERO);
1425 	sin->sin_family = AF_INET;
1426 	sin->sin_len = sizeof(*sin);
1427 	sin->sin_addr = *addr_p;
1428 	sin->sin_port = port;
1429 
1430 	return (struct sockaddr *)sin;
1431 }
1432 
1433 int
1434 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1435 {
1436 	struct inpcb *inp;
1437 	struct in_addr addr;
1438 	in_port_t port;
1439 
1440 	inp = sotoinpcb(so);
1441 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1442 
1443 	INP_RLOCK(inp);
1444 	port = inp->inp_lport;
1445 	addr = inp->inp_laddr;
1446 	INP_RUNLOCK(inp);
1447 
1448 	*nam = in_sockaddr(port, &addr);
1449 	return 0;
1450 }
1451 
1452 int
1453 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1454 {
1455 	struct inpcb *inp;
1456 	struct in_addr addr;
1457 	in_port_t port;
1458 
1459 	inp = sotoinpcb(so);
1460 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1461 
1462 	INP_RLOCK(inp);
1463 	port = inp->inp_fport;
1464 	addr = inp->inp_faddr;
1465 	INP_RUNLOCK(inp);
1466 
1467 	*nam = in_sockaddr(port, &addr);
1468 	return 0;
1469 }
1470 
1471 void
1472 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1473     struct inpcb *(*notify)(struct inpcb *, int))
1474 {
1475 	struct inpcb *inp, *inp_temp;
1476 
1477 	INP_INFO_WLOCK(pcbinfo);
1478 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1479 		INP_WLOCK(inp);
1480 #ifdef INET6
1481 		if ((inp->inp_vflag & INP_IPV4) == 0) {
1482 			INP_WUNLOCK(inp);
1483 			continue;
1484 		}
1485 #endif
1486 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
1487 		    inp->inp_socket == NULL) {
1488 			INP_WUNLOCK(inp);
1489 			continue;
1490 		}
1491 		if ((*notify)(inp, errno))
1492 			INP_WUNLOCK(inp);
1493 	}
1494 	INP_INFO_WUNLOCK(pcbinfo);
1495 }
1496 
1497 void
1498 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1499 {
1500 	struct inpcb *inp;
1501 	struct ip_moptions *imo;
1502 	int i, gap;
1503 
1504 	INP_INFO_WLOCK(pcbinfo);
1505 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1506 		INP_WLOCK(inp);
1507 		imo = inp->inp_moptions;
1508 		if ((inp->inp_vflag & INP_IPV4) &&
1509 		    imo != NULL) {
1510 			/*
1511 			 * Unselect the outgoing interface if it is being
1512 			 * detached.
1513 			 */
1514 			if (imo->imo_multicast_ifp == ifp)
1515 				imo->imo_multicast_ifp = NULL;
1516 
1517 			/*
1518 			 * Drop multicast group membership if we joined
1519 			 * through the interface being detached.
1520 			 */
1521 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
1522 			    i++) {
1523 				if (imo->imo_membership[i]->inm_ifp == ifp) {
1524 					in_delmulti(imo->imo_membership[i]);
1525 					gap++;
1526 				} else if (gap != 0)
1527 					imo->imo_membership[i - gap] =
1528 					    imo->imo_membership[i];
1529 			}
1530 			imo->imo_num_memberships -= gap;
1531 		}
1532 		INP_WUNLOCK(inp);
1533 	}
1534 	INP_INFO_WUNLOCK(pcbinfo);
1535 }
1536 
1537 /*
1538  * Lookup a PCB based on the local address and port.  Caller must hold the
1539  * hash lock.  No inpcb locks or references are acquired.
1540  */
1541 #define INP_LOOKUP_MAPPED_PCB_COST	3
1542 struct inpcb *
1543 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1544     u_short lport, int lookupflags, struct ucred *cred)
1545 {
1546 	struct inpcb *inp;
1547 #ifdef INET6
1548 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1549 #else
1550 	int matchwild = 3;
1551 #endif
1552 	int wildcard;
1553 
1554 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1555 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
1556 
1557 	INP_HASH_LOCK_ASSERT(pcbinfo);
1558 
1559 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
1560 		struct inpcbhead *head;
1561 		/*
1562 		 * Look for an unconnected (wildcard foreign addr) PCB that
1563 		 * matches the local address and port we're looking for.
1564 		 */
1565 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1566 		    0, pcbinfo->ipi_hashmask)];
1567 		LIST_FOREACH(inp, head, inp_hash) {
1568 #ifdef INET6
1569 			/* XXX inp locking */
1570 			if ((inp->inp_vflag & INP_IPV4) == 0)
1571 				continue;
1572 #endif
1573 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
1574 			    inp->inp_laddr.s_addr == laddr.s_addr &&
1575 			    inp->inp_lport == lport) {
1576 				/*
1577 				 * Found?
1578 				 */
1579 				if (cred == NULL ||
1580 				    prison_equal_ip4(cred->cr_prison,
1581 					inp->inp_cred->cr_prison))
1582 					return (inp);
1583 			}
1584 		}
1585 		/*
1586 		 * Not found.
1587 		 */
1588 		return (NULL);
1589 	} else {
1590 		struct inpcbporthead *porthash;
1591 		struct inpcbport *phd;
1592 		struct inpcb *match = NULL;
1593 		/*
1594 		 * Best fit PCB lookup.
1595 		 *
1596 		 * First see if this local port is in use by looking on the
1597 		 * port hash list.
1598 		 */
1599 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
1600 		    pcbinfo->ipi_porthashmask)];
1601 		LIST_FOREACH(phd, porthash, phd_hash) {
1602 			if (phd->phd_port == lport)
1603 				break;
1604 		}
1605 		if (phd != NULL) {
1606 			/*
1607 			 * Port is in use by one or more PCBs. Look for best
1608 			 * fit.
1609 			 */
1610 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1611 				wildcard = 0;
1612 				if (cred != NULL &&
1613 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
1614 					cred->cr_prison))
1615 					continue;
1616 #ifdef INET6
1617 				/* XXX inp locking */
1618 				if ((inp->inp_vflag & INP_IPV4) == 0)
1619 					continue;
1620 				/*
1621 				 * We never select the PCB that has
1622 				 * INP_IPV6 flag and is bound to :: if
1623 				 * we have another PCB which is bound
1624 				 * to 0.0.0.0.  If a PCB has the
1625 				 * INP_IPV6 flag, then we set its cost
1626 				 * higher than IPv4 only PCBs.
1627 				 *
1628 				 * Note that the case only happens
1629 				 * when a socket is bound to ::, under
1630 				 * the condition that the use of the
1631 				 * mapped address is allowed.
1632 				 */
1633 				if ((inp->inp_vflag & INP_IPV6) != 0)
1634 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
1635 #endif
1636 				if (inp->inp_faddr.s_addr != INADDR_ANY)
1637 					wildcard++;
1638 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
1639 					if (laddr.s_addr == INADDR_ANY)
1640 						wildcard++;
1641 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
1642 						continue;
1643 				} else {
1644 					if (laddr.s_addr != INADDR_ANY)
1645 						wildcard++;
1646 				}
1647 				if (wildcard < matchwild) {
1648 					match = inp;
1649 					matchwild = wildcard;
1650 					if (matchwild == 0)
1651 						break;
1652 				}
1653 			}
1654 		}
1655 		return (match);
1656 	}
1657 }
1658 #undef INP_LOOKUP_MAPPED_PCB_COST
1659 
#ifdef PCBGROUP
/*
 * Lookup PCB in hash list, using pcbgroup tables.
 *
 * The search runs under the pcbgroup lock: first an exact 4-tuple match in
 * the group's hash, then (with RSS compiled in) a wildcard match in the same
 * group hash, then a wildcard match in the pcbinfo-wide wildcard table.  The
 * wildcard passes only run when INPLOOKUP_WILDCARD is set.
 *
 * On a hit the inpcb is referenced, the group lock is dropped, the inpcb is
 * locked as requested by INPLOOKUP_WLOCKPCB / INPLOOKUP_RLOCKPCB and the
 * temporary reference is released; NULL is returned if the inpcb turned out
 * to have been freed meanwhile.  The ifp argument is not used by this
 * function.
 */
static struct inpcb *
in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
    struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
    u_int lport_arg, int lookupflags, struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	u_short fport = fport_arg, lport = lport_arg;

	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	INP_GROUP_LOCK(pcbgroup);
	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbgroup->ipg_hashmask)];
	LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_laddr.s_addr != laddr.s_addr ||
		    inp->inp_fport != fport ||
		    inp->inp_lport != lport)
			continue;
		/*
		 * XXX We should be able to directly return
		 * the inp here, without any checks.
		 * Well unless both bound with SO_REUSEPORT?
		 */
		if (prison_flag(inp->inp_cred, PR_IP4))
			goto found;
		/* Remember the first non-jailed match as a fallback. */
		if (tmpinp == NULL)
			tmpinp = inp;
	}
	if (tmpinp != NULL) {
		inp = tmpinp;
		goto found;
	}

#ifdef	RSS
	/*
	 * For incoming connections, we may wish to do a wildcard
	 * match for an RSS-local socket.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */

		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
		    lport, 0, pcbgroup->ipg_hashmask)];
		LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				/* The jail must own the local address. */
				if (prison_check_ip4(inp->inp_cred,
				    &laddr) != 0)
					continue;
			} else if (local_exact != NULL) {
				/* Already have the best non-jailed hit. */
				continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					goto found;
				local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO) {
					local_wild_mapped = inp;
					continue;
				}
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
			}
		} /* LIST_FOREACH */

		/* Pick the best remaining candidate in preference order. */
		if (jail_wild != NULL)
			inp = jail_wild;
		else if (local_exact != NULL)
			inp = local_exact;
		else
			inp = local_wild;
#ifdef INET6
		if (inp == NULL)
			inp = local_wild_mapped;
#endif
		if (inp != NULL)
			goto found;
	}
#endif

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */
		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
		    0, pcbinfo->ipi_wildmask)];
		LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				/* The jail must own the local address. */
				if (prison_check_ip4(inp->inp_cred,
				    &laddr) != 0)
					continue;
			} else if (local_exact != NULL) {
				/* Already have the best non-jailed hit. */
				continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					goto found;
				local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO) {
					local_wild_mapped = inp;
					continue;
				}
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
			}
		} /* LIST_FOREACH */

		/* Pick the best remaining candidate in preference order. */
		if (jail_wild != NULL)
			inp = jail_wild;
		else if (local_exact != NULL)
			inp = local_exact;
		else
			inp = local_wild;
#ifdef INET6
		if (inp == NULL)
			inp = local_wild_mapped;
#endif
		if (inp != NULL)
			goto found;
	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
	INP_GROUP_UNLOCK(pcbgroup);
	return (NULL);

found:
	/*
	 * Hold a reference across the unlock/lock window, then trade it for
	 * the requested inpcb lock.
	 */
	in_pcbref(inp);
	INP_GROUP_UNLOCK(pcbgroup);
	if (lookupflags & INPLOOKUP_WLOCKPCB) {
		INP_WLOCK(inp);
		if (in_pcbrele_wlocked(inp))
			return (NULL);
	} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
		INP_RLOCK(inp);
		if (in_pcbrele_rlocked(inp))
			return (NULL);
	} else
		panic("%s: locking bug", __func__);
	return (inp);
}
#endif /* PCBGROUP */
1873 
1874 /*
1875  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
1876  * that the caller has locked the hash list, and will not perform any further
1877  * locking or reference operations on either the hash list or the connection.
1878  */
1879 static struct inpcb *
1880 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1881     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
1882     struct ifnet *ifp)
1883 {
1884 	struct inpcbhead *head;
1885 	struct inpcb *inp, *tmpinp;
1886 	u_short fport = fport_arg, lport = lport_arg;
1887 
1888 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1889 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
1890 
1891 	INP_HASH_LOCK_ASSERT(pcbinfo);
1892 
1893 	/*
1894 	 * First look for an exact match.
1895 	 */
1896 	tmpinp = NULL;
1897 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1898 	    pcbinfo->ipi_hashmask)];
1899 	LIST_FOREACH(inp, head, inp_hash) {
1900 #ifdef INET6
1901 		/* XXX inp locking */
1902 		if ((inp->inp_vflag & INP_IPV4) == 0)
1903 			continue;
1904 #endif
1905 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
1906 		    inp->inp_laddr.s_addr == laddr.s_addr &&
1907 		    inp->inp_fport == fport &&
1908 		    inp->inp_lport == lport) {
1909 			/*
1910 			 * XXX We should be able to directly return
1911 			 * the inp here, without any checks.
1912 			 * Well unless both bound with SO_REUSEPORT?
1913 			 */
1914 			if (prison_flag(inp->inp_cred, PR_IP4))
1915 				return (inp);
1916 			if (tmpinp == NULL)
1917 				tmpinp = inp;
1918 		}
1919 	}
1920 	if (tmpinp != NULL)
1921 		return (tmpinp);
1922 
1923 	/*
1924 	 * Then look for a wildcard match, if requested.
1925 	 */
1926 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
1927 		struct inpcb *local_wild = NULL, *local_exact = NULL;
1928 #ifdef INET6
1929 		struct inpcb *local_wild_mapped = NULL;
1930 #endif
1931 		struct inpcb *jail_wild = NULL;
1932 		int injail;
1933 
1934 		/*
1935 		 * Order of socket selection - we always prefer jails.
1936 		 *      1. jailed, non-wild.
1937 		 *      2. jailed, wild.
1938 		 *      3. non-jailed, non-wild.
1939 		 *      4. non-jailed, wild.
1940 		 */
1941 
1942 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1943 		    0, pcbinfo->ipi_hashmask)];
1944 		LIST_FOREACH(inp, head, inp_hash) {
1945 #ifdef INET6
1946 			/* XXX inp locking */
1947 			if ((inp->inp_vflag & INP_IPV4) == 0)
1948 				continue;
1949 #endif
1950 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
1951 			    inp->inp_lport != lport)
1952 				continue;
1953 
1954 			injail = prison_flag(inp->inp_cred, PR_IP4);
1955 			if (injail) {
1956 				if (prison_check_ip4(inp->inp_cred,
1957 				    &laddr) != 0)
1958 					continue;
1959 			} else {
1960 				if (local_exact != NULL)
1961 					continue;
1962 			}
1963 
1964 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
1965 				if (injail)
1966 					return (inp);
1967 				else
1968 					local_exact = inp;
1969 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1970 #ifdef INET6
1971 				/* XXX inp locking, NULL check */
1972 				if (inp->inp_vflag & INP_IPV6PROTO)
1973 					local_wild_mapped = inp;
1974 				else
1975 #endif
1976 					if (injail)
1977 						jail_wild = inp;
1978 					else
1979 						local_wild = inp;
1980 			}
1981 		} /* LIST_FOREACH */
1982 		if (jail_wild != NULL)
1983 			return (jail_wild);
1984 		if (local_exact != NULL)
1985 			return (local_exact);
1986 		if (local_wild != NULL)
1987 			return (local_wild);
1988 #ifdef INET6
1989 		if (local_wild_mapped != NULL)
1990 			return (local_wild_mapped);
1991 #endif
1992 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
1993 
1994 	return (NULL);
1995 }
1996 
1997 /*
1998  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
1999  * hash list lock, and will return the inpcb locked (i.e., requires
2000  * INPLOOKUP_LOCKPCB).
2001  */
2002 static struct inpcb *
2003 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2004     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2005     struct ifnet *ifp)
2006 {
2007 	struct inpcb *inp;
2008 
2009 	INP_HASH_RLOCK(pcbinfo);
2010 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2011 	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
2012 	if (inp != NULL) {
2013 		in_pcbref(inp);
2014 		INP_HASH_RUNLOCK(pcbinfo);
2015 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
2016 			INP_WLOCK(inp);
2017 			if (in_pcbrele_wlocked(inp))
2018 				return (NULL);
2019 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
2020 			INP_RLOCK(inp);
2021 			if (in_pcbrele_rlocked(inp))
2022 				return (NULL);
2023 		} else
2024 			panic("%s: locking bug", __func__);
2025 	} else
2026 		INP_HASH_RUNLOCK(pcbinfo);
2027 	return (inp);
2028 }
2029 
2030 /*
2031  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2032  * from which a pre-calculated hash value may be extracted.
2033  *
2034  * Possibly more of this logic should be in in_pcbgroup.c.
2035  */
2036 struct inpcb *
2037 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2038     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2039 {
2040 #if defined(PCBGROUP) && !defined(RSS)
2041 	struct inpcbgroup *pcbgroup;
2042 #endif
2043 
2044 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2045 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2046 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2047 	    ("%s: LOCKPCB not set", __func__));
2048 
2049 	/*
2050 	 * When not using RSS, use connection groups in preference to the
2051 	 * reservation table when looking up 4-tuples.  When using RSS, just
2052 	 * use the reservation table, due to the cost of the Toeplitz hash
2053 	 * in software.
2054 	 *
2055 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
2056 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
2057 	 * in software.
2058 	 */
2059 #if defined(PCBGROUP) && !defined(RSS)
2060 	if (in_pcbgroup_enabled(pcbinfo)) {
2061 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2062 		    fport);
2063 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2064 		    laddr, lport, lookupflags, ifp));
2065 	}
2066 #endif
2067 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2068 	    lookupflags, ifp));
2069 }
2070 
2071 struct inpcb *
2072 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2073     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2074     struct ifnet *ifp, struct mbuf *m)
2075 {
2076 #ifdef PCBGROUP
2077 	struct inpcbgroup *pcbgroup;
2078 #endif
2079 
2080 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2081 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2082 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2083 	    ("%s: LOCKPCB not set", __func__));
2084 
2085 #ifdef PCBGROUP
2086 	/*
2087 	 * If we can use a hardware-generated hash to look up the connection
2088 	 * group, use that connection group to find the inpcb.  Otherwise
2089 	 * fall back on a software hash -- or the reservation table if we're
2090 	 * using RSS.
2091 	 *
2092 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
2093 	 */
2094 	if (in_pcbgroup_enabled(pcbinfo) &&
2095 	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
2096 		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
2097 		    m->m_pkthdr.flowid);
2098 		if (pcbgroup != NULL)
2099 			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
2100 			    fport, laddr, lport, lookupflags, ifp));
2101 #ifndef RSS
2102 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2103 		    fport);
2104 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2105 		    laddr, lport, lookupflags, ifp));
2106 #endif
2107 	}
2108 #endif
2109 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2110 	    lookupflags, ifp));
2111 }
2112 #endif /* INET */
2113 
2114 /*
2115  * Insert PCB onto various hash lists.
2116  */
2117 static int
2118 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
2119 {
2120 	struct inpcbhead *pcbhash;
2121 	struct inpcbporthead *pcbporthash;
2122 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2123 	struct inpcbport *phd;
2124 	u_int32_t hashkey_faddr;
2125 
2126 	INP_WLOCK_ASSERT(inp);
2127 	INP_HASH_WLOCK_ASSERT(pcbinfo);
2128 
2129 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2130 	    ("in_pcbinshash: INP_INHASHLIST"));
2131 
2132 #ifdef INET6
2133 	if (inp->inp_vflag & INP_IPV6)
2134 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2135 	else
2136 #endif
2137 	hashkey_faddr = inp->inp_faddr.s_addr;
2138 
2139 	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2140 		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2141 
2142 	pcbporthash = &pcbinfo->ipi_porthashbase[
2143 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2144 
2145 	/*
2146 	 * Go through port list and look for a head for this lport.
2147 	 */
2148 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
2149 		if (phd->phd_port == inp->inp_lport)
2150 			break;
2151 	}
2152 	/*
2153 	 * If none exists, malloc one and tack it on.
2154 	 */
2155 	if (phd == NULL) {
2156 		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2157 		if (phd == NULL) {
2158 			return (ENOBUFS); /* XXX */
2159 		}
2160 		phd->phd_port = inp->inp_lport;
2161 		LIST_INIT(&phd->phd_pcblist);
2162 		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2163 	}
2164 	inp->inp_phd = phd;
2165 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2166 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2167 	inp->inp_flags |= INP_INHASHLIST;
2168 #ifdef PCBGROUP
2169 	if (do_pcbgroup_update)
2170 		in_pcbgroup_update(inp);
2171 #endif
2172 	return (0);
2173 }
2174 
/*
 * For now, there are two public interfaces to insert an inpcb into the hash
 * lists -- one that does update pcbgroups (in_pcbinshash), and one that
 * doesn't (in_pcbinshash_nopcbgroup).  The latter is used only in the TCP
 * syncache, where in_pcbinshash is called before the full 4-tuple is set
 * for the inpcb, and we don't want to install in the pcbgroup until later.
 *
 * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
 * connection groups, and partially initialised inpcbs should not be exposed
 * to either reservation hash tables or pcbgroups.
 *
 * Returns whatever in_pcbinshash_internal() returns.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	int error;

	error = in_pcbinshash_internal(inp, 1);
	return (error);
}
2192 
/*
 * Insert an inpcb into the hash lists without requesting a pcbgroup update
 * (do_pcbgroup_update == 0).  Returns whatever in_pcbinshash_internal()
 * returns.
 */
int
in_pcbinshash_nopcbgroup(struct inpcb *inp)
{
	int error;

	error = in_pcbinshash_internal(inp, 0);
	return (error);
}
2199 
2200 /*
2201  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2202  * changed. NOTE: This does not handle the case of the lport changing (the
2203  * hashed port list would have to be updated as well), so the lport must
2204  * not change after in_pcbinshash() has been called.
2205  */
2206 void
2207 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
2208 {
2209 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2210 	struct inpcbhead *head;
2211 	u_int32_t hashkey_faddr;
2212 
2213 	INP_WLOCK_ASSERT(inp);
2214 	INP_HASH_WLOCK_ASSERT(pcbinfo);
2215 
2216 	KASSERT(inp->inp_flags & INP_INHASHLIST,
2217 	    ("in_pcbrehash: !INP_INHASHLIST"));
2218 
2219 #ifdef INET6
2220 	if (inp->inp_vflag & INP_IPV6)
2221 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2222 	else
2223 #endif
2224 	hashkey_faddr = inp->inp_faddr.s_addr;
2225 
2226 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2227 		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2228 
2229 	LIST_REMOVE(inp, inp_hash);
2230 	LIST_INSERT_HEAD(head, inp, inp_hash);
2231 
2232 #ifdef PCBGROUP
2233 	if (m != NULL)
2234 		in_pcbgroup_update_mbuf(inp, m);
2235 	else
2236 		in_pcbgroup_update(inp);
2237 #endif
2238 }
2239 
2240 void
2241 in_pcbrehash(struct inpcb *inp)
2242 {
2243 
2244 	in_pcbrehash_mbuf(inp, NULL);
2245 }
2246 
2247 /*
2248  * Remove PCB from various lists.
2249  */
2250 static void
2251 in_pcbremlists(struct inpcb *inp)
2252 {
2253 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2254 
2255 #ifdef INVARIANTS
2256 	if (pcbinfo == &V_tcbinfo) {
2257 		INP_INFO_RLOCK_ASSERT(pcbinfo);
2258 	} else {
2259 		INP_INFO_WLOCK_ASSERT(pcbinfo);
2260 	}
2261 #endif
2262 
2263 	INP_WLOCK_ASSERT(inp);
2264 	INP_LIST_WLOCK_ASSERT(pcbinfo);
2265 
2266 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
2267 	if (inp->inp_flags & INP_INHASHLIST) {
2268 		struct inpcbport *phd = inp->inp_phd;
2269 
2270 		INP_HASH_WLOCK(pcbinfo);
2271 		LIST_REMOVE(inp, inp_hash);
2272 		LIST_REMOVE(inp, inp_portlist);
2273 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
2274 			LIST_REMOVE(phd, phd_hash);
2275 			free(phd, M_PCB);
2276 		}
2277 		INP_HASH_WUNLOCK(pcbinfo);
2278 		inp->inp_flags &= ~INP_INHASHLIST;
2279 	}
2280 	LIST_REMOVE(inp, inp_list);
2281 	pcbinfo->ipi_count--;
2282 #ifdef PCBGROUP
2283 	in_pcbgroup_remove(inp);
2284 #endif
2285 }
2286 
2287 /*
2288  * Check for alternatives when higher level complains
2289  * about service problems.  For now, invalidate cached
2290  * routing information.  If the route was created dynamically
2291  * (by a redirect), time to try a default gateway again.
2292  */
2293 void
2294 in_losing(struct inpcb *inp)
2295 {
2296 
2297 	if (inp->inp_route.ro_rt) {
2298 		RTFREE(inp->inp_route.ro_rt);
2299 		inp->inp_route.ro_rt = (struct rtentry *)NULL;
2300 	}
2301 	if (inp->inp_route.ro_lle)
2302 		LLE_FREE(inp->inp_route.ro_lle);	/* zeros ro_lle */
2303 	return;
2304 }
2305 
/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 *
 * This is a no-op unless the kernel is built with MAC.  The socket must
 * have an inpcb attached (asserted).
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* inpcb lock first, then socket lock; released in reverse order. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2326 
2327 /*
2328  * ipport_tick runs once per second, determining if random port allocation
2329  * should be continued.  If more than ipport_randomcps ports have been
2330  * allocated in the last second, then we return to sequential port
2331  * allocation. We return to random allocation only once we drop below
2332  * ipport_randomcps for at least ipport_randomtime seconds.
2333  */
2334 static void
2335 ipport_tick(void *xtp)
2336 {
2337 	VNET_ITERATOR_DECL(vnet_iter);
2338 
2339 	VNET_LIST_RLOCK_NOSLEEP();
2340 	VNET_FOREACH(vnet_iter) {
2341 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
2342 		if (V_ipport_tcpallocs <=
2343 		    V_ipport_tcplastcount + V_ipport_randomcps) {
2344 			if (V_ipport_stoprandom > 0)
2345 				V_ipport_stoprandom--;
2346 		} else
2347 			V_ipport_stoprandom = V_ipport_randomtime;
2348 		V_ipport_tcplastcount = V_ipport_tcpallocs;
2349 		CURVNET_RESTORE();
2350 	}
2351 	VNET_LIST_RUNLOCK_NOSLEEP();
2352 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2353 }
2354 
2355 static void
2356 ip_fini(void *xtp)
2357 {
2358 
2359 	callout_stop(&ipport_tick_callout);
2360 }
2361 
2362 /*
2363  * The ipport_callout should start running at about the time we attach the
2364  * inet or inet6 domains.
2365  */
2366 static void
2367 ipport_tick_init(const void *unused __unused)
2368 {
2369 
2370 	/* Start ipport_tick. */
2371 	callout_init(&ipport_tick_callout, 1);
2372 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2373 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2374 		SHUTDOWN_PRI_DEFAULT);
2375 }
2376 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
2377     ipport_tick_init, NULL);
2378 
/*
 * Function wrapper around the INP_WLOCK() macro: write-lock the inpcb.
 */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}
2385 
/*
 * Function wrapper around the INP_WUNLOCK() macro: drop the inpcb write
 * lock.
 */
void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}
2392 
/*
 * Function wrapper around the INP_RLOCK() macro: read-lock the inpcb.
 */
void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}
2399 
/*
 * Function wrapper around the INP_RUNLOCK() macro: drop the inpcb read
 * lock.
 */
void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2406 
2407 #ifdef INVARIANTS
/*
 * Function wrapper around INP_WLOCK_ASSERT(): note that despite the generic
 * name this asserts the inpcb *write* lock.
 */
void
inp_lock_assert(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
}
2414 
/*
 * Function wrapper around the INP_UNLOCK_ASSERT() macro.
 */
void
inp_unlock_assert(struct inpcb *inp)
{
	INP_UNLOCK_ASSERT(inp);
}
2421 #endif
2422 
2423 void
2424 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2425 {
2426 	struct inpcb *inp;
2427 
2428 	INP_INFO_WLOCK(&V_tcbinfo);
2429 	LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
2430 		INP_WLOCK(inp);
2431 		func(inp, arg);
2432 		INP_WUNLOCK(inp);
2433 	}
2434 	INP_INFO_WUNLOCK(&V_tcbinfo);
2435 }
2436 
2437 struct socket *
2438 inp_inpcbtosocket(struct inpcb *inp)
2439 {
2440 
2441 	INP_WLOCK_ASSERT(inp);
2442 	return (inp->inp_socket);
2443 }
2444 
2445 struct tcpcb *
2446 inp_inpcbtotcpcb(struct inpcb *inp)
2447 {
2448 
2449 	INP_WLOCK_ASSERT(inp);
2450 	return ((struct tcpcb *)inp->inp_ppcb);
2451 }
2452 
2453 int
2454 inp_ip_tos_get(const struct inpcb *inp)
2455 {
2456 
2457 	return (inp->inp_ip_tos);
2458 }
2459 
2460 void
2461 inp_ip_tos_set(struct inpcb *inp, int val)
2462 {
2463 
2464 	inp->inp_ip_tos = val;
2465 }
2466 
2467 void
2468 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2469     uint32_t *faddr, uint16_t *fp)
2470 {
2471 
2472 	INP_LOCK_ASSERT(inp);
2473 	*laddr = inp->inp_laddr.s_addr;
2474 	*faddr = inp->inp_faddr.s_addr;
2475 	*lp = inp->inp_lport;
2476 	*fp = inp->inp_fport;
2477 }
2478 
/*
 * Function wrapper around the sotoinpcb() conversion for a socket.
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
2485 
/*
 * Function wrapper around the sototcpcb() conversion for a socket.
 */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
2492 
2493 #ifdef DDB
/*
 * Emit 'indent' space characters via db_printf().  Nothing is printed when
 * indent is zero or negative.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	remaining = indent;
	while (remaining > 0) {
		db_printf(" ");
		remaining--;
	}
}
2502 
2503 static void
2504 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
2505 {
2506 	char faddr_str[48], laddr_str[48];
2507 
2508 	db_print_indent(indent);
2509 	db_printf("%s at %p\n", name, inc);
2510 
2511 	indent += 2;
2512 
2513 #ifdef INET6
2514 	if (inc->inc_flags & INC_ISIPV6) {
2515 		/* IPv6. */
2516 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
2517 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
2518 	} else
2519 #endif
2520 	{
2521 		/* IPv4. */
2522 		inet_ntoa_r(inc->inc_laddr, laddr_str);
2523 		inet_ntoa_r(inc->inc_faddr, faddr_str);
2524 	}
2525 	db_print_indent(indent);
2526 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
2527 	    ntohs(inc->inc_lport));
2528 	db_print_indent(indent);
2529 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
2530 	    ntohs(inc->inc_fport));
2531 }
2532 
2533 static void
2534 db_print_inpflags(int inp_flags)
2535 {
2536 	int comma;
2537 
2538 	comma = 0;
2539 	if (inp_flags & INP_RECVOPTS) {
2540 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
2541 		comma = 1;
2542 	}
2543 	if (inp_flags & INP_RECVRETOPTS) {
2544 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
2545 		comma = 1;
2546 	}
2547 	if (inp_flags & INP_RECVDSTADDR) {
2548 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
2549 		comma = 1;
2550 	}
2551 	if (inp_flags & INP_HDRINCL) {
2552 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
2553 		comma = 1;
2554 	}
2555 	if (inp_flags & INP_HIGHPORT) {
2556 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
2557 		comma = 1;
2558 	}
2559 	if (inp_flags & INP_LOWPORT) {
2560 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
2561 		comma = 1;
2562 	}
2563 	if (inp_flags & INP_ANONPORT) {
2564 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
2565 		comma = 1;
2566 	}
2567 	if (inp_flags & INP_RECVIF) {
2568 		db_printf("%sINP_RECVIF", comma ? ", " : "");
2569 		comma = 1;
2570 	}
2571 	if (inp_flags & INP_MTUDISC) {
2572 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
2573 		comma = 1;
2574 	}
2575 	if (inp_flags & INP_RECVTTL) {
2576 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
2577 		comma = 1;
2578 	}
2579 	if (inp_flags & INP_DONTFRAG) {
2580 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
2581 		comma = 1;
2582 	}
2583 	if (inp_flags & INP_RECVTOS) {
2584 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
2585 		comma = 1;
2586 	}
2587 	if (inp_flags & IN6P_IPV6_V6ONLY) {
2588 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
2589 		comma = 1;
2590 	}
2591 	if (inp_flags & IN6P_PKTINFO) {
2592 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
2593 		comma = 1;
2594 	}
2595 	if (inp_flags & IN6P_HOPLIMIT) {
2596 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
2597 		comma = 1;
2598 	}
2599 	if (inp_flags & IN6P_HOPOPTS) {
2600 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
2601 		comma = 1;
2602 	}
2603 	if (inp_flags & IN6P_DSTOPTS) {
2604 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
2605 		comma = 1;
2606 	}
2607 	if (inp_flags & IN6P_RTHDR) {
2608 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
2609 		comma = 1;
2610 	}
2611 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
2612 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
2613 		comma = 1;
2614 	}
2615 	if (inp_flags & IN6P_TCLASS) {
2616 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
2617 		comma = 1;
2618 	}
2619 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
2620 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
2621 		comma = 1;
2622 	}
2623 	if (inp_flags & INP_TIMEWAIT) {
2624 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
2625 		comma  = 1;
2626 	}
2627 	if (inp_flags & INP_ONESBCAST) {
2628 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
2629 		comma  = 1;
2630 	}
2631 	if (inp_flags & INP_DROPPED) {
2632 		db_printf("%sINP_DROPPED", comma ? ", " : "");
2633 		comma  = 1;
2634 	}
2635 	if (inp_flags & INP_SOCKREF) {
2636 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
2637 		comma  = 1;
2638 	}
2639 	if (inp_flags & IN6P_RFC2292) {
2640 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
2641 		comma = 1;
2642 	}
2643 	if (inp_flags & IN6P_MTU) {
2644 		db_printf("IN6P_MTU%s", comma ? ", " : "");
2645 		comma = 1;
2646 	}
2647 }
2648 
2649 static void
2650 db_print_inpvflag(u_char inp_vflag)
2651 {
2652 	int comma;
2653 
2654 	comma = 0;
2655 	if (inp_vflag & INP_IPV4) {
2656 		db_printf("%sINP_IPV4", comma ? ", " : "");
2657 		comma  = 1;
2658 	}
2659 	if (inp_vflag & INP_IPV6) {
2660 		db_printf("%sINP_IPV6", comma ? ", " : "");
2661 		comma  = 1;
2662 	}
2663 	if (inp_vflag & INP_IPV6PROTO) {
2664 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
2665 		comma  = 1;
2666 	}
2667 }
2668 
2669 static void
2670 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
2671 {
2672 
2673 	db_print_indent(indent);
2674 	db_printf("%s at %p\n", name, inp);
2675 
2676 	indent += 2;
2677 
2678 	db_print_indent(indent);
2679 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
2680 
2681 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
2682 
2683 	db_print_indent(indent);
2684 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
2685 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
2686 
2687 	db_print_indent(indent);
2688 	db_printf("inp_label: %p   inp_flags: 0x%x (",
2689 	   inp->inp_label, inp->inp_flags);
2690 	db_print_inpflags(inp->inp_flags);
2691 	db_printf(")\n");
2692 
2693 	db_print_indent(indent);
2694 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
2695 	    inp->inp_vflag);
2696 	db_print_inpvflag(inp->inp_vflag);
2697 	db_printf(")\n");
2698 
2699 	db_print_indent(indent);
2700 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
2701 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
2702 
2703 	db_print_indent(indent);
2704 #ifdef INET6
2705 	if (inp->inp_vflag & INP_IPV6) {
2706 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
2707 		    "in6p_moptions: %p\n", inp->in6p_options,
2708 		    inp->in6p_outputopts, inp->in6p_moptions);
2709 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
2710 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
2711 		    inp->in6p_hops);
2712 	} else
2713 #endif
2714 	{
2715 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
2716 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
2717 		    inp->inp_options, inp->inp_moptions);
2718 	}
2719 
2720 	db_print_indent(indent);
2721 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
2722 	    (uintmax_t)inp->inp_gencnt);
2723 }
2724 
2725 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
2726 {
2727 	struct inpcb *inp;
2728 
2729 	if (!have_addr) {
2730 		db_printf("usage: show inpcb <addr>\n");
2731 		return;
2732 	}
2733 	inp = (struct inpcb *)addr;
2734 
2735 	db_print_inpcb(inp, "inpcb", 0);
2736 }
2737 #endif /* DDB */
2738