1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1991, 1993, 1995
5 * The Regents of the University of California.
6 * Copyright (c) 2007-2009 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * All rights reserved.
9 *
10 * Portions of this software were developed by Robert N. M. Watson under
11 * contract to Juniper Networks, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
38 */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 #include "opt_ddb.h"
44 #include "opt_ipsec.h"
45 #include "opt_inet.h"
46 #include "opt_inet6.h"
47 #include "opt_ratelimit.h"
48 #include "opt_pcbgroup.h"
49 #include "opt_route.h"
50 #include "opt_rss.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/mbuf.h>
57 #include <sys/callout.h>
58 #include <sys/eventhandler.h>
59 #include <sys/domain.h>
60 #include <sys/protosw.h>
61 #include <sys/rmlock.h>
62 #include <sys/smp.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sockio.h>
66 #include <sys/priv.h>
67 #include <sys/proc.h>
68 #include <sys/refcount.h>
69 #include <sys/jail.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76
77 #include <vm/uma.h>
78 #include <vm/vm.h>
79
80 #include <net/if.h>
81 #include <net/if_var.h>
82 #include <net/if_types.h>
83 #include <net/if_llatbl.h>
84 #include <net/route.h>
85 #include <net/rss_config.h>
86 #include <net/vnet.h>
87
88 #if defined(INET) || defined(INET6)
89 #include <netinet/in.h>
90 #include <netinet/in_pcb.h>
91 #ifdef INET
92 #include <netinet/in_var.h>
93 #include <netinet/in_fib.h>
94 #endif
95 #include <netinet/ip_var.h>
96 #include <netinet/tcp_var.h>
97 #ifdef TCPHPTS
98 #include <netinet/tcp_hpts.h>
99 #endif
100 #include <netinet/udp.h>
101 #include <netinet/udp_var.h>
102 #ifdef INET6
103 #include <netinet/ip6.h>
104 #include <netinet6/in6_pcb.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/ip6_var.h>
107 #endif /* INET6 */
108 #include <net/route/nhop.h>
109 #endif
110
111 #include <netipsec/ipsec_support.h>
112
113 #include <security/mac/mac_framework.h>
114
115 #ifdef FSTACK
116 #include "ff_host_interface.h"
117 #endif
118
/*
 * Bounds on the number of il_inp[] slots in a load-balance group: new
 * groups are created with INPCBLBGROUP_SIZMIN slots and a group whose size
 * has reached INPCBLBGROUP_SIZMAX is not expanded any further.
 */
#define INPCBLBGROUP_SIZMIN 8
#define INPCBLBGROUP_SIZMAX 256

/* Callout handle; presumably drives ipport_tick (not visible here). */
static struct callout ipport_tick_callout;
123
/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * in_pcb_lport_dest() picks one (first, last) pair: the "low" pair for
 * INP_LOWPORT sockets, the "hi" pair for INP_HIGHPORT sockets and the
 * plain pair otherwise.  A pair may be stored with first > last; the
 * allocator swaps the two before using them.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 *
 * in_pcbbind_setup() requires PRIV_NETINET_RESERVEDPORT to bind a port in
 * [ipport_reservedlow, ipport_reservedhigh].
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_reservedlow);	/* no explicit initializer */

/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */
/* Allocation counter bumped by in_pcb_lport_dest(). */
VNET_DEFINE(int, ipport_tcpallocs);
/* Presumably the ipport_tcpallocs value seen at the previous tick. */
VNET_DEFINE_STATIC(int, ipport_tcplastcount);

#define V_ipport_tcplastcount VNET(ipport_tcplastcount)

/* Forward declaration; the definition is not within this part of the file. */
static void in_pcbremlists(struct inpcb *inp);
154 #ifdef INET
/*
 * Forward declaration of the IPv4 connection lookup used by
 * in_pcb_lport_dest(), which calls it with the pcbinfo hash lock held to
 * test whether a candidate (faddr, fport, laddr, lport) tuple is in use.
 */
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
    struct in_addr faddr, u_int fport_arg,
    struct in_addr laddr, u_int lport_arg,
    int lookupflags, struct ifnet *ifp,
    uint8_t numa_domain);
160
161 #define RANGECHK(var, min, max) \
162 if ((var) < (min)) { (var) = (min); } \
163 else if ((var) > (max)) { (var) = (max); }
164
165 static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)166 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
167 {
168 int error;
169
170 error = sysctl_handle_int(oidp, arg1, arg2, req);
171 if (error == 0) {
172 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
173 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
174 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
175 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
176 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
177 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
178 }
179 return (error);
180 }
181
182 #undef RANGECHK
183
/* Parent node for the net.inet.ip.portrange.* sysctls declared below. */
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IP Ports");

/*
 * The six automatic port range knobs.  Each one is a per-vnet integer and
 * goes through sysctl_net_ipport_check(), which clamps all six values after
 * a successful access.
 */
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
    "");
212 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
213 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
214 &VNET_NAME(ipport_reservedhigh), 0, "");
215 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
216 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
217 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
218 CTLFLAG_VNET | CTLFLAG_RW,
219 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
220 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
221 CTLFLAG_VNET | CTLFLAG_RW,
222 &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
223 "allocations before switching to a sequental one");
224 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
225 CTLFLAG_VNET | CTLFLAG_RW,
226 &VNET_NAME(ipport_randomtime), 0,
227 "Minimum time to keep sequental port "
228 "allocation before switching to a random one");
229
#ifdef RATELIMIT
/*
 * Rate limiting statistics, exported read-only under net.inet.ip.rl.
 * The code that allocates and updates these counters is not in this part
 * of the file.
 */
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "IP Rate Limiting");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active, "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail, "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok, "Rate limited setting succeeded");
#endif /* RATELIMIT */
244
245 #endif /* INET */
246
247 /*
248 * in_pcb.c: manage the Protocol Control Blocks.
249 *
250 * NOTE: It is assumed that most of these functions will be called with
251 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
252 * functions often modify hash chains or addresses in pcbs.
253 */
254
255 static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead * hdr,u_char vflag,uint16_t port,const union in_dependaddr * addr,int size,uint8_t numa_domain)256 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
257 uint16_t port, const union in_dependaddr *addr, int size,
258 uint8_t numa_domain)
259 {
260 struct inpcblbgroup *grp;
261 size_t bytes;
262
263 bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
264 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
265 if (!grp)
266 return (NULL);
267 grp->il_vflag = vflag;
268 grp->il_lport = port;
269 grp->il_numa_domain = numa_domain;
270 grp->il_dependladdr = *addr;
271 grp->il_inpsiz = size;
272 CK_LIST_INSERT_HEAD(hdr, grp, il_list);
273 return (grp);
274 }
275
276 static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)277 in_pcblbgroup_free_deferred(epoch_context_t ctx)
278 {
279 struct inpcblbgroup *grp;
280
281 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
282 free(grp, M_PCB);
283 }
284
/*
 * Retire a load-balance group: unlink it from its hash chain, then hand it
 * to NET_EPOCH_CALL() so that in_pcblbgroup_free_deferred() releases the
 * memory later rather than immediately.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	/* Unlink first; the memory itself is freed by the deferred callback. */
	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
292
293 static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead * hdr,struct inpcblbgroup * old_grp,int size)294 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
295 struct inpcblbgroup *old_grp, int size)
296 {
297 struct inpcblbgroup *grp;
298 int i;
299
300 grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
301 old_grp->il_lport, &old_grp->il_dependladdr, size,
302 old_grp->il_numa_domain);
303 if (grp == NULL)
304 return (NULL);
305
306 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
307 ("invalid new local group size %d and old local group count %d",
308 grp->il_inpsiz, old_grp->il_inpcnt));
309
310 for (i = 0; i < old_grp->il_inpcnt; ++i)
311 grp->il_inp[i] = old_grp->il_inp[i];
312 grp->il_inpcnt = old_grp->il_inpcnt;
313 in_pcblbgroup_free(old_grp);
314 return (grp);
315 }
316
317 /*
318 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
319 * and shrink group if possible.
320 */
321 static void
in_pcblbgroup_reorder(struct inpcblbgrouphead * hdr,struct inpcblbgroup ** grpp,int i)322 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
323 int i)
324 {
325 struct inpcblbgroup *grp, *new_grp;
326
327 grp = *grpp;
328 for (; i + 1 < grp->il_inpcnt; ++i)
329 grp->il_inp[i] = grp->il_inp[i + 1];
330 grp->il_inpcnt--;
331
332 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
333 grp->il_inpcnt <= grp->il_inpsiz / 4) {
334 /* Shrink this group. */
335 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
336 if (new_grp != NULL)
337 *grpp = new_grp;
338 }
339 }
340
341 /*
342 * Add PCB to load balance group for SO_REUSEPORT_LB option.
343 */
344 static int
in_pcbinslbgrouphash(struct inpcb * inp,uint8_t numa_domain)345 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
346 {
347 const static struct timeval interval = { 60, 0 };
348 static struct timeval lastprint;
349 struct inpcbinfo *pcbinfo;
350 struct inpcblbgrouphead *hdr;
351 struct inpcblbgroup *grp;
352 uint32_t idx;
353
354 pcbinfo = inp->inp_pcbinfo;
355
356 INP_WLOCK_ASSERT(inp);
357 INP_HASH_WLOCK_ASSERT(pcbinfo);
358
359 /*
360 * Don't allow jailed socket to join local group.
361 */
362 if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
363 return (0);
364
365 #ifdef INET6
366 /*
367 * Don't allow IPv4 mapped INET6 wild socket.
368 */
369 if ((inp->inp_vflag & INP_IPV4) &&
370 inp->inp_laddr.s_addr == INADDR_ANY &&
371 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
372 return (0);
373 }
374 #endif
375
376 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
377 hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
378 CK_LIST_FOREACH(grp, hdr, il_list) {
379 if (grp->il_vflag == inp->inp_vflag &&
380 grp->il_lport == inp->inp_lport &&
381 grp->il_numa_domain == numa_domain &&
382 memcmp(&grp->il_dependladdr,
383 &inp->inp_inc.inc_ie.ie_dependladdr,
384 sizeof(grp->il_dependladdr)) == 0)
385 break;
386 }
387 if (grp == NULL) {
388 /* Create new load balance group. */
389 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
390 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
391 INPCBLBGROUP_SIZMIN, numa_domain);
392 if (grp == NULL)
393 return (ENOBUFS);
394 } else if (grp->il_inpcnt == grp->il_inpsiz) {
395 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
396 if (ratecheck(&lastprint, &interval))
397 printf("lb group port %d, limit reached\n",
398 ntohs(grp->il_lport));
399 return (0);
400 }
401
402 /* Expand this local group. */
403 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
404 if (grp == NULL)
405 return (ENOBUFS);
406 }
407
408 KASSERT(grp->il_inpcnt < grp->il_inpsiz,
409 ("invalid local group size %d and count %d", grp->il_inpsiz,
410 grp->il_inpcnt));
411
412 grp->il_inp[grp->il_inpcnt] = inp;
413 grp->il_inpcnt++;
414 return (0);
415 }
416
417 /*
418 * Remove PCB from load balance group.
419 */
420 static void
in_pcbremlbgrouphash(struct inpcb * inp)421 in_pcbremlbgrouphash(struct inpcb *inp)
422 {
423 struct inpcbinfo *pcbinfo;
424 struct inpcblbgrouphead *hdr;
425 struct inpcblbgroup *grp;
426 int i;
427
428 pcbinfo = inp->inp_pcbinfo;
429
430 INP_WLOCK_ASSERT(inp);
431 INP_HASH_WLOCK_ASSERT(pcbinfo);
432
433 hdr = &pcbinfo->ipi_lbgrouphashbase[
434 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
435 CK_LIST_FOREACH(grp, hdr, il_list) {
436 for (i = 0; i < grp->il_inpcnt; ++i) {
437 if (grp->il_inp[i] != inp)
438 continue;
439
440 if (grp->il_inpcnt == 1) {
441 /* We are the last, free this local group. */
442 in_pcblbgroup_free(grp);
443 } else {
444 /* Pull up inpcbs, shrink group if possible. */
445 in_pcblbgroup_reorder(hdr, &grp, i);
446 }
447 return;
448 }
449 }
450 }
451
452 int
in_pcblbgroup_numa(struct inpcb * inp,int arg)453 in_pcblbgroup_numa(struct inpcb *inp, int arg)
454 {
455 struct inpcbinfo *pcbinfo;
456 struct inpcblbgrouphead *hdr;
457 struct inpcblbgroup *grp;
458 int err, i;
459 uint8_t numa_domain;
460
461 switch (arg) {
462 case TCP_REUSPORT_LB_NUMA_NODOM:
463 numa_domain = M_NODOM;
464 break;
465 case TCP_REUSPORT_LB_NUMA_CURDOM:
466 numa_domain = PCPU_GET(domain);
467 break;
468 default:
469 if (arg < 0 || arg >= vm_ndomains)
470 return (EINVAL);
471 numa_domain = arg;
472 }
473
474 err = 0;
475 pcbinfo = inp->inp_pcbinfo;
476 INP_WLOCK_ASSERT(inp);
477 INP_HASH_WLOCK(pcbinfo);
478 hdr = &pcbinfo->ipi_lbgrouphashbase[
479 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
480 CK_LIST_FOREACH(grp, hdr, il_list) {
481 for (i = 0; i < grp->il_inpcnt; ++i) {
482 if (grp->il_inp[i] != inp)
483 continue;
484
485 if (grp->il_numa_domain == numa_domain) {
486 goto abort_with_hash_wlock;
487 }
488
489 /* Remove it from the old group. */
490 in_pcbremlbgrouphash(inp);
491
492 /* Add it to the new group based on numa domain. */
493 in_pcbinslbgrouphash(inp, numa_domain);
494 goto abort_with_hash_wlock;
495 }
496 }
497 err = ENOENT;
498 abort_with_hash_wlock:
499 INP_HASH_WUNLOCK(pcbinfo);
500 return (err);
501 }
502
/*
 * Zone fini routine common to every inpcb zone.  Protocols pass their own
 * init routine to in_pcbinfo_init() (and may name the lock differently),
 * but disposal is always the same: destroy the inpcb lock.  'size' is
 * unused.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_DESTROY(pcb);
}
514
515 /*
516 * Initialize an inpcbinfo -- we should be able to reduce the number of
517 * arguments in time.
518 */
519 void
in_pcbinfo_init(struct inpcbinfo * pcbinfo,const char * name,struct inpcbhead * listhead,int hash_nelements,int porthash_nelements,char * inpcbzone_name,uma_init inpcbzone_init,u_int hashfields)520 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
521 struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
522 char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
523 {
524
525 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
526
527 INP_INFO_LOCK_INIT(pcbinfo, name);
528 INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */
529 INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
530 #ifdef VIMAGE
531 pcbinfo->ipi_vnet = curvnet;
532 #endif
533 pcbinfo->ipi_listhead = listhead;
534 CK_LIST_INIT(pcbinfo->ipi_listhead);
535 pcbinfo->ipi_count = 0;
536 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
537 &pcbinfo->ipi_hashmask);
538 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
539 &pcbinfo->ipi_porthashmask);
540 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
541 &pcbinfo->ipi_lbgrouphashmask);
542 #ifdef PCBGROUP
543 in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
544 #endif
545 pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
546 NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
547 uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
548 uma_zone_set_warning(pcbinfo->ipi_zone,
549 "kern.ipc.maxsockets limit reached");
550 }
551
/*
 * Destroy an inpcbinfo.
 *
 * Tears down the three hash tables, the optional pcbgroup state, the inpcb
 * UMA zone and finally the list, hash and info locks.  No inpcb may be
 * left on the pcbinfo; this is asserted through ipi_count.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	/* Connection, port and load-balance-group hash tables. */
	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
	in_pcbgroup_destroy(pcbinfo);
#endif
	uma_zdestroy(pcbinfo->ipi_zone);
	/* The locks go last. */
	INP_LIST_LOCK_DESTROY(pcbinfo);
	INP_HASH_LOCK_DESTROY(pcbinfo);
	INP_INFO_LOCK_DESTROY(pcbinfo);
}
575
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * The new inpcb takes a reference on the socket's credential, inherits the
 * socket's FIB number, is linked onto the pcbinfo's global list and is
 * stored in so->so_pcb.
 *
 * Returns 0 on success, ENOBUFS when the zone allocation fails, or the
 * error from mac_inpcb_init() / ipsec_init_pcbpolicy() when MAC or IPSEC
 * support is compiled in.  On failure the credential reference and the
 * inpcb are released and the socket is left untouched.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
	int error;

	error = 0;
	/* Non-sleeping allocation from the per-protocol zone. */
	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Clear inp_zero_size bytes starting at inp_start_zero. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up just above. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}
#endif
	/* The inpcb lock is kept across the return; see the header comment. */
	INP_WLOCK(inp);
	INP_LIST_WLOCK(pcbinfo);
	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	so->so_pcb = (caddr_t)inp;
#ifdef INET6
	if (V_ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	INP_LIST_WUNLOCK(pcbinfo);
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* Reached on success too; only a failure releases anything. */
	if (error != 0) {
		crfree(inp->inp_cred);
		uma_zfree(pcbinfo->ipi_zone, inp);
	}
#endif
	return (error);
}
647
#ifdef INET
/*
 * Bind an unbound IPv4 inpcb to the address/port given by 'nam' (which may
 * be NULL) and enter it into the hash tables.
 *
 * Requires the inpcb write lock and the pcbinfo hash write lock.
 *
 * Returns 0 on success, EINVAL when the inpcb already has a local port or
 * address, the error from in_pcbbind_setup(), or EAGAIN when hash
 * insertion fails (the local address and port are reset in that case).
 * INP_ANONPORT is set on success when the caller supplied no port.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	/*
	 * Sample the requested port before in_pcbbind_setup() runs; that
	 * function may overwrite sin_port in the caller's sockaddr.
	 */
	sin = (struct sockaddr_in *)nam;
	anonport = (sin == NULL || sin->sin_port == 0);

	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, cred);
	if (error != 0)
		return (error);

	if (in_pcbinshash(inp) == 0) {
		if (anonport)
			inp->inp_flags |= INP_ANONPORT;
		return (0);
	}

	/* Hash insertion failed: return the inpcb to its unbound state. */
	inp->inp_lport = 0;
	inp->inp_laddr.s_addr = INADDR_ANY;
	return (EAGAIN);
}
#endif
674
675 #if defined(INET) || defined(INET6)
676 /*
677 * Assign a local port like in_pcb_lport(), but also used with connect()
678 * and a foreign address and port. If fsa is non-NULL, choose a local port
679 * that is unused with those, otherwise one that is completely unused.
680 * lsa can be NULL for IPv6.
681 */
682 int
in_pcb_lport_dest(struct inpcb * inp,struct sockaddr * lsa,u_short * lportp,struct sockaddr * fsa,u_short fport,struct ucred * cred,int lookupflags)683 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
684 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
685 {
686 struct inpcbinfo *pcbinfo;
687 struct inpcb *tmpinp;
688 unsigned short *lastport;
689 int count, dorandom, error;
690 u_short aux, first, last, lport;
691 #ifdef INET
692 struct in_addr laddr, faddr;
693 #endif
694 #ifdef INET6
695 struct in6_addr *laddr6, *faddr6;
696 #endif
697
698 pcbinfo = inp->inp_pcbinfo;
699
700 /*
701 * Because no actual state changes occur here, a global write lock on
702 * the pcbinfo isn't required.
703 */
704 INP_LOCK_ASSERT(inp);
705 INP_HASH_LOCK_ASSERT(pcbinfo);
706
707 if (inp->inp_flags & INP_HIGHPORT) {
708 first = V_ipport_hifirstauto; /* sysctl */
709 last = V_ipport_hilastauto;
710 lastport = &pcbinfo->ipi_lasthi;
711 } else if (inp->inp_flags & INP_LOWPORT) {
712 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
713 if (error)
714 return (error);
715 first = V_ipport_lowfirstauto; /* 1023 */
716 last = V_ipport_lowlastauto; /* 600 */
717 lastport = &pcbinfo->ipi_lastlow;
718 } else {
719 first = V_ipport_firstauto; /* sysctl */
720 last = V_ipport_lastauto;
721 lastport = &pcbinfo->ipi_lastport;
722 }
723 /*
724 * For UDP(-Lite), use random port allocation as long as the user
725 * allows it. For TCP (and as of yet unknown) connections,
726 * use random port allocation only if the user allows it AND
727 * ipport_tick() allows it.
728 */
729 if (V_ipport_randomized &&
730 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
731 pcbinfo == &V_ulitecbinfo))
732 dorandom = 1;
733 else
734 dorandom = 0;
735 /*
736 * It makes no sense to do random port allocation if
737 * we have the only port available.
738 */
739 if (first == last)
740 dorandom = 0;
741 /* Make sure to not include UDP(-Lite) packets in the count. */
742 if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
743 V_ipport_tcpallocs++;
744 /*
745 * Instead of having two loops further down counting up or down
746 * make sure that first is always <= last and go with only one
747 * code path implementing all logic.
748 */
749 if (first > last) {
750 aux = first;
751 first = last;
752 last = aux;
753 }
754
755 #ifdef INET
756 laddr.s_addr = INADDR_ANY;
757 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
758 if (lsa != NULL)
759 laddr = ((struct sockaddr_in *)lsa)->sin_addr;
760 if (fsa != NULL)
761 faddr = ((struct sockaddr_in *)fsa)->sin_addr;
762 }
763 #endif
764 #ifdef INET6
765 laddr6 = NULL;
766 if ((inp->inp_vflag & INP_IPV6) != 0) {
767 if (lsa != NULL)
768 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
769 if (fsa != NULL)
770 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
771 }
772 #endif
773
774 tmpinp = NULL;
775 lport = *lportp;
776
777 if (dorandom)
778 *lastport = first + (arc4random() % (last - first));
779
780 count = last - first;
781
782 do {
783 if (count-- < 0) /* completely used? */
784 return (EADDRNOTAVAIL);
785 ++*lastport;
786 if (*lastport < first || *lastport > last)
787 *lastport = first;
788 lport = htons(*lastport);
789
790 if (fsa != NULL) {
791 #ifdef INET
792 if (lsa->sa_family == AF_INET) {
793 tmpinp = in_pcblookup_hash_locked(pcbinfo,
794 faddr, fport, laddr, lport, lookupflags,
795 NULL, M_NODOM);
796 }
797 #endif
798 #ifdef INET6
799 if (lsa->sa_family == AF_INET6) {
800 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
801 faddr6, fport, laddr6, lport, lookupflags,
802 NULL, M_NODOM);
803 }
804 #endif
805 } else {
806 #ifdef INET6
807 if ((inp->inp_vflag & INP_IPV6) != 0)
808 tmpinp = in6_pcblookup_local(pcbinfo,
809 &inp->in6p_laddr, lport, lookupflags, cred);
810 #endif
811 #if defined(INET) && defined(INET6)
812 else
813 #endif
814 #ifdef INET
815 tmpinp = in_pcblookup_local(pcbinfo, laddr,
816 lport, lookupflags, cred);
817 #endif
818 }
819 } while (tmpinp != NULL);
820
821 *lportp = lport;
822
823 return (0);
824 }
825
826 /*
827 * Select a local port (number) to use.
828 */
829 int
in_pcb_lport(struct inpcb * inp,struct in_addr * laddrp,u_short * lportp,struct ucred * cred,int lookupflags)830 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
831 struct ucred *cred, int lookupflags)
832 {
833 struct sockaddr_in laddr;
834
835 if (laddrp) {
836 bzero(&laddr, sizeof(laddr));
837 laddr.sin_family = AF_INET;
838 laddr.sin_addr = *laddrp;
839 }
840 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
841 NULL, lportp, NULL, 0, cred, lookupflags));
842 }
843
844 /*
845 * Return cached socket options.
846 */
847 int
inp_so_options(const struct inpcb * inp)848 inp_so_options(const struct inpcb *inp)
849 {
850 int so_options;
851
852 so_options = 0;
853
854 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
855 so_options |= SO_REUSEPORT_LB;
856 if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
857 so_options |= SO_REUSEPORT;
858 if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
859 so_options |= SO_REUSEADDR;
860 return (so_options);
861 }
862 #endif /* INET || INET6 */
863
864 /*
865 * Check if a new BINDMULTI socket is allowed to be created.
866 *
867 * ni points to the new inp.
868 * oi points to the exisitng inp.
869 *
870 * This checks whether the existing inp also has BINDMULTI and
871 * whether the credentials match.
872 */
873 int
in_pcbbind_check_bindmulti(const struct inpcb * ni,const struct inpcb * oi)874 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
875 {
876 /* Check permissions match */
877 if ((ni->inp_flags2 & INP_BINDMULTI) &&
878 (ni->inp_cred->cr_uid !=
879 oi->inp_cred->cr_uid))
880 return (0);
881
882 /* Check the existing inp has BINDMULTI set */
883 if ((ni->inp_flags2 & INP_BINDMULTI) &&
884 ((oi->inp_flags2 & INP_BINDMULTI) == 0))
885 return (0);
886
887 /*
888 * We're okay - either INP_BINDMULTI isn't set on ni, or
889 * it is and it matches the checks.
890 */
891 return (1);
892 }
893
894 #ifdef INET
895 /*
896 * Set up a bind operation on a PCB, performing port allocation
897 * as required, but do not actually modify the PCB. Callers can
898 * either complete the bind by setting inp_laddr/inp_lport and
899 * calling in_pcbinshash(), or they can just use the resulting
900 * port and address to authorise the sending of a once-off packet.
901 *
902 * On error, the values of *laddrp and *lportp are not changed.
903 */
904 int
in_pcbbind_setup(struct inpcb * inp,struct sockaddr * nam,in_addr_t * laddrp,u_short * lportp,struct ucred * cred)905 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
906 u_short *lportp, struct ucred *cred)
907 {
908 struct socket *so = inp->inp_socket;
909 struct sockaddr_in *sin;
910 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
911 struct in_addr laddr;
912 u_short lport = 0;
913 int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
914 int error;
915
916 /*
917 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
918 * so that we don't have to add to the (already messy) code below.
919 */
920 int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
921
922 /*
923 * No state changes, so read locks are sufficient here.
924 */
925 INP_LOCK_ASSERT(inp);
926 INP_HASH_LOCK_ASSERT(pcbinfo);
927
928 if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
929 return (EADDRNOTAVAIL);
930 laddr.s_addr = *laddrp;
931 if (nam != NULL && laddr.s_addr != INADDR_ANY)
932 return (EINVAL);
933 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
934 lookupflags = INPLOOKUP_WILDCARD;
935 if (nam == NULL) {
936 if ((error = prison_local_ip4(cred, &laddr)) != 0)
937 return (error);
938 } else {
939 sin = (struct sockaddr_in *)nam;
940 if (nam->sa_len != sizeof (*sin))
941 return (EINVAL);
942 #ifdef notdef
943 /*
944 * We should check the family, but old programs
945 * incorrectly fail to initialize it.
946 */
947 if (sin->sin_family != AF_INET)
948 return (EAFNOSUPPORT);
949 #endif
950 error = prison_local_ip4(cred, &sin->sin_addr);
951 if (error)
952 return (error);
953 if (sin->sin_port != *lportp) {
954 /* Don't allow the port to change. */
955 if (*lportp != 0)
956 return (EINVAL);
957 lport = sin->sin_port;
958 }
959 /* NB: lport is left as 0 if the port isn't being changed. */
960 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
961 /*
962 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
963 * allow complete duplication of binding if
964 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
965 * and a multicast address is bound on both
966 * new and duplicated sockets.
967 */
968 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
969 reuseport = SO_REUSEADDR|SO_REUSEPORT;
970 /*
971 * XXX: How to deal with SO_REUSEPORT_LB here?
972 * Treat same as SO_REUSEPORT for now.
973 */
974 if ((so->so_options &
975 (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
976 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
977 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
978 sin->sin_port = 0; /* yech... */
979 bzero(&sin->sin_zero, sizeof(sin->sin_zero));
980 /*
981 * Is the address a local IP address?
982 * If INP_BINDANY is set, then the socket may be bound
983 * to any endpoint address, local or not.
984 */
985 if ((inp->inp_flags & INP_BINDANY) == 0 &&
986 ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
987 return (EADDRNOTAVAIL);
988 }
989 laddr = sin->sin_addr;
990 if (lport) {
991 struct inpcb *t;
992 struct tcptw *tw;
993
994 /* GROSS */
995 if (ntohs(lport) <= V_ipport_reservedhigh &&
996 ntohs(lport) >= V_ipport_reservedlow &&
997 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
998 return (EACCES);
999 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
1000 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
1001 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1002 lport, INPLOOKUP_WILDCARD, cred);
1003 /*
1004 * XXX
1005 * This entire block sorely needs a rewrite.
1006 */
1007 if (t &&
1008 ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1009 ((t->inp_flags & INP_TIMEWAIT) == 0) &&
1010 (so->so_type != SOCK_STREAM ||
1011 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
1012 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
1013 ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
1014 (t->inp_flags2 & INP_REUSEPORT) ||
1015 (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
1016 (inp->inp_cred->cr_uid !=
1017 t->inp_cred->cr_uid))
1018 return (EADDRINUSE);
1019
1020 /*
1021 * If the socket is a BINDMULTI socket, then
1022 * the credentials need to match and the
1023 * original socket also has to have been bound
1024 * with BINDMULTI.
1025 */
1026 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1027 return (EADDRINUSE);
1028 }
1029 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1030 lport, lookupflags, cred);
1031 if (t && (t->inp_flags & INP_TIMEWAIT)) {
1032 /*
1033 * XXXRW: If an incpb has had its timewait
1034 * state recycled, we treat the address as
1035 * being in use (for now). This is better
1036 * than a panic, but not desirable.
1037 */
1038 tw = intotw(t);
1039 if (tw == NULL ||
1040 ((reuseport & tw->tw_so_options) == 0 &&
1041 (reuseport_lb &
1042 tw->tw_so_options) == 0)) {
1043 return (EADDRINUSE);
1044 }
1045 } else if (t &&
1046 ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1047 (reuseport & inp_so_options(t)) == 0 &&
1048 (reuseport_lb & inp_so_options(t)) == 0) {
1049 #ifdef INET6
1050 if (ntohl(sin->sin_addr.s_addr) !=
1051 INADDR_ANY ||
1052 ntohl(t->inp_laddr.s_addr) !=
1053 INADDR_ANY ||
1054 (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
1055 (t->inp_vflag & INP_IPV6PROTO) == 0)
1056 #endif
1057 return (EADDRINUSE);
1058 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1059 return (EADDRINUSE);
1060 }
1061 }
1062 }
1063 if (*lportp != 0)
1064 lport = *lportp;
1065 if (lport == 0) {
1066 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1067 if (error != 0)
1068 return (error);
1069 }
1070 *laddrp = laddr.s_addr;
1071 *lportp = lport;
1072 return (0);
1073 }
1074
1075 /*
1076 * Connect from a socket to a specified address.
1077 * Both address and port must be specified in argument sin.
1078 * If don't have a local address for this socket yet,
1079 * then pick one.
1080 */
1081 int
in_pcbconnect_mbuf(struct inpcb * inp,struct sockaddr * nam,struct ucred * cred,struct mbuf * m,bool rehash)1082 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
1083 struct ucred *cred, struct mbuf *m, bool rehash)
1084 {
1085 u_short lport, fport;
1086 in_addr_t laddr, faddr;
1087 int anonport, error;
1088
1089 INP_WLOCK_ASSERT(inp);
1090 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1091
1092 lport = inp->inp_lport;
1093 laddr = inp->inp_laddr.s_addr;
1094 anonport = (lport == 0);
1095 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
1096 NULL, cred);
1097 if (error)
1098 return (error);
1099
1100 /* Do the initial binding of the local address if required. */
1101 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
1102 KASSERT(rehash == true,
1103 ("Rehashing required for unbound inps"));
1104 inp->inp_lport = lport;
1105 inp->inp_laddr.s_addr = laddr;
1106 if (in_pcbinshash(inp) != 0) {
1107 inp->inp_laddr.s_addr = INADDR_ANY;
1108 inp->inp_lport = 0;
1109 return (EAGAIN);
1110 }
1111 }
1112
1113 /* Commit the remaining changes. */
1114 inp->inp_lport = lport;
1115 inp->inp_laddr.s_addr = laddr;
1116 inp->inp_faddr.s_addr = faddr;
1117 inp->inp_fport = fport;
1118 if (rehash) {
1119 in_pcbrehash_mbuf(inp, m);
1120 } else {
1121 in_pcbinshash_mbuf(inp, m);
1122 }
1123
1124 if (anonport)
1125 inp->inp_flags |= INP_ANONPORT;
1126 return (0);
1127 }
1128
1129 int
in_pcbconnect(struct inpcb * inp,struct sockaddr * nam,struct ucred * cred)1130 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
1131 {
1132
1133 return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true));
1134 }
1135
1136 /*
1137 * Do proper source address selection on an unbound socket in case
1138 * of connect. Take jails into account as well.
1139 */
1140 int
in_pcbladdr(struct inpcb * inp,struct in_addr * faddr,struct in_addr * laddr,struct ucred * cred)1141 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
1142 struct ucred *cred)
1143 {
1144 struct ifaddr *ifa;
1145 struct sockaddr *sa;
1146 struct sockaddr_in *sin, dst;
1147 struct nhop_object *nh;
1148 int error;
1149
1150 NET_EPOCH_ASSERT();
1151 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1152 /*
1153 * Bypass source address selection and use the primary jail IP
1154 * if requested.
1155 */
1156 if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
1157 return (0);
1158
1159 error = 0;
1160
1161 nh = NULL;
1162 bzero(&dst, sizeof(dst));
1163 sin = &dst;
1164 sin->sin_family = AF_INET;
1165 sin->sin_len = sizeof(struct sockaddr_in);
1166 sin->sin_addr.s_addr = faddr->s_addr;
1167
1168 /*
1169 * If route is known our src addr is taken from the i/f,
1170 * else punt.
1171 *
1172 * Find out route to destination.
1173 */
1174 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1175 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
1176 0, NHR_NONE, 0);
1177
1178 /*
1179 * If we found a route, use the address corresponding to
1180 * the outgoing interface.
1181 *
1182 * Otherwise assume faddr is reachable on a directly connected
1183 * network and try to find a corresponding interface to take
1184 * the source address from.
1185 */
1186 if (nh == NULL || nh->nh_ifp == NULL) {
1187 struct in_ifaddr *ia;
1188 struct ifnet *ifp;
1189
1190 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1191 inp->inp_socket->so_fibnum));
1192 if (ia == NULL) {
1193 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1194 inp->inp_socket->so_fibnum));
1195 }
1196 if (ia == NULL) {
1197 error = ENETUNREACH;
1198 goto done;
1199 }
1200
1201 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1202 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1203 goto done;
1204 }
1205
1206 ifp = ia->ia_ifp;
1207 ia = NULL;
1208 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1209 sa = ifa->ifa_addr;
1210 if (sa->sa_family != AF_INET)
1211 continue;
1212 sin = (struct sockaddr_in *)sa;
1213 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1214 ia = (struct in_ifaddr *)ifa;
1215 break;
1216 }
1217 }
1218 if (ia != NULL) {
1219 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1220 goto done;
1221 }
1222
1223 /* 3. As a last resort return the 'default' jail address. */
1224 error = prison_get_ip4(cred, laddr);
1225 goto done;
1226 }
1227
1228 /*
1229 * If the outgoing interface on the route found is not
1230 * a loopback interface, use the address from that interface.
1231 * In case of jails do those three steps:
1232 * 1. check if the interface address belongs to the jail. If so use it.
1233 * 2. check if we have any address on the outgoing interface
1234 * belonging to this jail. If so use it.
1235 * 3. as a last resort return the 'default' jail address.
1236 */
1237 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
1238 struct in_ifaddr *ia;
1239 struct ifnet *ifp;
1240
1241 /* If not jailed, use the default returned. */
1242 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1243 ia = (struct in_ifaddr *)nh->nh_ifa;
1244 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1245 goto done;
1246 }
1247
1248 /* Jailed. */
1249 /* 1. Check if the iface address belongs to the jail. */
1250 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
1251 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1252 ia = (struct in_ifaddr *)nh->nh_ifa;
1253 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1254 goto done;
1255 }
1256
1257 /*
1258 * 2. Check if we have any address on the outgoing interface
1259 * belonging to this jail.
1260 */
1261 ia = NULL;
1262 ifp = nh->nh_ifp;
1263 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1264 sa = ifa->ifa_addr;
1265 if (sa->sa_family != AF_INET)
1266 continue;
1267 sin = (struct sockaddr_in *)sa;
1268 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1269 ia = (struct in_ifaddr *)ifa;
1270 break;
1271 }
1272 }
1273 if (ia != NULL) {
1274 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1275 goto done;
1276 }
1277
1278 /* 3. As a last resort return the 'default' jail address. */
1279 error = prison_get_ip4(cred, laddr);
1280 goto done;
1281 }
1282
1283 /*
1284 * The outgoing interface is marked with 'loopback net', so a route
1285 * to ourselves is here.
1286 * Try to find the interface of the destination address and then
1287 * take the address from there. That interface is not necessarily
1288 * a loopback interface.
1289 * In case of jails, check that it is an address of the jail
1290 * and if we cannot find, fall back to the 'default' jail address.
1291 */
1292 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
1293 struct in_ifaddr *ia;
1294
1295 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
1296 inp->inp_socket->so_fibnum));
1297 if (ia == NULL)
1298 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
1299 inp->inp_socket->so_fibnum));
1300 if (ia == NULL)
1301 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
1302
1303 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1304 if (ia == NULL) {
1305 error = ENETUNREACH;
1306 goto done;
1307 }
1308 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1309 goto done;
1310 }
1311
1312 /* Jailed. */
1313 if (ia != NULL) {
1314 struct ifnet *ifp;
1315
1316 ifp = ia->ia_ifp;
1317 ia = NULL;
1318 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1319 sa = ifa->ifa_addr;
1320 if (sa->sa_family != AF_INET)
1321 continue;
1322 sin = (struct sockaddr_in *)sa;
1323 if (prison_check_ip4(cred,
1324 &sin->sin_addr) == 0) {
1325 ia = (struct in_ifaddr *)ifa;
1326 break;
1327 }
1328 }
1329 if (ia != NULL) {
1330 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1331 goto done;
1332 }
1333 }
1334
1335 /* 3. As a last resort return the 'default' jail address. */
1336 error = prison_get_ip4(cred, laddr);
1337 goto done;
1338 }
1339
1340 done:
1341 return (error);
1342 }
1343
/*
 * Set up for a connect from a socket to the specified address.
 * On entry, *laddrp and *lportp should contain the current local
 * address and port for the PCB; these are updated to the values
 * that should be placed in inp_laddr and inp_lport to complete
 * the connect.
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port.  These are not updated in the error case.
 *
 * If the operation fails because the connection already exists,
 * *oinpp will be set to the PCB of that connection so that the
 * caller can decide to override it.  In all other cases, *oinpp
 * is set to NULL.  oinpp itself may be NULL.
 *
 * Returns 0 on success.  Errors produced directly here are EINVAL
 * (nam has the wrong length), EAFNOSUPPORT (nam is not AF_INET),
 * EADDRNOTAVAIL (no foreign port, no usable multicast source address,
 * or, under FSTACK, no interface/port found) and EADDRINUSE (the
 * 4-tuple is already in the hash); errors from prison_get_ip4(),
 * in_pcbladdr(), in_pcb_lport_dest() and in_pcbbind_setup() are passed
 * through.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct inpcb **oinpp, struct ucred *cred)
{
	struct rm_priotracker in_ifa_tracker;
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct in_ifaddr *ia;
	struct inpcb *oinp;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	if (oinpp != NULL)
		*oinpp = NULL;
	/* Validate the destination sockaddr before using any field of it. */
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	/* Work on local copies; the out parameters are written only on success. */
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		/*
		 * NOTE(review): the hash is computed from laddr as passed
		 * in, i.e. before source address selection below, and the
		 * result is stored in the inpcb even if this function later
		 * fails -- confirm this is intended.
		 */
		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			IN_IFADDR_RLOCK(&in_ifa_tracker);
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
			/* With credentials, prison_get_ip4() may replace faddr. */
			if (cred != NULL &&
			    (error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			IN_IFADDR_RLOCK(&in_ifa_tracker);
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
		}
	}
#ifdef FSTACK
	/*
	 * F-Stack hook, tried before the regular source address selection.
	 * Its return value is ignored; presumably it fills in laddr when it
	 * has an answer -- TODO confirm against ff_in_pcbladdr().
	 */
	if (laddr.s_addr == INADDR_ANY) {
		ff_in_pcbladdr(AF_INET, &faddr, fport, &laddr);
	}
#endif
	if (laddr.s_addr == INADDR_ANY) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 * This overrides both laddr and error from in_pcbladdr().
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				IN_IFADDR_RLOCK(&in_ifa_tracker);
				/*
				 * First address on ifp that passes the jail
				 * check (any address when cred is NULL).
				 */
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if ((ia->ia_ifp == ifp) &&
					    (cred == NULL ||
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0))
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/*
		 * The local port is already fixed: fail if the exact
		 * 4-tuple is already present in the hash.
		 */
		oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, NULL, M_NODOM);
		if (oinp != NULL) {
			if (oinpp != NULL)
				*oinpp = oinp;
			return (EADDRINUSE);
		}
	} else {
#ifndef FSTACK
		/* No local port yet: let in_pcb_lport_dest() choose one. */
		struct sockaddr_in lsin, fsin;

		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
#else
		/*
		 * F-Stack: find the interface for the local (or, failing
		 * that, the foreign) address, then keep asking
		 * in_pcbbind_setup() for a port until ff_rss_check()
		 * accepts the resulting 4-tuple.
		 */
		struct ifaddr *ifa;
		struct ifnet *ifp;
		struct sockaddr_in ifp_sin;
		unsigned loop_count = 0;
		bzero(&ifp_sin, sizeof(ifp_sin));
		ifp_sin.sin_addr.s_addr = laddr.s_addr;
		ifp_sin.sin_family = AF_INET;
		ifp_sin.sin_len = sizeof(ifp_sin);
		ifa = ifa_ifwithnet((struct sockaddr *)&ifp_sin, 0, RT_ALL_FIBS);
		if (ifa == NULL) {
			ifp_sin.sin_addr.s_addr = faddr.s_addr;
			ifa = ifa_ifwithnet((struct sockaddr *)&ifp_sin, 0, RT_ALL_FIBS);
			if ( ifa == NULL )
				return (EADDRNOTAVAIL);
		}
		ifp = ifa->ifa_ifp;
		while (lport == 0) {
			int rss;
			error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
			    cred);
			if (error)
				return (error);
			rss = ff_rss_check(ifp->if_softc, faddr.s_addr, laddr.s_addr,
			    fport, lport);
			if (rss) {
				break;
			}
			/* Port rejected by ff_rss_check(): reset and retry. */
			lport = 0;
			/*
			 * Note:
			 * Give up after 65535 attempts so that the loop
			 * terminates when every port is in use.  This bound
			 * is a stop-gap rather than a proper exhaustion
			 * check and is meant to be replaced.
			 */
			if (++loop_count >= 65535) {
				return (EADDRNOTAVAIL);
			}
		}
#endif
	}
	/* Success: publish the chosen endpoints to the caller. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}
1538
1539 void
in_pcbdisconnect(struct inpcb * inp)1540 in_pcbdisconnect(struct inpcb *inp)
1541 {
1542
1543 INP_WLOCK_ASSERT(inp);
1544 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1545
1546 inp->inp_faddr.s_addr = INADDR_ANY;
1547 inp->inp_fport = 0;
1548 in_pcbrehash(inp);
1549 }
1550 #endif /* INET */
1551
1552 /*
1553 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1554 * For most protocols, this will be invoked immediately prior to calling
1555 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the
1556 * socket, in which case in_pcbfree() is deferred.
1557 */
1558 void
in_pcbdetach(struct inpcb * inp)1559 in_pcbdetach(struct inpcb *inp)
1560 {
1561
1562 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1563
1564 #ifdef RATELIMIT
1565 if (inp->inp_snd_tag != NULL)
1566 in_pcbdetach_txrtlmt(inp);
1567 #endif
1568 inp->inp_socket->so_pcb = NULL;
1569 inp->inp_socket = NULL;
1570 }
1571
1572 /*
1573 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1574 * stability of an inpcb pointer despite the inpcb lock being released. This
1575 * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1576 * but where the inpcb lock may already held, or when acquiring a reference
1577 * via a pcbgroup.
1578 *
1579 * in_pcbref() should be used only to provide brief memory stability, and
1580 * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1581 * garbage collect the inpcb if it has been in_pcbfree()'d from another
1582 * context. Until in_pcbrele() has returned that the inpcb is still valid,
1583 * lock and rele are the *only* safe operations that may be performed on the
1584 * inpcb.
1585 *
1586 * While the inpcb will not be freed, releasing the inpcb lock means that the
1587 * connection's state may change, so the caller should be careful to
1588 * revalidate any cached state on reacquiring the lock. Drop the reference
1589 * using in_pcbrele().
1590 */
1591 void
in_pcbref(struct inpcb * inp)1592 in_pcbref(struct inpcb *inp)
1593 {
1594
1595 KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1596
1597 refcount_acquire(&inp->inp_refcount);
1598 }
1599
1600 /*
1601 * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1602 * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1603 * return a flag indicating whether or not the inpcb remains valid. If it is
1604 * valid, we return with the inpcb lock held.
1605 *
1606 * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1607 * reference on an inpcb. Historically more work was done here (actually, in
1608 * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1609 * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely
1610 * about memory stability (and continued use of the write lock).
1611 */
1612 int
in_pcbrele_rlocked(struct inpcb * inp)1613 in_pcbrele_rlocked(struct inpcb *inp)
1614 {
1615 struct inpcbinfo *pcbinfo;
1616
1617 KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1618
1619 INP_RLOCK_ASSERT(inp);
1620
1621 if (refcount_release(&inp->inp_refcount) == 0) {
1622 /*
1623 * If the inpcb has been freed, let the caller know, even if
1624 * this isn't the last reference.
1625 */
1626 if (inp->inp_flags2 & INP_FREED) {
1627 INP_RUNLOCK(inp);
1628 return (1);
1629 }
1630 return (0);
1631 }
1632
1633 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1634 #ifdef TCPHPTS
1635 if (inp->inp_in_hpts || inp->inp_in_input) {
1636 struct tcp_hpts_entry *hpts;
1637 /*
1638 * We should not be on the hpts at
1639 * this point in any form. we must
1640 * get the lock to be sure.
1641 */
1642 hpts = tcp_hpts_lock(inp);
1643 if (inp->inp_in_hpts)
1644 panic("Hpts:%p inp:%p at free still on hpts",
1645 hpts, inp);
1646 mtx_unlock(&hpts->p_mtx);
1647 hpts = tcp_input_lock(inp);
1648 if (inp->inp_in_input)
1649 panic("Hpts:%p inp:%p at free still on input hpts",
1650 hpts, inp);
1651 mtx_unlock(&hpts->p_mtx);
1652 }
1653 #endif
1654 INP_RUNLOCK(inp);
1655 pcbinfo = inp->inp_pcbinfo;
1656 uma_zfree(pcbinfo->ipi_zone, inp);
1657 return (1);
1658 }
1659
1660 int
in_pcbrele_wlocked(struct inpcb * inp)1661 in_pcbrele_wlocked(struct inpcb *inp)
1662 {
1663 struct inpcbinfo *pcbinfo;
1664
1665 KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1666
1667 INP_WLOCK_ASSERT(inp);
1668
1669 if (refcount_release(&inp->inp_refcount) == 0) {
1670 /*
1671 * If the inpcb has been freed, let the caller know, even if
1672 * this isn't the last reference.
1673 */
1674 if (inp->inp_flags2 & INP_FREED) {
1675 INP_WUNLOCK(inp);
1676 return (1);
1677 }
1678 return (0);
1679 }
1680
1681 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1682 #ifdef TCPHPTS
1683 if (inp->inp_in_hpts || inp->inp_in_input) {
1684 struct tcp_hpts_entry *hpts;
1685 /*
1686 * We should not be on the hpts at
1687 * this point in any form. we must
1688 * get the lock to be sure.
1689 */
1690 hpts = tcp_hpts_lock(inp);
1691 if (inp->inp_in_hpts)
1692 panic("Hpts:%p inp:%p at free still on hpts",
1693 hpts, inp);
1694 mtx_unlock(&hpts->p_mtx);
1695 hpts = tcp_input_lock(inp);
1696 if (inp->inp_in_input)
1697 panic("Hpts:%p inp:%p at free still on input hpts",
1698 hpts, inp);
1699 mtx_unlock(&hpts->p_mtx);
1700 }
1701 #endif
1702 INP_WUNLOCK(inp);
1703 pcbinfo = inp->inp_pcbinfo;
1704 uma_zfree(pcbinfo->ipi_zone, inp);
1705 return (1);
1706 }
1707
/*
 * Temporary wrapper: identical to in_pcbrele_wlocked(), including its
 * return value.
 */
int
in_pcbrele(struct inpcb *inp)
{
	int gone;

	gone = in_pcbrele_wlocked(inp);
	return (gone);
}
1717
1718 void
in_pcblist_rele_rlocked(epoch_context_t ctx)1719 in_pcblist_rele_rlocked(epoch_context_t ctx)
1720 {
1721 struct in_pcblist *il;
1722 struct inpcb *inp;
1723 struct inpcbinfo *pcbinfo;
1724 int i, n;
1725
1726 il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
1727 pcbinfo = il->il_pcbinfo;
1728 n = il->il_count;
1729 INP_INFO_WLOCK(pcbinfo);
1730 for (i = 0; i < n; i++) {
1731 inp = il->il_inp_list[i];
1732 INP_RLOCK(inp);
1733 if (!in_pcbrele_rlocked(inp))
1734 INP_RUNLOCK(inp);
1735 }
1736 INP_INFO_WUNLOCK(pcbinfo);
1737 free(il, M_TEMP);
1738 }
1739
1740 static void
inpcbport_free(epoch_context_t ctx)1741 inpcbport_free(epoch_context_t ctx)
1742 {
1743 struct inpcbport *phd;
1744
1745 phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
1746 free(phd, M_PCB);
1747 }
1748
1749 static void
in_pcbfree_deferred(epoch_context_t ctx)1750 in_pcbfree_deferred(epoch_context_t ctx)
1751 {
1752 struct inpcb *inp;
1753 int released __unused;
1754
1755 inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
1756
1757 INP_WLOCK(inp);
1758 CURVNET_SET(inp->inp_vnet);
1759 #ifdef INET
1760 struct ip_moptions *imo = inp->inp_moptions;
1761 inp->inp_moptions = NULL;
1762 #endif
1763 /* XXXRW: Do as much as possible here. */
1764 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1765 if (inp->inp_sp != NULL)
1766 ipsec_delete_pcbpolicy(inp);
1767 #endif
1768 #ifdef INET6
1769 struct ip6_moptions *im6o = NULL;
1770 if (inp->inp_vflag & INP_IPV6PROTO) {
1771 ip6_freepcbopts(inp->in6p_outputopts);
1772 im6o = inp->in6p_moptions;
1773 inp->in6p_moptions = NULL;
1774 }
1775 #endif
1776 if (inp->inp_options)
1777 (void)m_free(inp->inp_options);
1778 inp->inp_vflag = 0;
1779 crfree(inp->inp_cred);
1780 #ifdef MAC
1781 mac_inpcb_destroy(inp);
1782 #endif
1783 released = in_pcbrele_wlocked(inp);
1784 MPASS(released);
1785 #ifdef INET6
1786 ip6_freemoptions(im6o);
1787 #endif
1788 #ifdef INET
1789 inp_freemoptions(imo);
1790 #endif
1791 CURVNET_RESTORE();
1792 }
1793
1794 /*
1795 * Unconditionally schedule an inpcb to be freed by decrementing its
1796 * reference count, which should occur only after the inpcb has been detached
1797 * from its socket. If another thread holds a temporary reference (acquired
1798 * using in_pcbref()) then the free is deferred until that reference is
1799 * released using in_pcbrele(), but the inpcb is still unlocked. Almost all
1800 * work, including removal from global lists, is done in this context, where
1801 * the pcbinfo lock is held.
1802 */
1803 void
in_pcbfree(struct inpcb * inp)1804 in_pcbfree(struct inpcb *inp)
1805 {
1806 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1807
1808 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1809 KASSERT((inp->inp_flags2 & INP_FREED) == 0,
1810 ("%s: called twice for pcb %p", __func__, inp));
1811 if (inp->inp_flags2 & INP_FREED) {
1812 INP_WUNLOCK(inp);
1813 return;
1814 }
1815
1816 INP_WLOCK_ASSERT(inp);
1817 INP_LIST_WLOCK(pcbinfo);
1818 in_pcbremlists(inp);
1819 INP_LIST_WUNLOCK(pcbinfo);
1820 RO_INVALIDATE_CACHE(&inp->inp_route);
1821 /* mark as destruction in progress */
1822 inp->inp_flags2 |= INP_FREED;
1823 INP_WUNLOCK(inp);
1824 NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
1825 }
1826
1827 /*
1828 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1829 * port reservation, and preventing it from being returned by inpcb lookups.
1830 *
1831 * It is used by TCP to mark an inpcb as unused and avoid future packet
1832 * delivery or event notification when a socket remains open but TCP has
1833 * closed. This might occur as a result of a shutdown()-initiated TCP close
1834 * or a RST on the wire, and allows the port binding to be reused while still
1835 * maintaining the invariant that so_pcb always points to a valid inpcb until
1836 * in_pcbdetach().
1837 *
1838 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1839 * in_pcbnotifyall() and in_pcbpurgeif0()?
1840 */
1841 void
in_pcbdrop(struct inpcb * inp)1842 in_pcbdrop(struct inpcb *inp)
1843 {
1844
1845 INP_WLOCK_ASSERT(inp);
1846 #ifdef INVARIANTS
1847 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
1848 MPASS(inp->inp_refcount > 1);
1849 #endif
1850
1851 /*
1852 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1853 * the hash lock...?
1854 */
1855 inp->inp_flags |= INP_DROPPED;
1856 if (inp->inp_flags & INP_INHASHLIST) {
1857 struct inpcbport *phd = inp->inp_phd;
1858
1859 INP_HASH_WLOCK(inp->inp_pcbinfo);
1860 in_pcbremlbgrouphash(inp);
1861 CK_LIST_REMOVE(inp, inp_hash);
1862 CK_LIST_REMOVE(inp, inp_portlist);
1863 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1864 CK_LIST_REMOVE(phd, phd_hash);
1865 NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
1866 }
1867 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1868 inp->inp_flags &= ~INP_INHASHLIST;
1869 #ifdef PCBGROUP
1870 in_pcbgroup_remove(inp);
1871 #endif
1872 }
1873 }
1874
1875 #ifdef INET
1876 /*
1877 * Common routines to return the socket addresses associated with inpcbs.
1878 */
1879 struct sockaddr *
in_sockaddr(in_port_t port,struct in_addr * addr_p)1880 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1881 {
1882 struct sockaddr_in *sin;
1883
1884 sin = malloc(sizeof *sin, M_SONAME,
1885 M_WAITOK | M_ZERO);
1886 sin->sin_family = AF_INET;
1887 sin->sin_len = sizeof(*sin);
1888 sin->sin_addr = *addr_p;
1889 sin->sin_port = port;
1890
1891 return (struct sockaddr *)sin;
1892 }
1893
1894 int
in_getsockaddr(struct socket * so,struct sockaddr ** nam)1895 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1896 {
1897 struct inpcb *inp;
1898 struct in_addr addr;
1899 in_port_t port;
1900
1901 inp = sotoinpcb(so);
1902 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1903
1904 INP_RLOCK(inp);
1905 port = inp->inp_lport;
1906 addr = inp->inp_laddr;
1907 INP_RUNLOCK(inp);
1908
1909 *nam = in_sockaddr(port, &addr);
1910 return 0;
1911 }
1912
1913 int
in_getpeeraddr(struct socket * so,struct sockaddr ** nam)1914 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1915 {
1916 struct inpcb *inp;
1917 struct in_addr addr;
1918 in_port_t port;
1919
1920 inp = sotoinpcb(so);
1921 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1922
1923 INP_RLOCK(inp);
1924 port = inp->inp_fport;
1925 addr = inp->inp_faddr;
1926 INP_RUNLOCK(inp);
1927
1928 *nam = in_sockaddr(port, &addr);
1929 return 0;
1930 }
1931
1932 void
in_pcbnotifyall(struct inpcbinfo * pcbinfo,struct in_addr faddr,int errno,struct inpcb * (* notify)(struct inpcb *,int))1933 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1934 struct inpcb *(*notify)(struct inpcb *, int))
1935 {
1936 struct inpcb *inp, *inp_temp;
1937
1938 INP_INFO_WLOCK(pcbinfo);
1939 CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1940 INP_WLOCK(inp);
1941 #ifdef INET6
1942 if ((inp->inp_vflag & INP_IPV4) == 0) {
1943 INP_WUNLOCK(inp);
1944 continue;
1945 }
1946 #endif
1947 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1948 inp->inp_socket == NULL) {
1949 INP_WUNLOCK(inp);
1950 continue;
1951 }
1952 if ((*notify)(inp, errno))
1953 INP_WUNLOCK(inp);
1954 }
1955 INP_INFO_WUNLOCK(pcbinfo);
1956 }
1957
1958 void
in_pcbpurgeif0(struct inpcbinfo * pcbinfo,struct ifnet * ifp)1959 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1960 {
1961 struct inpcb *inp;
1962 struct in_multi *inm;
1963 struct in_mfilter *imf;
1964 struct ip_moptions *imo;
1965
1966 INP_INFO_WLOCK(pcbinfo);
1967 CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1968 INP_WLOCK(inp);
1969 imo = inp->inp_moptions;
1970 if ((inp->inp_vflag & INP_IPV4) &&
1971 imo != NULL) {
1972 /*
1973 * Unselect the outgoing interface if it is being
1974 * detached.
1975 */
1976 if (imo->imo_multicast_ifp == ifp)
1977 imo->imo_multicast_ifp = NULL;
1978
1979 /*
1980 * Drop multicast group membership if we joined
1981 * through the interface being detached.
1982 *
1983 * XXX This can all be deferred to an epoch_call
1984 */
1985 restart:
1986 IP_MFILTER_FOREACH(imf, &imo->imo_head) {
1987 if ((inm = imf->imf_inm) == NULL)
1988 continue;
1989 if (inm->inm_ifp != ifp)
1990 continue;
1991 ip_mfilter_remove(&imo->imo_head, imf);
1992 IN_MULTI_LOCK_ASSERT();
1993 in_leavegroup_locked(inm, NULL);
1994 ip_mfilter_free(imf);
1995 goto restart;
1996 }
1997 }
1998 INP_WUNLOCK(inp);
1999 }
2000 INP_INFO_WUNLOCK(pcbinfo);
2001 }
2002
2003 /*
2004 * Lookup a PCB based on the local address and port. Caller must hold the
2005 * hash lock. No inpcb locks or references are acquired.
2006 */
2007 #define INP_LOOKUP_MAPPED_PCB_COST 3
2008 struct inpcb *
in_pcblookup_local(struct inpcbinfo * pcbinfo,struct in_addr laddr,u_short lport,int lookupflags,struct ucred * cred)2009 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2010 u_short lport, int lookupflags, struct ucred *cred)
2011 {
2012 struct inpcb *inp;
2013 #ifdef INET6
2014 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
2015 #else
2016 int matchwild = 3;
2017 #endif
2018 int wildcard;
2019
2020 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2021 ("%s: invalid lookup flags %d", __func__, lookupflags));
2022
2023 INP_HASH_LOCK_ASSERT(pcbinfo);
2024
2025 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
2026 struct inpcbhead *head;
2027 /*
2028 * Look for an unconnected (wildcard foreign addr) PCB that
2029 * matches the local address and port we're looking for.
2030 */
2031 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
2032 0, pcbinfo->ipi_hashmask)];
2033 CK_LIST_FOREACH(inp, head, inp_hash) {
2034 #ifdef INET6
2035 /* XXX inp locking */
2036 if ((inp->inp_vflag & INP_IPV4) == 0)
2037 continue;
2038 #endif
2039 if (inp->inp_faddr.s_addr == INADDR_ANY &&
2040 inp->inp_laddr.s_addr == laddr.s_addr &&
2041 inp->inp_lport == lport) {
2042 /*
2043 * Found?
2044 */
2045 if (cred == NULL ||
2046 prison_equal_ip4(cred->cr_prison,
2047 inp->inp_cred->cr_prison))
2048 return (inp);
2049 }
2050 }
2051 /*
2052 * Not found.
2053 */
2054 return (NULL);
2055 } else {
2056 struct inpcbporthead *porthash;
2057 struct inpcbport *phd;
2058 struct inpcb *match = NULL;
2059 /*
2060 * Best fit PCB lookup.
2061 *
2062 * First see if this local port is in use by looking on the
2063 * port hash list.
2064 */
2065 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2066 pcbinfo->ipi_porthashmask)];
2067 CK_LIST_FOREACH(phd, porthash, phd_hash) {
2068 if (phd->phd_port == lport)
2069 break;
2070 }
2071 if (phd != NULL) {
2072 /*
2073 * Port is in use by one or more PCBs. Look for best
2074 * fit.
2075 */
2076 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
2077 wildcard = 0;
2078 if (cred != NULL &&
2079 !prison_equal_ip4(inp->inp_cred->cr_prison,
2080 cred->cr_prison))
2081 continue;
2082 #ifdef INET6
2083 /* XXX inp locking */
2084 if ((inp->inp_vflag & INP_IPV4) == 0)
2085 continue;
2086 /*
2087 * We never select the PCB that has
2088 * INP_IPV6 flag and is bound to :: if
2089 * we have another PCB which is bound
2090 * to 0.0.0.0. If a PCB has the
2091 * INP_IPV6 flag, then we set its cost
2092 * higher than IPv4 only PCBs.
2093 *
2094 * Note that the case only happens
2095 * when a socket is bound to ::, under
2096 * the condition that the use of the
2097 * mapped address is allowed.
2098 */
2099 if ((inp->inp_vflag & INP_IPV6) != 0)
2100 wildcard += INP_LOOKUP_MAPPED_PCB_COST;
2101 #endif
2102 if (inp->inp_faddr.s_addr != INADDR_ANY)
2103 wildcard++;
2104 if (inp->inp_laddr.s_addr != INADDR_ANY) {
2105 if (laddr.s_addr == INADDR_ANY)
2106 wildcard++;
2107 else if (inp->inp_laddr.s_addr != laddr.s_addr)
2108 continue;
2109 } else {
2110 if (laddr.s_addr != INADDR_ANY)
2111 wildcard++;
2112 }
2113 if (wildcard < matchwild) {
2114 match = inp;
2115 matchwild = wildcard;
2116 if (matchwild == 0)
2117 break;
2118 }
2119 }
2120 }
2121 return (match);
2122 }
2123 }
2124 #undef INP_LOOKUP_MAPPED_PCB_COST
2125
2126 static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo * pcbinfo,const struct in_addr * laddr,uint16_t lport,const struct in_addr * faddr,uint16_t fport,int lookupflags,int numa_domain)2127 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
2128 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
2129 uint16_t fport, int lookupflags, int numa_domain)
2130 {
2131 struct inpcb *local_wild, *numa_wild;
2132 const struct inpcblbgrouphead *hdr;
2133 struct inpcblbgroup *grp;
2134 uint32_t idx;
2135
2136 INP_HASH_LOCK_ASSERT(pcbinfo);
2137
2138 hdr = &pcbinfo->ipi_lbgrouphashbase[
2139 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
2140
2141 /*
2142 * Order of socket selection:
2143 * 1. non-wild.
2144 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
2145 *
2146 * NOTE:
2147 * - Load balanced group does not contain jailed sockets
2148 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
2149 */
2150 local_wild = NULL;
2151 numa_wild = NULL;
2152 CK_LIST_FOREACH(grp, hdr, il_list) {
2153 #ifdef INET6
2154 if (!(grp->il_vflag & INP_IPV4))
2155 continue;
2156 #endif
2157 if (grp->il_lport != lport)
2158 continue;
2159
2160 idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
2161 grp->il_inpcnt;
2162 if (grp->il_laddr.s_addr == laddr->s_addr) {
2163 if (numa_domain == M_NODOM ||
2164 grp->il_numa_domain == numa_domain) {
2165 return (grp->il_inp[idx]);
2166 } else {
2167 numa_wild = grp->il_inp[idx];
2168 }
2169 }
2170 if (grp->il_laddr.s_addr == INADDR_ANY &&
2171 (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
2172 (local_wild == NULL || numa_domain == M_NODOM ||
2173 grp->il_numa_domain == numa_domain)) {
2174 local_wild = grp->il_inp[idx];
2175 }
2176 }
2177 if (numa_wild != NULL)
2178 return (numa_wild);
2179
2180 return (local_wild);
2181 }
2182
2183 #ifdef PCBGROUP
2184 /*
2185 * Lookup PCB in hash list, using pcbgroup tables.
2186 */
2187 static struct inpcb *
in_pcblookup_group(struct inpcbinfo * pcbinfo,struct inpcbgroup * pcbgroup,struct in_addr faddr,u_int fport_arg,struct in_addr laddr,u_int lport_arg,int lookupflags,struct ifnet * ifp)2188 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
2189 struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
2190 u_int lport_arg, int lookupflags, struct ifnet *ifp)
2191 {
2192 struct inpcbhead *head;
2193 struct inpcb *inp, *tmpinp;
2194 u_short fport = fport_arg, lport = lport_arg;
2195 bool locked;
2196
2197 /*
2198 * First look for an exact match.
2199 */
2200 tmpinp = NULL;
2201 INP_GROUP_LOCK(pcbgroup);
2202 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2203 pcbgroup->ipg_hashmask)];
2204 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
2205 #ifdef INET6
2206 /* XXX inp locking */
2207 if ((inp->inp_vflag & INP_IPV4) == 0)
2208 continue;
2209 #endif
2210 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2211 inp->inp_laddr.s_addr == laddr.s_addr &&
2212 inp->inp_fport == fport &&
2213 inp->inp_lport == lport) {
2214 /*
2215 * XXX We should be able to directly return
2216 * the inp here, without any checks.
2217 * Well unless both bound with SO_REUSEPORT?
2218 */
2219 if (prison_flag(inp->inp_cred, PR_IP4))
2220 goto found;
2221 if (tmpinp == NULL)
2222 tmpinp = inp;
2223 }
2224 }
2225 if (tmpinp != NULL) {
2226 inp = tmpinp;
2227 goto found;
2228 }
2229
2230 #ifdef RSS
2231 /*
2232 * For incoming connections, we may wish to do a wildcard
2233 * match for an RSS-local socket.
2234 */
2235 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2236 struct inpcb *local_wild = NULL, *local_exact = NULL;
2237 #ifdef INET6
2238 struct inpcb *local_wild_mapped = NULL;
2239 #endif
2240 struct inpcb *jail_wild = NULL;
2241 struct inpcbhead *head;
2242 int injail;
2243
2244 /*
2245 * Order of socket selection - we always prefer jails.
2246 * 1. jailed, non-wild.
2247 * 2. jailed, wild.
2248 * 3. non-jailed, non-wild.
2249 * 4. non-jailed, wild.
2250 */
2251
2252 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
2253 lport, 0, pcbgroup->ipg_hashmask)];
2254 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
2255 #ifdef INET6
2256 /* XXX inp locking */
2257 if ((inp->inp_vflag & INP_IPV4) == 0)
2258 continue;
2259 #endif
2260 if (inp->inp_faddr.s_addr != INADDR_ANY ||
2261 inp->inp_lport != lport)
2262 continue;
2263
2264 injail = prison_flag(inp->inp_cred, PR_IP4);
2265 if (injail) {
2266 if (prison_check_ip4(inp->inp_cred,
2267 &laddr) != 0)
2268 continue;
2269 } else {
2270 if (local_exact != NULL)
2271 continue;
2272 }
2273
2274 if (inp->inp_laddr.s_addr == laddr.s_addr) {
2275 if (injail)
2276 goto found;
2277 else
2278 local_exact = inp;
2279 } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2280 #ifdef INET6
2281 /* XXX inp locking, NULL check */
2282 if (inp->inp_vflag & INP_IPV6PROTO)
2283 local_wild_mapped = inp;
2284 else
2285 #endif
2286 if (injail)
2287 jail_wild = inp;
2288 else
2289 local_wild = inp;
2290 }
2291 } /* LIST_FOREACH */
2292
2293 inp = jail_wild;
2294 if (inp == NULL)
2295 inp = local_exact;
2296 if (inp == NULL)
2297 inp = local_wild;
2298 #ifdef INET6
2299 if (inp == NULL)
2300 inp = local_wild_mapped;
2301 #endif
2302 if (inp != NULL)
2303 goto found;
2304 }
2305 #endif
2306
2307 /*
2308 * Then look for a wildcard match, if requested.
2309 */
2310 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2311 struct inpcb *local_wild = NULL, *local_exact = NULL;
2312 #ifdef INET6
2313 struct inpcb *local_wild_mapped = NULL;
2314 #endif
2315 struct inpcb *jail_wild = NULL;
2316 struct inpcbhead *head;
2317 int injail;
2318
2319 /*
2320 * Order of socket selection - we always prefer jails.
2321 * 1. jailed, non-wild.
2322 * 2. jailed, wild.
2323 * 3. non-jailed, non-wild.
2324 * 4. non-jailed, wild.
2325 */
2326 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
2327 0, pcbinfo->ipi_wildmask)];
2328 CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
2329 #ifdef INET6
2330 /* XXX inp locking */
2331 if ((inp->inp_vflag & INP_IPV4) == 0)
2332 continue;
2333 #endif
2334 if (inp->inp_faddr.s_addr != INADDR_ANY ||
2335 inp->inp_lport != lport)
2336 continue;
2337
2338 injail = prison_flag(inp->inp_cred, PR_IP4);
2339 if (injail) {
2340 if (prison_check_ip4(inp->inp_cred,
2341 &laddr) != 0)
2342 continue;
2343 } else {
2344 if (local_exact != NULL)
2345 continue;
2346 }
2347
2348 if (inp->inp_laddr.s_addr == laddr.s_addr) {
2349 if (injail)
2350 goto found;
2351 else
2352 local_exact = inp;
2353 } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2354 #ifdef INET6
2355 /* XXX inp locking, NULL check */
2356 if (inp->inp_vflag & INP_IPV6PROTO)
2357 local_wild_mapped = inp;
2358 else
2359 #endif
2360 if (injail)
2361 jail_wild = inp;
2362 else
2363 local_wild = inp;
2364 }
2365 } /* LIST_FOREACH */
2366 inp = jail_wild;
2367 if (inp == NULL)
2368 inp = local_exact;
2369 if (inp == NULL)
2370 inp = local_wild;
2371 #ifdef INET6
2372 if (inp == NULL)
2373 inp = local_wild_mapped;
2374 #endif
2375 if (inp != NULL)
2376 goto found;
2377 } /* if (lookupflags & INPLOOKUP_WILDCARD) */
2378 INP_GROUP_UNLOCK(pcbgroup);
2379 return (NULL);
2380
2381 found:
2382 if (lookupflags & INPLOOKUP_WLOCKPCB)
2383 locked = INP_TRY_WLOCK(inp);
2384 else if (lookupflags & INPLOOKUP_RLOCKPCB)
2385 locked = INP_TRY_RLOCK(inp);
2386 else
2387 panic("%s: locking bug", __func__);
2388 if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
2389 if (lookupflags & INPLOOKUP_WLOCKPCB)
2390 INP_WUNLOCK(inp);
2391 else
2392 INP_RUNLOCK(inp);
2393 return (NULL);
2394 } else if (!locked)
2395 in_pcbref(inp);
2396 INP_GROUP_UNLOCK(pcbgroup);
2397 if (!locked) {
2398 if (lookupflags & INPLOOKUP_WLOCKPCB) {
2399 INP_WLOCK(inp);
2400 if (in_pcbrele_wlocked(inp))
2401 return (NULL);
2402 } else {
2403 INP_RLOCK(inp);
2404 if (in_pcbrele_rlocked(inp))
2405 return (NULL);
2406 }
2407 }
2408 #ifdef INVARIANTS
2409 if (lookupflags & INPLOOKUP_WLOCKPCB)
2410 INP_WLOCK_ASSERT(inp);
2411 else
2412 INP_RLOCK_ASSERT(inp);
2413 #endif
2414 return (inp);
2415 }
2416 #endif /* PCBGROUP */
2417
2418 /*
2419 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes
2420 * that the caller has locked the hash list, and will not perform any further
2421 * locking or reference operations on either the hash list or the connection.
2422 */
2423 static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport_arg,struct in_addr laddr,u_int lport_arg,int lookupflags,struct ifnet * ifp,uint8_t numa_domain)2424 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2425 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2426 struct ifnet *ifp, uint8_t numa_domain)
2427 {
2428 struct inpcbhead *head;
2429 struct inpcb *inp, *tmpinp;
2430 u_short fport = fport_arg, lport = lport_arg;
2431
2432 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2433 ("%s: invalid lookup flags %d", __func__, lookupflags));
2434 INP_HASH_LOCK_ASSERT(pcbinfo);
2435
2436 /*
2437 * First look for an exact match.
2438 */
2439 tmpinp = NULL;
2440 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2441 pcbinfo->ipi_hashmask)];
2442 CK_LIST_FOREACH(inp, head, inp_hash) {
2443 #ifdef INET6
2444 /* XXX inp locking */
2445 if ((inp->inp_vflag & INP_IPV4) == 0)
2446 continue;
2447 #endif
2448 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2449 inp->inp_laddr.s_addr == laddr.s_addr &&
2450 inp->inp_fport == fport &&
2451 inp->inp_lport == lport) {
2452 /*
2453 * XXX We should be able to directly return
2454 * the inp here, without any checks.
2455 * Well unless both bound with SO_REUSEPORT?
2456 */
2457 if (prison_flag(inp->inp_cred, PR_IP4))
2458 return (inp);
2459 if (tmpinp == NULL)
2460 tmpinp = inp;
2461 }
2462 }
2463 if (tmpinp != NULL)
2464 return (tmpinp);
2465
2466 /*
2467 * Then look in lb group (for wildcard match).
2468 */
2469 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2470 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
2471 fport, lookupflags, numa_domain);
2472 if (inp != NULL)
2473 return (inp);
2474 }
2475
2476 /*
2477 * Then look for a wildcard match, if requested.
2478 */
2479 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2480 struct inpcb *local_wild = NULL, *local_exact = NULL;
2481 #ifdef INET6
2482 struct inpcb *local_wild_mapped = NULL;
2483 #endif
2484 struct inpcb *jail_wild = NULL;
2485 int injail;
2486
2487 /*
2488 * Order of socket selection - we always prefer jails.
2489 * 1. jailed, non-wild.
2490 * 2. jailed, wild.
2491 * 3. non-jailed, non-wild.
2492 * 4. non-jailed, wild.
2493 */
2494
2495 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
2496 0, pcbinfo->ipi_hashmask)];
2497 CK_LIST_FOREACH(inp, head, inp_hash) {
2498 #ifdef INET6
2499 /* XXX inp locking */
2500 if ((inp->inp_vflag & INP_IPV4) == 0)
2501 continue;
2502 #endif
2503 if (inp->inp_faddr.s_addr != INADDR_ANY ||
2504 inp->inp_lport != lport)
2505 continue;
2506
2507 injail = prison_flag(inp->inp_cred, PR_IP4);
2508 if (injail) {
2509 if (prison_check_ip4(inp->inp_cred,
2510 &laddr) != 0)
2511 continue;
2512 } else {
2513 if (local_exact != NULL)
2514 continue;
2515 }
2516
2517 if (inp->inp_laddr.s_addr == laddr.s_addr) {
2518 if (injail)
2519 return (inp);
2520 else
2521 local_exact = inp;
2522 } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2523 #ifdef INET6
2524 /* XXX inp locking, NULL check */
2525 if (inp->inp_vflag & INP_IPV6PROTO)
2526 local_wild_mapped = inp;
2527 else
2528 #endif
2529 if (injail)
2530 jail_wild = inp;
2531 else
2532 local_wild = inp;
2533 }
2534 } /* LIST_FOREACH */
2535 if (jail_wild != NULL)
2536 return (jail_wild);
2537 if (local_exact != NULL)
2538 return (local_exact);
2539 if (local_wild != NULL)
2540 return (local_wild);
2541 #ifdef INET6
2542 if (local_wild_mapped != NULL)
2543 return (local_wild_mapped);
2544 #endif
2545 } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
2546
2547 return (NULL);
2548 }
2549
2550 /*
2551 * Lookup PCB in hash list, using pcbinfo tables. This variation locks the
2552 * hash list lock, and will return the inpcb locked (i.e., requires
2553 * INPLOOKUP_LOCKPCB).
2554 */
2555 static struct inpcb *
in_pcblookup_hash(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp,uint8_t numa_domain)2556 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2557 u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2558 struct ifnet *ifp, uint8_t numa_domain)
2559 {
2560 struct inpcb *inp;
2561
2562 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2563 (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
2564 numa_domain);
2565 if (inp != NULL) {
2566 if (lookupflags & INPLOOKUP_WLOCKPCB) {
2567 INP_WLOCK(inp);
2568 if (__predict_false(inp->inp_flags2 & INP_FREED)) {
2569 INP_WUNLOCK(inp);
2570 inp = NULL;
2571 }
2572 } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
2573 INP_RLOCK(inp);
2574 if (__predict_false(inp->inp_flags2 & INP_FREED)) {
2575 INP_RUNLOCK(inp);
2576 inp = NULL;
2577 }
2578 } else
2579 panic("%s: locking bug", __func__);
2580 #ifdef INVARIANTS
2581 if (inp != NULL) {
2582 if (lookupflags & INPLOOKUP_WLOCKPCB)
2583 INP_WLOCK_ASSERT(inp);
2584 else
2585 INP_RLOCK_ASSERT(inp);
2586 }
2587 #endif
2588 }
2589
2590 return (inp);
2591 }
2592
2593 /*
2594 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2595 * from which a pre-calculated hash value may be extracted.
2596 *
2597 * Possibly more of this logic should be in in_pcbgroup.c.
2598 */
2599 struct inpcb *
in_pcblookup(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp)2600 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2601 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2602 {
2603 #if defined(PCBGROUP) && !defined(RSS)
2604 struct inpcbgroup *pcbgroup;
2605 #endif
2606
2607 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2608 ("%s: invalid lookup flags %d", __func__, lookupflags));
2609 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2610 ("%s: LOCKPCB not set", __func__));
2611
2612 /*
2613 * When not using RSS, use connection groups in preference to the
2614 * reservation table when looking up 4-tuples. When using RSS, just
2615 * use the reservation table, due to the cost of the Toeplitz hash
2616 * in software.
2617 *
2618 * XXXRW: This policy belongs in the pcbgroup code, as in principle
2619 * we could be doing RSS with a non-Toeplitz hash that is affordable
2620 * in software.
2621 */
2622 #if defined(PCBGROUP) && !defined(RSS)
2623 if (in_pcbgroup_enabled(pcbinfo)) {
2624 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2625 fport);
2626 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2627 laddr, lport, lookupflags, ifp));
2628 }
2629 #endif
2630 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2631 lookupflags, ifp, M_NODOM));
2632 }
2633
2634 struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp,struct mbuf * m)2635 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2636 u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2637 struct ifnet *ifp, struct mbuf *m)
2638 {
2639 #ifdef PCBGROUP
2640 struct inpcbgroup *pcbgroup;
2641 #endif
2642
2643 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2644 ("%s: invalid lookup flags %d", __func__, lookupflags));
2645 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2646 ("%s: LOCKPCB not set", __func__));
2647
2648 #ifdef PCBGROUP
2649 /*
2650 * If we can use a hardware-generated hash to look up the connection
2651 * group, use that connection group to find the inpcb. Otherwise
2652 * fall back on a software hash -- or the reservation table if we're
2653 * using RSS.
2654 *
2655 * XXXRW: As above, that policy belongs in the pcbgroup code.
2656 */
2657 if (in_pcbgroup_enabled(pcbinfo) &&
2658 !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
2659 pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
2660 m->m_pkthdr.flowid);
2661 if (pcbgroup != NULL)
2662 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
2663 fport, laddr, lport, lookupflags, ifp));
2664 #ifndef RSS
2665 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2666 fport);
2667 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2668 laddr, lport, lookupflags, ifp));
2669 #endif
2670 }
2671 #endif
2672 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2673 lookupflags, ifp, m->m_pkthdr.numa_domain));
2674 }
2675 #endif /* INET */
2676
2677 /*
2678 * Insert PCB onto various hash lists.
2679 */
2680 static int
in_pcbinshash_internal(struct inpcb * inp,struct mbuf * m)2681 in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
2682 {
2683 struct inpcbhead *pcbhash;
2684 struct inpcbporthead *pcbporthash;
2685 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2686 struct inpcbport *phd;
2687 u_int32_t hashkey_faddr;
2688 int so_options;
2689
2690 INP_WLOCK_ASSERT(inp);
2691 INP_HASH_WLOCK_ASSERT(pcbinfo);
2692
2693 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2694 ("in_pcbinshash: INP_INHASHLIST"));
2695
2696 #ifdef INET6
2697 if (inp->inp_vflag & INP_IPV6)
2698 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2699 else
2700 #endif
2701 hashkey_faddr = inp->inp_faddr.s_addr;
2702
2703 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2704 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2705
2706 pcbporthash = &pcbinfo->ipi_porthashbase[
2707 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2708
2709 /*
2710 * Add entry to load balance group.
2711 * Only do this if SO_REUSEPORT_LB is set.
2712 */
2713 so_options = inp_so_options(inp);
2714 if (so_options & SO_REUSEPORT_LB) {
2715 int ret = in_pcbinslbgrouphash(inp, M_NODOM);
2716 if (ret) {
2717 /* pcb lb group malloc fail (ret=ENOBUFS). */
2718 return (ret);
2719 }
2720 }
2721
2722 /*
2723 * Go through port list and look for a head for this lport.
2724 */
2725 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
2726 if (phd->phd_port == inp->inp_lport)
2727 break;
2728 }
2729 /*
2730 * If none exists, malloc one and tack it on.
2731 */
2732 if (phd == NULL) {
2733 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2734 if (phd == NULL) {
2735 return (ENOBUFS); /* XXX */
2736 }
2737 bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
2738 phd->phd_port = inp->inp_lport;
2739 CK_LIST_INIT(&phd->phd_pcblist);
2740 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2741 }
2742 inp->inp_phd = phd;
2743 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2744 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2745 inp->inp_flags |= INP_INHASHLIST;
2746 #ifdef PCBGROUP
2747 if (m != NULL) {
2748 in_pcbgroup_update_mbuf(inp, m);
2749 } else {
2750 in_pcbgroup_update(inp);
2751 }
2752 #endif
2753 return (0);
2754 }
2755
/*
 * Insert inp onto the hash lists with no mbuf available: forwards to
 * in_pcbinshash_internal() with a NULL mbuf and returns its result.
 */
int
in_pcbinshash(struct inpcb *inp)
{

	return (in_pcbinshash_internal(inp, NULL));
}
2762
/*
 * Insert inp onto the hash lists, passing mbuf m through to
 * in_pcbinshash_internal(), and return its result.
 */
int
in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m)
{

	return (in_pcbinshash_internal(inp, m));
}
2769
2770 /*
2771 * Move PCB to the proper hash bucket when { faddr, fport } have been
2772 * changed. NOTE: This does not handle the case of the lport changing (the
2773 * hashed port list would have to be updated as well), so the lport must
2774 * not change after in_pcbinshash() has been called.
2775 */
2776 void
in_pcbrehash_mbuf(struct inpcb * inp,struct mbuf * m)2777 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
2778 {
2779 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2780 struct inpcbhead *head;
2781 u_int32_t hashkey_faddr;
2782
2783 INP_WLOCK_ASSERT(inp);
2784 INP_HASH_WLOCK_ASSERT(pcbinfo);
2785
2786 KASSERT(inp->inp_flags & INP_INHASHLIST,
2787 ("in_pcbrehash: !INP_INHASHLIST"));
2788
2789 #ifdef INET6
2790 if (inp->inp_vflag & INP_IPV6)
2791 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2792 else
2793 #endif
2794 hashkey_faddr = inp->inp_faddr.s_addr;
2795
2796 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2797 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2798
2799 CK_LIST_REMOVE(inp, inp_hash);
2800 CK_LIST_INSERT_HEAD(head, inp, inp_hash);
2801
2802 #ifdef PCBGROUP
2803 if (m != NULL)
2804 in_pcbgroup_update_mbuf(inp, m);
2805 else
2806 in_pcbgroup_update(inp);
2807 #endif
2808 }
2809
/*
 * Rehash inp with no mbuf available: forwards to in_pcbrehash_mbuf()
 * with a NULL mbuf.
 */
void
in_pcbrehash(struct inpcb *inp)
{

	in_pcbrehash_mbuf(inp, NULL);
}
2816
2817 /*
2818 * Remove PCB from various lists.
2819 */
2820 static void
in_pcbremlists(struct inpcb * inp)2821 in_pcbremlists(struct inpcb *inp)
2822 {
2823 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2824
2825 INP_WLOCK_ASSERT(inp);
2826 INP_LIST_WLOCK_ASSERT(pcbinfo);
2827
2828 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
2829 if (inp->inp_flags & INP_INHASHLIST) {
2830 struct inpcbport *phd = inp->inp_phd;
2831
2832 INP_HASH_WLOCK(pcbinfo);
2833
2834 /* XXX: Only do if SO_REUSEPORT_LB set? */
2835 in_pcbremlbgrouphash(inp);
2836
2837 CK_LIST_REMOVE(inp, inp_hash);
2838 CK_LIST_REMOVE(inp, inp_portlist);
2839 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
2840 CK_LIST_REMOVE(phd, phd_hash);
2841 NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
2842 }
2843 INP_HASH_WUNLOCK(pcbinfo);
2844 inp->inp_flags &= ~INP_INHASHLIST;
2845 }
2846 CK_LIST_REMOVE(inp, inp_list);
2847 pcbinfo->ipi_count--;
2848 #ifdef PCBGROUP
2849 in_pcbgroup_remove(inp);
2850 #endif
2851 }
2852
2853 /*
2854 * Check for alternatives when higher level complains
2855 * about service problems. For now, invalidate cached
2856 * routing information. If the route was created dynamically
2857 * (by a redirect), time to try a default gateway again.
2858 */
2859 void
in_losing(struct inpcb * inp)2860 in_losing(struct inpcb *inp)
2861 {
2862
2863 RO_INVALIDATE_CACHE(&inp->inp_route);
2864 return;
2865 }
2866
/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 *
 * The function body is compiled only when MAC is defined; otherwise this
 * is a no-op.  The socket must have an inpcb attached (asserted).
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* The inpcb lock is taken before the socket lock. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2887
2888 /*
2889 * ipport_tick runs once per second, determining if random port allocation
2890 * should be continued. If more than ipport_randomcps ports have been
2891 * allocated in the last second, then we return to sequential port
2892 * allocation. We return to random allocation only once we drop below
2893 * ipport_randomcps for at least ipport_randomtime seconds.
2894 */
2895 static void
ipport_tick(void * xtp)2896 ipport_tick(void *xtp)
2897 {
2898 VNET_ITERATOR_DECL(vnet_iter);
2899
2900 VNET_LIST_RLOCK_NOSLEEP();
2901 VNET_FOREACH(vnet_iter) {
2902 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
2903 if (V_ipport_tcpallocs <=
2904 V_ipport_tcplastcount + V_ipport_randomcps) {
2905 if (V_ipport_stoprandom > 0)
2906 V_ipport_stoprandom--;
2907 } else
2908 V_ipport_stoprandom = V_ipport_randomtime;
2909 V_ipport_tcplastcount = V_ipport_tcpallocs;
2910 CURVNET_RESTORE();
2911 }
2912 VNET_LIST_RUNLOCK_NOSLEEP();
2913 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2914 }
2915
/*
 * Stop the ipport_tick callout.  Registered by ipport_tick_init() as a
 * shutdown_pre_sync event handler; the argument is unused.
 */
static void
ip_fini(void *xtp)
{

	callout_stop(&ipport_tick_callout);
}
2922
/*
 * The ipport_callout should start running at about the time we attach the
 * inet or inet6 domains.
 *
 * Initializes the callout, schedules the first ipport_tick() one tick from
 * now, and registers ip_fini() to run at shutdown_pre_sync.  Hooked in via
 * SYSINIT at SI_SUB_PROTO_DOMAIN / SI_ORDER_MIDDLE.
 */
static void
ipport_tick_init(const void *unused __unused)
{

	/* Start ipport_tick. */
	callout_init(&ipport_tick_callout, 1);
	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
		SHUTDOWN_PRI_DEFAULT);
}
SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
    ipport_tick_init, NULL);
2939
/*
 * Function wrapper around the INP_WLOCK() macro.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2946
/*
 * Function wrapper around the INP_WUNLOCK() macro.
 */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2953
/*
 * Function wrapper around the INP_RLOCK() macro.
 */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2960
/*
 * Function wrapper around the INP_RUNLOCK() macro.
 */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2967
2968 #ifdef INVARIANT_SUPPORT
/*
 * Function wrapper around INP_WLOCK_ASSERT(); note that it checks the
 * write lock specifically.  Only built with INVARIANT_SUPPORT.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2975
/*
 * Function wrapper around INP_UNLOCK_ASSERT().  Only built with
 * INVARIANT_SUPPORT.
 */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2982 #endif
2983
/*
 * Call func(inp, arg) on every inpcb on the V_tcbinfo list.  The pcbinfo
 * write lock is held across the whole walk, and each inpcb is write-locked
 * around its callback.
 */
void
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb *inp;

	INP_INFO_WLOCK(&V_tcbinfo);
	CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
		INP_WLOCK(inp);
		func(inp, arg);
		INP_WUNLOCK(inp);
	}
	INP_INFO_WUNLOCK(&V_tcbinfo);
}
2997
/*
 * Return the socket pointer stored in inp (inp_socket).  The caller must
 * hold the inpcb write lock (asserted).
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
3005
/*
 * Return inp's per-protocol control block pointer (inp_ppcb) cast to a
 * struct tcpcb pointer.  The caller must hold the inpcb write lock
 * (asserted).
 */
struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return ((struct tcpcb *)inp->inp_ppcb);
}
3013
/*
 * Return the inp_ip_tos field of inp.  No lock assertion is made here.
 */
int
inp_ip_tos_get(const struct inpcb *inp)
{

	return (inp->inp_ip_tos);
}
3020
/*
 * Store val into the inp_ip_tos field of inp.  No lock assertion is made
 * here.
 */
void
inp_ip_tos_set(struct inpcb *inp, int val)
{

	inp->inp_ip_tos = val;
}
3027
3028 void
inp_4tuple_get(struct inpcb * inp,uint32_t * laddr,uint16_t * lp,uint32_t * faddr,uint16_t * fp)3029 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
3030 uint32_t *faddr, uint16_t *fp)
3031 {
3032
3033 INP_LOCK_ASSERT(inp);
3034 *laddr = inp->inp_laddr.s_addr;
3035 *faddr = inp->inp_faddr.s_addr;
3036 *lp = inp->inp_lport;
3037 *fp = inp->inp_fport;
3038 }
3039
/*
 * Function wrapper around the sotoinpcb() conversion.
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{

	return (sotoinpcb(so));
}
3046
/*
 * Function wrapper around the sototcpcb() conversion.
 */
struct tcpcb *
so_sototcpcb(struct socket *so)
{

	return (sototcpcb(so));
}
3053
3054 /*
3055 * Create an external-format (``xinpcb'') structure using the information in
3056 * the kernel-format in_pcb structure pointed to by inp. This is done to
3057 * reduce the spew of irrelevant information over this interface, to isolate
3058 * user code from changes in the kernel structure, and potentially to provide
3059 * information-hiding if we decide that some of this information should be
3060 * hidden from users.
3061 */
3062 void
in_pcbtoxinpcb(const struct inpcb * inp,struct xinpcb * xi)3063 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
3064 {
3065
3066 bzero(xi, sizeof(*xi));
3067 xi->xi_len = sizeof(struct xinpcb);
3068 if (inp->inp_socket)
3069 sotoxsocket(inp->inp_socket, &xi->xi_socket);
3070 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
3071 xi->inp_gencnt = inp->inp_gencnt;
3072 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
3073 xi->inp_flow = inp->inp_flow;
3074 xi->inp_flowid = inp->inp_flowid;
3075 xi->inp_flowtype = inp->inp_flowtype;
3076 xi->inp_flags = inp->inp_flags;
3077 xi->inp_flags2 = inp->inp_flags2;
3078 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
3079 xi->in6p_cksum = inp->in6p_cksum;
3080 xi->in6p_hops = inp->in6p_hops;
3081 xi->inp_ip_tos = inp->inp_ip_tos;
3082 xi->inp_vflag = inp->inp_vflag;
3083 xi->inp_ip_ttl = inp->inp_ip_ttl;
3084 xi->inp_ip_p = inp->inp_ip_p;
3085 xi->inp_ip_minttl = inp->inp_ip_minttl;
3086 }
3087
3088 #ifdef DDB
/*
 * Emit the indentation string `indent' times on the debugger console.
 * A zero or negative count prints nothing.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
3097
3098 static void
db_print_inconninfo(struct in_conninfo * inc,const char * name,int indent)3099 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
3100 {
3101 char faddr_str[48], laddr_str[48];
3102
3103 db_print_indent(indent);
3104 db_printf("%s at %p\n", name, inc);
3105
3106 indent += 2;
3107
3108 #ifdef INET6
3109 if (inc->inc_flags & INC_ISIPV6) {
3110 /* IPv6. */
3111 ip6_sprintf(laddr_str, &inc->inc6_laddr);
3112 ip6_sprintf(faddr_str, &inc->inc6_faddr);
3113 } else
3114 #endif
3115 {
3116 /* IPv4. */
3117 inet_ntoa_r(inc->inc_laddr, laddr_str);
3118 inet_ntoa_r(inc->inc_faddr, faddr_str);
3119 }
3120 db_print_indent(indent);
3121 db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
3122 ntohs(inc->inc_lport));
3123 db_print_indent(indent);
3124 db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
3125 ntohs(inc->inc_fport));
3126 }
3127
3128 static void
db_print_inpflags(int inp_flags)3129 db_print_inpflags(int inp_flags)
3130 {
3131 int comma;
3132
3133 comma = 0;
3134 if (inp_flags & INP_RECVOPTS) {
3135 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3136 comma = 1;
3137 }
3138 if (inp_flags & INP_RECVRETOPTS) {
3139 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3140 comma = 1;
3141 }
3142 if (inp_flags & INP_RECVDSTADDR) {
3143 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3144 comma = 1;
3145 }
3146 if (inp_flags & INP_ORIGDSTADDR) {
3147 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3148 comma = 1;
3149 }
3150 if (inp_flags & INP_HDRINCL) {
3151 db_printf("%sINP_HDRINCL", comma ? ", " : "");
3152 comma = 1;
3153 }
3154 if (inp_flags & INP_HIGHPORT) {
3155 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3156 comma = 1;
3157 }
3158 if (inp_flags & INP_LOWPORT) {
3159 db_printf("%sINP_LOWPORT", comma ? ", " : "");
3160 comma = 1;
3161 }
3162 if (inp_flags & INP_ANONPORT) {
3163 db_printf("%sINP_ANONPORT", comma ? ", " : "");
3164 comma = 1;
3165 }
3166 if (inp_flags & INP_RECVIF) {
3167 db_printf("%sINP_RECVIF", comma ? ", " : "");
3168 comma = 1;
3169 }
3170 if (inp_flags & INP_MTUDISC) {
3171 db_printf("%sINP_MTUDISC", comma ? ", " : "");
3172 comma = 1;
3173 }
3174 if (inp_flags & INP_RECVTTL) {
3175 db_printf("%sINP_RECVTTL", comma ? ", " : "");
3176 comma = 1;
3177 }
3178 if (inp_flags & INP_DONTFRAG) {
3179 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3180 comma = 1;
3181 }
3182 if (inp_flags & INP_RECVTOS) {
3183 db_printf("%sINP_RECVTOS", comma ? ", " : "");
3184 comma = 1;
3185 }
3186 if (inp_flags & IN6P_IPV6_V6ONLY) {
3187 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3188 comma = 1;
3189 }
3190 if (inp_flags & IN6P_PKTINFO) {
3191 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3192 comma = 1;
3193 }
3194 if (inp_flags & IN6P_HOPLIMIT) {
3195 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3196 comma = 1;
3197 }
3198 if (inp_flags & IN6P_HOPOPTS) {
3199 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3200 comma = 1;
3201 }
3202 if (inp_flags & IN6P_DSTOPTS) {
3203 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3204 comma = 1;
3205 }
3206 if (inp_flags & IN6P_RTHDR) {
3207 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3208 comma = 1;
3209 }
3210 if (inp_flags & IN6P_RTHDRDSTOPTS) {
3211 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3212 comma = 1;
3213 }
3214 if (inp_flags & IN6P_TCLASS) {
3215 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3216 comma = 1;
3217 }
3218 if (inp_flags & IN6P_AUTOFLOWLABEL) {
3219 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3220 comma = 1;
3221 }
3222 if (inp_flags & INP_TIMEWAIT) {
3223 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
3224 comma = 1;
3225 }
3226 if (inp_flags & INP_ONESBCAST) {
3227 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3228 comma = 1;
3229 }
3230 if (inp_flags & INP_DROPPED) {
3231 db_printf("%sINP_DROPPED", comma ? ", " : "");
3232 comma = 1;
3233 }
3234 if (inp_flags & INP_SOCKREF) {
3235 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3236 comma = 1;
3237 }
3238 if (inp_flags & IN6P_RFC2292) {
3239 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3240 comma = 1;
3241 }
3242 if (inp_flags & IN6P_MTU) {
3243 db_printf("IN6P_MTU%s", comma ? ", " : "");
3244 comma = 1;
3245 }
3246 }
3247
3248 static void
db_print_inpvflag(u_char inp_vflag)3249 db_print_inpvflag(u_char inp_vflag)
3250 {
3251 int comma;
3252
3253 comma = 0;
3254 if (inp_vflag & INP_IPV4) {
3255 db_printf("%sINP_IPV4", comma ? ", " : "");
3256 comma = 1;
3257 }
3258 if (inp_vflag & INP_IPV6) {
3259 db_printf("%sINP_IPV6", comma ? ", " : "");
3260 comma = 1;
3261 }
3262 if (inp_vflag & INP_IPV6PROTO) {
3263 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3264 comma = 1;
3265 }
3266 }
3267
3268 static void
db_print_inpcb(struct inpcb * inp,const char * name,int indent)3269 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
3270 {
3271
3272 db_print_indent(indent);
3273 db_printf("%s at %p\n", name, inp);
3274
3275 indent += 2;
3276
3277 db_print_indent(indent);
3278 db_printf("inp_flow: 0x%x\n", inp->inp_flow);
3279
3280 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
3281
3282 db_print_indent(indent);
3283 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
3284 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
3285
3286 db_print_indent(indent);
3287 db_printf("inp_label: %p inp_flags: 0x%x (",
3288 inp->inp_label, inp->inp_flags);
3289 db_print_inpflags(inp->inp_flags);
3290 db_printf(")\n");
3291
3292 db_print_indent(indent);
3293 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
3294 inp->inp_vflag);
3295 db_print_inpvflag(inp->inp_vflag);
3296 db_printf(")\n");
3297
3298 db_print_indent(indent);
3299 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
3300 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
3301
3302 db_print_indent(indent);
3303 #ifdef INET6
3304 if (inp->inp_vflag & INP_IPV6) {
3305 db_printf("in6p_options: %p in6p_outputopts: %p "
3306 "in6p_moptions: %p\n", inp->in6p_options,
3307 inp->in6p_outputopts, inp->in6p_moptions);
3308 db_printf("in6p_icmp6filt: %p in6p_cksum %d "
3309 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
3310 inp->in6p_hops);
3311 } else
3312 #endif
3313 {
3314 db_printf("inp_ip_tos: %d inp_ip_options: %p "
3315 "inp_ip_moptions: %p\n", inp->inp_ip_tos,
3316 inp->inp_options, inp->inp_moptions);
3317 }
3318
3319 db_print_indent(indent);
3320 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
3321 (uintmax_t)inp->inp_gencnt);
3322 }
3323
DB_SHOW_COMMAND(inpcb,db_show_inpcb)3324 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3325 {
3326 struct inpcb *inp;
3327
3328 if (!have_addr) {
3329 db_printf("usage: show inpcb <addr>\n");
3330 return;
3331 }
3332 inp = (struct inpcb *)addr;
3333
3334 db_print_inpcb(inp, "inpcb", 0);
3335 }
3336 #endif /* DDB */
3337
3338 #ifdef RATELIMIT
3339 /*
3340 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3341 * if any.
3342 */
3343 int
in_pcbmodify_txrtlmt(struct inpcb * inp,uint32_t max_pacing_rate)3344 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3345 {
3346 union if_snd_tag_modify_params params = {
3347 .rate_limit.max_rate = max_pacing_rate,
3348 .rate_limit.flags = M_NOWAIT,
3349 };
3350 struct m_snd_tag *mst;
3351 struct ifnet *ifp;
3352 int error;
3353
3354 mst = inp->inp_snd_tag;
3355 if (mst == NULL)
3356 return (EINVAL);
3357
3358 ifp = mst->ifp;
3359 if (ifp == NULL)
3360 return (EINVAL);
3361
3362 if (ifp->if_snd_tag_modify == NULL) {
3363 error = EOPNOTSUPP;
3364 } else {
3365 error = ifp->if_snd_tag_modify(mst, ¶ms);
3366 }
3367 return (error);
3368 }
3369
3370 /*
3371 * Query existing TX rate limit based on the existing
3372 * "inp->inp_snd_tag", if any.
3373 */
3374 int
in_pcbquery_txrtlmt(struct inpcb * inp,uint32_t * p_max_pacing_rate)3375 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3376 {
3377 union if_snd_tag_query_params params = { };
3378 struct m_snd_tag *mst;
3379 struct ifnet *ifp;
3380 int error;
3381
3382 mst = inp->inp_snd_tag;
3383 if (mst == NULL)
3384 return (EINVAL);
3385
3386 ifp = mst->ifp;
3387 if (ifp == NULL)
3388 return (EINVAL);
3389
3390 if (ifp->if_snd_tag_query == NULL) {
3391 error = EOPNOTSUPP;
3392 } else {
3393 error = ifp->if_snd_tag_query(mst, ¶ms);
3394 if (error == 0 && p_max_pacing_rate != NULL)
3395 *p_max_pacing_rate = params.rate_limit.max_rate;
3396 }
3397 return (error);
3398 }
3399
3400 /*
3401 * Query existing TX queue level based on the existing
3402 * "inp->inp_snd_tag", if any.
3403 */
3404 int
in_pcbquery_txrlevel(struct inpcb * inp,uint32_t * p_txqueue_level)3405 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3406 {
3407 union if_snd_tag_query_params params = { };
3408 struct m_snd_tag *mst;
3409 struct ifnet *ifp;
3410 int error;
3411
3412 mst = inp->inp_snd_tag;
3413 if (mst == NULL)
3414 return (EINVAL);
3415
3416 ifp = mst->ifp;
3417 if (ifp == NULL)
3418 return (EINVAL);
3419
3420 if (ifp->if_snd_tag_query == NULL)
3421 return (EOPNOTSUPP);
3422
3423 error = ifp->if_snd_tag_query(mst, ¶ms);
3424 if (error == 0 && p_txqueue_level != NULL)
3425 *p_txqueue_level = params.rate_limit.queue_level;
3426 return (error);
3427 }
3428
3429 /*
3430 * Allocate a new TX rate limit send tag from the network interface
3431 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3432 */
3433 int
in_pcbattach_txrtlmt(struct inpcb * inp,struct ifnet * ifp,uint32_t flowtype,uint32_t flowid,uint32_t max_pacing_rate,struct m_snd_tag ** st)3434 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3435 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3436
3437 {
3438 union if_snd_tag_alloc_params params = {
3439 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3440 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3441 .rate_limit.hdr.flowid = flowid,
3442 .rate_limit.hdr.flowtype = flowtype,
3443 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3444 .rate_limit.max_rate = max_pacing_rate,
3445 .rate_limit.flags = M_NOWAIT,
3446 };
3447 int error;
3448
3449 INP_WLOCK_ASSERT(inp);
3450
3451 /*
3452 * If there is already a send tag, or the INP is being torn
3453 * down, allocating a new send tag is not allowed. Else send
3454 * tags may leak.
3455 */
3456 if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
3457 return (EINVAL);
3458
3459 error = m_snd_tag_alloc(ifp, ¶ms, st);
3460 #ifdef INET
3461 if (error == 0) {
3462 counter_u64_add(rate_limit_set_ok, 1);
3463 counter_u64_add(rate_limit_active, 1);
3464 } else if (error != EOPNOTSUPP)
3465 counter_u64_add(rate_limit_alloc_fail, 1);
3466 #endif
3467 return (error);
3468 }
3469
/*
 * Hand the given send tag to m_snd_tag_rele() and, with INET compiled
 * in, decrement the count of active rate limit tags.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3479
3480 /*
3481 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3482 * if any:
3483 */
3484 void
in_pcbdetach_txrtlmt(struct inpcb * inp)3485 in_pcbdetach_txrtlmt(struct inpcb *inp)
3486 {
3487 struct m_snd_tag *mst;
3488
3489 INP_WLOCK_ASSERT(inp);
3490
3491 mst = inp->inp_snd_tag;
3492 inp->inp_snd_tag = NULL;
3493
3494 if (mst == NULL)
3495 return;
3496
3497 m_snd_tag_rele(mst);
3498 #ifdef INET
3499 counter_u64_add(rate_limit_active, -1);
3500 #endif
3501 }
3502
3503 int
in_pcboutput_txrtlmt_locked(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb,uint32_t max_pacing_rate)3504 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
3505 {
3506 int error;
3507
3508 /*
3509 * If the existing send tag is for the wrong interface due to
3510 * a route change, first drop the existing tag. Set the
3511 * CHANGED flag so that we will keep trying to allocate a new
3512 * tag if we fail to allocate one this time.
3513 */
3514 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
3515 in_pcbdetach_txrtlmt(inp);
3516 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3517 }
3518
3519 /*
3520 * NOTE: When attaching to a network interface a reference is
3521 * made to ensure the network interface doesn't go away until
3522 * all ratelimit connections are gone. The network interface
3523 * pointers compared below represent valid network interfaces,
3524 * except when comparing towards NULL.
3525 */
3526 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3527 error = 0;
3528 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3529 if (inp->inp_snd_tag != NULL)
3530 in_pcbdetach_txrtlmt(inp);
3531 error = 0;
3532 } else if (inp->inp_snd_tag == NULL) {
3533 /*
3534 * In order to utilize packet pacing with RSS, we need
3535 * to wait until there is a valid RSS hash before we
3536 * can proceed:
3537 */
3538 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3539 error = EAGAIN;
3540 } else {
3541 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3542 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
3543 }
3544 } else {
3545 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3546 }
3547 if (error == 0 || error == EOPNOTSUPP)
3548 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3549
3550 return (error);
3551 }
3552
3553 /*
3554 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3555 * is set in the fast path and will attach/detach/modify the TX rate
3556 * limit send tag based on the socket's so_max_pacing_rate value.
3557 */
3558 void
in_pcboutput_txrtlmt(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb)3559 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3560 {
3561 struct socket *socket;
3562 uint32_t max_pacing_rate;
3563 bool did_upgrade;
3564 int error;
3565
3566 if (inp == NULL)
3567 return;
3568
3569 socket = inp->inp_socket;
3570 if (socket == NULL)
3571 return;
3572
3573 if (!INP_WLOCKED(inp)) {
3574 /*
3575 * NOTE: If the write locking fails, we need to bail
3576 * out and use the non-ratelimited ring for the
3577 * transmit until there is a new chance to get the
3578 * write lock.
3579 */
3580 if (!INP_TRY_UPGRADE(inp))
3581 return;
3582 did_upgrade = 1;
3583 } else {
3584 did_upgrade = 0;
3585 }
3586
3587 /*
3588 * NOTE: The so_max_pacing_rate value is read unlocked,
3589 * because atomic updates are not required since the variable
3590 * is checked at every mbuf we send. It is assumed that the
3591 * variable read itself will be atomic.
3592 */
3593 max_pacing_rate = socket->so_max_pacing_rate;
3594
3595 error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3596
3597 if (did_upgrade)
3598 INP_DOWNGRADE(inp);
3599 }
3600
3601 /*
3602 * Track route changes for TX rate limiting.
3603 */
3604 void
in_pcboutput_eagain(struct inpcb * inp)3605 in_pcboutput_eagain(struct inpcb *inp)
3606 {
3607 bool did_upgrade;
3608
3609 if (inp == NULL)
3610 return;
3611
3612 if (inp->inp_snd_tag == NULL)
3613 return;
3614
3615 if (!INP_WLOCKED(inp)) {
3616 /*
3617 * NOTE: If the write locking fails, we need to bail
3618 * out and use the non-ratelimited ring for the
3619 * transmit until there is a new chance to get the
3620 * write lock.
3621 */
3622 if (!INP_TRY_UPGRADE(inp))
3623 return;
3624 did_upgrade = 1;
3625 } else {
3626 did_upgrade = 0;
3627 }
3628
3629 /* detach rate limiting */
3630 in_pcbdetach_txrtlmt(inp);
3631
3632 /* make sure new mbuf send tag allocation is made */
3633 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3634
3635 if (did_upgrade)
3636 INP_DOWNGRADE(inp);
3637 }
3638
3639 #ifdef INET
3640 static void
rl_init(void * st)3641 rl_init(void *st)
3642 {
3643 rate_limit_active = counter_u64_alloc(M_WAITOK);
3644 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3645 rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3646 }
3647
3648 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3649 #endif
3650 #endif /* RATELIMIT */
3651