1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
5 * The Regents of the University of California.
6 * Copyright (c) 2008 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * Copyright (c) 2014 Kevin Lo
9 * All rights reserved.
10 *
11 * Portions of this software were developed by Robert N. M. Watson under
12 * contract to Juniper Networks, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95
39 */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43
44 #include "opt_inet.h"
45 #include "opt_inet6.h"
46 #include "opt_ipsec.h"
47 #include "opt_route.h"
48 #include "opt_rss.h"
49
50 #include <sys/param.h>
51 #include <sys/domain.h>
52 #include <sys/eventhandler.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/lock.h>
56 #include <sys/malloc.h>
57 #include <sys/mbuf.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/protosw.h>
61 #include <sys/sdt.h>
62 #include <sys/signalvar.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sx.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/systm.h>
69
70 #include <vm/uma.h>
71
72 #include <net/if.h>
73 #include <net/if_var.h>
74 #include <net/route.h>
75 #include <net/route/nhop.h>
76 #include <net/rss_config.h>
77
78 #include <netinet/in.h>
79 #include <netinet/in_kdtrace.h>
80 #include <netinet/in_fib.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip.h>
85 #ifdef INET6
86 #include <netinet/ip6.h>
87 #endif
88 #include <netinet/ip_icmp.h>
89 #include <netinet/icmp_var.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/ip_options.h>
92 #ifdef INET6
93 #include <netinet6/ip6_var.h>
94 #endif
95 #include <netinet/udp.h>
96 #include <netinet/udp_var.h>
97 #include <netinet/udplite.h>
98 #include <netinet/in_rss.h>
99
100 #include <netipsec/ipsec_support.h>
101
102 #include <machine/in_cksum.h>
103
104 #include <security/mac/mac_framework.h>
105
106 /*
107 * UDP and UDP-Lite protocols implementation.
108 * Per RFC 768, August, 1980.
109 * Per RFC 3828, July, 2004.
110 */
111
/*
 * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
 * removes the only data integrity mechanism for packets and malformed
 * packets that would otherwise be discarded due to bad checksums, and may
 * cause problems (especially for NFS data blocks).
 */
/* net.inet.udp.checksum: per-VNET knob, initialised to 1. */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/* net.inet.udp.log_in_vain: per-VNET knob, initialised to 0. */
VNET_DEFINE(int, udp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets");

/* net.inet.udp.blackhole: per-VNET knob, initialised to 0. */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");

/* net.inet.udp.maxdgram: a plain global, not virtualised per VNET. */
u_long	udp_sendspace = 9216;		/* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * net.inet.udp.recvspace: a plain global.  The initial value allows for 40
 * datagrams of 1K each, every one accompanied by a socket address
 * (sockaddr_in6 when INET6 is compiled in, sockaddr_in otherwise).
 */
u_long	udp_recvspace = 40 * (1024 +
#ifdef INET6
				      sizeof(struct sockaddr_in6)
#else
				      sizeof(struct sockaddr_in)
#endif
				      );	/* 40 1K datagrams */

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
145
/*
 * Per-VNET protocol control block lists and pcbinfo state: one pair for
 * UDP (udb/udbinfo) and one pair for UDP-Lite (ulitecb/ulitecbinfo).
 */
VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
VNET_DEFINE(struct inpcbhead, ulitecb);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
/* UMA zone from which udp_newudpcb() allocates struct udpcb. */
VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
#define	V_udpcb_zone			VNET(udpcb_zone)

/*
 * Hash size passed (twice) to in_pcbinfo_init() by udp_init() and
 * udplite_init(); a build may predefine it to override the default.
 */
#ifndef UDBHASHSIZE
#define	UDBHASHSIZE	128
#endif

/* UDP statistics counters, exported read-only as net.inet.udp.stats. */
VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
VNET_PCPUSTAT_SYSINIT(udpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(udpstat);
#endif /* VIMAGE */
/* Forward declarations of static routines that only exist with INET. */
#ifdef INET
static void	udp_detach(struct socket *so);
static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
		    struct mbuf *, struct thread *, int);
#endif
170
171 static void
udp_zone_change(void * tag)172 udp_zone_change(void *tag)
173 {
174
175 uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
176 uma_zone_set_max(V_udpcb_zone, maxsockets);
177 }
178
/*
 * Item initialiser handed to in_pcbinfo_init() for the UDP inpcb zone.
 * Sets up the lock embedded in the inpcb at mem; size and flags are not
 * consulted.  Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp = mem;

	INP_LOCK_INIT(inp, "inp", "udpinp");
	return (0);
}
188
/*
 * Item initialiser handed to in_pcbinfo_init() for the UDP-Lite inpcb zone.
 * Sets up the lock embedded in the inpcb at mem; size and flags are not
 * consulted.  Always returns 0.
 */
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp = mem;

	INP_LOCK_INIT(inp, "inp", "udpliteinp");
	return (0);
}
198
199 void
udp_init(void)200 udp_init(void)
201 {
202
203 /*
204 * For now default to 2-tuple UDP hashing - until the fragment
205 * reassembly code can also update the flowid.
206 *
207 * Once we can calculate the flowid that way and re-establish
208 * a 4-tuple, flip this to 4-tuple.
209 */
210 in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
211 "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE);
212 V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
213 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
214 uma_zone_set_max(V_udpcb_zone, maxsockets);
215 uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
216 EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
217 EVENTHANDLER_PRI_ANY);
218 }
219
220 void
udplite_init(void)221 udplite_init(void)
222 {
223
224 in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
225 UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init,
226 IPI_HASHFIELDS_2TUPLE);
227 }
228
229 /*
230 * Kernel module interface for updating udpstat. The argument is an index
231 * into udpstat treated as an array of u_long. While this encodes the
232 * general layout of udpstat into the caller, it doesn't encode its location,
233 * so that future changes to add, for example, per-CPU stats support won't
234 * cause binary compatibility problems for kernel modules.
235 */
236 void
kmod_udpstat_inc(int statnum)237 kmod_udpstat_inc(int statnum)
238 {
239
240 counter_u64_add(VNET(udpstat)[statnum], 1);
241 }
242
243 int
udp_newudpcb(struct inpcb * inp)244 udp_newudpcb(struct inpcb *inp)
245 {
246 struct udpcb *up;
247
248 up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
249 if (up == NULL)
250 return (ENOBUFS);
251 inp->inp_ppcb = up;
252 return (0);
253 }
254
255 void
udp_discardcb(struct udpcb * up)256 udp_discardcb(struct udpcb *up)
257 {
258
259 uma_zfree(V_udpcb_zone, up);
260 }
261
262 #ifdef VIMAGE
263 static void
udp_destroy(void * unused __unused)264 udp_destroy(void *unused __unused)
265 {
266
267 in_pcbinfo_destroy(&V_udbinfo);
268 uma_zdestroy(V_udpcb_zone);
269 }
270 VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
271
272 static void
udplite_destroy(void * unused __unused)273 udplite_destroy(void *unused __unused)
274 {
275
276 in_pcbinfo_destroy(&V_ulitecbinfo);
277 }
278 VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy,
279 NULL);
280 #endif
281
282 #ifdef INET
/*
 * Subroutine of udp_input(), which appends the provided mbuf chain to the
 * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
 * contains the source address.  If the socket ends up being an IPv6 socket,
 * udp_append() will convert to a sockaddr_in6 before passing the address
 * into the socket code.
 *
 * Arguments:
 *   inp    - destination pcb, locked by the caller.
 *   ip     - IP header of the datagram (used for control-message data).
 *   n      - the datagram; ownership passes to this function.
 *   off    - offset of the UDP header within n.
 *   udp_in - two-element array: [0] is the source address, [1] is the
 *            destination address (as filled in by udp_input()).
 *
 * In the normal case udp_append() will return 0, indicating that you
 * must unlock the inp.  However if a tunneling protocol is in place we increment
 * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
 * then decrement the reference count.  If the inp_rele returns 1, indicating the
 * inp is gone, we return that to the caller to tell them *not* to unlock
 * the inp.  In the case of multi-cast this will cause the distribution
 * to stop (though most tunneling protocols known currently do *not* use
 * multicast).
 */
static int
udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
    struct sockaddr_in *udp_in)
{
	struct sockaddr *append_sa;
	struct socket *so;
	struct mbuf *tmpopts, *opts = NULL;
#ifdef INET6
	struct sockaddr_in6 udp_in6;
#endif
	struct udpcb *up;

	INP_LOCK_ASSERT(inp);

	/*
	 * Engage the tunneling protocol.  The pcb is referenced and unlocked
	 * around the callback; the value of in_pcbrele_rlocked() is what
	 * tells the caller whether the pcb is still there to be unlocked.
	 */
	up = intoudpcb(inp);
	if (up->u_tun_func != NULL) {
		in_pcbref(inp);
		INP_RUNLOCK(inp);
		(*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
		    up->u_tun_ctx);
		INP_RLOCK(inp);
		return (in_pcbrele_rlocked(inp));
	}

	/* From here on, off is the offset of the UDP payload. */
	off += sizeof(struct udphdr);

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	/* Check AH/ESP integrity. */
	if (IPSEC_ENABLED(ipv4) &&
	    IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) {
		m_freem(n);
		return (0);
	}
	if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */
		if (IPSEC_ENABLED(ipv4) &&
		    UDPENCAP_INPUT(n, off, AF_INET) != 0)
			return (0);	/* Consumed. */
	}
#endif /* IPSEC */
#ifdef MAC
	/* Drop the datagram if the MAC policy refuses delivery. */
	if (mac_inpcb_check_deliver(inp, n) != 0) {
		m_freem(n);
		return (0);
	}
#endif /* MAC */
	/*
	 * Collect control messages when the pcb asked for control options or
	 * the socket asked for timestamps.
	 */
	if (inp->inp_flags & INP_CONTROLOPTS ||
	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6)
			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
		else
#endif /* INET6 */
			ip_savecontrol(inp, &opts, ip, n);
	}
	/*
	 * IP_ORIGDSTADDR: prepend a control message carrying the destination
	 * address (udp_in[1]) to whatever was collected above.  A failed
	 * sbcreatecontrol() is silently ignored.
	 */
	if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
		tmpopts = sbcreatecontrol((caddr_t)&udp_in[1],
			sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP);
		if (tmpopts) {
			if (opts) {
				tmpopts->m_next = opts;
				opts = tmpopts;
			} else
				opts = tmpopts;
		}
	}
#ifdef INET6
	/* IPv6 sockets are given the source as a v4-mapped sockaddr_in6. */
	if (inp->inp_vflag & INP_IPV6) {
		bzero(&udp_in6, sizeof(udp_in6));
		udp_in6.sin6_len = sizeof(udp_in6);
		udp_in6.sin6_family = AF_INET6;
		in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6);
		append_sa = (struct sockaddr *)&udp_in6;
	} else
#endif /* INET6 */
		append_sa = (struct sockaddr *)&udp_in[0];
	/* Strip everything in front of the payload. */
	m_adj(n, off);

	so = inp->inp_socket;
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
		/* Append failed: count the overflow and free data and options. */
		soroverflow_locked(so);
		m_freem(n);
		if (opts)
			m_freem(opts);
		UDPSTAT_INC(udps_fullsock);
	} else
		sorwakeup_locked(so);
	return (0);
}
391
/*
 * Input routine for IPv4 UDP and UDP-Lite datagrams.
 *
 * Arguments:
 *   mp    - address of the packet's mbuf pointer; *mp is cleared on entry.
 *   offp  - address of the IP header length of the packet.
 *   proto - IPPROTO_UDP or IPPROTO_UDPLITE; selects the pcb tables, the
 *           DTrace probes and the checksum-coverage rules.
 *
 * The routine validates lengths and the checksum, then either distributes
 * the datagram to every matching pcb (multicast/broadcast destination) or
 * looks up a single pcb and hands the datagram to udp_append().  Every path
 * disposes of the mbuf chain (m_freem(), icmp_error() or udp_append()) and
 * returns IPPROTO_DONE.
 */
int
udp_input(struct mbuf **mp, int *offp, int proto)
{
	struct ip *ip;
	struct udphdr *uh;
	struct ifnet *ifp;
	struct inpcb *inp;
	uint16_t len, ip_len;
	struct inpcbinfo *pcbinfo;
	struct ip save_ip;
	struct sockaddr_in udp_in[2];	/* [0] = source, [1] = destination */
	struct mbuf *m;
	struct m_tag *fwd_tag;
	int cscov_partial, iphlen;

	m = *mp;
	iphlen = *offp;
	ifp = m->m_pkthdr.rcvif;
	*mp = NULL;
	UDPSTAT_INC(udps_ipackets);

	/*
	 * Strip IP options, if any; should skip this, make available to
	 * user, and use on returned packets, but we don't yet have a way to
	 * check the checksum with options still present.
	 */
	if (iphlen > sizeof (struct ip)) {
		ip_stripoptions(m);
		iphlen = sizeof(struct ip);
	}

	/*
	 * Get IP and UDP header together in first mbuf.
	 */
	if (m->m_len < iphlen + sizeof(struct udphdr)) {
		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
			UDPSTAT_INC(udps_hdrops);
			return (IPPROTO_DONE);
		}
	}
	ip = mtod(m, struct ip *);
	uh = (struct udphdr *)((caddr_t)ip + iphlen);
	/* UDP-Lite starts out assuming partial checksum coverage. */
	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;

	/*
	 * Destination port of 0 is illegal, based on RFC768.
	 */
	if (uh->uh_dport == 0)
		goto badunlocked;

	/*
	 * Construct sockaddr format source address.  Stuff source address
	 * and datagram in user buffer.  The second array element receives
	 * the destination address and port.
	 */
	bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2);
	udp_in[0].sin_len = sizeof(struct sockaddr_in);
	udp_in[0].sin_family = AF_INET;
	udp_in[0].sin_port = uh->uh_sport;
	udp_in[0].sin_addr = ip->ip_src;
	udp_in[1].sin_len = sizeof(struct sockaddr_in);
	udp_in[1].sin_family = AF_INET;
	udp_in[1].sin_port = uh->uh_dport;
	udp_in[1].sin_addr = ip->ip_dst;

	/*
	 * Make mbuf data length reflect UDP length.  If not enough data to
	 * reflect UDP length, drop.
	 */
	len = ntohs((u_short)uh->uh_ulen);
	ip_len = ntohs(ip->ip_len) - iphlen;
	if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
		/* Zero means checksum over the complete packet. */
		if (len == 0)
			len = ip_len;
		cscov_partial = 0;
	}
	if (ip_len != len) {
		if (len > ip_len || len < sizeof(struct udphdr)) {
			UDPSTAT_INC(udps_badlen);
			goto badunlocked;
		}
		/*
		 * len < ip_len here, so the adjustment is negative.  It is
		 * applied for UDP only; for UDP-Lite the packet is left as is.
		 */
		if (proto == IPPROTO_UDP)
			m_adj(m, len - ip_len);
	}

	/*
	 * Save a copy of the IP header in case we want restore it for
	 * sending an ICMP error message in response.  With udp_blackhole
	 * set the copy is just zeroed.
	 */
	if (!V_udp_blackhole)
		save_ip = *ip;
	else
		memset(&save_ip, 0, sizeof(save_ip));

	/*
	 * Checksum extended UDP header and data.
	 */
	if (uh->uh_sum) {
		u_short uh_sum;

		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
		    !cscov_partial) {
			/*
			 * The packet header carries a checksum marked valid:
			 * use csum_data directly if it already includes the
			 * pseudo header, otherwise fold the pseudo header in.
			 */
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				uh_sum = m->m_pkthdr.csum_data;
			else
				uh_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr, htonl((u_short)len +
				    m->m_pkthdr.csum_data + proto));
			uh_sum ^= 0xffff;
		} else {
			char b[9];

			/*
			 * Software checksum: save the first 9 bytes of the
			 * IP header, zero them and store the length in the
			 * ipovly overlay, run in_cksum() over overlay plus
			 * len bytes, then put the saved bytes back.
			 */
			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
			bzero(((struct ipovly *)ip)->ih_x1, 9);
			((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
			    uh->uh_ulen : htons(ip_len);
			uh_sum = in_cksum(m, len + sizeof (struct ip));
			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
		}
		if (uh_sum) {
			UDPSTAT_INC(udps_badsum);
			m_freem(m);
			return (IPPROTO_DONE);
		}
	} else {
		if (proto == IPPROTO_UDP) {
			UDPSTAT_INC(udps_nosum);
		} else {
			/* UDPLite requires a checksum */
			/* XXX: What is the right UDPLite MIB counter here? */
			m_freem(m);
			return (IPPROTO_DONE);
		}
	}

	pcbinfo = udp_get_inpcbinfo(proto);
	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
	    in_broadcast(ip->ip_dst, ifp)) {
		/*
		 * Multicast or broadcast destination: walk the whole pcb
		 * list and deliver to every match.  "last" is the previous
		 * match, still read-locked; whenever another match turns up
		 * a copy goes to the previous one, and the original mbuf is
		 * given to the final match after the loop.
		 */
		struct inpcb *last;
		struct inpcbhead *pcblist;

		NET_EPOCH_ASSERT();

		pcblist = udp_get_pcblist(proto);
		last = NULL;
		CK_LIST_FOREACH(inp, pcblist, inp_list) {
			/* Unlocked pre-filter on ports and addresses. */
			if (inp->inp_lport != uh->uh_dport)
				continue;
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_laddr.s_addr != INADDR_ANY &&
			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
				continue;
			if (inp->inp_faddr.s_addr != INADDR_ANY &&
			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
				continue;
			if (inp->inp_fport != 0 &&
			    inp->inp_fport != uh->uh_sport)
				continue;

			INP_RLOCK(inp);

			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
				INP_RUNLOCK(inp);
				continue;
			}

			/*
			 * XXXRW: Because we weren't holding either the inpcb
			 * or the hash lock when we checked for a match
			 * before, we should probably recheck now that the
			 * inpcb lock is held.
			 */

			/*
			 * Handle socket delivery policy for any-source
			 * and source-specific multicast. [RFC3678]
			 */
			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
				struct ip_moptions	*imo;
				struct sockaddr_in	 group;
				int			 blocked;

				/* No multicast options: skip this pcb. */
				imo = inp->inp_moptions;
				if (imo == NULL) {
					INP_RUNLOCK(inp);
					continue;
				}
				bzero(&group, sizeof(struct sockaddr_in));
				group.sin_len = sizeof(struct sockaddr_in);
				group.sin_family = AF_INET;
				group.sin_addr = ip->ip_dst;

				blocked = imo_multi_filter(imo, ifp,
					(struct sockaddr *)&group,
					(struct sockaddr *)&udp_in[0]);
				if (blocked != MCAST_PASS) {
					if (blocked == MCAST_NOTGMEMBER)
						IPSTAT_INC(ips_notmember);
					if (blocked == MCAST_NOTSMEMBER ||
					    blocked == MCAST_MUTED)
						UDPSTAT_INC(udps_filtermcast);
					INP_RUNLOCK(inp);
					continue;
				}
			}
			if (last != NULL) {
				struct mbuf *n;

				/*
				 * Deliver a copy to the previous match.  If
				 * the copy cannot be made, that pcb simply
				 * misses this datagram.
				 */
				if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
				    NULL) {
					if (proto == IPPROTO_UDPLITE)
						UDPLITE_PROBE(receive, NULL, last, ip,
						    last, uh);
					else
						UDP_PROBE(receive, NULL, last, ip, last,
						    uh);
					/*
					 * Non-zero means "last" is gone and
					 * must not be unlocked: release only
					 * the current pcb and stop.
					 */
					if (udp_append(last, ip, n, iphlen,
						udp_in)) {
						INP_RUNLOCK(inp);
						goto badunlocked;
					}
				}
				/* Release PCB lock taken on previous pass. */
				INP_RUNLOCK(last);
			}
			last = inp;
			/*
			 * Don't look for additional matches if this one does
			 * not have either the SO_REUSEPORT or SO_REUSEADDR
			 * socket options set.  This heuristic avoids
			 * searching through all pcbs in the common case of a
			 * non-shared port.  It assumes that an application
			 * will never clear these options after setting them.
			 */
			if ((last->inp_socket->so_options &
			    (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
				break;
		}

		if (last == NULL) {
			/*
			 * No matching pcb found; discard datagram.  (No need
			 * to send an ICMP Port Unreachable for a broadcast
			 * or multicast datagram.)
			 */
			UDPSTAT_INC(udps_noport);
			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
				UDPSTAT_INC(udps_noportmcast);
			else
				UDPSTAT_INC(udps_noportbcast);
			goto badunlocked;
		}
		if (proto == IPPROTO_UDPLITE)
			UDPLITE_PROBE(receive, NULL, last, ip, last, uh);
		else
			UDP_PROBE(receive, NULL, last, ip, last, uh);
		/* The final match receives the original chain. */
		if (udp_append(last, ip, m, iphlen, udp_in) == 0)
			INP_RUNLOCK(last);
		return (IPPROTO_DONE);
	}

	/*
	 * Locate pcb for datagram.
	 */

	/*
	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
	 */
	if ((m->m_flags & M_IP_NEXTHOP) &&
	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
		struct sockaddr_in *next_hop;

		/* The next-hop address is stored right after the tag header. */
		next_hop = (struct sockaddr_in *)(fwd_tag + 1);

		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in_pcblookup(pcbinfo, ip->ip_src,
			    uh->uh_sport, next_hop->sin_addr,
			    next_hop->sin_port ? htons(next_hop->sin_port) :
			    uh->uh_dport, INPLOOKUP_WILDCARD |
			    INPLOOKUP_RLOCKPCB, ifp);
		}
		/* Remove the tag from the packet. We don't need it anymore. */
		m_tag_delete(m, fwd_tag);
		m->m_flags &= ~M_IP_NEXTHOP;
	} else
		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
		    INPLOOKUP_RLOCKPCB, ifp, m);
	if (inp == NULL) {
		/*
		 * Nobody is listening: optionally log, count, and unless
		 * the packet was link-level broadcast/multicast, blackholing
		 * is on, or the rate limit says no, answer with an ICMP
		 * port unreachable built from the saved IP header.
		 */
		if (V_udp_log_in_vain) {
			char src[INET_ADDRSTRLEN];
			char dst[INET_ADDRSTRLEN];

			log(LOG_INFO,
			    "Connection attempt to UDP %s:%d from %s:%d\n",
			    inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport),
			    inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport));
		}
		if (proto == IPPROTO_UDPLITE)
			UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh);
		else
			UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
		UDPSTAT_INC(udps_noport);
		if (m->m_flags & (M_BCAST | M_MCAST)) {
			UDPSTAT_INC(udps_noportbcast);
			goto badunlocked;
		}
		if (V_udp_blackhole)
			goto badunlocked;
		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
			goto badunlocked;
		*ip = save_ip;
		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
		return (IPPROTO_DONE);
	}

	/*
	 * Check the minimum TTL for socket.
	 */
	INP_RLOCK_ASSERT(inp);
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
		if (proto == IPPROTO_UDPLITE)
			UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
		else
			UDP_PROBE(receive, NULL, inp, ip, inp, uh);
		INP_RUNLOCK(inp);
		m_freem(m);
		return (IPPROTO_DONE);
	}
	if (cscov_partial) {
		struct udpcb *up;

		/*
		 * Partially covered UDP-Lite datagram: drop it when the
		 * socket's receive coverage is unset (0) or larger than the
		 * coverage of this datagram.
		 */
		up = intoudpcb(inp);
		if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (IPPROTO_DONE);
		}
	}

	if (proto == IPPROTO_UDPLITE)
		UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
	else
		UDP_PROBE(receive, NULL, inp, ip, inp, uh);
	/* udp_append() returns 0 when the pcb still needs unlocking. */
	if (udp_append(inp, ip, m, iphlen, udp_in) == 0)
		INP_RUNLOCK(inp);
	return (IPPROTO_DONE);

badunlocked:
	/* Common drop path; no pcb lock is held here. */
	m_freem(m);
	return (IPPROTO_DONE);
}
758 #endif /* INET */
759
760 /*
761 * Notify a udp user of an asynchronous error; just wake up so that they can
762 * collect error status.
763 */
764 struct inpcb *
udp_notify(struct inpcb * inp,int errno)765 udp_notify(struct inpcb *inp, int errno)
766 {
767
768 INP_WLOCK_ASSERT(inp);
769 if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
770 errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
771 NH_FREE(inp->inp_route.ro_nh);
772 inp->inp_route.ro_nh = (struct nhop_object *)NULL;
773 }
774
775 inp->inp_socket->so_error = errno;
776 sorwakeup(inp->inp_socket);
777 sowwakeup(inp->inp_socket);
778 return (inp);
779 }
780
781 #ifdef INET
782 static void
udp_common_ctlinput(int cmd,struct sockaddr * sa,void * vip,struct inpcbinfo * pcbinfo)783 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
784 struct inpcbinfo *pcbinfo)
785 {
786 struct ip *ip = vip;
787 struct udphdr *uh;
788 struct in_addr faddr;
789 struct inpcb *inp;
790
791 faddr = ((struct sockaddr_in *)sa)->sin_addr;
792 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
793 return;
794
795 if (PRC_IS_REDIRECT(cmd)) {
796 /* signal EHOSTDOWN, as it flushes the cached route */
797 in_pcbnotifyall(pcbinfo, faddr, EHOSTDOWN, udp_notify);
798 return;
799 }
800
801 /*
802 * Hostdead is ugly because it goes linearly through all PCBs.
803 *
804 * XXX: We never get this from ICMP, otherwise it makes an excellent
805 * DoS attack on machines with many connections.
806 */
807 if (cmd == PRC_HOSTDEAD)
808 ip = NULL;
809 else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
810 return;
811 if (ip != NULL) {
812 uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
813 inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
814 ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL);
815 if (inp != NULL) {
816 INP_WLOCK_ASSERT(inp);
817 if (inp->inp_socket != NULL) {
818 udp_notify(inp, inetctlerrmap[cmd]);
819 }
820 INP_WUNLOCK(inp);
821 } else {
822 inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
823 ip->ip_src, uh->uh_sport,
824 INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
825 if (inp != NULL) {
826 struct udpcb *up;
827 void *ctx;
828 udp_tun_icmp_t func;
829
830 up = intoudpcb(inp);
831 ctx = up->u_tun_ctx;
832 func = up->u_icmp_func;
833 INP_RUNLOCK(inp);
834 if (func != NULL)
835 (*func)(cmd, sa, vip, ctx);
836 }
837 }
838 } else
839 in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
840 udp_notify);
841 }
842 void
udp_ctlinput(int cmd,struct sockaddr * sa,void * vip)843 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
844 {
845
846 return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
847 }
848
849 void
udplite_ctlinput(int cmd,struct sockaddr * sa,void * vip)850 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
851 {
852
853 return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
854 }
855 #endif /* INET */
856
857 static int
udp_pcblist(SYSCTL_HANDLER_ARGS)858 udp_pcblist(SYSCTL_HANDLER_ARGS)
859 {
860 struct xinpgen xig;
861 struct epoch_tracker et;
862 struct inpcb *inp;
863 int error;
864
865 if (req->newptr != 0)
866 return (EPERM);
867
868 if (req->oldptr == 0) {
869 int n;
870
871 n = V_udbinfo.ipi_count;
872 n += imax(n / 8, 10);
873 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
874 return (0);
875 }
876
877 if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
878 return (error);
879
880 bzero(&xig, sizeof(xig));
881 xig.xig_len = sizeof xig;
882 xig.xig_count = V_udbinfo.ipi_count;
883 xig.xig_gen = V_udbinfo.ipi_gencnt;
884 xig.xig_sogen = so_gencnt;
885 error = SYSCTL_OUT(req, &xig, sizeof xig);
886 if (error)
887 return (error);
888
889 NET_EPOCH_ENTER(et);
890 for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead);
891 inp != NULL;
892 inp = CK_LIST_NEXT(inp, inp_list)) {
893 INP_RLOCK(inp);
894 if (inp->inp_gencnt <= xig.xig_gen &&
895 cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
896 struct xinpcb xi;
897
898 in_pcbtoxinpcb(inp, &xi);
899 INP_RUNLOCK(inp);
900 error = SYSCTL_OUT(req, &xi, sizeof xi);
901 if (error)
902 break;
903 } else
904 INP_RUNLOCK(inp);
905 }
906 NET_EPOCH_EXIT(et);
907
908 if (!error) {
909 /*
910 * Give the user an updated idea of our state. If the
911 * generation differs from what we told her before, she knows
912 * that something happened while we were processing this
913 * request, and it might be necessary to retry.
914 */
915 xig.xig_gen = V_udbinfo.ipi_gencnt;
916 xig.xig_sogen = so_gencnt;
917 xig.xig_count = V_udbinfo.ipi_count;
918 error = SYSCTL_OUT(req, &xig, sizeof xig);
919 }
920
921 return (error);
922 }
923
924 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
925 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
926 udp_pcblist, "S,xinpcb",
927 "List of active UDP sockets");
928
929 #ifdef INET
930 static int
udp_getcred(SYSCTL_HANDLER_ARGS)931 udp_getcred(SYSCTL_HANDLER_ARGS)
932 {
933 struct xucred xuc;
934 struct sockaddr_in addrs[2];
935 struct epoch_tracker et;
936 struct inpcb *inp;
937 int error;
938
939 error = priv_check(req->td, PRIV_NETINET_GETCRED);
940 if (error)
941 return (error);
942 error = SYSCTL_IN(req, addrs, sizeof(addrs));
943 if (error)
944 return (error);
945 NET_EPOCH_ENTER(et);
946 inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
947 addrs[0].sin_addr, addrs[0].sin_port,
948 INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
949 NET_EPOCH_EXIT(et);
950 if (inp != NULL) {
951 INP_RLOCK_ASSERT(inp);
952 if (inp->inp_socket == NULL)
953 error = ENOENT;
954 if (error == 0)
955 error = cr_canseeinpcb(req->td->td_ucred, inp);
956 if (error == 0)
957 cru2x(inp->inp_cred, &xuc);
958 INP_RUNLOCK(inp);
959 } else
960 error = ENOENT;
961 if (error == 0)
962 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
963 return (error);
964 }
965
966 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
967 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
968 0, 0, udp_getcred, "S,xucred",
969 "Get the xucred of a UDP connection");
970 #endif /* INET */
971
/*
 * Socket option handler for UDP and UDP-Lite sockets.
 *
 * Options whose level is not the socket's own protocol are passed on to
 * ip6_ctloutput() for AF_INET6 sockets or to ip_ctloutput() otherwise.
 * At protocol level the routine handles UDP_ENCAP (only when IPsec support
 * is compiled in) and, for UDP-Lite sockets only, UDPLITE_SEND_CSCOV and
 * UDPLITE_RECV_CSCOV.  Everything else yields ENOPROTOOPT.
 *
 * The inpcb write lock is taken on entry.  Each visible path drops it
 * before returning, except the UDP_ENCAP paths, which call
 * UDPENCAP_PCBCTL() with the lock held.
 * NOTE(review): presumably UDPENCAP_PCBCTL() releases the lock -- confirm.
 *
 * Returns 0 or an errno value.
 */
int
udp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	struct udpcb *up;
	int isudplite, error, optval;

	error = 0;
	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	INP_WLOCK(inp);
	/* Not a UDP/UDP-Lite level option: hand it to the IP layer. */
	if (sopt->sopt_level != so->so_proto->pr_protocol) {
#ifdef INET6
		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
			INP_WUNLOCK(inp);
			error = ip6_ctloutput(so, sopt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			INP_WUNLOCK(inp);
			error = ip_ctloutput(so, sopt);
		}
#endif
		return (error);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#ifdef INET
		case UDP_ENCAP:
			if (!IPSEC_ENABLED(ipv4)) {
				INP_WUNLOCK(inp);
				return (ENOPROTOOPT);
			}
			error = UDPENCAP_PCBCTL(inp, sopt);
			break;
#endif /* INET */
#endif /* IPSEC */
		case UDPLITE_SEND_CSCOV:
		case UDPLITE_RECV_CSCOV:
			if (!isudplite) {
				INP_WUNLOCK(inp);
				error = ENOPROTOOPT;
				break;
			}
			/*
			 * The lock is dropped around sooptcopyin() and the
			 * pcb is fetched again afterwards.
			 */
			INP_WUNLOCK(inp);
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				break;
			inp = sotoinpcb(so);
			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
			INP_WLOCK(inp);
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
			/* Accept 0 or any value in the range 8..65535. */
			if ((optval != 0 && optval < 8) || (optval > 65535)) {
				INP_WUNLOCK(inp);
				error = EINVAL;
				break;
			}
			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
				up->u_txcslen = optval;
			else
				up->u_rxcslen = optval;
			INP_WUNLOCK(inp);
			break;
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#ifdef INET
		case UDP_ENCAP:
			if (!IPSEC_ENABLED(ipv4)) {
				INP_WUNLOCK(inp);
				return (ENOPROTOOPT);
			}
			error = UDPENCAP_PCBCTL(inp, sopt);
			break;
#endif /* INET */
#endif /* IPSEC */
		case UDPLITE_SEND_CSCOV:
		case UDPLITE_RECV_CSCOV:
			if (!isudplite) {
				INP_WUNLOCK(inp);
				error = ENOPROTOOPT;
				break;
			}
			/* Read the value under the lock, copy out without it. */
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
				optval = up->u_txcslen;
			else
				optval = up->u_rxcslen;
			INP_WUNLOCK(inp);
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
1089
1090 #ifdef INET
1091 #ifdef INET6
1092 /* The logic here is derived from ip6_setpktopt(). See comments there. */
1093 static int
udp_v4mapped_pktinfo(struct cmsghdr * cm,struct sockaddr_in * src,struct inpcb * inp,int flags)1094 udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
1095 struct inpcb *inp, int flags)
1096 {
1097 struct ifnet *ifp;
1098 struct in6_pktinfo *pktinfo;
1099 struct in_addr ia;
1100
1101 if ((flags & PRUS_IPV6) == 0)
1102 return (0);
1103
1104 if (cm->cmsg_level != IPPROTO_IPV6)
1105 return (0);
1106
1107 if (cm->cmsg_type != IPV6_2292PKTINFO &&
1108 cm->cmsg_type != IPV6_PKTINFO)
1109 return (0);
1110
1111 if (cm->cmsg_len !=
1112 CMSG_LEN(sizeof(struct in6_pktinfo)))
1113 return (EINVAL);
1114
1115 pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm);
1116 if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) &&
1117 !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr))
1118 return (EINVAL);
1119
1120 /* Validate the interface index if specified. */
1121 if (pktinfo->ipi6_ifindex > V_if_index)
1122 return (ENXIO);
1123
1124 ifp = NULL;
1125 if (pktinfo->ipi6_ifindex) {
1126 ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
1127 if (ifp == NULL)
1128 return (ENXIO);
1129 }
1130 if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
1131 ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1132 if (in_ifhasaddr(ifp, ia) == 0)
1133 return (EADDRNOTAVAIL);
1134 }
1135
1136 bzero(src, sizeof(*src));
1137 src->sin_family = AF_INET;
1138 src->sin_len = sizeof(*src);
1139 src->sin_port = inp->inp_lport;
1140 src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1141
1142 return (0);
1143 }
1144 #endif
1145
/*
 * Build and transmit one UDP or UDP-Lite datagram over IPv4.
 *
 * inp     - pcb of the sending socket.  It is locked in this function and
 *           unlocked again on every path that took the lock.
 * m       - payload mbuf chain.  Freed here on every error path; on success
 *           it is handed to ip_output().
 * addr    - optional destination, interpreted as a struct sockaddr_in.  When
 *           NULL the pcb's foreign address/port are used.
 * control - optional ancillary data.  Freed here whenever it is non-NULL.
 * td      - calling thread; its credentials are passed to the jail and pcb
 *           bind/connect helpers.
 * flags   - forwarded to udp_v4mapped_pktinfo().
 *
 * Returns 0 or an errno value; on the success path the value is whatever
 * ip_output() returned.
 */
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *control, struct thread *td, int flags)
{
	struct udpiphdr *ui;
	int len = m->m_pkthdr.len;
	struct in_addr faddr, laddr;
	struct cmsghdr *cm;
	struct inpcbinfo *pcbinfo;
	struct sockaddr_in *sin, src;
	struct epoch_tracker et;
	int cscov_partial = 0;
	int error = 0;
	int ipflags = 0;
	u_short fport, lport;
	u_char tos;
	uint8_t pr;
	uint16_t cscov = 0;
	uint32_t flowid = 0;
	uint8_t flowtype = M_HASHTYPE_NONE;

	/*
	 * Refuse payloads that would not fit in a single IP packet once the
	 * UDP/IP header is added.  No lock is held yet, so both mbufs are
	 * freed directly.
	 */
	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
		if (control)
			m_freem(control);
		m_freem(m);
		return (EMSGSIZE);
	}

	/*
	 * sin_family stays 0 unless a control message below supplies a source
	 * address; it is tested against AF_INET after the control loop.
	 */
	src.sin_family = 0;
	sin = (struct sockaddr_in *)addr;

	/*
	 * udp_output() may need to temporarily bind or connect the current
	 * inpcb.  As such, we don't know up front whether we will need the
	 * pcbinfo lock or not.  Do any work to decide what is needed up
	 * front before acquiring any locks.
	 *
	 * We will need network epoch in either case, to safely lookup into
	 * pcb hash.
	 */
	/*
	 * Write-lock the pcb when no destination was given or when it has
	 * neither a local address nor a local port; read-lock otherwise.
	 */
	if (sin == NULL ||
	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0))
		INP_WLOCK(inp);
	else
		INP_RLOCK(inp);
	NET_EPOCH_ENTER(et);
	/* Default TOS comes from the pcb; an IP_TOS message may override it. */
	tos = inp->inp_ip_tos;
	if (control != NULL) {
		/*
		 * XXX: Currently, we assume all the optional information is
		 * stored in a single mbuf.
		 */
		if (control->m_next) {
			m_freem(control);
			error = EINVAL;
			goto release;
		}
		/*
		 * Walk the control messages in place: each iteration advances
		 * the mbuf's data pointer and shrinks its length by the
		 * aligned size of the message just handled.
		 */
		for (; control->m_len > 0;
		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
			cm = mtod(control, struct cmsghdr *);
			/* The header must fit and the length must be sane. */
			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
			    || cm->cmsg_len > control->m_len) {
				error = EINVAL;
				break;
			}
#ifdef INET6
			/* May fill in src from an IPv6 pktinfo message. */
			error = udp_v4mapped_pktinfo(cm, &src, inp, flags);
			if (error != 0)
				break;
#endif
			/* Messages for other levels are skipped. */
			if (cm->cmsg_level != IPPROTO_IP)
				continue;

			switch (cm->cmsg_type) {
			case IP_SENDSRCADDR:
				if (cm->cmsg_len !=
				    CMSG_LEN(sizeof(struct in_addr))) {
					error = EINVAL;
					break;
				}
				bzero(&src, sizeof(src));
				src.sin_family = AF_INET;
				src.sin_len = sizeof(src);
				src.sin_port = inp->inp_lport;
				src.sin_addr =
				    *(struct in_addr *)CMSG_DATA(cm);
				break;

			case IP_TOS:
				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
					error = EINVAL;
					break;
				}
				tos = *(u_char *)CMSG_DATA(cm);
				break;

			case IP_FLOWID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				flowid = *(uint32_t *) CMSG_DATA(cm);
				break;

			case IP_FLOWTYPE:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/*
				 * NOTE(review): flowtype is a uint8_t, so the
				 * 32-bit value read here is narrowed on
				 * assignment.
				 */
				flowtype = *(uint32_t *) CMSG_DATA(cm);
				break;

#ifdef RSS
			case IP_RSSBUCKETID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/* This is just a placeholder for now */
				break;
#endif	/* RSS */
			default:
				error = ENOPROTOOPT;
				break;
			}
			/* A break inside the switch only leaves the switch. */
			if (error)
				break;
		}
		/* The control mbuf is consumed whether or not parsing failed. */
		m_freem(control);
		control = NULL;
	}
	if (error)
		goto release;

	pr = inp->inp_socket->so_proto->pr_protocol;
	pcbinfo = udp_get_inpcbinfo(pr);

	/*
	 * If the IP_SENDSRCADDR control message was specified, override the
	 * source address for this datagram.  Its use is invalidated if the
	 * address thus specified is incomplete or clobbers other inpcbs.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
	if (src.sin_family == AF_INET) {
		INP_HASH_LOCK_ASSERT(pcbinfo);
		/*
		 * A source override needs an already-bound port and at least
		 * one concrete address (the pcb's or the one supplied).
		 */
		if ((lport == 0) ||
		    (laddr.s_addr == INADDR_ANY &&
		     src.sin_addr.s_addr == INADDR_ANY)) {
			error = EINVAL;
			goto release;
		}
		/* Results land in the locals laddr/lport, not in the pcb. */
		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
		    &laddr.s_addr, &lport, td->td_ucred);
		if (error)
			goto release;
	}

	/*
	 * If a UDP socket has been connected, then a local address/port will
	 * have been selected and bound.
	 *
	 * If a UDP socket has not been connected to, then an explicit
	 * destination address must be used, in which case a local
	 * address/port may not have been selected and bound.
	 */
	if (sin != NULL) {
		INP_LOCK_ASSERT(inp);
		/* An explicit destination is refused on a connected socket. */
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			error = EISCONN;
			goto release;
		}

		/*
		 * Jail may rewrite the destination address, so let it do
		 * that before we use it.
		 */
		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
		if (error)
			goto release;

		/*
		 * If a local address or port hasn't yet been selected, or if
		 * the destination address needs to be rewritten due to using
		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
		 * to do the heavy lifting.  Once a port is selected, we
		 * commit the binding back to the socket; we also commit the
		 * binding of the address if in jail.
		 *
		 * If we already have a valid binding and we're not
		 * requesting a destination address rewrite, use a fast path.
		 */
		if (inp->inp_laddr.s_addr == INADDR_ANY ||
		    inp->inp_lport == 0 ||
		    sin->sin_addr.s_addr == INADDR_ANY ||
		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
			INP_HASH_LOCK_ASSERT(pcbinfo);
			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
			    &lport, &faddr.s_addr, &fport, NULL,
			    td->td_ucred);
			if (error)
				goto release;

			/*
			 * XXXRW: Why not commit the port if the address is
			 * !INADDR_ANY?
			 */
			/* Commit the local port if newly assigned. */
			if (inp->inp_laddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == 0) {
				INP_WLOCK_ASSERT(inp);
				/*
				 * Remember addr if jailed, to prevent
				 * rebinding.
				 */
				if (prison_flag(td->td_ucred, PR_IP4))
					inp->inp_laddr = laddr;
				inp->inp_lport = lport;
				INP_HASH_WLOCK(pcbinfo);
				error = in_pcbinshash(inp);
				INP_HASH_WUNLOCK(pcbinfo);
				if (error != 0) {
					/*
					 * Undo the port commit; any insertion
					 * failure is reported as EAGAIN.
					 */
					inp->inp_lport = 0;
					error = EAGAIN;
					goto release;
				}
				inp->inp_flags |= INP_ANONPORT;
			}
		} else {
			/* Fast path: use the caller's destination as given. */
			faddr = sin->sin_addr;
			fport = sin->sin_port;
		}
	} else {
		/* No destination supplied: the socket must be connected. */
		INP_LOCK_ASSERT(inp);
		faddr = inp->inp_faddr;
		fport = inp->inp_fport;
		if (faddr.s_addr == INADDR_ANY) {
			error = ENOTCONN;
			goto release;
		}
	}

	/*
	 * Calculate data length and get a mbuf for UDP, IP, and possible
	 * link-layer headers.  Immediately slide the data pointer back
	 * forward since we won't use that space at this layer.
	 */
	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
	if (m == NULL) {
		error = ENOBUFS;
		goto release;
	}
	m->m_data += max_linkhdr;
	m->m_len -= max_linkhdr;
	m->m_pkthdr.len -= max_linkhdr;

	/*
	 * Fill in mbuf with extended UDP header and addresses and length put
	 * into network format.
	 */
	ui = mtod(m, struct udpiphdr *);
	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
	ui->ui_v = IPVERSION << 4;
	ui->ui_pr = pr;
	ui->ui_src = laddr;
	ui->ui_dst = faddr;
	ui->ui_sport = lport;
	ui->ui_dport = fport;
	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
	if (pr == IPPROTO_UDPLITE) {
		struct udpcb *up;
		uint16_t plen;

		up = intoudpcb(inp);
		cscov = up->u_txcslen;
		plen = (u_short)len + sizeof(struct udphdr);
		/*
		 * A configured coverage that is not smaller than the datagram
		 * is treated as full coverage.
		 */
		if (cscov >= plen)
			cscov = 0;
		ui->ui_len = htons(plen);
		/* ui_ulen, set above, is overwritten with the coverage. */
		ui->ui_ulen = htons(cscov);
		/*
		 * For UDP-Lite, checksum coverage length of zero means
		 * the entire UDPLite packet is covered by the checksum.
		 */
		cscov_partial = (cscov == 0) ? 0 : 1;
	}

	/*
	 * Set the Don't Fragment bit in the IP header.
	 */
	if (inp->inp_flags & INP_DONTFRAG) {
		struct ip *ip;

		ip = (struct ip *)&ui->ui_i;
		ip->ip_off |= htons(IP_DF);
	}

	/* Map socket and pcb options onto ip_output() flags. */
	if (inp->inp_socket->so_options & SO_DONTROUTE)
		ipflags |= IP_ROUTETOIF;
	if (inp->inp_socket->so_options & SO_BROADCAST)
		ipflags |= IP_ALLOWBROADCAST;
	if (inp->inp_flags & INP_ONESBCAST)
		ipflags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	/*
	 * Set up checksum and output datagram.
	 */
	ui->ui_sum = 0;
	if (pr == IPPROTO_UDPLITE) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		/*
		 * UDP-Lite checksums are computed here with in_cksum(), over
		 * either the requested coverage or the whole datagram.  A
		 * computed value of zero is stored as 0xffff.
		 */
		if (cscov_partial) {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
				ui->ui_sum = 0xffff;
		} else {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
				ui->ui_sum = 0xffff;
		}
	} else if (V_udp_cksum) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		/*
		 * Plain UDP: store only the pseudo-header sum and flag the
		 * mbuf with CSUM_UDP plus the offset of the checksum field.
		 */
		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
		    htons((u_short)len + sizeof(struct udphdr) + pr));
		m->m_pkthdr.csum_flags = CSUM_UDP;
		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
	}
	/* Finish the IP header fields that overlay the udpiphdr. */
	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
	((struct ip *)ui)->ip_tos = tos;		/* XXX */
	UDPSTAT_INC(udps_opackets);

	/*
	 * Setup flowid / RSS information for outbound socket.
	 *
	 * Once the UDP code decides to set a flowid some other way,
	 * this allows the flowid to be overridden by userland.
	 */
	if (flowtype != M_HASHTYPE_NONE) {
		m->m_pkthdr.flowid = flowid;
		M_HASHTYPE_SET(m, flowtype);
	}
#if defined(ROUTE_MPATH) || defined(RSS)
	else if (CALC_FLOWID_OUTBOUND_SENDTO) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_packet_hash(laddr, faddr,
		    lport, fport, pr, &hash_type);
		m->m_pkthdr.flowid = hash_val;
		M_HASHTYPE_SET(m, hash_type);
	}

	/*
	 * Don't override with the inp cached flowid value.
	 *
	 * Depending upon the kind of send being done, the inp
	 * flowid/flowtype values may actually not be appropriate
	 * for this particular socket send.
	 *
	 * We should either leave the flowid at zero (which is what is
	 * currently done) or set it to some software generated
	 * hash value based on the packet contents.
	 */
	ipflags |= IP_NODEFAULTFLOWID;
#endif /* ROUTE_MPATH || RSS */

	if (pr == IPPROTO_UDPLITE)
		UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	else
		UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	/* The pcb's cached route is only passed when the pcb is write-locked. */
	error = ip_output(m, inp->inp_options,
	    INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags,
	    inp->inp_moptions, inp);
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	return (error);

release:
	/* Error exit: drop the pcb lock, leave the epoch, free the datagram. */
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	m_freem(m);
	return (error);
}
1533
1534 static void
udp_abort(struct socket * so)1535 udp_abort(struct socket *so)
1536 {
1537 struct inpcb *inp;
1538 struct inpcbinfo *pcbinfo;
1539
1540 pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1541 inp = sotoinpcb(so);
1542 KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1543 INP_WLOCK(inp);
1544 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1545 INP_HASH_WLOCK(pcbinfo);
1546 in_pcbdisconnect(inp);
1547 inp->inp_laddr.s_addr = INADDR_ANY;
1548 INP_HASH_WUNLOCK(pcbinfo);
1549 soisdisconnected(so);
1550 }
1551 INP_WUNLOCK(inp);
1552 }
1553
1554 static int
udp_attach(struct socket * so,int proto,struct thread * td)1555 udp_attach(struct socket *so, int proto, struct thread *td)
1556 {
1557 static uint32_t udp_flowid;
1558 struct inpcb *inp;
1559 struct inpcbinfo *pcbinfo;
1560 int error;
1561
1562 pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1563 inp = sotoinpcb(so);
1564 KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1565 error = soreserve(so, udp_sendspace, udp_recvspace);
1566 if (error)
1567 return (error);
1568 INP_INFO_WLOCK(pcbinfo);
1569 error = in_pcballoc(so, pcbinfo);
1570 if (error) {
1571 INP_INFO_WUNLOCK(pcbinfo);
1572 return (error);
1573 }
1574
1575 inp = sotoinpcb(so);
1576 inp->inp_vflag |= INP_IPV4;
1577 inp->inp_ip_ttl = V_ip_defttl;
1578 inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
1579 inp->inp_flowtype = M_HASHTYPE_OPAQUE;
1580
1581 error = udp_newudpcb(inp);
1582 if (error) {
1583 in_pcbdetach(inp);
1584 in_pcbfree(inp);
1585 INP_INFO_WUNLOCK(pcbinfo);
1586 return (error);
1587 }
1588
1589 INP_WUNLOCK(inp);
1590 INP_INFO_WUNLOCK(pcbinfo);
1591 return (0);
1592 }
1593 #endif /* INET */
1594
1595 int
udp_set_kernel_tunneling(struct socket * so,udp_tun_func_t f,udp_tun_icmp_t i,void * ctx)1596 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
1597 {
1598 struct inpcb *inp;
1599 struct udpcb *up;
1600
1601 KASSERT(so->so_type == SOCK_DGRAM,
1602 ("udp_set_kernel_tunneling: !dgram"));
1603 inp = sotoinpcb(so);
1604 KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1605 INP_WLOCK(inp);
1606 up = intoudpcb(inp);
1607 if ((f != NULL || i != NULL) && ((up->u_tun_func != NULL) ||
1608 (up->u_icmp_func != NULL))) {
1609 INP_WUNLOCK(inp);
1610 return (EBUSY);
1611 }
1612 up->u_tun_func = f;
1613 up->u_icmp_func = i;
1614 up->u_tun_ctx = ctx;
1615 INP_WUNLOCK(inp);
1616 return (0);
1617 }
1618
1619 #ifdef INET
1620 static int
udp_bind(struct socket * so,struct sockaddr * nam,struct thread * td)1621 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1622 {
1623 struct inpcb *inp;
1624 struct inpcbinfo *pcbinfo;
1625 struct sockaddr_in *sinp;
1626 int error;
1627
1628 pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1629 inp = sotoinpcb(so);
1630 KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1631
1632 sinp = (struct sockaddr_in *)nam;
1633 if (nam->sa_family != AF_INET) {
1634 /*
1635 * Preserve compatibility with old programs.
1636 */
1637 if (nam->sa_family != AF_UNSPEC ||
1638 nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
1639 sinp->sin_addr.s_addr != INADDR_ANY)
1640 return (EAFNOSUPPORT);
1641 nam->sa_family = AF_INET;
1642 }
1643 if (nam->sa_len != sizeof(struct sockaddr_in))
1644 return (EINVAL);
1645
1646 INP_WLOCK(inp);
1647 INP_HASH_WLOCK(pcbinfo);
1648 error = in_pcbbind(inp, nam, td->td_ucred);
1649 INP_HASH_WUNLOCK(pcbinfo);
1650 INP_WUNLOCK(inp);
1651 return (error);
1652 }
1653
1654 static void
udp_close(struct socket * so)1655 udp_close(struct socket *so)
1656 {
1657 struct inpcb *inp;
1658 struct inpcbinfo *pcbinfo;
1659
1660 pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1661 inp = sotoinpcb(so);
1662 KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1663 INP_WLOCK(inp);
1664 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1665 INP_HASH_WLOCK(pcbinfo);
1666 in_pcbdisconnect(inp);
1667 inp->inp_laddr.s_addr = INADDR_ANY;
1668 INP_HASH_WUNLOCK(pcbinfo);
1669 soisdisconnected(so);
1670 }
1671 INP_WUNLOCK(inp);
1672 }
1673
1674 static int
udp_connect(struct socket * so,struct sockaddr * nam,struct thread * td)1675 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1676 {
1677 struct epoch_tracker et;
1678 struct inpcb *inp;
1679 struct inpcbinfo *pcbinfo;
1680 struct sockaddr_in *sin;
1681 int error;
1682
1683 pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1684 inp = sotoinpcb(so);
1685 KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1686
1687 sin = (struct sockaddr_in *)nam;
1688 if (sin->sin_family != AF_INET)
1689 return (EAFNOSUPPORT);
1690 if (sin->sin_len != sizeof(*sin))
1691 return (EINVAL);
1692
1693 INP_WLOCK(inp);
1694 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1695 INP_WUNLOCK(inp);
1696 return (EISCONN);
1697 }
1698 error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1699 if (error != 0) {
1700 INP_WUNLOCK(inp);
1701 return (error);
1702 }
1703 NET_EPOCH_ENTER(et);
1704 INP_HASH_WLOCK(pcbinfo);
1705 error = in_pcbconnect(inp, nam, td->td_ucred);
1706 INP_HASH_WUNLOCK(pcbinfo);
1707 NET_EPOCH_EXIT(et);
1708 if (error == 0)
1709 soisconnected(so);
1710 INP_WUNLOCK(inp);
1711 return (error);
1712 }
1713
1714 static void
udp_detach(struct socket * so)1715 udp_detach(struct socket *so)
1716 {
1717 struct inpcb *inp;
1718 struct inpcbinfo *pcbinfo;
1719 struct udpcb *up;
1720
1721 pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1722 inp = sotoinpcb(so);
1723 KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1724 KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1725 ("udp_detach: not disconnected"));
1726 INP_INFO_WLOCK(pcbinfo);
1727 INP_WLOCK(inp);
1728 up = intoudpcb(inp);
1729 KASSERT(up != NULL, ("%s: up == NULL", __func__));
1730 inp->inp_ppcb = NULL;
1731 in_pcbdetach(inp);
1732 in_pcbfree(inp);
1733 INP_INFO_WUNLOCK(pcbinfo);
1734 udp_discardcb(up);
1735 }
1736
1737 static int
udp_disconnect(struct socket * so)1738 udp_disconnect(struct socket *so)
1739 {
1740 struct inpcb *inp;
1741 struct inpcbinfo *pcbinfo;
1742
1743 pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1744 inp = sotoinpcb(so);
1745 KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1746 INP_WLOCK(inp);
1747 if (inp->inp_faddr.s_addr == INADDR_ANY) {
1748 INP_WUNLOCK(inp);
1749 return (ENOTCONN);
1750 }
1751 INP_HASH_WLOCK(pcbinfo);
1752 in_pcbdisconnect(inp);
1753 inp->inp_laddr.s_addr = INADDR_ANY;
1754 INP_HASH_WUNLOCK(pcbinfo);
1755 SOCK_LOCK(so);
1756 so->so_state &= ~SS_ISCONNECTED; /* XXX */
1757 SOCK_UNLOCK(so);
1758 INP_WUNLOCK(inp);
1759 return (0);
1760 }
1761
1762 static int
udp_send(struct socket * so,int flags,struct mbuf * m,struct sockaddr * addr,struct mbuf * control,struct thread * td)1763 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1764 struct mbuf *control, struct thread *td)
1765 {
1766 struct inpcb *inp;
1767 int error;
1768
1769 inp = sotoinpcb(so);
1770 KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1771
1772 if (addr != NULL) {
1773 error = 0;
1774 if (addr->sa_family != AF_INET)
1775 error = EAFNOSUPPORT;
1776 else if (addr->sa_len != sizeof(struct sockaddr_in))
1777 error = EINVAL;
1778 if (__predict_false(error != 0)) {
1779 m_freem(control);
1780 m_freem(m);
1781 return (error);
1782 }
1783 }
1784 return (udp_output(inp, m, addr, control, td, flags));
1785 }
1786 #endif /* INET */
1787
1788 int
udp_shutdown(struct socket * so)1789 udp_shutdown(struct socket *so)
1790 {
1791 struct inpcb *inp;
1792
1793 inp = sotoinpcb(so);
1794 KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1795 INP_WLOCK(inp);
1796 socantsendmore(so);
1797 INP_WUNLOCK(inp);
1798 return (0);
1799 }
1800
1801 #ifdef INET
1802 struct pr_usrreqs udp_usrreqs = {
1803 .pru_abort = udp_abort,
1804 .pru_attach = udp_attach,
1805 .pru_bind = udp_bind,
1806 .pru_connect = udp_connect,
1807 .pru_control = in_control,
1808 .pru_detach = udp_detach,
1809 .pru_disconnect = udp_disconnect,
1810 .pru_peeraddr = in_getpeeraddr,
1811 .pru_send = udp_send,
1812 .pru_soreceive = soreceive_dgram,
1813 .pru_sosend = sosend_dgram,
1814 .pru_shutdown = udp_shutdown,
1815 .pru_sockaddr = in_getsockaddr,
1816 .pru_sosetlabel = in_pcbsosetlabel,
1817 .pru_close = udp_close,
1818 };
1819 #endif /* INET */
1820