1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2006-2007 Robert N. M. Watson 5 * Copyright (c) 2010-2011 Juniper Networks, Inc. 6 * All rights reserved. 7 * 8 * Portions of this software were developed by Robert N. M. Watson under 9 * contract to Juniper Networks, Inc. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 36 */ 37 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 41 #include "opt_ddb.h" 42 #include "opt_inet.h" 43 #include "opt_inet6.h" 44 #include "opt_tcpdebug.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/limits.h> 49 #include <sys/malloc.h> 50 #include <sys/refcount.h> 51 #include <sys/kernel.h> 52 #include <sys/sysctl.h> 53 #include <sys/mbuf.h> 54 #ifdef INET6 55 #include <sys/domain.h> 56 #endif /* INET6 */ 57 #include <sys/socket.h> 58 #include <sys/socketvar.h> 59 #include <sys/protosw.h> 60 #include <sys/proc.h> 61 #include <sys/jail.h> 62 63 #ifdef DDB 64 #include <ddb/ddb.h> 65 #endif 66 67 #include <net/if.h> 68 #include <net/if_var.h> 69 #include <net/route.h> 70 #include <net/vnet.h> 71 72 #include <netinet/in.h> 73 #include <netinet/in_kdtrace.h> 74 #include <netinet/in_pcb.h> 75 #include <netinet/in_systm.h> 76 #include <netinet/in_var.h> 77 #include <netinet/ip_var.h> 78 #ifdef INET6 79 #include <netinet/ip6.h> 80 #include <netinet6/in6_pcb.h> 81 #include <netinet6/ip6_var.h> 82 #include <netinet6/scope6_var.h> 83 #endif 84 #ifdef TCP_RFC7413 85 #include <netinet/tcp_fastopen.h> 86 #endif 87 #include <netinet/tcp.h> 88 #include <netinet/tcp_fsm.h> 89 #include <netinet/tcp_seq.h> 90 #include <netinet/tcp_timer.h> 91 #include <netinet/tcp_var.h> 92 #include <netinet/tcpip.h> 93 #include <netinet/cc/cc.h> 94 #ifdef TCPPCAP 95 #include <netinet/tcp_pcap.h> 96 #endif 97 #ifdef TCPDEBUG 98 #include <netinet/tcp_debug.h> 99 #endif 100 #ifdef TCP_OFFLOAD 101 #include <netinet/tcp_offload.h> 102 #endif 103 104 /* 105 * TCP protocol interface to socket abstraction. 106 */ 107 static int tcp_attach(struct socket *); 108 #ifdef INET 109 static int tcp_connect(struct tcpcb *, struct sockaddr *, 110 struct thread *td); 111 #endif /* INET */ 112 #ifdef INET6 113 static int tcp6_connect(struct tcpcb *, struct sockaddr *, 114 struct thread *td); 115 #endif /* INET6 */ 116 static void tcp_disconnect(struct tcpcb *); 117 static void tcp_usrclosed(struct tcpcb *); 118 static void tcp_fill_info(struct tcpcb *, struct tcp_info *); 119 120 #ifdef TCPDEBUG 121 #define TCPDEBUG0 int ostate = 0 122 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 123 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 124 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 125 #else 126 #define TCPDEBUG0 127 #define TCPDEBUG1() 128 #define TCPDEBUG2(req) 129 #endif 130 131 /* 132 * TCP attaches to socket via pru_attach(), reserving space, 133 * and an internet control block. 134 */ 135 static int 136 tcp_usr_attach(struct socket *so, int proto, struct thread *td) 137 { 138 struct inpcb *inp; 139 struct tcpcb *tp = NULL; 140 int error; 141 TCPDEBUG0; 142 143 inp = sotoinpcb(so); 144 KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); 145 TCPDEBUG1(); 146 147 error = tcp_attach(so); 148 if (error) 149 goto out; 150 151 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 152 so->so_linger = TCP_LINGERTIME; 153 154 inp = sotoinpcb(so); 155 tp = intotcpcb(inp); 156 out: 157 TCPDEBUG2(PRU_ATTACH); 158 TCP_PROBE2(debug__user, tp, PRU_ATTACH); 159 return error; 160 } 161 162 /* 163 * tcp_detach is called when the socket layer loses its final reference 164 * to the socket, be it a file descriptor reference, a reference from TCP, 165 * etc. At this point, there is only one case in which we will keep around 166 * inpcb state: time wait. 167 * 168 * This function can probably be re-absorbed back into tcp_usr_detach() now 169 * that there is a single detach path. 170 */ 171 static void 172 tcp_detach(struct socket *so, struct inpcb *inp) 173 { 174 struct tcpcb *tp; 175 176 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 177 INP_WLOCK_ASSERT(inp); 178 179 KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); 180 KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); 181 182 tp = intotcpcb(inp); 183 184 if (inp->inp_flags & INP_TIMEWAIT) { 185 /* 186 * There are two cases to handle: one in which the time wait 187 * state is being discarded (INP_DROPPED), and one in which 188 * this connection will remain in timewait. In the former, 189 * it is time to discard all state (except tcptw, which has 190 * already been discarded by the timewait close code, which 191 * should be further up the call stack somewhere). In the 192 * latter case, we detach from the socket, but leave the pcb 193 * present until timewait ends. 194 * 195 * XXXRW: Would it be cleaner to free the tcptw here? 196 * 197 * Astute question indeed, from twtcp perspective there are 198 * three cases to consider: 199 * 200 * #1 tcp_detach is called at tcptw creation time by 201 * tcp_twstart, then do not discard the newly created tcptw 202 * and leave inpcb present until timewait ends 203 * #2 tcp_detach is called at timewait end (or reuse) by 204 * tcp_twclose, then the tcptw has already been discarded 205 * (or reused) and inpcb is freed here 206 * #3 tcp_detach is called() after timewait ends (or reuse) 207 * (e.g. by soclose), then tcptw has already been discarded 208 * (or reused) and inpcb is freed here 209 * 210 * In all three cases the tcptw should not be freed here. 211 */ 212 if (inp->inp_flags & INP_DROPPED) { 213 KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " 214 "INP_DROPPED && tp != NULL")); 215 in_pcbdetach(inp); 216 in_pcbfree(inp); 217 } else { 218 in_pcbdetach(inp); 219 INP_WUNLOCK(inp); 220 } 221 } else { 222 /* 223 * If the connection is not in timewait, we consider two 224 * two conditions: one in which no further processing is 225 * necessary (dropped || embryonic), and one in which TCP is 226 * not yet done, but no longer requires the socket, so the 227 * pcb will persist for the time being. 228 * 229 * XXXRW: Does the second case still occur? 230 */ 231 if (inp->inp_flags & INP_DROPPED || 232 tp->t_state < TCPS_SYN_SENT) { 233 tcp_discardcb(tp); 234 in_pcbdetach(inp); 235 in_pcbfree(inp); 236 } else { 237 in_pcbdetach(inp); 238 INP_WUNLOCK(inp); 239 } 240 } 241 } 242 243 /* 244 * pru_detach() detaches the TCP protocol from the socket. 245 * If the protocol state is non-embryonic, then can't 246 * do this directly: have to initiate a pru_disconnect(), 247 * which may finish later; embryonic TCB's can just 248 * be discarded here. 249 */ 250 static void 251 tcp_usr_detach(struct socket *so) 252 { 253 struct inpcb *inp; 254 int rlock = 0; 255 256 inp = sotoinpcb(so); 257 KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); 258 if (!INP_INFO_WLOCKED(&V_tcbinfo)) { 259 INP_INFO_RLOCK(&V_tcbinfo); 260 rlock = 1; 261 } 262 INP_WLOCK(inp); 263 KASSERT(inp->inp_socket != NULL, 264 ("tcp_usr_detach: inp_socket == NULL")); 265 tcp_detach(so, inp); 266 if (rlock) 267 INP_INFO_RUNLOCK(&V_tcbinfo); 268 } 269 270 #ifdef LVS_TCPOPT_TOA 271 272 #ifndef TCPOPT_TOA 273 #define TCPOPT_TOA 254 274 #define TCPOLEN_TOA 8 275 #endif 276 277 struct toa_data { 278 uint8_t opcode; 279 uint8_t opsize; 280 uint16_t port; 281 uint32_t ip; 282 }; 283 284 static int 285 toa_getpeeraddr(struct socket *so, struct sockaddr **nam) 286 { 287 int ret; 288 struct toa_data *toa; 289 struct sockaddr_in *sin; 290 291 ret = in_getpeeraddr(so, nam); 292 if (ret) { 293 return ret; 294 } 295 296 toa = (struct toa_data *)so->so_toa; 297 if (toa->opcode == TCPOPT_TOA && toa->opsize == TCPOLEN_TOA) { 298 sin = (struct sockaddr_in *)(*nam); 299 300 sin->sin_addr.s_addr = toa->ip; 301 sin->sin_port = toa->port; 302 } 303 304 return 0; 305 } 306 #endif 307 308 #ifdef INET 309 /* 310 * Give the socket an address. 311 */ 312 static int 313 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 314 { 315 int error = 0; 316 struct inpcb *inp; 317 struct tcpcb *tp = NULL; 318 struct sockaddr_in *sinp; 319 320 sinp = (struct sockaddr_in *)nam; 321 if (nam->sa_len != sizeof (*sinp)) 322 return (EINVAL); 323 /* 324 * Must check for multicast addresses and disallow binding 325 * to them. 326 */ 327 if (sinp->sin_family == AF_INET && 328 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 329 return (EAFNOSUPPORT); 330 331 TCPDEBUG0; 332 inp = sotoinpcb(so); 333 KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); 334 INP_WLOCK(inp); 335 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 336 error = EINVAL; 337 goto out; 338 } 339 tp = intotcpcb(inp); 340 TCPDEBUG1(); 341 INP_HASH_WLOCK(&V_tcbinfo); 342 error = in_pcbbind(inp, nam, td->td_ucred); 343 INP_HASH_WUNLOCK(&V_tcbinfo); 344 out: 345 TCPDEBUG2(PRU_BIND); 346 TCP_PROBE2(debug__user, tp, PRU_BIND); 347 INP_WUNLOCK(inp); 348 349 return (error); 350 } 351 #endif /* INET */ 352 353 #ifdef INET6 354 static int 355 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 356 { 357 int error = 0; 358 struct inpcb *inp; 359 struct tcpcb *tp = NULL; 360 struct sockaddr_in6 *sin6p; 361 u_char vflagsav; 362 363 sin6p = (struct sockaddr_in6 *)nam; 364 if (nam->sa_len != sizeof (*sin6p)) 365 return (EINVAL); 366 /* 367 * Must check for multicast addresses and disallow binding 368 * to them. 369 */ 370 if (sin6p->sin6_family == AF_INET6 && 371 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 372 return (EAFNOSUPPORT); 373 374 TCPDEBUG0; 375 inp = sotoinpcb(so); 376 KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); 377 INP_WLOCK(inp); 378 vflagsav = inp->inp_vflag; 379 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 380 error = EINVAL; 381 goto out; 382 } 383 tp = intotcpcb(inp); 384 TCPDEBUG1(); 385 INP_HASH_WLOCK(&V_tcbinfo); 386 inp->inp_vflag &= ~INP_IPV4; 387 inp->inp_vflag |= INP_IPV6; 388 #ifdef INET 389 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 390 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 391 inp->inp_vflag |= INP_IPV4; 392 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 393 struct sockaddr_in sin; 394 395 in6_sin6_2_sin(&sin, sin6p); 396 inp->inp_vflag |= INP_IPV4; 397 inp->inp_vflag &= ~INP_IPV6; 398 error = in_pcbbind(inp, (struct sockaddr *)&sin, 399 td->td_ucred); 400 INP_HASH_WUNLOCK(&V_tcbinfo); 401 goto out; 402 } 403 } 404 #endif 405 error = in6_pcbbind(inp, nam, td->td_ucred); 406 INP_HASH_WUNLOCK(&V_tcbinfo); 407 out: 408 if (error != 0) 409 inp->inp_vflag = vflagsav; 410 TCPDEBUG2(PRU_BIND); 411 TCP_PROBE2(debug__user, tp, PRU_BIND); 412 INP_WUNLOCK(inp); 413 return (error); 414 } 415 #endif /* INET6 */ 416 417 #ifdef INET 418 /* 419 * Prepare to accept connections. 420 */ 421 static int 422 tcp_usr_listen(struct socket *so, int backlog, struct thread *td) 423 { 424 int error = 0; 425 struct inpcb *inp; 426 struct tcpcb *tp = NULL; 427 428 TCPDEBUG0; 429 inp = sotoinpcb(so); 430 KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); 431 INP_WLOCK(inp); 432 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 433 error = EINVAL; 434 goto out; 435 } 436 tp = intotcpcb(inp); 437 TCPDEBUG1(); 438 SOCK_LOCK(so); 439 error = solisten_proto_check(so); 440 INP_HASH_WLOCK(&V_tcbinfo); 441 if (error == 0 && inp->inp_lport == 0) 442 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 443 INP_HASH_WUNLOCK(&V_tcbinfo); 444 if (error == 0) { 445 tcp_state_change(tp, TCPS_LISTEN); 446 solisten_proto(so, backlog); 447 #ifdef TCP_OFFLOAD 448 if ((so->so_options & SO_NO_OFFLOAD) == 0) 449 tcp_offload_listen_start(tp); 450 #endif 451 } 452 SOCK_UNLOCK(so); 453 454 #ifdef TCP_RFC7413 455 if (tp->t_flags & TF_FASTOPEN) 456 tp->t_tfo_pending = tcp_fastopen_alloc_counter(); 457 #endif 458 out: 459 TCPDEBUG2(PRU_LISTEN); 460 TCP_PROBE2(debug__user, tp, PRU_LISTEN); 461 INP_WUNLOCK(inp); 462 return (error); 463 } 464 #endif /* INET */ 465 466 #ifdef INET6 467 static int 468 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) 469 { 470 int error = 0; 471 struct inpcb *inp; 472 struct tcpcb *tp = NULL; 473 u_char vflagsav; 474 475 TCPDEBUG0; 476 inp = sotoinpcb(so); 477 KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); 478 INP_WLOCK(inp); 479 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 480 error = EINVAL; 481 goto out; 482 } 483 vflagsav = inp->inp_vflag; 484 tp = intotcpcb(inp); 485 TCPDEBUG1(); 486 SOCK_LOCK(so); 487 error = solisten_proto_check(so); 488 INP_HASH_WLOCK(&V_tcbinfo); 489 if (error == 0 && inp->inp_lport == 0) { 490 inp->inp_vflag &= ~INP_IPV4; 491 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) 492 inp->inp_vflag |= INP_IPV4; 493 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 494 } 495 INP_HASH_WUNLOCK(&V_tcbinfo); 496 if (error == 0) { 497 tcp_state_change(tp, TCPS_LISTEN); 498 solisten_proto(so, backlog); 499 #ifdef TCP_OFFLOAD 500 if ((so->so_options & SO_NO_OFFLOAD) == 0) 501 tcp_offload_listen_start(tp); 502 #endif 503 } 504 SOCK_UNLOCK(so); 505 506 #ifdef TCP_RFC7413 507 if (tp->t_flags & TF_FASTOPEN) 508 tp->t_tfo_pending = tcp_fastopen_alloc_counter(); 509 #endif 510 if (error != 0) 511 inp->inp_vflag = vflagsav; 512 513 out: 514 TCPDEBUG2(PRU_LISTEN); 515 TCP_PROBE2(debug__user, tp, PRU_LISTEN); 516 INP_WUNLOCK(inp); 517 return (error); 518 } 519 #endif /* INET6 */ 520 521 #ifdef INET 522 /* 523 * Initiate connection to peer. 524 * Create a template for use in transmissions on this connection. 525 * Enter SYN_SENT state, and mark socket as connecting. 526 * Start keep-alive timer, and seed output sequence space. 527 * Send initial segment on connection. 528 */ 529 static int 530 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 531 { 532 int error = 0; 533 struct inpcb *inp; 534 struct tcpcb *tp = NULL; 535 struct sockaddr_in *sinp; 536 537 sinp = (struct sockaddr_in *)nam; 538 if (nam->sa_len != sizeof (*sinp)) 539 return (EINVAL); 540 /* 541 * Must disallow TCP ``connections'' to multicast addresses. 542 */ 543 if (sinp->sin_family == AF_INET 544 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 545 return (EAFNOSUPPORT); 546 if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) 547 return (error); 548 549 TCPDEBUG0; 550 inp = sotoinpcb(so); 551 KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); 552 INP_WLOCK(inp); 553 if (inp->inp_flags & INP_TIMEWAIT) { 554 error = EADDRINUSE; 555 goto out; 556 } 557 if (inp->inp_flags & INP_DROPPED) { 558 error = ECONNREFUSED; 559 goto out; 560 } 561 tp = intotcpcb(inp); 562 TCPDEBUG1(); 563 if ((error = tcp_connect(tp, nam, td)) != 0) 564 goto out; 565 #ifdef TCP_OFFLOAD 566 if (registered_toedevs > 0 && 567 (so->so_options & SO_NO_OFFLOAD) == 0 && 568 (error = tcp_offload_connect(so, nam)) == 0) 569 goto out; 570 #endif 571 tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); 572 error = tp->t_fb->tfb_tcp_output(tp); 573 out: 574 TCPDEBUG2(PRU_CONNECT); 575 TCP_PROBE2(debug__user, tp, PRU_CONNECT); 576 INP_WUNLOCK(inp); 577 return (error); 578 } 579 #endif /* INET */ 580 581 #ifdef INET6 582 static int 583 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 584 { 585 int error = 0; 586 struct inpcb *inp; 587 struct tcpcb *tp = NULL; 588 struct sockaddr_in6 *sin6p; 589 u_int8_t incflagsav; 590 u_char vflagsav; 591 592 TCPDEBUG0; 593 594 sin6p = (struct sockaddr_in6 *)nam; 595 if (nam->sa_len != sizeof (*sin6p)) 596 return (EINVAL); 597 /* 598 * Must disallow TCP ``connections'' to multicast addresses. 599 */ 600 if (sin6p->sin6_family == AF_INET6 601 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 602 return (EAFNOSUPPORT); 603 604 inp = sotoinpcb(so); 605 KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); 606 INP_WLOCK(inp); 607 vflagsav = inp->inp_vflag; 608 incflagsav = inp->inp_inc.inc_flags; 609 if (inp->inp_flags & INP_TIMEWAIT) { 610 error = EADDRINUSE; 611 goto out; 612 } 613 if (inp->inp_flags & INP_DROPPED) { 614 error = ECONNREFUSED; 615 goto out; 616 } 617 tp = intotcpcb(inp); 618 TCPDEBUG1(); 619 #ifdef INET 620 /* 621 * XXXRW: Some confusion: V4/V6 flags relate to binding, and 622 * therefore probably require the hash lock, which isn't held here. 623 * Is this a significant problem? 624 */ 625 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 626 struct sockaddr_in sin; 627 628 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 629 error = EINVAL; 630 goto out; 631 } 632 633 in6_sin6_2_sin(&sin, sin6p); 634 if ((error = prison_remote_ip4(td->td_ucred, 635 &sin.sin_addr)) != 0) 636 goto out; 637 inp->inp_vflag |= INP_IPV4; 638 inp->inp_vflag &= ~INP_IPV6; 639 if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) 640 goto out; 641 #ifdef TCP_OFFLOAD 642 if (registered_toedevs > 0 && 643 (so->so_options & SO_NO_OFFLOAD) == 0 && 644 (error = tcp_offload_connect(so, nam)) == 0) 645 goto out; 646 #endif 647 error = tp->t_fb->tfb_tcp_output(tp); 648 goto out; 649 } 650 #endif 651 if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0) 652 goto out; 653 inp->inp_vflag &= ~INP_IPV4; 654 inp->inp_vflag |= INP_IPV6; 655 inp->inp_inc.inc_flags |= INC_ISIPV6; 656 if ((error = tcp6_connect(tp, nam, td)) != 0) 657 goto out; 658 #ifdef TCP_OFFLOAD 659 if (registered_toedevs > 0 && 660 (so->so_options & SO_NO_OFFLOAD) == 0 && 661 (error = tcp_offload_connect(so, nam)) == 0) 662 goto out; 663 #endif 664 tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); 665 error = tp->t_fb->tfb_tcp_output(tp); 666 667 out: 668 /* 669 * If the implicit bind in the connect call fails, restore 670 * the flags we modified. 671 */ 672 if (error != 0 && inp->inp_lport == 0) { 673 inp->inp_vflag = vflagsav; 674 inp->inp_inc.inc_flags = incflagsav; 675 } 676 677 TCPDEBUG2(PRU_CONNECT); 678 TCP_PROBE2(debug__user, tp, PRU_CONNECT); 679 INP_WUNLOCK(inp); 680 return (error); 681 } 682 #endif /* INET6 */ 683 684 /* 685 * Initiate disconnect from peer. 686 * If connection never passed embryonic stage, just drop; 687 * else if don't need to let data drain, then can just drop anyways, 688 * else have to begin TCP shutdown process: mark socket disconnecting, 689 * drain unread data, state switch to reflect user close, and 690 * send segment (e.g. FIN) to peer. Socket will be really disconnected 691 * when peer sends FIN and acks ours. 692 * 693 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 694 */ 695 static int 696 tcp_usr_disconnect(struct socket *so) 697 { 698 struct inpcb *inp; 699 struct tcpcb *tp = NULL; 700 int error = 0; 701 702 TCPDEBUG0; 703 INP_INFO_RLOCK(&V_tcbinfo); 704 inp = sotoinpcb(so); 705 KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); 706 INP_WLOCK(inp); 707 if (inp->inp_flags & INP_TIMEWAIT) 708 goto out; 709 if (inp->inp_flags & INP_DROPPED) { 710 error = ECONNRESET; 711 goto out; 712 } 713 tp = intotcpcb(inp); 714 TCPDEBUG1(); 715 tcp_disconnect(tp); 716 out: 717 TCPDEBUG2(PRU_DISCONNECT); 718 TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); 719 INP_WUNLOCK(inp); 720 INP_INFO_RUNLOCK(&V_tcbinfo); 721 return (error); 722 } 723 724 #ifdef INET 725 /* 726 * Accept a connection. Essentially all the work is done at higher levels; 727 * just return the address of the peer, storing through addr. 728 */ 729 static int 730 tcp_usr_accept(struct socket *so, struct sockaddr **nam) 731 { 732 int error = 0; 733 struct inpcb *inp = NULL; 734 struct tcpcb *tp = NULL; 735 struct in_addr addr; 736 in_port_t port = 0; 737 TCPDEBUG0; 738 739 if (so->so_state & SS_ISDISCONNECTED) 740 return (ECONNABORTED); 741 742 inp = sotoinpcb(so); 743 KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); 744 INP_WLOCK(inp); 745 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 746 error = ECONNABORTED; 747 goto out; 748 } 749 tp = intotcpcb(inp); 750 TCPDEBUG1(); 751 752 /* 753 * We inline in_getpeeraddr and COMMON_END here, so that we can 754 * copy the data of interest and defer the malloc until after we 755 * release the lock. 756 */ 757 port = inp->inp_fport; 758 addr = inp->inp_faddr; 759 760 #ifdef LVS_TCPOPT_TOA 761 { 762 struct toa_data *toa = (struct toa_data *)so->so_toa; 763 if (toa->opcode == TCPOPT_TOA && toa->opsize == TCPOLEN_TOA) { 764 addr.s_addr = toa->ip; 765 port = toa->port; 766 } 767 } 768 #endif 769 770 out: 771 TCPDEBUG2(PRU_ACCEPT); 772 TCP_PROBE2(debug__user, tp, PRU_ACCEPT); 773 INP_WUNLOCK(inp); 774 if (error == 0) 775 *nam = in_sockaddr(port, &addr); 776 return error; 777 } 778 #endif /* INET */ 779 780 #ifdef INET6 781 static int 782 tcp6_usr_accept(struct socket *so, struct sockaddr **nam) 783 { 784 struct inpcb *inp = NULL; 785 int error = 0; 786 struct tcpcb *tp = NULL; 787 struct in_addr addr; 788 struct in6_addr addr6; 789 in_port_t port = 0; 790 int v4 = 0; 791 TCPDEBUG0; 792 793 if (so->so_state & SS_ISDISCONNECTED) 794 return (ECONNABORTED); 795 796 inp = sotoinpcb(so); 797 KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); 798 INP_INFO_RLOCK(&V_tcbinfo); 799 INP_WLOCK(inp); 800 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 801 error = ECONNABORTED; 802 goto out; 803 } 804 tp = intotcpcb(inp); 805 TCPDEBUG1(); 806 807 /* 808 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can 809 * copy the data of interest and defer the malloc until after we 810 * release the lock. 811 */ 812 if (inp->inp_vflag & INP_IPV4) { 813 v4 = 1; 814 port = inp->inp_fport; 815 addr = inp->inp_faddr; 816 } else { 817 port = inp->inp_fport; 818 addr6 = inp->in6p_faddr; 819 } 820 821 out: 822 TCPDEBUG2(PRU_ACCEPT); 823 TCP_PROBE2(debug__user, tp, PRU_ACCEPT); 824 INP_WUNLOCK(inp); 825 INP_INFO_RUNLOCK(&V_tcbinfo); 826 if (error == 0) { 827 if (v4) 828 *nam = in6_v4mapsin6_sockaddr(port, &addr); 829 else 830 *nam = in6_sockaddr(port, &addr6); 831 } 832 return error; 833 } 834 #endif /* INET6 */ 835 836 /* 837 * Mark the connection as being incapable of further output. 838 */ 839 static int 840 tcp_usr_shutdown(struct socket *so) 841 { 842 int error = 0; 843 struct inpcb *inp; 844 struct tcpcb *tp = NULL; 845 846 TCPDEBUG0; 847 INP_INFO_RLOCK(&V_tcbinfo); 848 inp = sotoinpcb(so); 849 KASSERT(inp != NULL, ("inp == NULL")); 850 INP_WLOCK(inp); 851 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 852 error = ECONNRESET; 853 goto out; 854 } 855 tp = intotcpcb(inp); 856 TCPDEBUG1(); 857 socantsendmore(so); 858 tcp_usrclosed(tp); 859 if (!(inp->inp_flags & INP_DROPPED)) 860 error = tp->t_fb->tfb_tcp_output(tp); 861 862 out: 863 TCPDEBUG2(PRU_SHUTDOWN); 864 TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); 865 INP_WUNLOCK(inp); 866 INP_INFO_RUNLOCK(&V_tcbinfo); 867 868 return (error); 869 } 870 871 /* 872 * After a receive, possibly send window update to peer. 873 */ 874 static int 875 tcp_usr_rcvd(struct socket *so, int flags) 876 { 877 struct inpcb *inp; 878 struct tcpcb *tp = NULL; 879 int error = 0; 880 881 TCPDEBUG0; 882 inp = sotoinpcb(so); 883 KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); 884 INP_WLOCK(inp); 885 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 886 error = ECONNRESET; 887 goto out; 888 } 889 tp = intotcpcb(inp); 890 TCPDEBUG1(); 891 #ifdef TCP_RFC7413 892 /* 893 * For passively-created TFO connections, don't attempt a window 894 * update while still in SYN_RECEIVED as this may trigger an early 895 * SYN|ACK. It is preferable to have the SYN|ACK be sent along with 896 * application response data, or failing that, when the DELACK timer 897 * expires. 898 */ 899 if ((tp->t_flags & TF_FASTOPEN) && 900 (tp->t_state == TCPS_SYN_RECEIVED)) 901 goto out; 902 #endif 903 #ifdef TCP_OFFLOAD 904 if (tp->t_flags & TF_TOE) 905 tcp_offload_rcvd(tp); 906 else 907 #endif 908 tp->t_fb->tfb_tcp_output(tp); 909 910 out: 911 TCPDEBUG2(PRU_RCVD); 912 TCP_PROBE2(debug__user, tp, PRU_RCVD); 913 INP_WUNLOCK(inp); 914 return (error); 915 } 916 917 /* 918 * Do a send by putting data in output queue and updating urgent 919 * marker if URG set. Possibly send more data. Unlike the other 920 * pru_*() routines, the mbuf chains are our responsibility. We 921 * must either enqueue them or free them. The other pru_* routines 922 * generally are caller-frees. 923 */ 924 static int 925 tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 926 struct sockaddr *nam, struct mbuf *control, struct thread *td) 927 { 928 int error = 0; 929 struct inpcb *inp; 930 struct tcpcb *tp = NULL; 931 #ifdef INET6 932 int isipv6; 933 #endif 934 TCPDEBUG0; 935 936 /* 937 * We require the pcbinfo lock if we will close the socket as part of 938 * this call. 939 */ 940 if (flags & PRUS_EOF) 941 INP_INFO_RLOCK(&V_tcbinfo); 942 inp = sotoinpcb(so); 943 KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); 944 INP_WLOCK(inp); 945 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 946 if (control) 947 m_freem(control); 948 /* 949 * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible 950 * for freeing memory. 951 */ 952 if (m && (flags & PRUS_NOTREADY) == 0) 953 m_freem(m); 954 error = ECONNRESET; 955 goto out; 956 } 957 #ifdef INET6 958 isipv6 = nam && nam->sa_family == AF_INET6; 959 #endif /* INET6 */ 960 tp = intotcpcb(inp); 961 TCPDEBUG1(); 962 if (control) { 963 /* TCP doesn't do control messages (rights, creds, etc) */ 964 if (control->m_len) { 965 m_freem(control); 966 if (m) 967 m_freem(m); 968 error = EINVAL; 969 goto out; 970 } 971 m_freem(control); /* empty control, just free it */ 972 } 973 if (!(flags & PRUS_OOB)) { 974 sbappendstream(&so->so_snd, m, flags); 975 if (nam && tp->t_state < TCPS_SYN_SENT) { 976 /* 977 * Do implied connect if not yet connected, 978 * initialize window to default value, and 979 * initialize maxseg using peer's cached MSS. 980 */ 981 #ifdef INET6 982 if (isipv6) 983 error = tcp6_connect(tp, nam, td); 984 #endif /* INET6 */ 985 #if defined(INET6) && defined(INET) 986 else 987 #endif 988 #ifdef INET 989 error = tcp_connect(tp, nam, td); 990 #endif 991 if (error) 992 goto out; 993 tp->snd_wnd = TTCP_CLIENT_SND_WND; 994 tcp_mss(tp, -1); 995 } 996 if (flags & PRUS_EOF) { 997 /* 998 * Close the send side of the connection after 999 * the data is sent. 1000 */ 1001 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1002 socantsendmore(so); 1003 tcp_usrclosed(tp); 1004 } 1005 if (!(inp->inp_flags & INP_DROPPED) && 1006 !(flags & PRUS_NOTREADY)) { 1007 if (flags & PRUS_MORETOCOME) 1008 tp->t_flags |= TF_MORETOCOME; 1009 error = tp->t_fb->tfb_tcp_output(tp); 1010 if (flags & PRUS_MORETOCOME) 1011 tp->t_flags &= ~TF_MORETOCOME; 1012 } 1013 } else { 1014 /* 1015 * XXXRW: PRUS_EOF not implemented with PRUS_OOB? 1016 */ 1017 SOCKBUF_LOCK(&so->so_snd); 1018 if (sbspace(&so->so_snd) < -512) { 1019 SOCKBUF_UNLOCK(&so->so_snd); 1020 m_freem(m); 1021 error = ENOBUFS; 1022 goto out; 1023 } 1024 /* 1025 * According to RFC961 (Assigned Protocols), 1026 * the urgent pointer points to the last octet 1027 * of urgent data. We continue, however, 1028 * to consider it to indicate the first octet 1029 * of data past the urgent section. 1030 * Otherwise, snd_up should be one lower. 1031 */ 1032 sbappendstream_locked(&so->so_snd, m, flags); 1033 SOCKBUF_UNLOCK(&so->so_snd); 1034 if (nam && tp->t_state < TCPS_SYN_SENT) { 1035 /* 1036 * Do implied connect if not yet connected, 1037 * initialize window to default value, and 1038 * initialize maxseg using peer's cached MSS. 1039 */ 1040 #ifdef INET6 1041 if (isipv6) 1042 error = tcp6_connect(tp, nam, td); 1043 #endif /* INET6 */ 1044 #if defined(INET6) && defined(INET) 1045 else 1046 #endif 1047 #ifdef INET 1048 error = tcp_connect(tp, nam, td); 1049 #endif 1050 if (error) 1051 goto out; 1052 tp->snd_wnd = TTCP_CLIENT_SND_WND; 1053 tcp_mss(tp, -1); 1054 } 1055 tp->snd_up = tp->snd_una + sbavail(&so->so_snd); 1056 if (!(flags & PRUS_NOTREADY)) { 1057 tp->t_flags |= TF_FORCEDATA; 1058 error = tp->t_fb->tfb_tcp_output(tp); 1059 tp->t_flags &= ~TF_FORCEDATA; 1060 } 1061 } 1062 out: 1063 TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : 1064 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); 1065 TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : 1066 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); 1067 INP_WUNLOCK(inp); 1068 if (flags & PRUS_EOF) 1069 INP_INFO_RUNLOCK(&V_tcbinfo); 1070 return (error); 1071 } 1072 1073 static int 1074 tcp_usr_ready(struct socket *so, struct mbuf *m, int count) 1075 { 1076 struct inpcb *inp; 1077 struct tcpcb *tp; 1078 int error; 1079 1080 inp = sotoinpcb(so); 1081 INP_WLOCK(inp); 1082 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 1083 INP_WUNLOCK(inp); 1084 for (int i = 0; i < count; i++) 1085 m = m_free(m); 1086 return (ECONNRESET); 1087 } 1088 tp = intotcpcb(inp); 1089 1090 SOCKBUF_LOCK(&so->so_snd); 1091 error = sbready(&so->so_snd, m, count); 1092 SOCKBUF_UNLOCK(&so->so_snd); 1093 if (error == 0) 1094 error = tp->t_fb->tfb_tcp_output(tp); 1095 INP_WUNLOCK(inp); 1096 1097 return (error); 1098 } 1099 1100 /* 1101 * Abort the TCP. Drop the connection abruptly. 1102 */ 1103 static void 1104 tcp_usr_abort(struct socket *so) 1105 { 1106 struct inpcb *inp; 1107 struct tcpcb *tp = NULL; 1108 TCPDEBUG0; 1109 1110 inp = sotoinpcb(so); 1111 KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); 1112 1113 INP_INFO_RLOCK(&V_tcbinfo); 1114 INP_WLOCK(inp); 1115 KASSERT(inp->inp_socket != NULL, 1116 ("tcp_usr_abort: inp_socket == NULL")); 1117 1118 /* 1119 * If we still have full TCP state, and we're not dropped, drop. 1120 */ 1121 if (!(inp->inp_flags & INP_TIMEWAIT) && 1122 !(inp->inp_flags & INP_DROPPED)) { 1123 tp = intotcpcb(inp); 1124 TCPDEBUG1(); 1125 tcp_drop(tp, ECONNABORTED); 1126 TCPDEBUG2(PRU_ABORT); 1127 TCP_PROBE2(debug__user, tp, PRU_ABORT); 1128 } 1129 if (!(inp->inp_flags & INP_DROPPED)) { 1130 SOCK_LOCK(so); 1131 so->so_state |= SS_PROTOREF; 1132 SOCK_UNLOCK(so); 1133 inp->inp_flags |= INP_SOCKREF; 1134 } 1135 INP_WUNLOCK(inp); 1136 INP_INFO_RUNLOCK(&V_tcbinfo); 1137 } 1138 1139 /* 1140 * TCP socket is closed. Start friendly disconnect. 1141 */ 1142 static void 1143 tcp_usr_close(struct socket *so) 1144 { 1145 struct inpcb *inp; 1146 struct tcpcb *tp = NULL; 1147 TCPDEBUG0; 1148 1149 inp = sotoinpcb(so); 1150 KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); 1151 1152 INP_INFO_RLOCK(&V_tcbinfo); 1153 INP_WLOCK(inp); 1154 KASSERT(inp->inp_socket != NULL, 1155 ("tcp_usr_close: inp_socket == NULL")); 1156 1157 /* 1158 * If we still have full TCP state, and we're not dropped, initiate 1159 * a disconnect. 1160 */ 1161 if (!(inp->inp_flags & INP_TIMEWAIT) && 1162 !(inp->inp_flags & INP_DROPPED)) { 1163 tp = intotcpcb(inp); 1164 TCPDEBUG1(); 1165 tcp_disconnect(tp); 1166 TCPDEBUG2(PRU_CLOSE); 1167 TCP_PROBE2(debug__user, tp, PRU_CLOSE); 1168 } 1169 if (!(inp->inp_flags & INP_DROPPED)) { 1170 SOCK_LOCK(so); 1171 so->so_state |= SS_PROTOREF; 1172 SOCK_UNLOCK(so); 1173 inp->inp_flags |= INP_SOCKREF; 1174 } 1175 INP_WUNLOCK(inp); 1176 INP_INFO_RUNLOCK(&V_tcbinfo); 1177 } 1178 1179 /* 1180 * Receive out-of-band data. 1181 */ 1182 static int 1183 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) 1184 { 1185 int error = 0; 1186 struct inpcb *inp; 1187 struct tcpcb *tp = NULL; 1188 1189 TCPDEBUG0; 1190 inp = sotoinpcb(so); 1191 KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); 1192 INP_WLOCK(inp); 1193 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 1194 error = ECONNRESET; 1195 goto out; 1196 } 1197 tp = intotcpcb(inp); 1198 TCPDEBUG1(); 1199 if ((so->so_oobmark == 0 && 1200 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 1201 so->so_options & SO_OOBINLINE || 1202 tp->t_oobflags & TCPOOB_HADDATA) { 1203 error = EINVAL; 1204 goto out; 1205 } 1206 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 1207 error = EWOULDBLOCK; 1208 goto out; 1209 } 1210 m->m_len = 1; 1211 *mtod(m, caddr_t) = tp->t_iobc; 1212 if ((flags & MSG_PEEK) == 0) 1213 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1214 1215 out: 1216 TCPDEBUG2(PRU_RCVOOB); 1217 TCP_PROBE2(debug__user, tp, PRU_RCVOOB); 1218 INP_WUNLOCK(inp); 1219 return (error); 1220 } 1221 1222 #ifdef INET 1223 struct pr_usrreqs tcp_usrreqs = { 1224 .pru_abort = tcp_usr_abort, 1225 .pru_accept = tcp_usr_accept, 1226 .pru_attach = tcp_usr_attach, 1227 .pru_bind = tcp_usr_bind, 1228 .pru_connect = tcp_usr_connect, 1229 .pru_control = in_control, 1230 .pru_detach = tcp_usr_detach, 1231 .pru_disconnect = tcp_usr_disconnect, 1232 .pru_listen = tcp_usr_listen, 1233 #ifdef LVS_TCPOPT_TOA 1234 .pru_peeraddr = toa_getpeeraddr, 1235 #else 1236 .pru_peeraddr = in_getpeeraddr, 1237 #endif 1238 .pru_rcvd = tcp_usr_rcvd, 1239 .pru_rcvoob = tcp_usr_rcvoob, 1240 .pru_send = tcp_usr_send, 1241 .pru_ready = tcp_usr_ready, 1242 .pru_shutdown = tcp_usr_shutdown, 1243 .pru_sockaddr = in_getsockaddr, 1244 .pru_sosetlabel = in_pcbsosetlabel, 1245 .pru_close = tcp_usr_close, 1246 }; 1247 #endif /* INET */ 1248 1249 #ifdef INET6 1250 struct pr_usrreqs tcp6_usrreqs = { 1251 .pru_abort = tcp_usr_abort, 1252 .pru_accept = tcp6_usr_accept, 1253 .pru_attach = tcp_usr_attach, 1254 .pru_bind = tcp6_usr_bind, 1255 .pru_connect = tcp6_usr_connect, 1256 .pru_control = in6_control, 1257 .pru_detach = tcp_usr_detach, 1258 .pru_disconnect = tcp_usr_disconnect, 1259 .pru_listen = tcp6_usr_listen, 1260 .pru_peeraddr = in6_mapped_peeraddr, 1261 .pru_rcvd = tcp_usr_rcvd, 1262 .pru_rcvoob = tcp_usr_rcvoob, 1263 .pru_send = tcp_usr_send, 1264 .pru_ready = tcp_usr_ready, 1265 .pru_shutdown = tcp_usr_shutdown, 1266 .pru_sockaddr = in6_mapped_sockaddr, 1267 .pru_sosetlabel = in_pcbsosetlabel, 1268 .pru_close = tcp_usr_close, 1269 }; 1270 #endif /* INET6 */ 1271 1272 #ifdef INET 1273 /* 1274 * Common subroutine to open a TCP connection to remote host specified 1275 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 1276 * port number if needed. Call in_pcbconnect_setup to do the routing and 1277 * to choose a local host address (interface). If there is an existing 1278 * incarnation of the same connection in TIME-WAIT state and if the remote 1279 * host was sending CC options and if the connection duration was < MSL, then 1280 * truncate the previous TIME-WAIT state and proceed. 1281 * Initialize connection parameters and enter SYN-SENT state. 1282 */ 1283 static int 1284 tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) 1285 { 1286 struct inpcb *inp = tp->t_inpcb, *oinp; 1287 struct socket *so = inp->inp_socket; 1288 struct in_addr laddr; 1289 u_short lport; 1290 int error; 1291 1292 INP_WLOCK_ASSERT(inp); 1293 INP_HASH_WLOCK(&V_tcbinfo); 1294 1295 #ifndef FSTACK 1296 if (inp->inp_lport == 0) { 1297 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 1298 if (error) 1299 goto out; 1300 } 1301 1302 /* 1303 * Cannot simply call in_pcbconnect, because there might be an 1304 * earlier incarnation of this same connection still in 1305 * TIME_WAIT state, creating an ADDRINUSE error. 1306 */ 1307 laddr = inp->inp_laddr; 1308 lport = inp->inp_lport; 1309 error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, 1310 &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); 1311 if (error && oinp == NULL) 1312 goto out; 1313 if (oinp) { 1314 error = EADDRINUSE; 1315 goto out; 1316 } 1317 inp->inp_laddr = laddr; 1318 in_pcbrehash(inp); 1319 #else 1320 int anonport = 0; 1321 if (inp->inp_lport == 0) { 1322 anonport = 1; 1323 } 1324 1325 laddr = inp->inp_laddr; 1326 lport = inp->inp_lport; 1327 error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, 1328 &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); 1329 if (error && oinp == NULL) 1330 goto out; 1331 if (oinp) { 1332 error = EADDRINUSE; 1333 goto out; 1334 } 1335 1336 inp->inp_laddr = laddr; 1337 1338 if (inp->inp_lport != lport) { 1339 inp->inp_lport = lport; 1340 oinp = in_pcblookup(inp->inp_pcbinfo, inp->inp_faddr, 1341 inp->inp_fport, laddr, lport, 0, NULL); 1342 if (oinp != NULL) { 1343 error = EADDRINUSE; 1344 goto out; 1345 } 1346 1347 // inp->inp_lport != lport means in_pcbconnect_setup selected new port to inp->inp_lport. 1348 // inp will inhash. 1349 if (in_pcbinshash(inp) != 0) { 1350 inp->inp_laddr.s_addr = INADDR_ANY; 1351 inp->inp_lport = 0; 1352 return (EAGAIN); 1353 } 1354 } 1355 else 1356 { 1357 // app call bind() and connect(), lport is set when bind, and the inp is inhashed in bind() function. 1358 // in_pcbconnect_setup() update inp->inp_faddr/inp->inp_fport, so inp should be rehashed. 1359 in_pcbrehash(inp); 1360 } 1361 1362 if (anonport) { 1363 inp->inp_flags |= INP_ANONPORT; 1364 } 1365 #endif 1366 1367 INP_HASH_WUNLOCK(&V_tcbinfo); 1368 1369 /* 1370 * Compute window scaling to request: 1371 * Scale to fit into sweet spot. See tcp_syncache.c. 1372 * XXX: This should move to tcp_output(). 1373 */ 1374 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1375 (TCP_MAXWIN << tp->request_r_scale) < sb_max) 1376 tp->request_r_scale++; 1377 1378 soisconnecting(so); 1379 TCPSTAT_INC(tcps_connattempt); 1380 tcp_state_change(tp, TCPS_SYN_SENT); 1381 tp->iss = tcp_new_isn(tp); 1382 tcp_sendseqinit(tp); 1383 1384 return 0; 1385 1386 out: 1387 INP_HASH_WUNLOCK(&V_tcbinfo); 1388 return (error); 1389 } 1390 #endif /* INET */ 1391 1392 #ifdef INET6 1393 static int 1394 tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) 1395 { 1396 struct inpcb *inp = tp->t_inpcb; 1397 int error; 1398 1399 INP_WLOCK_ASSERT(inp); 1400 INP_HASH_WLOCK(&V_tcbinfo); 1401 1402 if (inp->inp_lport == 0) { 1403 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 1404 if (error) 1405 goto out; 1406 } 1407 error = in6_pcbconnect(inp, nam, td->td_ucred); 1408 if (error != 0) 1409 goto out; 1410 INP_HASH_WUNLOCK(&V_tcbinfo); 1411 1412 /* Compute window scaling to request. */ 1413 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1414 (TCP_MAXWIN << tp->request_r_scale) < sb_max) 1415 tp->request_r_scale++; 1416 1417 soisconnecting(inp->inp_socket); 1418 TCPSTAT_INC(tcps_connattempt); 1419 tcp_state_change(tp, TCPS_SYN_SENT); 1420 tp->iss = tcp_new_isn(tp); 1421 tcp_sendseqinit(tp); 1422 1423 return 0; 1424 1425 out: 1426 INP_HASH_WUNLOCK(&V_tcbinfo); 1427 return error; 1428 } 1429 #endif /* INET6 */ 1430 1431 /* 1432 * Export TCP internal state information via a struct tcp_info, based on the 1433 * Linux 2.6 API. Not ABI compatible as our constants are mapped differently 1434 * (TCP state machine, etc). We export all information using FreeBSD-native 1435 * constants -- for example, the numeric values for tcpi_state will differ 1436 * from Linux. 1437 */ 1438 static void 1439 tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) 1440 { 1441 1442 INP_WLOCK_ASSERT(tp->t_inpcb); 1443 bzero(ti, sizeof(*ti)); 1444 1445 ti->tcpi_state = tp->t_state; 1446 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 1447 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 1448 if (tp->t_flags & TF_SACK_PERMIT) 1449 ti->tcpi_options |= TCPI_OPT_SACK; 1450 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 1451 ti->tcpi_options |= TCPI_OPT_WSCALE; 1452 ti->tcpi_snd_wscale = tp->snd_scale; 1453 ti->tcpi_rcv_wscale = tp->rcv_scale; 1454 } 1455 1456 ti->tcpi_rto = tp->t_rxtcur * tick; 1457 ti->tcpi_last_data_recv = (long)(ticks - (int)tp->t_rcvtime) * tick; 1458 ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; 1459 ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; 1460 1461 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 1462 ti->tcpi_snd_cwnd = tp->snd_cwnd; 1463 1464 /* 1465 * FreeBSD-specific extension fields for tcp_info. 1466 */ 1467 ti->tcpi_rcv_space = tp->rcv_wnd; 1468 ti->tcpi_rcv_nxt = tp->rcv_nxt; 1469 ti->tcpi_snd_wnd = tp->snd_wnd; 1470 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ 1471 ti->tcpi_snd_nxt = tp->snd_nxt; 1472 ti->tcpi_snd_mss = tp->t_maxseg; 1473 ti->tcpi_rcv_mss = tp->t_maxseg; 1474 if (tp->t_flags & TF_TOE) 1475 ti->tcpi_options |= TCPI_OPT_TOE; 1476 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 1477 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 1478 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 1479 } 1480 1481 /* 1482 * tcp_ctloutput() must drop the inpcb lock before performing copyin on 1483 * socket option arguments. When it re-acquires the lock after the copy, it 1484 * has to revalidate that the connection is still valid for the socket 1485 * option. 1486 */ 1487 #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ 1488 INP_WLOCK(inp); \ 1489 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ 1490 INP_WUNLOCK(inp); \ 1491 cleanup; \ 1492 return (ECONNRESET); \ 1493 } \ 1494 tp = intotcpcb(inp); \ 1495 } while(0) 1496 #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) 1497 1498 int 1499 tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1500 { 1501 int error; 1502 struct inpcb *inp; 1503 struct tcpcb *tp; 1504 struct tcp_function_block *blk; 1505 struct tcp_function_set fsn; 1506 1507 error = 0; 1508 inp = sotoinpcb(so); 1509 KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); 1510 INP_WLOCK(inp); 1511 if (sopt->sopt_level != IPPROTO_TCP) { 1512 #ifdef INET6 1513 if (inp->inp_vflag & INP_IPV6PROTO) { 1514 INP_WUNLOCK(inp); 1515 error = ip6_ctloutput(so, sopt); 1516 } 1517 #endif /* INET6 */ 1518 #if defined(INET6) && defined(INET) 1519 else 1520 #endif 1521 #ifdef INET 1522 { 1523 INP_WUNLOCK(inp); 1524 error = ip_ctloutput(so, sopt); 1525 } 1526 #endif 1527 return (error); 1528 } 1529 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 1530 INP_WUNLOCK(inp); 1531 return (ECONNRESET); 1532 } 1533 tp = intotcpcb(inp); 1534 /* 1535 * Protect the TCP option TCP_FUNCTION_BLK so 1536 * that a sub-function can *never* overwrite this. 1537 */ 1538 if ((sopt->sopt_dir == SOPT_SET) && 1539 (sopt->sopt_name == TCP_FUNCTION_BLK)) { 1540 INP_WUNLOCK(inp); 1541 error = sooptcopyin(sopt, &fsn, sizeof fsn, 1542 sizeof fsn); 1543 if (error) 1544 return (error); 1545 INP_WLOCK_RECHECK(inp); 1546 if (tp->t_state != TCPS_CLOSED) { 1547 /* 1548 * The user has advanced the state 1549 * past the initial point, we can't 1550 * switch since we are down the road 1551 * and a new set of functions may 1552 * not be compatibile. 1553 */ 1554 INP_WUNLOCK(inp); 1555 return(EINVAL); 1556 } 1557 blk = find_and_ref_tcp_functions(&fsn); 1558 if (blk == NULL) { 1559 INP_WUNLOCK(inp); 1560 return (ENOENT); 1561 } 1562 if (tp->t_fb != blk) { 1563 if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { 1564 refcount_release(&blk->tfb_refcnt); 1565 INP_WUNLOCK(inp); 1566 return (ENOENT); 1567 } 1568 /* 1569 * Release the old refcnt, the 1570 * lookup acquires a ref on the 1571 * new one. 1572 */ 1573 if (tp->t_fb->tfb_tcp_fb_fini) 1574 (*tp->t_fb->tfb_tcp_fb_fini)(tp); 1575 refcount_release(&tp->t_fb->tfb_refcnt); 1576 tp->t_fb = blk; 1577 if (tp->t_fb->tfb_tcp_fb_init) { 1578 (*tp->t_fb->tfb_tcp_fb_init)(tp); 1579 } 1580 } 1581 #ifdef TCP_OFFLOAD 1582 if (tp->t_flags & TF_TOE) { 1583 tcp_offload_ctloutput(tp, sopt->sopt_dir, 1584 sopt->sopt_name); 1585 } 1586 #endif 1587 INP_WUNLOCK(inp); 1588 return (error); 1589 } else if ((sopt->sopt_dir == SOPT_GET) && 1590 (sopt->sopt_name == TCP_FUNCTION_BLK)) { 1591 strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name, 1592 TCP_FUNCTION_NAME_LEN_MAX); 1593 fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; 1594 fsn.pcbcnt = tp->t_fb->tfb_refcnt; 1595 INP_WUNLOCK(inp); 1596 error = sooptcopyout(sopt, &fsn, sizeof fsn); 1597 return (error); 1598 } 1599 /* Pass in the INP locked, called must unlock it */ 1600 return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp)); 1601 } 1602 1603 int 1604 tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 1605 { 1606 int error, opt, optval; 1607 u_int ui; 1608 struct tcp_info ti; 1609 struct cc_algo *algo; 1610 char *pbuf, buf[TCP_CA_NAME_MAX]; 1611 size_t len; 1612 1613 /* 1614 * For TCP_CCALGOOPT forward the control to CC module, for both 1615 * SOPT_SET and SOPT_GET. 1616 */ 1617 switch (sopt->sopt_name) { 1618 case TCP_CCALGOOPT: 1619 INP_WUNLOCK(inp); 1620 pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); 1621 error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, 1622 sopt->sopt_valsize); 1623 if (error) { 1624 free(pbuf, M_TEMP); 1625 return (error); 1626 } 1627 INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); 1628 if (CC_ALGO(tp)->ctl_output != NULL) 1629 error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf); 1630 else 1631 error = ENOENT; 1632 INP_WUNLOCK(inp); 1633 if (error == 0 && sopt->sopt_dir == SOPT_GET) 1634 error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); 1635 free(pbuf, M_TEMP); 1636 return (error); 1637 } 1638 1639 switch (sopt->sopt_dir) { 1640 case SOPT_SET: 1641 switch (sopt->sopt_name) { 1642 #ifdef TCP_SIGNATURE 1643 case TCP_MD5SIG: 1644 INP_WUNLOCK(inp); 1645 error = sooptcopyin(sopt, &optval, sizeof optval, 1646 sizeof optval); 1647 if (error) 1648 return (error); 1649 1650 INP_WLOCK_RECHECK(inp); 1651 if (optval > 0) 1652 tp->t_flags |= TF_SIGNATURE; 1653 else 1654 tp->t_flags &= ~TF_SIGNATURE; 1655 goto unlock_and_done; 1656 #endif /* TCP_SIGNATURE */ 1657 1658 case TCP_NODELAY: 1659 case TCP_NOOPT: 1660 INP_WUNLOCK(inp); 1661 error = sooptcopyin(sopt, &optval, sizeof optval, 1662 sizeof optval); 1663 if (error) 1664 return (error); 1665 1666 INP_WLOCK_RECHECK(inp); 1667 switch (sopt->sopt_name) { 1668 case TCP_NODELAY: 1669 opt = TF_NODELAY; 1670 break; 1671 case TCP_NOOPT: 1672 opt = TF_NOOPT; 1673 break; 1674 default: 1675 opt = 0; /* dead code to fool gcc */ 1676 break; 1677 } 1678 1679 if (optval) 1680 tp->t_flags |= opt; 1681 else 1682 tp->t_flags &= ~opt; 1683 unlock_and_done: 1684 #ifdef TCP_OFFLOAD 1685 if (tp->t_flags & TF_TOE) { 1686 tcp_offload_ctloutput(tp, sopt->sopt_dir, 1687 sopt->sopt_name); 1688 } 1689 #endif 1690 INP_WUNLOCK(inp); 1691 break; 1692 1693 case TCP_NOPUSH: 1694 INP_WUNLOCK(inp); 1695 error = sooptcopyin(sopt, &optval, sizeof optval, 1696 sizeof optval); 1697 if (error) 1698 return (error); 1699 1700 INP_WLOCK_RECHECK(inp); 1701 if (optval) 1702 tp->t_flags |= TF_NOPUSH; 1703 else if (tp->t_flags & TF_NOPUSH) { 1704 tp->t_flags &= ~TF_NOPUSH; 1705 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1706 error = tp->t_fb->tfb_tcp_output(tp); 1707 } 1708 goto unlock_and_done; 1709 1710 case TCP_MAXSEG: 1711 INP_WUNLOCK(inp); 1712 error = sooptcopyin(sopt, &optval, sizeof optval, 1713 sizeof optval); 1714 if (error) 1715 return (error); 1716 1717 INP_WLOCK_RECHECK(inp); 1718 if (optval > 0 && optval <= tp->t_maxseg && 1719 optval + 40 >= V_tcp_minmss) 1720 tp->t_maxseg = optval; 1721 else 1722 error = EINVAL; 1723 goto unlock_and_done; 1724 1725 case TCP_INFO: 1726 INP_WUNLOCK(inp); 1727 error = EINVAL; 1728 break; 1729 1730 case TCP_CONGESTION: 1731 INP_WUNLOCK(inp); 1732 error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); 1733 if (error) 1734 break; 1735 buf[sopt->sopt_valsize] = '\0'; 1736 INP_WLOCK_RECHECK(inp); 1737 CC_LIST_RLOCK(); 1738 STAILQ_FOREACH(algo, &cc_list, entries) 1739 if (strncmp(buf, algo->name, 1740 TCP_CA_NAME_MAX) == 0) 1741 break; 1742 CC_LIST_RUNLOCK(); 1743 if (algo == NULL) { 1744 INP_WUNLOCK(inp); 1745 error = EINVAL; 1746 break; 1747 } 1748 /* 1749 * We hold a write lock over the tcb so it's safe to 1750 * do these things without ordering concerns. 1751 */ 1752 if (CC_ALGO(tp)->cb_destroy != NULL) 1753 CC_ALGO(tp)->cb_destroy(tp->ccv); 1754 CC_ALGO(tp) = algo; 1755 /* 1756 * If something goes pear shaped initialising the new 1757 * algo, fall back to newreno (which does not 1758 * require initialisation). 1759 */ 1760 if (algo->cb_init != NULL && 1761 algo->cb_init(tp->ccv) != 0) { 1762 CC_ALGO(tp) = &newreno_cc_algo; 1763 /* 1764 * The only reason init should fail is 1765 * because of malloc. 1766 */ 1767 error = ENOMEM; 1768 } 1769 INP_WUNLOCK(inp); 1770 break; 1771 1772 case TCP_KEEPIDLE: 1773 case TCP_KEEPINTVL: 1774 case TCP_KEEPINIT: 1775 INP_WUNLOCK(inp); 1776 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); 1777 if (error) 1778 return (error); 1779 1780 if (ui > (UINT_MAX / hz)) { 1781 error = EINVAL; 1782 break; 1783 } 1784 ui *= hz; 1785 1786 INP_WLOCK_RECHECK(inp); 1787 switch (sopt->sopt_name) { 1788 case TCP_KEEPIDLE: 1789 tp->t_keepidle = ui; 1790 /* 1791 * XXX: better check current remaining 1792 * timeout and "merge" it with new value. 1793 */ 1794 if ((tp->t_state > TCPS_LISTEN) && 1795 (tp->t_state <= TCPS_CLOSING)) 1796 tcp_timer_activate(tp, TT_KEEP, 1797 TP_KEEPIDLE(tp)); 1798 break; 1799 case TCP_KEEPINTVL: 1800 tp->t_keepintvl = ui; 1801 if ((tp->t_state == TCPS_FIN_WAIT_2) && 1802 (TP_MAXIDLE(tp) > 0)) 1803 tcp_timer_activate(tp, TT_2MSL, 1804 TP_MAXIDLE(tp)); 1805 break; 1806 case TCP_KEEPINIT: 1807 tp->t_keepinit = ui; 1808 if (tp->t_state == TCPS_SYN_RECEIVED || 1809 tp->t_state == TCPS_SYN_SENT) 1810 tcp_timer_activate(tp, TT_KEEP, 1811 TP_KEEPINIT(tp)); 1812 break; 1813 } 1814 goto unlock_and_done; 1815 1816 case TCP_KEEPCNT: 1817 INP_WUNLOCK(inp); 1818 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); 1819 if (error) 1820 return (error); 1821 1822 INP_WLOCK_RECHECK(inp); 1823 tp->t_keepcnt = ui; 1824 if ((tp->t_state == TCPS_FIN_WAIT_2) && 1825 (TP_MAXIDLE(tp) > 0)) 1826 tcp_timer_activate(tp, TT_2MSL, 1827 TP_MAXIDLE(tp)); 1828 goto unlock_and_done; 1829 1830 #ifdef TCPPCAP 1831 case TCP_PCAP_OUT: 1832 case TCP_PCAP_IN: 1833 INP_WUNLOCK(inp); 1834 error = sooptcopyin(sopt, &optval, sizeof optval, 1835 sizeof optval); 1836 if (error) 1837 return (error); 1838 1839 INP_WLOCK_RECHECK(inp); 1840 if (optval >= 0) 1841 tcp_pcap_set_sock_max(TCP_PCAP_OUT ? 1842 &(tp->t_outpkts) : &(tp->t_inpkts), 1843 optval); 1844 else 1845 error = EINVAL; 1846 goto unlock_and_done; 1847 #endif 1848 1849 #ifdef TCP_RFC7413 1850 case TCP_FASTOPEN: 1851 INP_WUNLOCK(inp); 1852 if (!V_tcp_fastopen_enabled) 1853 return (EPERM); 1854 1855 error = sooptcopyin(sopt, &optval, sizeof optval, 1856 sizeof optval); 1857 if (error) 1858 return (error); 1859 1860 INP_WLOCK_RECHECK(inp); 1861 if (optval) { 1862 tp->t_flags |= TF_FASTOPEN; 1863 if ((tp->t_state == TCPS_LISTEN) && 1864 (tp->t_tfo_pending == NULL)) 1865 tp->t_tfo_pending = 1866 tcp_fastopen_alloc_counter(); 1867 } else 1868 tp->t_flags &= ~TF_FASTOPEN; 1869 goto unlock_and_done; 1870 #endif 1871 1872 default: 1873 INP_WUNLOCK(inp); 1874 error = ENOPROTOOPT; 1875 break; 1876 } 1877 break; 1878 1879 case SOPT_GET: 1880 tp = intotcpcb(inp); 1881 switch (sopt->sopt_name) { 1882 #ifdef TCP_SIGNATURE 1883 case TCP_MD5SIG: 1884 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1885 INP_WUNLOCK(inp); 1886 error = sooptcopyout(sopt, &optval, sizeof optval); 1887 break; 1888 #endif 1889 1890 case TCP_NODELAY: 1891 optval = tp->t_flags & TF_NODELAY; 1892 INP_WUNLOCK(inp); 1893 error = sooptcopyout(sopt, &optval, sizeof optval); 1894 break; 1895 case TCP_MAXSEG: 1896 optval = tp->t_maxseg; 1897 INP_WUNLOCK(inp); 1898 error = sooptcopyout(sopt, &optval, sizeof optval); 1899 break; 1900 case TCP_NOOPT: 1901 optval = tp->t_flags & TF_NOOPT; 1902 INP_WUNLOCK(inp); 1903 error = sooptcopyout(sopt, &optval, sizeof optval); 1904 break; 1905 case TCP_NOPUSH: 1906 optval = tp->t_flags & TF_NOPUSH; 1907 INP_WUNLOCK(inp); 1908 error = sooptcopyout(sopt, &optval, sizeof optval); 1909 break; 1910 case TCP_INFO: 1911 tcp_fill_info(tp, &ti); 1912 INP_WUNLOCK(inp); 1913 error = sooptcopyout(sopt, &ti, sizeof ti); 1914 break; 1915 case TCP_CONGESTION: 1916 len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); 1917 INP_WUNLOCK(inp); 1918 error = sooptcopyout(sopt, buf, len + 1); 1919 break; 1920 case TCP_KEEPIDLE: 1921 case TCP_KEEPINTVL: 1922 case TCP_KEEPINIT: 1923 case TCP_KEEPCNT: 1924 switch (sopt->sopt_name) { 1925 case TCP_KEEPIDLE: 1926 ui = tp->t_keepidle / hz; 1927 break; 1928 case TCP_KEEPINTVL: 1929 ui = tp->t_keepintvl / hz; 1930 break; 1931 case TCP_KEEPINIT: 1932 ui = tp->t_keepinit / hz; 1933 break; 1934 case TCP_KEEPCNT: 1935 ui = tp->t_keepcnt; 1936 break; 1937 } 1938 INP_WUNLOCK(inp); 1939 error = sooptcopyout(sopt, &ui, sizeof(ui)); 1940 break; 1941 #ifdef TCPPCAP 1942 case TCP_PCAP_OUT: 1943 case TCP_PCAP_IN: 1944 optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? 1945 &(tp->t_outpkts) : &(tp->t_inpkts)); 1946 INP_WUNLOCK(inp); 1947 error = sooptcopyout(sopt, &optval, sizeof optval); 1948 break; 1949 #endif 1950 1951 #ifdef TCP_RFC7413 1952 case TCP_FASTOPEN: 1953 optval = tp->t_flags & TF_FASTOPEN; 1954 INP_WUNLOCK(inp); 1955 error = sooptcopyout(sopt, &optval, sizeof optval); 1956 break; 1957 #endif 1958 default: 1959 INP_WUNLOCK(inp); 1960 error = ENOPROTOOPT; 1961 break; 1962 } 1963 break; 1964 } 1965 return (error); 1966 } 1967 #undef INP_WLOCK_RECHECK 1968 #undef INP_WLOCK_RECHECK_CLEANUP 1969 1970 /* 1971 * Attach TCP protocol to socket, allocating 1972 * internet protocol control block, tcp control block, 1973 * bufer space, and entering LISTEN state if to accept connections. 1974 */ 1975 static int 1976 tcp_attach(struct socket *so) 1977 { 1978 struct tcpcb *tp; 1979 struct inpcb *inp; 1980 int error; 1981 1982 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 1983 error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); 1984 if (error) 1985 return (error); 1986 } 1987 so->so_rcv.sb_flags |= SB_AUTOSIZE; 1988 so->so_snd.sb_flags |= SB_AUTOSIZE; 1989 INP_INFO_RLOCK(&V_tcbinfo); 1990 error = in_pcballoc(so, &V_tcbinfo); 1991 if (error) { 1992 INP_INFO_RUNLOCK(&V_tcbinfo); 1993 return (error); 1994 } 1995 inp = sotoinpcb(so); 1996 #ifdef INET6 1997 if (inp->inp_vflag & INP_IPV6PROTO) { 1998 inp->inp_vflag |= INP_IPV6; 1999 inp->in6p_hops = -1; /* use kernel default */ 2000 } 2001 else 2002 #endif 2003 inp->inp_vflag |= INP_IPV4; 2004 tp = tcp_newtcpcb(inp); 2005 if (tp == NULL) { 2006 in_pcbdetach(inp); 2007 in_pcbfree(inp); 2008 INP_INFO_RUNLOCK(&V_tcbinfo); 2009 return (ENOBUFS); 2010 } 2011 tp->t_state = TCPS_CLOSED; 2012 INP_WUNLOCK(inp); 2013 INP_INFO_RUNLOCK(&V_tcbinfo); 2014 TCPSTATES_INC(TCPS_CLOSED); 2015 return (0); 2016 } 2017 2018 /* 2019 * Initiate (or continue) disconnect. 2020 * If embryonic state, just send reset (once). 2021 * If in ``let data drain'' option and linger null, just drop. 2022 * Otherwise (hard), mark socket disconnecting and drop 2023 * current input data; switch states based on user close, and 2024 * send segment to peer (with FIN). 2025 */ 2026 static void 2027 tcp_disconnect(struct tcpcb *tp) 2028 { 2029 struct inpcb *inp = tp->t_inpcb; 2030 struct socket *so = inp->inp_socket; 2031 2032 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2033 INP_WLOCK_ASSERT(inp); 2034 2035 /* 2036 * Neither tcp_close() nor tcp_drop() should return NULL, as the 2037 * socket is still open. 2038 */ 2039 if (tp->t_state < TCPS_ESTABLISHED) { 2040 tp = tcp_close(tp); 2041 KASSERT(tp != NULL, 2042 ("tcp_disconnect: tcp_close() returned NULL")); 2043 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 2044 tp = tcp_drop(tp, 0); 2045 KASSERT(tp != NULL, 2046 ("tcp_disconnect: tcp_drop() returned NULL")); 2047 } else { 2048 soisdisconnecting(so); 2049 sbflush(&so->so_rcv); 2050 tcp_usrclosed(tp); 2051 if (!(inp->inp_flags & INP_DROPPED)) 2052 tp->t_fb->tfb_tcp_output(tp); 2053 } 2054 } 2055 2056 /* 2057 * User issued close, and wish to trail through shutdown states: 2058 * if never received SYN, just forget it. If got a SYN from peer, 2059 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 2060 * If already got a FIN from peer, then almost done; go to LAST_ACK 2061 * state. In all other cases, have already sent FIN to peer (e.g. 2062 * after PRU_SHUTDOWN), and just have to play tedious game waiting 2063 * for peer to send FIN or not respond to keep-alives, etc. 2064 * We can let the user exit from the close as soon as the FIN is acked. 2065 */ 2066 static void 2067 tcp_usrclosed(struct tcpcb *tp) 2068 { 2069 2070 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2071 INP_WLOCK_ASSERT(tp->t_inpcb); 2072 2073 switch (tp->t_state) { 2074 case TCPS_LISTEN: 2075 #ifdef TCP_OFFLOAD 2076 tcp_offload_listen_stop(tp); 2077 #endif 2078 tcp_state_change(tp, TCPS_CLOSED); 2079 /* FALLTHROUGH */ 2080 case TCPS_CLOSED: 2081 tp = tcp_close(tp); 2082 /* 2083 * tcp_close() should never return NULL here as the socket is 2084 * still open. 2085 */ 2086 KASSERT(tp != NULL, 2087 ("tcp_usrclosed: tcp_close() returned NULL")); 2088 break; 2089 2090 case TCPS_SYN_SENT: 2091 case TCPS_SYN_RECEIVED: 2092 tp->t_flags |= TF_NEEDFIN; 2093 break; 2094 2095 case TCPS_ESTABLISHED: 2096 tcp_state_change(tp, TCPS_FIN_WAIT_1); 2097 break; 2098 2099 case TCPS_CLOSE_WAIT: 2100 tcp_state_change(tp, TCPS_LAST_ACK); 2101 break; 2102 } 2103 if (tp->t_state >= TCPS_FIN_WAIT_2) { 2104 soisdisconnected(tp->t_inpcb->inp_socket); 2105 /* Prevent the connection hanging in FIN_WAIT_2 forever. */ 2106 if (tp->t_state == TCPS_FIN_WAIT_2) { 2107 int timeout; 2108 2109 timeout = (tcp_fast_finwait2_recycle) ? 2110 tcp_finwait2_timeout : TP_MAXIDLE(tp); 2111 tcp_timer_activate(tp, TT_2MSL, timeout); 2112 } 2113 } 2114 } 2115 2116 #ifdef DDB 2117 static void 2118 db_print_indent(int indent) 2119 { 2120 int i; 2121 2122 for (i = 0; i < indent; i++) 2123 db_printf(" "); 2124 } 2125 2126 static void 2127 db_print_tstate(int t_state) 2128 { 2129 2130 switch (t_state) { 2131 case TCPS_CLOSED: 2132 db_printf("TCPS_CLOSED"); 2133 return; 2134 2135 case TCPS_LISTEN: 2136 db_printf("TCPS_LISTEN"); 2137 return; 2138 2139 case TCPS_SYN_SENT: 2140 db_printf("TCPS_SYN_SENT"); 2141 return; 2142 2143 case TCPS_SYN_RECEIVED: 2144 db_printf("TCPS_SYN_RECEIVED"); 2145 return; 2146 2147 case TCPS_ESTABLISHED: 2148 db_printf("TCPS_ESTABLISHED"); 2149 return; 2150 2151 case TCPS_CLOSE_WAIT: 2152 db_printf("TCPS_CLOSE_WAIT"); 2153 return; 2154 2155 case TCPS_FIN_WAIT_1: 2156 db_printf("TCPS_FIN_WAIT_1"); 2157 return; 2158 2159 case TCPS_CLOSING: 2160 db_printf("TCPS_CLOSING"); 2161 return; 2162 2163 case TCPS_LAST_ACK: 2164 db_printf("TCPS_LAST_ACK"); 2165 return; 2166 2167 case TCPS_FIN_WAIT_2: 2168 db_printf("TCPS_FIN_WAIT_2"); 2169 return; 2170 2171 case TCPS_TIME_WAIT: 2172 db_printf("TCPS_TIME_WAIT"); 2173 return; 2174 2175 default: 2176 db_printf("unknown"); 2177 return; 2178 } 2179 } 2180 2181 static void 2182 db_print_tflags(u_int t_flags) 2183 { 2184 int comma; 2185 2186 comma = 0; 2187 if (t_flags & TF_ACKNOW) { 2188 db_printf("%sTF_ACKNOW", comma ? ", " : ""); 2189 comma = 1; 2190 } 2191 if (t_flags & TF_DELACK) { 2192 db_printf("%sTF_DELACK", comma ? ", " : ""); 2193 comma = 1; 2194 } 2195 if (t_flags & TF_NODELAY) { 2196 db_printf("%sTF_NODELAY", comma ? ", " : ""); 2197 comma = 1; 2198 } 2199 if (t_flags & TF_NOOPT) { 2200 db_printf("%sTF_NOOPT", comma ? ", " : ""); 2201 comma = 1; 2202 } 2203 if (t_flags & TF_SENTFIN) { 2204 db_printf("%sTF_SENTFIN", comma ? ", " : ""); 2205 comma = 1; 2206 } 2207 if (t_flags & TF_REQ_SCALE) { 2208 db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); 2209 comma = 1; 2210 } 2211 if (t_flags & TF_RCVD_SCALE) { 2212 db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); 2213 comma = 1; 2214 } 2215 if (t_flags & TF_REQ_TSTMP) { 2216 db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); 2217 comma = 1; 2218 } 2219 if (t_flags & TF_RCVD_TSTMP) { 2220 db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); 2221 comma = 1; 2222 } 2223 if (t_flags & TF_SACK_PERMIT) { 2224 db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); 2225 comma = 1; 2226 } 2227 if (t_flags & TF_NEEDSYN) { 2228 db_printf("%sTF_NEEDSYN", comma ? ", " : ""); 2229 comma = 1; 2230 } 2231 if (t_flags & TF_NEEDFIN) { 2232 db_printf("%sTF_NEEDFIN", comma ? ", " : ""); 2233 comma = 1; 2234 } 2235 if (t_flags & TF_NOPUSH) { 2236 db_printf("%sTF_NOPUSH", comma ? ", " : ""); 2237 comma = 1; 2238 } 2239 if (t_flags & TF_MORETOCOME) { 2240 db_printf("%sTF_MORETOCOME", comma ? ", " : ""); 2241 comma = 1; 2242 } 2243 if (t_flags & TF_LQ_OVERFLOW) { 2244 db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); 2245 comma = 1; 2246 } 2247 if (t_flags & TF_LASTIDLE) { 2248 db_printf("%sTF_LASTIDLE", comma ? ", " : ""); 2249 comma = 1; 2250 } 2251 if (t_flags & TF_RXWIN0SENT) { 2252 db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); 2253 comma = 1; 2254 } 2255 if (t_flags & TF_FASTRECOVERY) { 2256 db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); 2257 comma = 1; 2258 } 2259 if (t_flags & TF_CONGRECOVERY) { 2260 db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); 2261 comma = 1; 2262 } 2263 if (t_flags & TF_WASFRECOVERY) { 2264 db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); 2265 comma = 1; 2266 } 2267 if (t_flags & TF_SIGNATURE) { 2268 db_printf("%sTF_SIGNATURE", comma ? ", " : ""); 2269 comma = 1; 2270 } 2271 if (t_flags & TF_FORCEDATA) { 2272 db_printf("%sTF_FORCEDATA", comma ? ", " : ""); 2273 comma = 1; 2274 } 2275 if (t_flags & TF_TSO) { 2276 db_printf("%sTF_TSO", comma ? ", " : ""); 2277 comma = 1; 2278 } 2279 if (t_flags & TF_ECN_PERMIT) { 2280 db_printf("%sTF_ECN_PERMIT", comma ? ", " : ""); 2281 comma = 1; 2282 } 2283 if (t_flags & TF_FASTOPEN) { 2284 db_printf("%sTF_FASTOPEN", comma ? ", " : ""); 2285 comma = 1; 2286 } 2287 } 2288 2289 static void 2290 db_print_toobflags(char t_oobflags) 2291 { 2292 int comma; 2293 2294 comma = 0; 2295 if (t_oobflags & TCPOOB_HAVEDATA) { 2296 db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); 2297 comma = 1; 2298 } 2299 if (t_oobflags & TCPOOB_HADDATA) { 2300 db_printf("%sTCPOOB_HADDATA", comma ? ", " : ""); 2301 comma = 1; 2302 } 2303 } 2304 2305 static void 2306 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) 2307 { 2308 2309 db_print_indent(indent); 2310 db_printf("%s at %p\n", name, tp); 2311 2312 indent += 2; 2313 2314 db_print_indent(indent); 2315 db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", 2316 LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); 2317 2318 db_print_indent(indent); 2319 db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", 2320 &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); 2321 2322 db_print_indent(indent); 2323 db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, 2324 &tp->t_timers->tt_delack, tp->t_inpcb); 2325 2326 db_print_indent(indent); 2327 db_printf("t_state: %d (", tp->t_state); 2328 db_print_tstate(tp->t_state); 2329 db_printf(")\n"); 2330 2331 db_print_indent(indent); 2332 db_printf("t_flags: 0x%x (", tp->t_flags); 2333 db_print_tflags(tp->t_flags); 2334 db_printf(")\n"); 2335 2336 db_print_indent(indent); 2337 db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", 2338 tp->snd_una, tp->snd_max, tp->snd_nxt); 2339 2340 db_print_indent(indent); 2341 db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", 2342 tp->snd_up, tp->snd_wl1, tp->snd_wl2); 2343 2344 db_print_indent(indent); 2345 db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", 2346 tp->iss, tp->irs, tp->rcv_nxt); 2347 2348 db_print_indent(indent); 2349 db_printf("rcv_adv: 0x%08x rcv_wnd: %lu rcv_up: 0x%08x\n", 2350 tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); 2351 2352 db_print_indent(indent); 2353 db_printf("snd_wnd: %lu snd_cwnd: %lu\n", 2354 tp->snd_wnd, tp->snd_cwnd); 2355 2356 db_print_indent(indent); 2357 db_printf("snd_ssthresh: %lu snd_recover: " 2358 "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); 2359 2360 db_print_indent(indent); 2361 db_printf("t_rcvtime: %u t_startime: %u\n", 2362 tp->t_rcvtime, tp->t_starttime); 2363 2364 db_print_indent(indent); 2365 db_printf("t_rttime: %u t_rtsq: 0x%08x\n", 2366 tp->t_rtttime, tp->t_rtseq); 2367 2368 db_print_indent(indent); 2369 db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", 2370 tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); 2371 2372 db_print_indent(indent); 2373 db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " 2374 "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, 2375 tp->t_rttbest); 2376 2377 db_print_indent(indent); 2378 db_printf("t_rttupdated: %lu max_sndwnd: %lu t_softerror: %d\n", 2379 tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); 2380 2381 db_print_indent(indent); 2382 db_printf("t_oobflags: 0x%x (", tp->t_oobflags); 2383 db_print_toobflags(tp->t_oobflags); 2384 db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); 2385 2386 db_print_indent(indent); 2387 db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", 2388 tp->snd_scale, tp->rcv_scale, tp->request_r_scale); 2389 2390 db_print_indent(indent); 2391 db_printf("ts_recent: %u ts_recent_age: %u\n", 2392 tp->ts_recent, tp->ts_recent_age); 2393 2394 db_print_indent(indent); 2395 db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " 2396 "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); 2397 2398 db_print_indent(indent); 2399 db_printf("snd_ssthresh_prev: %lu snd_recover_prev: 0x%08x " 2400 "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, 2401 tp->snd_recover_prev, tp->t_badrxtwin); 2402 2403 db_print_indent(indent); 2404 db_printf("snd_numholes: %d snd_holes first: %p\n", 2405 tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); 2406 2407 db_print_indent(indent); 2408 db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: " 2409 "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata); 2410 2411 /* Skip sackblks, sackhint. */ 2412 2413 db_print_indent(indent); 2414 db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", 2415 tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); 2416 } 2417 2418 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) 2419 { 2420 struct tcpcb *tp; 2421 2422 if (!have_addr) { 2423 db_printf("usage: show tcpcb <addr>\n"); 2424 return; 2425 } 2426 tp = (struct tcpcb *)addr; 2427 2428 db_print_tcpcb(tp, "tcpcb", 0); 2429 } 2430 #endif 2431