1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2012 Chelsio Communications, Inc.
5 * All rights reserved.
6 * Written by: Navdeep Parhar <[email protected]>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #ifdef TCP_OFFLOAD
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/protosw.h>
43 #include <sys/refcount.h>
44 #include <sys/domain.h>
45 #include <sys/fnv_hash.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_types.h>
52 #include <net/if_vlan_var.h>
53 #include <net/route.h>
54 #include <netinet/in.h>
55 #include <netinet/in_fib.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/ip.h>
58 #include <netinet/ip6.h>
59 #include <netinet6/in6_fib.h>
60 #include <netinet6/scope6_var.h>
61 #include <netinet/tcp_timer.h>
62 #define TCPSTATES
63 #include <netinet/tcp_fsm.h>
64 #include <netinet/tcp_var.h>
65 #include <netinet/toecore.h>
66 #include <netinet/cc/cc.h>
67
68 #include "common/common.h"
69 #include "common/t4_msg.h"
70 #include "common/t4_regs.h"
71 #include "t4_clip.h"
72 #include "tom/t4_tom_l2t.h"
73 #include "tom/t4_tom.h"
74
75 /* stid services */
76 static int alloc_stid(struct adapter *, struct listen_ctx *, int);
77 static struct listen_ctx *lookup_stid(struct adapter *, int);
78 static void free_stid(struct adapter *, struct listen_ctx *);
79
80 /* lctx services */
81 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
82 struct vi_info *);
83 static int free_lctx(struct adapter *, struct listen_ctx *);
84 static void hold_lctx(struct listen_ctx *);
85 static void listen_hash_add(struct adapter *, struct listen_ctx *);
86 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
87 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
88 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
89
90 static void send_reset_synqe(struct toedev *, struct synq_entry *);
91
/*
 * Reserve n consecutive, naturally aligned stids for the listener and record
 * the reservation in lctx->stid_region (which is linked into t->stids in
 * ascending stid order).  Returns the absolute (stid_base adjusted) stid
 * number, or -1 if no suitable run of stids is available.
 */
static int
alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
{
	struct tid_info *t = &sc->tids;
	u_int stid, n, f, mask;
	struct stid_region *sr = &lctx->stid_region;

	/*
	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
	 * the TCAM. The start of the stid region is properly aligned (the chip
	 * requires each region to be 128-cell aligned).
	 */
	n = isipv6 ? 2 : 1;
	mask = n - 1;
	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
	    ("%s: stid region (%u, %u) not properly aligned. n = %u",
	    __func__, t->stid_base, t->nstids, n));

	mtx_lock(&t->stid_lock);
	if (n > t->nstids - t->stids_in_use) {
		mtx_unlock(&t->stid_lock);
		return (-1);
	}

	if (t->nstids_free_head >= n) {
		/*
		 * This allocation will definitely succeed because the region
		 * starts at a good alignment and we just checked we have enough
		 * stids free.
		 *
		 * Carve from the tail of the free space at the head of the
		 * region; f extra stids are consumed for alignment and folded
		 * into this reservation's "free" count.
		 */
		f = t->nstids_free_head & mask;
		t->nstids_free_head -= n + f;
		stid = t->nstids_free_head;
		TAILQ_INSERT_HEAD(&t->stids, sr, link);
	} else {
		struct stid_region *s;

		/*
		 * Walk the regions in stid order, keeping a running total in
		 * 'stid' of where each region's free gap ends, and fit the
		 * reservation (plus any alignment waste f) into the first gap
		 * large enough.
		 */
		stid = t->nstids_free_head;
		TAILQ_FOREACH(s, &t->stids, link) {
			stid += s->used + s->free;
			f = stid & mask;
			if (s->free >= n + f) {
				stid -= n + f;
				s->free -= n + f;
				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
				goto allocated;
			}
		}

		/* The running total must account for every stid. */
		if (__predict_false(stid != t->nstids)) {
			panic("%s: stids TAILQ (%p) corrupt."
			    " At %d instead of %d at the end of the queue.",
			    __func__, &t->stids, stid, t->nstids);
		}

		mtx_unlock(&t->stid_lock);
		return (-1);
	}

allocated:
	sr->used = n;
	sr->free = f;
	t->stids_in_use += n;
	t->stid_tab[stid] = lctx;
	mtx_unlock(&t->stid_lock);

	KASSERT(((stid + t->stid_base) & mask) == 0,
	    ("%s: EDOOFUS.", __func__));
	return (stid + t->stid_base);
}
162
163 static struct listen_ctx *
lookup_stid(struct adapter * sc,int stid)164 lookup_stid(struct adapter *sc, int stid)
165 {
166 struct tid_info *t = &sc->tids;
167
168 return (t->stid_tab[stid - t->stid_base]);
169 }
170
/*
 * Return the stids recorded in lctx->stid_region to the free pool.  The
 * region's stids (used + alignment waste) are folded into the free gap of
 * the previous region, or into the free head if this was the first region.
 */
static void
free_stid(struct adapter *sc, struct listen_ctx *lctx)
{
	struct tid_info *t = &sc->tids;
	struct stid_region *sr = &lctx->stid_region;
	struct stid_region *s;

	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));

	mtx_lock(&t->stid_lock);
	s = TAILQ_PREV(sr, stid_head, link);
	if (s != NULL)
		s->free += sr->used + sr->free;	/* merge into predecessor */
	else
		t->nstids_free_head += sr->used + sr->free;	/* first region */
	KASSERT(t->stids_in_use >= sr->used,
	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
	    t->stids_in_use, sr->used));
	t->stids_in_use -= sr->used;
	TAILQ_REMOVE(&t->stids, sr, link);
	mtx_unlock(&t->stid_lock);
}
193
194 static struct listen_ctx *
alloc_lctx(struct adapter * sc,struct inpcb * inp,struct vi_info * vi)195 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
196 {
197 struct listen_ctx *lctx;
198
199 INP_WLOCK_ASSERT(inp);
200
201 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
202 if (lctx == NULL)
203 return (NULL);
204
205 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
206 if (lctx->stid < 0) {
207 free(lctx, M_CXGBE);
208 return (NULL);
209 }
210
211 if (inp->inp_vflag & INP_IPV6 &&
212 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
213 lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
214 if (lctx->ce == NULL) {
215 free(lctx, M_CXGBE);
216 return (NULL);
217 }
218 }
219
220 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
221 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
222 refcount_init(&lctx->refcount, 1);
223
224 lctx->inp = inp;
225 lctx->vnet = inp->inp_socket->so_vnet;
226 in_pcbref(inp);
227
228 return (lctx);
229 }
230
/*
 * Final teardown of an lctx: release the CLIP entry (if any), return the
 * stids, and free the structure.  Returns the result of dropping our
 * reference on the listening socket's inpcb (non-zero means the inp itself
 * was freed).  Don't call this directly, use release_lctx instead.
 */
static int
free_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcount == 0,
	    ("%s: refcount %d", __func__, lctx->refcount));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	if (lctx->ce)
		t4_release_lip(sc, lctx->ce);
	free_stid(sc, lctx);
	free(lctx, M_CXGBE);

	return (in_pcbrele_wlocked(inp));
}
252
/* Acquire an additional reference on the lctx. */
static void
hold_lctx(struct listen_ctx *lctx)
{

	refcount_acquire(&lctx->refcount);
}
259
260 static inline uint32_t
listen_hashfn(void * key,u_long mask)261 listen_hashfn(void *key, u_long mask)
262 {
263
264 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
265 }
266
267 /*
268 * Add a listen_ctx entry to the listen hash table.
269 */
270 static void
listen_hash_add(struct adapter * sc,struct listen_ctx * lctx)271 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
272 {
273 struct tom_data *td = sc->tom_softc;
274 int bucket = listen_hashfn(lctx->inp, td->listen_mask);
275
276 mtx_lock(&td->lctx_hash_lock);
277 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
278 td->lctx_count++;
279 mtx_unlock(&td->lctx_hash_lock);
280 }
281
282 /*
283 * Look for the listening socket's context entry in the hash and return it.
284 */
285 static struct listen_ctx *
listen_hash_find(struct adapter * sc,struct inpcb * inp)286 listen_hash_find(struct adapter *sc, struct inpcb *inp)
287 {
288 struct tom_data *td = sc->tom_softc;
289 int bucket = listen_hashfn(inp, td->listen_mask);
290 struct listen_ctx *lctx;
291
292 mtx_lock(&td->lctx_hash_lock);
293 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
294 if (lctx->inp == inp)
295 break;
296 }
297 mtx_unlock(&td->lctx_hash_lock);
298
299 return (lctx);
300 }
301
302 /*
303 * Removes the listen_ctx structure for inp from the hash and returns it.
304 */
305 static struct listen_ctx *
listen_hash_del(struct adapter * sc,struct inpcb * inp)306 listen_hash_del(struct adapter *sc, struct inpcb *inp)
307 {
308 struct tom_data *td = sc->tom_softc;
309 int bucket = listen_hashfn(inp, td->listen_mask);
310 struct listen_ctx *lctx, *l;
311
312 mtx_lock(&td->lctx_hash_lock);
313 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
314 if (lctx->inp == inp) {
315 LIST_REMOVE(lctx, link);
316 td->lctx_count--;
317 break;
318 }
319 }
320 mtx_unlock(&td->lctx_hash_lock);
321
322 return (lctx);
323 }
324
325 /*
326 * Releases a hold on the lctx. Must be called with the listening socket's inp
327 * locked. The inp may be freed by this function and it returns NULL to
328 * indicate this.
329 */
330 static struct inpcb *
release_lctx(struct adapter * sc,struct listen_ctx * lctx)331 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
332 {
333 struct inpcb *inp = lctx->inp;
334 int inp_freed = 0;
335
336 INP_WLOCK_ASSERT(inp);
337 if (refcount_release(&lctx->refcount))
338 inp_freed = free_lctx(sc, lctx);
339
340 return (inp_freed ? NULL : inp);
341 }
342
/*
 * Abort a connection that is still on the syn queue: hand the hardware a
 * single wrqe carrying a flowc WR (which must precede any other WR on the
 * tid) immediately followed by an abort_req.  TPF_ABORT_SHUTDOWN guards
 * against issuing the abort more than once; the final cleanup happens when
 * the abort reply arrives (do_abort_rpl_synqe).
 */
static void
send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
{
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m = synqe->syn;
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	struct vi_info *vi = ifp->if_softc;
	struct port_info *pi = vi->pi;
	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	struct cpl_abort_req *req;
	int flowclen;
	struct sge_wrq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	const int nparams = 6;
	const u_int pfvf = sc->pf << S_FW_VIID_PFN;

	INP_WLOCK_ASSERT(synqe->lctx->inp);

	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
	    __func__, synqe, synqe->flags, synqe->tid,
	    synqe->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");
	if (synqe->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */
	synqe->flags |= TPF_ABORT_SHUTDOWN;

	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];

	/* The wrqe will have two WRs - a flowc followed by an abort_req */
	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	/* The abort_req is placed at the next EQ descriptor boundary. */
	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));

	/* First the flowc ... */
	memset(flowc, 0, wr->wr_len);
	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(synqe->tid));
	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
	flowc->mnemval[4].val = htobe32(512);
	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
	flowc->mnemval[5].val = htobe32(512);
	synqe->flags |= TPF_FLOWC_WR_SENT;

	/* ... then ABORT request */
	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
	req->rsvd0 = 0;	/* don't have a snd_nxt */
	req->rsvd1 = 1;	/* no data sent yet */
	req->cmd = CPL_ABORT_SEND_RST;

	t4_l2t_send(sc, wr, e);
}
413
/*
 * Program a hardware IPv4 listener: send a CPL_PASS_OPEN_REQ for the
 * listening socket's local address/port on the lctx's control queue.  Peer
 * address/port are wildcarded.  The reply (CPL_PASS_OPEN_RPL) is handled in
 * do_pass_open_rpl.  Returns 0 or ENOMEM.
 */
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		log(LOG_ERR, "%s: allocation failure", __func__);
		return (ENOMEM);
	}
	req = wrtod(wr);

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;	/* already network byte order */
	req->peer_port = 0;
	req->local_ip = inp->inp_laddr.s_addr;
	req->peer_ip = 0;
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	/* Ask mode: each SYN is delivered to the driver for a verdict. */
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

	t4_wrq_tx(sc, wr);
	return (0);
}
441
/*
 * Program a hardware IPv6 listener: send a CPL_PASS_OPEN_REQ6 for the
 * listening socket's local address/port on the lctx's control queue.  Peer
 * address/port are wildcarded.  The reply (CPL_PASS_OPEN_RPL) is handled in
 * do_pass_open_rpl.  Returns 0 or ENOMEM.
 */
static int
create_server6(struct adapter *sc, struct listen_ctx *lctx)
{
	struct wrqe *wr;
	struct cpl_pass_open_req6 *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
	if (wr == NULL) {
		log(LOG_ERR, "%s: allocation failure", __func__);
		return (ENOMEM);
	}
	req = wrtod(wr);

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
	req->local_port = inp->inp_lport;	/* already network byte order */
	req->peer_port = 0;
	/*
	 * Copy the 128-bit address as two 64-bit words, preserving wire byte
	 * order.  NOTE(review): assumes in6p_laddr is 8-byte aligned for
	 * these uint64_t loads — confirm.
	 */
	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
	req->peer_ip_hi = 0;
	req->peer_ip_lo = 0;
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	/* Ask mode: each SYN is delivered to the driver for a verdict. */
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

	t4_wrq_tx(sc, wr);
	return (0);
}
471
472 static int
destroy_server(struct adapter * sc,struct listen_ctx * lctx)473 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
474 {
475 struct wrqe *wr;
476 struct cpl_close_listsvr_req *req;
477
478 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
479 if (wr == NULL) {
480 /* XXX */
481 panic("%s: allocation failure.", __func__);
482 }
483 req = wrtod(wr);
484
485 INIT_TP_WR(req, 0);
486 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
487 lctx->stid));
488 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
489 req->rsvd = htobe16(0);
490
491 t4_wrq_tx(sc, wr);
492 return (0);
493 }
494
/*
 * Start a listening server by sending a passive open request to HW.
 *
 * Can't take adapter lock here and access to sc->flags,
 * sc->offload_map, if_capenable are all race prone.
 *
 * Consults the offload policy first, skips loopback addresses, and uses the
 * queues of the first initialized VI with IFCAP_TOE enabled.  Always returns
 * 0 — failure to offload simply leaves the listener software-only.
 */
int
t4_listen_start(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct vi_info *vi;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	int i, rc, v;
	struct offload_settings settings;

	INP_WLOCK_ASSERT(inp);

	/* Honor the administrator's offload policy for listeners. */
	rw_rlock(&sc->policy_lock);
	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
	    EVL_MAKETAG(0xfff, 0, 0), inp);
	rw_runlock(&sc->policy_lock);
	if (!settings.offload)
		return (0);

	/* Don't start a hardware listener for any loopback address. */
	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
		return (0);
	if (!(inp->inp_vflag & INP_IPV6) &&
	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
		return (0);
#if 0
	ADAPTER_LOCK(sc);
	if (IS_BUSY(sc)) {
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}

	KASSERT(uld_active(sc, ULD_TOM),
	    ("%s: TOM not initialized", __func__));
#endif

	/*
	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
	 * such VI's queues to send the passive open and receive the reply to
	 * it.
	 *
	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		pi = sc->port[i];
		for_each_vi(pi, v, vi) {
			if (vi->flags & VI_INIT_DONE &&
			    vi->ifp->if_capenable & IFCAP_TOE)
				goto found;
		}
	}
	goto done;	/* no port that's UP with IFCAP_TOE enabled */
found:

	if (listen_hash_find(sc, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(sc, inp, vi);
	if (lctx == NULL) {
		log(LOG_ERR,
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
		goto done;
	}
	listen_hash_add(sc, lctx);

	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
	    inp->inp_vflag);

	if (inp->inp_vflag & INP_IPV6)
		rc = create_server6(sc, lctx);
	else
		rc = create_server(sc, lctx);
	if (rc != 0) {
		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
		    __func__, device_get_nameunit(sc->dev), rc);
		(void) listen_hash_del(sc, inp);
		inp = release_lctx(sc, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
		goto done;
	}
	/* Cleared by do_pass_open_rpl when the hardware answers. */
	lctx->flags |= LCTX_RPL_PENDING;
done:
#if 0
	ADAPTER_UNLOCK(sc);
#endif
	return (0);
}
595
/*
 * Stop the hardware listener (if any) for inp.  Removes the lctx from the
 * listen hash so no new synqe can find it, then asks the chip to close the
 * server.  Returns ENOENT if there was no hardware listener, EINPROGRESS if
 * the PASS_OPEN reply is still outstanding (do_pass_open_rpl will finish the
 * teardown), 0 otherwise.
 */
int
t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(sc, inp);
	if (lctx == NULL)
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
	    lctx, lctx->flags);

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		return (EINPROGRESS);
	}

	destroy_server(sc, lctx);
	return (0);
}
623
624 static inline struct synq_entry *
alloc_synqe(struct adapter * sc __unused,struct listen_ctx * lctx,int flags)625 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
626 {
627 struct synq_entry *synqe;
628
629 INP_WLOCK_ASSERT(lctx->inp);
630 MPASS(flags == M_WAITOK || flags == M_NOWAIT);
631
632 synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
633 if (__predict_true(synqe != NULL)) {
634 synqe->flags = TPF_SYNQE;
635 refcount_init(&synqe->refcnt, 1);
636 synqe->lctx = lctx;
637 hold_lctx(lctx); /* Every synqe has a ref on its lctx. */
638 synqe->syn = NULL;
639 }
640
641 return (synqe);
642 }
643
/* Acquire an additional reference on the synqe. */
static inline void
hold_synqe(struct synq_entry *synqe)
{

	refcount_acquire(&synqe->refcnt);
}
650
/*
 * Drop a reference on the synqe.  On last release the synqe (and its saved
 * SYN mbuf) are freed and the synqe's lctx reference is dropped, which may
 * in turn free the listening inp; returns NULL in that case, otherwise the
 * still-locked inp.
 */
static inline struct inpcb *
release_synqe(struct adapter *sc, struct synq_entry *synqe)
{
	struct inpcb *inp;

	MPASS(synqe->flags & TPF_SYNQE);
	MPASS(synqe->lctx != NULL);

	inp = synqe->lctx->inp;
	MPASS(inp != NULL);
	INP_WLOCK_ASSERT(inp);

	if (refcount_release(&synqe->refcnt)) {
		inp = release_lctx(sc, synqe->lctx);
		m_freem(synqe->syn);
		free(synqe, M_CXGBE);
	}

	return (inp);
}
671
/*
 * TOE callback: our synqe was installed in the kernel syncache.  Take a
 * reference on it for the syncache (dropped in t4_syncache_removed).
 */
void
t4_syncache_added(struct toedev *tod __unused, void *arg)
{
	struct synq_entry *synqe = arg;

	hold_synqe(synqe);
}
679
/*
 * TOE callback: the syncache entry referencing this synqe went away.  Drop
 * the reference taken in t4_syncache_added.
 */
void
t4_syncache_removed(struct toedev *tod, void *arg)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct inpcb *inp = synqe->lctx->inp;

	/*
	 * XXX: this is a LOR but harmless when running from the softclock.
	 */
	INP_WLOCK(inp);
	inp = release_synqe(sc, synqe);
	if (inp != NULL)
		INP_WUNLOCK(inp);
}
695
/*
 * TOE callback: the stack wants to respond to a syncache entry with the
 * segment in m.  The segment is not transmitted here; on the first call only
 * (ok_to_respond gate) the stack's sequence numbers and timestamp are parsed
 * out of it and saved on the synqe for later use, and the mbuf is freed.
 * Always returns 0.
 */
int
t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
	struct synq_entry *synqe = arg;

	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
		struct tcpopt to;
		struct ip *ip = mtod(m, struct ip *);
		struct tcphdr *th;

		/* Step over the v4 or v6 IP header to reach the TCP header. */
		if (ip->ip_v == IPVERSION)
			th = (void *)(ip + 1);
		else
			th = (void *)((struct ip6_hdr *)ip + 1);
		bzero(&to, sizeof(to));
		tcp_dooptions(&to, (void *)(th + 1),
		    (th->th_off << 2) - sizeof(*th), TO_SYN);

		/* save these for later */
		synqe->iss = be32toh(th->th_seq);
		synqe->irs = be32toh(th->th_ack) - 1;
		synqe->ts = to.to_tsval;
	}

	m_freem(m);	/* don't need this any more */
	return (0);
}
723
/*
 * Handler for CPL_PASS_OPEN_RPL: the chip's answer to create_server/
 * create_server6.  Clears LCTX_RPL_PENDING and then reconciles the hardware
 * state with what happened to the listening socket in the meantime (it may
 * have been closed while the reply was in flight).
 */
static int
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_OPEN_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	INP_WLOCK(inp);

	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
	    __func__, stid, status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (status != CPL_ERR_NONE)
		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);

#ifdef INVARIANTS
	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(sc, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));
	}
#endif

	/*
	 * Listener was closed and the hardware open failed too: nothing left
	 * to wait for, just drop our lctx reference.
	 */
	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
		if (release_lctx(sc, lctx) != NULL)
			INP_WUNLOCK(inp);
		return (status);
	}

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener. Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);
		INP_WUNLOCK(inp);
		return (status);
	}

	/*
	 * Failed to start hardware listener. Take inp out of the hash and
	 * release our reference on it. An error message has been logged
	 * already.
	 */
	if (status != CPL_ERR_NONE) {
		listen_hash_del(sc, inp);
		if (release_lctx(sc, lctx) != NULL)
			INP_WUNLOCK(inp);
		return (status);
	}

	/* hardware listener open for business */

	INP_WUNLOCK(inp);
	return (status);
}
798
/*
 * Handler for CPL_CLOSE_LISTSRV_RPL: the chip's answer to destroy_server.
 * On success drops the lctx reference that was keeping the listener's state
 * alive; on failure logs and leaves everything in place.
 */
static int
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);

	if (status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, status, stid);
		return (status);
	}

	INP_WLOCK(inp);
	inp = release_lctx(sc, lctx);	/* NULL if the inp was freed too */
	if (inp != NULL)
		INP_WUNLOCK(inp);

	return (status);
}
833
/*
 * Tear down a synqe's hardware state: remove and release its tid(s), release
 * its L2 entry, and drop the synqe's base reference.  Called with the
 * listening inp write-locked; the lock is released (or the inp is freed)
 * before return.
 */
static void
done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
{
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
	int ntids;

	INP_WLOCK_ASSERT(inp);
	/* An IPv6 connection occupies 2 tids, IPv4 just 1. */
	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;

	remove_tid(sc, synqe->tid, ntids);
	release_tid(sc, synqe->tid, lctx->ctrlq);
	t4_l2t_release(e);
	inp = release_synqe(sc, synqe);
	if (inp)
		INP_WUNLOCK(inp);
}
852
853 void
synack_failure_cleanup(struct adapter * sc,int tid)854 synack_failure_cleanup(struct adapter *sc, int tid)
855 {
856 struct synq_entry *synqe = lookup_tid(sc, tid);
857
858 INP_WLOCK(synqe->lctx->inp);
859 done_with_synqe(sc, synqe);
860 }
861
/*
 * Handler for CPL_ABORT_REQ_RSS on a tid that is still a synqe: the peer (or
 * the chip) aborted the embryonic connection.  Negative advice is ignored;
 * otherwise the synqe is torn down here unless we had already initiated our
 * own abort, in which case the abort reply does the teardown.  In all
 * non-ignored cases the chip is owed a CPL_ABORT_RPL.
 */
int
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct sge_wrq *ofld_txq;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	if (negative_advice(cpl->status))
		return (0);	/* Ignore negative advice */

	INP_WLOCK(inp);

	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources. Otherwise we tear everything down right here
	 * right now. We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}

	done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */
done:
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}
908
/*
 * Handler for CPL_ABORT_RPL_RSS on a synqe: the chip acknowledged the abort
 * we sent in send_reset_synqe.  This is where the synqe teardown deferred by
 * TPF_ABORT_SHUTDOWN finally happens.
 */
int
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	INP_WLOCK(inp);
	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
	    __func__, synqe, synqe->flags));

	done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */

	return (0);
}
941
/*
 * TOE callback: the syncache entry was expanded into a full connection on
 * socket so.  Attach the toepcb to the socket, mark the connection
 * established with the sequence numbers/options saved on the synqe, and
 * transfer the tid from the synqe to the toepcb.
 */
void
t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
#ifdef INVARIANTS
	struct inpcb *inp = sotoinpcb(so);
#endif
	struct toepcb *toep = synqe->toep;

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);	/* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: %p not a synq_entry?", __func__, arg));
	MPASS(toep->tid == synqe->tid);

	offload_socket(so, toep);
	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
	toep->flags |= TPF_CPL_PENDING;
	update_tid(sc, synqe->tid, toep);
	synqe->flags |= TPF_SYNQE_EXPANDED;
}
964
965 static void
t4opt_to_tcpopt(const struct tcp_options * t4opt,struct tcpopt * to)966 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
967 {
968 bzero(to, sizeof(*to));
969
970 if (t4opt->mss) {
971 to->to_flags |= TOF_MSS;
972 to->to_mss = be16toh(t4opt->mss);
973 }
974
975 if (t4opt->wsf > 0 && t4opt->wsf < 15) {
976 to->to_flags |= TOF_SCALE;
977 to->to_wscale = t4opt->wsf;
978 }
979
980 if (t4opt->tstamp)
981 to->to_flags |= TOF_TS;
982
983 if (t4opt->sack)
984 to->to_flags |= TOF_SACKPERM;
985 }
986
/*
 * Parse the protocol headers that follow a CPL_PASS_ACCEPT_REQ in mbuf m and
 * fill in the connection info (inc, if non-NULL) and a host-byte-order copy
 * of the TCP header (th, if non-NULL).  The header length fields in cpl are
 * encoded differently on T6+ chips.
 */
static void
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
    struct in_conninfo *inc, struct tcphdr *th)
{
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	const struct ether_header *eh;
	unsigned int hlen = be32toh(cpl->hdr_len);
	uintptr_t l3hdr;
	const struct tcphdr *tcp;

	eh = (const void *)(cpl + 1);
	if (chip_id(sc) >= CHELSIO_T6) {
		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
	} else {
		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
	}

	if (inc) {
		bzero(inc, sizeof(*inc));
		inc->inc_fport = tcp->th_sport;	/* network byte order */
		inc->inc_lport = tcp->th_dport;	/* network byte order */
		/* Distinguish v4 from v6 by the IP version nibble. */
		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
			const struct ip *ip = (const void *)l3hdr;

			inc->inc_faddr = ip->ip_src;
			inc->inc_laddr = ip->ip_dst;
		} else {
			const struct ip6_hdr *ip6 = (const void *)l3hdr;

			inc->inc_flags |= INC_ISIPV6;
			inc->inc6_faddr = ip6->ip6_src;
			inc->inc6_laddr = ip6->ip6_dst;
		}
	}

	if (th) {
		bcopy(tcp, th, sizeof(*th));
		tcp_fields_to_host(th);	/* just like tcp_input */
	}
}
1029
/*
 * Resolve the next hop towards the peer (inc's foreign address) and return
 * an L2 table entry for it, or NULL if the route does not go out via the
 * given ifnet.  Link-local IPv6 destinations skip the FIB lookup entirely.
 */
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
    struct in_conninfo *inc)
{
	struct l2t_entry *e;
	struct sockaddr_in6 sin6;	/* big enough for sockaddr_in too */
	struct sockaddr *dst = (void *)&sin6;

	if (inc->inc_flags & INC_ISIPV6) {
		struct nhop6_basic nh6;

		bzero(dst, sizeof(struct sockaddr_in6));
		dst->sa_len = sizeof(struct sockaddr_in6);
		dst->sa_family = AF_INET6;

		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
			/* no need for route lookup */
			e = t4_l2t_get(pi, ifp, dst);
			return (e);
		}

		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
		    0, 0, 0, &nh6) != 0)
			return (NULL);
		if (nh6.nh_ifp != ifp)
			return (NULL);	/* next hop not via the SYN's ifnet */
		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
	} else {
		struct nhop4_basic nh4;

		/*
		 * NOTE(review): unlike the v6 branch, the sockaddr isn't
		 * bzero'd here; presumably t4_l2t_get only looks at sa_len,
		 * sa_family and sin_addr — confirm against t4_l2t_get().
		 */
		dst->sa_len = sizeof(struct sockaddr_in);
		dst->sa_family = AF_INET;

		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
		    &nh4) != 0)
			return (NULL);
		if (nh4.nh_ifp != ifp)
			return (NULL);	/* next hop not via the SYN's ifnet */
		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
	}

	e = t4_l2t_get(pi, ifp, dst);
	return (e);
}
1074
1075 static int
send_synack(struct adapter * sc,struct synq_entry * synqe,uint64_t opt0,uint32_t opt2,int tid)1076 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
1077 uint32_t opt2, int tid)
1078 {
1079 struct wrqe *wr;
1080 struct cpl_pass_accept_rpl *rpl;
1081 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
1082
1083 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1084 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
1085 if (wr == NULL)
1086 return (ENOMEM);
1087 rpl = wrtod(wr);
1088
1089 if (is_t4(sc))
1090 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1091 else {
1092 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1093
1094 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
1095 rpl5->iss = htobe32(synqe->iss);
1096 }
1097 rpl->opt0 = opt0;
1098 rpl->opt2 = opt2;
1099
1100 return (t4_l2t_send(sc, wr, e));
1101 }
1102
/*
 * Abort offload of an incoming connection request.  If 'tunnel' is true
 * the SYN mbuf is preserved so the reject path can hand it to the kernel's
 * software stack; otherwise it is freed here.  reject_reason records the
 * source line for the KTR trace at the reject label.
 */
#define REJECT_PASS_ACCEPT_REQ(tunnel) do { \
	if (!tunnel) { \
		m_freem(m); \
		m = NULL; \
	} \
	reject_reason = __LINE__; \
	goto reject; \
} while (0)

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these
 * flags, which must therefore be at the same offset in both structures.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1117
1118 /*
1119 * Incoming SYN on a listening socket.
1120 *
1121 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1122 * etc.
1123 */
1124 static int
do_pass_accept_req(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)1125 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1126 struct mbuf *m)
1127 {
1128 struct adapter *sc = iq->adapter;
1129 struct toedev *tod;
1130 const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1131 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1132 unsigned int tid = GET_TID(cpl);
1133 struct listen_ctx *lctx = lookup_stid(sc, stid);
1134 struct inpcb *inp;
1135 struct socket *so;
1136 struct in_conninfo inc;
1137 struct tcphdr th;
1138 struct tcpopt to;
1139 struct port_info *pi;
1140 struct vi_info *vi;
1141 struct ifnet *hw_ifp, *ifp;
1142 struct l2t_entry *e = NULL;
1143 struct synq_entry *synqe = NULL;
1144 int reject_reason, v, ntids;
1145 uint16_t vid, l2info;
1146 struct epoch_tracker et;
1147 #ifdef INVARIANTS
1148 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1149 #endif
1150 struct offload_settings settings;
1151
1152 KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1153 ("%s: unexpected opcode 0x%x", __func__, opcode));
1154 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1155
1156 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
1157 lctx);
1158
1159 CURVNET_SET(lctx->vnet); /* before any potential REJECT */
1160
1161 /*
1162 * Use the MAC index to lookup the associated VI. If this SYN didn't
1163 * match a perfect MAC filter, punt.
1164 */
1165 l2info = be16toh(cpl->l2info);
1166 pi = sc->port[G_SYN_INTF(l2info)];
1167 if (!(l2info & F_SYN_XACT_MATCH)) {
1168 REJECT_PASS_ACCEPT_REQ(false);
1169 }
1170 for_each_vi(pi, v, vi) {
1171 if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
1172 goto found;
1173 }
1174 REJECT_PASS_ACCEPT_REQ(false);
1175 found:
1176 hw_ifp = vi->ifp; /* the cxgbe ifnet */
1177 m->m_pkthdr.rcvif = hw_ifp;
1178 tod = TOEDEV(hw_ifp);
1179
1180 /*
1181 * Don't offload if the peer requested a TCP option that's not known to
1182 * the silicon. Send the SYN to the kernel instead.
1183 */
1184 if (__predict_false(cpl->tcpopt.unknown))
1185 REJECT_PASS_ACCEPT_REQ(true);
1186
1187 /*
1188 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1189 * involved. Don't offload if the SYN had a VLAN tag and the vid
1190 * doesn't match anything on this interface.
1191 *
1192 * XXX: lagg support, lagg + vlan support.
1193 */
1194 vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1195 if (vid != 0xfff && vid != 0) {
1196 ifp = VLAN_DEVAT(hw_ifp, vid);
1197 if (ifp == NULL)
1198 REJECT_PASS_ACCEPT_REQ(true);
1199 } else
1200 ifp = hw_ifp;
1201
1202 /*
1203 * Don't offload if the ifnet that the SYN came in on is not in the same
1204 * vnet as the listening socket.
1205 */
1206 if (lctx->vnet != ifp->if_vnet)
1207 REJECT_PASS_ACCEPT_REQ(true);
1208
1209 pass_accept_req_to_protohdrs(sc, m, &inc, &th);
1210 if (inc.inc_flags & INC_ISIPV6) {
1211
1212 /* Don't offload if the ifcap isn't enabled */
1213 if ((ifp->if_capenable & IFCAP_TOE6) == 0)
1214 REJECT_PASS_ACCEPT_REQ(true);
1215
1216 /*
1217 * SYN must be directed to an IP6 address on this ifnet. This
1218 * is more restrictive than in6_localip.
1219 */
1220 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
1221 REJECT_PASS_ACCEPT_REQ(true);
1222
1223 ntids = 2;
1224 } else {
1225
1226 /* Don't offload if the ifcap isn't enabled */
1227 if ((ifp->if_capenable & IFCAP_TOE4) == 0)
1228 REJECT_PASS_ACCEPT_REQ(true);
1229
1230 /*
1231 * SYN must be directed to an IP address on this ifnet. This
1232 * is more restrictive than in_localip.
1233 */
1234 if (!in_ifhasaddr(ifp, inc.inc_laddr))
1235 REJECT_PASS_ACCEPT_REQ(true);
1236
1237 ntids = 1;
1238 }
1239
1240 e = get_l2te_for_nexthop(pi, ifp, &inc);
1241 if (e == NULL)
1242 REJECT_PASS_ACCEPT_REQ(true);
1243
1244 /* Don't offload if the 4-tuple is already in use */
1245 INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for 4-tuple check */
1246 if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1247 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1248 REJECT_PASS_ACCEPT_REQ(false);
1249 }
1250 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1251
1252 inp = lctx->inp; /* listening socket, not owned by TOE */
1253 INP_WLOCK(inp);
1254
1255 /* Don't offload if the listening socket has closed */
1256 if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1257 INP_WUNLOCK(inp);
1258 REJECT_PASS_ACCEPT_REQ(false);
1259 }
1260 so = inp->inp_socket;
1261 rw_rlock(&sc->policy_lock);
1262 settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
1263 EVL_MAKETAG(0xfff, 0, 0), inp);
1264 rw_runlock(&sc->policy_lock);
1265 if (!settings.offload) {
1266 INP_WUNLOCK(inp);
1267 REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */
1268 }
1269
1270 synqe = alloc_synqe(sc, lctx, M_NOWAIT);
1271 if (synqe == NULL) {
1272 INP_WUNLOCK(inp);
1273 REJECT_PASS_ACCEPT_REQ(true);
1274 }
1275 atomic_store_int(&synqe->ok_to_respond, 0);
1276
1277 init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
1278 &synqe->params);
1279
1280 /*
1281 * If all goes well t4_syncache_respond will get called during
1282 * syncache_add. Note that syncache_add releases the pcb lock.
1283 */
1284 t4opt_to_tcpopt(&cpl->tcpopt, &to);
1285 toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
1286
1287 if (atomic_load_int(&synqe->ok_to_respond) > 0) {
1288 uint64_t opt0;
1289 uint32_t opt2;
1290
1291 opt0 = calc_options0(vi, &synqe->params);
1292 opt2 = calc_options2(vi, &synqe->params);
1293
1294 insert_tid(sc, tid, synqe, ntids);
1295 synqe->tid = tid;
1296 synqe->syn = m;
1297 m = NULL;
1298
1299 if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
1300 remove_tid(sc, tid, ntids);
1301 m = synqe->syn;
1302 synqe->syn = NULL;
1303 REJECT_PASS_ACCEPT_REQ(true);
1304 }
1305
1306 CTR6(KTR_CXGBE,
1307 "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
1308 __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
1309 } else
1310 REJECT_PASS_ACCEPT_REQ(false);
1311
1312 CURVNET_RESTORE();
1313 return (0);
1314 reject:
1315 CURVNET_RESTORE();
1316 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1317 reject_reason);
1318
1319 if (e)
1320 t4_l2t_release(e);
1321 release_tid(sc, tid, lctx->ctrlq);
1322 if (synqe) {
1323 inp = synqe->lctx->inp;
1324 INP_WLOCK(inp);
1325 inp = release_synqe(sc, synqe);
1326 if (inp)
1327 INP_WUNLOCK(inp);
1328 }
1329
1330 if (m) {
1331 /*
1332 * The connection request hit a TOE listener but is being passed
1333 * on to the kernel sw stack instead of getting offloaded.
1334 */
1335 m_adj(m, sizeof(*cpl));
1336 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1337 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1338 m->m_pkthdr.csum_data = 0xffff;
1339 hw_ifp->if_input(hw_ifp, m);
1340 }
1341
1342 return (reject_reason);
1343 }
1344
/*
 * Synthesize the protocol headers for the peer's final ACK of the 3-way
 * handshake: start from the saved SYN and patch the TCP header so that it
 * looks like the ACK to our SYN|ACK.  The result is fed to
 * toe_syncache_expand() by do_pass_establish().
 */
static void
synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;	/* host order; th already converted */
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;	/* echo of our timestamp */
	}
}
1365
/*
 * CPL_PASS_ESTABLISH handler: the 3-way handshake for an offloaded
 * passive connection has completed.  Allocates a toepcb, expands the
 * syncache entry into a full connection, and takes over the socket.
 * On failure the embryonic connection is reset.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct vi_info *vi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp, *new_inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	CURVNET_SET(lctx->vnet);
	INP_INFO_RLOCK_ET(&V_tcbinfo, et);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	ifp = synqe->syn->m_pkthdr.rcvif;
	vi = ifp->if_softc;
	KASSERT(vi->pi->adapter == sc,
	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
reset:
		/* Listener went away (or setup failed below): reset the peer. */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		CURVNET_RESTORE();
		return (0);
	}

	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__,
	    synqe->params.rxq_idx,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(vi, M_NOWAIT);
	if (toep == NULL)
		goto reset;
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
	toep->vnet = lctx->vnet;
	/* The synqe's negotiated parameters carry over to the toepcb. */
	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
	init_toepcb(vi, toep);

	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
	synqe->tcp_opt = cpl->tcp_opt;
	synqe->toep = toep;

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
	if (inc.inc_flags & INC_ISIPV6)
		toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce);
	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* New connection inpcb is already locked by syncache_expand(). */
	new_inp = sotoinpcb(so);
	INP_WLOCK_ASSERT(new_inp);
	MPASS(so->so_vnet == lctx->vnet);

	/*
	 * This is for expansion from syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
	}

	INP_WUNLOCK(new_inp);

	/* Done with the synqe */
	inp = release_synqe(sc, synqe);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
	CURVNET_RESTORE();

	return (0);
}
1479
/*
 * Register the CPL handlers for passive open (listen) processing.
 * Paired with t4_uninit_listen_cpl_handlers().
 */
void
t4_init_listen_cpl_handlers(void)
{

	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
}
1489
1490 void
t4_uninit_listen_cpl_handlers(void)1491 t4_uninit_listen_cpl_handlers(void)
1492 {
1493
1494 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1495 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1496 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1497 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
1498 }
1499 #endif
1500