1 /*-
2 * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 #include "opt_inet.h"
27 #include "opt_inet6.h"
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/devctl.h>
35 #include <sys/eventhandler.h>
36 #include <sys/kernel.h>
37 #include <sys/mbuf.h>
38 #include <sys/module.h>
39 #include <sys/socket.h>
40 #include <sys/sysctl.h>
41
42 #include <net/bpf.h>
43 #include <net/ethernet.h>
44 #include <net/infiniband.h>
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/if_media.h>
49 #include <net/if_lagg.h>
50 #include <net/if_llatbl.h>
51 #include <net/if_types.h>
52 #include <net/netisr.h>
53 #include <net/route.h>
54 #include <netinet/if_ether.h>
55 #include <netinet/in.h>
56 #include <netinet/ip6.h>
57 #include <netinet6/in6_var.h>
58 #include <netinet6/nd6.h>
59
60 #include <security/mac/mac_framework.h>
61
/*
 * if_lagg(4) support.
 *
 * Hook through which infiniband_input() passes frames received on an
 * IFT_INFINIBANDLAG interface.  It is defined here without an initializer;
 * infiniband_input() asserts that it is non-NULL before calling it.
 * NOTE(review): presumably installed by if_lagg(4) when that module
 * loads -- confirm against the lagg sources.
 */
struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *ifp, struct mbuf *m);
64
65 #ifdef INET
/*
 * Build the 20-byte IPoIB link-layer address for an IPv4 multicast group.
 *
 * addr       IPv4 group address in network byte order.
 * broadcast  Link-layer broadcast address of the interface; the low
 *            nibble of byte 5 (scope) and bytes 8-9 (P_Key, per RFC 4391)
 *            are carried over into the result.
 * buf        Output buffer; bytes 0 through 19 are written.
 *
 * Per RFC 4391 only the low 28 bits of the IPv4 group address go into the
 * group ID; the constant class-D prefix nibble is not part of the MGID,
 * so the top byte is masked with 0x0f.
 */
static inline void
infiniband_ipv4_multicast_map(uint32_t addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	uint8_t scope;

	addr = ntohl(addr);
	scope = broadcast[5] & 0xF;

	buf[0] = 0;			/* reserved */
	buf[1] = 0xff;			/* multicast QPN */
	buf[2] = 0xff;
	buf[3] = 0xff;
	buf[4] = 0xff;			/* multicast GID prefix */
	buf[5] = 0x10 | scope;		/* scope taken from the broadcast GID */
	buf[6] = 0x40;			/* IPv4 signature 0x401b */
	buf[7] = 0x1b;
	buf[8] = broadcast[8];		/* P_Key */
	buf[9] = broadcast[9];
	buf[10] = 0;
	buf[11] = 0;
	buf[12] = 0;
	buf[13] = 0;
	buf[14] = 0;
	buf[15] = 0;
	buf[16] = (addr >> 24) & 0x0f;	/* low 28 bits of the group only */
	buf[17] = (addr >> 16) & 0xff;
	buf[18] = (addr >> 8) & 0xff;
	buf[19] = addr & 0xff;
}
96 #endif
97
98 #ifdef INET6
/*
 * Build the 20-byte link-layer address for an IPv6 multicast group.
 *
 * The first ten bytes are a fixed pattern combined with the scope nibble
 * (low four bits of broadcast[5]) and bytes 8-9 of the broadcast address;
 * the last ten bytes are the trailing ten bytes of the IPv6 address.
 *
 * addr       IPv6 group address.
 * broadcast  Link-layer broadcast address of the interface.
 * buf        Output buffer; bytes 0 through 19 are written.
 */
static inline void
infiniband_ipv6_multicast_map(const struct in6_addr *addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	/* Stage the leading bytes locally, then copy them out in one go. */
	const uint8_t head[10] = {
		0x00, 0xff, 0xff, 0xff, 0xff,
		(uint8_t)(0x10 | (broadcast[5] & 0xF)),
		0x60, 0x1b,
		broadcast[8], broadcast[9]
	};

	memcpy(buf, head, sizeof(head));
	memcpy(&buf[10], &addr->s6_addr[6], 10);
}
119 #endif
120
121 /*
122 * This is for clients that have an infiniband_header in the mbuf.
123 */
124 void
infiniband_bpf_mtap(struct ifnet * ifp,struct mbuf * mb)125 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
126 {
127 struct infiniband_header *ibh;
128 struct ether_header eh;
129
130 if (mb->m_len < sizeof(*ibh))
131 return;
132
133 ibh = mtod(mb, struct infiniband_header *);
134 eh.ether_type = ibh->ib_protocol;
135 memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
136 memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
137 mb->m_data += sizeof(*ibh);
138 mb->m_len -= sizeof(*ibh);
139 mb->m_pkthdr.len -= sizeof(*ibh);
140 bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
141 mb->m_data -= sizeof(*ibh);
142 mb->m_len += sizeof(*ibh);
143 mb->m_pkthdr.len += sizeof(*ibh);
144 }
145
146 static void
update_mbuf_csumflags(struct mbuf * src,struct mbuf * dst)147 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
148 {
149 int csum_flags = 0;
150
151 if (src->m_pkthdr.csum_flags & CSUM_IP)
152 csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
153 if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
154 csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
155 if (src->m_pkthdr.csum_flags & CSUM_SCTP)
156 csum_flags |= CSUM_SCTP_VALID;
157 dst->m_pkthdr.csum_flags |= csum_flags;
158 if (csum_flags & CSUM_DATA_VALID)
159 dst->m_pkthdr.csum_data = 0xffff;
160 }
161
162 /*
163 * Handle link-layer encapsulation requests.
164 */
165 static int
infiniband_requestencap(struct ifnet * ifp,struct if_encap_req * req)166 infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req)
167 {
168 struct infiniband_header *ih;
169 struct arphdr *ah;
170 uint16_t etype;
171 const uint8_t *lladdr;
172
173 if (req->rtype != IFENCAP_LL)
174 return (EOPNOTSUPP);
175
176 if (req->bufsize < INFINIBAND_HDR_LEN)
177 return (ENOMEM);
178
179 ih = (struct infiniband_header *)req->buf;
180 lladdr = req->lladdr;
181 req->lladdr_off = 0;
182
183 switch (req->family) {
184 case AF_INET:
185 etype = htons(ETHERTYPE_IP);
186 break;
187 case AF_INET6:
188 etype = htons(ETHERTYPE_IPV6);
189 break;
190 case AF_ARP:
191 ah = (struct arphdr *)req->hdata;
192 ah->ar_hrd = htons(ARPHRD_INFINIBAND);
193
194 switch (ntohs(ah->ar_op)) {
195 case ARPOP_REVREQUEST:
196 case ARPOP_REVREPLY:
197 etype = htons(ETHERTYPE_REVARP);
198 break;
199 case ARPOP_REQUEST:
200 case ARPOP_REPLY:
201 default:
202 etype = htons(ETHERTYPE_ARP);
203 break;
204 }
205
206 if (req->flags & IFENCAP_FLAG_BROADCAST)
207 lladdr = ifp->if_broadcastaddr;
208 break;
209 default:
210 return (EAFNOSUPPORT);
211 }
212
213 ih->ib_protocol = etype;
214 ih->ib_reserved = 0;
215 memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN);
216 req->bufsize = sizeof(struct infiniband_header);
217
218 return (0);
219 }
220
221 static int
infiniband_resolve_addr(struct ifnet * ifp,struct mbuf * m,const struct sockaddr * dst,struct route * ro,uint8_t * phdr,uint32_t * pflags,struct llentry ** plle)222 infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m,
223 const struct sockaddr *dst, struct route *ro, uint8_t *phdr,
224 uint32_t *pflags, struct llentry **plle)
225 {
226 struct infiniband_header *ih;
227 uint32_t lleflags = 0;
228 int error = 0;
229
230 if (plle)
231 *plle = NULL;
232 ih = (struct infiniband_header *)phdr;
233
234 switch (dst->sa_family) {
235 #ifdef INET
236 case AF_INET:
237 if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
238 error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle);
239 } else {
240 if (m->m_flags & M_BCAST) {
241 memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr,
242 INFINIBAND_ADDR_LEN);
243 } else {
244 infiniband_ipv4_multicast_map(
245 ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
246 ifp->if_broadcastaddr, ih->ib_hwaddr);
247 }
248 ih->ib_protocol = htons(ETHERTYPE_IP);
249 ih->ib_reserved = 0;
250 }
251 break;
252 #endif
253 #ifdef INET6
254 case AF_INET6:
255 if ((m->m_flags & M_MCAST) == 0) {
256 error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle);
257 } else {
258 infiniband_ipv6_multicast_map(
259 &((const struct sockaddr_in6 *)dst)->sin6_addr,
260 ifp->if_broadcastaddr, ih->ib_hwaddr);
261 ih->ib_protocol = htons(ETHERTYPE_IPV6);
262 ih->ib_reserved = 0;
263 }
264 break;
265 #endif
266 default:
267 if_printf(ifp, "can't handle af%d\n", dst->sa_family);
268 if (m != NULL)
269 m_freem(m);
270 return (EAFNOSUPPORT);
271 }
272
273 if (error == EHOSTDOWN) {
274 if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
275 error = EHOSTUNREACH;
276 }
277
278 if (error != 0)
279 return (error);
280
281 *pflags = RT_MAY_LOOP;
282 if (lleflags & LLE_IFADDR)
283 *pflags |= RT_L2_ME;
284
285 return (0);
286 }
287
288 /*
289 * Infiniband output routine.
290 */
291 static int
infiniband_output(struct ifnet * ifp,struct mbuf * m,const struct sockaddr * dst,struct route * ro)292 infiniband_output(struct ifnet *ifp, struct mbuf *m,
293 const struct sockaddr *dst, struct route *ro)
294 {
295 uint8_t linkhdr[INFINIBAND_HDR_LEN];
296 uint8_t *phdr;
297 struct llentry *lle = NULL;
298 struct infiniband_header *ih;
299 int error = 0;
300 int hlen; /* link layer header length */
301 uint32_t pflags;
302 bool addref;
303
304 NET_EPOCH_ASSERT();
305
306 addref = false;
307 phdr = NULL;
308 pflags = 0;
309 if (ro != NULL) {
310 /* XXX BPF uses ro_prepend */
311 if (ro->ro_prepend != NULL) {
312 phdr = ro->ro_prepend;
313 hlen = ro->ro_plen;
314 } else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
315 if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
316 lle = ro->ro_lle;
317 if (lle != NULL &&
318 (lle->la_flags & LLE_VALID) == 0) {
319 LLE_FREE(lle);
320 lle = NULL; /* redundant */
321 ro->ro_lle = NULL;
322 }
323 if (lle == NULL) {
324 /* if we lookup, keep cache */
325 addref = 1;
326 } else
327 /*
328 * Notify LLE code that
329 * the entry was used
330 * by datapath.
331 */
332 llentry_mark_used(lle);
333 }
334 if (lle != NULL) {
335 phdr = lle->r_linkdata;
336 hlen = lle->r_hdrlen;
337 pflags = lle->r_flags;
338 }
339 }
340 }
341
342 #ifdef MAC
343 error = mac_ifnet_check_transmit(ifp, m);
344 if (error)
345 goto bad;
346 #endif
347
348 M_PROFILE(m);
349 if (ifp->if_flags & IFF_MONITOR) {
350 error = ENETDOWN;
351 goto bad;
352 }
353 if (!((ifp->if_flags & IFF_UP) &&
354 (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
355 error = ENETDOWN;
356 goto bad;
357 }
358
359 if (phdr == NULL) {
360 /* No prepend data supplied. Try to calculate ourselves. */
361 phdr = linkhdr;
362 hlen = INFINIBAND_HDR_LEN;
363 error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
364 addref ? &lle : NULL);
365 if (addref && lle != NULL)
366 ro->ro_lle = lle;
367 if (error != 0)
368 return (error == EWOULDBLOCK ? 0 : error);
369 }
370
371 if ((pflags & RT_L2_ME) != 0) {
372 update_mbuf_csumflags(m, m);
373 return (if_simloop(ifp, m, dst->sa_family, 0));
374 }
375
376 /*
377 * Add local infiniband header. If no space in first mbuf,
378 * allocate another.
379 */
380 M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT);
381 if (m == NULL) {
382 error = ENOBUFS;
383 goto bad;
384 }
385 if ((pflags & RT_HAS_HEADER) == 0) {
386 ih = mtod(m, struct infiniband_header *);
387 memcpy(ih, phdr, hlen);
388 }
389
390 /*
391 * Queue message on interface, update output statistics if
392 * successful, and start output if interface not yet active.
393 */
394 return (ifp->if_transmit(ifp, m));
395 bad:
396 if (m != NULL)
397 m_freem(m);
398 return (error);
399 }
400
401 /*
402 * Process a received Infiniband packet.
403 */
404 static void
infiniband_input(struct ifnet * ifp,struct mbuf * m)405 infiniband_input(struct ifnet *ifp, struct mbuf *m)
406 {
407 struct infiniband_header *ibh;
408 struct epoch_tracker et;
409 int isr;
410
411 CURVNET_SET_QUIET(ifp->if_vnet);
412
413 if ((ifp->if_flags & IFF_UP) == 0) {
414 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
415 m_freem(m);
416 goto done;
417 }
418
419 ibh = mtod(m, struct infiniband_header *);
420
421 /*
422 * Reset layer specific mbuf flags to avoid confusing upper
423 * layers:
424 */
425 m->m_flags &= ~M_VLANTAG;
426 m_clrprotoflags(m);
427
428 if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
429 if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
430 ifp->if_addrlen) == 0)
431 m->m_flags |= M_BCAST;
432 else
433 m->m_flags |= M_MCAST;
434 if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
435 }
436
437 /* Let BPF have it before we strip the header. */
438 INFINIBAND_BPF_MTAP(ifp, m);
439
440 /* Allow monitor mode to claim this frame, after stats are updated. */
441 if (ifp->if_flags & IFF_MONITOR) {
442 m_freem(m);
443 goto done;
444 }
445
446 /* Direct packet to correct FIB based on interface config. */
447 M_SETFIB(m, ifp->if_fib);
448
449 /* Handle input from a lagg<N> port */
450 if (ifp->if_type == IFT_INFINIBANDLAG) {
451 KASSERT(lagg_input_infiniband_p != NULL,
452 ("%s: if_lagg not loaded!", __func__));
453 m = (*lagg_input_infiniband_p)(ifp, m);
454 if (__predict_false(m == NULL))
455 goto done;
456 ifp = m->m_pkthdr.rcvif;
457 }
458
459 /*
460 * Dispatch frame to upper layer.
461 */
462 switch (ibh->ib_protocol) {
463 #ifdef INET
464 case htons(ETHERTYPE_IP):
465 isr = NETISR_IP;
466 break;
467
468 case htons(ETHERTYPE_ARP):
469 if (ifp->if_flags & IFF_NOARP) {
470 /* Discard packet if ARP is disabled on interface */
471 m_freem(m);
472 goto done;
473 }
474 isr = NETISR_ARP;
475 break;
476 #endif
477 #ifdef INET6
478 case htons(ETHERTYPE_IPV6):
479 isr = NETISR_IPV6;
480 break;
481 #endif
482 default:
483 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
484 m_freem(m);
485 goto done;
486 }
487
488 /* Strip off the Infiniband header. */
489 m_adj(m, INFINIBAND_HDR_LEN);
490
491 #ifdef MAC
492 /*
493 * Tag the mbuf with an appropriate MAC label before any other
494 * consumers can get to it.
495 */
496 mac_ifnet_create_mbuf(ifp, m);
497 #endif
498 /* Allow monitor mode to claim this frame, after stats are updated. */
499 NET_EPOCH_ENTER(et);
500 netisr_dispatch(isr, m);
501 NET_EPOCH_EXIT(et);
502 done:
503 CURVNET_RESTORE();
504 }
505
506 static int
infiniband_resolvemulti(struct ifnet * ifp,struct sockaddr ** llsa,struct sockaddr * sa)507 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
508 struct sockaddr *sa)
509 {
510 struct sockaddr_dl *sdl;
511 #ifdef INET
512 struct sockaddr_in *sin;
513 #endif
514 #ifdef INET6
515 struct sockaddr_in6 *sin6;
516 #endif
517 uint8_t *e_addr;
518
519 switch (sa->sa_family) {
520 case AF_LINK:
521 /*
522 * No mapping needed. Just check that it's a valid MC address.
523 */
524 sdl = (struct sockaddr_dl *)sa;
525 e_addr = LLADDR(sdl);
526 if (!INFINIBAND_IS_MULTICAST(e_addr))
527 return (EADDRNOTAVAIL);
528 *llsa = NULL;
529 return 0;
530
531 #ifdef INET
532 case AF_INET:
533 sin = (struct sockaddr_in *)sa;
534 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
535 return (EADDRNOTAVAIL);
536 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
537 sdl->sdl_alen = INFINIBAND_ADDR_LEN;
538 e_addr = LLADDR(sdl);
539 infiniband_ipv4_multicast_map(
540 sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr);
541 *llsa = (struct sockaddr *)sdl;
542 return (0);
543 #endif
544 #ifdef INET6
545 case AF_INET6:
546 sin6 = (struct sockaddr_in6 *)sa;
547 /*
548 * An IP6 address of 0 means listen to all of the
549 * multicast address used for IP6. This has no meaning
550 * in infiniband.
551 */
552 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
553 return (EADDRNOTAVAIL);
554 if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
555 return (EADDRNOTAVAIL);
556 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
557 sdl->sdl_alen = INFINIBAND_ADDR_LEN;
558 e_addr = LLADDR(sdl);
559 infiniband_ipv6_multicast_map(
560 &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
561 *llsa = (struct sockaddr *)sdl;
562 return (0);
563 #endif
564 default:
565 return (EAFNOSUPPORT);
566 }
567 }
568
569 void
infiniband_ifattach(struct ifnet * ifp,const uint8_t * lla,const uint8_t * llb)570 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
571 {
572 struct sockaddr_dl *sdl;
573 struct ifaddr *ifa;
574 int i;
575
576 ifp->if_addrlen = INFINIBAND_ADDR_LEN;
577 ifp->if_hdrlen = INFINIBAND_HDR_LEN;
578 ifp->if_mtu = INFINIBAND_MTU;
579 if_attach(ifp);
580 ifp->if_output = infiniband_output;
581 ifp->if_input = infiniband_input;
582 ifp->if_resolvemulti = infiniband_resolvemulti;
583 ifp->if_requestencap = infiniband_requestencap;
584
585 if (ifp->if_baudrate == 0)
586 ifp->if_baudrate = IF_Gbps(10); /* default value */
587 if (llb != NULL)
588 ifp->if_broadcastaddr = llb;
589
590 ifa = ifp->if_addr;
591 KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
592 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
593 sdl->sdl_type = IFT_INFINIBAND;
594 sdl->sdl_alen = ifp->if_addrlen;
595
596 if (lla != NULL) {
597 memcpy(LLADDR(sdl), lla, ifp->if_addrlen);
598
599 if (ifp->if_hw_addr != NULL)
600 memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
601 } else {
602 lla = LLADDR(sdl);
603 }
604
605 /* Attach ethernet compatible network device */
606 bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
607
608 /* Announce Infiniband MAC address if non-zero. */
609 for (i = 0; i < ifp->if_addrlen; i++)
610 if (lla[i] != 0)
611 break;
612 if (i != ifp->if_addrlen)
613 if_printf(ifp, "Infiniband address: %20D\n", lla, ":");
614
615 /* Add necessary bits are setup; announce it now. */
616 EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);
617
618 if (IS_DEFAULT_VNET(curvnet))
619 devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
620 }
621
/*
 * Common teardown for an Infiniband interface: release the BPF
 * attachment first, then detach the interface from the network stack.
 */
void
infiniband_ifdetach(struct ifnet *ifp)
{
	bpfdetach(ifp);
	if_detach(ifp);
}
631
632 static int
infiniband_modevent(module_t mod,int type,void * data)633 infiniband_modevent(module_t mod, int type, void *data)
634 {
635 switch (type) {
636 case MOD_LOAD:
637 case MOD_UNLOAD:
638 return (0);
639 default:
640 return (EOPNOTSUPP);
641 }
642 }
643
644 static moduledata_t infiniband_mod = {
645 .name = "if_infiniband",
646 .evhand = &infiniband_modevent,
647 };
648
649 DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
650 MODULE_VERSION(if_infiniband, 1);
651