xref: /xnu-11215/bsd/net/dlil.c (revision d4514f0b)
1 /*
2  * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/dlil_sysctl.h>
54 #include <net/dlil_var_private.h>
55 #include <net/if_arp.h>
56 #include <net/iptap.h>
57 #include <net/pktap.h>
58 #include <net/droptap.h>
59 #include <net/nwk_wq.h>
60 #include <sys/kern_event.h>
61 #include <sys/kdebug.h>
62 #include <sys/mcache.h>
63 #include <sys/syslog.h>
64 #include <sys/protosw.h>
65 #include <sys/priv.h>
66 
67 #include <kern/assert.h>
68 #include <kern/task.h>
69 #include <kern/thread.h>
70 #include <kern/sched_prim.h>
71 #include <kern/locks.h>
72 #include <kern/zalloc.h>
73 
74 #include <net/kpi_protocol.h>
75 #include <net/if_types.h>
76 #include <net/if_ipsec.h>
77 #include <net/if_llreach.h>
78 #include <net/if_utun.h>
79 #include <net/kpi_interfacefilter.h>
80 #include <net/classq/classq.h>
81 #include <net/classq/classq_sfb.h>
82 #include <net/flowhash.h>
83 #include <net/ntstat.h>
84 #if SKYWALK
85 #include <skywalk/lib/net_filter_event.h>
86 #endif /* SKYWALK */
87 #include <net/net_api_stats.h>
88 #include <net/if_ports_used.h>
89 #include <net/if_vlan_var.h>
90 #include <netinet/in.h>
91 #if INET
92 #include <netinet/in_var.h>
93 #include <netinet/igmp_var.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/udp.h>
98 #include <netinet/udp_var.h>
99 #include <netinet/if_ether.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/in_tclass.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip_icmp.h>
104 #include <netinet/icmp_var.h>
105 #endif /* INET */
106 
107 #include <net/nat464_utils.h>
108 #include <netinet6/in6_var.h>
109 #include <netinet6/nd6.h>
110 #include <netinet6/mld6_var.h>
111 #include <netinet6/scope6_var.h>
112 #include <netinet/ip6.h>
113 #include <netinet/icmp6.h>
114 #include <net/pf_pbuf.h>
115 #include <libkern/OSAtomic.h>
116 #include <libkern/tree.h>
117 
118 #include <dev/random/randomdev.h>
119 #include <machine/machine_routines.h>
120 
121 #include <mach/thread_act.h>
122 #include <mach/sdt.h>
123 
124 #if CONFIG_MACF
125 #include <sys/kauth.h>
126 #include <security/mac_framework.h>
127 #include <net/ethernet.h>
128 #include <net/firewire.h>
129 #endif
130 
131 #if PF
132 #include <net/pfvar.h>
133 #endif /* PF */
134 #include <net/pktsched/pktsched.h>
135 #include <net/pktsched/pktsched_netem.h>
136 
137 #if NECP
138 #include <net/necp.h>
139 #endif /* NECP */
140 
141 #if SKYWALK
142 #include <skywalk/packet/packet_queue.h>
143 #include <skywalk/nexus/netif/nx_netif.h>
144 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
145 #endif /* SKYWALK */
146 
147 #include <net/sockaddr_utils.h>
148 
149 #include <os/log.h>
150 
151 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
152 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
153 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
154 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
155 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
156 
157 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
158 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
159 
160 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
161 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
162 
163 enum {
164 	kProtoKPI_v1    = 1,
165 	kProtoKPI_v2    = 2
166 };
167 
168 uint64_t if_creation_generation_count = 0;
169 
170 /*
171  * List of if_proto structures in if_proto_hash[] is protected by
172  * the ifnet lock.  The rest of the fields are initialized at protocol
173  * attach time and never change; no lock is required as long as
174  * a valid reference is held, obtained via if_proto_ref().
175  */
176 struct if_proto {
177 	SLIST_ENTRY(if_proto)       next_hash;
178 	u_int32_t                   refcount;
179 	u_int32_t                   detached;
180 	struct ifnet                *ifp;
181 	protocol_family_t           protocol_family;
182 	int                         proto_kpi;
183 	union {
184 		struct {
185 			proto_media_input               input;
186 			proto_media_preout              pre_output;
187 			proto_media_event               event;
188 			proto_media_ioctl               ioctl;
189 			proto_media_detached            detached;
190 			proto_media_resolve_multi       resolve_multi;
191 			proto_media_send_arp            send_arp;
192 		} v1;
193 		struct {
194 			proto_media_input_v2            input;
195 			proto_media_preout              pre_output;
196 			proto_media_event               event;
197 			proto_media_ioctl               ioctl;
198 			proto_media_detached            detached;
199 			proto_media_resolve_multi       resolve_multi;
200 			proto_media_send_arp            send_arp;
201 		} v2;
202 	} kpi;
203 };
204 
205 SLIST_HEAD(proto_hash_entry, if_proto);
206 
207 #define DLIL_SDLDATALEN \
208 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
209 
210 /*
211  * In the common case, the LL address is stored in the
212  * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
213  * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
214  */
215 struct dl_if_lladdr_std {
216 	struct ifaddr   ifa;
217 	u_int8_t        addr_sdl_bytes[DLIL_SDLMAXLEN];
218 	u_int8_t        mask_sdl_bytes[DLIL_SDLMAXLEN];
219 };
220 
221 /*
222  * However, in some rare cases we encounter LL addresses which
223  * would not fit in the `DLIL_SDLMAXLEN' limitation. In such cases
224  * we allocate the storage in the permanent arena, using this memory layout.
225  */
226 struct dl_if_lladdr_xtra_space {
227 	struct ifaddr   ifa;
228 	u_int8_t        addr_sdl_bytes[SOCK_MAXADDRLEN];
229 	u_int8_t        mask_sdl_bytes[SOCK_MAXADDRLEN];
230 };
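/*
 * Illustrative sketch (an assumption, not part of this file): the
 * allocation path is expected to choose between the two layouts above
 * based on whether the requested link-layer sockaddr_dl fits within
 * DLIL_SDLMAXLEN, roughly:
 *
 *	if (sdl->sdl_len <= DLIL_SDLMAXLEN)
 *		use the embedded `struct dl_if_lladdr_std' in dlil_ifnet;
 *	else
 *		allocate a separate `struct dl_if_lladdr_xtra_space',
 *		which can hold up to SOCK_MAXADDRLEN bytes.
 *
 * See dlil_alloc_lladdr() for the actual selection logic.
 */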
231 
232 struct dlil_ifnet {
233 	struct ifnet    dl_if;                  /* public ifnet */
234 	/*
235 	 * DLIL private fields, protected by dl_if_lock
236 	 */
237 	decl_lck_mtx_data(, dl_if_lock);
238 	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
239 	u_int32_t dl_if_flags;                  /* flags (below) */
240 	u_int32_t dl_if_refcnt;                 /* refcnt */
241 	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
242 	void    *dl_if_uniqueid;                /* unique interface id */
243 	size_t  dl_if_uniqueid_len;             /* length of the unique id */
244 	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
245 	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
246 	struct dl_if_lladdr_std dl_if_lladdr;   /* link-level address storage */
247 	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
248 	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
249 	u_int8_t dl_if_permanent_ether_is_set;
250 	u_int8_t dl_if_unused;
251 	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
252 	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
253 	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
254 };
255 
256 /* Values for dl_if_flags (private to DLIL) */
257 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
258 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
259 #define DLIF_DEBUG      0x4     /* has debugging info */
260 
261 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
262 
263 /* For gdb */
264 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
265 
266 struct dlil_ifnet_dbg {
267 	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
268 	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
269 	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
270 	/*
271 	 * Circular lists of ifnet_{reference,release} callers.
272 	 */
273 	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
274 	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
275 };
276 
277 #define DLIL_TO_IFP(s)  (&s->dl_if)
278 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
279 
280 struct ifnet_filter {
281 	TAILQ_ENTRY(ifnet_filter)       filt_next;
282 	u_int32_t                       filt_skip;
283 	u_int32_t                       filt_flags;
284 	ifnet_t                         filt_ifp;
285 	const char                      *filt_name;
286 	void                            *filt_cookie;
287 	protocol_family_t               filt_protocol;
288 	iff_input_func                  filt_input;
289 	iff_output_func                 filt_output;
290 	iff_event_func                  filt_event;
291 	iff_ioctl_func                  filt_ioctl;
292 	iff_detached_func               filt_detached;
293 };
294 
295 /* Mbuf queue used for freeing excess mbufs */
296 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
297 
298 struct proto_input_entry;
299 
300 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
301 
302 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
303 
304 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
305 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
306 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
307 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
308 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
309 
310 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
311 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
312     &dlil_lck_attributes);
313 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
314     &dlil_lck_attributes);
315 
316 #if DEBUG
317 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
318 #else
319 static unsigned int ifnet_debug;        /* debugging (disabled) */
320 #endif /* !DEBUG */
321 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
322 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
323 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
324 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
325 
326 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
327 
328 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
329 
330 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
331 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
332 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
333 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
334 
335 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
336 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
337 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
338 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
339 
340 static u_int32_t net_rtref;
341 
342 static struct dlil_main_threading_info dlil_main_input_thread_info;
343 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
344     (struct dlil_threading_info *)&dlil_main_input_thread_info;
345 
346 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
347 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
348 static void dlil_if_trace(struct dlil_ifnet *, int);
349 static void if_proto_ref(struct if_proto *);
350 static void if_proto_free(struct if_proto *);
351 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
352 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
353     u_int32_t list_count);
354 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
355 static void if_flt_monitor_busy(struct ifnet *);
356 static void if_flt_monitor_unbusy(struct ifnet *);
357 static void if_flt_monitor_enter(struct ifnet *);
358 static void if_flt_monitor_leave(struct ifnet *);
359 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
360     char **, protocol_family_t, boolean_t);
361 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
362     protocol_family_t);
363 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
364     const struct sockaddr_dl *);
365 static int ifnet_lookup(struct ifnet *);
366 static void if_purgeaddrs(struct ifnet *);
367 
368 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
369     struct mbuf *, char *);
370 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
371     struct mbuf *);
372 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
373     mbuf_t *, const struct sockaddr *, void *, char *, char *);
374 static void ifproto_media_event(struct ifnet *, protocol_family_t,
375     const struct kev_msg *);
376 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
377     unsigned long, void *);
378 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
379     struct sockaddr_dl *, size_t);
380 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
381     const struct sockaddr_dl *, const struct sockaddr *,
382     const struct sockaddr_dl *, const struct sockaddr *);
383 
384 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
385     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
386     boolean_t poll, struct thread *tp);
387 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
388     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
389 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
390 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
391     protocol_family_t *);
392 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
393     const struct ifnet_demux_desc *, u_int32_t);
394 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
395 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
396 #if !XNU_TARGET_OS_OSX
397 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
398     const struct sockaddr *, const char *, const char *,
399     u_int32_t *, u_int32_t *);
400 #else /* XNU_TARGET_OS_OSX */
401 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
402     const struct sockaddr *, const char *, const char *);
403 #endif /* XNU_TARGET_OS_OSX */
404 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
405     const struct sockaddr *, const char *, const char *,
406     u_int32_t *, u_int32_t *);
407 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
408 static void ifp_if_free(struct ifnet *);
409 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
410 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
411 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
412 
413 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
414     dlil_freeq_t *, struct ifnet_stat_increment_param *);
415 
416 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
417     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
418     boolean_t, struct thread *);
419 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
420     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
421     boolean_t, struct thread *);
422 
423 static void dlil_main_input_thread_func(void *, wait_result_t);
424 static void dlil_main_input_thread_cont(void *, wait_result_t);
425 
426 static void dlil_input_thread_func(void *, wait_result_t);
427 static void dlil_input_thread_cont(void *, wait_result_t);
428 
429 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
430 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
431 
432 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
433     thread_continue_t *);
434 static void dlil_terminate_input_thread(struct dlil_threading_info *);
435 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
436     struct dlil_threading_info *, struct ifnet *, boolean_t);
437 static boolean_t dlil_input_stats_sync(struct ifnet *,
438     struct dlil_threading_info *);
439 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
440     u_int32_t, ifnet_model_t, boolean_t);
441 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
442     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
443 static int dlil_is_clat_needed(protocol_family_t, mbuf_t);
444 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
445 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
446 #if DEBUG || DEVELOPMENT
447 static void dlil_verify_sum16(void);
448 #endif /* DEBUG || DEVELOPMENT */
449 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
450     protocol_family_t);
451 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
452     protocol_family_t);
453 
454 static void dlil_incr_pending_thread_count(void);
455 static void dlil_decr_pending_thread_count(void);
456 
457 static void ifnet_detacher_thread_func(void *, wait_result_t);
458 static void ifnet_detacher_thread_cont(void *, wait_result_t);
459 static void ifnet_detach_final(struct ifnet *);
460 static void ifnet_detaching_enqueue(struct ifnet *);
461 static struct ifnet *ifnet_detaching_dequeue(void);
462 
463 static void ifnet_start_thread_func(void *, wait_result_t);
464 static void ifnet_start_thread_cont(void *, wait_result_t);
465 
466 static void ifnet_poll_thread_func(void *, wait_result_t);
467 static void ifnet_poll_thread_cont(void *, wait_result_t);
468 
469 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
470     classq_pkt_t *, boolean_t, boolean_t *);
471 
472 static void ifp_src_route_copyout(struct ifnet *, struct route *);
473 static void ifp_src_route_copyin(struct ifnet *, struct route *);
474 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
475 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
476 
477 static errno_t if_mcasts_update_async(struct ifnet *);
478 
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484 
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486     &dlil_lck_attributes);
487 
488 static uint32_t ifnet_flowhash_seed;
489 
490 struct ifnet_flowhash_key {
491 	char            ifk_name[IFNAMSIZ];
492 	uint32_t        ifk_unit;
493 	uint32_t        ifk_flags;
494 	uint32_t        ifk_eflags;
495 	uint32_t        ifk_capabilities;
496 	uint32_t        ifk_capenable;
497 	uint32_t        ifk_output_sched_model;
498 	uint32_t        ifk_rand1;
499 	uint32_t        ifk_rand2;
500 };
501 
502 /* Flow control entry per interface */
503 struct ifnet_fc_entry {
504 	RB_ENTRY(ifnet_fc_entry) ifce_entry;
505 	u_int32_t       ifce_flowhash;
506 	struct ifnet    *ifce_ifp;
507 };
508 
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511     const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515 
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520 
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522 
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525 
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527     u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529     u_int32_t flags);
530 
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532 
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540 
541 /* rate limit debug messages */
542 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
543 
544 static inline void
545 ifnet_delay_start_disabled_increment(void)
546 {
547 	OSIncrementAtomic(&ifnet_delay_start_disabled);
548 }
549 
550 static void log_hexdump(void *data, size_t len);
551 
552 unsigned int net_rxpoll = 1;
553 unsigned int net_affinity = 1;
554 unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */
555 
556 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
557 
558 extern u_int32_t        inject_buckets;
559 
560 /* DLIL data threshold thread call */
561 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
562 
563 void
564 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
565 {
566 	/*
567 	 * update filter count and route_generation ID to let TCP
568 	 * know it should reevaluate doing TSO or not
569 	 */
570 	if (filter_enable) {
571 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
572 	} else {
573 		VERIFY(ifp->if_flt_no_tso_count != 0);
574 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
575 	}
576 	routegenid_update();
577 }
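/*
 * Hedged usage sketch (hypothetical caller, not part of this file):
 * an interface-filter attach path would typically disable TSO while a
 * filter is present and re-enable it on detach, e.g.
 *
 *	ifnet_filter_update_tso(ifp, TRUE);	(filter attached)
 *	...
 *	ifnet_filter_update_tso(ifp, FALSE);	(filter detached)
 *
 * Bumping the route generation id forces TCP to re-evaluate its TSO
 * decision on the next route lookup.
 */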
578 
579 #if SKYWALK
580 
581 static bool net_check_compatible_if_filter(struct ifnet *ifp);
582 
583 /* if_attach_nx flags defined in os_skywalk_private.h */
584 unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
585 unsigned int if_enable_fsw_ip_netagent =
586     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
587 unsigned int if_enable_fsw_transport_netagent =
588     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
589 
590 unsigned int if_netif_all =
591     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);
592 
593 /* Configure flowswitch to use max mtu sized buffer */
594 static bool fsw_use_max_mtu_buffer = false;
595 
596 
597 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
598 
599 #include <skywalk/os_skywalk_private.h>
600 
601 boolean_t
602 ifnet_nx_noauto(ifnet_t ifp)
603 {
604 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
605 }
606 
607 boolean_t
608 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
609 {
610 	return ifnet_is_low_latency(ifp);
611 }
612 
613 boolean_t
614 ifnet_is_low_latency(ifnet_t ifp)
615 {
616 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
617 }
618 
619 boolean_t
620 ifnet_needs_compat(ifnet_t ifp)
621 {
622 	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
623 		return FALSE;
624 	}
625 #if !XNU_TARGET_OS_OSX
626 	/*
627 	 * To conserve memory, we plumb in the compat layer selectively; this
628 	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
629 	 * In particular, we check for Wi-Fi Access Point.
630 	 */
631 	if (IFNET_IS_WIFI(ifp)) {
632 		/* Wi-Fi Access Point */
633 		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
634 		    ifp->if_name[2] == '\0') {
635 			return if_netif_all;
636 		}
637 	}
638 #else /* XNU_TARGET_OS_OSX */
639 #pragma unused(ifp)
640 #endif /* XNU_TARGET_OS_OSX */
641 	return TRUE;
642 }
643 
644 boolean_t
645 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
646 {
647 	if (if_is_fsw_transport_netagent_enabled()) {
648 		/* check if netagent has been manually enabled for ipsec/utun */
649 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
650 			return ipsec_interface_needs_netagent(ifp);
651 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
652 			return utun_interface_needs_netagent(ifp);
653 		}
654 
655 		/* check ifnet no auto nexus override */
656 		if (ifnet_nx_noauto(ifp)) {
657 			return FALSE;
658 		}
659 
660 		/* check global if_attach_nx configuration */
661 		switch (ifp->if_family) {
662 		case IFNET_FAMILY_CELLULAR:
663 		case IFNET_FAMILY_ETHERNET:
664 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
665 				return TRUE;
666 			}
667 			break;
668 		default:
669 			break;
670 		}
671 	}
672 	return FALSE;
673 }
674 
675 boolean_t
676 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
677 {
678 #pragma unused(ifp)
679 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
680 		return TRUE;
681 	}
682 	return FALSE;
683 }
684 
685 boolean_t
686 ifnet_needs_netif_netagent(ifnet_t ifp)
687 {
688 #pragma unused(ifp)
689 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
690 }
691 
692 static boolean_t
693 dlil_detach_nexus_instance(nexus_controller_t controller,
694     const char *func_str, uuid_t instance, uuid_t device)
695 {
696 	errno_t         err;
697 
698 	if (instance == NULL || uuid_is_null(instance)) {
699 		return FALSE;
700 	}
701 
702 	/* followed by the device port */
703 	if (device != NULL && !uuid_is_null(device)) {
704 		err = kern_nexus_ifdetach(controller, instance, device);
705 		if (err != 0) {
706 			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
707 			    func_str, err);
708 		}
709 	}
710 	err = kern_nexus_controller_free_provider_instance(controller,
711 	    instance);
712 	if (err != 0) {
713 		DLIL_PRINTF("%s free_provider_instance failed %d\n",
714 		    func_str, err);
715 	}
716 	return TRUE;
717 }
718 
719 static boolean_t
720 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
721     uuid_t device)
722 {
723 	boolean_t               detached = FALSE;
724 	nexus_controller_t      controller = kern_nexus_shared_controller();
725 	int                     err;
726 
727 	if (dlil_detach_nexus_instance(controller, func_str, instance,
728 	    device)) {
729 		detached = TRUE;
730 	}
731 	if (provider != NULL && !uuid_is_null(provider)) {
732 		detached = TRUE;
733 		err = kern_nexus_controller_deregister_provider(controller,
734 		    provider);
735 		if (err != 0) {
736 			DLIL_PRINTF("%s deregister_provider %d\n",
737 			    func_str, err);
738 		}
739 	}
740 	return detached;
741 }
742 
743 static errno_t
744 dlil_create_provider_and_instance(nexus_controller_t controller,
745     nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
746     nexus_attr_t attr)
747 {
748 	uuid_t          dom_prov;
749 	errno_t         err;
750 	nexus_name_t    provider_name;
751 	const char      *type_name =
752 	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
753 	struct kern_nexus_init init;
754 
755 	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
756 	if (err != 0) {
757 		DLIL_PRINTF("%s can't get %s provider, error %d\n",
758 		    __func__, type_name, err);
759 		goto failed;
760 	}
761 
762 	snprintf((char *)provider_name, sizeof(provider_name),
763 	    "com.apple.%s.%s", type_name, if_name(ifp));
764 	err = kern_nexus_controller_register_provider(controller,
765 	    dom_prov,
766 	    provider_name,
767 	    NULL,
768 	    0,
769 	    attr,
770 	    provider);
771 	if (err != 0) {
772 		DLIL_PRINTF("%s register %s provider failed, error %d\n",
773 		    __func__, type_name, err);
774 		goto failed;
775 	}
776 	bzero(&init, sizeof(init));
777 	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
778 	err = kern_nexus_controller_alloc_provider_instance(controller,
779 	    *provider,
780 	    NULL, NULL,
781 	    instance, &init);
782 	if (err != 0) {
783 		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
784 		    __func__, type_name, err);
785 		kern_nexus_controller_deregister_provider(controller,
786 		    *provider);
787 		goto failed;
788 	}
789 failed:
790 	return err;
791 }
792 
793 static boolean_t
794 dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
795 {
796 	nexus_attr_t            attr = NULL;
797 	nexus_controller_t      controller;
798 	errno_t                 err;
799 
800 	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
801 		/* it's already attached */
802 		if (dlil_verbose) {
803 			DLIL_PRINTF("%s: %s already has nexus attached\n",
804 			    __func__, if_name(ifp));
805 			/* already attached */
806 		}
807 		goto failed;
808 	}
809 
810 	err = kern_nexus_attr_create(&attr);
811 	if (err != 0) {
812 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
813 		    if_name(ifp));
814 		goto failed;
815 	}
816 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
817 	VERIFY(err == 0);
818 
819 	controller = kern_nexus_shared_controller();
820 
821 	/* create the netif provider and instance */
822 	err = dlil_create_provider_and_instance(controller,
823 	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
824 	    &netif_nx->if_nif_instance, attr);
825 	if (err != 0) {
826 		goto failed;
827 	}
828 	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
829 	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
830 	if (err != 0) {
831 		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
832 		    __func__, err);
833 		/* cleanup provider and instance */
834 		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
835 		    netif_nx->if_nif_instance, NULL);
836 		goto failed;
837 	}
838 	return TRUE;
839 
840 failed:
841 	if (attr != NULL) {
842 		kern_nexus_attr_destroy(attr);
843 	}
844 	return FALSE;
845 }
846 
847 static boolean_t
848 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
849 {
850 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
851 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
852 		goto failed;
853 	}
854 	switch (ifp->if_type) {
855 	case IFT_CELLULAR:
856 	case IFT_ETHER:
857 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
858 			/* don't auto-attach */
859 			goto failed;
860 		}
861 		break;
862 	default:
863 		/* don't auto-attach */
864 		goto failed;
865 	}
866 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
867 
868 failed:
869 	return FALSE;
870 }
871 
872 static boolean_t
873 dlil_is_native_netif_nexus(ifnet_t ifp)
874 {
875 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
876 }
877 
878 __attribute__((noinline))
879 static void
880 dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
881 {
882 	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
883 	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
884 }
885 
886 static inline int
887 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
888 {
889 	struct ifreq        ifr;
890 	int                 error;
891 
892 	bzero(&ifr, sizeof(ifr));
893 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
894 	if (error == 0) {
895 		*ifdm_p = ifr.ifr_devmtu;
896 	}
897 	return error;
898 }
899 
900 static inline void
901 _dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
902 {
903 	uint32_t tso_v4_mtu = 0;
904 	uint32_t tso_v6_mtu = 0;
905 
906 	if (!kernel_is_macos_or_server()) {
907 		return;
908 	}
909 
910 	if (!dlil_is_native_netif_nexus(ifp)) {
911 		return;
912 	}
913 	/*
914 	 * Note that we are reading the real hwassist flags set by the driver
915 	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
916 	 * hasn't been called yet.
917 	 */
918 	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
919 		tso_v4_mtu = ifp->if_tso_v4_mtu;
920 	}
921 	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
922 		tso_v6_mtu = ifp->if_tso_v6_mtu;
923 	}
924 	/*
925 	 * If the hardware supports TSO, adjust the large buf size to match the
926 	 * supported TSO MTU size.
927 	 */
928 	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
929 		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
930 	} else {
931 		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
932 	}
933 	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
934 }
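/*
 * Worked example (illustrative values only, an assumption): with
 * IFNET_TSO_IPV4 set and if_tso_v4_mtu = 32 KB, the large buffer size
 * becomes MIN(NX_FSW_MAX_LARGE_BUFSIZE, 32 KB); with no TSO MTU
 * advertised it falls back to MAX(*large_buf_size, sk_fsw_gso_mtu),
 * again clamped to NX_FSW_MAX_LARGE_BUFSIZE.
 */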
935 
936 static inline int
937 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
938     bool *use_multi_buflet, uint32_t *large_buf_size)
939 {
940 	struct kern_pbufpool_memory_info rx_pp_info;
941 	struct kern_pbufpool_memory_info tx_pp_info;
942 	uint32_t if_max_mtu = 0;
943 	uint32_t drv_buf_size;
944 	struct ifdevmtu ifdm;
945 	int err;
946 
947 	/*
948 	 * To perform intra-stack RX aggregation, the flowswitch needs to use
949 	 * multi-buflet packets.
950 	 */
951 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
952 
953 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
954 	/*
955 	 * IP over Thunderbolt interface can deliver the largest IP packet,
956 	 * but the driver advertises the MAX MTU as only 9K.
957 	 */
958 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
959 		if_max_mtu = IP_MAXPACKET;
960 		goto skip_mtu_ioctl;
961 	}
962 
963 	/* determine max mtu */
964 	bzero(&ifdm, sizeof(ifdm));
965 	err = dlil_siocgifdevmtu(ifp, &ifdm);
966 	if (__improbable(err != 0)) {
967 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
968 		    __func__, if_name(ifp));
969 		/* use default flowswitch buffer size */
970 		if_max_mtu = NX_FSW_BUFSIZE;
971 	} else {
972 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
973 		    ifdm.ifdm_max, ifdm.ifdm_current);
974 		/* rdar://problem/44589731 */
975 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
976 	}
977 
978 skip_mtu_ioctl:
979 	if (if_max_mtu == 0) {
980 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
981 		    __func__, if_name(ifp));
982 		return EINVAL;
983 	}
984 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
985 		DLIL_PRINTF("%s: interface (%s) has MAX MTU (%u) > flowswitch "
986 		    "max bufsize(%d)\n", __func__,
987 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
988 		return EINVAL;
989 	}
990 
991 	/*
992 	 * for skywalk native driver, consult the driver packet pool also.
993 	 */
994 	if (dlil_is_native_netif_nexus(ifp)) {
995 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
996 		    &tx_pp_info);
997 		if (err != 0) {
998 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
999 			    __func__, if_name(ifp));
1000 			return ENXIO;
1001 		}
1002 		drv_buf_size = tx_pp_info.kpm_bufsize *
1003 		    tx_pp_info.kpm_max_frags;
1004 		if (if_max_mtu > drv_buf_size) {
1005 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1006 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1007 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1008 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1009 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1010 			return EINVAL;
1011 		}
1012 	} else {
1013 		drv_buf_size = if_max_mtu;
1014 	}
1015 
1016 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1017 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1018 		*use_multi_buflet = true;
1019 		/* default flowswitch buffer size */
1020 		*buf_size = NX_FSW_BUFSIZE;
1021 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1022 	} else {
1023 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1024 	}
1025 	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1026 	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1027 	if (*buf_size >= *large_buf_size) {
1028 		*large_buf_size = 0;
1029 	}
1030 	return 0;
1031 }
1032 
1033 static boolean_t
1034 _dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
1035 {
1036 	nexus_attr_t            attr = NULL;
1037 	nexus_controller_t      controller;
1038 	errno_t                 err = 0;
1039 	uuid_t                  netif;
1040 	uint32_t                buf_size = 0;
1041 	uint32_t                large_buf_size = 0;
1042 	bool                    multi_buflet;
1043 
1044 	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
1045 	    IFNET_IS_VMNET(ifp)) {
1046 		goto failed;
1047 	}
1048 
1049 	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
1050 		/* not possible to attach (netif native/compat not plumbed) */
1051 		goto failed;
1052 	}
1053 
1054 	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
1055 		/* don't auto-attach */
1056 		goto failed;
1057 	}
1058 
1059 	/* get the netif instance from the ifp */
1060 	err = kern_nexus_get_netif_instance(ifp, netif);
1061 	if (err != 0) {
1062 		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
1063 		    if_name(ifp));
1064 		goto failed;
1065 	}
1066 
1067 	err = kern_nexus_attr_create(&attr);
1068 	if (err != 0) {
1069 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1070 		    if_name(ifp));
1071 		goto failed;
1072 	}
1073 
1074 	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
1075 	    &multi_buflet, &large_buf_size);
1076 	if (err != 0) {
1077 		goto failed;
1078 	}
1079 	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
1080 	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);
1081 
1082 	/* Configure flowswitch buffer size */
1083 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
1084 	VERIFY(err == 0);
1085 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
1086 	    large_buf_size);
1087 	VERIFY(err == 0);
1088 
1089 	/*
1090 	 * Configure flowswitch to use super-packet (multi-buflet).
1091 	 */
1092 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
1093 	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
1094 	VERIFY(err == 0);
1095 
1096 	/* create the flowswitch provider and instance */
1097 	controller = kern_nexus_shared_controller();
1098 	err = dlil_create_provider_and_instance(controller,
1099 	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
1100 	    &nexus_fsw->if_fsw_instance, attr);
1101 	if (err != 0) {
1102 		goto failed;
1103 	}
1104 
1105 	/* attach the device port */
1106 	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
1107 	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
1108 	if (err != 0) {
1109 		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
1110 		    __func__, err, if_name(ifp));
1111 		/* cleanup provider and instance */
1112 		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1113 		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1114 		goto failed;
1115 	}
1116 	return TRUE;
1117 
1118 failed:
1119 	if (err != 0) {
1120 		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
1121 		    __func__, if_name(ifp), err);
1122 	} else {
1123 		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
1124 		    __func__, if_name(ifp));
1125 	}
1126 	if (attr != NULL) {
1127 		kern_nexus_attr_destroy(attr);
1128 	}
1129 	return FALSE;
1130 }
1131 
1132 static boolean_t
1133 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1134 {
1135 	boolean_t               attached = FALSE;
1136 	if_nexus_flowswitch     nexus_fsw;
1137 
1138 #if (DEVELOPMENT || DEBUG)
1139 	if (skywalk_netif_direct_allowed(if_name(ifp))) {
1140 		DLIL_PRINTF("skip attaching fsw to %s\n", if_name(ifp));
1141 		return FALSE;
1142 	}
1143 #endif /* (DEVELOPMENT || DEBUG) */
1144 
1145 	/*
1146 	 * flowswitch attachment is not supported for interface using the
1147 	 * legacy model (IFNET_INIT_LEGACY)
1148 	 */
1149 	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1150 		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model\n",
1151 		    if_name(ifp));
1152 		return FALSE;
1153 	}
1154 	bzero(&nexus_fsw, sizeof(nexus_fsw));
1155 	if (!ifnet_is_attached(ifp, 1)) {
1156 		os_log(OS_LOG_DEFAULT, "%s: %s not attached",
1157 		    __func__, ifp->if_xname);
1158 		goto done;
1159 	}
1160 	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance)) {
1161 		attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1162 		if (attached) {
1163 			ifnet_lock_exclusive(ifp);
1164 			ifp->if_nx_flowswitch = nexus_fsw;
1165 			ifnet_lock_done(ifp);
1166 		}
1167 	}
1168 	ifnet_decr_iorefcnt(ifp);
1169 
1170 done:
1171 	return attached;
1172 }
1173 
1174 __attribute__((noinline))
1175 static void
1176 dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
1177 {
1178 	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1179 	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1180 }
1181 
1182 __attribute__((noinline))
1183 static void
1184 dlil_netif_detach_notify(ifnet_t ifp)
1185 {
1186 	ifnet_detach_notify_cb_t notify = NULL;
1187 	void *arg = NULL;
1188 
1189 	ifnet_get_detach_notify(ifp, &notify, &arg);
1190 	if (notify == NULL) {
1191 		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1192 		return;
1193 	}
1194 	(*notify)(arg);
1195 }
1196 
1197 __attribute__((noinline))
1198 static void
1199 dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
1200 {
1201 	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
1202 	if_nexus_netif *nx_netif = &ifp->if_nx_netif;
1203 
1204 	ifnet_datamov_suspend_and_drain(ifp);
1205 	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
1206 		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
1207 		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
1208 		dlil_detach_flowswitch_nexus(nx_fsw);
1209 	} else {
1210 		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
1211 		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
1212 		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
1213 	}
1214 
1215 	if (!uuid_is_null(nx_netif->if_nif_attach)) {
1216 		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
1217 		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
1218 		dlil_detach_netif_nexus(nx_netif);
1219 	} else {
1220 		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
1221 		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
1222 		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
1223 	}
1224 	ifnet_datamov_resume(ifp);
1225 }
1226 
1227 boolean_t
1228 ifnet_add_netagent(ifnet_t ifp)
1229 {
1230 	int     error;
1231 
1232 	error = kern_nexus_interface_add_netagent(ifp);
1233 	os_log(OS_LOG_DEFAULT,
1234 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1235 	    ifp->if_xname, error);
1236 	return error == 0;
1237 }
1238 
1239 boolean_t
1240 ifnet_remove_netagent(ifnet_t ifp)
1241 {
1242 	int     error;
1243 
1244 	error = kern_nexus_interface_remove_netagent(ifp);
1245 	os_log(OS_LOG_DEFAULT,
1246 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1247 	    ifp->if_xname, error);
1248 	return error == 0;
1249 }
1250 
1251 boolean_t
1252 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1253 {
1254 	if (!IF_FULLY_ATTACHED(ifp)) {
1255 		return FALSE;
1256 	}
1257 	return dlil_attach_flowswitch_nexus(ifp);
1258 }
1259 
1260 boolean_t
1261 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1262 {
1263 	if_nexus_flowswitch     nexus_fsw;
1264 
1265 	ifnet_lock_exclusive(ifp);
1266 	nexus_fsw = ifp->if_nx_flowswitch;
1267 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1268 	ifnet_lock_done(ifp);
1269 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1270 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1271 }
1272 
1273 void
1274 ifnet_attach_native_flowswitch(ifnet_t ifp)
1275 {
1276 	if (!dlil_is_native_netif_nexus(ifp)) {
1277 		/* not a native netif */
1278 		return;
1279 	}
1280 	ifnet_attach_flowswitch_nexus(ifp);
1281 }
1282 
1283 int
1284 ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
1285 {
1286 	lck_mtx_lock(&ifp->if_delegate_lock);
1287 	while (ifp->if_fsw_rx_cb_ref > 0) {
1288 		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
1289 		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
1290 		    (PZERO + 1), __FUNCTION__, NULL);
1291 		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
1292 	}
1293 	ifp->if_fsw_rx_cb = cb;
1294 	ifp->if_fsw_rx_cb_arg = arg;
1295 	lck_mtx_unlock(&ifp->if_delegate_lock);
1296 	return 0;
1297 }
1298 
1299 int
1300 ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
1301 {
1302 	/*
1303 	 * This avoids an unnecessary lock acquisition for interfaces
1304 	 * that are not used by a redirect interface.
1305 	 */
1306 	if (ifp->if_fsw_rx_cb == NULL) {
1307 		return ENOENT;
1308 	}
1309 	lck_mtx_lock(&ifp->if_delegate_lock);
1310 	if (ifp->if_fsw_rx_cb == NULL) {
1311 		lck_mtx_unlock(&ifp->if_delegate_lock);
1312 		return ENOENT;
1313 	}
1314 	*cbp = ifp->if_fsw_rx_cb;
1315 	*argp = ifp->if_fsw_rx_cb_arg;
1316 	ifp->if_fsw_rx_cb_ref++;
1317 	lck_mtx_unlock(&ifp->if_delegate_lock);
1318 	return 0;
1319 }
1320 
1321 void
1322 ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
1323 {
1324 	lck_mtx_lock(&ifp->if_delegate_lock);
1325 	if (--ifp->if_fsw_rx_cb_ref == 0) {
1326 		wakeup(&ifp->if_fsw_rx_cb_ref);
1327 	}
1328 	lck_mtx_unlock(&ifp->if_delegate_lock);
1329 }
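/*
 * Illustrative get/use/release pattern for the flowswitch RX callback
 * (hypothetical caller, not part of this file):
 *
 *	ifnet_fsw_rx_cb_t cb;
 *	void *arg;
 *
 *	if (ifnet_get_flowswitch_rx_callback(ifp, &cb, &arg) == 0) {
 *		(*cb)(arg, ...);	(callback-specific arguments elided)
 *		ifnet_release_flowswitch_rx_callback(ifp);
 *	}
 *
 * The reference taken by ifnet_get_flowswitch_rx_callback() keeps
 * ifnet_set_flowswitch_rx_callback() from replacing the callback while
 * it is in use; the release wakes up any setter waiting on the ref.
 */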
1330 
1331 int
1332 ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
1333 {
1334 	lck_mtx_lock(&difp->if_delegate_lock);
1335 	while (difp->if_delegate_parent_ref > 0) {
1336 		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
1337 		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
1338 		    (PZERO + 1), __FUNCTION__, NULL);
1339 		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
1340 	}
1341 	difp->if_delegate_parent = parent;
1342 	lck_mtx_unlock(&difp->if_delegate_lock);
1343 	return 0;
1344 }
1345 
1346 int
1347 ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
1348 {
1349 	lck_mtx_lock(&difp->if_delegate_lock);
1350 	if (difp->if_delegate_parent == NULL) {
1351 		lck_mtx_unlock(&difp->if_delegate_lock);
1352 		return ENOENT;
1353 	}
1354 	*parentp = difp->if_delegate_parent;
1355 	difp->if_delegate_parent_ref++;
1356 	lck_mtx_unlock(&difp->if_delegate_lock);
1357 	return 0;
1358 }
1359 
1360 void
1361 ifnet_release_delegate_parent(ifnet_t difp)
1362 {
1363 	lck_mtx_lock(&difp->if_delegate_lock);
1364 	if (--difp->if_delegate_parent_ref == 0) {
1365 		wakeup(&difp->if_delegate_parent_ref);
1366 	}
1367 	lck_mtx_unlock(&difp->if_delegate_lock);
1368 }
1369 
1370 __attribute__((noinline))
1371 void
1372 ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
1373 {
1374 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
1375 	ifp->if_detach_notify = notify;
1376 	ifp->if_detach_notify_arg = arg;
1377 }
1378 
1379 __attribute__((noinline))
1380 void
1381 ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
1382 {
1383 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
1384 	*notifyp = ifp->if_detach_notify;
1385 	*argp = ifp->if_detach_notify_arg;
1386 }
1387 
1388 __attribute__((noinline))
1389 void
1390 ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
1391 {
1392 	ifnet_lock_exclusive(ifp);
1393 	ifnet_set_detach_notify_locked(ifp, notify, arg);
1394 	ifnet_lock_done(ifp);
1395 }
1396 
1397 __attribute__((noinline))
1398 void
1399 ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
1400 {
1401 	ifnet_lock_exclusive(ifp);
1402 	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
1403 	ifnet_lock_done(ifp);
1404 }
1405 #endif /* SKYWALK */
1406 
1407 #define DLIL_INPUT_CHECK(m, ifp) {                                      \
1408 	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
1409 	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
1410 	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
1411 	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
1412 	/* NOTREACHED */                                        \
1413 	}                                                               \
1414 }
1415 
1416 #define DLIL_EWMA(old, new, decay) do {                                 \
1417 	u_int32_t _avg;                                                 \
1418 	if ((_avg = (old)) > 0)                                         \
1419 	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
1420 	else                                                            \
1421 	        _avg = (new);                                           \
1422 	(old) = _avg;                                                   \
1423 } while (0)
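/*
 * DLIL_EWMA computes an exponentially weighted moving average using a
 * power-of-two weight:
 *
 *	new_avg = (old * (2^decay - 1) + new) / 2^decay
 *
 * For example, with old = 100, new = 200 and decay = 3:
 * (100 * 7 + 200) >> 3 = 900 / 8 = 112 (integer arithmetic).
 */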
1424 
1425 #define MBPS    (1ULL * 1000 * 1000)
1426 #define GBPS    (MBPS * 1000)
1427 
1428 struct rxpoll_time_tbl {
1429 	u_int64_t       speed;          /* downlink speed */
1430 	u_int32_t       plowat;         /* packets low watermark */
1431 	u_int32_t       phiwat;         /* packets high watermark */
1432 	u_int32_t       blowat;         /* bytes low watermark */
1433 	u_int32_t       bhiwat;         /* bytes high watermark */
1434 };
1435 
1436 static struct rxpoll_time_tbl rxpoll_tbl[] = {
1437 	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
1438 	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1439 	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1440 	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1441 	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1442 	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
1443 };
1444 
1445 static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
1446     &dlil_lck_attributes);
1447 static uint32_t dlil_pending_thread_cnt = 0;
1448 
1449 static void
1450 dlil_incr_pending_thread_count(void)
1451 {
1452 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1453 	lck_mtx_lock(&dlil_thread_sync_lock);
1454 	dlil_pending_thread_cnt++;
1455 	lck_mtx_unlock(&dlil_thread_sync_lock);
1456 }
1457 
1458 static void
1459 dlil_decr_pending_thread_count(void)
1460 {
1461 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1462 	lck_mtx_lock(&dlil_thread_sync_lock);
1463 	VERIFY(dlil_pending_thread_cnt > 0);
1464 	dlil_pending_thread_cnt--;
1465 	if (dlil_pending_thread_cnt == 0) {
1466 		wakeup(&dlil_pending_thread_cnt);
1467 	}
1468 	lck_mtx_unlock(&dlil_thread_sync_lock);
1469 }
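/*
 * Hedged sketch of the waiting side (an assumption; the waiter lives
 * outside this excerpt): a caller that must wait for all pending DLIL
 * threads to finish starting up would sleep on the counter, e.g.
 *
 *	lck_mtx_lock(&dlil_thread_sync_lock);
 *	while (dlil_pending_thread_cnt != 0)
 *		msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
 *		    PZERO, __func__, NULL);
 *	lck_mtx_unlock(&dlil_thread_sync_lock);
 *
 * which pairs with the wakeup() in dlil_decr_pending_thread_count().
 */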
1470 
1471 int
1472 proto_hash_value(u_int32_t protocol_family)
1473 {
1474 	/*
1475 	 * dlil_proto_unplumb_all() depends on the mapping between
1476 	 * the hash bucket index and the protocol family defined
1477 	 * here; future changes must be applied there as well.
1478 	 */
1479 	switch (protocol_family) {
1480 	case PF_INET:
1481 		return 0;
1482 	case PF_INET6:
1483 		return 1;
1484 	case PF_VLAN:
1485 		return 2;
1486 	case PF_UNSPEC:
1487 	default:
1488 		return 3;
1489 	}
1490 }
1491 
1492 /*
1493  * Caller must already be holding ifnet lock.
1494  */
1495 static struct if_proto *
1496 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1497 {
1498 	struct if_proto *proto = NULL;
1499 	u_int32_t i = proto_hash_value(protocol_family);
1500 
1501 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1502 
1503 	if (ifp->if_proto_hash != NULL) {
1504 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1505 	}
1506 
1507 	while (proto != NULL && proto->protocol_family != protocol_family) {
1508 		proto = SLIST_NEXT(proto, next_hash);
1509 	}
1510 
1511 	if (proto != NULL) {
1512 		if_proto_ref(proto);
1513 	}
1514 
1515 	return proto;
1516 }
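/*
 * Illustrative caller pattern (hypothetical, not part of this file):
 * the lookup must be done with the ifnet lock held, and the reference
 * returned by find_attached_proto() is dropped with if_proto_free():
 *
 *	struct if_proto *proto;
 *
 *	ifnet_lock_shared(ifp);
 *	proto = find_attached_proto(ifp, PF_INET);
 *	ifnet_lock_done(ifp);
 *	if (proto != NULL) {
 *		(use proto...)
 *		if_proto_free(proto);
 *	}
 */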
1517 
1518 static void
1519 if_proto_ref(struct if_proto *proto)
1520 {
1521 	os_atomic_inc(&proto->refcount, relaxed);
1522 }
1523 
1524 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1525 
1526 static void
1527 if_proto_free(struct if_proto *proto)
1528 {
1529 	u_int32_t oldval;
1530 	struct ifnet *ifp = proto->ifp;
1531 	u_int32_t proto_family = proto->protocol_family;
1532 	struct kev_dl_proto_data ev_pr_data;
1533 
1534 	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
1535 	if (oldval > 1) {
1536 		return;
1537 	}
1538 
1539 	if (proto->proto_kpi == kProtoKPI_v1) {
1540 		if (proto->kpi.v1.detached) {
1541 			proto->kpi.v1.detached(ifp, proto->protocol_family);
1542 		}
1543 	}
1544 	if (proto->proto_kpi == kProtoKPI_v2) {
1545 		if (proto->kpi.v2.detached) {
1546 			proto->kpi.v2.detached(ifp, proto->protocol_family);
1547 		}
1548 	}
1549 
1550 	/*
1551 	 * Cleanup routes that may still be in the routing table for that
1552 	 * interface/protocol pair.
1553 	 */
1554 	if_rtproto_del(ifp, proto_family);
1555 
1556 	ifnet_lock_shared(ifp);
1557 
1558 	/* No more reference on this, protocol must have been detached */
1559 	VERIFY(proto->detached);
1560 
1561 	/*
1562 	 * The reserved field carries the number of protocols still attached
1563 	 * (subject to change)
1564 	 */
1565 	ev_pr_data.proto_family = proto_family;
1566 	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);
1567 
1568 	ifnet_lock_done(ifp);
1569 
1570 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
1571 	    (struct net_event_data *)&ev_pr_data,
1572 	    sizeof(struct kev_dl_proto_data), FALSE);
1573 
1574 	if (ev_pr_data.proto_remaining_count == 0) {
1575 		/*
1576 		 * The protocol count has gone to zero, mark the interface down.
1577 		 * This used to be done by configd.KernelEventMonitor, but that
1578 		 * is inherently prone to races (rdar://problem/30810208).
1579 		 */
1580 		(void) ifnet_set_flags(ifp, 0, IFF_UP);
1581 		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
1582 		dlil_post_sifflags_msg(ifp);
1583 	}
1584 
1585 	zfree(dlif_proto_zone, proto);
1586 }
1587 
1588 __private_extern__ void
1589 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1590 {
1591 #if !MACH_ASSERT
1592 #pragma unused(ifp)
1593 #endif
1594 	unsigned int type = 0;
1595 	int ass = 1;
1596 
1597 	switch (what) {
1598 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1599 		type = LCK_RW_ASSERT_EXCLUSIVE;
1600 		break;
1601 
1602 	case IFNET_LCK_ASSERT_SHARED:
1603 		type = LCK_RW_ASSERT_SHARED;
1604 		break;
1605 
1606 	case IFNET_LCK_ASSERT_OWNED:
1607 		type = LCK_RW_ASSERT_HELD;
1608 		break;
1609 
1610 	case IFNET_LCK_ASSERT_NOTOWNED:
1611 		/* nothing to do here for RW lock; bypass assert */
1612 		ass = 0;
1613 		break;
1614 
1615 	default:
1616 		panic("bad ifnet assert type: %d", what);
1617 		/* NOTREACHED */
1618 	}
1619 	if (ass) {
1620 		LCK_RW_ASSERT(&ifp->if_lock, type);
1621 	}
1622 }
1623 
1624 __private_extern__ void
1625 ifnet_lock_shared(struct ifnet *ifp)
1626 {
1627 	lck_rw_lock_shared(&ifp->if_lock);
1628 }
1629 
1630 __private_extern__ void
1631 ifnet_lock_exclusive(struct ifnet *ifp)
1632 {
1633 	lck_rw_lock_exclusive(&ifp->if_lock);
1634 }
1635 
1636 __private_extern__ void
1637 ifnet_lock_done(struct ifnet *ifp)
1638 {
1639 	lck_rw_done(&ifp->if_lock);
1640 }
1641 
1642 #if INET
1643 __private_extern__ void
1644 if_inetdata_lock_shared(struct ifnet *ifp)
1645 {
1646 	lck_rw_lock_shared(&ifp->if_inetdata_lock);
1647 }
1648 
1649 __private_extern__ void
1650 if_inetdata_lock_exclusive(struct ifnet *ifp)
1651 {
1652 	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
1653 }
1654 
1655 __private_extern__ void
1656 if_inetdata_lock_done(struct ifnet *ifp)
1657 {
1658 	lck_rw_done(&ifp->if_inetdata_lock);
1659 }
1660 #endif
1661 
1662 __private_extern__ void
1663 if_inet6data_lock_shared(struct ifnet *ifp)
1664 {
1665 	lck_rw_lock_shared(&ifp->if_inet6data_lock);
1666 }
1667 
1668 __private_extern__ void
1669 if_inet6data_lock_exclusive(struct ifnet *ifp)
1670 {
1671 	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
1672 }
1673 
1674 __private_extern__ void
1675 if_inet6data_lock_done(struct ifnet *ifp)
1676 {
1677 	lck_rw_done(&ifp->if_inet6data_lock);
1678 }
1679 
1680 __private_extern__ void
1681 ifnet_head_lock_shared(void)
1682 {
1683 	lck_rw_lock_shared(&ifnet_head_lock);
1684 }
1685 
1686 __private_extern__ void
1687 ifnet_head_lock_exclusive(void)
1688 {
1689 	lck_rw_lock_exclusive(&ifnet_head_lock);
1690 }
1691 
1692 __private_extern__ void
1693 ifnet_head_done(void)
1694 {
1695 	lck_rw_done(&ifnet_head_lock);
1696 }
1697 
1698 __private_extern__ void
1699 ifnet_head_assert_exclusive(void)
1700 {
1701 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
1702 }
1703 
1704 /*
1705  * dlil_ifp_protolist
1706  * - get the list of protocols attached to the interface, or just the number
1707  *   of attached protocols
1708  * - if the number returned is greater than 'list_count', truncation occurred
1709  *
1710  * Note:
1711  * - caller must already be holding ifnet lock.
1712  */
1713 static u_int32_t
1714 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1715     u_int32_t list_count)
1716 {
1717 	u_int32_t       count = 0;
1718 	int             i;
1719 
1720 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1721 
1722 	if (ifp->if_proto_hash == NULL) {
1723 		goto done;
1724 	}
1725 
1726 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1727 		struct if_proto *proto;
1728 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1729 			if (list != NULL && count < list_count) {
1730 				list[count] = proto->protocol_family;
1731 			}
1732 			count++;
1733 		}
1734 	}
1735 done:
1736 	return count;
1737 }
1738 
1739 __private_extern__ u_int32_t
1740 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
1741 {
1742 	ifnet_lock_shared(ifp);
1743 	count = dlil_ifp_protolist(ifp, protolist, count);
1744 	ifnet_lock_done(ifp);
1745 	return count;
1746 }
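/*
 * Sketch of a typical caller of if_get_protolist()/if_free_protolist()
 * (illustrative only; the allocator shown here is an assumption, any
 * allocation that kfree_data_addr() accepts would do):
 *
 *	u_int32_t n = if_get_protolist(ifp, NULL, 0);
 *	u_int32_t *list = kalloc_data(n * sizeof(u_int32_t), Z_WAITOK);
 *	n = if_get_protolist(ifp, list, n);
 *	... use list[0 .. MIN(n, count) - 1] ...
 *	if_free_protolist(list);
 *
 * If more protocols attach between the two calls, the second call
 * returns a count larger than the list it was given and the list is
 * truncated (see dlil_ifp_protolist() above).
 */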
1747 
1748 __private_extern__ void
1749 if_free_protolist(u_int32_t *list)
1750 {
1751 	kfree_data_addr(list);
1752 }
1753 
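/*
 * Post a KEV_NETWORK_CLASS kernel event for the interface.  When
 * event_data is NULL a plain net_event_data header is generated here;
 * callers with richer payloads pass a structure that begins with a
 * net_event_data, as if_proto_free() does above:
 *
 *	struct kev_dl_proto_data ev_pr_data;
 *	ev_pr_data.proto_family = proto_family;
 *	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);
 *	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
 *	    (struct net_event_data *)&ev_pr_data,
 *	    sizeof(struct kev_dl_proto_data), FALSE);
 */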
1754 __private_extern__ int
1755 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1756     u_int32_t event_code, struct net_event_data *event_data,
1757     u_int32_t event_data_len, boolean_t suppress_generation)
1758 {
1759 	struct net_event_data ev_data;
1760 	struct kev_msg ev_msg;
1761 
1762 	bzero(&ev_msg, sizeof(ev_msg));
1763 	bzero(&ev_data, sizeof(ev_data));
1764 	/*
1765 	 * a net event always starts with a net_event_data structure
1766 	 * but the caller can generate a simple net event or
1767 	 * provide a longer event structure to post
1768 	 */
1769 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1770 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1771 	ev_msg.kev_subclass     = event_subclass;
1772 	ev_msg.event_code       = event_code;
1773 
1774 	if (event_data == NULL) {
1775 		event_data = &ev_data;
1776 		event_data_len = sizeof(struct net_event_data);
1777 	}
1778 
1779 	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
1780 	event_data->if_family = ifp->if_family;
1781 	event_data->if_unit   = (u_int32_t)ifp->if_unit;
1782 
1783 	ev_msg.dv[0].data_length = event_data_len;
1784 	ev_msg.dv[0].data_ptr    = event_data;
1785 	ev_msg.dv[1].data_length = 0;
1786 
1787 	bool update_generation = true;
1788 	if (event_subclass == KEV_DL_SUBCLASS) {
1789 		/* Don't update interface generation for frequent link quality and state changes  */
1790 		switch (event_code) {
1791 		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
1792 		case KEV_DL_RRC_STATE_CHANGED:
1793 		case KEV_DL_PRIMARY_ELECTED:
1794 			update_generation = false;
1795 			break;
1796 		default:
1797 			break;
1798 		}
1799 	}
1800 
1801 	/*
1802 	 * Some events that would normally update the generation count
1803 	 * may want to suppress it.
1804 	 * One example is node presence/absence, where we still issue the
1805 	 * kernel event for the invocation but want to avoid the expensive
1806 	 * operation of updating the generation, which triggers NECP
1807 	 * client updates.
1808 	 */
1809 	if (suppress_generation) {
1810 		update_generation = false;
1811 	}
1812 
1813 	return dlil_event_internal(ifp, &ev_msg, update_generation);
1814 }
1815 
1816 __private_extern__ int
1817 dlil_alloc_local_stats(struct ifnet *ifp)
1818 {
1819 	int ret = EINVAL;
1820 	void *buf, *base, **pbuf;
1821 
1822 	if (ifp == NULL) {
1823 		goto end;
1824 	}
1825 
1826 	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
1827 		/* allocate tcpstat_local structure */
1828 		buf = zalloc_flags(dlif_tcpstat_zone,
1829 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
1830 
1831 		/* Get the 64-bit aligned base address for this object */
1832 		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
1833 		    sizeof(u_int64_t));
1834 		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
1835 		    ((intptr_t)buf + dlif_tcpstat_bufsize));
1836 
1837 		/*
1838 		 * Wind back a pointer size from the aligned base and
1839 		 * save the original address so we can free it later.
1840 		 */
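		/*
		 * Resulting layout inside the zone element (sketch):
		 *
		 *   buf              pbuf             base (64-bit aligned)
		 *    |<-- slack -->| saved buf ptr   | tcpstat_local ...   |
		 *
		 * The error path at the end of this function recovers buf
		 * from the word just before if_tcp_stat/if_udp_stat and
		 * returns it to the zone.
		 */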
1841 		pbuf = (void **)((intptr_t)base - sizeof(void *));
1842 		*pbuf = buf;
1843 		ifp->if_tcp_stat = base;
1844 
1845 		/* allocate udpstat_local structure */
1846 		buf = zalloc_flags(dlif_udpstat_zone,
1847 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
1848 
1849 		/* Get the 64-bit aligned base address for this object */
1850 		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
1851 		    sizeof(u_int64_t));
1852 		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
1853 		    ((intptr_t)buf + dlif_udpstat_bufsize));
1854 
1855 		/*
1856 		 * Wind back a pointer size from the aligned base and
1857 		 * save the original address so we can free it later.
1858 		 */
1859 		pbuf = (void **)((intptr_t)base - sizeof(void *));
1860 		*pbuf = buf;
1861 		ifp->if_udp_stat = base;
1862 
1863 		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
1864 		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));
1865 
1866 		ret = 0;
1867 	}
1868 
1869 	if (ifp->if_ipv4_stat == NULL) {
1870 		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
1871 	}
1872 
1873 	if (ifp->if_ipv6_stat == NULL) {
1874 		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
1875 	}
1876 end:
1877 	if (ifp != NULL && ret != 0) {
1878 		if (ifp->if_tcp_stat != NULL) {
1879 			pbuf = (void **)
1880 			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
1881 			zfree(dlif_tcpstat_zone, *pbuf);
1882 			ifp->if_tcp_stat = NULL;
1883 		}
1884 		if (ifp->if_udp_stat != NULL) {
1885 			pbuf = (void **)
1886 			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
1887 			zfree(dlif_udpstat_zone, *pbuf);
1888 			ifp->if_udp_stat = NULL;
1889 		}
1890 		/* The macro kfree_type sets the passed pointer to NULL */
1891 		if (ifp->if_ipv4_stat != NULL) {
1892 			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
1893 		}
1894 		if (ifp->if_ipv6_stat != NULL) {
1895 			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
1896 		}
1897 	}
1898 
1899 	return ret;
1900 }
1901 
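/*
 * Reset the opportunistic polling state for an interface: clear any
 * poll cycle, force the input model back to IFNET_MODEL_INPUT_POLL_OFF,
 * and zero the accumulated polling statistics, hold times and
 * timestamps.
 */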
1902 static void
1903 dlil_reset_rxpoll_params(ifnet_t ifp)
1904 {
1905 	ASSERT(ifp != NULL);
1906 	ifnet_set_poll_cycle(ifp, NULL);
1907 	ifp->if_poll_update = 0;
1908 	ifp->if_poll_flags = 0;
1909 	ifp->if_poll_req = 0;
1910 	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
1911 	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
1912 	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
1913 	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
1914 	net_timerclear(&ifp->if_poll_mode_holdtime);
1915 	net_timerclear(&ifp->if_poll_mode_lasttime);
1916 	net_timerclear(&ifp->if_poll_sample_holdtime);
1917 	net_timerclear(&ifp->if_poll_sample_lasttime);
1918 	net_timerclear(&ifp->if_poll_dbg_lasttime);
1919 }
1920 
1921 static int
1922 dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
1923     thread_continue_t *thfunc)
1924 {
1925 	boolean_t dlil_rxpoll_input;
1926 	thread_continue_t func = NULL;
1927 	u_int32_t limit;
1928 	int error = 0;
1929 
1930 	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
1931 	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));
1932 
1933 	/* default strategy utilizes the DLIL worker thread */
1934 	inp->dlth_strategy = dlil_input_async;
1935 
1936 	/* NULL ifp indicates the main input thread, called at dlil_init time */
1937 	if (ifp == NULL) {
1938 		/*
1939 		 * Main input thread only.
1940 		 */
1941 		func = dlil_main_input_thread_func;
1942 		VERIFY(inp == dlil_main_input_thread);
1943 		(void) strlcat(inp->dlth_name,
1944 		    "main_input", DLIL_THREADNAME_LEN);
1945 	} else if (dlil_rxpoll_input) {
1946 		/*
1947 		 * Legacy (non-netif) hybrid polling.
1948 		 */
1949 		func = dlil_rxpoll_input_thread_func;
1950 		VERIFY(inp != dlil_main_input_thread);
1951 		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
1952 		    "%s_input_poll", if_name(ifp));
1953 	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
1954 		/*
1955 		 * Asynchronous strategy.
1956 		 */
1957 		func = dlil_input_thread_func;
1958 		VERIFY(inp != dlil_main_input_thread);
1959 		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
1960 		    "%s_input", if_name(ifp));
1961 	} else {
1962 		/*
1963 		 * Synchronous strategy if there's a netif below and
1964 		 * the device isn't capable of hybrid polling.
1965 		 */
1966 		ASSERT(func == NULL);
1967 		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
1968 		VERIFY(inp != dlil_main_input_thread);
1969 		ASSERT(!inp->dlth_affinity);
1970 		inp->dlth_strategy = dlil_input_sync;
1971 	}
1972 	VERIFY(inp->dlth_thread == THREAD_NULL);
1973 
1974 	/* let caller know */
1975 	if (thfunc != NULL) {
1976 		*thfunc = func;
1977 	}
1978 
1979 	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
1980 	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);
1981 
1982 	inp->dlth_ifp = ifp; /* NULL for main input thread */
1983 
1984 	/*
1985 	 * For interfaces that support opportunistic polling, set the
1986 	 * low and high watermarks for outstanding inbound packets/bytes.
1987 	 * Also define freeze times for transitioning between modes
1988 	 * and updating the average.
1989 	 */
1990 	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
1991 		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
1992 		if (ifp->if_xflags & IFXF_LEGACY) {
1993 			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
1994 		}
1995 	} else {
1996 		/*
1997 		 * For interfaces that don't support opportunistic
1998 		 * polling, set the burst limit to prevent memory exhaustion.
1999 		 * The values of `if_rcvq_burst_limit' are safeguarded
2000 		 * on customer builds by `sysctl_rcvq_burst_limit'.
2001 		 */
2002 		limit = if_rcvq_burst_limit;
2003 	}
2004 
2005 	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
2006 	if (inp == dlil_main_input_thread) {
2007 		struct dlil_main_threading_info *inpm =
2008 		    (struct dlil_main_threading_info *)inp;
2009 		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
2010 	}
2011 
2012 	if (func == NULL) {
2013 		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
2014 		ASSERT(error == 0);
2015 		error = ENODEV;
2016 		goto done;
2017 	}
2018 
2019 	error = kernel_thread_start(func, inp, &inp->dlth_thread);
2020 	if (error == KERN_SUCCESS) {
2021 		thread_precedence_policy_data_t info;
2022 		__unused kern_return_t kret;
2023 
2024 		bzero(&info, sizeof(info));
2025 		info.importance = 0;
2026 		kret = thread_policy_set(inp->dlth_thread,
2027 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
2028 		    THREAD_PRECEDENCE_POLICY_COUNT);
2029 		ASSERT(kret == KERN_SUCCESS);
2030 		/*
2031 		 * We create an affinity set so that the matching workloop
2032 		 * thread or the starter thread (for loopback) can be
2033 		 * scheduled on the same processor set as the input thread.
2034 		 */
2035 		if (net_affinity) {
2036 			struct thread *tp = inp->dlth_thread;
2037 			u_int32_t tag;
2038 			/*
2039 			 * Randomize to reduce the probability
2040 			 * of affinity tag namespace collision.
2041 			 */
2042 			read_frandom(&tag, sizeof(tag));
2043 			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
2044 				thread_reference(tp);
2045 				inp->dlth_affinity_tag = tag;
2046 				inp->dlth_affinity = TRUE;
2047 			}
2048 		}
2049 	} else if (inp == dlil_main_input_thread) {
2050 		panic_plain("%s: couldn't create main input thread", __func__);
2051 		/* NOTREACHED */
2052 	} else {
2053 		panic_plain("%s: couldn't create %s input thread", __func__,
2054 		    if_name(ifp));
2055 		/* NOTREACHED */
2056 	}
2057 	OSAddAtomic(1, &cur_dlil_input_threads);
2058 
2059 done:
2060 	return error;
2061 }
2062 
2063 static void
2064 dlil_clean_threading_info(struct dlil_threading_info *inp)
2065 {
2066 	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
2067 	lck_grp_free(inp->dlth_lock_grp);
2068 	inp->dlth_lock_grp = NULL;
2069 
2070 	inp->dlth_flags = 0;
2071 	inp->dlth_wtot = 0;
2072 	bzero(inp->dlth_name, sizeof(inp->dlth_name));
2073 	inp->dlth_ifp = NULL;
2074 	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
2075 	qlimit(&inp->dlth_pkts) = 0;
2076 	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));
2077 
2078 	VERIFY(!inp->dlth_affinity);
2079 	inp->dlth_thread = THREAD_NULL;
2080 	inp->dlth_strategy = NULL;
2081 	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
2082 	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
2083 	VERIFY(inp->dlth_affinity_tag == 0);
2084 #if IFNET_INPUT_SANITY_CHK
2085 	inp->dlth_pkts_cnt = 0;
2086 #endif /* IFNET_INPUT_SANITY_CHK */
2087 }
2088 
2089 static void
2090 dlil_terminate_input_thread(struct dlil_threading_info *inp)
2091 {
2092 	struct ifnet *ifp = inp->dlth_ifp;
2093 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
2094 
2095 	VERIFY(current_thread() == inp->dlth_thread);
2096 	VERIFY(inp != dlil_main_input_thread);
2097 
2098 	OSAddAtomic(-1, &cur_dlil_input_threads);
2099 
2100 #if TEST_INPUT_THREAD_TERMINATION
2101 	{ /* do something useless that won't get optimized away */
2102 		uint32_t        v = 1;
2103 		for (uint32_t i = 0;
2104 		    i < if_input_thread_termination_spin;
2105 		    i++) {
2106 			v = (i + 1) * v;
2107 		}
2108 		DLIL_PRINTF("the value is %d\n", v);
2109 	}
2110 #endif /* TEST_INPUT_THREAD_TERMINATION */
2111 
2112 	lck_mtx_lock_spin(&inp->dlth_lock);
2113 	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
2114 	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
2115 	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
2116 	wakeup_one((caddr_t)&inp->dlth_flags);
2117 	lck_mtx_unlock(&inp->dlth_lock);
2118 
2119 	/* free up pending packets */
2120 	if (pkt.cp_mbuf != NULL) {
2121 		mbuf_freem_list(pkt.cp_mbuf);
2122 	}
2123 
2124 	/* for the extra refcnt from kernel_thread_start() */
2125 	thread_deallocate(current_thread());
2126 
2127 	if (dlil_verbose) {
2128 		DLIL_PRINTF("%s: input thread terminated\n",
2129 		    if_name(ifp));
2130 	}
2131 
2132 	/* this is the end */
2133 	thread_terminate(current_thread());
2134 	/* NOTREACHED */
2135 }
2136 
2137 static kern_return_t
2138 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2139 {
2140 	thread_affinity_policy_data_t policy;
2141 
2142 	bzero(&policy, sizeof(policy));
2143 	policy.affinity_tag = tag;
2144 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2145 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2146 }
2147 
2148 #if SKYWALK
2149 static void
2150 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2151     enum net_filter_event_subsystems state)
2152 {
2153 	evhlog(debug, "%s: eventhandler saw event type=net_filter_event_state event_code=0x%d",
2154 	    __func__, state);
2155 
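	/*
	 * Keep the flow-switch transport netagent enabled only while the
	 * PF private proxy is the sole active filter subsystem; any other
	 * active filter forces it off.  When the setting changes we call
	 * kern_nexus_update_netagents(); otherwise, if it stays disabled,
	 * we refresh NECP clients below.
	 */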
2156 	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2157 	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2158 		if_enable_fsw_transport_netagent = 1;
2159 	} else {
2160 		if_enable_fsw_transport_netagent = 0;
2161 	}
2162 	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2163 		kern_nexus_update_netagents();
2164 	} else if (!if_enable_fsw_transport_netagent) {
2165 		necp_update_all_clients();
2166 	}
2167 }
2168 #endif /* SKYWALK */
2169 
2170 void
2171 dlil_init(void)
2172 {
2173 	thread_t thread = THREAD_NULL;
2174 
2175 	/*
2176 	 * The following fields must be 64-bit aligned for atomic operations.
2177 	 */
2178 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2179 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2180 	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2181 	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2182 	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2183 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2184 	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2185 	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2186 	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2187 	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2188 	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2189 	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2190 	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2191 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2192 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2193 
2194 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2195 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2196 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2197 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2198 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2199 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2200 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2201 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2202 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2203 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2204 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2205 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2206 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2207 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2208 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2209 
2210 	/*
2211 	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2212 	 */
2213 	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2214 	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2215 	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2216 	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2217 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2218 	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2219 	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2220 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2221 	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2222 	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2223 	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2224 	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2225 	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2226 	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2227 
2228 	/*
2229 	 * ... as well as the mbuf checksum flags counterparts.
2230 	 */
2231 	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2232 	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2233 	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2234 	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2235 	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2236 	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2237 	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2238 	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2239 	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2240 	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2241 	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2242 
2243 	/*
2244 	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2245 	 */
2246 	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2247 	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2248 
2249 	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2250 	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2251 	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2252 	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2253 
2254 	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2255 	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2256 	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2257 
2258 	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2259 	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2260 	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2261 	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2262 	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2263 	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2264 	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2265 	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2266 	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2267 	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2268 	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2269 	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2270 	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2271 	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2272 	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2273 	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2274 	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2275 	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2276 
2277 	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2278 	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2279 	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2280 	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2281 	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2282 	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2283 	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2284 	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2285 	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
2286 	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
2287 	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);
2288 
2289 	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2290 	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2291 
2292 	PE_parse_boot_argn("net_affinity", &net_affinity,
2293 	    sizeof(net_affinity));
2294 
2295 	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2296 
2297 	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2298 
2299 	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2300 
2301 	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2302 
2303 	VERIFY(dlil_pending_thread_cnt == 0);
2304 #if SKYWALK
2305 	boolean_t pe_enable_fsw_transport_netagent = FALSE;
2306 	boolean_t pe_disable_fsw_transport_netagent = FALSE;
2307 	boolean_t enable_fsw_netagent =
2308 	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2309 	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2310 
2311 	/*
2312 	 * Check the device tree to see if Skywalk netagent has been explicitly
2313 	 * enabled or disabled.  This can be overridden via if_attach_nx below.
2314 	 * Note that the property is a 0-length key, and so checking for the
2315 	 * presence itself is enough (no need to check for the actual value of
2316 	 * the retrieved variable.)
2317 	 */
2318 	pe_enable_fsw_transport_netagent =
2319 	    PE_get_default("kern.skywalk_netagent_enable",
2320 	    &pe_enable_fsw_transport_netagent,
2321 	    sizeof(pe_enable_fsw_transport_netagent));
2322 	pe_disable_fsw_transport_netagent =
2323 	    PE_get_default("kern.skywalk_netagent_disable",
2324 	    &pe_disable_fsw_transport_netagent,
2325 	    sizeof(pe_disable_fsw_transport_netagent));
2326 
2327 	/*
2328 	 * These two are mutually exclusive, i.e. they both can be absent,
2329 	 * but only one can be present at a time, and so we assert to make
2330 	 * sure it is correct.
2331 	 */
2332 	VERIFY((!pe_enable_fsw_transport_netagent &&
2333 	    !pe_disable_fsw_transport_netagent) ||
2334 	    (pe_enable_fsw_transport_netagent ^
2335 	    pe_disable_fsw_transport_netagent));
2336 
2337 	if (pe_enable_fsw_transport_netagent) {
2338 		kprintf("SK: netagent is enabled via an override for "
2339 		    "this platform\n");
2340 		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2341 	} else if (pe_disable_fsw_transport_netagent) {
2342 		kprintf("SK: netagent is disabled via an override for "
2343 		    "this platform\n");
2344 		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2345 	} else {
2346 		kprintf("SK: netagent is %s by default for this platform\n",
2347 		    (enable_fsw_netagent ? "enabled" : "disabled"));
2348 		if_attach_nx = IF_ATTACH_NX_DEFAULT;
2349 	}
2350 
2351 	/*
2352 	 * Now see if there's a boot-arg override.
2353 	 */
2354 	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2355 	    sizeof(if_attach_nx));
2356 	if_enable_fsw_transport_netagent =
2357 	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2358 
2359 	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2360 
2361 	if (pe_disable_fsw_transport_netagent &&
2362 	    if_enable_fsw_transport_netagent) {
2363 		kprintf("SK: netagent is force-enabled\n");
2364 	} else if (!pe_disable_fsw_transport_netagent &&
2365 	    !if_enable_fsw_transport_netagent) {
2366 		kprintf("SK: netagent is force-disabled\n");
2367 	}
2368 	if (kernel_is_macos_or_server() && if_enable_fsw_transport_netagent) {
2369 		net_filter_event_register(dlil_filter_event);
2370 	}
2371 
2372 #if (DEVELOPMENT || DEBUG)
2373 	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2374 	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2375 #endif /* (DEVELOPMENT || DEBUG) */
2376 
2377 #endif /* SKYWALK */
2378 	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2379 	    sizeof(struct dlil_ifnet_dbg);
2380 	/* Enforce 64-bit alignment for dlil_ifnet structure */
2381 	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2382 	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2383 	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2384 
2385 	dlif_tcpstat_size = sizeof(struct tcpstat_local);
2386 	/* Enforce 64-bit alignment for tcpstat_local structure */
2387 	dlif_tcpstat_bufsize =
2388 	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2389 	dlif_tcpstat_bufsize = (uint32_t)
2390 	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2391 	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2392 	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2393 
2394 	dlif_udpstat_size = sizeof(struct udpstat_local);
2395 	/* Enforce 64-bit alignment for udpstat_local structure */
2396 	dlif_udpstat_bufsize =
2397 	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2398 	dlif_udpstat_bufsize = (uint32_t)
2399 	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2400 	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2401 	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2402 
2403 	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2404 
2405 	TAILQ_INIT(&dlil_ifnet_head);
2406 	TAILQ_INIT(&ifnet_head);
2407 	TAILQ_INIT(&ifnet_detaching_head);
2408 	TAILQ_INIT(&ifnet_ordered_head);
2409 
2410 	/* Initialize interface address subsystem */
2411 	ifa_init();
2412 
2413 #if PF
2414 	/* Initialize the packet filter */
2415 	pfinit();
2416 #endif /* PF */
2417 
2418 	/* Initialize queue algorithms */
2419 	classq_init();
2420 
2421 	/* Initialize packet schedulers */
2422 	pktsched_init();
2423 
2424 	/* Initialize flow advisory subsystem */
2425 	flowadv_init();
2426 
2427 	/* Initialize the pktap virtual interface */
2428 	pktap_init();
2429 
2430 	/* Initialize droptap interface */
2431 	droptap_init();
2432 
2433 	/* Initialize the service class to dscp map */
2434 	net_qos_map_init();
2435 
2436 	/* Initialize the interface low power mode event handler */
2437 	if_low_power_evhdlr_init();
2438 
2439 	/* Initialize the interface offload port list subsystem */
2440 	if_ports_used_init();
2441 
2442 #if DEBUG || DEVELOPMENT
2443 	/* Run self-tests */
2444 	dlil_verify_sum16();
2445 #endif /* DEBUG || DEVELOPMENT */
2446 
2447 	/*
2448 	 * Create and start up the main DLIL input thread and the interface
2449 	 * detacher thread once everything is initialized.
2450 	 */
2451 	dlil_incr_pending_thread_count();
2452 	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2453 
2454 	/*
2455 	 * Create ifnet detacher thread.
2456 	 * When an interface gets detached, part of the detach processing
2457 	 * is delayed. The interface is added to the delayed detach list
2458 	 * and this thread is woken up to call ifnet_detach_final
2459 	 * on these interfaces.
2460 	 */
2461 	dlil_incr_pending_thread_count();
2462 	if (kernel_thread_start(ifnet_detacher_thread_func,
2463 	    NULL, &thread) != KERN_SUCCESS) {
2464 		panic_plain("%s: couldn't create detacher thread", __func__);
2465 		/* NOTREACHED */
2466 	}
2467 	thread_deallocate(thread);
2468 
2469 	/*
2470 	 * Wait for the dlil kernel threads created above to get
2471 	 * scheduled and run at least once before we proceed.
2472 	 */
2473 	lck_mtx_lock(&dlil_thread_sync_lock);
2474 	while (dlil_pending_thread_cnt != 0) {
2475 		DLIL_PRINTF("%s: Waiting for all the created dlil kernel "
2476 		    "threads to get scheduled at least once.\n", __func__);
2477 		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2478 		    (PZERO - 1), __func__, NULL);
2479 		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2480 	}
2481 	lck_mtx_unlock(&dlil_thread_sync_lock);
2482 	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2483 	    "scheduled at least once. Proceeding.\n", __func__);
2484 }
2485 
2486 static void
2487 if_flt_monitor_busy(struct ifnet *ifp)
2488 {
2489 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2490 
2491 	++ifp->if_flt_busy;
2492 	VERIFY(ifp->if_flt_busy != 0);
2493 }
2494 
2495 static void
2496 if_flt_monitor_unbusy(struct ifnet *ifp)
2497 {
2498 	if_flt_monitor_leave(ifp);
2499 }
2500 
2501 static void
2502 if_flt_monitor_enter(struct ifnet *ifp)
2503 {
2504 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2505 
2506 	while (ifp->if_flt_busy) {
2507 		++ifp->if_flt_waiters;
2508 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2509 		    (PZERO - 1), "if_flt_monitor", NULL);
2510 	}
2511 	if_flt_monitor_busy(ifp);
2512 }
2513 
2514 static void
2515 if_flt_monitor_leave(struct ifnet *ifp)
2516 {
2517 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2518 
2519 	VERIFY(ifp->if_flt_busy != 0);
2520 	--ifp->if_flt_busy;
2521 
2522 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2523 		ifp->if_flt_waiters = 0;
2524 		wakeup(&ifp->if_flt_head);
2525 	}
2526 }
2527 
2528 __private_extern__ int
2529 dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
2530     interface_filter_t *filter_ref, u_int32_t flags)
2531 {
2532 	int retval = 0;
2533 	struct ifnet_filter *filter = NULL;
2534 
2535 	ifnet_head_lock_shared();
2536 
2537 	/* Check that the interface is in the global list */
2538 	if (!ifnet_lookup(ifp)) {
2539 		retval = ENXIO;
2540 		goto done;
2541 	}
2542 	if (!ifnet_is_attached(ifp, 1)) {
2543 		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
2544 		    __func__, if_name(ifp));
2545 		retval = ENXIO;
2546 		goto done;
2547 	}
2548 
2549 	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2550 
2551 	/* refcnt held above during lookup */
2552 	filter->filt_flags = flags;
2553 	filter->filt_ifp = ifp;
2554 	filter->filt_cookie = if_filter->iff_cookie;
2555 	filter->filt_name = if_filter->iff_name;
2556 	filter->filt_protocol = if_filter->iff_protocol;
2557 	/*
2558 	 * Do not install filter callbacks for internal coproc interface
2559 	 * Do not install filter callbacks for internal coproc interfaces
2560 	 * or management interfaces.
2561 	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
2562 		filter->filt_input = if_filter->iff_input;
2563 		filter->filt_output = if_filter->iff_output;
2564 		filter->filt_event = if_filter->iff_event;
2565 		filter->filt_ioctl = if_filter->iff_ioctl;
2566 	}
2567 	filter->filt_detached = if_filter->iff_detached;
2568 
2569 	lck_mtx_lock(&ifp->if_flt_lock);
2570 	if_flt_monitor_enter(ifp);
2571 
2572 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2573 	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);
2574 
2575 	*filter_ref = filter;
2576 
2577 	/*
2578 	 * Bump filter count and route_generation ID to let TCP
2579 	 * know it shouldn't do TSO on this connection
2580 	 */
2581 	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
2582 		ifnet_filter_update_tso(ifp, TRUE);
2583 	}
2584 	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
2585 	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
2586 	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
2587 		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
2588 		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
2589 	} else {
2590 		OSAddAtomic(1, &ifp->if_flt_non_os_count);
2591 	}
2592 	if_flt_monitor_leave(ifp);
2593 	lck_mtx_unlock(&ifp->if_flt_lock);
2594 
2595 #if SKYWALK
2596 	if (kernel_is_macos_or_server()) {
2597 		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
2598 		    net_check_compatible_if_filter(NULL));
2599 	}
2600 #endif /* SKYWALK */
2601 
2602 	if (dlil_verbose) {
2603 		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
2604 		    if_filter->iff_name);
2605 	}
2606 	ifnet_decr_iorefcnt(ifp);
2607 
2608 done:
2609 	ifnet_head_done();
2610 	if (retval != 0 && ifp != NULL) {
2611 		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
2612 		    if_name(ifp), if_filter->iff_name, retval);
2613 	}
2614 	if (retval != 0 && filter != NULL) {
2615 		zfree(dlif_filt_zone, filter);
2616 	}
2617 
2618 	return retval;
2619 }
2620 
2621 static int
2622 dlil_detach_filter_internal(interface_filter_t  filter, int detached)
2623 {
2624 	int retval = 0;
2625 
2626 	if (detached == 0) {
2627 		ifnet_t ifp = NULL;
2628 
2629 		ifnet_head_lock_shared();
2630 		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
2631 			interface_filter_t entry = NULL;
2632 
2633 			lck_mtx_lock(&ifp->if_flt_lock);
2634 			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
2635 				if (entry != filter || entry->filt_skip) {
2636 					continue;
2637 				}
2638 				/*
2639 				 * We've found a match; since it's possible
2640 				 * that the thread gets blocked in the monitor,
2641 				 * we do the lock dance.  Interface should
2642 				 * not be detached since we still have a use
2643 				 * count held during filter attach.
2644 				 */
2645 				entry->filt_skip = 1;   /* skip input/output */
2646 				lck_mtx_unlock(&ifp->if_flt_lock);
2647 				ifnet_head_done();
2648 
2649 				lck_mtx_lock(&ifp->if_flt_lock);
2650 				if_flt_monitor_enter(ifp);
2651 				LCK_MTX_ASSERT(&ifp->if_flt_lock,
2652 				    LCK_MTX_ASSERT_OWNED);
2653 
2654 				/* Remove the filter from the list */
2655 				TAILQ_REMOVE(&ifp->if_flt_head, filter,
2656 				    filt_next);
2657 
2658 				if (dlil_verbose) {
2659 					DLIL_PRINTF("%s: %s filter detached\n",
2660 					    if_name(ifp), filter->filt_name);
2661 				}
2662 				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
2663 					VERIFY(ifp->if_flt_non_os_count != 0);
2664 					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
2665 				}
2666 				/*
2667 				 * Decrease filter count and route_generation
2668 				 * ID to let TCP know it should reevaluate doing
2669 				 * TSO or not.
2670 				 */
2671 				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
2672 					ifnet_filter_update_tso(ifp, FALSE);
2673 				}
2674 				/*
2675 				 * When we remove the bridge's interface filter,
2676 				 * clear the field in the ifnet.
2677 				 */
2678 				if ((filter->filt_flags & DLIL_IFF_BRIDGE)
2679 				    != 0) {
2680 					ifp->if_bridge = NULL;
2681 				}
2682 				if_flt_monitor_leave(ifp);
2683 				lck_mtx_unlock(&ifp->if_flt_lock);
2684 				goto destroy;
2685 			}
2686 			lck_mtx_unlock(&ifp->if_flt_lock);
2687 		}
2688 		ifnet_head_done();
2689 
2690 		/* filter parameter is not a valid filter ref */
2691 		retval = EINVAL;
2692 		goto done;
2693 	} else {
2694 		struct ifnet *ifp = filter->filt_ifp;
2695 		/*
2696 		 * Here we are called from ifnet_detach_final(); the
2697 		 * caller had emptied if_flt_head and we're doing an
2698 		 * implicit filter detach because the interface is
2699 		 * about to go away.  Make sure to adjust the counters
2700 		 * in this case.  We don't need the protection of the
2701 		 * filter monitor since we're called as part of the
2702 		 * final detach in the context of the detacher thread.
2703 		 */
2704 		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
2705 			VERIFY(ifp->if_flt_non_os_count != 0);
2706 			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
2707 		}
2708 		/*
2709 		 * Decrease filter count and route_generation
2710 		 * ID to let TCP know it should reevaluate doing
2711 		 * TSO or not.
2712 		 */
2713 		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
2714 			ifnet_filter_update_tso(ifp, FALSE);
2715 		}
2716 	}
2717 
2718 	if (dlil_verbose) {
2719 		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
2720 	}
2721 
2722 destroy:
2723 
2724 	/* Call the detached function if there is one */
2725 	if (filter->filt_detached) {
2726 		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
2727 	}
2728 
2729 	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
2730 	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
2731 		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
2732 	}
2733 #if SKYWALK
2734 	if (kernel_is_macos_or_server()) {
2735 		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
2736 		    net_check_compatible_if_filter(NULL));
2737 	}
2738 #endif /* SKYWALK */
2739 
2740 	/* Free the filter */
2741 	zfree(dlif_filt_zone, filter);
2742 	filter = NULL;
2743 done:
2744 	if (retval != 0 && filter != NULL) {
2745 		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
2746 		    filter->filt_name, retval);
2747 	}
2748 
2749 	return retval;
2750 }
2751 
2752 __private_extern__ void
2753 dlil_detach_filter(interface_filter_t filter)
2754 {
2755 	if (filter == NULL) {
2756 		return;
2757 	}
2758 	dlil_detach_filter_internal(filter, 0);
2759 }
2760 
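/*
 * Returns TRUE when at least one IP filter other than an OS/internal
 * one is attached; nas_ipf_add_os_count is a subset of
 * nas_ipf_add_count, hence the VERIFY below.
 */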
2761 __private_extern__ boolean_t
2762 dlil_has_ip_filter(void)
2763 {
2764 	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
2765 
2766 	VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
2767 
2768 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2769 	return has_filter;
2770 }
2771 
2772 __private_extern__ boolean_t
2773 dlil_has_if_filter(struct ifnet *ifp)
2774 {
2775 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2776 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2777 	return has_filter;
2778 }
2779 
2780 static inline void
2781 dlil_input_wakeup(struct dlil_threading_info *inp)
2782 {
2783 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2784 
2785 	inp->dlth_flags |= DLIL_INPUT_WAITING;
2786 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
2787 		inp->dlth_wtot++;
2788 		wakeup_one((caddr_t)&inp->dlth_flags);
2789 	}
2790 }
2791 
2792 __attribute__((noreturn))
2793 static void
2794 dlil_main_input_thread_func(void *v, wait_result_t w)
2795 {
2796 #pragma unused(w)
2797 	struct dlil_threading_info *inp = v;
2798 
2799 	VERIFY(inp == dlil_main_input_thread);
2800 	VERIFY(inp->dlth_ifp == NULL);
2801 	VERIFY(current_thread() == inp->dlth_thread);
2802 
2803 	lck_mtx_lock(&inp->dlth_lock);
2804 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
2805 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
2806 	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
2807 	/* wake up once to get out of embryonic state */
2808 	dlil_input_wakeup(inp);
2809 	lck_mtx_unlock(&inp->dlth_lock);
2810 	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
2811 	/* NOTREACHED */
2812 	__builtin_unreachable();
2813 }
2814 
2815 /*
2816  * Main input thread:
2817  *
2818  *   a) handles all inbound packets for lo0
2819  *   b) handles all inbound packets for interfaces with no dedicated
2820  *	input thread (e.g. anything but Ethernet/PDP or those that support
2821  *	opportunistic polling.)
2822  *   c) protocol registrations
2823  *   d) packet injections
2824  */
2825 __attribute__((noreturn))
2826 static void
2827 dlil_main_input_thread_cont(void *v, wait_result_t wres)
2828 {
2829 	struct dlil_main_threading_info *inpm = v;
2830 	struct dlil_threading_info *inp = v;
2831 
2832 	/* main input thread is uninterruptible */
2833 	VERIFY(wres != THREAD_INTERRUPTED);
2834 	lck_mtx_lock_spin(&inp->dlth_lock);
2835 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
2836 	    DLIL_INPUT_RUNNING)));
2837 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
2838 
2839 	while (1) {
2840 		struct mbuf *m = NULL, *m_loop = NULL;
2841 		u_int32_t m_cnt, m_cnt_loop;
2842 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
2843 		boolean_t proto_req;
2844 		boolean_t embryonic;
2845 
2846 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
2847 
2848 		if (__improbable(embryonic =
2849 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
2850 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
2851 		}
2852 
2853 		proto_req = (inp->dlth_flags &
2854 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
2855 
2856 		/* Packets for non-dedicated interfaces other than lo0 */
2857 		m_cnt = qlen(&inp->dlth_pkts);
2858 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
2859 		m = pkt.cp_mbuf;
2860 
2861 		/* Packets exclusive to lo0 */
2862 		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
2863 		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
2864 		m_loop = pkt.cp_mbuf;
2865 
2866 		inp->dlth_wtot = 0;
2867 
2868 		lck_mtx_unlock(&inp->dlth_lock);
2869 
2870 		if (__improbable(embryonic)) {
2871 			dlil_decr_pending_thread_count();
2872 		}
2873 
2874 		/*
2875 		 * NOTE warning %%% attention !!!!
2876 		 * We should think about adding thread-starvation
2877 		 * safeguards when dealing with long chains of packets.
2878 		 */
2879 		if (__probable(m_loop != NULL)) {
2880 			dlil_input_packet_list_extended(lo_ifp, m_loop,
2881 			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
2882 		}
2883 
2884 		if (__probable(m != NULL)) {
2885 			dlil_input_packet_list_extended(NULL, m,
2886 			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
2887 		}
2888 
2889 		if (__improbable(proto_req)) {
2890 			proto_input_run();
2891 		}
2892 
2893 		lck_mtx_lock_spin(&inp->dlth_lock);
2894 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
2895 		/* main input thread cannot be terminated */
2896 		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
2897 		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
2898 			break;
2899 		}
2900 	}
2901 
2902 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
2903 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
2904 	lck_mtx_unlock(&inp->dlth_lock);
2905 	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
2906 
2907 	VERIFY(0);      /* we should never get here */
2908 	/* NOTREACHED */
2909 	__builtin_unreachable();
2910 }
2911 
2912 /*
2913  * Input thread for interfaces with legacy input model.
2914  */
2915 __attribute__((noreturn))
2916 static void
2917 dlil_input_thread_func(void *v, wait_result_t w)
2918 {
2919 #pragma unused(w)
2920 	char thread_name[MAXTHREADNAMESIZE];
2921 	struct dlil_threading_info *inp = v;
2922 	struct ifnet *ifp = inp->dlth_ifp;
2923 
2924 	VERIFY(inp != dlil_main_input_thread);
2925 	VERIFY(ifp != NULL);
2926 	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
2927 	    !(ifp->if_xflags & IFXF_LEGACY));
2928 	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
2929 	    !(ifp->if_xflags & IFXF_LEGACY));
2930 	VERIFY(current_thread() == inp->dlth_thread);
2931 
2932 	/* construct the name for this thread, and then apply it */
2933 	bzero(thread_name, sizeof(thread_name));
2934 	(void) snprintf(thread_name, sizeof(thread_name),
2935 	    "dlil_input_%s", ifp->if_xname);
2936 	thread_set_thread_name(inp->dlth_thread, thread_name);
2937 
2938 	lck_mtx_lock(&inp->dlth_lock);
2939 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
2940 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
2941 	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
2942 	/* wake up once to get out of embryonic state */
2943 	dlil_input_wakeup(inp);
2944 	lck_mtx_unlock(&inp->dlth_lock);
2945 	(void) thread_block_parameter(dlil_input_thread_cont, inp);
2946 	/* NOTREACHED */
2947 	__builtin_unreachable();
2948 }
2949 
2950 __attribute__((noreturn))
2951 static void
2952 dlil_input_thread_cont(void *v, wait_result_t wres)
2953 {
2954 	struct dlil_threading_info *inp = v;
2955 	struct ifnet *ifp = inp->dlth_ifp;
2956 
2957 	lck_mtx_lock_spin(&inp->dlth_lock);
2958 	if (__improbable(wres == THREAD_INTERRUPTED ||
2959 	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
2960 		goto terminate;
2961 	}
2962 
2963 	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
2964 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
2965 
2966 	while (1) {
2967 		struct mbuf *m = NULL;
2968 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
2969 		boolean_t notify = FALSE;
2970 		boolean_t embryonic;
2971 		u_int32_t m_cnt;
2972 
2973 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
2974 
2975 		if (__improbable(embryonic =
2976 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
2977 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
2978 		}
2979 
2980 		/*
2981 		 * Protocol registration and injection must always use
2982 		 * the main input thread; in theory the latter can utilize
2983 		 * the corresponding input thread where the packet arrived
2984 		 * on, but that requires our knowing the interface in advance
2985 		 * (and the benefits might not be worth the trouble.)
2986 		 */
2987 		VERIFY(!(inp->dlth_flags &
2988 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
2989 
2990 		/* Packets for this interface */
2991 		m_cnt = qlen(&inp->dlth_pkts);
2992 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
2993 		m = pkt.cp_mbuf;
2994 
2995 		inp->dlth_wtot = 0;
2996 
2997 #if SKYWALK
2998 		/*
2999 		 * If this interface is attached to a netif nexus,
3000 		 * the stats are already incremented there; otherwise
3001 		 * do it here.
3002 		 */
3003 		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
3004 #endif /* SKYWALK */
3005 		notify = dlil_input_stats_sync(ifp, inp);
3006 
3007 		lck_mtx_unlock(&inp->dlth_lock);
3008 
3009 		if (__improbable(embryonic)) {
3010 			ifnet_decr_pending_thread_count(ifp);
3011 		}
3012 
3013 		if (__improbable(notify)) {
3014 			ifnet_notify_data_threshold(ifp);
3015 		}
3016 
3017 		/*
3018 		 * NOTE warning %%% attention !!!!
3019 		 * We should think about adding thread-starvation
3020 		 * safeguards when dealing with long chains of packets.
3021 		 */
3022 		if (__probable(m != NULL)) {
3023 			dlil_input_packet_list_extended(ifp, m,
3024 			    m_cnt, ifp->if_poll_mode);
3025 		}
3026 
3027 		lck_mtx_lock_spin(&inp->dlth_lock);
3028 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3029 		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3030 		    DLIL_INPUT_TERMINATE))) {
3031 			break;
3032 		}
3033 	}
3034 
3035 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3036 
3037 	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3038 terminate:
3039 		lck_mtx_unlock(&inp->dlth_lock);
3040 		dlil_terminate_input_thread(inp);
3041 		/* NOTREACHED */
3042 	} else {
3043 		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3044 		lck_mtx_unlock(&inp->dlth_lock);
3045 		(void) thread_block_parameter(dlil_input_thread_cont, inp);
3046 		/* NOTREACHED */
3047 	}
3048 
3049 	VERIFY(0);      /* we should never get here */
3050 	/* NOTREACHED */
3051 	__builtin_unreachable();
3052 }
3053 
3054 /*
3055  * Input thread for interfaces with opportunistic polling input model.
3056  */
3057 __attribute__((noreturn))
3058 static void
3059 dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
3060 {
3061 #pragma unused(w)
3062 	char thread_name[MAXTHREADNAMESIZE];
3063 	struct dlil_threading_info *inp = v;
3064 	struct ifnet *ifp = inp->dlth_ifp;
3065 
3066 	VERIFY(inp != dlil_main_input_thread);
3067 	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
3068 	    (ifp->if_xflags & IFXF_LEGACY));
3069 	VERIFY(current_thread() == inp->dlth_thread);
3070 
3071 	/* construct the name for this thread, and then apply it */
3072 	bzero(thread_name, sizeof(thread_name));
3073 	(void) snprintf(thread_name, sizeof(thread_name),
3074 	    "dlil_input_poll_%s", ifp->if_xname);
3075 	thread_set_thread_name(inp->dlth_thread, thread_name);
3076 
3077 	lck_mtx_lock(&inp->dlth_lock);
3078 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3079 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3080 	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3081 	/* wake up once to get out of embryonic state */
3082 	dlil_input_wakeup(inp);
3083 	lck_mtx_unlock(&inp->dlth_lock);
3084 	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
3085 	/* NOTREACHED */
3086 	__builtin_unreachable();
3087 }
3088 
3089 __attribute__((noreturn))
3090 static void
3091 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3092 {
3093 	struct dlil_threading_info *inp = v;
3094 	struct ifnet *ifp = inp->dlth_ifp;
3095 	struct timespec ts;
3096 
3097 	lck_mtx_lock_spin(&inp->dlth_lock);
3098 	if (__improbable(wres == THREAD_INTERRUPTED ||
3099 	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3100 		goto terminate;
3101 	}
3102 
3103 	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3104 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
3105 
3106 	while (1) {
3107 		struct mbuf *m = NULL;
3108 		uint32_t m_cnt, poll_req = 0;
3109 		uint64_t m_size = 0;
3110 		ifnet_model_t mode;
3111 		struct timespec now, delta;
3112 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3113 		boolean_t notify;
3114 		boolean_t embryonic;
3115 		uint64_t ival;
3116 
3117 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3118 
3119 		if (__improbable(embryonic =
3120 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3121 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3122 			goto skip;
3123 		}
3124 
3125 		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3126 			ival = IF_RXPOLL_INTERVALTIME_MIN;
3127 		}
3128 
3129 		/* Link parameters changed? */
3130 		if (ifp->if_poll_update != 0) {
3131 			ifp->if_poll_update = 0;
3132 			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3133 		}
3134 
3135 		/* Current operating mode */
3136 		mode = ifp->if_poll_mode;
3137 
3138 		/*
3139 		 * Protocol registration and injection must always use
3140 		 * the main input thread; in theory the latter can utilize
3141 		 * the corresponding input thread where the packet arrived
3142 		 * on, but that requires our knowing the interface in advance
3143 		 * (and the benefits might not be worth the trouble.)
3144 		 */
3145 		VERIFY(!(inp->dlth_flags &
3146 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3147 
3148 		/* Total count of all packets */
3149 		m_cnt = qlen(&inp->dlth_pkts);
3150 
3151 		/* Total bytes of all packets */
3152 		m_size = qsize(&inp->dlth_pkts);
3153 
3154 		/* Packets for this interface */
3155 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3156 		m = pkt.cp_mbuf;
3157 		VERIFY(m != NULL || m_cnt == 0);
3158 
3159 		nanouptime(&now);
3160 		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3161 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3162 		}
3163 
3164 		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3165 		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3166 			u_int32_t ptot, btot;
3167 
3168 			/* Accumulate statistics for current sampling */
3169 			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3170 
3171 			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3172 				goto skip;
3173 			}
3174 
3175 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3176 
3177 			/* Calculate min/max of inbound bytes */
3178 			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3179 			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3180 				ifp->if_rxpoll_bmin = btot;
3181 			}
3182 			if (btot > ifp->if_rxpoll_bmax) {
3183 				ifp->if_rxpoll_bmax = btot;
3184 			}
3185 
3186 			/* Calculate EWMA of inbound bytes */
3187 			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3188 
3189 			/* Calculate min/max of inbound packets */
3190 			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3191 			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3192 				ifp->if_rxpoll_pmin = ptot;
3193 			}
3194 			if (ptot > ifp->if_rxpoll_pmax) {
3195 				ifp->if_rxpoll_pmax = ptot;
3196 			}
3197 
3198 			/* Calculate EWMA of inbound packets */
3199 			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3200 
3201 			/* Reset sampling statistics */
3202 			PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3203 
3204 			/* Calculate EWMA of wakeup requests */
3205 			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3206 			    if_rxpoll_decay);
3207 			inp->dlth_wtot = 0;
3208 
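			/*
			 * The DLIL_EWMA() updates above maintain exponentially
			 * weighted moving averages of inbound bytes, packets
			 * and wakeup requests per sampling interval.  Assuming
			 * the decay is a power-of-two shift (an editorial
			 * assumption; see the macro definition earlier in this
			 * file), each update is roughly
			 * avg += (sample - avg) / 2^if_rxpoll_decay.
			 */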
3209 			if (dlil_verbose) {
3210 				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3211 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3212 				}
3213 				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3214 				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3215 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3216 					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3217 					    "limits [%d/%d], wreq avg %d "
3218 					    "limits [%d/%d], bytes avg %d "
3219 					    "limits [%d/%d]\n", if_name(ifp),
3220 					    (ifp->if_poll_mode ==
3221 					    IFNET_MODEL_INPUT_POLL_ON) ?
3222 					    "ON" : "OFF", ifp->if_rxpoll_pavg,
3223 					    ifp->if_rxpoll_pmax,
3224 					    ifp->if_rxpoll_plowat,
3225 					    ifp->if_rxpoll_phiwat,
3226 					    ifp->if_rxpoll_wavg,
3227 					    ifp->if_rxpoll_wlowat,
3228 					    ifp->if_rxpoll_whiwat,
3229 					    ifp->if_rxpoll_bavg,
3230 					    ifp->if_rxpoll_blowat,
3231 					    ifp->if_rxpoll_bhiwat);
3232 				}
3233 			}
3234 
3235 			/* Perform mode transition, if necessary */
3236 			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3237 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3238 			}
3239 
3240 			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3241 			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3242 				goto skip;
3243 			}
3244 
3245 			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3246 			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3247 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3248 				mode = IFNET_MODEL_INPUT_POLL_OFF;
3249 			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3250 			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3251 			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3252 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3253 				mode = IFNET_MODEL_INPUT_POLL_ON;
3254 			}
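			/*
			 * Hysteresis: polling switches OFF only when both the
			 * packet and byte EWMAs have fallen to their low
			 * watermarks, and switches ON only when the packet
			 * EWMA reaches its high watermark and either the byte
			 * or the wakeup EWMA does too; the mode hold time
			 * checked above keeps the mode from flapping between
			 * samples.
			 */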
3255 
3256 			if (mode != ifp->if_poll_mode) {
3257 				ifp->if_poll_mode = mode;
3258 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3259 				poll_req++;
3260 			}
3261 		}
3262 skip:
3263 		notify = dlil_input_stats_sync(ifp, inp);
3264 
3265 		lck_mtx_unlock(&inp->dlth_lock);
3266 
3267 		if (__improbable(embryonic)) {
3268 			ifnet_decr_pending_thread_count(ifp);
3269 		}
3270 
3271 		if (__improbable(notify)) {
3272 			ifnet_notify_data_threshold(ifp);
3273 		}
3274 
3275 		/*
3276 		 * If there's a mode change and the interface is still attached,
3277 		 * perform a downcall to the driver for the new mode.  Also
3278 		 * hold an IO refcnt on the interface to prevent it from
3279 		 * being detached (will be released below.)
3280 		 */
3281 		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3282 			struct ifnet_model_params p = {
3283 				.model = mode, .reserved = { 0 }
3284 			};
3285 			errno_t err;
3286 
3287 			if (dlil_verbose) {
3288 				DLIL_PRINTF("%s: polling is now %s, "
3289 				    "pkts avg %d max %d limits [%d/%d], "
3290 				    "wreq avg %d limits [%d/%d], "
3291 				    "bytes avg %d limits [%d/%d]\n",
3292 				    if_name(ifp),
3293 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3294 				    "ON" : "OFF", ifp->if_rxpoll_pavg,
3295 				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3296 				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3297 				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3298 				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3299 				    ifp->if_rxpoll_bhiwat);
3300 			}
3301 
3302 			if ((err = ((*ifp->if_input_ctl)(ifp,
3303 			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3304 				DLIL_PRINTF("%s: error setting polling mode "
3305 				    "to %s (%d)\n", if_name(ifp),
3306 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3307 				    "ON" : "OFF", err);
3308 			}
3309 
3310 			switch (mode) {
3311 			case IFNET_MODEL_INPUT_POLL_OFF:
3312 				ifnet_set_poll_cycle(ifp, NULL);
3313 				ifp->if_rxpoll_offreq++;
3314 				if (err != 0) {
3315 					ifp->if_rxpoll_offerr++;
3316 				}
3317 				break;
3318 
3319 			case IFNET_MODEL_INPUT_POLL_ON:
3320 				net_nsectimer(&ival, &ts);
3321 				ifnet_set_poll_cycle(ifp, &ts);
3322 				ifnet_poll(ifp);
3323 				ifp->if_rxpoll_onreq++;
3324 				if (err != 0) {
3325 					ifp->if_rxpoll_onerr++;
3326 				}
3327 				break;
3328 
3329 			default:
3330 				VERIFY(0);
3331 				/* NOTREACHED */
3332 			}
3333 
3334 			/* Release the IO refcnt */
3335 			ifnet_decr_iorefcnt(ifp);
3336 		}
3337 
3338 		/*
3339 		 * NOTE: we should consider adding thread-starvation
3340 		 * safeguards here for the case where we are handed
3341 		 * long chains of packets.
3342 		 */
3343 		if (__probable(m != NULL)) {
3344 			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
3345 		}
3346 
3347 		lck_mtx_lock_spin(&inp->dlth_lock);
3348 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3349 		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3350 		    DLIL_INPUT_TERMINATE))) {
3351 			break;
3352 		}
3353 	}
3354 
3355 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3356 
3357 	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3358 terminate:
3359 		lck_mtx_unlock(&inp->dlth_lock);
3360 		dlil_terminate_input_thread(inp);
3361 		/* NOTREACHED */
3362 	} else {
3363 		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3364 		lck_mtx_unlock(&inp->dlth_lock);
3365 		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3366 		    inp);
3367 		/* NOTREACHED */
3368 	}
3369 
3370 	VERIFY(0);      /* we should never get here */
3371 	/* NOTREACHED */
3372 	__builtin_unreachable();
3373 }
3374 
3375 errno_t
3376 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3377 {
3378 	if (p != NULL) {
3379 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3380 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3381 			return EINVAL;
3382 		}
3383 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3384 		    p->packets_lowat >= p->packets_hiwat) {
3385 			return EINVAL;
3386 		}
3387 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3388 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3389 			return EINVAL;
3390 		}
3391 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3392 		    p->bytes_lowat >= p->bytes_hiwat) {
3393 			return EINVAL;
3394 		}
3395 		if (p->interval_time != 0 &&
3396 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3397 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3398 		}
3399 	}
3400 	return 0;
3401 }
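/*
 * Illustrative only (hypothetical values): a driver supplying its own
 * thresholds must pass matched low/high watermark pairs, e.g.
 *
 *	struct ifnet_poll_params p = {
 *		.packets_lowat = 32,     .packets_hiwat = 256,
 *		.bytes_lowat   = 65536,  .bytes_hiwat   = 1048576,
 *		.interval_time = 0,      // 0 leaves the interval auto-tuned
 *	};
 *
 * would pass the validation above, since each lowat is non-zero and strictly
 * less than the corresponding hiwat.
 */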
3402 
3403 void
3404 dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3405 {
3406 	u_int64_t sample_holdtime, inbw;
3407 
3408 	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
3409 		sample_holdtime = 0;    /* polling is disabled */
3410 		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
3411 		    ifp->if_rxpoll_blowat = 0;
3412 		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
3413 		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
3414 		ifp->if_rxpoll_plim = 0;
3415 		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
3416 	} else {
3417 		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
3418 		u_int64_t ival;
3419 		unsigned int n, i;
3420 
3421 		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
3422 			if (inbw < rxpoll_tbl[i].speed) {
3423 				break;
3424 			}
3425 			n = i;
3426 		}
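		/*
		 * At this point n indexes the fastest rxpoll_tbl entry whose
		 * speed does not exceed the measured input link rate
		 * (assuming the table is ordered by ascending speed); its
		 * thresholds become the auto-tuned defaults below.
		 */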
3427 		/* auto-tune if caller didn't specify a value */
3428 		plowat = ((p == NULL || p->packets_lowat == 0) ?
3429 		    rxpoll_tbl[n].plowat : p->packets_lowat);
3430 		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
3431 		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
3432 		blowat = ((p == NULL || p->bytes_lowat == 0) ?
3433 		    rxpoll_tbl[n].blowat : p->bytes_lowat);
3434 		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
3435 		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
3436 		plim = ((p == NULL || p->packets_limit == 0 ||
3437 		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
3438 		ival = ((p == NULL || p->interval_time == 0 ||
3439 		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
3440 		    if_rxpoll_interval_time : p->interval_time);
3441 
3442 		VERIFY(plowat != 0 && phiwat != 0);
3443 		VERIFY(blowat != 0 && bhiwat != 0);
3444 		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);
3445 
3446 		sample_holdtime = if_rxpoll_sample_holdtime;
3447 		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
3448 		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
3449 		ifp->if_rxpoll_plowat = plowat;
3450 		ifp->if_rxpoll_phiwat = phiwat;
3451 		ifp->if_rxpoll_blowat = blowat;
3452 		ifp->if_rxpoll_bhiwat = bhiwat;
3453 		ifp->if_rxpoll_plim = plim;
3454 		ifp->if_rxpoll_ival = ival;
3455 	}
3456 
3457 	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
3458 	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);
3459 
3460 	if (dlil_verbose) {
3461 		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
3462 		    "poll interval %llu nsec, pkts per poll %u, "
3463 		    "pkt limits [%u/%u], wreq limits [%u/%u], "
3464 		    "bytes limits [%u/%u]\n", if_name(ifp),
3465 		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
3466 		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
3467 		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
3468 		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
3469 		    ifp->if_rxpoll_bhiwat);
3470 	}
3471 }
3472 
3473 /*
3474  * Must be called on an attached ifnet (caller is expected to check.)
3475  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3476  */
3477 errno_t
3478 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3479     boolean_t locked)
3480 {
3481 	errno_t err;
3482 	struct dlil_threading_info *inp;
3483 
3484 	VERIFY(ifp != NULL);
3485 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3486 		return ENXIO;
3487 	}
3488 	err = dlil_rxpoll_validate_params(p);
3489 	if (err != 0) {
3490 		return err;
3491 	}
3492 
3493 	if (!locked) {
3494 		lck_mtx_lock(&inp->dlth_lock);
3495 	}
3496 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3497 	/*
3498 	 * Normally, we'd reset the parameters to the auto-tuned values
3499 	 * if the input thread detects a change in link rate.  If the
3500 	 * driver provides its own parameters right after the link rate
3501 	 * changes, but before the input thread gets to run, we want to
3502 	 * make sure to keep the driver's values.  Clearing if_poll_update
3503 	 * will achieve that.
3504 	 */
3505 	if (p != NULL && !locked && ifp->if_poll_update != 0) {
3506 		ifp->if_poll_update = 0;
3507 	}
3508 	dlil_rxpoll_update_params(ifp, p);
3509 	if (!locked) {
3510 		lck_mtx_unlock(&inp->dlth_lock);
3511 	}
3512 	return 0;
3513 }
3514 
3515 /*
3516  * Must be called on an attached ifnet (caller is expected to check.)
3517  */
3518 errno_t
3519 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3520 {
3521 	struct dlil_threading_info *inp;
3522 
3523 	VERIFY(ifp != NULL && p != NULL);
3524 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3525 		return ENXIO;
3526 	}
3527 
3528 	bzero(p, sizeof(*p));
3529 
3530 	lck_mtx_lock(&inp->dlth_lock);
3531 	p->packets_limit = ifp->if_rxpoll_plim;
3532 	p->packets_lowat = ifp->if_rxpoll_plowat;
3533 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3534 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3535 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3536 	p->interval_time = ifp->if_rxpoll_ival;
3537 	lck_mtx_unlock(&inp->dlth_lock);
3538 
3539 	return 0;
3540 }
3541 
3542 errno_t
3543 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3544     const struct ifnet_stat_increment_param *s)
3545 {
3546 	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3547 }
3548 
3549 errno_t
3550 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3551     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3552 {
3553 	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3554 }
3555 
3556 errno_t
3557 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3558     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3559 {
3560 	return ifnet_input_common(ifp, m_head, m_tail, s,
3561 	           (m_head != NULL), TRUE);
3562 }
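/*
 * Sketch (hypothetical driver code) of the extended input path: a receive
 * completion handler hands DLIL a chain of packets along with the stat
 * increments that ifnet_input_common() below checks against the chain:
 *
 *	struct ifnet_stat_increment_param s = { 0 };
 *	s.packets_in = pkt_count;	// must equal the chain length
 *	s.bytes_in = byte_count;	// approximate; see the comment in
 *					// ifnet_input_common() below
 *	(void) ifnet_input_extended(ifp, m_head, m_tail, &s);
 *
 * ifnet_input() is the simpler form for drivers that track neither the
 * tail pointer nor the counts; DLIL then walks the chain itself.
 */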
3563 
3564 static errno_t
3565 ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
3566     const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
3567 {
3568 	dlil_input_func input_func;
3569 	struct ifnet_stat_increment_param _s;
3570 	u_int32_t m_cnt = 0, m_size = 0;
3571 	struct mbuf *last;
3572 	errno_t err = 0;
3573 
3574 	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
3575 		if (m_head != NULL) {
3576 			mbuf_freem_list(m_head);
3577 		}
3578 		return EINVAL;
3579 	}
3580 
3581 	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
3582 	VERIFY(m_tail == NULL || ext);
3583 	VERIFY(s != NULL || !ext);
3584 
3585 	/*
3586 	 * Drop the packet(s) if the parameters are invalid, or if the
3587 	 * interface is no longer attached; else hold an IO refcnt to
3588 	 * prevent it from being detached (will be released below.)
3589 	 */
3590 	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
3591 		if (m_head != NULL) {
3592 			mbuf_freem_list(m_head);
3593 		}
3594 		return EINVAL;
3595 	}
3596 
3597 	input_func = ifp->if_input_dlil;
3598 	VERIFY(input_func != NULL);
3599 
3600 	if (m_tail == NULL) {
3601 		last = m_head;
3602 		while (m_head != NULL) {
3603 			m_add_hdr_crumb_interface_input(last, ifp->if_index, false);
3604 #if IFNET_INPUT_SANITY_CHK
3605 			if (__improbable(dlil_input_sanity_check != 0)) {
3606 				DLIL_INPUT_CHECK(last, ifp);
3607 			}
3608 #endif /* IFNET_INPUT_SANITY_CHK */
3609 			m_cnt++;
3610 			m_size += m_length(last);
3611 			if (mbuf_nextpkt(last) == NULL) {
3612 				break;
3613 			}
3614 			last = mbuf_nextpkt(last);
3615 		}
3616 		m_tail = last;
3617 	} else {
3618 #if IFNET_INPUT_SANITY_CHK
3619 		if (__improbable(dlil_input_sanity_check != 0)) {
3620 			last = m_head;
3621 			while (1) {
3622 				m_add_hdr_crumb_interface_input(last, ifp->if_index, false);
3623 				DLIL_INPUT_CHECK(last, ifp);
3624 				m_cnt++;
3625 				m_size += m_length(last);
3626 				if (mbuf_nextpkt(last) == NULL) {
3627 					break;
3628 				}
3629 				last = mbuf_nextpkt(last);
3630 			}
3631 		} else {
3632 			m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
3633 			m_cnt = s->packets_in;
3634 			m_size = s->bytes_in;
3635 			last = m_tail;
3636 		}
3637 #else
3638 		m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
3639 		m_cnt = s->packets_in;
3640 		m_size = s->bytes_in;
3641 		last = m_tail;
3642 #endif /* IFNET_INPUT_SANITY_CHK */
3643 	}
3644 
3645 	if (last != m_tail) {
3646 		panic_plain("%s: invalid input packet chain for %s, "
3647 		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
3648 		    m_tail, last);
3649 	}
3650 
3651 	/*
3652 	 * Assert packet count only for the extended variant, for backwards
3653 	 * compatibility, since this came directly from the device driver.
3654 	 * Relax this assertion for input bytes, as the driver may have
3655 	 * included the link-layer headers in the computation; hence
3656 	 * m_size is just an approximation.
3657 	 */
3658 	if (ext && s->packets_in != m_cnt) {
3659 		panic_plain("%s: input packet count mismatch for %s, "
3660 		    "%d instead of %d\n", __func__, if_name(ifp),
3661 		    s->packets_in, m_cnt);
3662 	}
3663 
3664 	if (s == NULL) {
3665 		bzero(&_s, sizeof(_s));
3666 		s = &_s;
3667 	} else {
3668 		_s = *s;
3669 	}
3670 	_s.packets_in = m_cnt;
3671 	_s.bytes_in = m_size;
3672 
3673 	if (ifp->if_xflags & IFXF_DISABLE_INPUT) {
3674 		m_freem_list(m_head);
3675 
3676 		os_atomic_add(&ifp->if_data.ifi_ipackets, _s.packets_in, relaxed);
3677 		os_atomic_add(&ifp->if_data.ifi_ibytes, _s.bytes_in, relaxed);
3678 
3679 		goto done;
3680 	}
3681 
3682 	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());
3683 
3684 done:
3685 	if (ifp != lo_ifp) {
3686 		/* Release the IO refcnt */
3687 		ifnet_datamov_end(ifp);
3688 	}
3689 
3690 	return err;
3691 }
3692 
3693 #if SKYWALK
3694 errno_t
3695 dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
3696 {
3697 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
3698 	           ptrauth_nop_cast(void *, &dlil_input_handler),
3699 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
3700 }
3701 
3702 void
3703 dlil_reset_input_handler(struct ifnet *ifp)
3704 {
3705 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
3706 	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
3707 	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
3708 		;
3709 	}
3710 }
3711 
3712 errno_t
3713 dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
3714 {
3715 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
3716 	           ptrauth_nop_cast(void *, &dlil_output_handler),
3717 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
3718 }
3719 
3720 void
3721 dlil_reset_output_handler(struct ifnet *ifp)
3722 {
3723 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
3724 	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
3725 	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
3726 		;
3727 	}
3728 }
3729 #endif /* SKYWALK */
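/*
 * The interposition helpers above rely on compare-and-swap so that only one
 * subsystem can hook an interface at a time: dlil_set_*_handler() succeeds
 * only while the default dlil_*_handler is still installed (EBUSY otherwise),
 * and dlil_reset_*_handler() loops until whichever handler is currently
 * installed has been swapped back to the default.
 */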
3730 
3731 errno_t
3732 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3733 {
3734 	return ifp->if_output(ifp, m);
3735 }
3736 
3737 errno_t
3738 dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
3739     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
3740     boolean_t poll, struct thread *tp)
3741 {
3742 	struct dlil_threading_info *inp = ifp->if_inp;
3743 
3744 	if (__improbable(inp == NULL)) {
3745 		inp = dlil_main_input_thread;
3746 	}
3747 
3748 #if (DEVELOPMENT || DEBUG)
3749 	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
3750 		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
3751 	} else
3752 #endif /* (DEVELOPMENT || DEBUG) */
3753 	{
3754 		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
3755 	}
3756 }
3757 
3758 /*
3759  * Detect whether a queue contains a burst that needs to be trimmed.
3760  */
3761 #define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
3762 	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
3763 	                        qtype(q) == QP_MBUF)
3764 
3765 #define MAX_KNOWN_MBUF_CLASS 8
3766 
3767 static uint32_t
3768 dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
3769     dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
3770 {
3771 	uint32_t overcommitted_qlen;    /* Length in packets. */
3772 	uint64_t overcommitted_qsize;   /* Size in bytes. */
3773 	uint32_t target_qlen;           /* The desired queue length after trimming. */
3774 	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
3775 	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
3776 	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
3777 	struct mbuf *m = NULL, *m_tmp = NULL;
3778 
3779 	overcommitted_qlen = qlen(input_queue);
3780 	overcommitted_qsize = qsize(input_queue);
3781 	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;
3782 
3783 	if (overcommitted_qlen <= target_qlen) {
3784 		/*
3785 		 * The queue is already within the target limits.
3786 		 */
3787 		dropped_pkts = 0;
3788 		goto out;
3789 	}
3790 
3791 	pkts_to_drop = overcommitted_qlen - target_qlen;
3792 
3793 	/*
3794 	 * Proceed to removing packets from the head of the queue,
3795 	 * starting from the oldest, until the desired number of packets
3796 	 * has been dropped.
3797 	 */
3798 	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
3799 		if (pkts_to_drop <= dropped_pkts) {
3800 			break;
3801 		}
3802 		MBUFQ_REMOVE(&qmbufq(input_queue), m);
3803 		MBUFQ_NEXT(m) = NULL;
3804 		MBUFQ_ENQUEUE(freeq, m);
3805 
3806 		dropped_pkts += 1;
3807 		dropped_bytes += m_length(m);
3808 	}
3809 
3810 	/*
3811 	 * Adjust the length and the estimated size of the queue
3812 	 * after trimming.
3813 	 */
3814 	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
3815 	qlen(input_queue) = target_qlen;
3816 
3817 	/* qsize() is an approximation. */
3818 	if (dropped_bytes < qsize(input_queue)) {
3819 		qsize(input_queue) -= dropped_bytes;
3820 	} else {
3821 		qsize(input_queue) = 0;
3822 	}
3823 
3824 	/*
3825 	 * Adjust the ifnet statistics increments, if needed.
3826 	 */
3827 	stat_delta->dropped += dropped_pkts;
3828 	if (dropped_pkts < stat_delta->packets_in) {
3829 		stat_delta->packets_in -= dropped_pkts;
3830 	} else {
3831 		stat_delta->packets_in = 0;
3832 	}
3833 	if (dropped_bytes < stat_delta->bytes_in) {
3834 		stat_delta->bytes_in -= dropped_bytes;
3835 	} else {
3836 		stat_delta->bytes_in = 0;
3837 	}
3838 
3839 out:
3840 	if (dlil_verbose) {
3841 		/*
3842 		 * The basic information about the drop is logged
3843 		 * by the invoking function (dlil_input_{,a}sync).
3844 		 * If `dlil_verbose' flag is set, provide more information
3845 		 * that can be useful for debugging.
3846 		 */
3847 		DLIL_PRINTF("%s: "
3848 		    "qlen: %u -> %u, "
3849 		    "qsize: %llu -> %llu "
3850 		    "qlimit: %u (sysctl: %u) "
3851 		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
3852 		    "dropped_pkts: %u dropped_bytes %u\n",
3853 		    __func__,
3854 		    overcommitted_qlen, qlen(input_queue),
3855 		    overcommitted_qsize, qsize(input_queue),
3856 		    qlimit(input_queue), if_rcvq_burst_limit,
3857 		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
3858 		    dropped_pkts, dropped_bytes);
3859 	}
3860 
3861 	return dropped_pkts;
3862 }
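/*
 * Worked example (hypothetical values): with qlimit(input_queue) == 4096 and
 * if_rcvq_trim_pct == 80, target_qlen is (4096 * 80) / 100 == 3276.  If a
 * burst leaves 5000 packets queued, the loop above moves the 1724 oldest
 * packets onto `freeq' (freed by the caller once dlth_lock is dropped) and
 * the queue length, queue size and stat increments are adjusted to match.
 */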
3863 
3864 static errno_t
3865 dlil_input_async(struct dlil_threading_info *inp,
3866     struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
3867     const struct ifnet_stat_increment_param *s, boolean_t poll,
3868     struct thread *tp)
3869 {
3870 	u_int32_t m_cnt = s->packets_in;
3871 	u_int32_t m_size = s->bytes_in;
3872 	boolean_t notify = FALSE;
3873 	struct ifnet_stat_increment_param s_adj = *s;
3874 	dlil_freeq_t freeq;
3875 	MBUFQ_INIT(&freeq);
3876 
3877 	/*
3878 	 * If there is a matching DLIL input thread associated with an
3879 	 * affinity set, associate this thread with the same set.  We
3880 	 * will only do this once.
3881 	 */
3882 	lck_mtx_lock_spin(&inp->dlth_lock);
3883 	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
3884 	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
3885 	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
3886 		u_int32_t tag = inp->dlth_affinity_tag;
3887 
3888 		if (poll) {
3889 			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
3890 			inp->dlth_poller_thread = tp;
3891 		} else {
3892 			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
3893 			inp->dlth_driver_thread = tp;
3894 		}
3895 		lck_mtx_unlock(&inp->dlth_lock);
3896 
3897 		/* Associate the current thread with the new affinity tag */
3898 		(void) dlil_affinity_set(tp, tag);
3899 
3900 		/*
3901 		 * Take a reference on the current thread; during detach,
3902 		 * we will need to refer to it in order to tear down its
3903 		 * affinity.
3904 		 */
3905 		thread_reference(tp);
3906 		lck_mtx_lock_spin(&inp->dlth_lock);
3907 	}
3908 
3909 	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));
3910 
3911 	/*
3912 	 * Because of loopbacked multicast we cannot stuff the ifp in
3913 	 * the rcvif of the packet header: loopback (lo0) packets use a
3914 	 * dedicated list so that we can later associate them with lo_ifp
3915 	 * on their way up the stack.  Packets for other interfaces without
3916 	 * dedicated input threads go to the regular list.
3917 	 */
3918 	if (m_head != NULL) {
3919 		classq_pkt_t head, tail;
3920 		class_queue_t *input_queue;
3921 		CLASSQ_PKT_INIT_MBUF(&head, m_head);
3922 		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
3923 		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
3924 			struct dlil_main_threading_info *inpm =
3925 			    (struct dlil_main_threading_info *)inp;
3926 			input_queue = &inpm->lo_rcvq_pkts;
3927 		} else {
3928 			input_queue = &inp->dlth_pkts;
3929 		}
3930 
3931 		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);
3932 
3933 		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
3934 			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
3935 			inp->dlth_trim_pkts_dropped += s_adj.dropped;
3936 			inp->dlth_trim_cnt += 1;
3937 
3938 			os_log_error(OS_LOG_DEFAULT,
3939 			    "%s %s burst limit %u (sysctl: %u) exceeded. "
3940 			    "%u packets dropped [%u total in %u events]. new qlen %u ",
3941 			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
3942 			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
3943 			    qlen(input_queue));
3944 		}
3945 	}
3946 
3947 #if IFNET_INPUT_SANITY_CHK
3948 	/*
3949 	 * Verify that the original stat increment parameter
3950 	 * accurately describes the input chain `m_head`.
3951 	 * This is not affected by the trimming of the input queue.
3952 	 */
3953 	if (__improbable(dlil_input_sanity_check != 0)) {
3954 		u_int32_t count = 0, size = 0;
3955 		struct mbuf *m0;
3956 
3957 		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
3958 			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
3959 			size += m_length(m0);
3960 			count++;
3961 		}
3962 
3963 		if (count != m_cnt) {
3964 			panic_plain("%s: invalid total packet count %u "
3965 			    "(expected %u)\n", if_name(ifp), count, m_cnt);
3966 			/* NOTREACHED */
3967 			__builtin_unreachable();
3968 		} else if (size != m_size) {
3969 			panic_plain("%s: invalid total packet size %u "
3970 			    "(expected %u)\n", if_name(ifp), size, m_size);
3971 			/* NOTREACHED */
3972 			__builtin_unreachable();
3973 		}
3974 
3975 		inp->dlth_pkts_cnt += m_cnt;
3976 	}
3977 #else
3978 	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
3979 #endif /* IFNET_INPUT_SANITY_CHK */
3980 
3981 	/* NOTE: use the adjusted parameter, vs the original one */
3982 	dlil_input_stats_add(&s_adj, inp, ifp, poll);
3983 	/*
3984 	 * If we're using the main input thread, synchronize the
3985 	 * stats now since we have the interface context.  All
3986 	 * other cases involving dedicated input threads will
3987 	 * have their stats synchronized there.
3988 	 */
3989 	if (inp == dlil_main_input_thread) {
3990 		notify = dlil_input_stats_sync(ifp, inp);
3991 	}
3992 
3993 	dlil_input_wakeup(inp);
3994 	lck_mtx_unlock(&inp->dlth_lock);
3995 
3996 	/*
3997 	 * Actual freeing of the excess packets must happen
3998 	 * after the dlth_lock has been released.
3999 	 */
4000 	if (!MBUFQ_EMPTY(&freeq)) {
4001 		m_freem_list(MBUFQ_FIRST(&freeq));
4002 	}
4003 
4004 	if (notify) {
4005 		ifnet_notify_data_threshold(ifp);
4006 	}
4007 
4008 	return 0;
4009 }
4010 
4011 static errno_t
4012 dlil_input_sync(struct dlil_threading_info *inp,
4013     struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
4014     const struct ifnet_stat_increment_param *s, boolean_t poll,
4015     struct thread *tp)
4016 {
4017 #pragma unused(tp)
4018 	u_int32_t m_cnt = s->packets_in;
4019 	u_int32_t m_size = s->bytes_in;
4020 	boolean_t notify = FALSE;
4021 	classq_pkt_t head, tail;
4022 	struct ifnet_stat_increment_param s_adj = *s;
4023 	dlil_freeq_t freeq;
4024 	MBUFQ_INIT(&freeq);
4025 
4026 	ASSERT(inp != dlil_main_input_thread);
4027 
4028 	/* XXX: should we just assert instead? */
4029 	if (__improbable(m_head == NULL)) {
4030 		return 0;
4031 	}
4032 
4033 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
4034 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
4035 
4036 	lck_mtx_lock_spin(&inp->dlth_lock);
4037 	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);
4038 
4039 	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
4040 		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
4041 		inp->dlth_trim_pkts_dropped += s_adj.dropped;
4042 		inp->dlth_trim_cnt += 1;
4043 
4044 		os_log_error(OS_LOG_DEFAULT,
4045 		    "%s %s burst limit %u (sysctl: %u) exceeded. "
4046 		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
4047 		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
4048 		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
4049 		    qlen(&inp->dlth_pkts));
4050 	}
4051 
4052 #if IFNET_INPUT_SANITY_CHK
4053 	if (__improbable(dlil_input_sanity_check != 0)) {
4054 		u_int32_t count = 0, size = 0;
4055 		struct mbuf *m0;
4056 
4057 		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
4058 			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
4059 			size += m_length(m0);
4060 			count++;
4061 		}
4062 
4063 		if (count != m_cnt) {
4064 			panic_plain("%s: invalid total packet count %u "
4065 			    "(expected %u)\n", if_name(ifp), count, m_cnt);
4066 			/* NOTREACHED */
4067 			__builtin_unreachable();
4068 		} else if (size != m_size) {
4069 			panic_plain("%s: invalid total packet size %u "
4070 			    "(expected %u)\n", if_name(ifp), size, m_size);
4071 			/* NOTREACHED */
4072 			__builtin_unreachable();
4073 		}
4074 
4075 		inp->dlth_pkts_cnt += m_cnt;
4076 	}
4077 #else
4078 	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
4079 #endif /* IFNET_INPUT_SANITY_CHK */
4080 
4081 	/* NOTE: use the adjusted parameter, vs the original one */
4082 	dlil_input_stats_add(&s_adj, inp, ifp, poll);
4083 
4084 	m_cnt = qlen(&inp->dlth_pkts);
4085 	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);
4086 
4087 #if SKYWALK
4088 	/*
4089 	 * If this interface is attached to a netif nexus,
4090 	 * the stats are already incremented there; otherwise
4091 	 * do it here.
4092 	 */
4093 	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
4094 #endif /* SKYWALK */
4095 	notify = dlil_input_stats_sync(ifp, inp);
4096 
4097 	lck_mtx_unlock(&inp->dlth_lock);
4098 
4099 	/*
4100 	 * Actual freeing of the excess packets must happen
4101 	 * after the dlth_lock has been released.
4102 	 */
4103 	if (!MBUFQ_EMPTY(&freeq)) {
4104 		m_freem_list(MBUFQ_FIRST(&freeq));
4105 	}
4106 
4107 	if (notify) {
4108 		ifnet_notify_data_threshold(ifp);
4109 	}
4110 
4111 	/*
4112 	 * NOTE: we should consider adding thread-starvation
4113 	 * safeguards here for the case where we are handed
4114 	 * long chains of packets.
4115 	 */
4116 	if (head.cp_mbuf != NULL) {
4117 		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
4118 		    m_cnt, ifp->if_poll_mode);
4119 	}
4120 
4121 	return 0;
4122 }
4123 
4124 #if SKYWALK
4125 errno_t
4126 ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4127 {
4128 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4129 	           ptrauth_nop_cast(void *, ifp->if_save_output),
4130 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4131 }
4132 
4133 void
4134 ifnet_reset_output_handler(struct ifnet *ifp)
4135 {
4136 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4137 	    ptrauth_nop_cast(void *, ifp->if_output),
4138 	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
4139 		;
4140 	}
4141 }
4142 
4143 errno_t
4144 ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
4145 {
4146 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4147 	           ptrauth_nop_cast(void *, ifp->if_save_start),
4148 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4149 }
4150 
4151 void
4152 ifnet_reset_start_handler(struct ifnet *ifp)
4153 {
4154 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4155 	    ptrauth_nop_cast(void *, ifp->if_start),
4156 	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
4157 		;
4158 	}
4159 }
4160 #endif /* SKYWALK */
4161 
4162 static void
4163 ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
4164 {
4165 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
4166 		return;
4167 	}
4168 	/*
4169 	 * If the starter thread is inactive, signal it to do work,
4170 	 * unless the interface is being flow controlled from below,
4171 	 * e.g. a virtual interface being flow controlled by a real
4172 	 * network interface beneath it, or it's been disabled via
4173 	 * a call to ifnet_disable_output().
4174 	 */
4175 	lck_mtx_lock_spin(&ifp->if_start_lock);
4176 	if (ignore_delay) {
4177 		ifp->if_start_flags |= IFSF_NO_DELAY;
4178 	}
4179 	if (resetfc) {
4180 		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
4181 	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
4182 		lck_mtx_unlock(&ifp->if_start_lock);
4183 		return;
4184 	}
4185 	ifp->if_start_req++;
4186 	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
4187 	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
4188 	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
4189 	    ifp->if_start_delayed == 0)) {
4190 		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
4191 	}
4192 	lck_mtx_unlock(&ifp->if_start_lock);
4193 }
4194 
4195 void
4196 ifnet_start(struct ifnet *ifp)
4197 {
4198 	ifnet_start_common(ifp, FALSE, FALSE);
4199 }
4200 
4201 void
4202 ifnet_start_ignore_delay(struct ifnet *ifp)
4203 {
4204 	ifnet_start_common(ifp, FALSE, TRUE);
4205 }
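/*
 * ifnet_start() is the normal "kick the starter thread" entry point once
 * packets sit on if_snd; ifnet_start_ignore_delay() additionally sets
 * IFSF_NO_DELAY so that the IFEF_ENQUEUE_MULTI start-delay logic in
 * ifnet_start_thread_cont() below is bypassed for this request.
 */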
4206 
4207 __attribute__((noreturn))
4208 static void
4209 ifnet_start_thread_func(void *v, wait_result_t w)
4210 {
4211 #pragma unused(w)
4212 	struct ifnet *ifp = v;
4213 	char thread_name[MAXTHREADNAMESIZE];
4214 
4215 	/* Construct the name for this thread, and then apply it. */
4216 	bzero(thread_name, sizeof(thread_name));
4217 	(void) snprintf(thread_name, sizeof(thread_name),
4218 	    "ifnet_start_%s", ifp->if_xname);
4219 #if SKYWALK
4220 	/* override name for native Skywalk interface */
4221 	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
4222 		(void) snprintf(thread_name, sizeof(thread_name),
4223 		    "skywalk_doorbell_%s_tx", ifp->if_xname);
4224 	}
4225 #endif /* SKYWALK */
4226 	ASSERT(ifp->if_start_thread == current_thread());
4227 	thread_set_thread_name(current_thread(), thread_name);
4228 
4229 	/*
4230 	 * Treat the dedicated starter thread for lo0 as equivalent to
4231 	 * the driver workloop thread; if net_affinity is enabled for
4232 	 * the main input thread, associate this starter thread to it
4233 	 * by binding them with the same affinity tag.  This is done
4234 	 * only once (as we only have one lo_ifp which never goes away.)
4235 	 */
4236 	if (ifp == lo_ifp) {
4237 		struct dlil_threading_info *inp = dlil_main_input_thread;
4238 		struct thread *tp = current_thread();
4239 #if SKYWALK
4240 		/* native skywalk loopback not yet implemented */
4241 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
4242 #endif /* SKYWALK */
4243 
4244 		lck_mtx_lock(&inp->dlth_lock);
4245 		if (inp->dlth_affinity) {
4246 			u_int32_t tag = inp->dlth_affinity_tag;
4247 
4248 			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
4249 			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
4250 			inp->dlth_driver_thread = tp;
4251 			lck_mtx_unlock(&inp->dlth_lock);
4252 
4253 			/* Associate this thread with the affinity tag */
4254 			(void) dlil_affinity_set(tp, tag);
4255 		} else {
4256 			lck_mtx_unlock(&inp->dlth_lock);
4257 		}
4258 	}
4259 
4260 	lck_mtx_lock(&ifp->if_start_lock);
4261 	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
4262 	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
4263 	ifp->if_start_embryonic = 1;
4264 	/* wake up once to get out of embryonic state */
4265 	ifp->if_start_req++;
4266 	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
4267 	lck_mtx_unlock(&ifp->if_start_lock);
4268 	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
4269 	/* NOTREACHED */
4270 	__builtin_unreachable();
4271 }
4272 
4273 __attribute__((noreturn))
4274 static void
4275 ifnet_start_thread_cont(void *v, wait_result_t wres)
4276 {
4277 	struct ifnet *ifp = v;
4278 	struct ifclassq *ifq = ifp->if_snd;
4279 
4280 	lck_mtx_lock_spin(&ifp->if_start_lock);
4281 	if (__improbable(wres == THREAD_INTERRUPTED ||
4282 	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
4283 		goto terminate;
4284 	}
4285 
4286 	if (__improbable(ifp->if_start_embryonic)) {
4287 		ifp->if_start_embryonic = 0;
4288 		lck_mtx_unlock(&ifp->if_start_lock);
4289 		ifnet_decr_pending_thread_count(ifp);
4290 		lck_mtx_lock_spin(&ifp->if_start_lock);
4291 		goto skip;
4292 	}
4293 
4294 	ifp->if_start_active = 1;
4295 
4296 	/*
4297 	 * Keep on servicing until no more request.
4298 	 * Keep on servicing until there are no more requests.
4299 	for (;;) {
4300 		u_int32_t req = ifp->if_start_req;
4301 		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
4302 		    !IFCQ_IS_EMPTY(ifq) &&
4303 		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
4304 		    ifp->if_start_delayed == 0 &&
4305 		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
4306 		    (ifp->if_eflags & IFEF_DELAY_START)) {
4307 			ifp->if_start_delayed = 1;
4308 			ifnet_start_delayed++;
4309 			break;
4310 		}
4311 		ifp->if_start_flags &= ~IFSF_NO_DELAY;
4312 		ifp->if_start_delayed = 0;
4313 		lck_mtx_unlock(&ifp->if_start_lock);
4314 
4315 		/*
4316 		 * If no longer attached, don't call start because ifp
4317 		 * is being destroyed; else hold an IO refcnt to
4318 		 * prevent the interface from being detached (will be
4319 		 * released below.)
4320 		 */
4321 		if (!ifnet_datamov_begin(ifp)) {
4322 			lck_mtx_lock_spin(&ifp->if_start_lock);
4323 			break;
4324 		}
4325 
4326 		/* invoke the driver's start routine */
4327 		((*ifp->if_start)(ifp));
4328 
4329 		/*
4330 		 * Release the io ref count taken above.
4331 		 */
4332 		ifnet_datamov_end(ifp);
4333 
4334 		lck_mtx_lock_spin(&ifp->if_start_lock);
4335 
4336 		/*
4337 		 * If there's no pending request or if the
4338 		 * interface has been disabled, we're done.
4339 		 */
4340 #define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
4341 		if (req == ifp->if_start_req ||
4342 		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
4343 			break;
4344 		}
4345 	}
4346 skip:
4347 	ifp->if_start_req = 0;
4348 	ifp->if_start_active = 0;
4349 
4350 #if SKYWALK
4351 	/*
4352 	 * Wakeup any waiters, e.g. any threads waiting to
4353 	 * detach the interface from the flowswitch, etc.
4354 	 */
4355 	if (ifp->if_start_waiters != 0) {
4356 		ifp->if_start_waiters = 0;
4357 		wakeup(&ifp->if_start_waiters);
4358 	}
4359 #endif /* SKYWALK */
4360 	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
4361 		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
4362 		struct timespec delay_start_ts;
4363 		struct timespec *ts = NULL;
4364 
4365 		if (ts == NULL) {
4366 			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
4367 			    &ifp->if_start_cycle : NULL);
4368 		}
4369 
4370 		if (ts == NULL && ifp->if_start_delayed == 1) {
4371 			delay_start_ts.tv_sec = 0;
4372 			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
4373 			ts = &delay_start_ts;
4374 		}
4375 
4376 		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
4377 			ts = NULL;
4378 		}
4379 
4380 		if (__improbable(ts != NULL)) {
4381 			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
4382 			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
4383 		}
4384 
4385 		(void) assert_wait_deadline(&ifp->if_start_thread,
4386 		    THREAD_UNINT, deadline);
4387 		lck_mtx_unlock(&ifp->if_start_lock);
4388 		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
4389 		/* NOTREACHED */
4390 	} else {
4391 terminate:
4392 		/* interface is detached? */
4393 		ifnet_set_start_cycle(ifp, NULL);
4394 
4395 		/* clear if_start_thread to allow termination to continue */
4396 		ASSERT(ifp->if_start_thread != THREAD_NULL);
4397 		ifp->if_start_thread = THREAD_NULL;
4398 		wakeup((caddr_t)&ifp->if_start_thread);
4399 		lck_mtx_unlock(&ifp->if_start_lock);
4400 
4401 		if (dlil_verbose) {
4402 			DLIL_PRINTF("%s: starter thread terminated\n",
4403 			    if_name(ifp));
4404 		}
4405 
4406 		/* for the extra refcnt from kernel_thread_start() */
4407 		thread_deallocate(current_thread());
4408 		/* this is the end */
4409 		thread_terminate(current_thread());
4410 		/* NOTREACHED */
4411 	}
4412 
4413 	/* must never get here */
4414 	VERIFY(0);
4415 	/* NOTREACHED */
4416 	__builtin_unreachable();
4417 }
4418 
4419 void
4420 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4421 {
4422 	if (ts == NULL) {
4423 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4424 	} else {
4425 		*(&ifp->if_start_cycle) = *ts;
4426 	}
4427 
4428 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4429 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4430 		    if_name(ifp), ts->tv_nsec);
4431 	}
4432 }
4433 
4434 static inline void
4435 ifnet_poll_wakeup(struct ifnet *ifp)
4436 {
4437 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4438 
4439 	ifp->if_poll_req++;
4440 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4441 	    ifp->if_poll_thread != THREAD_NULL) {
4442 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4443 	}
4444 }
4445 
4446 void
4447 ifnet_poll(struct ifnet *ifp)
4448 {
4449 	/*
4450 	 * If the poller thread is inactive, signal it to do work.
4451 	 */
4452 	lck_mtx_lock_spin(&ifp->if_poll_lock);
4453 	ifnet_poll_wakeup(ifp);
4454 	lck_mtx_unlock(&ifp->if_poll_lock);
4455 }
4456 
4457 __attribute__((noreturn))
4458 static void
4459 ifnet_poll_thread_func(void *v, wait_result_t w)
4460 {
4461 #pragma unused(w)
4462 	char thread_name[MAXTHREADNAMESIZE];
4463 	struct ifnet *ifp = v;
4464 
4465 	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
4466 	VERIFY(current_thread() == ifp->if_poll_thread);
4467 
4468 	/* construct the name for this thread, and then apply it */
4469 	bzero(thread_name, sizeof(thread_name));
4470 	(void) snprintf(thread_name, sizeof(thread_name),
4471 	    "ifnet_poller_%s", ifp->if_xname);
4472 	thread_set_thread_name(ifp->if_poll_thread, thread_name);
4473 
4474 	lck_mtx_lock(&ifp->if_poll_lock);
4475 	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
4476 	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
4477 	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
4478 	/* wake up once to get out of embryonic state */
4479 	ifnet_poll_wakeup(ifp);
4480 	lck_mtx_unlock(&ifp->if_poll_lock);
4481 	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
4482 	/* NOTREACHED */
4483 	__builtin_unreachable();
4484 }
4485 
4486 __attribute__((noreturn))
4487 static void
4488 ifnet_poll_thread_cont(void *v, wait_result_t wres)
4489 {
4490 	struct dlil_threading_info *inp;
4491 	struct ifnet *ifp = v;
4492 	struct ifnet_stat_increment_param s;
4493 	struct timespec start_time;
4494 
4495 	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
4496 
4497 	bzero(&s, sizeof(s));
4498 	net_timerclear(&start_time);
4499 
4500 	lck_mtx_lock_spin(&ifp->if_poll_lock);
4501 	if (__improbable(wres == THREAD_INTERRUPTED ||
4502 	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
4503 		goto terminate;
4504 	}
4505 
4506 	inp = ifp->if_inp;
4507 	VERIFY(inp != NULL);
4508 
4509 	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
4510 		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
4511 		lck_mtx_unlock(&ifp->if_poll_lock);
4512 		ifnet_decr_pending_thread_count(ifp);
4513 		lck_mtx_lock_spin(&ifp->if_poll_lock);
4514 		goto skip;
4515 	}
4516 
4517 	ifp->if_poll_flags |= IF_POLLF_RUNNING;
4518 
4519 	/*
4520 	 * Keep on servicing until there are no more requests.
4521 	 */
4522 	for (;;) {
4523 		struct mbuf *m_head, *m_tail;
4524 		u_int32_t m_lim, m_cnt, m_totlen;
4525 		u_int16_t req = ifp->if_poll_req;
4526 
4527 		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
4528 		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
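		/*
		 * m_lim caps how many packets a single poll may pull from the
		 * driver: the configured per-poll limit when one is set,
		 * otherwise the larger of the input queue limit and four
		 * times the packet high watermark.
		 */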
4529 		lck_mtx_unlock(&ifp->if_poll_lock);
4530 
4531 		/*
4532 		 * If no longer attached, there's nothing to do;
4533 		 * else hold an IO refcnt to prevent the interface
4534 		 * from being detached (will be released below.)
4535 		 */
4536 		if (!ifnet_is_attached(ifp, 1)) {
4537 			lck_mtx_lock_spin(&ifp->if_poll_lock);
4538 			break;
4539 		}
4540 
4541 		if (dlil_verbose > 1) {
4542 			DLIL_PRINTF("%s: polling up to %d pkts, "
4543 			    "pkts avg %d max %d, wreq avg %d, "
4544 			    "bytes avg %d\n",
4545 			    if_name(ifp), m_lim,
4546 			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4547 			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4548 		}
4549 
4550 		/* invoke the driver's input poll routine */
4551 		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
4552 		&m_cnt, &m_totlen));
4553 
4554 		if (m_head != NULL) {
4555 			VERIFY(m_tail != NULL && m_cnt > 0);
4556 
4557 			if (dlil_verbose > 1) {
4558 				DLIL_PRINTF("%s: polled %d pkts, "
4559 				    "pkts avg %d max %d, wreq avg %d, "
4560 				    "bytes avg %d\n",
4561 				    if_name(ifp), m_cnt,
4562 				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4563 				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4564 			}
4565 
4566 			/* stats are required for extended variant */
4567 			s.packets_in = m_cnt;
4568 			s.bytes_in = m_totlen;
4569 
4570 			(void) ifnet_input_common(ifp, m_head, m_tail,
4571 			    &s, TRUE, TRUE);
4572 		} else {
4573 			if (dlil_verbose > 1) {
4574 				DLIL_PRINTF("%s: no packets, "
4575 				    "pkts avg %d max %d, wreq avg %d, "
4576 				    "bytes avg %d\n",
4577 				    if_name(ifp), ifp->if_rxpoll_pavg,
4578 				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
4579 				    ifp->if_rxpoll_bavg);
4580 			}
4581 
4582 			(void) ifnet_input_common(ifp, NULL, NULL,
4583 			    NULL, FALSE, TRUE);
4584 		}
4585 
4586 		/* Release the io ref count */
4587 		ifnet_decr_iorefcnt(ifp);
4588 
4589 		lck_mtx_lock_spin(&ifp->if_poll_lock);
4590 
4591 		/* if there's no pending request, we're done */
4592 		if (req == ifp->if_poll_req ||
4593 		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
4594 			break;
4595 		}
4596 	}
4597 skip:
4598 	ifp->if_poll_req = 0;
4599 	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;
4600 
4601 	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
4602 		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
4603 		struct timespec *ts;
4604 
4605 		/*
4606 		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
4607 		 * until ifnet_poll() is called again.
4608 		 */
4609 		ts = &ifp->if_poll_cycle;
4610 		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
4611 			ts = NULL;
4612 		}
4613 
4614 		if (ts != NULL) {
4615 			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
4616 			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
4617 		}
4618 
4619 		(void) assert_wait_deadline(&ifp->if_poll_thread,
4620 		    THREAD_UNINT, deadline);
4621 		lck_mtx_unlock(&ifp->if_poll_lock);
4622 		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
4623 		/* NOTREACHED */
4624 	} else {
4625 terminate:
4626 		/* interface is detached (maybe while asleep)? */
4627 		ifnet_set_poll_cycle(ifp, NULL);
4628 
4629 		/* clear if_poll_thread to allow termination to continue */
4630 		ASSERT(ifp->if_poll_thread != THREAD_NULL);
4631 		ifp->if_poll_thread = THREAD_NULL;
4632 		wakeup((caddr_t)&ifp->if_poll_thread);
4633 		lck_mtx_unlock(&ifp->if_poll_lock);
4634 
4635 		if (dlil_verbose) {
4636 			DLIL_PRINTF("%s: poller thread terminated\n",
4637 			    if_name(ifp));
4638 		}
4639 
4640 		/* for the extra refcnt from kernel_thread_start() */
4641 		thread_deallocate(current_thread());
4642 		/* this is the end */
4643 		thread_terminate(current_thread());
4644 		/* NOTREACHED */
4645 	}
4646 
4647 	/* must never get here */
4648 	VERIFY(0);
4649 	/* NOTREACHED */
4650 	__builtin_unreachable();
4651 }
4652 
4653 void
4654 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4655 {
4656 	if (ts == NULL) {
4657 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4658 	} else {
4659 		*(&ifp->if_poll_cycle) = *ts;
4660 	}
4661 
4662 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4663 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4664 		    if_name(ifp), ts->tv_nsec);
4665 	}
4666 }
4667 
4668 void
4669 ifnet_purge(struct ifnet *ifp)
4670 {
4671 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4672 		if_qflush_snd(ifp, false);
4673 	}
4674 }
4675 
4676 void
4677 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4678 {
4679 	IFCQ_LOCK_ASSERT_HELD(ifq);
4680 
4681 	if (!(IFCQ_IS_READY(ifq))) {
4682 		return;
4683 	}
4684 
4685 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4686 		struct tb_profile tb = {
4687 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4688 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4689 		};
4690 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4691 	}
4692 
4693 	ifclassq_update(ifq, ev);
4694 }
4695 
4696 void
4697 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4698 {
4699 	switch (ev) {
4700 	case CLASSQ_EV_LINK_BANDWIDTH:
4701 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4702 			ifp->if_poll_update++;
4703 		}
4704 		break;
4705 
4706 	default:
4707 		break;
4708 	}
4709 }
4710 
4711 errno_t
4712 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4713 {
4714 	struct ifclassq *ifq;
4715 	u_int32_t omodel;
4716 	errno_t err;
4717 
4718 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4719 		return EINVAL;
4720 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4721 		return ENXIO;
4722 	}
4723 
4724 	ifq = ifp->if_snd;
4725 	IFCQ_LOCK(ifq);
4726 	omodel = ifp->if_output_sched_model;
4727 	ifp->if_output_sched_model = model;
4728 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4729 		ifp->if_output_sched_model = omodel;
4730 	}
4731 	IFCQ_UNLOCK(ifq);
4732 
4733 	return err;
4734 }
4735 
4736 errno_t
4737 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4738 {
4739 	if (ifp == NULL) {
4740 		return EINVAL;
4741 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4742 		return ENXIO;
4743 	}
4744 
4745 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4746 
4747 	return 0;
4748 }
4749 
4750 errno_t
4751 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4752 {
4753 	if (ifp == NULL || maxqlen == NULL) {
4754 		return EINVAL;
4755 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4756 		return ENXIO;
4757 	}
4758 
4759 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4760 
4761 	return 0;
4762 }
4763 
4764 errno_t
4765 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4766 {
4767 	errno_t err;
4768 
4769 	if (ifp == NULL || pkts == NULL) {
4770 		err = EINVAL;
4771 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4772 		err = ENXIO;
4773 	} else {
4774 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4775 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
4776 	}
4777 
4778 	return err;
4779 }
4780 
4781 errno_t
4782 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4783     u_int32_t *pkts, u_int32_t *bytes)
4784 {
4785 	errno_t err;
4786 
4787 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4788 	    (pkts == NULL && bytes == NULL)) {
4789 		err = EINVAL;
4790 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4791 		err = ENXIO;
4792 	} else {
4793 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4794 		    pkts, bytes);
4795 	}
4796 
4797 	return err;
4798 }
4799 
4800 errno_t
4801 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4802 {
4803 	struct dlil_threading_info *inp;
4804 
4805 	if (ifp == NULL) {
4806 		return EINVAL;
4807 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4808 		return ENXIO;
4809 	}
4810 
4811 	if (maxqlen == 0) {
4812 		maxqlen = if_rcvq_maxlen;
4813 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4814 		maxqlen = IF_RCVQ_MINLEN;
4815 	}
4816 
4817 	inp = ifp->if_inp;
4818 	lck_mtx_lock(&inp->dlth_lock);
4819 	qlimit(&inp->dlth_pkts) = maxqlen;
4820 	lck_mtx_unlock(&inp->dlth_lock);
4821 
4822 	return 0;
4823 }
4824 
4825 errno_t
4826 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4827 {
4828 	struct dlil_threading_info *inp;
4829 
4830 	if (ifp == NULL || maxqlen == NULL) {
4831 		return EINVAL;
4832 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4833 		return ENXIO;
4834 	}
4835 
4836 	inp = ifp->if_inp;
4837 	lck_mtx_lock(&inp->dlth_lock);
4838 	*maxqlen = qlimit(&inp->dlth_pkts);
4839 	lck_mtx_unlock(&inp->dlth_lock);
4840 	return 0;
4841 }
4842 
4843 void
4844 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4845     uint16_t delay_timeout)
4846 {
4847 	if (delay_qlen > 0 && delay_timeout > 0) {
4848 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4849 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4850 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4851 		/* convert timeout from microseconds to nanoseconds */
4852 		ifp->if_start_delay_timeout *= 1000;
4853 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4854 		    ifp->if_xname, (uint32_t)delay_qlen,
4855 		    (uint32_t)delay_timeout);
4856 	} else {
4857 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4858 	}
4859 }
4860 
4861 /*
4862  * This function clears the DSCP bits in the IPv4/IPv6 header pointed to by buf.
4863  * While it's ok for buf not to be 32-bit aligned, the caller must ensure that
4864  * buf holds the full header.
4865  */
4866 static __attribute__((noinline)) void
4867 ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
4868 {
4869 	struct ip *ip;
4870 	struct ip6_hdr *ip6;
4871 	uint8_t lbuf[64] __attribute__((aligned(8)));
4872 	uint8_t *p = buf;
4873 
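	/*
	 * lbuf is an aligned bounce buffer: when buf is not suitably aligned
	 * for direct header access, the header is copied into lbuf, modified
	 * there, and copied back once the DSCP bits have been cleared.
	 */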
4874 	if (ip_ver == IPVERSION) {
4875 		uint8_t old_tos;
4876 		uint32_t sum;
4877 
4878 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4879 			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
4880 			bcopy(buf, lbuf, sizeof(struct ip));
4881 			p = lbuf;
4882 		}
4883 		ip = (struct ip *)(void *)p;
4884 		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
4885 			return;
4886 		}
4887 
4888 		DTRACE_IP1(clear__v4, struct ip *, ip);
4889 		old_tos = ip->ip_tos;
4890 		ip->ip_tos &= IPTOS_ECN_MASK;
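		/*
		 * Incrementally patch the IP header checksum for the TOS
		 * change: add back the old value, subtract the new one and
		 * fold the carry into the low 16 bits (cf. RFC 1624).
		 */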
4891 		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
4892 		sum = (sum >> 16) + (sum & 0xffff);
4893 		ip->ip_sum = (uint16_t)(sum & 0xffff);
4894 
4895 		if (__improbable(p == lbuf)) {
4896 			bcopy(lbuf, buf, sizeof(struct ip));
4897 		}
4898 	} else {
4899 		uint32_t flow;
4900 		ASSERT(ip_ver == IPV6_VERSION);
4901 
4902 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4903 			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
4904 			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
4905 			p = lbuf;
4906 		}
4907 		ip6 = (struct ip6_hdr *)(void *)p;
4908 		flow = ntohl(ip6->ip6_flow);
4909 		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
4910 			return;
4911 		}
4912 
4913 		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
4914 		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
4915 
4916 		if (__improbable(p == lbuf)) {
4917 			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
4918 		}
4919 	}
4920 }
4921 
4922 static inline errno_t
4923 ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
4924     classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
4925 {
4926 #if SKYWALK
4927 	volatile struct sk_nexusadv *nxadv = NULL;
4928 #endif /* SKYWALK */
4929 	volatile uint64_t *fg_ts = NULL;
4930 	volatile uint64_t *rt_ts = NULL;
4931 	struct timespec now;
4932 	u_int64_t now_nsec = 0;
4933 	int error = 0;
4934 	uint8_t *mcast_buf = NULL;
4935 	uint8_t ip_ver;
4936 	uint32_t pktlen;
4937 
4938 	ASSERT(ifp->if_eflags & IFEF_TXSTART);
4939 #if SKYWALK
4940 	/*
4941 	 * If attached to flowswitch, grab pointers to the
4942 	 * timestamp variables in the nexus advisory region.
4943 	 */
4944 	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
4945 	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
4946 		fg_ts = &nxadv->nxadv_fg_sendts;
4947 		rt_ts = &nxadv->nxadv_rt_sendts;
4948 	}
4949 #endif /* SKYWALK */
4950 
4951 	/*
4952 	 * If packet already carries a timestamp, either from dlil_output()
4953 	 * or from flowswitch, use it here.  Otherwise, record timestamp.
4954 	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
4955 	 * the timestamp value is used internally there.
4956 	 */
4957 	switch (p->cp_ptype) {
4958 	case QP_MBUF:
4959 #if SKYWALK
4960 		/*
4961 		 * Valid only for non-native (compat) Skywalk interface.
4962 		 * If the data source uses packet, caller must convert
4963 		 * it to mbuf first prior to calling this routine.
4964 		 */
4965 		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
4966 #endif /* SKYWALK */
4967 		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
4968 		ASSERT(p->cp_mbuf->m_nextpkt == NULL);
4969 
4970 		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
4971 		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
4972 			nanouptime(&now);
4973 			net_timernsec(&now, &now_nsec);
4974 			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
4975 		}
4976 		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
4977 		/*
4978 		 * If the packet service class is not background,
4979 		 * update the timestamp to indicate recent activity
4980 		 * on a foreground socket.
4981 		 */
4982 		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
4983 		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
4984 			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
4985 			    PKTF_SO_BACKGROUND)) {
4986 				ifp->if_fg_sendts = (uint32_t)_net_uptime;
4987 				if (fg_ts != NULL) {
4988 					*fg_ts = (uint32_t)_net_uptime;
4989 				}
4990 			}
4991 			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
4992 				ifp->if_rt_sendts = (uint32_t)_net_uptime;
4993 				if (rt_ts != NULL) {
4994 					*rt_ts = (uint32_t)_net_uptime;
4995 				}
4996 			}
4997 		}
4998 		pktlen = m_pktlen(p->cp_mbuf);
4999 
5000 		/*
5001 		 * Some Wi-Fi AP implementations do not correctly handle
5002 		 * multicast IP packets with DSCP bits set (rdar://9331522).
5003 		 * As a workaround we clear the DSCP bits but keep service
5004 		 * class (rdar://51507725).
5005 		 */
5006 		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
5007 		    IFNET_IS_WIFI_INFRA(ifp)) {
5008 			size_t len = mbuf_len(p->cp_mbuf), hlen;
5009 			struct ether_header *eh;
5010 			boolean_t pullup = FALSE;
5011 			uint16_t etype;
5012 
5013 			if (__improbable(len < sizeof(struct ether_header))) {
5014 				DTRACE_IP1(small__ether, size_t, len);
5015 				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
5016 				    sizeof(struct ether_header))) == NULL) {
5017 					return ENOMEM;
5018 				}
5019 			}
5020 			eh = mtod(p->cp_mbuf, struct ether_header *);
5021 			etype = ntohs(eh->ether_type);
5022 			if (etype == ETHERTYPE_IP) {
5023 				hlen = sizeof(struct ether_header) +
5024 				    sizeof(struct ip);
5025 				if (len < hlen) {
5026 					DTRACE_IP1(small__v4, size_t, len);
5027 					pullup = TRUE;
5028 				}
5029 				ip_ver = IPVERSION;
5030 			} else if (etype == ETHERTYPE_IPV6) {
5031 				hlen = sizeof(struct ether_header) +
5032 				    sizeof(struct ip6_hdr);
5033 				if (len < hlen) {
5034 					DTRACE_IP1(small__v6, size_t, len);
5035 					pullup = TRUE;
5036 				}
5037 				ip_ver = IPV6_VERSION;
5038 			} else {
5039 				DTRACE_IP1(invalid__etype, uint16_t, etype);
5040 				break;
5041 			}
5042 			if (pullup) {
5043 				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
5044 				    NULL) {
5045 					return ENOMEM;
5046 				}
5047 
5048 				eh = mtod(p->cp_mbuf, struct ether_header *);
5049 			}
5050 			mcast_buf = (uint8_t *)(eh + 1);
5051 			/*
5052 			 * ifnet_mcast_clear_dscp() will finish the work below.
5053 			 * Note that the pullups above ensure that mcast_buf
5054 			 * points to a full IP header.
5055 			 */
5056 		}
5057 		break;
5058 
5059 #if SKYWALK
5060 	case QP_PACKET:
5061 		/*
5062 		 * Valid only for native Skywalk interface.  If the data
5063 		 * source uses mbuf, caller must convert it to packet first
5064 		 * prior to calling this routine.
5065 		 */
5066 		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
5067 		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
5068 		    p->cp_kpkt->pkt_timestamp == 0) {
5069 			nanouptime(&now);
5070 			net_timernsec(&now, &now_nsec);
5071 			p->cp_kpkt->pkt_timestamp = now_nsec;
5072 		}
5073 		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
5074 		/*
5075 		 * If the packet service class is not background,
5076 		 * update the timestamps on the interface, as well as
5077 		 * the ones in nexus-wide advisory to indicate recent
5078 		 * activity on a foreground flow.
5079 		 */
5080 		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
5081 			ifp->if_fg_sendts = (uint32_t)_net_uptime;
5082 			if (fg_ts != NULL) {
5083 				*fg_ts = (uint32_t)_net_uptime;
5084 			}
5085 		}
5086 		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
5087 			ifp->if_rt_sendts = (uint32_t)_net_uptime;
5088 			if (rt_ts != NULL) {
5089 				*rt_ts = (uint32_t)_net_uptime;
5090 			}
5091 		}
5092 		pktlen = p->cp_kpkt->pkt_length;
5093 
5094 		/*
5095 		 * Some Wi-Fi AP implementations do not correctly handle
5096 		 * multicast IP packets with DSCP bits set (rdar://9331522).
5097 		 * As a workaround we clear the DSCP bits but keep service
5098 		 * class (rdar://51507725).
5099 		 */
5100 		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
5101 		    IFNET_IS_WIFI_INFRA(ifp)) {
5102 			uint8_t *baddr;
5103 			struct ether_header *eh;
5104 			uint16_t etype;
5105 
5106 			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
5107 			baddr += p->cp_kpkt->pkt_headroom;
5108 			if (__improbable(pktlen < sizeof(struct ether_header))) {
5109 				DTRACE_IP1(pkt__small__ether, __kern_packet *,
5110 				    p->cp_kpkt);
5111 				break;
5112 			}
5113 			eh = (struct ether_header *)(void *)baddr;
5114 			etype = ntohs(eh->ether_type);
5115 			if (etype == ETHERTYPE_IP) {
5116 				if (pktlen < sizeof(struct ether_header) +
5117 				    sizeof(struct ip)) {
5118 					DTRACE_IP1(pkt__small__v4, uint32_t,
5119 					    pktlen);
5120 					break;
5121 				}
5122 				ip_ver = IPVERSION;
5123 			} else if (etype == ETHERTYPE_IPV6) {
5124 				if (pktlen < sizeof(struct ether_header) +
5125 				    sizeof(struct ip6_hdr)) {
5126 					DTRACE_IP1(pkt__small__v6, uint32_t,
5127 					    pktlen);
5128 					break;
5129 				}
5130 				ip_ver = IPV6_VERSION;
5131 			} else {
5132 				DTRACE_IP1(pkt__invalid__etype, uint16_t,
5133 				    etype);
5134 				break;
5135 			}
5136 			mcast_buf = (uint8_t *)(eh + 1);
5137 			/*
5138 			 * ifnet_mcast_clear_dscp() will finish the work below.
5139 			 * The checks above verify that the IP header is in the
5140 			 * first buflet.
5141 			 */
5142 		}
5143 		break;
5144 #endif /* SKYWALK */
5145 
5146 	default:
5147 		VERIFY(0);
5148 		/* NOTREACHED */
5149 		__builtin_unreachable();
5150 	}
5151 
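	/*
	 * Deferred from the per-packet-type handling above: clear the DSCP
	 * bits now that mcast_buf is known to point at a complete IP header.
	 */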
5152 	if (mcast_buf != NULL) {
5153 		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
5154 	}
5155 
5156 	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
5157 		if (now_nsec == 0) {
5158 			nanouptime(&now);
5159 			net_timernsec(&now, &now_nsec);
5160 		}
5161 		/*
5162 		 * If the driver chose to delay start callback for
5163 		 * coalescing multiple packets, then use the following
5164 		 * heuristics to make sure that start callback will
5165 		 * be delayed only when bulk data transfer is detected.
5166 		 * 1. number of packets enqueued in (delay_win * 2) is
5167 		 * greater than or equal to the delay qlen.
5168 		 * 2. If delay_start is enabled it will stay enabled for
5169 		 * another 10 idle windows. This is to take into account
5170 		 * variable RTT and burst traffic.
5171 		 * 3. If the time elapsed since last enqueue is more
5172 		 * than 200ms we disable delaying start callback. This is
5173 		 * to take idle time into account.
5174 		 */
5175 		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
5176 		if (ifp->if_start_delay_swin > 0) {
5177 			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
5178 				ifp->if_start_delay_cnt++;
5179 			} else if ((now_nsec - ifp->if_start_delay_swin)
5180 			    >= (200 * 1000 * 1000)) {
5181 				ifp->if_start_delay_swin = now_nsec;
5182 				ifp->if_start_delay_cnt = 1;
5183 				ifp->if_start_delay_idle = 0;
5184 				if (ifp->if_eflags & IFEF_DELAY_START) {
5185 					if_clear_eflags(ifp, IFEF_DELAY_START);
5186 					ifnet_delay_start_disabled_increment();
5187 				}
5188 			} else {
5189 				if (ifp->if_start_delay_cnt >=
5190 				    ifp->if_start_delay_qlen) {
5191 					if_set_eflags(ifp, IFEF_DELAY_START);
5192 					ifp->if_start_delay_idle = 0;
5193 				} else {
5194 					if (ifp->if_start_delay_idle >= 10) {
5195 						if_clear_eflags(ifp,
5196 						    IFEF_DELAY_START);
5197 						ifnet_delay_start_disabled_increment();
5198 					} else {
5199 						ifp->if_start_delay_idle++;
5200 					}
5201 				}
5202 				ifp->if_start_delay_swin = now_nsec;
5203 				ifp->if_start_delay_cnt = 1;
5204 			}
5205 		} else {
5206 			ifp->if_start_delay_swin = now_nsec;
5207 			ifp->if_start_delay_cnt = 1;
5208 			ifp->if_start_delay_idle = 0;
5209 			if_clear_eflags(ifp, IFEF_DELAY_START);
5210 		}
5211 	} else {
5212 		if_clear_eflags(ifp, IFEF_DELAY_START);
5213 	}
5214 
5215 	/* enqueue the packet (caller consumes object) */
5216 	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
5217 	    1, pktlen, pdrop);
5218 
5219 	/*
5220 	 * Tell the driver to start dequeueing; do this even when the queue
5221 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5222 	 * be dequeueing from other unsuspended queues.
5223 	 */
5224 	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
5225 	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
5226 		ifnet_start(ifp);
5227 	}
5228 
5229 	return error;
5230 }
5231 
5232 static inline errno_t
5233 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5234     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5235     boolean_t flush, boolean_t *pdrop)
5236 {
5237 	int error;
5238 
5239 	/* enqueue the packet (caller consumes object) */
5240 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5241 	    cnt, bytes, pdrop);
5242 
5243 	/*
5244 	 * Tell the driver to start dequeueing; do this even when the queue
5245 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5246 	 * be dequeueing from other unsuspended queues.
5247 	 */
5248 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5249 		ifnet_start(ifp);
5250 	}
5251 	return error;
5252 }
5253 
5254 int
5255 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5256 {
5257 	struct ifnet *ifp = handle;
5258 	boolean_t pdrop;        /* dummy */
5259 	uint32_t i;
5260 
5261 	ASSERT(n_pkts >= 1);
5262 	for (i = 0; i < n_pkts - 1; i++) {
5263 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5264 		    FALSE, &pdrop);
5265 	}
5266 	/* flush with the last packet */
5267 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5268 	    TRUE, &pdrop);
5269 
5270 	return 0;
5271 }
5272 
5273 static inline errno_t
5274 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5275     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5276 {
5277 	if (ifp->if_output_netem != NULL) {
5278 		bool drop;
5279 		errno_t error;
5280 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5281 		*pdrop = drop ? TRUE : FALSE;
5282 		return error;
5283 	} else {
5284 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5285 	}
5286 }
5287 
5288 errno_t
5289 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5290 {
5291 	uint32_t bytes = m_pktlen(m);
5292 	struct mbuf *tail = m;
5293 	uint32_t cnt = 1;
5294 	boolean_t pdrop;
5295 
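	/*
	 * Walk the chain to find the tail and to compute the packet and byte
	 * counts expected by the chain enqueue path below.
	 */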
5296 	while (tail->m_nextpkt) {
5297 		VERIFY(tail->m_flags & M_PKTHDR);
5298 		tail = tail->m_nextpkt;
5299 		cnt++;
5300 		bytes += m_pktlen(tail);
5301 	}
5302 
5303 	return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5304 }
5305 
5306 errno_t
5307 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5308     boolean_t *pdrop)
5309 {
5310 	classq_pkt_t pkt;
5311 
5312 	m_add_hdr_crumb_interface_output(m, ifp->if_index, false);
5313 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5314 	    m->m_nextpkt != NULL) {
5315 		if (m != NULL) {
5316 			m_freem_list(m);
5317 			*pdrop = TRUE;
5318 		}
5319 		return EINVAL;
5320 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5321 	    !IF_FULLY_ATTACHED(ifp)) {
5322 		/* flag tested without lock for performance */
5323 		m_freem(m);
5324 		*pdrop = TRUE;
5325 		return ENXIO;
5326 	} else if (!(ifp->if_flags & IFF_UP)) {
5327 		m_freem(m);
5328 		*pdrop = TRUE;
5329 		return ENETDOWN;
5330 	}
5331 
5332 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5333 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5334 }
5335 
5336 errno_t
5337 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5338     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5339     boolean_t *pdrop)
5340 {
5341 	classq_pkt_t head, tail;
5342 
5343 	m_add_hdr_crumb_interface_output(m_head, ifp->if_index, true);
5344 	ASSERT(m_head != NULL);
5345 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5346 	ASSERT(m_tail != NULL);
5347 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5348 	ASSERT(ifp != NULL);
5349 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5350 
5351 	if (!IF_FULLY_ATTACHED(ifp)) {
5352 		/* flag tested without lock for performance */
5353 		m_freem_list(m_head);
5354 		*pdrop = TRUE;
5355 		return ENXIO;
5356 	} else if (!(ifp->if_flags & IFF_UP)) {
5357 		m_freem_list(m_head);
5358 		*pdrop = TRUE;
5359 		return ENETDOWN;
5360 	}
5361 
5362 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5363 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5364 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5365 	           flush, pdrop);
5366 }
5367 
5368 #if SKYWALK
5369 static errno_t
5370 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5371     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5372 {
5373 	classq_pkt_t pkt;
5374 
5375 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5376 
5377 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5378 		if (kpkt != NULL) {
5379 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5380 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5381 			*pdrop = TRUE;
5382 		}
5383 		return EINVAL;
5384 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5385 	    !IF_FULLY_ATTACHED(ifp))) {
5386 		/* flag tested without lock for performance */
5387 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5388 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5389 		*pdrop = TRUE;
5390 		return ENXIO;
5391 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5392 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5393 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5394 		*pdrop = TRUE;
5395 		return ENETDOWN;
5396 	}
5397 
5398 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5399 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5400 }
5401 
5402 errno_t
5403 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5404     boolean_t flush, boolean_t *pdrop)
5405 {
5406 	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5407 }
5408 
5409 errno_t
5410 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5411     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5412 {
5413 	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5414 }
5415 
5416 static errno_t
5417 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5418     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5419     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5420 {
5421 	classq_pkt_t head, tail;
5422 
5423 	ASSERT(k_head != NULL);
5424 	ASSERT(k_tail != NULL);
5425 	ASSERT(ifp != NULL);
5426 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5427 
5428 	if (!IF_FULLY_ATTACHED(ifp)) {
5429 		/* flag tested without lock for performance */
5430 		pp_free_packet_chain(k_head, NULL);
5431 		*pdrop = TRUE;
5432 		return ENXIO;
5433 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5434 		pp_free_packet_chain(k_head, NULL);
5435 		*pdrop = TRUE;
5436 		return ENETDOWN;
5437 	}
5438 
5439 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5440 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5441 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5442 	           flush, pdrop);
5443 }
5444 
5445 errno_t
5446 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5447     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5448     boolean_t *pdrop)
5449 {
5450 	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5451 	           cnt, bytes, flush, pdrop);
5452 }
5453 
5454 errno_t
5455 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5456     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5457     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5458 {
5459 	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5460 	           cnt, bytes, flush, pdrop);
5461 }
5462 #endif /* SKYWALK */
5463 
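/*
 * Sketch of how a driver using the pull (IFEF_TXSTART) output model might
 * drain its send queue from its start callback with ifnet_dequeue();
 * drv_start() and drv_tx_submit() are hypothetical driver routines, shown
 * here for illustration only:
 *
 *	static void
 *	drv_start(struct ifnet *ifp)
 *	{
 *		struct mbuf *m;
 *
 *		while (ifnet_dequeue(ifp, &m) == 0 && m != NULL) {
 *			drv_tx_submit(ifp, m);
 *		}
 *	}
 */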
5464 errno_t
5465 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5466 {
5467 	errno_t rc;
5468 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5469 
5470 	if (ifp == NULL || mp == NULL) {
5471 		return EINVAL;
5472 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5473 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5474 		return ENXIO;
5475 	}
5476 	if (!ifnet_is_attached(ifp, 1)) {
5477 		return ENXIO;
5478 	}
5479 
5480 #if SKYWALK
5481 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5482 #endif /* SKYWALK */
5483 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5484 	    &pkt, NULL, NULL, NULL, 0);
5485 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5486 	ifnet_decr_iorefcnt(ifp);
5487 	*mp = pkt.cp_mbuf;
5488 	m_add_hdr_crumb_interface_output(*mp, ifp->if_index, false);
5489 	return rc;
5490 }
5491 
5492 errno_t
5493 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5494     struct mbuf **mp)
5495 {
5496 	errno_t rc;
5497 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5498 
5499 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5500 		return EINVAL;
5501 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5502 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5503 		return ENXIO;
5504 	}
5505 	if (!ifnet_is_attached(ifp, 1)) {
5506 		return ENXIO;
5507 	}
5508 
5509 #if SKYWALK
5510 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5511 #endif /* SKYWALK */
5512 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5513 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5514 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5515 	ifnet_decr_iorefcnt(ifp);
5516 	*mp = pkt.cp_mbuf;
5517 	m_add_hdr_crumb_interface_output(*mp, ifp->if_index, false);
5518 	return rc;
5519 }
5520 
5521 errno_t
5522 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5523     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5524 {
5525 	errno_t rc;
5526 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5527 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5528 
5529 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5530 		return EINVAL;
5531 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5532 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5533 		return ENXIO;
5534 	}
5535 	if (!ifnet_is_attached(ifp, 1)) {
5536 		return ENXIO;
5537 	}
5538 
5539 #if SKYWALK
5540 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5541 #endif /* SKYWALK */
5542 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5543 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5544 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5545 	ifnet_decr_iorefcnt(ifp);
5546 	*head = pkt_head.cp_mbuf;
5547 	m_add_hdr_crumb_interface_output(*head, ifp->if_index, false);
5548 	if (tail != NULL) {
5549 		*tail = pkt_tail.cp_mbuf;
5550 	}
5551 	return rc;
5552 }
5553 
5554 errno_t
5555 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5556     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5557 {
5558 	errno_t rc;
5559 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5560 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5561 
5562 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5563 		return EINVAL;
5564 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5565 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5566 		return ENXIO;
5567 	}
5568 	if (!ifnet_is_attached(ifp, 1)) {
5569 		return ENXIO;
5570 	}
5571 
5572 #if SKYWALK
5573 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5574 #endif /* SKYWALK */
5575 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5576 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5577 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5578 	ifnet_decr_iorefcnt(ifp);
5579 	*head = pkt_head.cp_mbuf;
5580 	m_add_hdr_crumb_interface_output(*head, ifp->if_index, false);
5581 	if (tail != NULL) {
5582 		*tail = pkt_tail.cp_mbuf;
5583 	}
5584 	return rc;
5585 }
5586 
5587 errno_t
5588 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5589     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5590     u_int32_t *len)
5591 {
5592 	errno_t rc;
5593 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5594 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5595 
5596 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5597 	    !MBUF_VALID_SC(sc)) {
5598 		return EINVAL;
5599 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5600 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5601 		return ENXIO;
5602 	}
5603 	if (!ifnet_is_attached(ifp, 1)) {
5604 		return ENXIO;
5605 	}
5606 
5607 #if SKYWALK
5608 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5609 #endif /* SKYWALK */
5610 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5611 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5612 	    cnt, len, 0);
5613 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5614 	ifnet_decr_iorefcnt(ifp);
5615 	*head = pkt_head.cp_mbuf;
5616 	m_add_hdr_crumb_interface_output(*head, ifp->if_index, false);
5617 	if (tail != NULL) {
5618 		*tail = pkt_tail.cp_mbuf;
5619 	}
5620 	return rc;
5621 }
5622 
5623 #if XNU_TARGET_OS_OSX
5624 errno_t
5625 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5626     const struct sockaddr *dest, const char *dest_linkaddr,
5627     const char *frame_type, u_int32_t *pre, u_int32_t *post)
5628 {
5629 	if (pre != NULL) {
5630 		*pre = 0;
5631 	}
5632 	if (post != NULL) {
5633 		*post = 0;
5634 	}
5635 
5636 	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5637 }
5638 #endif /* XNU_TARGET_OS_OSX */
5639 
5640 static boolean_t
5641 packet_has_vlan_tag(struct mbuf * m)
5642 {
5643 	u_int   tag = 0;
5644 
5645 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5646 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5647 		if (tag == 0) {
5648 			/* the packet is just priority-tagged, clear the bit */
5649 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5650 		}
5651 	}
5652 	return tag != 0;
5653 }
5654 
5655 static int
5656 dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
5657     char **frame_header_p, protocol_family_t protocol_family,
5658     boolean_t skip_bridge)
5659 {
5660 	boolean_t               is_vlan_packet = FALSE;
5661 	struct ifnet_filter     *filter;
5662 	struct mbuf             *m = *m_p;
5663 
5664 	is_vlan_packet = packet_has_vlan_tag(m);
5665 
5666 	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
5667 		return 0;
5668 	}
5669 
5670 	/*
5671 	 * Pass the inbound packet to the interface filters
5672 	 */
5673 	lck_mtx_lock_spin(&ifp->if_flt_lock);
5674 	/* prevent filter list from changing in case we drop the lock */
5675 	if_flt_monitor_busy(ifp);
5676 	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
5677 		int result;
5678 
5679 		/* exclude VLAN packets from external filters PR-3586856 */
5680 		if (is_vlan_packet &&
5681 		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
5682 			continue;
5683 		}
5684 		/* the bridge has already seen the packet */
5685 		if (skip_bridge &&
5686 		    (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
5687 			continue;
5688 		}
5689 		if (!filter->filt_skip && filter->filt_input != NULL &&
5690 		    (filter->filt_protocol == 0 ||
5691 		    filter->filt_protocol == protocol_family)) {
5692 			lck_mtx_unlock(&ifp->if_flt_lock);
5693 
5694 			result = (*filter->filt_input)(filter->filt_cookie,
5695 			    ifp, protocol_family, m_p, frame_header_p);
5696 
5697 			lck_mtx_lock_spin(&ifp->if_flt_lock);
5698 			if (result != 0) {
5699 				/* we're done with the filter list */
5700 				if_flt_monitor_unbusy(ifp);
5701 				lck_mtx_unlock(&ifp->if_flt_lock);
5702 				return result;
5703 			}
5704 		}
5705 	}
5706 	/* we're done with the filter list */
5707 	if_flt_monitor_unbusy(ifp);
5708 	lck_mtx_unlock(&ifp->if_flt_lock);
5709 
5710 	/*
5711 	 * Strip away the M_PROTO1 bit prior to sending the packet up the stack,
5712 	 * as it is meant to be local to a subsystem -- if_bridge for M_PROTO1
5713 	 */
5714 	if (*m_p != NULL) {
5715 		(*m_p)->m_flags &= ~M_PROTO1;
5716 	}
5717 
5718 	return 0;
5719 }
5720 
5721 __attribute__((noinline))
5722 static int
5723 dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
5724     protocol_family_t protocol_family)
5725 {
5726 	boolean_t               is_vlan_packet;
5727 	struct ifnet_filter     *filter;
5728 	struct mbuf             *m = *m_p;
5729 
5730 	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
5731 		return 0;
5732 	}
5733 	is_vlan_packet = packet_has_vlan_tag(m);
5734 
5735 	/*
5736 	 * Pass the outbound packet to the interface filters
5737 	 */
5738 	lck_mtx_lock_spin(&ifp->if_flt_lock);
5739 	/* prevent filter list from changing in case we drop the lock */
5740 	if_flt_monitor_busy(ifp);
5741 	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
5742 		int result;
5743 
5744 		/* exclude VLAN packets from external filters PR-3586856 */
5745 		if (is_vlan_packet &&
5746 		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
5747 			continue;
5748 		}
5749 
5750 		if (!filter->filt_skip && filter->filt_output != NULL &&
5751 		    (filter->filt_protocol == 0 ||
5752 		    filter->filt_protocol == protocol_family)) {
5753 			lck_mtx_unlock(&ifp->if_flt_lock);
5754 
5755 			result = filter->filt_output(filter->filt_cookie, ifp,
5756 			    protocol_family, m_p);
5757 
5758 			lck_mtx_lock_spin(&ifp->if_flt_lock);
5759 			if (result != 0) {
5760 				/* we're done with the filter list */
5761 				if_flt_monitor_unbusy(ifp);
5762 				lck_mtx_unlock(&ifp->if_flt_lock);
5763 				return result;
5764 			}
5765 		}
5766 	}
5767 	/* we're done with the filter list */
5768 	if_flt_monitor_unbusy(ifp);
5769 	lck_mtx_unlock(&ifp->if_flt_lock);
5770 
5771 	return 0;
5772 }
5773 
5774 static void
5775 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5776 {
5777 	int error;
5778 
5779 	if (ifproto->proto_kpi == kProtoKPI_v1) {
5780 		/* Version 1 protocols get one packet at a time */
5781 		while (m != NULL) {
5782 			char *  frame_header;
5783 			mbuf_t  next_packet;
5784 
5785 			next_packet = m->m_nextpkt;
5786 			m->m_nextpkt = NULL;
5787 			frame_header = m->m_pkthdr.pkt_hdr;
5788 			m->m_pkthdr.pkt_hdr = NULL;
5789 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5790 			    ifproto->protocol_family, m, frame_header);
5791 			if (error != 0 && error != EJUSTRETURN) {
5792 				m_freem(m);
5793 			}
5794 			m = next_packet;
5795 		}
5796 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
5797 		/* Version 2 protocols support packet lists */
5798 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5799 		    ifproto->protocol_family, m);
5800 		if (error != 0 && error != EJUSTRETURN) {
5801 			m_freem_list(m);
5802 		}
5803 	}
5804 }
5805 
5806 static void
5807 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5808     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5809 {
5810 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5811 
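	/*
	 * Accumulate into the input thread's private counters; they are
	 * folded into the ifnet counters later by dlil_input_stats_sync().
	 */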
5812 	if (s->packets_in != 0) {
5813 		d->packets_in += s->packets_in;
5814 	}
5815 	if (s->bytes_in != 0) {
5816 		d->bytes_in += s->bytes_in;
5817 	}
5818 	if (s->errors_in != 0) {
5819 		d->errors_in += s->errors_in;
5820 	}
5821 
5822 	if (s->packets_out != 0) {
5823 		d->packets_out += s->packets_out;
5824 	}
5825 	if (s->bytes_out != 0) {
5826 		d->bytes_out += s->bytes_out;
5827 	}
5828 	if (s->errors_out != 0) {
5829 		d->errors_out += s->errors_out;
5830 	}
5831 
5832 	if (s->collisions != 0) {
5833 		d->collisions += s->collisions;
5834 	}
5835 	if (s->dropped != 0) {
5836 		d->dropped += s->dropped;
5837 	}
5838 
5839 	if (poll) {
5840 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5841 	}
5842 }
5843 
5844 static boolean_t
5845 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
5846 {
5847 	struct ifnet_stat_increment_param *s = &inp->dlth_stats;
5848 
5849 	/*
5850 	 * Use of atomic operations is unavoidable here because
5851 	 * these stats may also be incremented elsewhere via KPIs.
5852 	 */
5853 	if (s->packets_in != 0) {
5854 		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
5855 		s->packets_in = 0;
5856 	}
5857 	if (s->bytes_in != 0) {
5858 		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
5859 		s->bytes_in = 0;
5860 	}
5861 	if (s->errors_in != 0) {
5862 		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
5863 		s->errors_in = 0;
5864 	}
5865 
5866 	if (s->packets_out != 0) {
5867 		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
5868 		s->packets_out = 0;
5869 	}
5870 	if (s->bytes_out != 0) {
5871 		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
5872 		s->bytes_out = 0;
5873 	}
5874 	if (s->errors_out != 0) {
5875 		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
5876 		s->errors_out = 0;
5877 	}
5878 
5879 	if (s->collisions != 0) {
5880 		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
5881 		s->collisions = 0;
5882 	}
5883 	if (s->dropped != 0) {
5884 		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
5885 		s->dropped = 0;
5886 	}
5887 
5888 	/*
5889 	 * No need for atomic operations as they are modified here
5890 	 * only from within the DLIL input thread context.
5891 	 */
5892 	if (ifp->if_poll_tstats.packets != 0) {
5893 		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
5894 		ifp->if_poll_tstats.packets = 0;
5895 	}
5896 	if (ifp->if_poll_tstats.bytes != 0) {
5897 		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
5898 		ifp->if_poll_tstats.bytes = 0;
5899 	}
5900 
5901 	return ifp->if_data_threshold != 0;
5902 }
5903 
5904 __private_extern__ void
5905 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5906 {
5907 	return dlil_input_packet_list_common(ifp, m, 0,
5908 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5909 }
5910 
5911 __private_extern__ void
5912 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5913     u_int32_t cnt, ifnet_model_t mode)
5914 {
5915 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5916 }
5917 
5918 static inline mbuf_t
5919 handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
5920 {
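	/*
	 * Mark the filter monitor busy around the call into the bridge so
	 * the filter list cannot change while bridge_early_input() runs;
	 * the spinlock itself is not held across the call.
	 */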
5921 	lck_mtx_lock_spin(&ifp->if_flt_lock);
5922 	if_flt_monitor_busy(ifp);
5923 	lck_mtx_unlock(&ifp->if_flt_lock);
5924 
5925 	if (ifp->if_bridge != NULL) {
5926 		m = bridge_early_input(ifp, m, cnt);
5927 	}
5928 	lck_mtx_lock_spin(&ifp->if_flt_lock);
5929 	if_flt_monitor_unbusy(ifp);
5930 	lck_mtx_unlock(&ifp->if_flt_lock);
5931 	return m;
5932 }
5933 
5934 static void
5935 dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
5936     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
5937 {
5938 	int error = 0;
5939 	protocol_family_t protocol_family;
5940 	mbuf_t next_packet;
5941 	ifnet_t ifp = ifp_param;
5942 	char *frame_header = NULL;
5943 	struct if_proto *last_ifproto = NULL;
5944 	mbuf_t pkt_first = NULL;
5945 	mbuf_t *pkt_next = NULL;
5946 	u_int32_t poll_thresh = 0, poll_ival = 0;
5947 	int iorefcnt = 0;
5948 	boolean_t skip_bridge_filter = FALSE;
5949 
5950 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
5951 
5952 	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
5953 	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
5954 		poll_thresh = cnt;
5955 	}
5956 	if (bridge_enable_early_input != 0 &&
5957 	    ifp != NULL && ifp->if_bridge != NULL) {
5958 		m = handle_bridge_early_input(ifp, m, cnt);
5959 		skip_bridge_filter = TRUE;
5960 	}
5961 	while (m != NULL) {
5962 		struct if_proto *ifproto = NULL;
5963 		uint32_t pktf_mask;     /* pkt flags to preserve */
5964 
5965 		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
5966 		m_add_hdr_crumb_interface_input(m, ifp->if_index, false);
5967 
5968 		if (ifp_param == NULL) {
5969 			ifp = m->m_pkthdr.rcvif;
5970 		}
5971 
5972 		if ((ifp->if_eflags & IFEF_RXPOLL) &&
5973 		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
5974 		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
5975 			ifnet_poll(ifp);
5976 		}
5977 
5978 		/* Check if this mbuf looks valid */
5979 		MBUF_INPUT_CHECK(m, ifp);
5980 
5981 		next_packet = m->m_nextpkt;
5982 		m->m_nextpkt = NULL;
5983 		frame_header = m->m_pkthdr.pkt_hdr;
5984 		m->m_pkthdr.pkt_hdr = NULL;
5985 
5986 		/*
5987 		 * Get an IO reference count if the interface is not
5988 		 * loopback (lo0) and it is attached; lo0 never goes
5989 		 * away, so optimize for that.
5990 		 */
5991 		if (ifp != lo_ifp) {
5992 			/* iorefcnt is 0 if it hasn't been taken yet */
5993 			if (iorefcnt == 0) {
5994 				if (!ifnet_datamov_begin(ifp)) {
5995 					m_freem(m);
5996 					goto next;
5997 				}
5998 			}
5999 			iorefcnt = 1;
6000 			/*
6001 			 * Preserve the time stamp and skip pktap flags.
6002 			 */
6003 			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
6004 		} else {
6005 			/*
6006 			 * If this arrived on lo0, preserve interface addr
6007 			 * info to allow for connectivity between loopback
6008 			 * and local interface addresses.
6009 			 */
6010 			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
6011 		}
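		/*
		 * Always preserve PKTF_WAKE_PKT so a wake packet marked by
		 * the driver is not cleared by m_classifier_init() below.
		 */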
6012 		pktf_mask |= PKTF_WAKE_PKT;
6013 
6014 		/* make sure packet comes in clean */
6015 		m_classifier_init(m, pktf_mask);
6016 
6017 		ifp_inc_traffic_class_in(ifp, m);
6018 
6019 		/* find which protocol family this packet is for */
6020 		ifnet_lock_shared(ifp);
6021 		error = (*ifp->if_demux)(ifp, m, frame_header,
6022 		    &protocol_family);
6023 		ifnet_lock_done(ifp);
6024 		if (error != 0) {
6025 			if (error == EJUSTRETURN) {
6026 				goto next;
6027 			}
6028 			protocol_family = 0;
6029 		}
6030 		/* check for an updated frame header */
6031 		if (m->m_pkthdr.pkt_hdr != NULL) {
6032 			frame_header = m->m_pkthdr.pkt_hdr;
6033 			m->m_pkthdr.pkt_hdr = NULL;
6034 		}
6035 
6036 #if (DEVELOPMENT || DEBUG)
6037 		/*
6038 		 * For testing we do not care about broadcast and multicast packets as
6039 		 * they are not as controllable as unicast traffic
6040 		 */
6041 		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
6042 			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
6043 			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
6044 				/*
6045 				 * This is a one-shot command
6046 				 */
6047 				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
6048 				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
6049 			}
6050 		}
6051 #endif /* (DEVELOPMENT || DEBUG) */
6052 		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
6053 			char buffer[64];
6054 			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
6055 
6056 			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
6057 			    ifp->if_xname, m_pktlen(m));
6058 			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
6059 				log_hexdump(buffer, buflen);
6060 			}
6061 		}
6062 
6063 		pktap_input(ifp, protocol_family, m, frame_header);
6064 
6065 		/* Drop v4 packets received on CLAT46 enabled cell interface */
6066 		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6067 		    ifp->if_type == IFT_CELLULAR) {
6068 			m_freem(m);
6069 			ip6stat.ip6s_clat464_in_v4_drop++;
6070 			goto next;
6071 		}
6072 
6073 		/* Translate the packet if it is received on CLAT interface */
6074 		if ((m->m_flags & M_PROMISC) == 0 &&
6075 		    protocol_family == PF_INET6 &&
6076 		    IS_INTF_CLAT46(ifp) &&
6077 		    dlil_is_clat_needed(protocol_family, m)) {
6078 			char *data = NULL;
6079 			struct ether_header eh;
6080 			struct ether_header *ehp = NULL;
6081 
6082 			if (ifp->if_type == IFT_ETHER) {
6083 				ehp = (struct ether_header *)(void *)frame_header;
6084 				/* Skip RX Ethernet packets if they are not IPV6 */
6085 				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
6086 					goto skip_clat;
6087 				}
6088 
6089 				/* Keep a copy of frame_header for Ethernet packets */
6090 				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
6091 			}
6092 			error = dlil_clat64(ifp, &protocol_family, &m);
6093 			data = mtod(m, char*);
6094 			if (error != 0) {
6095 				m_freem(m);
6096 				ip6stat.ip6s_clat464_in_drop++;
6097 				goto next;
6098 			}
6099 			/* Native v6 should be a no-op */
6100 			if (protocol_family != PF_INET) {
6101 				goto skip_clat;
6102 			}
6103 
6104 			/* Do this only for translated v4 packets. */
6105 			switch (ifp->if_type) {
6106 			case IFT_CELLULAR:
6107 				frame_header = data;
6108 				break;
6109 			case IFT_ETHER:
6110 				/*
6111 				 * Drop if the mbuf doesn't have enough
6112 				 * space for Ethernet header
6113 				 */
6114 				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
6115 					m_freem(m);
6116 					ip6stat.ip6s_clat464_in_drop++;
6117 					goto next;
6118 				}
6119 				/*
6120 				 * Set frame_header to point ETHER_HDR_LEN bytes
6121 				 * preceding the data pointer. Change
6122 				 * the ether_type too.
6123 				 */
6124 				frame_header = data - ETHER_HDR_LEN;
6125 				eh.ether_type = htons(ETHERTYPE_IP);
6126 				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
6127 				break;
6128 			}
6129 		}
6130 skip_clat:
6131 		/*
6132 		 * Match the wake packet against the list of ports that has
6133 		 * been queried by the driver before the device went to sleep
6134 		 */
6135 		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
6136 			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
6137 				if_ports_used_match_mbuf(ifp, protocol_family, m);
6138 			}
6139 		}
6140 		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
6141 		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
6142 			dlil_input_cksum_dbg(ifp, m, frame_header,
6143 			    protocol_family);
6144 		}
6145 		/*
6146 		 * For partial checksum offload, we expect the driver to
6147 		 * set the start offset indicating the start of the span
6148 		 * that is covered by the hardware-computed checksum;
6149 		 * adjust this start offset accordingly because the data
6150 		 * pointer has been advanced beyond the link-layer header.
6151 		 *
6152 		 * Virtual lan types (bridge, vlan, bond) can call
6153 		 * dlil_input_packet_list() with the same packet with the
6154 		 * checksum flags set. Set a flag indicating that the
6155 		 * adjustment has already been done.
6156 		 */
6157 		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
6158 			/* adjustment has already been done */
6159 		} else if ((m->m_pkthdr.csum_flags &
6160 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6161 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6162 			int adj;
6163 			if (frame_header == NULL ||
6164 			    frame_header < (char *)mbuf_datastart(m) ||
6165 			    frame_header > (char *)m->m_data ||
6166 			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
6167 			    m->m_pkthdr.csum_rx_start) {
6168 				m->m_pkthdr.csum_data = 0;
6169 				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
6170 				hwcksum_in_invalidated++;
6171 			} else {
6172 				m->m_pkthdr.csum_rx_start -= adj;
6173 			}
6174 			/* make sure we don't adjust more than once */
6175 			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
6176 		}
6177 		if (clat_debug) {
6178 			pktap_input(ifp, protocol_family, m, frame_header);
6179 		}
6180 
6181 		if (m->m_flags & (M_BCAST | M_MCAST)) {
6182 			os_atomic_inc(&ifp->if_imcasts, relaxed);
6183 		}
6184 
6185 		/* run interface filters */
6186 		error = dlil_interface_filters_input(ifp, &m,
6187 		    &frame_header, protocol_family, skip_bridge_filter);
6188 		if (error != 0) {
6189 			if (error != EJUSTRETURN) {
6190 				m_freem(m);
6191 			}
6192 			goto next;
6193 		}
6194 		/*
6195 		 * VLAN and Bond interfaces receive packets by attaching
6196 		 * a "protocol" to the underlying interface.
6197 		 * A promiscuous packet needs to be delivered to the
6198 		 * VLAN or Bond interface since:
6199 		 * - a Bond interface member may not support setting the
6200 		 *   MAC address, so packets are inherently "promiscuous"
6201 		 * - a VLAN or Bond interface could be a member of a bridge,
6202 		 *   where promiscuous packets correspond to other
6203 		 *   devices that the bridge forwards packets to/from
6204 		 */
6205 		if ((m->m_flags & M_PROMISC) != 0) {
6206 			switch (protocol_family) {
6207 			case PF_VLAN:
6208 			case PF_BOND:
6209 				/* VLAN and Bond get promiscuous packets */
6210 				break;
6211 			default:
6212 				m_freem(m);
6213 				goto next;
6214 			}
6215 		}
6216 
6217 		/* Lookup the protocol attachment to this interface */
6218 		if (protocol_family == 0) {
6219 			ifproto = NULL;
6220 		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
6221 		    (last_ifproto->protocol_family == protocol_family)) {
6222 			VERIFY(ifproto == NULL);
6223 			ifproto = last_ifproto;
6224 			if_proto_ref(last_ifproto);
6225 		} else {
6226 			VERIFY(ifproto == NULL);
6227 			ifnet_lock_shared(ifp);
6228 			/* callee holds a proto refcnt upon success */
6229 			ifproto = find_attached_proto(ifp, protocol_family);
6230 			ifnet_lock_done(ifp);
6231 		}
6232 		if (ifproto == NULL) {
6233 			/* no protocol for this packet, discard */
6234 			m_freem(m);
6235 			goto next;
6236 		}
6237 		if (ifproto != last_ifproto) {
6238 			if (last_ifproto != NULL) {
6239 				/* pass up the list for the previous protocol */
6240 				dlil_ifproto_input(last_ifproto, pkt_first);
6241 				pkt_first = NULL;
6242 				if_proto_free(last_ifproto);
6243 			}
6244 			last_ifproto = ifproto;
6245 			if_proto_ref(ifproto);
6246 		}
6247 		/* extend the list */
6248 		m->m_pkthdr.pkt_hdr = frame_header;
6249 		if (pkt_first == NULL) {
6250 			pkt_first = m;
6251 		} else {
6252 			*pkt_next = m;
6253 		}
6254 		pkt_next = &m->m_nextpkt;
6255 
6256 next:
6257 		if (next_packet == NULL && last_ifproto != NULL) {
6258 			/* pass up the last list of packets */
6259 			dlil_ifproto_input(last_ifproto, pkt_first);
6260 			if_proto_free(last_ifproto);
6261 			last_ifproto = NULL;
6262 		}
6263 		if (ifproto != NULL) {
6264 			if_proto_free(ifproto);
6265 			ifproto = NULL;
6266 		}
6267 
6268 		m = next_packet;
6269 
6270 		/* update the driver's multicast filter, if needed */
6271 		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6272 			ifp->if_updatemcasts = 0;
6273 		}
6274 		if (iorefcnt == 1) {
6275 			/* If the next mbuf is on a different interface, unlock data-mov */
6276 			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
6277 				ifnet_datamov_end(ifp);
6278 				iorefcnt = 0;
6279 			}
6280 		}
6281 	}
6282 
6283 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6284 }
6285 
6286 static errno_t
6287 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6288 {
6289 	errno_t err;
6290 
6291 	if (sync) {
6292 		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6293 		if (err == EAFNOSUPPORT) {
6294 			err = 0;
6295 		}
6296 	} else {
6297 		ifnet_ioctl_async(ifp, SIOCADDMULTI);
6298 		err = 0;
6299 	}
6300 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6301 	    "(err=%d)\n", if_name(ifp),
6302 	    (err == 0 ? "successfully restored" : "failed to restore"),
6303 	    ifp->if_updatemcasts, err);
6304 
6305 	/* just return success */
6306 	return 0;
6307 }
6308 
6309 static errno_t
6310 if_mcasts_update_async(struct ifnet *ifp)
6311 {
6312 	return if_mcasts_update_common(ifp, false);
6313 }
6314 
6315 errno_t
6316 if_mcasts_update(struct ifnet *ifp)
6317 {
6318 	return if_mcasts_update_common(ifp, true);
6319 }
6320 
6321 /* If ifp is set, we will increment the generation for the interface */
6322 int
6323 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6324 {
6325 	if (ifp != NULL) {
6326 		ifnet_increment_generation(ifp);
6327 	}
6328 
6329 #if NECP
6330 	necp_update_all_clients();
6331 #endif /* NECP */
6332 
6333 	return kev_post_msg(event);
6334 }
6335 
6336 __private_extern__ void
6337 dlil_post_sifflags_msg(struct ifnet * ifp)
6338 {
6339 	struct kev_msg ev_msg;
6340 	struct net_event_data ev_data;
6341 
6342 	bzero(&ev_data, sizeof(ev_data));
6343 	bzero(&ev_msg, sizeof(ev_msg));
6344 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6345 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6346 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6347 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6348 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6349 	ev_data.if_family = ifp->if_family;
6350 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6351 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6352 	ev_msg.dv[0].data_ptr = &ev_data;
6353 	ev_msg.dv[1].data_length = 0;
6354 	dlil_post_complete_msg(ifp, &ev_msg);
6355 }
6356 
6357 #define TMP_IF_PROTO_ARR_SIZE   10
6358 static int
6359 dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
6360 {
6361 	struct ifnet_filter *filter = NULL;
6362 	struct if_proto *proto = NULL;
6363 	int if_proto_count = 0;
6364 	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
6365 	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
6366 	int tmp_ifproto_arr_idx = 0;
6367 
6368 	/*
6369 	 * Pass the event to the interface filters
6370 	 */
6371 	lck_mtx_lock_spin(&ifp->if_flt_lock);
6372 	/* prevent filter list from changing in case we drop the lock */
6373 	if_flt_monitor_busy(ifp);
6374 	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
6375 		if (filter->filt_event != NULL) {
6376 			lck_mtx_unlock(&ifp->if_flt_lock);
6377 
6378 			filter->filt_event(filter->filt_cookie, ifp,
6379 			    filter->filt_protocol, event);
6380 
6381 			lck_mtx_lock_spin(&ifp->if_flt_lock);
6382 		}
6383 	}
6384 	/* we're done with the filter list */
6385 	if_flt_monitor_unbusy(ifp);
6386 	lck_mtx_unlock(&ifp->if_flt_lock);
6387 
6388 	/* Get an io ref count if the interface is attached */
6389 	if (!ifnet_is_attached(ifp, 1)) {
6390 		goto done;
6391 	}
6392 
6393 	/*
6394 	 * An embedded tmp_list_entry in if_proto may still get
6395 	 * overwritten by another thread after giving up the ifnet lock,
6396 	 * therefore we avoid embedded pointers here.
6397 	 */
6398 	ifnet_lock_shared(ifp);
6399 	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
6400 	if (if_proto_count) {
6401 		int i;
6402 		VERIFY(ifp->if_proto_hash != NULL);
6403 		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
6404 			tmp_ifproto_arr = tmp_ifproto_stack_arr;
6405 		} else {
6406 			tmp_ifproto_arr = kalloc_type(struct if_proto *,
6407 			    if_proto_count, Z_WAITOK | Z_ZERO);
6408 			if (tmp_ifproto_arr == NULL) {
6409 				ifnet_lock_done(ifp);
6410 				goto cleanup;
6411 			}
6412 		}
6413 
6414 		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
6415 			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
6416 			    next_hash) {
6417 				if_proto_ref(proto);
6418 				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
6419 				tmp_ifproto_arr_idx++;
6420 			}
6421 		}
6422 		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
6423 	}
6424 	ifnet_lock_done(ifp);
6425 
6426 	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
6427 	    tmp_ifproto_arr_idx++) {
6428 		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
6429 		VERIFY(proto != NULL);
6430 		proto_media_event eventp =
6431 		    (proto->proto_kpi == kProtoKPI_v1 ?
6432 		    proto->kpi.v1.event :
6433 		    proto->kpi.v2.event);
6434 
6435 		if (eventp != NULL) {
6436 			eventp(ifp, proto->protocol_family,
6437 			    event);
6438 		}
6439 		if_proto_free(proto);
6440 	}
6441 
6442 cleanup:
6443 	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
6444 		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
6445 	}
6446 
6447 	/* Pass the event to the interface */
6448 	if (ifp->if_event != NULL) {
6449 		ifp->if_event(ifp, event);
6450 	}
6451 
6452 	/* Release the io ref count */
6453 	ifnet_decr_iorefcnt(ifp);
6454 done:
6455 	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
6456 }
6457 
6458 errno_t
6459 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6460 {
6461 	struct kev_msg kev_msg;
6462 	int result = 0;
6463 
6464 	if (ifp == NULL || event == NULL) {
6465 		return EINVAL;
6466 	}
6467 
6468 	bzero(&kev_msg, sizeof(kev_msg));
6469 	kev_msg.vendor_code = event->vendor_code;
6470 	kev_msg.kev_class = event->kev_class;
6471 	kev_msg.kev_subclass = event->kev_subclass;
6472 	kev_msg.event_code = event->event_code;
6473 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6474 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6475 	kev_msg.dv[1].data_length = 0;
6476 
6477 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6478 
6479 	return result;
6480 }
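
/*
 * Illustrative sketch (not part of the build) of how a driver-side caller
 * might hand a pre-built kernel event to ifnet_event() above.  The constant
 * names used below (KEV_VENDOR_APPLE, KEV_NETWORK_CLASS, KEV_DL_SUBCLASS,
 * KEV_DL_LINK_ON) are assumed from the kernel event headers; only the
 * kern_event_msg fields copied above are relied upon here, and the event
 * payload is left zeroed for brevity.
 *
 *	char buf[KEV_MSG_HEADER_SIZE + sizeof(struct net_event_data)] = {};
 *	struct kern_event_msg *ev = (struct kern_event_msg *)(void *)buf;
 *
 *	ev->total_size   = sizeof(buf);
 *	ev->vendor_code  = KEV_VENDOR_APPLE;
 *	ev->kev_class    = KEV_NETWORK_CLASS;
 *	ev->kev_subclass = KEV_DL_SUBCLASS;
 *	ev->event_code   = KEV_DL_LINK_ON;
 *	(void) ifnet_event(ifp, ev);
 *
 * ifnet_event() copies the header into a kev_msg and forwards it to
 * dlil_event_internal(), which fans the event out to the interface
 * filters, the attached protocols, and the interface itself.
 */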
6481 
6482 static void
6483 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6484 {
6485 	mbuf_t  n = m;
6486 	int chainlen = 0;
6487 
6488 	while (n != NULL) {
6489 		chainlen++;
6490 		n = n->m_next;
6491 	}
6492 	switch (chainlen) {
6493 	case 0:
6494 		break;
6495 	case 1:
6496 		os_atomic_inc(&cls->cls_one, relaxed);
6497 		break;
6498 	case 2:
6499 		os_atomic_inc(&cls->cls_two, relaxed);
6500 		break;
6501 	case 3:
6502 		os_atomic_inc(&cls->cls_three, relaxed);
6503 		break;
6504 	case 4:
6505 		os_atomic_inc(&cls->cls_four, relaxed);
6506 		break;
6507 	case 5:
6508 	default:
6509 		os_atomic_inc(&cls->cls_five_or_more, relaxed);
6510 		break;
6511 	}
6512 }
6513 
6514 #if CONFIG_DTRACE
6515 __attribute__((noinline))
6516 static void
6517 dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
6518 {
6519 	if (proto_family == PF_INET) {
6520 		struct ip *ip = mtod(m, struct ip *);
6521 		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
6522 		    struct ip *, ip, struct ifnet *, ifp,
6523 		    struct ip *, ip, struct ip6_hdr *, NULL);
6524 	} else if (proto_family == PF_INET6) {
6525 		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
6526 		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
6527 		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
6528 		    struct ip *, NULL, struct ip6_hdr *, ip6);
6529 	}
6530 }
6531 #endif /* CONFIG_DTRACE */
6532 
6533 /*
6534  * dlil_output
6535  *
6536  * Caller should have a lock on the protocol domain if the protocol
6537  * doesn't support finer grained locking. In most cases, the lock
6538  * will be held from the socket layer and won't be released until
6539  * we return back to the socket layer.
6540  *
6541  * This does mean that we must take a protocol lock before we take
6542  * an interface lock if we're going to take both. This makes sense
6543  * because a protocol is likely to interact with an ifp while it
6544  * is under the protocol lock.
6545  *
6546  * An advisory code will be returned if adv is not null. This
6547  * can be used to provide feedback about interface queues to the
6548  * application.
6549  */
6550 errno_t
6551 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6552     void *route, const struct sockaddr *dest, int flags, struct flowadv *adv)
6553 {
6554 	char *frame_type = NULL;
6555 	char *dst_linkaddr = NULL;
6556 	int retval = 0;
6557 	char frame_type_buffer[DLIL_MAX_FRAME_TYPE_BUFFER_SIZE];
6558 	char dst_linkaddr_buffer[DLIL_MAX_LINKADDR_BUFFER_SIZE];
6559 	struct if_proto *proto = NULL;
6560 	mbuf_t  m = NULL;
6561 	mbuf_t  send_head = NULL;
6562 	mbuf_t  *send_tail = &send_head;
6563 	int iorefcnt = 0;
6564 	u_int32_t pre = 0, post = 0;
6565 	u_int32_t fpkts = 0, fbytes = 0;
6566 	int32_t flen = 0;
6567 	struct timespec now;
6568 	u_int64_t now_nsec;
6569 	boolean_t did_clat46 = FALSE;
6570 	protocol_family_t old_proto_family = proto_family;
6571 	struct sockaddr_in6 dest6;
6572 	struct rtentry *rt = NULL;
6573 	u_int16_t m_loop_set = 0;
6574 	bool raw = (flags & DLIL_OUTPUT_FLAGS_RAW) != 0;
6575 
6576 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6577 
6578 	/*
6579 	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6580 	 * from happening while this operation is in progress
6581 	 */
6582 	if (!ifnet_datamov_begin(ifp)) {
6583 		retval = ENXIO;
6584 		goto cleanup;
6585 	}
6586 	iorefcnt = 1;
6587 
6588 	VERIFY(ifp->if_output_dlil != NULL);
6589 
6590 	/* update the driver's multicast filter, if needed */
6591 	if (ifp->if_updatemcasts > 0) {
6592 		if_mcasts_update_async(ifp);
6593 		ifp->if_updatemcasts = 0;
6594 	}
6595 
6596 	frame_type = frame_type_buffer;
6597 	dst_linkaddr = dst_linkaddr_buffer;
6598 
6599 	if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6600 		ifnet_lock_shared(ifp);
6601 		/* callee holds a proto refcnt upon success */
6602 		proto = find_attached_proto(ifp, proto_family);
6603 		if (proto == NULL) {
6604 			ifnet_lock_done(ifp);
6605 			retval = ENXIO;
6606 			goto cleanup;
6607 		}
6608 		ifnet_lock_done(ifp);
6609 	}
6610 
6611 preout_again:
6612 	if (packetlist == NULL) {
6613 		goto cleanup;
6614 	}
6615 
6616 	m = packetlist;
6617 	packetlist = packetlist->m_nextpkt;
6618 	m->m_nextpkt = NULL;
6619 
6620 	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6621 
6622 	/*
6623 	 * Perform address family translation for the first
6624 	 * packet outside the loop in order to perform address
6625 	 * lookup for the translated proto family.
6626 	 */
6627 	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6628 	    (ifp->if_type == IFT_CELLULAR ||
6629 	    dlil_is_clat_needed(proto_family, m))) {
6630 		retval = dlil_clat46(ifp, &proto_family, &m);
6631 		/*
6632 		 * Go to the next packet if translation fails
6633 		 */
6634 		if (retval != 0) {
6635 			m_freem(m);
6636 			m = NULL;
6637 			ip6stat.ip6s_clat464_out_drop++;
6638 			/* Make sure that the proto family is PF_INET */
6639 			ASSERT(proto_family == PF_INET);
6640 			goto preout_again;
6641 		}
6642 		/*
6643 		 * Free the old proto and point it at the IPv6 proto structure.
6644 		 *
6645 		 * proto is changed the first time we have successfully
6646 		 * performed address family translation.
6647 		 */
6648 		if (!did_clat46 && proto_family == PF_INET6) {
6649 			did_clat46 = TRUE;
6650 
6651 			if (proto != NULL) {
6652 				if_proto_free(proto);
6653 			}
6654 			ifnet_lock_shared(ifp);
6655 			/* callee holds a proto refcnt upon success */
6656 			proto = find_attached_proto(ifp, proto_family);
6657 			if (proto == NULL) {
6658 				ifnet_lock_done(ifp);
6659 				retval = ENXIO;
6660 				m_freem(m);
6661 				m = NULL;
6662 				goto cleanup;
6663 			}
6664 			ifnet_lock_done(ifp);
6665 			if (ifp->if_type == IFT_ETHER) {
6666 				/* Update the dest to translated v6 address */
6667 				dest6.sin6_len = sizeof(struct sockaddr_in6);
6668 				dest6.sin6_family = AF_INET6;
6669 				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6670 				dest = SA(&dest6);
6671 
6672 				/*
6673 				 * Look up the route to the translated destination.
6674 				 * This route ref is freed during cleanup.
6675 				 */
6676 				rt = rtalloc1_scoped(SA(&dest6),
6677 				    0, 0, ifp->if_index);
6678 
6679 				route = rt;
6680 			}
6681 		}
6682 	}
6683 
6684 	/*
6685 	 * This path handles a packet chain going to the same destination.
6686 	 * The pre-output routine is used either to trigger resolution of
6687 	 * the next hop or to retrieve the next hop's link-layer addressing.
6688 	 * For example: the ether_inet(6)_pre_output routines.
6689 	 *
6690 	 * If the routine returns EJUSTRETURN, the packet has been queued,
6691 	 * and therefore we jump back to preout_again for the following
6692 	 * packet in the chain.
6693 	 *
6694 	 * For errors other than EJUSTRETURN, the current packet is freed
6695 	 * and the rest of the chain (pointed to by packetlist) is freed
6696 	 * as part of cleanup.
6697 	 *
6698 	 * Otherwise, if there is no error, the retrieved information is
6699 	 * used for all the packets in the chain.
6700 	 */
6701 	if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6702 		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6703 		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6704 		retval = 0;
6705 		if (preoutp != NULL) {
6706 			retval = preoutp(ifp, proto_family, &m, dest, route,
6707 			    frame_type, dst_linkaddr);
6708 
6709 			if (retval != 0) {
6710 				if (retval == EJUSTRETURN) {
6711 					goto preout_again;
6712 				}
6713 				m_freem(m);
6714 				m = NULL;
6715 				goto cleanup;
6716 			}
6717 		}
6718 	}
6719 
6720 	nanouptime(&now);
6721 	net_timernsec(&now, &now_nsec);
6722 
6723 	do {
6724 		m_add_hdr_crumb_interface_output(m, ifp->if_index, false);
6725 		/*
6726 		 * pkt_hdr is set here to point to m_data prior to
6727 		 * calling into the framer. This value of pkt_hdr is
6728 		 * used by the netif GSO logic to retrieve the IP header
6729 		 * for TCP packets offloaded for TSO processing.
6730 		 */
6731 		if (raw && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6732 			uint8_t vlan_encap_len = 0;
6733 
6734 			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6735 				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6736 			}
6737 			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6738 		} else {
6739 			m->m_pkthdr.pkt_hdr = mtod(m, void *);
6740 		}
6741 
6742 		/*
6743 		 * Perform address family translation if needed.
6744 		 * For now we only support stateless 4 to 6 translation
6745 		 * on the out path.
6746 		 *
6747 		 * The routine below translates IP header, updates protocol
6748 		 * checksum and also translates ICMP.
6749 		 *
6750 		 * We skip the first packet as it is already translated and
6751 		 * the proto family is set to PF_INET6.
6752 		 */
6753 		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6754 		    (ifp->if_type == IFT_CELLULAR ||
6755 		    dlil_is_clat_needed(proto_family, m))) {
6756 			retval = dlil_clat46(ifp, &proto_family, &m);
6757 			/* Go to the next packet if the translation fails */
6758 			if (retval != 0) {
6759 				m_freem(m);
6760 				m = NULL;
6761 				ip6stat.ip6s_clat464_out_drop++;
6762 				goto next;
6763 			}
6764 		}
6765 
6766 #if CONFIG_DTRACE
6767 		if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6768 			dlil_output_dtrace(ifp, proto_family, m);
6769 		}
6770 #endif /* CONFIG_DTRACE */
6771 
6772 		if (flags == DLIL_OUTPUT_FLAGS_NONE && ifp->if_framer != NULL) {
6773 			int rcvif_set = 0;
6774 
6775 			/*
6776 			 * If this is a broadcast packet that needs to be
6777 			 * looped back into the system, set the inbound ifp
6778 			 * to that of the outbound ifp.  This will allow
6779 			 * us to determine that it is a legitimate packet
6780 			 * for the system.  Only set the ifp if it's not
6781 			 * already set, just to be safe.
6782 			 */
6783 			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6784 			    m->m_pkthdr.rcvif == NULL) {
6785 				m->m_pkthdr.rcvif = ifp;
6786 				rcvif_set = 1;
6787 			}
6788 			m_loop_set = m->m_flags & M_LOOP;
6789 			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6790 			    frame_type, &pre, &post);
6791 			if (retval != 0) {
6792 				if (retval != EJUSTRETURN) {
6793 					m_freem(m);
6794 				}
6795 				goto next;
6796 			}
6797 
6798 			/*
6799 			 * For partial checksum offload, adjust the start
6800 			 * and stuff offsets based on the prepended header.
6801 			 */
6802 			if ((m->m_pkthdr.csum_flags &
6803 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6804 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6805 				m->m_pkthdr.csum_tx_stuff += pre;
6806 				m->m_pkthdr.csum_tx_start += pre;
6807 			}
6808 
6809 			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6810 				dlil_output_cksum_dbg(ifp, m, pre,
6811 				    proto_family);
6812 			}
6813 
6814 			/*
6815 			 * Clear the ifp if it was set above, and to be
6816 			 * safe, only if it is still the same as the
6817 			 * outbound ifp we have in context.  If it was
6818 			 * looped back, then a copy of it was sent to the
6819 			 * loopback interface with the rcvif set, and we
6820 			 * are clearing the one that will go down to the
6821 			 * layer below.
6822 			 */
6823 			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6824 				m->m_pkthdr.rcvif = NULL;
6825 			}
6826 		}
6827 
6828 		/*
6829 		 * Let interface filters (if any) do their thing ...
6830 		 */
6831 		if ((flags & DLIL_OUTPUT_FLAGS_SKIP_IF_FILTERS) == 0) {
6832 			retval = dlil_interface_filters_output(ifp, &m, proto_family);
6833 			if (retval != 0) {
6834 				if (retval != EJUSTRETURN) {
6835 					m_freem(m);
6836 				}
6837 				goto next;
6838 			}
6839 		}
6840 		/*
6841 		 * Strip away the M_PROTO1 bit prior to sending the packet
6842 		 * to the driver, as this flag may be used by the driver
6843 		 */
6844 		m->m_flags &= ~M_PROTO1;
6845 
6846 		/*
6847 		 * If the underlying interface is not capable of handling a
6848 		 * packet whose data portion spans across physically disjoint
6849 		 * pages, we need to "normalize" the packet so that we pass
6850 		 * down a chain of mbufs where each mbuf points to a span that
6851 		 * resides in the system page boundary.  If the packet does
6852 		 * not cross page(s), the following is a no-op.
6853 		 */
6854 		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6855 			if ((m = m_normalize(m)) == NULL) {
6856 				goto next;
6857 			}
6858 		}
6859 
6860 		/*
6861 		 * If this is a TSO packet, make sure the interface still
6862 		 * advertises TSO capability.
6863 		 */
6864 		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6865 			retval = EMSGSIZE;
6866 			m_freem(m);
6867 			goto cleanup;
6868 		}
6869 
6870 		ifp_inc_traffic_class_out(ifp, m);
6871 
6872 #if SKYWALK
6873 		/*
6874 		 * For native skywalk devices, packets will be passed to pktap
6875 		 * after GSO or after the mbuf to packet conversion.
6876 		 * This is done for IPv4/IPv6 packets only because there is no
6877 		 * space in the mbuf to pass down the proto family.
6878 		 */
6879 		if (dlil_is_native_netif_nexus(ifp)) {
6880 			if (raw || m->m_pkthdr.pkt_proto == 0) {
6881 				pktap_output(ifp, proto_family, m, pre, post);
6882 				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6883 			}
6884 		} else {
6885 			pktap_output(ifp, proto_family, m, pre, post);
6886 		}
6887 #else /* SKYWALK */
6888 		pktap_output(ifp, proto_family, m, pre, post);
6889 #endif /* SKYWALK */
6890 
6891 		/*
6892 		 * Count the number of elements in the mbuf chain
6893 		 */
6894 		if (tx_chain_len_count) {
6895 			dlil_count_chain_len(m, &tx_chain_len_stats);
6896 		}
6897 
6898 		/*
6899 		 * Discard partial sum information if this packet originated
6900 		 * from another interface; the packet would already have the
6901 		 * final checksum and we shouldn't recompute it.
6902 		 */
6903 		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6904 		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6905 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6906 			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6907 			m->m_pkthdr.csum_data = 0;
6908 		}
6909 
6910 		/*
6911 		 * Finally, call the driver.
6912 		 */
6913 		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6914 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6915 				flen += (m_pktlen(m) - (pre + post));
6916 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6917 			}
6918 			(void) mbuf_set_timestamp(m, now_nsec, TRUE);
6919 
6920 			*send_tail = m;
6921 			send_tail = &m->m_nextpkt;
6922 		} else {
6923 			/*
6924 			 * Record timestamp; ifnet_enqueue() will use this info
6925 			 * rather than redoing the work.
6926 			 */
6927 			nanouptime(&now);
6928 			net_timernsec(&now, &now_nsec);
6929 			(void) mbuf_set_timestamp(m, now_nsec, TRUE);
6930 
6931 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6932 				flen = (m_pktlen(m) - (pre + post));
6933 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6934 			} else {
6935 				flen = 0;
6936 			}
6937 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6938 			    0, 0, 0, 0, 0);
6939 			retval = (*ifp->if_output_dlil)(ifp, m);
6940 			if (retval == EQFULL || retval == EQSUSPENDED) {
6941 				if (adv != NULL && adv->code == FADV_SUCCESS) {
6942 					adv->code = (retval == EQFULL ?
6943 					    FADV_FLOW_CONTROLLED :
6944 					    FADV_SUSPENDED);
6945 				}
6946 				retval = 0;
6947 			}
6948 			if (retval == 0 && flen > 0) {
6949 				fbytes += flen;
6950 				fpkts++;
6951 			}
6952 			if (retval != 0 && dlil_verbose) {
6953 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
6954 				    __func__, if_name(ifp),
6955 				    retval);
6956 			}
6957 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
6958 			    0, 0, 0, 0, 0);
6959 		}
6960 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6961 
6962 next:
6963 		m = packetlist;
6964 		if (m != NULL) {
6965 			m->m_flags |= m_loop_set;
6966 			packetlist = packetlist->m_nextpkt;
6967 			m->m_nextpkt = NULL;
6968 		}
6969 		/* Reset the proto family to old proto family for CLAT */
6970 		if (did_clat46) {
6971 			proto_family = old_proto_family;
6972 		}
6973 	} while (m != NULL);
6974 
6975 	if (send_head != NULL) {
6976 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6977 		    0, 0, 0, 0, 0);
6978 		if (ifp->if_eflags & IFEF_SENDLIST) {
6979 			retval = (*ifp->if_output_dlil)(ifp, send_head);
6980 			if (retval == EQFULL || retval == EQSUSPENDED) {
6981 				if (adv != NULL) {
6982 					adv->code = (retval == EQFULL ?
6983 					    FADV_FLOW_CONTROLLED :
6984 					    FADV_SUSPENDED);
6985 				}
6986 				retval = 0;
6987 			}
6988 			if (retval == 0 && flen > 0) {
6989 				fbytes += flen;
6990 				fpkts++;
6991 			}
6992 			if (retval != 0 && dlil_verbose) {
6993 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
6994 				    __func__, if_name(ifp), retval);
6995 			}
6996 		} else {
6997 			struct mbuf *send_m;
6998 			int enq_cnt = 0;
6999 			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7000 			while (send_head != NULL) {
7001 				send_m = send_head;
7002 				send_head = send_m->m_nextpkt;
7003 				send_m->m_nextpkt = NULL;
7004 				retval = (*ifp->if_output_dlil)(ifp, send_m);
7005 				if (retval == EQFULL || retval == EQSUSPENDED) {
7006 					if (adv != NULL) {
7007 						adv->code = (retval == EQFULL ?
7008 						    FADV_FLOW_CONTROLLED :
7009 						    FADV_SUSPENDED);
7010 					}
7011 					retval = 0;
7012 				}
7013 				if (retval == 0) {
7014 					enq_cnt++;
7015 					if (flen > 0) {
7016 						fpkts++;
7017 					}
7018 				}
7019 				if (retval != 0 && dlil_verbose) {
7020 					DLIL_PRINTF("%s: output error on %s "
7021 					    "retval = %d\n",
7022 					    __func__, if_name(ifp), retval);
7023 				}
7024 			}
7025 			if (enq_cnt > 0) {
7026 				fbytes += flen;
7027 				ifnet_start(ifp);
7028 			}
7029 		}
7030 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7031 	}
7032 
7033 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7034 
7035 cleanup:
7036 	if (fbytes > 0) {
7037 		ifp->if_fbytes += fbytes;
7038 	}
7039 	if (fpkts > 0) {
7040 		ifp->if_fpackets += fpkts;
7041 	}
7042 	if (proto != NULL) {
7043 		if_proto_free(proto);
7044 	}
7045 	if (packetlist) { /* if any packets are left, clean up */
7046 		mbuf_freem_list(packetlist);
7047 	}
7048 	if (retval == EJUSTRETURN) {
7049 		retval = 0;
7050 	}
7051 	if (iorefcnt == 1) {
7052 		ifnet_datamov_end(ifp);
7053 	}
7054 	if (rt != NULL) {
7055 		rtfree(rt);
7056 		rt = NULL;
7057 	}
7058 
7059 	return retval;
7060 }
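
/*
 * A minimal sketch (not part of the build) of a protocol-layer caller
 * consuming the flow advisory described in the comment above dlil_output().
 * The caller seeds adv.code with FADV_SUCCESS; FADV_FLOW_CONTROLLED or
 * FADV_SUSPENDED on return means the interface queue pushed back and the
 * flow should be throttled.  The flow_suspend() helper named here is
 * hypothetical and stands in for whatever feedback path the caller uses.
 *
 *	struct flowadv adv = { .code = FADV_SUCCESS };
 *	errno_t err;
 *
 *	err = dlil_output(ifp, PF_INET, m, NULL, dest,
 *	    DLIL_OUTPUT_FLAGS_NONE, &adv);
 *	if (err == 0 && adv.code != FADV_SUCCESS) {
 *		flow_suspend(flow);	// hypothetical: pause this flow
 *	}
 */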
7061 
7062 /*
7063  * This routine checks that the destination address is not a loopback,
7064  * link-local, multicast or broadcast address, i.e. that CLAT is needed.
7065  */
7066 static int
7067 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7068 {
7069 	int ret = 0;
7070 	switch (proto_family) {
7071 	case PF_INET: {
7072 		struct ip *iph = mtod(m, struct ip *);
7073 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7074 			ret = 1;
7075 		}
7076 		break;
7077 	}
7078 	case PF_INET6: {
7079 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7080 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7081 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7082 			ret = 1;
7083 		}
7084 		break;
7085 	}
7086 	}
7087 
7088 	return ret;
7089 }
7090 /*
7091  * @brief This routine translates an IPv4 packet to an IPv6 packet,
7092  *     updates the protocol checksum and also translates ICMP,
7093  *     including the inner header.
7094  *
7095  * @param ifp Pointer to the interface.
7096  * @param proto_family Pointer to the protocol family. It is updated if the
7097  *     function performs the translation successfully.
7098  * @param m Pointer to the mbuf pointer for the packet. Needed because this
7099  *     routine can end up replacing the mbuf with a different one.
7100  *
7101  * @return 0 on success or else a negative value.
7102  */
7103 static errno_t
7104 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7105 {
7106 	VERIFY(*proto_family == PF_INET);
7107 	VERIFY(IS_INTF_CLAT46(ifp));
7108 
7109 	pbuf_t pbuf_store, *pbuf = NULL;
7110 	struct ip *iph = NULL;
7111 	struct in_addr osrc, odst;
7112 	uint8_t proto = 0;
7113 	struct in6_addr src_storage = {};
7114 	struct in6_addr *src = NULL;
7115 	struct sockaddr_in6 dstsock = {};
7116 	int error = 0;
7117 	uint16_t off = 0;
7118 	uint16_t tot_len = 0;
7119 	uint16_t ip_id_val = 0;
7120 	uint16_t ip_frag_off = 0;
7121 
7122 	boolean_t is_frag = FALSE;
7123 	boolean_t is_first_frag = TRUE;
7124 	boolean_t is_last_frag = TRUE;
7125 
7126 	pbuf_init_mbuf(&pbuf_store, *m, ifp);
7127 	pbuf = &pbuf_store;
7128 	iph = pbuf->pb_data;
7129 
7130 	osrc = iph->ip_src;
7131 	odst = iph->ip_dst;
7132 	proto = iph->ip_p;
7133 	off = (uint16_t)(iph->ip_hl << 2);
7134 	ip_id_val = iph->ip_id;
7135 	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7136 
7137 	tot_len = ntohs(iph->ip_len);
7138 
7139 	/*
7140 	 * For packets that are not first fragments
7141 	 * we only need to adjust the checksum.
7142 	 * For 4-to-6, the Fragment header gets appended
7143 	 * after protocol translation.
7144 	 */
7145 	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7146 		is_frag = TRUE;
7147 
7148 		/* If the offset is not zero, it is not first frag */
7149 		if (ip_frag_off != 0) {
7150 			is_first_frag = FALSE;
7151 		}
7152 
7153 		/* If IP_MF is set, then it is not last frag */
7154 		if (ntohs(iph->ip_off) & IP_MF) {
7155 			is_last_frag = FALSE;
7156 		}
7157 	}
7158 
7159 	/*
7160 	 * Translate IPv4 destination to IPv6 destination by using the
7161 	 * prefixes learned through prior PLAT discovery.
7162 	 */
7163 	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dstsock.sin6_addr)) != 0) {
7164 		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7165 		goto cleanup;
7166 	}
7167 
7168 	dstsock.sin6_len = sizeof(struct sockaddr_in6);
7169 	dstsock.sin6_family = AF_INET6;
7170 
7171 	/*
7172 	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
7173 	 * translation.
7174 	 */
7175 	src = in6_selectsrc_core(&dstsock, 0, ifp, 0, &src_storage, NULL, &error,
7176 	    NULL, NULL, TRUE);
7177 
7178 	if (src == NULL) {
7179 		ip6stat.ip6s_clat464_out_nov6addr_drop++;
7180 		error = -1;
7181 		goto cleanup;
7182 	}
7183 
7184 
7185 	/* Translate the IP header part first */
7186 	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7187 	    iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1;
7188 
7189 	iph = NULL;     /* Invalidate iph as pbuf has been modified */
7190 
7191 	if (error != 0) {
7192 		ip6stat.ip6s_clat464_out_46transfail_drop++;
7193 		goto cleanup;
7194 	}
7195 
7196 	/*
7197 	 * Translate protocol header, update checksum, checksum flags
7198 	 * and related fields.
7199 	 */
7200 	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7201 	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7202 
7203 	if (error != 0) {
7204 		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7205 		goto cleanup;
7206 	}
7207 
7208 	/* Now insert the IPv6 fragment header */
7209 	if (is_frag) {
7210 		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7211 
7212 		if (error != 0) {
7213 			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7214 			goto cleanup;
7215 		}
7216 	}
7217 
7218 cleanup:
7219 	if (pbuf_is_valid(pbuf)) {
7220 		*m = pbuf->pb_mbuf;
7221 		pbuf->pb_mbuf = NULL;
7222 		pbuf_destroy(pbuf);
7223 	} else {
7224 		error = -1;
7225 		*m = NULL;
7226 		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7227 	}
7228 
7229 	if (error == 0) {
7230 		*proto_family = PF_INET6;
7231 		ip6stat.ip6s_clat464_out_success++;
7232 	}
7233 
7234 	return error;
7235 }
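
/*
 * Worked example of the 4-to-6 translation performed above, assuming the
 * PLAT prefix learned for this interface is the well-known NAT64 prefix
 * 64:ff9b::/96 (RFC 6052); the real prefix comes from prior PLAT discovery
 * and may differ:
 *
 *	IPv4 destination   192.0.2.33
 *	synthesized IPv6   64:ff9b::192.0.2.33   (prefix + embedded IPv4)
 *	IPv6 source        the interface's reserved CLAT46 address, chosen
 *	                   by in6_selectsrc_core() above
 *
 * nat464_translate_46() rewrites the IP header, nat464_translate_proto()
 * then fixes up the transport checksum (and ICMP), and a Fragment header
 * is inserted only when the original IPv4 packet was fragmented.
 */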
7236 
7237 /*
7238  * @brief This routine translates an incoming IPv6 packet to IPv4,
7239  *     updates the protocol checksum and also translates the ICMPv6
7240  *     outer and inner headers.
7241  *
7242  * @return 0 on success or else a negative value.
7243  */
7244 static errno_t
7245 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7246 {
7247 	VERIFY(*proto_family == PF_INET6);
7248 	VERIFY(IS_INTF_CLAT46(ifp));
7249 
7250 	struct ip6_hdr *ip6h = NULL;
7251 	struct in6_addr osrc, odst;
7252 	uint8_t proto = 0;
7253 	struct in6_ifaddr *ia6_clat_dst = NULL;
7254 	struct in_ifaddr *ia4_clat_dst = NULL;
7255 	struct in_addr *dst = NULL;
7256 	struct in_addr src;
7257 	int error = 0;
7258 	uint32_t off = 0;
7259 	u_int64_t tot_len = 0;
7260 	uint8_t tos = 0;
7261 	boolean_t is_first_frag = TRUE;
7262 
7263 	/* Incoming mbuf does not contain valid IP6 header */
7264 	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7265 	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7266 	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7267 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7268 		return -1;
7269 	}
7270 
7271 	ip6h = mtod(*m, struct ip6_hdr *);
7272 	/* Validate that mbuf contains IP payload equal to ip6_plen  */
7273 	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7274 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7275 		return -1;
7276 	}
7277 
7278 	osrc = ip6h->ip6_src;
7279 	odst = ip6h->ip6_dst;
7280 
7281 	/*
7282 	 * Retrieve the local CLAT46 reserved IPv6 address.
7283 	 * Let the packet pass if we don't find one, as the flag
7284 	 * may get set before IPv6 configuration has taken place.
7285 	 */
7286 	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7287 	if (ia6_clat_dst == NULL) {
7288 		goto done;
7289 	}
7290 
7291 	/*
7292 	 * Check if the original dest in the packet is the same as the reserved
7293 	 * CLAT46 IPv6 address
7294 	 */
7295 	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7296 		pbuf_t pbuf_store, *pbuf = NULL;
7297 		pbuf_init_mbuf(&pbuf_store, *m, ifp);
7298 		pbuf = &pbuf_store;
7299 
7300 		/*
7301 		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
7302 		 * translation.
7303 		 */
7304 		ia4_clat_dst = inifa_ifpclatv4(ifp);
7305 		if (ia4_clat_dst == NULL) {
7306 			ifa_remref(&ia6_clat_dst->ia_ifa);
7307 			ip6stat.ip6s_clat464_in_nov4addr_drop++;
7308 			error = -1;
7309 			goto cleanup;
7310 		}
7311 		ifa_remref(&ia6_clat_dst->ia_ifa);
7312 
7313 		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7314 		dst = &ia4_clat_dst->ia_addr.sin_addr;
7315 		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7316 			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7317 			error = -1;
7318 			goto cleanup;
7319 		}
7320 
7321 		ip6h = pbuf->pb_data;
7322 		off = sizeof(struct ip6_hdr);
7323 		proto = ip6h->ip6_nxt;
7324 		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7325 		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7326 
7327 		/*
7328 		 * Translate the IP header and update the fragmentation
7329 		 * header if needed
7330 		 */
7331 		error = (nat464_translate_64(pbuf, off, tos, &proto,
7332 		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7333 		    0 : -1;
7334 
7335 		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7336 
7337 		if (error != 0) {
7338 			ip6stat.ip6s_clat464_in_64transfail_drop++;
7339 			goto cleanup;
7340 		}
7341 
7342 		/*
7343 		 * Translate protocol header, update checksum, checksum flags
7344 		 * and related fields.
7345 		 */
7346 		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7347 		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7348 		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7349 
7350 		if (error != 0) {
7351 			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7352 			goto cleanup;
7353 		}
7354 
7355 cleanup:
7356 		if (ia4_clat_dst != NULL) {
7357 			ifa_remref(&ia4_clat_dst->ia_ifa);
7358 		}
7359 
7360 		if (pbuf_is_valid(pbuf)) {
7361 			*m = pbuf->pb_mbuf;
7362 			pbuf->pb_mbuf = NULL;
7363 			pbuf_destroy(pbuf);
7364 		} else {
7365 			error = -1;
7366 			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7367 		}
7368 
7369 		if (error == 0) {
7370 			*proto_family = PF_INET;
7371 			ip6stat.ip6s_clat464_in_success++;
7372 		}
7373 	} /* CLAT traffic */
7374 
7375 done:
7376 	return error;
7377 }
7378 
7379 /* The following is used to enqueue work items for ifnet ioctl events */
7380 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7381 
7382 struct ifnet_ioctl_event {
7383 	struct ifnet *ifp;
7384 	u_long ioctl_code;
7385 };
7386 
7387 struct ifnet_ioctl_event_nwk_wq_entry {
7388 	struct nwk_wq_entry nwk_wqe;
7389 	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
7390 };
7391 
7392 void
7393 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7394 {
7395 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7396 	bool compare_expected;
7397 
7398 	/*
7399 	 * Get an io ref count if the interface is attached.
7400 	 * At this point it most likely is. We are taking a reference for
7401 	 * deferred processing.
7402 	 */
7403 	if (!ifnet_is_attached(ifp, 1)) {
7404 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7405 		    "is not attached",
7406 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7407 		return;
7408 	}
7409 	switch (ioctl_code) {
7410 	case SIOCADDMULTI:
7411 		compare_expected = false;
7412 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7413 			ifnet_decr_iorefcnt(ifp);
7414 			return;
7415 		}
7416 		break;
7417 	case SIOCDELMULTI:
7418 		compare_expected = false;
7419 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7420 			ifnet_decr_iorefcnt(ifp);
7421 			return;
7422 		}
7423 		break;
7424 	default:
7425 		os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7426 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7427 		return;
7428 	}
7429 
7430 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7431 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7432 
7433 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7434 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7435 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7436 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7437 }
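
/*
 * Sketch of the deferral flow implemented by ifnet_ioctl_async() above and
 * the callback below: duplicate multicast-filter updates are coalesced via
 * the per-ifnet if_mcast_{add,del}_signaled flags, the request is queued on
 * the network work queue while holding an IO reference, and the worker
 * thread finally issues the ioctl and drops the reference.
 *
 *	ifnet_ioctl_async(ifp, SIOCADDMULTI);
 *	    -> nwk_wq_enqueue(...)                      // IO ref held
 *	        -> ifnet_ioctl_event_callback(...)      // worker thread
 *	            -> ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL)
 *	            -> ifnet_decr_iorefcnt(ifp)
 */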
7438 
7439 static void
7440 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7441 {
7442 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7443 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7444 
7445 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7446 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7447 	int ret = 0;
7448 
7449 	switch (ioctl_code) {
7450 	case SIOCADDMULTI:
7451 		atomic_store(&ifp->if_mcast_add_signaled, false);
7452 		break;
7453 	case SIOCDELMULTI:
7454 		atomic_store(&ifp->if_mcast_del_signaled, false);
7455 		break;
7456 	}
7457 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7458 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7459 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7460 	} else if (dlil_verbose) {
7461 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7462 		    "for ioctl %lu",
7463 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7464 	}
7465 	ifnet_decr_iorefcnt(ifp);
7466 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7467 	return;
7468 }
7469 
7470 errno_t
7471 ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
7472     void *ioctl_arg)
7473 {
7474 	struct ifnet_filter *filter;
7475 	int retval = EOPNOTSUPP;
7476 	int result = 0;
7477 
7478 	if (ifp == NULL || ioctl_code == 0) {
7479 		return EINVAL;
7480 	}
7481 
7482 	/* Get an io ref count if the interface is attached */
7483 	if (!ifnet_is_attached(ifp, 1)) {
7484 		return EOPNOTSUPP;
7485 	}
7486 
7487 	/*
7488 	 * Run the interface filters first.
7489 	 * We want to run all filters before calling the protocol,
7490 	 * interface family, or interface.
7491 	 */
7492 	lck_mtx_lock_spin(&ifp->if_flt_lock);
7493 	/* prevent filter list from changing in case we drop the lock */
7494 	if_flt_monitor_busy(ifp);
7495 	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
7496 		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
7497 		    filter->filt_protocol == proto_fam)) {
7498 			lck_mtx_unlock(&ifp->if_flt_lock);
7499 
7500 			result = filter->filt_ioctl(filter->filt_cookie, ifp,
7501 			    proto_fam, ioctl_code, ioctl_arg);
7502 
7503 			lck_mtx_lock_spin(&ifp->if_flt_lock);
7504 
7505 			/* Only update retval if no one has handled the ioctl */
7506 			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7507 				if (result == ENOTSUP) {
7508 					result = EOPNOTSUPP;
7509 				}
7510 				retval = result;
7511 				if (retval != 0 && retval != EOPNOTSUPP) {
7512 					/* we're done with the filter list */
7513 					if_flt_monitor_unbusy(ifp);
7514 					lck_mtx_unlock(&ifp->if_flt_lock);
7515 					goto cleanup;
7516 				}
7517 			}
7518 		}
7519 	}
7520 	/* we're done with the filter list */
7521 	if_flt_monitor_unbusy(ifp);
7522 	lck_mtx_unlock(&ifp->if_flt_lock);
7523 
7524 	/* Allow the protocol to handle the ioctl */
7525 	if (proto_fam != 0) {
7526 		struct if_proto *proto;
7527 
7528 		/* callee holds a proto refcnt upon success */
7529 		ifnet_lock_shared(ifp);
7530 		proto = find_attached_proto(ifp, proto_fam);
7531 		ifnet_lock_done(ifp);
7532 		if (proto != NULL) {
7533 			proto_media_ioctl ioctlp =
7534 			    (proto->proto_kpi == kProtoKPI_v1 ?
7535 			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
7536 			result = EOPNOTSUPP;
7537 			if (ioctlp != NULL) {
7538 				result = ioctlp(ifp, proto_fam, ioctl_code,
7539 				    ioctl_arg);
7540 			}
7541 			if_proto_free(proto);
7542 
7543 			/* Only update retval if no one has handled the ioctl */
7544 			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7545 				if (result == ENOTSUP) {
7546 					result = EOPNOTSUPP;
7547 				}
7548 				retval = result;
7549 				if (retval && retval != EOPNOTSUPP) {
7550 					goto cleanup;
7551 				}
7552 			}
7553 		}
7554 	}
7555 
7556 	/* retval is either 0 or EOPNOTSUPP */
7557 
7558 	/*
7559 	 * Let the interface handle this ioctl.
7560 	 * If it returns EOPNOTSUPP, ignore that, we may have
7561 	 * already handled this in the protocol or family.
7562 	 */
7563 	if (ifp->if_ioctl) {
7564 		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
7565 	}
7566 
7567 	/* Only update retval if no one has handled the ioctl */
7568 	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7569 		if (result == ENOTSUP) {
7570 			result = EOPNOTSUPP;
7571 		}
7572 		retval = result;
7573 		if (retval && retval != EOPNOTSUPP) {
7574 			goto cleanup;
7575 		}
7576 	}
7577 
7578 cleanup:
7579 	if (retval == EJUSTRETURN) {
7580 		retval = 0;
7581 	}
7582 
7583 	ifnet_decr_iorefcnt(ifp);
7584 
7585 	return retval;
7586 }
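
/*
 * Dispatch convention used by ifnet_ioctl() above: retval starts as
 * EOPNOTSUPP and is only overwritten while nobody has handled the ioctl;
 * ENOTSUP is normalized to EOPNOTSUPP, and EJUSTRETURN both claims the
 * ioctl and is mapped to 0 on return.  A minimal sketch of an interface
 * filter ioctl handler that consumes one private code and passes the rest
 * through (MY_PRIVATE_IOCTL and handle_my_private_ioctl() are hypothetical):
 *
 *	static errno_t
 *	my_filt_ioctl(void *cookie, ifnet_t ifp, protocol_family_t pf,
 *	    u_long cmd, void *arg)
 *	{
 *		if (cmd == MY_PRIVATE_IOCTL) {
 *			handle_my_private_ioctl(ifp, arg);
 *			return EJUSTRETURN;	// claim it; caller sees 0
 *		}
 *		return EOPNOTSUPP;		// let others handle it
 *	}
 */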
7587 
7588 __private_extern__ errno_t
7589 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7590 {
7591 	errno_t error = 0;
7592 
7593 	if (ifp->if_set_bpf_tap) {
7594 		/* Get an io reference on the interface if it is attached */
7595 		if (!ifnet_is_attached(ifp, 1)) {
7596 			return ENXIO;
7597 		}
7598 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7599 		ifnet_decr_iorefcnt(ifp);
7600 	}
7601 	return error;
7602 }
7603 
7604 errno_t
7605 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7606     struct sockaddr *ll_addr, size_t ll_len)
7607 {
7608 	errno_t result = EOPNOTSUPP;
7609 	struct if_proto *proto;
7610 	const struct sockaddr *verify;
7611 	proto_media_resolve_multi resolvep;
7612 
7613 	if (!ifnet_is_attached(ifp, 1)) {
7614 		return result;
7615 	}
7616 
7617 	bzero(ll_addr, ll_len);
7618 
7619 	/* Call the protocol first; callee holds a proto refcnt upon success */
7620 	ifnet_lock_shared(ifp);
7621 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7622 	ifnet_lock_done(ifp);
7623 	if (proto != NULL) {
7624 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7625 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7626 		if (resolvep != NULL) {
7627 			result = resolvep(ifp, proto_addr, SDL(ll_addr), ll_len);
7628 		}
7629 		if_proto_free(proto);
7630 	}
7631 
7632 	/* Let the interface verify the multicast address */
7633 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7634 		if (result == 0) {
7635 			verify = ll_addr;
7636 		} else {
7637 			verify = proto_addr;
7638 		}
7639 		result = ifp->if_check_multi(ifp, verify);
7640 	}
7641 
7642 	ifnet_decr_iorefcnt(ifp);
7643 	return result;
7644 }
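
/*
 * Background example of what the protocol resolver invoked above returns
 * for Ethernet-style interfaces: an IPv4 multicast group is mapped into a
 * link-layer multicast address by copying the low 23 bits of the group
 * into the IANA 01:00:5e block, e.g.
 *
 *	224.1.2.3  ->  01:00:5e:01:02:03
 *
 * ll_addr is zeroed first, so when no protocol resolver exists the
 * interface's if_check_multi callback is asked to verify the protocol
 * address itself rather than a stale link-layer address.
 */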
7645 
7646 __private_extern__ errno_t
7647 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7648     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7649     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7650 {
7651 	struct if_proto *proto;
7652 	errno_t result = 0;
7653 
7654 	if ((ifp->if_flags & IFF_NOARP) != 0) {
7655 		result = ENOTSUP;
7656 		goto done;
7657 	}
7658 
7659 	/* callee holds a proto refcnt upon success */
7660 	ifnet_lock_shared(ifp);
7661 	proto = find_attached_proto(ifp, target_proto->sa_family);
7662 	ifnet_lock_done(ifp);
7663 	if (proto == NULL) {
7664 		result = ENOTSUP;
7665 	} else {
7666 		proto_media_send_arp    arpp;
7667 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7668 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7669 		if (arpp == NULL) {
7670 			result = ENOTSUP;
7671 		} else {
7672 			switch (arpop) {
7673 			case ARPOP_REQUEST:
7674 				arpstat.txrequests++;
7675 				if (target_hw != NULL) {
7676 					arpstat.txurequests++;
7677 				}
7678 				break;
7679 			case ARPOP_REPLY:
7680 				arpstat.txreplies++;
7681 				break;
7682 			}
7683 			result = arpp(ifp, arpop, sender_hw, sender_proto,
7684 			    target_hw, target_proto);
7685 		}
7686 		if_proto_free(proto);
7687 	}
7688 done:
7689 	return result;
7690 }
7691 
7692 struct net_thread_marks { };
7693 static const struct net_thread_marks net_thread_marks_base = { };
7694 
7695 __private_extern__ const net_thread_marks_t net_thread_marks_none =
7696     &net_thread_marks_base;
7697 
7698 __private_extern__ net_thread_marks_t
7699 net_thread_marks_push(u_int32_t push)
7700 {
7701 	static const char *const base = (const void*)&net_thread_marks_base;
7702 	u_int32_t pop = 0;
7703 
7704 	if (push != 0) {
7705 		struct uthread *uth = current_uthread();
7706 
7707 		pop = push & ~uth->uu_network_marks;
7708 		if (pop != 0) {
7709 			uth->uu_network_marks |= pop;
7710 		}
7711 	}
7712 
7713 	return (net_thread_marks_t)&base[pop];
7714 }
7715 
7716 __private_extern__ net_thread_marks_t
7717 net_thread_unmarks_push(u_int32_t unpush)
7718 {
7719 	static const char *const base = (const void*)&net_thread_marks_base;
7720 	u_int32_t unpop = 0;
7721 
7722 	if (unpush != 0) {
7723 		struct uthread *uth = current_uthread();
7724 
7725 		unpop = unpush & uth->uu_network_marks;
7726 		if (unpop != 0) {
7727 			uth->uu_network_marks &= ~unpop;
7728 		}
7729 	}
7730 
7731 	return (net_thread_marks_t)&base[unpop];
7732 }
7733 
7734 __private_extern__ void
7735 net_thread_marks_pop(net_thread_marks_t popx)
7736 {
7737 	static const char *const base = (const void*)&net_thread_marks_base;
7738 	const ptrdiff_t pop = (const char *)popx - (const char *)base;
7739 
7740 	if (pop != 0) {
7741 		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
7742 		struct uthread *uth = current_uthread();
7743 
7744 		VERIFY((pop & ones) == pop);
7745 		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
7746 		uth->uu_network_marks &= ~pop;
7747 	}
7748 }
7749 
7750 __private_extern__ void
7751 net_thread_unmarks_pop(net_thread_marks_t unpopx)
7752 {
7753 	static const char *const base = (const void*)&net_thread_marks_base;
7754 	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;
7755 
7756 	if (unpop != 0) {
7757 		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
7758 		struct uthread *uth = current_uthread();
7759 
7760 		VERIFY((unpop & ones) == unpop);
7761 		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
7762 		uth->uu_network_marks |= (u_int32_t)unpop;
7763 	}
7764 }
7765 
7766 __private_extern__ u_int32_t
7767 net_thread_is_marked(u_int32_t check)
7768 {
7769 	if (check != 0) {
7770 		struct uthread *uth = current_uthread();
7771 		return uth->uu_network_marks & check;
7772 	} else {
7773 		return 0;
7774 	}
7775 }
7776 
7777 __private_extern__ u_int32_t
7778 net_thread_is_unmarked(u_int32_t check)
7779 {
7780 	if (check != 0) {
7781 		struct uthread *uth = current_uthread();
7782 		return ~uth->uu_network_marks & check;
7783 	} else {
7784 		return 0;
7785 	}
7786 }
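
/*
 * Typical usage sketch for the thread-mark helpers above: a caller pushes
 * a mark around a critical region and pops exactly the cookie it was
 * handed; the cookie encodes which bits this caller actually added, so
 * nested push/pop pairs compose.  The mark constant named below
 * (NET_THREAD_HELD_PF) is an assumption taken from the dlil headers.
 *
 *	net_thread_marks_t marks;
 *
 *	marks = net_thread_marks_push(NET_THREAD_HELD_PF);
 *	// ... work that other layers can detect via net_thread_is_marked()
 *	net_thread_marks_pop(marks);
 *
 * net_thread_unmarks_push()/net_thread_unmarks_pop() are the inverse:
 * they temporarily clear marks that are currently set and later restore
 * them.
 */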
7787 
7788 static __inline__ int
7789 _is_announcement(const struct sockaddr_in * sender_sin,
7790     const struct sockaddr_in * target_sin)
7791 {
7792 	if (target_sin == NULL || sender_sin == NULL) {
7793 		return FALSE;
7794 	}
7795 
7796 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7797 }
7798 
7799 __private_extern__ errno_t
7800 dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
7801     const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
7802     const struct sockaddr *target_proto0, u_int32_t rtflags)
7803 {
7804 	errno_t result = 0;
7805 	const struct sockaddr_in * sender_sin;
7806 	const struct sockaddr_in * target_sin;
7807 	struct sockaddr_inarp target_proto_sinarp;
7808 	struct sockaddr *target_proto = __DECONST_SA(target_proto0);
7809 
7810 	if (target_proto == NULL || sender_proto == NULL) {
7811 		return EINVAL;
7812 	}
7813 
7814 	if (sender_proto->sa_family != target_proto->sa_family) {
7815 		return EINVAL;
7816 	}
7817 
7818 	/*
7819 	 * If the target is a (default) router, provide that
7820 	 * information to the send_arp callback routine.
7821 	 */
7822 	if (rtflags & RTF_ROUTER) {
7823 		SOCKADDR_COPY(target_proto, &target_proto_sinarp, sizeof(struct sockaddr_in));
7824 		target_proto_sinarp.sin_other |= SIN_ROUTER;
7825 		target_proto = SA(&target_proto_sinarp);
7826 	}
7827 
7828 	/*
7829 	 * If this is an ARP request and the target IP is IPv4LL,
7830 	 * send the request on all interfaces.  The exception is
7831 	 * an announcement, which must only appear on the specific
7832 	 * interface.
7833 	 */
7834 	sender_sin = SIN(sender_proto);
7835 	target_sin = SIN(target_proto);
7836 	if (target_proto->sa_family == AF_INET &&
7837 	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
7838 	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
7839 	    !_is_announcement(sender_sin, target_sin)) {
7840 		ifnet_t         *__counted_by(count) ifp_list;
7841 		u_int32_t       count;
7842 		u_int32_t       ifp_on;
7843 
7844 		result = ENOTSUP;
7845 
7846 		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
7847 			for (ifp_on = 0; ifp_on < count; ifp_on++) {
7848 				errno_t new_result;
7849 				ifaddr_t source_hw = NULL;
7850 				ifaddr_t source_ip = NULL;
7851 				struct sockaddr_in source_ip_copy;
7852 				struct ifnet *cur_ifp = ifp_list[ifp_on];
7853 
7854 				/*
7855 				 * Only arp on interfaces marked for IPv4LL
7856 				 * ARPing.  This may mean that we don't ARP on
7857 				 * the interface the subnet route points to.
7858 				 */
7859 				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
7860 					continue;
7861 				}
7862 
7863 				/* Find the source IP address */
7864 				ifnet_lock_shared(cur_ifp);
7865 				source_hw = cur_ifp->if_lladdr;
7866 				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
7867 				    ifa_link) {
7868 					IFA_LOCK(source_ip);
7869 					if (source_ip->ifa_addr != NULL &&
7870 					    source_ip->ifa_addr->sa_family ==
7871 					    AF_INET) {
7872 						/* Copy the source IP address */
7873 						SOCKADDR_COPY(SIN(source_ip->ifa_addr), &source_ip_copy, sizeof(source_ip_copy));
7874 						IFA_UNLOCK(source_ip);
7875 						break;
7876 					}
7877 					IFA_UNLOCK(source_ip);
7878 				}
7879 
7880 				/* No IP Source, don't arp */
7881 				if (source_ip == NULL) {
7882 					ifnet_lock_done(cur_ifp);
7883 					continue;
7884 				}
7885 
7886 				ifa_addref(source_hw);
7887 				ifnet_lock_done(cur_ifp);
7888 
7889 				/* Send the ARP */
7890 				new_result = dlil_send_arp_internal(cur_ifp,
7891 				    arpop, SDL(source_hw->ifa_addr),
7892 				    SA(&source_ip_copy), NULL,
7893 				    target_proto);
7894 
7895 				ifa_remref(source_hw);
7896 				if (result == ENOTSUP) {
7897 					result = new_result;
7898 				}
7899 			}
7900 			ifnet_list_free_counted_by(ifp_list, count);
7901 		}
7902 	} else {
7903 		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
7904 		    sender_proto, target_hw, target_proto);
7905 	}
7906 
7907 	return result;
7908 }
7909 
7910 /*
7911  * Caller must hold ifnet head lock.
7912  */
7913 static int
7914 ifnet_lookup(struct ifnet *ifp)
7915 {
7916 	struct ifnet *_ifp;
7917 
7918 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7919 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7920 		if (_ifp == ifp) {
7921 			break;
7922 		}
7923 	}
7924 	return _ifp != NULL;
7925 }
7926 
7927 /*
7928  * Caller has to pass a non-zero refio argument to get an
7929  * IO reference count. This will prevent ifnet_detach from
7930  * completing while there are outstanding IO references.
7931  */
7932 int
7933 ifnet_is_attached(struct ifnet *ifp, int refio)
7934 {
7935 	int ret;
7936 
7937 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7938 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
7939 		if (refio > 0) {
7940 			ifp->if_refio++;
7941 		}
7942 	}
7943 	lck_mtx_unlock(&ifp->if_ref_lock);
7944 
7945 	return ret;
7946 }
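
/*
 * Sketch of the IO-reference pattern used throughout this file: take a
 * reference only if the interface is still fully attached, do the work,
 * then drop the reference so that a pending ifnet_detach() can make
 * progress.
 *
 *	if (!ifnet_is_attached(ifp, 1)) {
 *		return ENXIO;		// interface is going away
 *	}
 *	// ... safe to use ifp here ...
 *	ifnet_decr_iorefcnt(ifp);
 *
 * Data-path code uses ifnet_datamov_begin()/ifnet_datamov_end() below
 * instead, which additionally require the interface to be marked ready.
 */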
7947 
7948 void
7949 ifnet_incr_pending_thread_count(struct ifnet *ifp)
7950 {
7951 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7952 	ifp->if_threads_pending++;
7953 	lck_mtx_unlock(&ifp->if_ref_lock);
7954 }
7955 
7956 void
7957 ifnet_decr_pending_thread_count(struct ifnet *ifp)
7958 {
7959 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7960 	VERIFY(ifp->if_threads_pending > 0);
7961 	ifp->if_threads_pending--;
7962 	if (ifp->if_threads_pending == 0) {
7963 		wakeup(&ifp->if_threads_pending);
7964 	}
7965 	lck_mtx_unlock(&ifp->if_ref_lock);
7966 }
7967 
7968 /*
7969  * Caller must ensure the interface is attached; the assumption is that
7970  * there is at least one outstanding IO reference count held already.
7971  * Most callers would call ifnet_is_{attached,data_ready}() instead.
7972  */
7973 void
7974 ifnet_incr_iorefcnt(struct ifnet *ifp)
7975 {
7976 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7977 	VERIFY(IF_FULLY_ATTACHED(ifp));
7978 	VERIFY(ifp->if_refio > 0);
7979 	ifp->if_refio++;
7980 	lck_mtx_unlock(&ifp->if_ref_lock);
7981 }
7982 
7983 __attribute__((always_inline))
7984 static void
7985 ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
7986 {
7987 	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
7988 
7989 	VERIFY(ifp->if_refio > 0);
7990 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
7991 
7992 	ifp->if_refio--;
7993 	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);
7994 
7995 	/*
7996 	 * If there are no more outstanding io references, wake up the
7997 	 * ifnet_detach thread if the detaching flag is set.
7998 	 */
7999 	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
8000 		wakeup(&(ifp->if_refio));
8001 	}
8002 }
8003 
8004 void
8005 ifnet_decr_iorefcnt(struct ifnet *ifp)
8006 {
8007 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8008 	ifnet_decr_iorefcnt_locked(ifp);
8009 	lck_mtx_unlock(&ifp->if_ref_lock);
8010 }
8011 
8012 boolean_t
8013 ifnet_datamov_begin(struct ifnet *ifp)
8014 {
8015 	boolean_t ret;
8016 
8017 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8018 	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8019 		ifp->if_refio++;
8020 		ifp->if_datamov++;
8021 	}
8022 	lck_mtx_unlock(&ifp->if_ref_lock);
8023 
8024 	DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret);
8025 	return ret;
8026 }
8027 
8028 void
8029 ifnet_datamov_end(struct ifnet *ifp)
8030 {
8031 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8032 	VERIFY(ifp->if_datamov > 0);
8033 	/*
8034 	 * If there are no more threads moving data, wake up any
8035 	 * drainers that are blocked waiting for this.
8036 	 */
8037 	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
8038 		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
8039 		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
8040 		wakeup(&(ifp->if_datamov));
8041 	}
8042 	ifnet_decr_iorefcnt_locked(ifp);
8043 	lck_mtx_unlock(&ifp->if_ref_lock);
8044 
8045 	DTRACE_IP1(datamov__end, struct ifnet *, ifp);
8046 }
8047 
8048 static void
8049 ifnet_datamov_suspend_locked(struct ifnet *ifp)
8050 {
8051 	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
8052 	ifp->if_refio++;
8053 	if (ifp->if_suspend++ == 0) {
8054 		VERIFY(ifp->if_refflags & IFRF_READY);
8055 		ifp->if_refflags &= ~IFRF_READY;
8056 	}
8057 }
8058 
8059 void
8060 ifnet_datamov_suspend(struct ifnet *ifp)
8061 {
8062 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8063 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8064 	ifnet_datamov_suspend_locked(ifp);
8065 	lck_mtx_unlock(&ifp->if_ref_lock);
8066 }
8067 
8068 boolean_t
8069 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8070 {
8071 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8072 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8073 	if (ifp->if_suspend > 0) {
8074 		lck_mtx_unlock(&ifp->if_ref_lock);
8075 		return FALSE;
8076 	}
8077 	ifnet_datamov_suspend_locked(ifp);
8078 	lck_mtx_unlock(&ifp->if_ref_lock);
8079 	return TRUE;
8080 }
8081 
8082 void
8083 ifnet_datamov_drain(struct ifnet *ifp)
8084 {
8085 	lck_mtx_lock(&ifp->if_ref_lock);
8086 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8087 	/* data movement must already be suspended */
8088 	VERIFY(ifp->if_suspend > 0);
8089 	VERIFY(!(ifp->if_refflags & IFRF_READY));
8090 	ifp->if_drainers++;
8091 	while (ifp->if_datamov != 0) {
8092 		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
8093 		    if_name(ifp));
8094 		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
8095 		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
8096 		    (PZERO - 1), __func__, NULL);
8097 		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
8098 	}
8099 	VERIFY(!(ifp->if_refflags & IFRF_READY));
8100 	VERIFY(ifp->if_drainers > 0);
8101 	ifp->if_drainers--;
8102 	lck_mtx_unlock(&ifp->if_ref_lock);
8103 
8104 	/* purge the interface queues */
8105 	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
8106 		if_qflush_snd(ifp, false);
8107 	}
8108 }
8109 
8110 void
8111 ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
8112 {
8113 	ifnet_datamov_suspend(ifp);
8114 	ifnet_datamov_drain(ifp);
8115 }
8116 
8117 void
8118 ifnet_datamov_resume(struct ifnet *ifp)
8119 {
8120 	lck_mtx_lock(&ifp->if_ref_lock);
8121 	/* data movement must already be suspended */
8122 	VERIFY(ifp->if_suspend > 0);
8123 	if (--ifp->if_suspend == 0) {
8124 		VERIFY(!(ifp->if_refflags & IFRF_READY));
8125 		ifp->if_refflags |= IFRF_READY;
8126 	}
8127 	ifnet_decr_iorefcnt_locked(ifp);
8128 	lck_mtx_unlock(&ifp->if_ref_lock);
8129 }
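
/*
 * Sketch of how the suspend/drain/resume primitives above are meant to be
 * paired by control-path code that must quiesce the data path before
 * reconfiguring an interface; reconfigure_interface() is a hypothetical
 * placeholder for the actual work.
 *
 *	ifnet_datamov_suspend_and_drain(ifp);	// block new begin() callers,
 *						// wait for in-flight ones
 *	reconfigure_interface(ifp);		// hypothetical: no data moving
 *	ifnet_datamov_resume(ifp);		// set IFRF_READY again
 *
 * While suspended, ifnet_datamov_begin() returns FALSE (IFRF_READY is
 * cleared), so dlil_output() and friends bail out early with ENXIO
 * instead of touching a half-reconfigured interface.
 */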
8130 
8131 static void
8132 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8133 {
8134 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8135 	ctrace_t *tr;
8136 	u_int32_t idx;
8137 	u_int16_t *cnt;
8138 
8139 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8140 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8141 		/* NOTREACHED */
8142 	}
8143 
8144 	if (refhold) {
8145 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8146 		tr = dl_if_dbg->dldbg_if_refhold;
8147 	} else {
8148 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8149 		tr = dl_if_dbg->dldbg_if_refrele;
8150 	}
8151 
8152 	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8153 	ctrace_record(&tr[idx]);
8154 }
8155 
8156 errno_t
8157 dlil_if_ref(struct ifnet *ifp)
8158 {
8159 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8160 
8161 	if (dl_if == NULL) {
8162 		return EINVAL;
8163 	}
8164 
8165 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8166 	++dl_if->dl_if_refcnt;
8167 	if (dl_if->dl_if_refcnt == 0) {
8168 		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8169 		/* NOTREACHED */
8170 	}
8171 	if (dl_if->dl_if_trace != NULL) {
8172 		(*dl_if->dl_if_trace)(dl_if, TRUE);
8173 	}
8174 	lck_mtx_unlock(&dl_if->dl_if_lock);
8175 
8176 	return 0;
8177 }
8178 
8179 errno_t
8180 dlil_if_free(struct ifnet *ifp)
8181 {
8182 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8183 	bool need_release = FALSE;
8184 
8185 	if (dl_if == NULL) {
8186 		return EINVAL;
8187 	}
8188 
8189 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8190 	switch (dl_if->dl_if_refcnt) {
8191 	case 0:
8192 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8193 		/* NOTREACHED */
8194 		break;
8195 	case 1:
8196 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8197 			need_release = TRUE;
8198 		}
8199 		break;
8200 	default:
8201 		break;
8202 	}
8203 	--dl_if->dl_if_refcnt;
8204 	if (dl_if->dl_if_trace != NULL) {
8205 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8206 	}
8207 	lck_mtx_unlock(&dl_if->dl_if_lock);
8208 	if (need_release) {
8209 		_dlil_if_release(ifp, true);
8210 	}
8211 	return 0;
8212 }
8213 
8214 static errno_t
8215 dlil_attach_protocol(struct if_proto *proto,
8216     const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
8217     uint32_t * proto_count)
8218 {
8219 	struct kev_dl_proto_data ev_pr_data;
8220 	struct ifnet *ifp = proto->ifp;
8221 	errno_t retval = 0;
8222 	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
8223 	struct if_proto *prev_proto;
8224 	struct if_proto *_proto;
8225 
8226 	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
8227 	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
8228 		return EINVAL;
8229 	}
8230 
8231 	if (!ifnet_is_attached(ifp, 1)) {
8232 		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
8233 		    __func__, if_name(ifp));
8234 		return ENXIO;
8235 	}
8236 	/* callee holds a proto refcnt upon success */
8237 	ifnet_lock_exclusive(ifp);
8238 	_proto = find_attached_proto(ifp, proto->protocol_family);
8239 	if (_proto != NULL) {
8240 		ifnet_lock_done(ifp);
8241 		if_proto_free(_proto);
8242 		retval = EEXIST;
8243 		goto ioref_done;
8244 	}
8245 
8246 	/*
8247 	 * Call the family module's add_proto routine so it can refine the
8248 	 * demux descriptors as it wishes.
8249 	 */
8250 	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
8251 	    demux_count);
8252 	if (retval) {
8253 		ifnet_lock_done(ifp);
8254 		goto ioref_done;
8255 	}
8256 
8257 	/*
8258 	 * Insert the protocol in the hash
8259 	 */
8260 	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
8261 	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
8262 		prev_proto = SLIST_NEXT(prev_proto, next_hash);
8263 	}
8264 	if (prev_proto) {
8265 		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
8266 	} else {
8267 		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
8268 		    proto, next_hash);
8269 	}
8270 
8271 	/* hold a proto refcnt for attach */
8272 	if_proto_ref(proto);
8273 
8274 	/*
8275 	 * The reserved field carries the number of protocols still attached
8276 	 * (subject to change)
8277 	 */
8278 	ev_pr_data.proto_family = proto->protocol_family;
8279 	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);
8280 
8281 	ifnet_lock_done(ifp);
8282 
8283 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
8284 	    (struct net_event_data *)&ev_pr_data,
8285 	    sizeof(struct kev_dl_proto_data), FALSE);
8286 	if (proto_count != NULL) {
8287 		*proto_count = ev_pr_data.proto_remaining_count;
8288 	}
8289 ioref_done:
8290 	ifnet_decr_iorefcnt(ifp);
8291 	return retval;
8292 }
8293 
8294 static void
8295 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8296 {
8297 	/*
8298 	 * A protocol has been attached, mark the interface up.
8299 	 * This used to be done by configd.KernelEventMonitor, but that
8300 	 * is inherently prone to races (rdar://problem/30810208).
8301 	 */
8302 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8303 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8304 	dlil_post_sifflags_msg(ifp);
8305 #if SKYWALK
8306 	switch (protocol) {
8307 	case AF_INET:
8308 	case AF_INET6:
8309 		/* don't attach the flowswitch unless attaching IP */
8310 		dlil_attach_flowswitch_nexus(ifp);
8311 		break;
8312 	default:
8313 		break;
8314 	}
8315 #endif /* SKYWALK */
8316 }
8317 
8318 errno_t
8319 ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
8320     const struct ifnet_attach_proto_param *proto_details)
8321 {
8322 	int retval = 0;
8323 	struct if_proto  *ifproto = NULL;
8324 	uint32_t proto_count = 0;
8325 
8326 	ifnet_head_lock_shared();
8327 	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8328 		retval = EINVAL;
8329 		goto end;
8330 	}
8331 	/* Check that the interface is in the global list */
8332 	if (!ifnet_lookup(ifp)) {
8333 		retval = ENXIO;
8334 		goto end;
8335 	}
8336 
8337 	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8338 
8339 	/* refcnt held above during lookup */
8340 	ifproto->ifp = ifp;
8341 	ifproto->protocol_family = protocol;
8342 	ifproto->proto_kpi = kProtoKPI_v1;
8343 	ifproto->kpi.v1.input = proto_details->input;
8344 	ifproto->kpi.v1.pre_output = proto_details->pre_output;
8345 	ifproto->kpi.v1.event = proto_details->event;
8346 	ifproto->kpi.v1.ioctl = proto_details->ioctl;
8347 	ifproto->kpi.v1.detached = proto_details->detached;
8348 	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
8349 	ifproto->kpi.v1.send_arp = proto_details->send_arp;
8350 
8351 	retval = dlil_attach_protocol(ifproto,
8352 	    proto_details->demux_list, proto_details->demux_count,
8353 	    &proto_count);
8354 
8355 end:
8356 	if (retval == EEXIST) {
8357 		/* already attached */
8358 		if (dlil_verbose) {
8359 			DLIL_PRINTF("%s: protocol %d already attached\n",
8360 			    ifp != NULL ? if_name(ifp) : "N/A",
8361 			    protocol);
8362 		}
8363 	} else if (retval != 0) {
8364 		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
8365 		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8366 	} else if (dlil_verbose) {
8367 		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
8368 		    ifp != NULL ? if_name(ifp) : "N/A",
8369 		    protocol, proto_count);
8370 	}
8371 	ifnet_head_done();
8372 	if (retval == 0) {
8373 		dlil_handle_proto_attach(ifp, protocol);
8374 	} else if (ifproto != NULL) {
8375 		zfree(dlif_proto_zone, ifproto);
8376 	}
8377 	return retval;
8378 }
8379 
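/*
 * A minimal sketch of how a protocol module might use ifnet_attach_protocol()
 * above (ifnet_attach_protocol_v2() below is identical except that the v2
 * input callback takes no separate frame-header argument).  The names
 * my_proto_input(), my_proto_attach(), MY_PROTO_FAMILY and the 0x88B5
 * ethertype are illustrative assumptions, not part of this file:
 *
 *	static errno_t
 *	my_proto_input(ifnet_t ifp, protocol_family_t protocol,
 *	    mbuf_t packet, char *header)
 *	{
 *		// consume or free the packet here
 *		mbuf_freem(packet);
 *		return 0;
 *	}
 *
 *	static errno_t
 *	my_proto_attach(ifnet_t ifp)
 *	{
 *		struct ifnet_attach_proto_param reg;
 *		u_int16_t etype = htons(0x88B5);
 *		struct ifnet_demux_desc desc = {
 *			.type = DLIL_DESC_ETYPE2,
 *			.data = &etype,
 *			.datalen = sizeof(etype),
 *		};
 *
 *		bzero(&reg, sizeof(reg));
 *		reg.demux_list = &desc;
 *		reg.demux_count = 1;
 *		reg.input = my_proto_input;
 *		return ifnet_attach_protocol(ifp, MY_PROTO_FAMILY, &reg);
 *	}
 */
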
8380 errno_t
8381 ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
8382     const struct ifnet_attach_proto_param_v2 *proto_details)
8383 {
8384 	int retval = 0;
8385 	struct if_proto  *ifproto = NULL;
8386 	uint32_t proto_count = 0;
8387 
8388 	ifnet_head_lock_shared();
8389 	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8390 		retval = EINVAL;
8391 		goto end;
8392 	}
8393 	/* Check that the interface is in the global list */
8394 	if (!ifnet_lookup(ifp)) {
8395 		retval = ENXIO;
8396 		goto end;
8397 	}
8398 
8399 	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8400 
8401 	/* refcnt held above during lookup */
8402 	ifproto->ifp = ifp;
8403 	ifproto->protocol_family = protocol;
8404 	ifproto->proto_kpi = kProtoKPI_v2;
8405 	ifproto->kpi.v2.input = proto_details->input;
8406 	ifproto->kpi.v2.pre_output = proto_details->pre_output;
8407 	ifproto->kpi.v2.event = proto_details->event;
8408 	ifproto->kpi.v2.ioctl = proto_details->ioctl;
8409 	ifproto->kpi.v2.detached = proto_details->detached;
8410 	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
8411 	ifproto->kpi.v2.send_arp = proto_details->send_arp;
8412 
8413 	retval = dlil_attach_protocol(ifproto,
8414 	    proto_details->demux_list, proto_details->demux_count,
8415 	    &proto_count);
8416 
8417 end:
8418 	if (retval == EEXIST) {
8419 		/* already attached */
8420 		if (dlil_verbose) {
8421 			DLIL_PRINTF("%s: protocol %d already attached\n",
8422 			    ifp != NULL ? if_name(ifp) : "N/A",
8423 			    protocol);
8424 		}
8425 	} else if (retval != 0) {
8426 		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
8427 		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8428 	} else if (dlil_verbose) {
8429 		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
8430 		    ifp != NULL ? if_name(ifp) : "N/A",
8431 		    protocol, proto_count);
8432 	}
8433 	ifnet_head_done();
8434 	if (retval == 0) {
8435 		dlil_handle_proto_attach(ifp, protocol);
8436 	} else if (ifproto != NULL) {
8437 		zfree(dlif_proto_zone, ifproto);
8438 	}
8439 	return retval;
8440 }
8441 
8442 errno_t
8443 ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
8444 {
8445 	struct if_proto *proto = NULL;
8446 	int     retval = 0;
8447 
8448 	if (ifp == NULL || proto_family == 0) {
8449 		retval = EINVAL;
8450 		goto end;
8451 	}
8452 
8453 	ifnet_lock_exclusive(ifp);
8454 	/* callee holds a proto refcnt upon success */
8455 	proto = find_attached_proto(ifp, proto_family);
8456 	if (proto == NULL) {
8457 		retval = ENXIO;
8458 		ifnet_lock_done(ifp);
8459 		goto end;
8460 	}
8461 
8462 	/* call family module del_proto */
8463 	if (ifp->if_del_proto) {
8464 		ifp->if_del_proto(ifp, proto->protocol_family);
8465 	}
8466 
8467 	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
8468 	    proto, if_proto, next_hash);
8469 
8470 	if (proto->proto_kpi == kProtoKPI_v1) {
8471 		proto->kpi.v1.input = ifproto_media_input_v1;
8472 		proto->kpi.v1.pre_output = ifproto_media_preout;
8473 		proto->kpi.v1.event = ifproto_media_event;
8474 		proto->kpi.v1.ioctl = ifproto_media_ioctl;
8475 		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
8476 		proto->kpi.v1.send_arp = ifproto_media_send_arp;
8477 	} else {
8478 		proto->kpi.v2.input = ifproto_media_input_v2;
8479 		proto->kpi.v2.pre_output = ifproto_media_preout;
8480 		proto->kpi.v2.event = ifproto_media_event;
8481 		proto->kpi.v2.ioctl = ifproto_media_ioctl;
8482 		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
8483 		proto->kpi.v2.send_arp = ifproto_media_send_arp;
8484 	}
8485 	proto->detached = 1;
8486 	ifnet_lock_done(ifp);
8487 
8488 	if (dlil_verbose) {
8489 		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
8490 		    (proto->proto_kpi == kProtoKPI_v1) ?
8491 		    "v1" : "v2", proto_family);
8492 	}
8493 
8494 	/* release proto refcnt held during protocol attach */
8495 	if_proto_free(proto);
8496 
8497 	/*
8498 	 * Release proto refcnt held during lookup; the rest of
8499 	 * protocol detach steps will happen when the last proto
8500 	 * reference is released.
8501 	 */
8502 	if_proto_free(proto);
8503 
8504 end:
8505 	return retval;
8506 }
8507 
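/*
 * The ifproto_media_* routines below are inert placeholders.  When a protocol
 * is detached, ifnet_detach_protocol() above points the protocol's callbacks
 * at them, so any thread still holding a reference to the if_proto gets ENXIO
 * (or a no-op for events) instead of calling into code that may be unloading.
 */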
8508 static errno_t
8509 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8510     struct mbuf *packet, char *header)
8511 {
8512 #pragma unused(ifp, protocol, packet, header)
8513 	return ENXIO;
8514 }
8515 
8516 static errno_t
8517 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8518     struct mbuf *packet)
8519 {
8520 #pragma unused(ifp, protocol, packet)
8521 	return ENXIO;
8522 }
8523 
8524 static errno_t
8525 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8526     mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8527     char *link_layer_dest)
8528 {
8529 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8530 	return ENXIO;
8531 }
8532 
8533 static void
8534 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8535     const struct kev_msg *event)
8536 {
8537 #pragma unused(ifp, protocol, event)
8538 }
8539 
8540 static errno_t
8541 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8542     unsigned long command, void *argument)
8543 {
8544 #pragma unused(ifp, protocol, command, argument)
8545 	return ENXIO;
8546 }
8547 
8548 static errno_t
8549 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8550     struct sockaddr_dl *out_ll, size_t ll_len)
8551 {
8552 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8553 	return ENXIO;
8554 }
8555 
8556 static errno_t
8557 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8558     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8559     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8560 {
8561 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8562 	return ENXIO;
8563 }
8564 
8565 extern int if_next_index(void);
8566 extern int tcp_ecn_outbound;
8567 
8568 void
8569 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8570 {
8571 	uint32_t sflags = 0;
8572 	int err;
8573 
8574 	if (if_flowadv) {
8575 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8576 	}
8577 
8578 	if (if_delaybased_queue) {
8579 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8580 	}
8581 
8582 	if (ifp->if_output_sched_model ==
8583 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8584 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8585 	}
8586 	/* Inherit drop limit from the default queue */
8587 	if (ifp->if_snd != ifcq) {
8588 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8589 	}
8590 	/* Initialize transmit queue(s) */
8591 	err = ifclassq_setup(ifcq, ifp, sflags);
8592 	if (err != 0) {
8593 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8594 		    "err=%d", __func__, ifp, err);
8595 		/* NOTREACHED */
8596 	}
8597 }
8598 
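/*
 * dlil_ifclassq_setup() above assembles the packet scheduler flags from the
 * global knobs (if_flowadv enables flow-control advisories, if_delaybased_queue
 * selects delay-based queueing) plus the interface's output scheduling model,
 * inherits the packet drop limit from the default send queue when configuring
 * an auxiliary one, and panics if the transmit queue cannot be initialized,
 * since an interface cannot operate without one.
 */
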
8599 errno_t
8600 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8601 {
8602 #if SKYWALK
8603 	boolean_t netif_compat;
8604 	if_nexus_netif  nexus_netif;
8605 #endif /* SKYWALK */
8606 	struct ifnet *tmp_if;
8607 	struct ifaddr *ifa;
8608 	struct if_data_internal if_data_saved;
8609 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8610 	struct dlil_threading_info *dl_inp;
8611 	thread_continue_t thfunc = NULL;
8612 	int err;
8613 
8614 	if (ifp == NULL) {
8615 		return EINVAL;
8616 	}
8617 
8618 	/*
8619 	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8620 	 * prevent the interface from being configured while it is
8621 	 * embryonic, as ifnet_head_lock is dropped and reacquired
8622 	 * below prior to marking the ifnet with IFRF_ATTACHED.
8623 	 */
8624 	dlil_if_lock();
8625 	ifnet_head_lock_exclusive();
8626 	/* Verify we aren't already on the list */
8627 	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8628 		if (tmp_if == ifp) {
8629 			ifnet_head_done();
8630 			dlil_if_unlock();
8631 			return EEXIST;
8632 		}
8633 	}
8634 
8635 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8636 	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8637 		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8638 		    __func__, ifp);
8639 		/* NOTREACHED */
8640 	}
8641 	lck_mtx_unlock(&ifp->if_ref_lock);
8642 
8643 	ifnet_lock_exclusive(ifp);
8644 
8645 	/* Sanity check */
8646 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8647 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8648 	VERIFY(ifp->if_threads_pending == 0);
8649 
8650 	if (ll_addr != NULL) {
8651 		if (ifp->if_addrlen == 0) {
8652 			ifp->if_addrlen = ll_addr->sdl_alen;
8653 		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
8654 			ifnet_lock_done(ifp);
8655 			ifnet_head_done();
8656 			dlil_if_unlock();
8657 			return EINVAL;
8658 		}
8659 	}
8660 
8661 	/*
8662 	 * Allow interfaces without protocol families to attach
8663 	 * only if they have the necessary fields filled out.
8664 	 */
8665 	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
8666 		DLIL_PRINTF("%s: Attempt to attach interface without "
8667 		    "family module - %d\n", __func__, ifp->if_family);
8668 		ifnet_lock_done(ifp);
8669 		ifnet_head_done();
8670 		dlil_if_unlock();
8671 		return ENODEV;
8672 	}
8673 
8674 	/* Allocate protocol hash table */
8675 	VERIFY(ifp->if_proto_hash == NULL);
8676 	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
8677 	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8678 
8679 	lck_mtx_lock_spin(&ifp->if_flt_lock);
8680 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
8681 	TAILQ_INIT(&ifp->if_flt_head);
8682 	VERIFY(ifp->if_flt_busy == 0);
8683 	VERIFY(ifp->if_flt_waiters == 0);
8684 	VERIFY(ifp->if_flt_non_os_count == 0);
8685 	VERIFY(ifp->if_flt_no_tso_count == 0);
8686 	lck_mtx_unlock(&ifp->if_flt_lock);
8687 
8688 	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
8689 		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
8690 		LIST_INIT(&ifp->if_multiaddrs);
8691 	}
8692 
8693 	VERIFY(ifp->if_allhostsinm == NULL);
8694 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
8695 	TAILQ_INIT(&ifp->if_addrhead);
8696 
8697 	if (ifp->if_index == 0) {
8698 		int idx = if_next_index();
8699 
8700 		/*
8701 		 * Since we have exhausted the list of
8702 		 * if_index values, try to find an empty slot
8703 		 * in ifindex2ifnet.
8704 		 */
8705 		if (idx == -1 && if_index >= UINT16_MAX) {
8706 			for (int i = 1; i < if_index; i++) {
8707 				if (ifindex2ifnet[i] == NULL &&
8708 				    ifnet_addrs[i - 1] == NULL) {
8709 					idx = i;
8710 					break;
8711 				}
8712 			}
8713 		}
8714 		if (idx == -1) {
8715 			ifp->if_index = 0;
8716 			ifnet_lock_done(ifp);
8717 			ifnet_head_done();
8718 			dlil_if_unlock();
8719 			return ENOBUFS;
8720 		}
8721 		ifp->if_index = (uint16_t)idx;
8722 
8723 		/* the lladdr passed at attach time is the permanent address */
8724 		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
8725 		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
8726 			bcopy(CONST_LLADDR(ll_addr),
8727 			    dl_if->dl_if_permanent_ether,
8728 			    ETHER_ADDR_LEN);
8729 			dl_if->dl_if_permanent_ether_is_set = 1;
8730 		}
8731 	}
8732 	/* There should not be anything occupying this slot */
8733 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
8734 
8735 	/* allocate (if needed) and initialize a link address */
8736 	ifa = dlil_alloc_lladdr(ifp, ll_addr);
8737 	if (ifa == NULL) {
8738 		ifnet_lock_done(ifp);
8739 		ifnet_head_done();
8740 		dlil_if_unlock();
8741 		return ENOBUFS;
8742 	}
8743 
8744 	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
8745 	ifnet_addrs[ifp->if_index - 1] = ifa;
8746 
8747 	/* make this address the first on the list */
8748 	IFA_LOCK(ifa);
8749 	/* hold a reference for ifnet_addrs[] */
8750 	ifa_addref(ifa);
8751 	/* if_attach_link_ifa() holds a reference for ifa_link */
8752 	if_attach_link_ifa(ifp, ifa);
8753 	IFA_UNLOCK(ifa);
8754 
8755 	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
8756 	ifindex2ifnet[ifp->if_index] = ifp;
8757 
8758 	/* Hold a reference to the underlying dlil_ifnet */
8759 	ifnet_reference(ifp);
8760 
8761 	/* Clear stats (save and restore other fields that we care about) */
8762 	if_data_saved = ifp->if_data;
8763 	bzero(&ifp->if_data, sizeof(ifp->if_data));
8764 	ifp->if_data.ifi_type = if_data_saved.ifi_type;
8765 	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
8766 	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
8767 	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
8768 	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
8769 	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
8770 	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
8771 	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
8772 	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
8773 	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
8774 	ifnet_touch_lastchange(ifp);
8775 
8776 	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
8777 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
8778 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
8779 
8780 	dlil_ifclassq_setup(ifp, ifp->if_snd);
8781 
8782 	/* Sanity checks on the input thread storage */
8783 	dl_inp = &dl_if->dl_if_inpstorage;
8784 	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
8785 	VERIFY(dl_inp->dlth_flags == 0);
8786 	VERIFY(dl_inp->dlth_wtot == 0);
8787 	VERIFY(dl_inp->dlth_ifp == NULL);
8788 	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
8789 	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
8790 	VERIFY(!dl_inp->dlth_affinity);
8791 	VERIFY(ifp->if_inp == NULL);
8792 	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
8793 	VERIFY(dl_inp->dlth_strategy == NULL);
8794 	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
8795 	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
8796 	VERIFY(dl_inp->dlth_affinity_tag == 0);
8797 
8798 #if IFNET_INPUT_SANITY_CHK
8799 	VERIFY(dl_inp->dlth_pkts_cnt == 0);
8800 #endif /* IFNET_INPUT_SANITY_CHK */
8801 
8802 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
8803 	dlil_reset_rxpoll_params(ifp);
8804 	/*
8805 	 * A specific DLIL input thread is created per non-loopback interface.
8806 	 */
8807 	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
8808 		ifp->if_inp = dl_inp;
8809 		ifnet_incr_pending_thread_count(ifp);
8810 		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
8811 		if (err == ENODEV) {
8812 			VERIFY(thfunc == NULL);
8813 			ifnet_decr_pending_thread_count(ifp);
8814 		} else if (err != 0) {
8815 			panic_plain("%s: ifp=%p couldn't get an input thread; "
8816 			    "err=%d", __func__, ifp, err);
8817 			/* NOTREACHED */
8818 		}
8819 	}
8820 	/*
8821 	 * If the driver supports the new transmit model, calculate flow hash
8822 	 * and create a workloop starter thread to invoke the if_start callback
8823 	 * where the packets may be dequeued and transmitted.
8824 	 */
8825 	if (ifp->if_eflags & IFEF_TXSTART) {
8826 		thread_precedence_policy_data_t info;
8827 		__unused kern_return_t kret;
8828 
8829 		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
8830 		VERIFY(ifp->if_flowhash != 0);
8831 		VERIFY(ifp->if_start_thread == THREAD_NULL);
8832 
8833 		ifnet_set_start_cycle(ifp, NULL);
8834 		ifp->if_start_active = 0;
8835 		ifp->if_start_req = 0;
8836 		ifp->if_start_flags = 0;
8837 		VERIFY(ifp->if_start != NULL);
8838 		ifnet_incr_pending_thread_count(ifp);
8839 		if ((err = kernel_thread_start(ifnet_start_thread_func,
8840 		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
8841 			panic_plain("%s: "
8842 			    "ifp=%p couldn't get a start thread; "
8843 			    "err=%d", __func__, ifp, err);
8844 			/* NOTREACHED */
8845 		}
8846 		bzero(&info, sizeof(info));
8847 		info.importance = 1;
8848 		kret = thread_policy_set(ifp->if_start_thread,
8849 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8850 		    THREAD_PRECEDENCE_POLICY_COUNT);
8851 		ASSERT(kret == KERN_SUCCESS);
8852 	} else {
8853 		ifp->if_flowhash = 0;
8854 	}
8855 
8856 	/* Reset polling parameters */
8857 	ifnet_set_poll_cycle(ifp, NULL);
8858 	ifp->if_poll_update = 0;
8859 	ifp->if_poll_flags = 0;
8860 	ifp->if_poll_req = 0;
8861 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
8862 
8863 	/*
8864 	 * If the driver supports the new receive model, create a poller
8865 	 * thread to invoke the if_input_poll callback where packets may
8866 	 * be dequeued from the driver and processed for reception.
8867 	 * If the interface is netif compat, the poller thread is
8868 	 * managed by netif.
8869 	 */
8870 	if (thfunc == dlil_rxpoll_input_thread_func) {
8871 		thread_precedence_policy_data_t info;
8872 		__unused kern_return_t kret;
8873 #if SKYWALK
8874 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
8875 #endif /* SKYWALK */
8876 		VERIFY(ifp->if_input_poll != NULL);
8877 		VERIFY(ifp->if_input_ctl != NULL);
8878 		ifnet_incr_pending_thread_count(ifp);
8879 		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
8880 		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
8881 			panic_plain("%s: ifp=%p couldn't get a poll thread; "
8882 			    "err=%d", __func__, ifp, err);
8883 			/* NOTREACHED */
8884 		}
8885 		bzero(&info, sizeof(info));
8886 		info.importance = 1;
8887 		kret = thread_policy_set(ifp->if_poll_thread,
8888 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8889 		    THREAD_PRECEDENCE_POLICY_COUNT);
8890 		ASSERT(kret == KERN_SUCCESS);
8891 	}
8892 
8893 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
8894 	VERIFY(ifp->if_desc.ifd_len == 0);
8895 	VERIFY(ifp->if_desc.ifd_desc != NULL);
8896 
8897 	/* Record attach PC stacktrace */
8898 	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
8899 
8900 	ifp->if_updatemcasts = 0;
8901 	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
8902 		struct ifmultiaddr *ifma;
8903 		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
8904 			IFMA_LOCK(ifma);
8905 			if (ifma->ifma_addr->sa_family == AF_LINK ||
8906 			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
8907 				ifp->if_updatemcasts++;
8908 			}
8909 			IFMA_UNLOCK(ifma);
8910 		}
8911 
8912 		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
8913 		    "membership(s)\n", if_name(ifp),
8914 		    ifp->if_updatemcasts);
8915 	}
8916 
8917 	/* Clear logging parameters */
8918 	bzero(&ifp->if_log, sizeof(ifp->if_log));
8919 
8920 	/* Clear foreground/realtime activity timestamps */
8921 	ifp->if_fg_sendts = 0;
8922 	ifp->if_rt_sendts = 0;
8923 
8924 	/* Clear throughput estimates and radio type */
8925 	ifp->if_estimated_up_bucket = 0;
8926 	ifp->if_estimated_down_bucket = 0;
8927 	ifp->if_radio_type = 0;
8928 	ifp->if_radio_channel = 0;
8929 
8930 	VERIFY(ifp->if_delegated.ifp == NULL);
8931 	VERIFY(ifp->if_delegated.type == 0);
8932 	VERIFY(ifp->if_delegated.family == 0);
8933 	VERIFY(ifp->if_delegated.subfamily == 0);
8934 	VERIFY(ifp->if_delegated.expensive == 0);
8935 	VERIFY(ifp->if_delegated.constrained == 0);
8936 	VERIFY(ifp->if_delegated.ultra_constrained == 0);
8937 
8938 	VERIFY(ifp->if_agentids == NULL);
8939 	VERIFY(ifp->if_agentcount == 0);
8940 
8941 	/* Reset interface state */
8942 	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
8943 	ifp->if_interface_state.valid_bitmask |=
8944 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
8945 	ifp->if_interface_state.interface_availability =
8946 	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
8947 
8948 	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
8949 	if (ifp == lo_ifp) {
8950 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
8951 		ifp->if_interface_state.valid_bitmask |=
8952 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
8953 	} else {
8954 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
8955 	}
8956 
8957 	/*
8958 	 * Enable ECN capability on this interface depending on the
8959 	 * value of ECN global setting
8960 	 * value of the ECN global setting.
8961 	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
8962 		if_set_eflags(ifp, IFEF_ECN_ENABLE);
8963 		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
8964 	}
8965 
8966 	/*
8967 	 * Built-in Cyclops always on policy for WiFi infra
8968 	 */
8969 	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
8970 		errno_t error;
8971 
8972 		error = if_set_qosmarking_mode(ifp,
8973 		    IFRTYPE_QOSMARKING_FASTLANE);
8974 		if (error != 0) {
8975 			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
8976 			    __func__, ifp->if_xname, error);
8977 		} else {
8978 			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
8979 #if (DEVELOPMENT || DEBUG)
8980 			DLIL_PRINTF("%s fastlane enabled on %s\n",
8981 			    __func__, ifp->if_xname);
8982 #endif /* (DEVELOPMENT || DEBUG) */
8983 		}
8984 	}
8985 
8986 	ifnet_lock_done(ifp);
8987 	ifnet_head_done();
8988 
8989 #if SKYWALK
8990 	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
8991 #endif /* SKYWALK */
8992 
8993 	lck_mtx_lock(&ifp->if_cached_route_lock);
8994 	/* Enable forwarding cached route */
8995 	ifp->if_fwd_cacheok = 1;
8996 	/* Clean up any existing cached routes */
8997 	ROUTE_RELEASE(&ifp->if_fwd_route);
8998 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
8999 	ROUTE_RELEASE(&ifp->if_src_route);
9000 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9001 	ROUTE_RELEASE(&ifp->if_src_route6);
9002 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9003 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9004 
9005 	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9006 
9007 	/*
9008 	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9009 	 * and trees; do this before the ifnet is marked as attached.
9010 	 * The ifnet keeps the reference to the info structures even after
9011 	 * the ifnet is detached, since the network-layer records still
9012 	 * refer to the info structures even after that.  This also
9013 	 * makes it possible for them to still function after the ifnet
9014 	 * is recycled or reattached.
9015 	 */
9016 #if INET
9017 	if (IGMP_IFINFO(ifp) == NULL) {
9018 		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9019 		VERIFY(IGMP_IFINFO(ifp) != NULL);
9020 	} else {
9021 		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9022 		igmp_domifreattach(IGMP_IFINFO(ifp));
9023 	}
9024 #endif /* INET */
9025 	if (MLD_IFINFO(ifp) == NULL) {
9026 		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9027 		VERIFY(MLD_IFINFO(ifp) != NULL);
9028 	} else {
9029 		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9030 		mld_domifreattach(MLD_IFINFO(ifp));
9031 	}
9032 
9033 	VERIFY(ifp->if_data_threshold == 0);
9034 	VERIFY(ifp->if_dt_tcall != NULL);
9035 
9036 	/*
9037 	 * Wait for the created kernel threads for I/O to get
9038 	 * scheduled and run at least once before we proceed
9039 	 * to mark interface as attached.
9040 	 * to mark the interface as attached.
9041 	lck_mtx_lock(&ifp->if_ref_lock);
9042 	while (ifp->if_threads_pending != 0) {
9043 		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9044 		    "interface %s to get scheduled at least once.\n",
9045 		    __func__, ifp->if_xname);
9046 		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9047 		    __func__, NULL);
9048 		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9049 	}
9050 	lck_mtx_unlock(&ifp->if_ref_lock);
9051 	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9052 	    "at least once. Proceeding.\n", __func__, ifp->if_xname);
9053 
9054 	/* Finally, mark this ifnet as attached. */
9055 	ifnet_lock_exclusive(ifp);
9056 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9057 	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9058 	lck_mtx_unlock(&ifp->if_ref_lock);
9059 	if (net_rtref) {
9060 		/* boot-args override; enable idle notification */
9061 		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9062 		    IFRF_IDLE_NOTIFY);
9063 	} else {
9064 		/* apply previous request(s) to set the idle flags, if any */
9065 		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9066 		    ifp->if_idle_new_flags_mask);
9067 	}
9068 #if SKYWALK
9069 	/* the interface is fully attached; let the nexus adapter know */
9070 	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9071 		if (netif_compat) {
9072 			if (sk_netif_compat_txmodel ==
9073 			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9074 				ifnet_enqueue_multi_setup(ifp,
9075 				    sk_tx_delay_qlen, sk_tx_delay_timeout);
9076 			}
9077 			ifp->if_nx_netif = nexus_netif;
9078 		}
9079 		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9080 	}
9081 #endif /* SKYWALK */
9082 	ifnet_lock_done(ifp);
9083 	dlil_if_unlock();
9084 
9085 #if PF
9086 	/*
9087 	 * Attach packet filter to this interface, if enabled.
9088 	 */
9089 	pf_ifnet_hook(ifp, 1);
9090 #endif /* PF */
9091 
9092 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9093 
9094 	if (dlil_verbose) {
9095 		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9096 		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9097 	}
9098 
9099 	return 0;
9100 }
9101 
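/*
 * In summary, ifnet_attach() above: (1) serializes the attach under
 * dlil_if_lock and verifies the ifnet is still embryonic; (2) allocates the
 * protocol hash table and, if needed, an if_index slot; (3) installs the
 * permanent link-level ifaddr via dlil_alloc_lladdr() and publishes the ifnet
 * in ifnet_head/ifindex2ifnet; (4) resets statistics and sets up the transmit
 * classq; (5) spawns the per-interface input, starter and poller threads as
 * required by the driver model; and (6) after all pending threads have been
 * scheduled at least once, marks the ifnet IFRF_ATTACHED|IFRF_READY, hooks it
 * into PF and posts KEV_DL_IF_ATTACHED.
 */
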
9102 /*
9103  * Prepare the storage for the first/permanent link address, which must
9104  * have the same lifetime as the ifnet itself.  Although the link
9105  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9106  * its location in memory must never change as it may still be referred
9107  * to by some parts of the system afterwards (unfortunate implementation
9108  * artifacts inherited from BSD.)
9109  *
9110  * Caller must hold ifnet lock as writer.
9111  */
9112 static struct ifaddr *
9113 dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
9114 {
9115 	struct ifaddr *ifa, *oifa = NULL;
9116 	struct sockaddr_dl *addr_sdl, *mask_sdl;
9117 	char workbuf[IFNAMSIZ * 2];
9118 	int namelen, masklen, socksize;
9119 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
9120 
9121 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
9122 	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);
9123 
9124 	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
9125 	    if_name(ifp));
9126 	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
9127 	    + ((namelen > 0) ? namelen : 0);
9128 	socksize = masklen + ifp->if_addrlen;
9129 #define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
9130 	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
9131 		socksize = sizeof(struct sockaddr_dl);
9132 	}
9133 	socksize = ROUNDUP(socksize);
9134 #undef ROUNDUP
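	/*
	 * Worked example (assuming the common case of a 6-byte Ethernet
	 * address on an interface named "en0"): namelen = 3, masklen =
	 * offsetof(struct sockaddr_dl, sdl_data[0]) + 3 = 11, so masklen +
	 * if_addrlen = 17, which is smaller than sizeof(struct sockaddr_dl)
	 * and therefore rounds up to 20 bytes of storage.
	 */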
9135 
9136 	ifa = ifp->if_lladdr;
9137 	if (socksize > DLIL_SDLMAXLEN ||
9138 	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
9139 		/*
9140 		 * Rare, but in the event that the link address requires
9141 		 * more storage space than DLIL_SDLMAXLEN, allocate the
9142 		 * largest possible storage for address and mask, such
9143 		 * that we can reuse the same space when if_addrlen grows.
9144 		 * This same space will be used when if_addrlen shrinks.
9145 		 */
9146 		struct dl_if_lladdr_xtra_space *__single dl_if_lladdr_ext;
9147 
9148 		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
9149 			dl_if_lladdr_ext = zalloc_permanent(
9150 				sizeof(*dl_if_lladdr_ext), ZALIGN(struct ifaddr));
9151 
9152 			ifa = &dl_if_lladdr_ext->ifa;
9153 			ifa_lock_init(ifa);
9154 			ifa_initref(ifa);
9155 			/* Don't set IFD_ALLOC, as this is permanent */
9156 			ifa->ifa_debug = IFD_LINK;
9157 		} else {
9158 			dl_if_lladdr_ext = __unsafe_forge_single(
9159 				struct dl_if_lladdr_xtra_space*, ifa);
9160 			ifa = &dl_if_lladdr_ext->ifa;
9161 		}
9162 
9163 		IFA_LOCK(ifa);
9164 		/* address and mask sockaddr_dl locations */
9165 		bzero(dl_if_lladdr_ext->addr_sdl_bytes,
9166 		    sizeof(dl_if_lladdr_ext->addr_sdl_bytes));
9167 		bzero(dl_if_lladdr_ext->mask_sdl_bytes,
9168 		    sizeof(dl_if_lladdr_ext->mask_sdl_bytes));
9169 		addr_sdl = SDL(dl_if_lladdr_ext->addr_sdl_bytes);
9170 		mask_sdl = SDL(dl_if_lladdr_ext->mask_sdl_bytes);
9171 	} else {
9172 		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
9173 		/*
9174 		 * Use the storage areas for address and mask within the
9175 		 * dlil_ifnet structure.  This is the most common case.
9176 		 */
9177 		if (ifa == NULL) {
9178 			ifa = &dl_if->dl_if_lladdr.ifa;
9179 			ifa_lock_init(ifa);
9180 			ifa_initref(ifa);
9181 			/* Don't set IFD_ALLOC, as this is permanent */
9182 			ifa->ifa_debug = IFD_LINK;
9183 		}
9184 		IFA_LOCK(ifa);
9185 		/* address and mask sockaddr_dl locations */
9186 		bzero(dl_if->dl_if_lladdr.addr_sdl_bytes,
9187 		    sizeof(dl_if->dl_if_lladdr.addr_sdl_bytes));
9188 		bzero(dl_if->dl_if_lladdr.mask_sdl_bytes,
9189 		    sizeof(dl_if->dl_if_lladdr.mask_sdl_bytes));
9190 		addr_sdl = SDL(dl_if->dl_if_lladdr.addr_sdl_bytes);
9191 		mask_sdl = SDL(dl_if->dl_if_lladdr.mask_sdl_bytes);
9192 	}
9193 
9194 	if (ifp->if_lladdr != ifa) {
9195 		oifa = ifp->if_lladdr;
9196 		ifp->if_lladdr = ifa;
9197 	}
9198 
9199 	VERIFY(ifa->ifa_debug == IFD_LINK);
9200 	ifa->ifa_ifp = ifp;
9201 	ifa->ifa_rtrequest = link_rtrequest;
9202 	ifa->ifa_addr = SA(addr_sdl);
9203 	addr_sdl->sdl_len = (u_char)socksize;
9204 	addr_sdl->sdl_family = AF_LINK;
9205 	if (namelen > 0) {
9206 		bcopy(workbuf, addr_sdl->sdl_data, min(namelen,
9207 		    sizeof(addr_sdl->sdl_data)));
9208 		addr_sdl->sdl_nlen = (u_char)namelen;
9209 	} else {
9210 		addr_sdl->sdl_nlen = 0;
9211 	}
9212 	addr_sdl->sdl_index = ifp->if_index;
9213 	addr_sdl->sdl_type = ifp->if_type;
9214 	if (ll_addr != NULL) {
9215 		addr_sdl->sdl_alen = ll_addr->sdl_alen;
9216 		bcopy(CONST_LLADDR(ll_addr), LLADDR(addr_sdl), addr_sdl->sdl_alen);
9217 	} else {
9218 		addr_sdl->sdl_alen = 0;
9219 	}
9220 	ifa->ifa_netmask = SA(mask_sdl);
9221 	mask_sdl->sdl_len = (u_char)masklen;
9222 	while (namelen > 0) {
9223 		mask_sdl->sdl_data[--namelen] = 0xff;
9224 	}
9225 	IFA_UNLOCK(ifa);
9226 
9227 	if (oifa != NULL) {
9228 		ifa_remref(oifa);
9229 	}
9230 
9231 	return ifa;
9232 }
9233 
9234 static void
9235 if_purgeaddrs(struct ifnet *ifp)
9236 {
9237 #if INET
9238 	in_purgeaddrs(ifp);
9239 #endif /* INET */
9240 	in6_purgeaddrs(ifp);
9241 }
9242 
9243 errno_t
9244 ifnet_detach(ifnet_t ifp)
9245 {
9246 	struct ifnet *delegated_ifp;
9247 	struct nd_ifinfo *ndi = NULL;
9248 
9249 	if (ifp == NULL) {
9250 		return EINVAL;
9251 	}
9252 
9253 	ndi = ND_IFINFO(ifp);
9254 	if (NULL != ndi) {
9255 		ndi->cga_initialized = FALSE;
9256 	}
9257 
9258 	/* Mark the interface down */
9259 	if_down(ifp);
9260 
9261 	/*
9262 	 * IMPORTANT NOTE
9263 	 *
9264 	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
9265 	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
9266 	 * until after we've waited for all I/O references to drain
9267 	 * in ifnet_detach_final().
9268 	 */
9269 
9270 	ifnet_head_lock_exclusive();
9271 	ifnet_lock_exclusive(ifp);
9272 
9273 	if (ifp->if_output_netem != NULL) {
9274 		netem_destroy(ifp->if_output_netem);
9275 		ifp->if_output_netem = NULL;
9276 	}
9277 
9278 	/*
9279 	 * Check to see if this interface has previously triggered
9280 	 * aggressive protocol draining; if so, decrement the global
9281 	 * refcnt and clear PR_AGGDRAIN on the route domain if
9282 	 * there are no more of such an interface around.
9283 	 */
9284 	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
9285 
9286 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9287 	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
9288 		lck_mtx_unlock(&ifp->if_ref_lock);
9289 		ifnet_lock_done(ifp);
9290 		ifnet_head_done();
9291 		return EINVAL;
9292 	} else if (ifp->if_refflags & IFRF_DETACHING) {
9293 		/* Interface has already been detached */
9294 		lck_mtx_unlock(&ifp->if_ref_lock);
9295 		ifnet_lock_done(ifp);
9296 		ifnet_head_done();
9297 		return ENXIO;
9298 	}
9299 	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
9300 	/* Indicate this interface is being detached */
9301 	ifp->if_refflags &= ~IFRF_ATTACHED;
9302 	ifp->if_refflags |= IFRF_DETACHING;
9303 	lck_mtx_unlock(&ifp->if_ref_lock);
9304 
9305 	if (dlil_verbose) {
9306 		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
9307 	}
9308 
9309 	/* clean up the flow control entry object, if any */
9310 	if (ifp->if_eflags & IFEF_TXSTART) {
9311 		ifnet_flowadv(ifp->if_flowhash);
9312 	}
9313 
9314 	/* Reset ECN enable/disable flags */
9315 	/* Reset CLAT46 flag */
9316 	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);
9317 
9318 	/*
9319 	 * We do not reset the TCP keep alive counters in case
9320 	 * a TCP connection stays connected after the interface
9321 	 * went down
9322 	 */
9323 	if (ifp->if_tcp_kao_cnt > 0) {
9324 		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
9325 		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
9326 	}
9327 	ifp->if_tcp_kao_max = 0;
9328 
9329 	/*
9330 	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
9331 	 * no longer be visible during lookups from this point.
9332 	 */
9333 	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
9334 	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
9335 	ifp->if_link.tqe_next = NULL;
9336 	ifp->if_link.tqe_prev = NULL;
9337 	if (ifp->if_ordered_link.tqe_next != NULL ||
9338 	    ifp->if_ordered_link.tqe_prev != NULL) {
9339 		ifnet_remove_from_ordered_list(ifp);
9340 	}
9341 	ifindex2ifnet[ifp->if_index] = NULL;
9342 
9343 	/* 18717626 - reset router mode */
9344 	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
9345 	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;
9346 
9347 	/* Record detach PC stacktrace */
9348 	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
9349 
9350 	/* Clear logging parameters */
9351 	bzero(&ifp->if_log, sizeof(ifp->if_log));
9352 
9353 	/* Clear delegated interface info (reference released below) */
9354 	delegated_ifp = ifp->if_delegated.ifp;
9355 	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));
9356 
9357 	/* Reset interface state */
9358 	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9359 
9360 	/*
9361 	 * Increment the generation count on interface deletion
9362 	 */
9363 	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);
9364 
9365 	ifnet_lock_done(ifp);
9366 	ifnet_head_done();
9367 
9368 	/* Release reference held on the delegated interface */
9369 	if (delegated_ifp != NULL) {
9370 		ifnet_release(delegated_ifp);
9371 	}
9372 
9373 	/* Reset Link Quality Metric (unless loopback [lo0]) */
9374 	if (ifp != lo_ifp) {
9375 		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
9376 	}
9377 
9378 	/* Reset TCP local statistics */
9379 	if (ifp->if_tcp_stat != NULL) {
9380 		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
9381 	}
9382 
9383 	/* Reset UDP local statistics */
9384 	if (ifp->if_udp_stat != NULL) {
9385 		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
9386 	}
9387 
9388 	/* Reset ifnet IPv4 stats */
9389 	if (ifp->if_ipv4_stat != NULL) {
9390 		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
9391 	}
9392 
9393 	/* Reset ifnet IPv6 stats */
9394 	if (ifp->if_ipv6_stat != NULL) {
9395 		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
9396 	}
9397 
9398 	/* Release memory held for interface link status report */
9399 	if (ifp->if_link_status != NULL) {
9400 		kfree_type(struct if_link_status, ifp->if_link_status);
9401 		ifp->if_link_status = NULL;
9402 	}
9403 
9404 	/* Disable forwarding cached route */
9405 	lck_mtx_lock(&ifp->if_cached_route_lock);
9406 	ifp->if_fwd_cacheok = 0;
9407 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9408 
9409 	/* Disable data threshold and wait for any pending event posting */
9410 	ifp->if_data_threshold = 0;
9411 	VERIFY(ifp->if_dt_tcall != NULL);
9412 	(void) thread_call_cancel_wait(ifp->if_dt_tcall);
9413 
9414 	/*
9415 	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
9416 	 * references to the info structures and leave them attached to
9417 	 * this ifnet.
9418 	 */
9419 #if INET
9420 	igmp_domifdetach(ifp);
9421 #endif /* INET */
9422 	mld_domifdetach(ifp);
9423 
9424 #if SKYWALK
9425 	/* Clean up any netns tokens still pointing to this ifnet */
9426 	netns_ifnet_detach(ifp);
9427 #endif /* SKYWALK */
9428 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);
9429 
9430 	/* Let worker thread take care of the rest, to avoid reentrancy */
9431 	dlil_if_lock();
9432 	ifnet_detaching_enqueue(ifp);
9433 	dlil_if_unlock();
9434 
9435 	return 0;
9436 }
9437 
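/*
 * Detach is intentionally two-phased: ifnet_detach() above only marks the
 * interface IFRF_DETACHING, removes it from ifnet_head/ifindex2ifnet and
 * queues it for the detacher thread; the heavyweight teardown (draining I/O
 * references, detaching filters and protocols, terminating the worker
 * threads) happens later in ifnet_detach_final(), which avoids re-entering
 * the caller's context.
 */
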
9438 static void
9439 ifnet_detaching_enqueue(struct ifnet *ifp)
9440 {
9441 	dlil_if_lock_assert();
9442 
9443 	++ifnet_detaching_cnt;
9444 	VERIFY(ifnet_detaching_cnt != 0);
9445 	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9446 	wakeup((caddr_t)&ifnet_delayed_run);
9447 }
9448 
9449 static struct ifnet *
9450 ifnet_detaching_dequeue(void)
9451 {
9452 	struct ifnet *ifp;
9453 
9454 	dlil_if_lock_assert();
9455 
9456 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9457 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9458 	if (ifp != NULL) {
9459 		VERIFY(ifnet_detaching_cnt != 0);
9460 		--ifnet_detaching_cnt;
9461 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9462 		ifp->if_detaching_link.tqe_next = NULL;
9463 		ifp->if_detaching_link.tqe_prev = NULL;
9464 	}
9465 	return ifp;
9466 }
9467 
9468 __attribute__((noreturn))
9469 static void
9470 ifnet_detacher_thread_cont(void *v, wait_result_t wres)
9471 {
9472 #pragma unused(v, wres)
9473 	struct ifnet *ifp;
9474 
9475 	dlil_if_lock();
9476 	if (__improbable(ifnet_detaching_embryonic)) {
9477 		ifnet_detaching_embryonic = FALSE;
9478 		/* there's no lock ordering constraint, so it's OK to do this here */
9479 		dlil_decr_pending_thread_count();
9480 	}
9481 
9482 	for (;;) {
9483 		dlil_if_lock_assert();
9484 
9485 		if (ifnet_detaching_cnt == 0) {
9486 			break;
9487 		}
9488 
9489 		net_update_uptime();
9490 
9491 		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
9492 
9493 		/* Take care of detaching ifnet */
9494 		ifp = ifnet_detaching_dequeue();
9495 		if (ifp != NULL) {
9496 			dlil_if_unlock();
9497 			ifnet_detach_final(ifp);
9498 			dlil_if_lock();
9499 		}
9500 	}
9501 
9502 	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
9503 	dlil_if_unlock();
9504 	(void) thread_block(ifnet_detacher_thread_cont);
9505 
9506 	VERIFY(0);      /* we should never get here */
9507 	/* NOTREACHED */
9508 	__builtin_unreachable();
9509 }
9510 
9511 __dead2
9512 static void
9513 ifnet_detacher_thread_func(void *v, wait_result_t w)
9514 {
9515 #pragma unused(v, w)
9516 	dlil_if_lock();
9517 	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
9518 	ifnet_detaching_embryonic = TRUE;
9519 	/* wake up once to get out of embryonic state */
9520 	wakeup((caddr_t)&ifnet_delayed_run);
9521 	dlil_if_unlock();
9522 	(void) thread_block(ifnet_detacher_thread_cont);
9523 	VERIFY(0);
9524 	/* NOTREACHED */
9525 	__builtin_unreachable();
9526 }
9527 
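/*
 * The detacher thread uses the classic continuation pattern: it parks on
 * ifnet_delayed_run via assert_wait()/thread_block(ifnet_detacher_thread_cont)
 * and is woken by ifnet_detaching_enqueue().  The one-shot "embryonic"
 * handshake above simply guarantees that dlil_decr_pending_thread_count()
 * runs once the thread has actually been scheduled.
 */
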
9528 static void
9529 ifnet_detach_final(struct ifnet *ifp)
9530 {
9531 	struct ifnet_filter *filter, *filter_next;
9532 	struct dlil_ifnet *dlifp;
9533 	struct ifnet_filter_head fhead;
9534 	struct dlil_threading_info *inp;
9535 	struct ifaddr *ifa;
9536 	ifnet_detached_func if_free;
9537 	int i;
9538 	bool waited = false;
9539 
9540 	/* Let BPF know we're detaching */
9541 	bpfdetach(ifp);
9542 
9543 #if SKYWALK
9544 	dlil_netif_detach_notify(ifp);
9545 	/*
9546 	 * Wait for the datapath to quiesce before tearing down
9547 	 * netif/flowswitch nexuses.
9548 	 */
9549 	dlil_quiesce_and_detach_nexuses(ifp);
9550 #endif /* SKYWALK */
9551 
9552 	lck_mtx_lock(&ifp->if_ref_lock);
9553 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9554 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9555 		    __func__, ifp);
9556 		/* NOTREACHED */
9557 	}
9558 
9559 	/*
9560 	 * Wait until the existing IO references get released
9561 	 * before we proceed with ifnet_detach.  This is not a
9562 	 * common case, so block without using a continuation.
9563 	 */
9564 	while (ifp->if_refio > 0) {
9565 		waited = true;
9566 		DLIL_PRINTF("%s: %s waiting for IO references to drain\n",
9567 		    __func__, if_name(ifp));
9568 		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9569 		    (PZERO - 1), "ifnet_ioref_wait", NULL);
9570 	}
9571 	if (waited) {
9572 		DLIL_PRINTF("%s: %s IO references drained\n",
9573 		    __func__, if_name(ifp));
9574 	}
9575 	VERIFY(ifp->if_datamov == 0);
9576 	VERIFY(ifp->if_drainers == 0);
9577 	VERIFY(ifp->if_suspend == 0);
9578 	ifp->if_refflags &= ~IFRF_READY;
9579 	lck_mtx_unlock(&ifp->if_ref_lock);
9580 
9581 #if SKYWALK
9582 	VERIFY(LIST_EMPTY(&ifp->if_netns_tokens));
9583 #endif /* SKYWALK */
9584 	/* Drain and destroy send queue */
9585 	ifclassq_teardown(ifp->if_snd);
9586 
9587 	/* Detach interface filters */
9588 	lck_mtx_lock(&ifp->if_flt_lock);
9589 	if_flt_monitor_enter(ifp);
9590 
9591 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9592 	fhead = ifp->if_flt_head;
9593 	TAILQ_INIT(&ifp->if_flt_head);
9594 
9595 	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9596 		filter_next = TAILQ_NEXT(filter, filt_next);
9597 		lck_mtx_unlock(&ifp->if_flt_lock);
9598 
9599 		dlil_detach_filter_internal(filter, 1);
9600 		lck_mtx_lock(&ifp->if_flt_lock);
9601 	}
9602 	if_flt_monitor_leave(ifp);
9603 	lck_mtx_unlock(&ifp->if_flt_lock);
9604 
9605 	/* Tell upper layers to drop their network addresses */
9606 	if_purgeaddrs(ifp);
9607 
9608 	ifnet_lock_exclusive(ifp);
9609 
9610 	/* Clear agent IDs */
9611 	if (ifp->if_agentids != NULL) {
9612 		kfree_data(ifp->if_agentids,
9613 		    sizeof(uuid_t) * ifp->if_agentcount);
9614 		ifp->if_agentids = NULL;
9615 	}
9616 	ifp->if_agentcount = 0;
9617 
9618 	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
9619 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
9620 
9621 	/* Unplumb all protocols */
9622 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9623 		struct if_proto *proto;
9624 
9625 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9626 		while (proto != NULL) {
9627 			protocol_family_t family = proto->protocol_family;
9628 			ifnet_lock_done(ifp);
9629 			proto_unplumb(family, ifp);
9630 			ifnet_lock_exclusive(ifp);
9631 			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9632 		}
9633 		/* There should not be any protocols left */
9634 		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9635 	}
9636 	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9637 	ifp->if_proto_hash = NULL;
9638 
9639 	/* Detach (permanent) link address from if_addrhead */
9640 	ifa = TAILQ_FIRST(&ifp->if_addrhead);
9641 	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9642 	IFA_LOCK(ifa);
9643 	if_detach_link_ifa(ifp, ifa);
9644 	IFA_UNLOCK(ifa);
9645 
9646 	/* Remove (permanent) link address from ifnet_addrs[] */
9647 	ifa_remref(ifa);
9648 	ifnet_addrs[ifp->if_index - 1] = NULL;
9649 
9650 	/* This interface should not be on {ifnet_head,detaching} */
9651 	VERIFY(ifp->if_link.tqe_next == NULL);
9652 	VERIFY(ifp->if_link.tqe_prev == NULL);
9653 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9654 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9655 	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9656 	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9657 
9658 	/* The slot should have been emptied */
9659 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9660 
9661 	/* There should not be any addresses left */
9662 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9663 
9664 	/*
9665 	 * Signal the starter thread to terminate itself, and wait until
9666 	 * it has exited.
9667 	 */
9668 	if (ifp->if_start_thread != THREAD_NULL) {
9669 		lck_mtx_lock_spin(&ifp->if_start_lock);
9670 		ifp->if_start_flags |= IFSF_TERMINATING;
9671 		wakeup_one((caddr_t)&ifp->if_start_thread);
9672 		lck_mtx_unlock(&ifp->if_start_lock);
9673 
9674 		/* wait for starter thread to terminate */
9675 		lck_mtx_lock(&ifp->if_start_lock);
9676 		while (ifp->if_start_thread != THREAD_NULL) {
9677 			if (dlil_verbose) {
9678 				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9679 				    __func__,
9680 				    if_name(ifp));
9681 			}
9682 			(void) msleep(&ifp->if_start_thread,
9683 			    &ifp->if_start_lock, (PZERO - 1),
9684 			    "ifnet_start_thread_exit", NULL);
9685 		}
9686 		lck_mtx_unlock(&ifp->if_start_lock);
9687 		if (dlil_verbose) {
9688 			DLIL_PRINTF("%s: %s starter thread termination complete",
9689 			    __func__, if_name(ifp));
9690 		}
9691 	}
9692 
9693 	/*
9694 	 * Signal the poller thread to terminate itself, and wait until
9695 	 * it has exited.
9696 	 */
9697 	if (ifp->if_poll_thread != THREAD_NULL) {
9698 #if SKYWALK
9699 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9700 #endif /* SKYWALK */
9701 		lck_mtx_lock_spin(&ifp->if_poll_lock);
9702 		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9703 		wakeup_one((caddr_t)&ifp->if_poll_thread);
9704 		lck_mtx_unlock(&ifp->if_poll_lock);
9705 
9706 		/* wait for poller thread to terminate */
9707 		lck_mtx_lock(&ifp->if_poll_lock);
9708 		while (ifp->if_poll_thread != THREAD_NULL) {
9709 			if (dlil_verbose) {
9710 				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9711 				    __func__,
9712 				    if_name(ifp));
9713 			}
9714 			(void) msleep(&ifp->if_poll_thread,
9715 			    &ifp->if_poll_lock, (PZERO - 1),
9716 			    "ifnet_poll_thread_exit", NULL);
9717 		}
9718 		lck_mtx_unlock(&ifp->if_poll_lock);
9719 		if (dlil_verbose) {
9720 			DLIL_PRINTF("%s: %s poller thread termination complete\n",
9721 			    __func__, if_name(ifp));
9722 		}
9723 	}
9724 
9725 	/*
9726 	 * If thread affinity was set for the workloop thread, we will need
9727 	 * to tear down the affinity and release the extra reference count
9728 	 * taken at attach time.  Does not apply to lo0 or other interfaces
9729 	 * without dedicated input threads.
9730 	 */
9731 	if ((inp = ifp->if_inp) != NULL) {
9732 		VERIFY(inp != dlil_main_input_thread);
9733 
9734 		if (inp->dlth_affinity) {
9735 			struct thread *tp, *wtp, *ptp;
9736 
9737 			lck_mtx_lock_spin(&inp->dlth_lock);
9738 			wtp = inp->dlth_driver_thread;
9739 			inp->dlth_driver_thread = THREAD_NULL;
9740 			ptp = inp->dlth_poller_thread;
9741 			inp->dlth_poller_thread = THREAD_NULL;
9742 			ASSERT(inp->dlth_thread != THREAD_NULL);
9743 			tp = inp->dlth_thread;    /* don't nullify now */
9744 			inp->dlth_affinity_tag = 0;
9745 			inp->dlth_affinity = FALSE;
9746 			lck_mtx_unlock(&inp->dlth_lock);
9747 
9748 			/* Tear down poll thread affinity */
9749 			if (ptp != NULL) {
9750 				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9751 				VERIFY(ifp->if_xflags & IFXF_LEGACY);
9752 				(void) dlil_affinity_set(ptp,
9753 				    THREAD_AFFINITY_TAG_NULL);
9754 				thread_deallocate(ptp);
9755 			}
9756 
9757 			/* Tear down workloop thread affinity */
9758 			if (wtp != NULL) {
9759 				(void) dlil_affinity_set(wtp,
9760 				    THREAD_AFFINITY_TAG_NULL);
9761 				thread_deallocate(wtp);
9762 			}
9763 
9764 			/* Tear down DLIL input thread affinity */
9765 			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9766 			thread_deallocate(tp);
9767 		}
9768 
9769 		/* disassociate ifp DLIL input thread */
9770 		ifp->if_inp = NULL;
9771 
9772 		/* if the worker thread was created, tell it to terminate */
9773 		if (inp->dlth_thread != THREAD_NULL) {
9774 			lck_mtx_lock_spin(&inp->dlth_lock);
9775 			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9776 			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9777 				wakeup_one((caddr_t)&inp->dlth_flags);
9778 			}
9779 			lck_mtx_unlock(&inp->dlth_lock);
9780 			ifnet_lock_done(ifp);
9781 
9782 			/* wait for the input thread to terminate */
9783 			lck_mtx_lock_spin(&inp->dlth_lock);
9784 			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9785 			    == 0) {
9786 				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9787 				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9788 			}
9789 			lck_mtx_unlock(&inp->dlth_lock);
9790 			ifnet_lock_exclusive(ifp);
9791 		}
9792 
9793 		/* clean-up input thread state */
9794 		dlil_clean_threading_info(inp);
9795 		/* clean-up poll parameters */
9796 		VERIFY(ifp->if_poll_thread == THREAD_NULL);
9797 		dlil_reset_rxpoll_params(ifp);
9798 	}
9799 
9800 	/* The driver might unload, so point these to ourselves */
9801 	if_free = ifp->if_free;
9802 	ifp->if_output_dlil = ifp_if_output;
9803 	ifp->if_output = ifp_if_output;
9804 	ifp->if_pre_enqueue = ifp_if_output;
9805 	ifp->if_start = ifp_if_start;
9806 	ifp->if_output_ctl = ifp_if_ctl;
9807 	ifp->if_input_dlil = ifp_if_input;
9808 	ifp->if_input_poll = ifp_if_input_poll;
9809 	ifp->if_input_ctl = ifp_if_ctl;
9810 	ifp->if_ioctl = ifp_if_ioctl;
9811 	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9812 	ifp->if_free = ifp_if_free;
9813 	ifp->if_demux = ifp_if_demux;
9814 	ifp->if_event = ifp_if_event;
9815 	ifp->if_framer_legacy = ifp_if_framer;
9816 	ifp->if_framer = ifp_if_framer_extended;
9817 	ifp->if_add_proto = ifp_if_add_proto;
9818 	ifp->if_del_proto = ifp_if_del_proto;
9819 	ifp->if_check_multi = ifp_if_check_multi;
9820 
9821 	/* wipe out interface description */
9822 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9823 	ifp->if_desc.ifd_len = 0;
9824 	VERIFY(ifp->if_desc.ifd_desc != NULL);
9825 	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9826 
9827 	/* there shouldn't be any delegation by now */
9828 	VERIFY(ifp->if_delegated.ifp == NULL);
9829 	VERIFY(ifp->if_delegated.type == 0);
9830 	VERIFY(ifp->if_delegated.family == 0);
9831 	VERIFY(ifp->if_delegated.subfamily == 0);
9832 	VERIFY(ifp->if_delegated.expensive == 0);
9833 	VERIFY(ifp->if_delegated.constrained == 0);
9834 	VERIFY(ifp->if_delegated.ultra_constrained == 0);
9835 
9836 	/* QoS marking gets cleared */
9837 	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9838 	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9839 
9840 #if SKYWALK
9841 	/* the nexus destructor is responsible for clearing these */
9842 	VERIFY(ifp->if_na_ops == NULL);
9843 	VERIFY(ifp->if_na == NULL);
9844 #endif /* SKYWALK */
9845 
9846 	/* promiscuous/allmulti counts need to start at zero again */
9847 	ifp->if_pcount = 0;
9848 	ifp->if_amcount = 0;
9849 	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
9850 
9851 	ifnet_lock_done(ifp);
9852 
9853 #if PF
9854 	/*
9855 	 * Detach this interface from packet filter, if enabled.
9856 	 */
9857 	pf_ifnet_hook(ifp, 0);
9858 #endif /* PF */
9859 
9860 	/* Filter list should be empty */
9861 	lck_mtx_lock_spin(&ifp->if_flt_lock);
9862 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9863 	VERIFY(ifp->if_flt_busy == 0);
9864 	VERIFY(ifp->if_flt_waiters == 0);
9865 	VERIFY(ifp->if_flt_non_os_count == 0);
9866 	VERIFY(ifp->if_flt_no_tso_count == 0);
9867 	lck_mtx_unlock(&ifp->if_flt_lock);
9868 
9869 	/* Last chance to drain send queue */
9870 	if_qflush_snd(ifp, 0);
9871 
9872 	/* Last chance to cleanup any cached route */
9873 	lck_mtx_lock(&ifp->if_cached_route_lock);
9874 	VERIFY(!ifp->if_fwd_cacheok);
9875 	ROUTE_RELEASE(&ifp->if_fwd_route);
9876 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9877 	ROUTE_RELEASE(&ifp->if_src_route);
9878 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9879 	ROUTE_RELEASE(&ifp->if_src_route6);
9880 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9881 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9882 
9883 	/* Ignore any pending data threshold as the interface is gone anyway */
9884 	ifp->if_data_threshold = 0;
9885 
9886 	VERIFY(ifp->if_dt_tcall != NULL);
9887 	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9888 
9889 	ifnet_llreach_ifdetach(ifp);
9890 
9891 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
9892 
9893 	/*
9894 	 * Finally, mark this ifnet as detached.
9895 	 */
9896 	if (dlil_verbose) {
9897 		DLIL_PRINTF("%s: detached\n", if_name(ifp));
9898 	}
9899 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9900 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9901 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9902 		    __func__, ifp);
9903 		/* NOTREACHED */
9904 	}
9905 	ifp->if_refflags &= ~IFRF_DETACHING;
9906 	lck_mtx_unlock(&ifp->if_ref_lock);
9907 	if (if_free != NULL) {
9908 		if_free(ifp);
9909 	}
9910 
9911 	ifclassq_release(&ifp->if_snd);
9912 
9913 	/* we're fully detached, clear the "in use" bit */
9914 	dlifp = (struct dlil_ifnet *)ifp;
9915 	lck_mtx_lock(&dlifp->dl_if_lock);
9916 	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9917 	dlifp->dl_if_flags &= ~DLIF_INUSE;
9918 	lck_mtx_unlock(&dlifp->dl_if_lock);
9919 
9920 	/* Release reference held during ifnet attach */
9921 	ifnet_release(ifp);
9922 }
9923 
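/*
 * Default/stub ifnet handlers.  The detach path above points the ifnet's
 * function pointers at these routines because the driver may unload; any
 * straggling calls through the interface then drop the packets or return
 * a benign error instead of jumping into unloaded driver code.
 */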
9924 errno_t
9925 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9926 {
9927 #pragma unused(ifp)
9928 	m_freem_list(m);
9929 	return 0;
9930 }
9931 
9932 void
9933 ifp_if_start(struct ifnet *ifp)
9934 {
9935 	ifnet_purge(ifp);
9936 }
9937 
9938 static errno_t
9939 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9940     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9941     boolean_t poll, struct thread *tp)
9942 {
9943 #pragma unused(ifp, m_tail, s, poll, tp)
9944 	m_freem_list(m_head);
9945 	return ENXIO;
9946 }
9947 
9948 static void
9949 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9950     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9951 {
9952 #pragma unused(ifp, flags, max_cnt)
9953 	if (m_head != NULL) {
9954 		*m_head = NULL;
9955 	}
9956 	if (m_tail != NULL) {
9957 		*m_tail = NULL;
9958 	}
9959 	if (cnt != NULL) {
9960 		*cnt = 0;
9961 	}
9962 	if (len != NULL) {
9963 		*len = 0;
9964 	}
9965 }
9966 
9967 static errno_t
9968 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
9969 {
9970 #pragma unused(ifp, cmd, arglen, arg)
9971 	return EOPNOTSUPP;
9972 }
9973 
9974 static errno_t
9975 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
9976 {
9977 #pragma unused(ifp, fh, pf)
9978 	m_freem(m);
9979 	return EJUSTRETURN;
9980 }
9981 
9982 static errno_t
9983 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
9984     const struct ifnet_demux_desc *da, u_int32_t dc)
9985 {
9986 #pragma unused(ifp, pf, da, dc)
9987 	return EINVAL;
9988 }
9989 
9990 static errno_t
9991 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
9992 {
9993 #pragma unused(ifp, pf)
9994 	return EINVAL;
9995 }
9996 
9997 static errno_t
9998 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
9999 {
10000 #pragma unused(ifp, sa)
10001 	return EOPNOTSUPP;
10002 }
10003 
10004 #if !XNU_TARGET_OS_OSX
10005 static errno_t
10006 ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
10007     const struct sockaddr *sa, const char *ll, const char *t,
10008     u_int32_t *pre, u_int32_t *post)
10009 #else /* XNU_TARGET_OS_OSX */
10010 static errno_t
10011 ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
10012     const struct sockaddr *sa, const char *ll, const char *t)
10013 #endif /* XNU_TARGET_OS_OSX */
10014 {
10015 #pragma unused(ifp, m, sa, ll, t)
10016 #if !XNU_TARGET_OS_OSX
10017 	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
10018 #else /* XNU_TARGET_OS_OSX */
10019 	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
10020 #endif /* XNU_TARGET_OS_OSX */
10021 }
10022 
10023 static errno_t
10024 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10025     const struct sockaddr *sa, const char *ll, const char *t,
10026     u_int32_t *pre, u_int32_t *post)
10027 {
10028 #pragma unused(ifp, sa, ll, t)
10029 	m_freem(*m);
10030 	*m = NULL;
10031 
10032 	if (pre != NULL) {
10033 		*pre = 0;
10034 	}
10035 	if (post != NULL) {
10036 		*post = 0;
10037 	}
10038 
10039 	return EJUSTRETURN;
10040 }
10041 
10042 errno_t
10043 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10044 {
10045 #pragma unused(ifp, cmd, arg)
10046 	return EOPNOTSUPP;
10047 }
10048 
10049 static errno_t
10050 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10051 {
10052 #pragma unused(ifp, tm, f)
10053 	/* XXX not sure what to do here */
10054 	return 0;
10055 }
10056 
10057 static void
10058 ifp_if_free(struct ifnet *ifp)
10059 {
10060 #pragma unused(ifp)
10061 }
10062 
10063 static void
10064 ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
10065 {
10066 #pragma unused(ifp, e)
10067 }
10068 
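/*
 * dlil_if_acquire: find a reusable dlil_ifnet for the given family and
 * unique id, or allocate a fresh one.  EBUSY is returned if an in-use
 * interface already claims the same extended name or unique id.  New
 * objects come from dlif_zone; the ifnet is placed on a 64-bit aligned
 * base and the original allocation pointer is stashed just below that
 * base so the buffer can be freed later.
 */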
10069 int
10070 dlil_if_acquire(u_int32_t family, const void *uniqueid,
10071     size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
10072 {
10073 	struct ifnet *ifp1 = NULL;
10074 	struct dlil_ifnet *dlifp1 = NULL;
10075 	struct dlil_ifnet *dlifp1_saved = NULL;
10076 	void *buf, *base, **pbuf;
10077 	int ret = 0;
10078 
10079 	VERIFY(*ifp == NULL);
10080 	dlil_if_lock();
10081 	/*
10082 	 * We absolutely can't have another interface in the in-use state
10083 	 * with the same name.  To make sure of that, the list has to be
10084 	 * traversed completely.
10085 	 */
10086 	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
10087 		ifp1 = (struct ifnet *)dlifp1;
10088 
10089 		if (ifp1->if_family != family) {
10090 			continue;
10091 		}
10092 
10093 		/*
10094 		 * If interface is in use, return EBUSY if either unique id
10095 		 * or interface extended names are the same
10096 		 */
10097 		lck_mtx_lock(&dlifp1->dl_if_lock);
10098 		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
10099 		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
10100 			lck_mtx_unlock(&dlifp1->dl_if_lock);
10101 			ret = EBUSY;
10102 			goto end;
10103 		}
10104 
10105 		if (uniqueid_len != 0 &&
10106 		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
10107 		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
10108 			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
10109 				lck_mtx_unlock(&dlifp1->dl_if_lock);
10110 				ret = EBUSY;
10111 				goto end;
10112 			}
10113 			if (dlifp1_saved == NULL) {
10114 				/* cache the first match */
10115 				dlifp1_saved = dlifp1;
10116 			}
10117 			/*
10118 			 * Do not break or jump to end as we have to traverse
10119 			 * the whole list to ensure there are no name collisions
10120 			 */
10121 		}
10122 		lck_mtx_unlock(&dlifp1->dl_if_lock);
10123 	}
10124 
10125 	/* If there's an interface that can be recycled, use that */
10126 	if (dlifp1_saved != NULL) {
10127 		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
10128 		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
10129 			/* some other thread got in ahead of us */
10130 			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
10131 			ret = EBUSY;
10132 			goto end;
10133 		}
10134 		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
10135 		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
10136 		*ifp = (struct ifnet *)dlifp1_saved;
10137 		dlil_if_ref(*ifp);
10138 		goto end;
10139 	}
10140 
10141 	/* no interface found, allocate a new one */
10142 	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10143 
10144 	/* Get the 64-bit aligned base address for this object */
10145 	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
10146 	    sizeof(u_int64_t));
10147 	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));
10148 
10149 	/*
10150 	 * Wind back a pointer size from the aligned base and
10151 	 * save the original address so we can free it later.
10152 	 */
10153 	pbuf = (void **)((intptr_t)base - sizeof(void *));
10154 	*pbuf = buf;
10155 	dlifp1 = base;
10156 
10157 	if (uniqueid_len) {
10158 		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
10159 		    Z_WAITOK);
10160 		if (dlifp1->dl_if_uniqueid == NULL) {
10161 			zfree(dlif_zone, buf);
10162 			ret = ENOMEM;
10163 			goto end;
10164 		}
10165 		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
10166 		dlifp1->dl_if_uniqueid_len = uniqueid_len;
10167 	}
10168 
10169 	ifp1 = (struct ifnet *)dlifp1;
10170 	dlifp1->dl_if_flags = DLIF_INUSE;
10171 	if (ifnet_debug) {
10172 		dlifp1->dl_if_flags |= DLIF_DEBUG;
10173 		dlifp1->dl_if_trace = dlil_if_trace;
10174 	}
10175 	ifp1->if_name = dlifp1->dl_if_namestorage;
10176 	ifp1->if_xname = dlifp1->dl_if_xnamestorage;
10177 
10178 	/* initialize interface description */
10179 	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
10180 	ifp1->if_desc.ifd_len = 0;
10181 	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;
10182 
10183 #if SKYWALK
10184 	LIST_INIT(&ifp1->if_netns_tokens);
10185 #endif /* SKYWALK */
10186 
10187 	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
10188 		DLIL_PRINTF("%s: failed to allocate if local stats, "
10189 		    "error: %d\n", __func__, ret);
10190 		/* This probably shouldn't be fatal */
10191 		ret = 0;
10192 	}
10193 
10194 	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
10195 	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
10196 	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
10197 	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
10198 	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
10199 	    &ifnet_lock_attr);
10200 	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
10201 #if INET
10202 	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
10203 	    &ifnet_lock_attr);
10204 	ifp1->if_inetdata = NULL;
10205 #endif
10206 	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
10207 	ifp1->if_inet6_ioctl_busy = FALSE;
10208 	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
10209 	    &ifnet_lock_attr);
10210 	ifp1->if_inet6data = NULL;
10211 	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
10212 	    &ifnet_lock_attr);
10213 	ifp1->if_link_status = NULL;
10214 	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);
10215 
10216 	/* for send data paths */
10217 	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
10218 	    &ifnet_lock_attr);
10219 	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
10220 	    &ifnet_lock_attr);
10221 
10222 	/* for receive data paths */
10223 	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
10224 	    &ifnet_lock_attr);
10225 
10226 	/* thread call allocation is done with sleeping zalloc */
10227 	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
10228 	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
10229 	if (ifp1->if_dt_tcall == NULL) {
10230 		panic_plain("%s: couldn't create if_dt_tcall", __func__);
10231 		/* NOTREACHED */
10232 	}
10233 
10234 	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);
10235 
10236 	*ifp = ifp1;
10237 	dlil_if_ref(*ifp);
10238 
10239 end:
10240 	dlil_if_unlock();
10241 
10242 	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
10243 	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));
10244 
10245 	return ret;
10246 }
10247 
10248 static void
10249 _dlil_if_release(ifnet_t ifp, bool clear_in_use)
10250 {
10251 	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;
10252 
10253 	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
10254 	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
10255 		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
10256 	}
10257 
10258 	ifnet_lock_exclusive(ifp);
10259 	kfree_data_counted_by(ifp->if_broadcast.ptr, ifp->if_broadcast.length);
10260 	lck_mtx_lock(&dlifp->dl_if_lock);
10261 	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
10262 	ifp->if_name = dlifp->dl_if_namestorage;
10263 	/* Reset external name (name + unit) */
10264 	ifp->if_xname = dlifp->dl_if_xnamestorage;
10265 	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
10266 	    "%s?", ifp->if_name);
10267 	if (clear_in_use) {
10268 		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10269 		dlifp->dl_if_flags &= ~DLIF_INUSE;
10270 	}
10271 	lck_mtx_unlock(&dlifp->dl_if_lock);
10272 	ifnet_lock_done(ifp);
10273 }
10274 
10275 __private_extern__ void
10276 dlil_if_release(ifnet_t ifp)
10277 {
10278 	_dlil_if_release(ifp, false);
10279 }
10280 
10281 __private_extern__ void
10282 dlil_if_lock(void)
10283 {
10284 	lck_mtx_lock(&dlil_ifnet_lock);
10285 }
10286 
10287 __private_extern__ void
10288 dlil_if_unlock(void)
10289 {
10290 	lck_mtx_unlock(&dlil_ifnet_lock);
10291 }
10292 
10293 __private_extern__ void
10294 dlil_if_lock_assert(void)
10295 {
10296 	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
10297 }
10298 
10299 __private_extern__ void
10300 dlil_proto_unplumb_all(struct ifnet *ifp)
10301 {
10302 	/*
10303 	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
10304 	 * each bucket contains exactly one entry; PF_VLAN does not need an
10305 	 * explicit unplumb.
10306 	 *
10307 	 * if_proto_hash[3] is for other protocols; we expect anything
10308 	 * in this bucket to respond to the DETACHING event (which would
10309 	 * have happened by now) and do the unplumb then.
10310 	 */
10311 	(void) proto_unplumb(PF_INET, ifp);
10312 	(void) proto_unplumb(PF_INET6, ifp);
10313 }
10314 
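/*
 * Per-ifnet cached source route helpers.  The copyout variants hand the
 * caller a copy of the cached route under if_cached_route_lock; the
 * copyin variants store a route back only while if_fwd_cacheok is set,
 * otherwise the passed route reference is simply released.
 */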
10315 static void
10316 ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
10317 {
10318 	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10319 	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10320 
10321 	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));
10322 
10323 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10324 }
10325 
10326 static void
10327 ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
10328 {
10329 	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10330 	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10331 
10332 	if (ifp->if_fwd_cacheok) {
10333 		route_copyin(src, &ifp->if_src_route, sizeof(*src));
10334 	} else {
10335 		ROUTE_RELEASE(src);
10336 	}
10337 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10338 }
10339 
10340 static void
10341 ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
10342 {
10343 	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10344 	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10345 
10346 	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
10347 	    sizeof(*dst));
10348 
10349 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10350 }
10351 
10352 static void
10353 ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
10354 {
10355 	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10356 	lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10357 
10358 	if (ifp->if_fwd_cacheok) {
10359 		route_copyin((struct route *)src,
10360 		    (struct route *)&ifp->if_src_route6, sizeof(*src));
10361 	} else {
10362 		ROUTE_RELEASE(src);
10363 	}
10364 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10365 }
10366 
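/*
 * ifnet_cached_rtlookup_inet{,6}: return a scoped route to the given
 * address, reusing the interface's cached source route when it is still
 * usable and matches; otherwise perform a fresh scoped lookup and
 * refresh the cache.  The caller receives its own rtentry reference.
 */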
10367 struct rtentry *
10368 ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
10369 {
10370 	struct route            src_rt;
10371 	struct sockaddr_in      *dst;
10372 
10373 	dst = SIN(&src_rt.ro_dst);
10374 
10375 	ifp_src_route_copyout(ifp, &src_rt);
10376 
10377 	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
10378 		ROUTE_RELEASE(&src_rt);
10379 		if (dst->sin_family != AF_INET) {
10380 			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
10381 			dst->sin_len = sizeof(src_rt.ro_dst);
10382 			dst->sin_family = AF_INET;
10383 		}
10384 		dst->sin_addr = src_ip;
10385 
10386 		VERIFY(src_rt.ro_rt == NULL);
10387 		src_rt.ro_rt = rtalloc1_scoped(SA(dst),
10388 		    0, 0, ifp->if_index);
10389 
10390 		if (src_rt.ro_rt != NULL) {
10391 			/* retain a ref, copyin consumes one */
10392 			struct rtentry  *rte = src_rt.ro_rt;
10393 			RT_ADDREF(rte);
10394 			ifp_src_route_copyin(ifp, &src_rt);
10395 			src_rt.ro_rt = rte;
10396 		}
10397 	}
10398 
10399 	return src_rt.ro_rt;
10400 }
10401 
10402 struct rtentry *
10403 ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
10404 {
10405 	struct route_in6 src_rt;
10406 
10407 	ifp_src_route6_copyout(ifp, &src_rt);
10408 
10409 	if (ROUTE_UNUSABLE(&src_rt) ||
10410 	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
10411 		ROUTE_RELEASE(&src_rt);
10412 		if (src_rt.ro_dst.sin6_family != AF_INET6) {
10413 			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
10414 			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
10415 			src_rt.ro_dst.sin6_family = AF_INET6;
10416 		}
10417 		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
10418 		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
10419 		    sizeof(src_rt.ro_dst.sin6_addr));
10420 
10421 		if (src_rt.ro_rt == NULL) {
10422 			src_rt.ro_rt = rtalloc1_scoped(
10423 				SA(&src_rt.ro_dst), 0, 0,
10424 				ifp->if_index);
10425 
10426 			if (src_rt.ro_rt != NULL) {
10427 				/* retain a ref, copyin consumes one */
10428 				struct rtentry  *rte = src_rt.ro_rt;
10429 				RT_ADDREF(rte);
10430 				ifp_src_route6_copyin(ifp, &src_rt);
10431 				src_rt.ro_rt = rte;
10432 			}
10433 		}
10434 	}
10435 
10436 	return src_rt.ro_rt;
10437 }
10438 
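/*
 * if_lqm_update: clamp the raw link quality metric to the nearest
 * threshold edge, record it in if_interface_state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event.  'locked' indicates whether
 * the caller already holds the ifnet lock exclusively; the lock is
 * dropped around the event post and reacquired for such callers.
 */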
10439 void
10440 if_lqm_update(struct ifnet *ifp, int lqm, int locked)
10441 {
10442 	struct kev_dl_link_quality_metric_data ev_lqm_data;
10443 
10444 	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);
10445 
10446 	/* Normalize to edge */
10447 	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
10448 		lqm = IFNET_LQM_THRESH_ABORT;
10449 		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
10450 		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
10451 	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
10452 	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
10453 		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
10454 	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
10455 	    lqm <= IFNET_LQM_THRESH_POOR) {
10456 		lqm = IFNET_LQM_THRESH_POOR;
10457 	} else if (lqm > IFNET_LQM_THRESH_POOR &&
10458 	    lqm <= IFNET_LQM_THRESH_GOOD) {
10459 		lqm = IFNET_LQM_THRESH_GOOD;
10460 	}
10461 
10462 	/*
10463 	 * Take the lock if needed
10464 	 */
10465 	if (!locked) {
10466 		ifnet_lock_exclusive(ifp);
10467 	}
10468 
10469 	if (lqm == ifp->if_interface_state.lqm_state &&
10470 	    (ifp->if_interface_state.valid_bitmask &
10471 	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
10472 		/*
10473 		 * Release the lock if was not held by the caller
10474 		 */
10475 		if (!locked) {
10476 			ifnet_lock_done(ifp);
10477 		}
10478 		return;         /* nothing to update */
10479 	}
10480 	ifp->if_interface_state.valid_bitmask |=
10481 	    IF_INTERFACE_STATE_LQM_STATE_VALID;
10482 	ifp->if_interface_state.lqm_state = (int8_t)lqm;
10483 
10484 	/*
10485 	 * Don't want to hold the lock when issuing kernel events
10486 	 */
10487 	ifnet_lock_done(ifp);
10488 
10489 	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
10490 	ev_lqm_data.link_quality_metric = lqm;
10491 
10492 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
10493 	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);
10494 
10495 	/*
10496 	 * Reacquire the lock for the caller
10497 	 */
10498 	if (locked) {
10499 		ifnet_lock_exclusive(ifp);
10500 	}
10501 }
10502 
10503 static void
10504 if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
10505 {
10506 	struct kev_dl_rrc_state kev;
10507 
10508 	if (rrc_state == ifp->if_interface_state.rrc_state &&
10509 	    (ifp->if_interface_state.valid_bitmask &
10510 	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10511 		return;
10512 	}
10513 
10514 	ifp->if_interface_state.valid_bitmask |=
10515 	    IF_INTERFACE_STATE_RRC_STATE_VALID;
10516 
10517 	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;
10518 
10519 	/*
10520 	 * Don't want to hold the lock when issuing kernel events
10521 	 */
10522 	ifnet_lock_done(ifp);
10523 
10524 	bzero(&kev, sizeof(struct kev_dl_rrc_state));
10525 	kev.rrc_state = rrc_state;
10526 
10527 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
10528 	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);
10529 
10530 	ifnet_lock_exclusive(ifp);
10531 }
10532 
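/*
 * if_state_update: validate and apply an interface state update (link
 * quality, cellular RRC state, availability).  RRC state is accepted
 * only for cellular interfaces.  When the interface becomes available,
 * TCP connections on it are prodded to send probes right away instead
 * of waiting for their timers to fire.
 */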
10533 errno_t
10534 if_state_update(struct ifnet *ifp,
10535     struct if_interface_state *if_interface_state)
10536 {
10537 	u_short if_index_available = 0;
10538 
10539 	ifnet_lock_exclusive(ifp);
10540 
10541 	if ((ifp->if_type != IFT_CELLULAR) &&
10542 	    (if_interface_state->valid_bitmask &
10543 	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10544 		ifnet_lock_done(ifp);
10545 		return ENOTSUP;
10546 	}
10547 	if ((if_interface_state->valid_bitmask &
10548 	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10549 	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10550 	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10551 		ifnet_lock_done(ifp);
10552 		return EINVAL;
10553 	}
10554 	if ((if_interface_state->valid_bitmask &
10555 	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10556 	    if_interface_state->rrc_state !=
10557 	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10558 	    if_interface_state->rrc_state !=
10559 	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10560 		ifnet_lock_done(ifp);
10561 		return EINVAL;
10562 	}
10563 
10564 	if (if_interface_state->valid_bitmask &
10565 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10566 		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10567 	}
10568 	if (if_interface_state->valid_bitmask &
10569 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10570 		if_rrc_state_update(ifp, if_interface_state->rrc_state);
10571 	}
10572 	if (if_interface_state->valid_bitmask &
10573 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10574 		ifp->if_interface_state.valid_bitmask |=
10575 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10576 		ifp->if_interface_state.interface_availability =
10577 		    if_interface_state->interface_availability;
10578 
10579 		if (ifp->if_interface_state.interface_availability ==
10580 		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10581 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10582 			    __func__, if_name(ifp), ifp->if_index);
10583 			if_index_available = ifp->if_index;
10584 		} else {
10585 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable\n",
10586 			    __func__, if_name(ifp), ifp->if_index);
10587 		}
10588 	}
10589 	ifnet_lock_done(ifp);
10590 
10591 	/*
10592 	 * Check if the TCP connections going on this interface should be
10593 	 * forced to send probe packets instead of waiting for TCP timers
10594 	 * to fire. This is done on an explicit notification such as
10595 	 * SIOCSIFINTERFACESTATE which marks the interface as available.
10596 	 */
10597 	if (if_index_available > 0) {
10598 		tcp_interface_send_probe(if_index_available);
10599 	}
10600 
10601 	return 0;
10602 }
10603 
10604 void
10605 if_get_state(struct ifnet *ifp,
10606     struct if_interface_state *if_interface_state)
10607 {
10608 	ifnet_lock_shared(ifp);
10609 
10610 	if_interface_state->valid_bitmask = 0;
10611 
10612 	if (ifp->if_interface_state.valid_bitmask &
10613 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10614 		if_interface_state->valid_bitmask |=
10615 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10616 		if_interface_state->rrc_state =
10617 		    ifp->if_interface_state.rrc_state;
10618 	}
10619 	if (ifp->if_interface_state.valid_bitmask &
10620 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10621 		if_interface_state->valid_bitmask |=
10622 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10623 		if_interface_state->lqm_state =
10624 		    ifp->if_interface_state.lqm_state;
10625 	}
10626 	if (ifp->if_interface_state.valid_bitmask &
10627 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10628 		if_interface_state->valid_bitmask |=
10629 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10630 		if_interface_state->interface_availability =
10631 		    ifp->if_interface_state.interface_availability;
10632 	}
10633 
10634 	ifnet_lock_done(ifp);
10635 }
10636 
10637 errno_t
10638 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10639 {
10640 	if (conn_probe > 1) {
10641 		return EINVAL;
10642 	}
10643 	if (conn_probe == 0) {
10644 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10645 	} else {
10646 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10647 	}
10648 
10649 #if NECP
10650 	necp_update_all_clients();
10651 #endif /* NECP */
10652 
10653 	tcp_probe_connectivity(ifp, conn_probe);
10654 	return 0;
10655 }
10656 
10657 /* for uuid.c */
10658 static int
10659 get_ether_index(int * ret_other_index)
10660 {
10661 	struct ifnet *ifp;
10662 	int en0_index = 0;
10663 	int other_en_index = 0;
10664 	int any_ether_index = 0;
10665 	short best_unit = 0;
10666 
10667 	*ret_other_index = 0;
10668 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10669 		/*
10670 		 * find en0, or if not en0, the lowest unit en*, and if not
10671 		 * that, any ethernet
10672 		 */
10673 		ifnet_lock_shared(ifp);
10674 		if (strcmp(ifp->if_name, "en") == 0) {
10675 			if (ifp->if_unit == 0) {
10676 				/* found en0, we're done */
10677 				en0_index = ifp->if_index;
10678 				ifnet_lock_done(ifp);
10679 				break;
10680 			}
10681 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
10682 				other_en_index = ifp->if_index;
10683 				best_unit = ifp->if_unit;
10684 			}
10685 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10686 			any_ether_index = ifp->if_index;
10687 		}
10688 		ifnet_lock_done(ifp);
10689 	}
10690 	if (en0_index == 0) {
10691 		if (other_en_index != 0) {
10692 			*ret_other_index = other_en_index;
10693 		} else if (any_ether_index != 0) {
10694 			*ret_other_index = any_ether_index;
10695 		}
10696 	}
10697 	return en0_index;
10698 }
10699 
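/*
 * uuid_get_ethernet: copy a stable MAC address into 'node' for UUID
 * generation, preferring en0, then the lowest-unit en*, then any other
 * Ethernet interface.  The permanent address is used when available
 * because it never changes.  Returns 0 on success, -1 if no suitable
 * interface exists.
 */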
10700 int
10701 uuid_get_ethernet(u_int8_t *node)
10702 {
10703 	static int en0_index;
10704 	struct ifnet *ifp;
10705 	int other_index = 0;
10706 	int the_index = 0;
10707 	int ret;
10708 
10709 	ifnet_head_lock_shared();
10710 	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
10711 		en0_index = get_ether_index(&other_index);
10712 	}
10713 	if (en0_index != 0) {
10714 		the_index = en0_index;
10715 	} else if (other_index != 0) {
10716 		the_index = other_index;
10717 	}
10718 	if (the_index != 0) {
10719 		struct dlil_ifnet *dl_if;
10720 
10721 		ifp = ifindex2ifnet[the_index];
10722 		VERIFY(ifp != NULL);
10723 		dl_if = (struct dlil_ifnet *)ifp;
10724 		if (dl_if->dl_if_permanent_ether_is_set != 0) {
10725 			/*
10726 			 * Use the permanent ethernet address if it is
10727 			 * available because it will never change.
10728 			 */
10729 			memcpy(node, dl_if->dl_if_permanent_ether,
10730 			    ETHER_ADDR_LEN);
10731 		} else {
10732 			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
10733 		}
10734 		ret = 0;
10735 	} else {
10736 		ret = -1;
10737 	}
10738 	ifnet_head_done();
10739 	return ret;
10740 }
10741 
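/*
 * dlil_node_present / dlil_node_absent / dlil_node_present_v2: record a
 * peer node appearing on (or disappearing from) the link through the
 * IPv6 alternative neighbor entry points, then post KEV_DL_NODE_PRESENCE
 * or KEV_DL_NODE_ABSENCE so user space can track the peer along with its
 * RSSI, link quality and proximity metrics.
 */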
10742 int
10743 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10744     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10745 {
10746 	struct kev_dl_node_presence kev;
10747 	struct sockaddr_dl *sdl;
10748 	struct sockaddr_in6 *sin6;
10749 	int ret = 0;
10750 
10751 	VERIFY(ifp);
10752 	VERIFY(sa);
10753 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10754 
10755 	bzero(&kev, sizeof(kev));
10756 	sin6 = &kev.sin6_node_address;
10757 	sdl = &kev.sdl_node_address;
10758 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10759 	kev.rssi = rssi;
10760 	kev.link_quality_metric = lqm;
10761 	kev.node_proximity_metric = npm;
10762 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10763 
10764 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10765 	if (ret == 0 || ret == EEXIST) {
10766 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10767 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10768 		if (err != 0) {
10769 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with "
10770 			    "error %d\n", __func__, err);
10771 		}
10772 	}
10773 
10774 	if (ret == EEXIST) {
10775 		ret = 0;
10776 	}
10777 	return ret;
10778 }
10779 
10780 void
10781 dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
10782 {
10783 	struct kev_dl_node_absence kev = {};
10784 	struct sockaddr_in6 *kev_sin6 = NULL;
10785 	struct sockaddr_dl *kev_sdl = NULL;
10786 	int error = 0;
10787 
10788 	VERIFY(ifp != NULL);
10789 	VERIFY(sa != NULL);
10790 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10791 
10792 	kev_sin6 = &kev.sin6_node_address;
10793 	kev_sdl = &kev.sdl_node_address;
10794 
10795 	if (sa->sa_family == AF_INET6) {
10796 		/*
10797 		 * If IPv6 address is given, get the link layer
10798 		 * address from what was cached in the neighbor cache
10799 		 */
10800 		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
10801 		bcopy(sa, kev_sin6, sa->sa_len);
10802 		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
10803 	} else {
10804 		/*
10805 		 * If passed address is AF_LINK type, derive the address
10806 		 * based on the link address.
10807 		 */
10808 		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
10809 		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
10810 	}
10811 
10812 	if (error == 0) {
10813 		kev_sdl->sdl_type = ifp->if_type;
10814 		kev_sdl->sdl_index = ifp->if_index;
10815 
10816 		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
10817 		    &kev.link_data, sizeof(kev), FALSE);
10818 	}
10819 }
10820 
10821 int
10822 dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
10823     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10824 {
10825 	struct kev_dl_node_presence kev = {};
10826 	struct sockaddr_dl *kev_sdl = NULL;
10827 	struct sockaddr_in6 *kev_sin6 = NULL;
10828 	int ret = 0;
10829 
10830 	VERIFY(ifp != NULL);
10831 	VERIFY(sa != NULL && sdl != NULL);
10832 	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);
10833 
10834 	kev_sin6 = &kev.sin6_node_address;
10835 	kev_sdl = &kev.sdl_node_address;
10836 
10837 	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
10838 	bcopy(sdl, kev_sdl, sdl->sdl_len);
10839 	kev_sdl->sdl_type = ifp->if_type;
10840 	kev_sdl->sdl_index = ifp->if_index;
10841 
10842 	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
10843 	bcopy(sa, kev_sin6, sa->sa_len);
10844 
10845 	kev.rssi = rssi;
10846 	kev.link_quality_metric = lqm;
10847 	kev.node_proximity_metric = npm;
10848 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10849 
10850 	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
10851 	if (ret == 0 || ret == EEXIST) {
10852 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10853 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10854 		if (err != 0) {
10855 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
10856 		}
10857 	}
10858 
10859 	if (ret == EEXIST) {
10860 		ret = 0;
10861 	}
10862 	return ret;
10863 }
10864 
10865 const void *
10866 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10867     kauth_cred_t *credp)
10868 {
10869 	const u_int8_t *bytes;
10870 	size_t size;
10871 
10872 	bytes = CONST_LLADDR(sdl);
10873 	size = sdl->sdl_alen;
10874 
10875 #if CONFIG_MACF
10876 	if (dlil_lladdr_ckreq) {
10877 		switch (sdl->sdl_type) {
10878 		case IFT_ETHER:
10879 		case IFT_IEEE1394:
10880 			break;
10881 		default:
10882 			credp = NULL;
10883 			break;
10884 		}
10885 		;
10886 
10887 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10888 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10889 				[0] = 2
10890 			};
10891 
10892 			bytes = unspec;
10893 		}
10894 	}
10895 #else
10896 #pragma unused(credp)
10897 #endif
10898 
10899 	if (sizep != NULL) {
10900 		*sizep = size;
10901 	}
10902 	return bytes;
10903 }
10904 
10905 void
10906 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
10907     u_int8_t info[DLIL_MODARGLEN])
10908 {
10909 	struct kev_dl_issues kev;
10910 	struct timeval tv;
10911 
10912 	VERIFY(ifp != NULL);
10913 	VERIFY(modid != NULL);
10914 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
10915 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
10916 
10917 	bzero(&kev, sizeof(kev));
10918 
10919 	microtime(&tv);
10920 	kev.timestamp = tv.tv_sec;
10921 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
10922 	if (info != NULL) {
10923 		bcopy(info, &kev.info, DLIL_MODARGLEN);
10924 	}
10925 
10926 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
10927 	    &kev.link_data, sizeof(kev), FALSE);
10928 }
10929 
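/*
 * ifnet_getset_opportunistic: back end for SIOCSIFOPPORTUNISTIC and
 * SIOCGIFOPPORTUNISTIC.  The set path requires superuser and maps the
 * request onto ifnet_set_throttle(); both paths report the number of
 * opportunistic TCP/UDP connections currently using the interface.
 */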
10930 errno_t
10931 ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
10932     struct proc *p)
10933 {
10934 	u_int32_t level = IFNET_THROTTLE_OFF;
10935 	errno_t result = 0;
10936 
10937 	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
10938 
10939 	if (cmd == SIOCSIFOPPORTUNISTIC) {
10940 		/*
10941 		 * XXX: Use priv_check_cred() instead of root check?
10942 		 */
10943 		if ((result = proc_suser(p)) != 0) {
10944 			return result;
10945 		}
10946 
10947 		if (ifr->ifr_opportunistic.ifo_flags ==
10948 		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
10949 			level = IFNET_THROTTLE_OPPORTUNISTIC;
10950 		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
10951 			level = IFNET_THROTTLE_OFF;
10952 		} else {
10953 			result = EINVAL;
10954 		}
10955 
10956 		if (result == 0) {
10957 			result = ifnet_set_throttle(ifp, level);
10958 		}
10959 	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
10960 		ifr->ifr_opportunistic.ifo_flags = 0;
10961 		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
10962 			ifr->ifr_opportunistic.ifo_flags |=
10963 			    IFRIFOF_BLOCK_OPPORTUNISTIC;
10964 		}
10965 	}
10966 
10967 	/*
10968 	 * Return the count of current opportunistic connections
10969 	 * over the interface.
10970 	 */
10971 	if (result == 0) {
10972 		uint32_t flags = 0;
10973 		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
10974 		    INPCB_OPPORTUNISTIC_SETCMD : 0;
10975 		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
10976 		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
10977 		ifr->ifr_opportunistic.ifo_inuse =
10978 		    udp_count_opportunistic(ifp->if_index, flags) +
10979 		    tcp_count_opportunistic(ifp->if_index, flags);
10980 	}
10981 
10982 	if (result == EALREADY) {
10983 		result = 0;
10984 	}
10985 
10986 	return result;
10987 }
10988 
10989 int
10990 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
10991 {
10992 	struct ifclassq *ifq;
10993 	int err = 0;
10994 
10995 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
10996 		return ENXIO;
10997 	}
10998 
10999 	*level = IFNET_THROTTLE_OFF;
11000 
11001 	ifq = ifp->if_snd;
11002 	IFCQ_LOCK(ifq);
11003 	/* Throttling works only for IFCQ, not ALTQ instances */
11004 	if (IFCQ_IS_ENABLED(ifq)) {
11005 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11006 
11007 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11008 		*level = req.level;
11009 	}
11010 	IFCQ_UNLOCK(ifq);
11011 
11012 	return err;
11013 }
11014 
11015 int
11016 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11017 {
11018 	struct ifclassq *ifq;
11019 	int err = 0;
11020 
11021 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11022 		return ENXIO;
11023 	}
11024 
11025 	ifq = ifp->if_snd;
11026 
11027 	switch (level) {
11028 	case IFNET_THROTTLE_OFF:
11029 	case IFNET_THROTTLE_OPPORTUNISTIC:
11030 		break;
11031 	default:
11032 		return EINVAL;
11033 	}
11034 
11035 	IFCQ_LOCK(ifq);
11036 	if (IFCQ_IS_ENABLED(ifq)) {
11037 		cqrq_throttle_t req = { 1, level };
11038 
11039 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11040 	}
11041 	IFCQ_UNLOCK(ifq);
11042 
11043 	if (err == 0) {
11044 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11045 		    level);
11046 #if NECP
11047 		necp_update_all_clients();
11048 #endif /* NECP */
11049 		if (level == IFNET_THROTTLE_OFF) {
11050 			ifnet_start(ifp);
11051 		}
11052 	}
11053 
11054 	return err;
11055 }
11056 
11057 errno_t
11058 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11059     struct proc *p)
11060 {
11061 #pragma unused(p)
11062 	errno_t result = 0;
11063 	uint32_t flags;
11064 	int level, category, subcategory;
11065 
11066 	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11067 
11068 	if (cmd == SIOCSIFLOG) {
11069 		if ((result = priv_check_cred(kauth_cred_get(),
11070 		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11071 			return result;
11072 		}
11073 
11074 		level = ifr->ifr_log.ifl_level;
11075 		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11076 			result = EINVAL;
11077 		}
11078 
11079 		flags = ifr->ifr_log.ifl_flags;
11080 		if ((flags &= IFNET_LOGF_MASK) == 0) {
11081 			result = EINVAL;
11082 		}
11083 
11084 		category = ifr->ifr_log.ifl_category;
11085 		subcategory = ifr->ifr_log.ifl_subcategory;
11086 
11087 		if (result == 0) {
11088 			result = ifnet_set_log(ifp, level, flags,
11089 			    category, subcategory);
11090 		}
11091 	} else {
11092 		result = ifnet_get_log(ifp, &level, &flags, &category,
11093 		    &subcategory);
11094 		if (result == 0) {
11095 			ifr->ifr_log.ifl_level = level;
11096 			ifr->ifr_log.ifl_flags = flags;
11097 			ifr->ifr_log.ifl_category = category;
11098 			ifr->ifr_log.ifl_subcategory = subcategory;
11099 		}
11100 	}
11101 
11102 	return result;
11103 }
11104 
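/*
 * ifnet_set_log: apply a logging level and facility flags to the
 * interface.  Facilities other than DLIL are forwarded to the driver
 * through its output control callback when one is registered, and are
 * silently ignored otherwise.
 */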
11105 int
11106 ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
11107     int32_t category, int32_t subcategory)
11108 {
11109 	int err = 0;
11110 
11111 	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
11112 	VERIFY(flags & IFNET_LOGF_MASK);
11113 
11114 	/*
11115 	 * The logging level applies to all facilities; make sure to
11116 	 * update them all with the most current level.
11117 	 */
11118 	flags |= ifp->if_log.flags;
11119 
11120 	if (ifp->if_output_ctl != NULL) {
11121 		struct ifnet_log_params l;
11122 
11123 		bzero(&l, sizeof(l));
11124 		l.level = level;
11125 		l.flags = flags;
11126 		l.flags &= ~IFNET_LOGF_DLIL;
11127 		l.category = category;
11128 		l.subcategory = subcategory;
11129 
11130 		/* Send this request to lower layers */
11131 		if (l.flags != 0) {
11132 			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
11133 			    sizeof(l), &l);
11134 		}
11135 	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
11136 		/*
11137 		 * If targeted to the lower layers without an output
11138 		 * control callback registered on the interface, just
11139 		 * silently ignore facilities other than ours.
11140 		 */
11141 		flags &= IFNET_LOGF_DLIL;
11142 		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
11143 			level = 0;
11144 		}
11145 	}
11146 
11147 	if (err == 0) {
11148 		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
11149 			ifp->if_log.flags = 0;
11150 		} else {
11151 			ifp->if_log.flags |= flags;
11152 		}
11153 
11154 		log(LOG_INFO, "%s: logging level set to %d flags=0x%x "
11155 		    "arg=0x%x, category=%d subcategory=%d\n", if_name(ifp),
11156 		    ifp->if_log.level, ifp->if_log.flags, flags,
11157 		    category, subcategory);
11158 	}
11159 
11160 	return err;
11161 }
11162 
11163 int
11164 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11165     int32_t *category, int32_t *subcategory)
11166 {
11167 	if (level != NULL) {
11168 		*level = ifp->if_log.level;
11169 	}
11170 	if (flags != NULL) {
11171 		*flags = ifp->if_log.flags;
11172 	}
11173 	if (category != NULL) {
11174 		*category = ifp->if_log.category;
11175 	}
11176 	if (subcategory != NULL) {
11177 		*subcategory = ifp->if_log.subcategory;
11178 	}
11179 
11180 	return 0;
11181 }
11182 
11183 int
11184 ifnet_notify_address(struct ifnet *ifp, int af)
11185 {
11186 	struct ifnet_notify_address_params na;
11187 
11188 #if PF
11189 	(void) pf_ifaddr_hook(ifp);
11190 #endif /* PF */
11191 
11192 	if (ifp->if_output_ctl == NULL) {
11193 		return EOPNOTSUPP;
11194 	}
11195 
11196 	bzero(&na, sizeof(na));
11197 	na.address_family = (sa_family_t)af;
11198 
11199 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11200 	           sizeof(na), &na);
11201 }
11202 
11203 errno_t
11204 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11205 {
11206 	if (ifp == NULL || flowid == NULL) {
11207 		return EINVAL;
11208 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11209 	    !IF_FULLY_ATTACHED(ifp)) {
11210 		return ENXIO;
11211 	}
11212 
11213 	*flowid = ifp->if_flowhash;
11214 
11215 	return 0;
11216 }
11217 
11218 errno_t
11219 ifnet_disable_output(struct ifnet *ifp)
11220 {
11221 	int err = 0;
11222 
11223 	if (ifp == NULL) {
11224 		return EINVAL;
11225 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11226 	    !IF_FULLY_ATTACHED(ifp)) {
11227 		return ENXIO;
11228 	}
11229 
11230 	lck_mtx_lock(&ifp->if_start_lock);
11231 	if (ifp->if_start_flags & IFSF_FLOW_RESUME_PENDING) {
11232 		ifp->if_start_flags &= ~(IFSF_FLOW_RESUME_PENDING | IFSF_FLOW_CONTROLLED);
11233 	} else if ((err = ifnet_fc_add(ifp)) == 0) {
11234 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11235 	}
11236 	lck_mtx_unlock(&ifp->if_start_lock);
11237 
11238 	return err;
11239 }
11240 
11241 errno_t
11242 ifnet_enable_output(struct ifnet *ifp)
11243 {
11244 	if (ifp == NULL) {
11245 		return EINVAL;
11246 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11247 	    !IF_FULLY_ATTACHED(ifp)) {
11248 		return ENXIO;
11249 	}
11250 
11251 	ifnet_start_common(ifp, TRUE, FALSE);
11252 	return 0;
11253 }
11254 
11255 void
11256 ifnet_flowadv(uint32_t flowhash)
11257 {
11258 	struct ifnet_fc_entry *ifce;
11259 	struct ifnet *ifp;
11260 
11261 	ifce = ifnet_fc_get(flowhash);
11262 	if (ifce == NULL) {
11263 		return;
11264 	}
11265 
11266 	VERIFY(ifce->ifce_ifp != NULL);
11267 	ifp = ifce->ifce_ifp;
11268 
11269 	/* flow hash gets recalculated per attach, so check */
11270 	if (ifnet_is_attached(ifp, 1)) {
11271 		if (ifp->if_flowhash == flowhash) {
11272 			lck_mtx_lock_spin(&ifp->if_start_lock);
11273 			if ((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) == 0) {
11274 				ifp->if_start_flags |= IFSF_FLOW_RESUME_PENDING;
11275 			}
11276 			lck_mtx_unlock(&ifp->if_start_lock);
11277 			(void) ifnet_enable_output(ifp);
11278 		}
11279 		ifnet_decr_iorefcnt(ifp);
11280 	}
11281 	ifnet_fc_entry_free(ifce);
11282 }
11283 
11284 /*
11285  * Function to compare ifnet_fc_entries in ifnet flow control tree
11286  */
11287 static inline int
11288 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11289 {
11290 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11291 }
11292 
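/*
 * ifnet_fc_add: register the interface in the flow-control tree, keyed
 * by its flow hash, so that a later flow advisory (ifnet_flowadv) can
 * find it and re-enable output.  Returns EAGAIN on a hash collision
 * with a different interface; an existing entry for the same ifp is
 * left in place.
 */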
11293 static int
11294 ifnet_fc_add(struct ifnet *ifp)
11295 {
11296 	struct ifnet_fc_entry keyfc, *ifce;
11297 	uint32_t flowhash;
11298 
11299 	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
11300 	VERIFY(ifp->if_flowhash != 0);
11301 	flowhash = ifp->if_flowhash;
11302 
11303 	bzero(&keyfc, sizeof(keyfc));
11304 	keyfc.ifce_flowhash = flowhash;
11305 
11306 	lck_mtx_lock_spin(&ifnet_fc_lock);
11307 	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
11308 	if (ifce != NULL && ifce->ifce_ifp == ifp) {
11309 		/* Entry is already in ifnet_fc_tree, return */
11310 		lck_mtx_unlock(&ifnet_fc_lock);
11311 		return 0;
11312 	}
11313 
11314 	if (ifce != NULL) {
11315 		/*
11316 		 * There is a different fc entry with the same flow hash
11317 		 * but different ifp pointer.  There can be a collision
11318 		 * on flow hash but the probability is low.  Let's just
11319 		 * avoid adding a second one when there is a collision.
11320 		 */
11321 		lck_mtx_unlock(&ifnet_fc_lock);
11322 		return EAGAIN;
11323 	}
11324 
11325 	/* become regular mutex */
11326 	lck_mtx_convert_spin(&ifnet_fc_lock);
11327 
11328 	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
11329 	ifce->ifce_flowhash = flowhash;
11330 	ifce->ifce_ifp = ifp;
11331 
11332 	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
11333 	lck_mtx_unlock(&ifnet_fc_lock);
11334 	return 0;
11335 }
11336 
11337 static struct ifnet_fc_entry *
11338 ifnet_fc_get(uint32_t flowhash)
11339 {
11340 	struct ifnet_fc_entry keyfc, *ifce;
11341 	struct ifnet *ifp;
11342 
11343 	bzero(&keyfc, sizeof(keyfc));
11344 	keyfc.ifce_flowhash = flowhash;
11345 
11346 	lck_mtx_lock_spin(&ifnet_fc_lock);
11347 	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
11348 	if (ifce == NULL) {
11349 		/* Entry is not present in ifnet_fc_tree, return */
11350 		lck_mtx_unlock(&ifnet_fc_lock);
11351 		return NULL;
11352 	}
11353 
11354 	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);
11355 
11356 	VERIFY(ifce->ifce_ifp != NULL);
11357 	ifp = ifce->ifce_ifp;
11358 
11359 	/* become regular mutex */
11360 	lck_mtx_convert_spin(&ifnet_fc_lock);
11361 
11362 	if (!ifnet_is_attached(ifp, 0)) {
11363 		/*
11364 		 * This ifp is not attached or in the process of being
11365 		 * detached; just don't process it.
11366 		 */
11367 		ifnet_fc_entry_free(ifce);
11368 		ifce = NULL;
11369 	}
11370 	lck_mtx_unlock(&ifnet_fc_lock);
11371 
11372 	return ifce;
11373 }
11374 
11375 static void
11376 ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
11377 {
11378 	zfree(ifnet_fc_zone, ifce);
11379 }
11380 
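/*
 * ifnet_calc_flowhash: derive a non-zero hash for the interface from its
 * name, unit, flags and capabilities plus random salt; the seed is
 * re-rolled until the result is non-zero so that 0 can mean "no hash".
 */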
11381 static uint32_t
11382 ifnet_calc_flowhash(struct ifnet *ifp)
11383 {
11384 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11385 	uint32_t flowhash = 0;
11386 
11387 	if (ifnet_flowhash_seed == 0) {
11388 		ifnet_flowhash_seed = RandomULong();
11389 	}
11390 
11391 	bzero(&fh, sizeof(fh));
11392 
11393 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11394 	fh.ifk_unit = ifp->if_unit;
11395 	fh.ifk_flags = ifp->if_flags;
11396 	fh.ifk_eflags = ifp->if_eflags;
11397 	fh.ifk_capabilities = ifp->if_capabilities;
11398 	fh.ifk_capenable = ifp->if_capenable;
11399 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11400 	fh.ifk_rand1 = RandomULong();
11401 	fh.ifk_rand2 = RandomULong();
11402 
11403 try_again:
11404 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11405 	if (flowhash == 0) {
11406 		/* try to get a non-zero flowhash */
11407 		ifnet_flowhash_seed = RandomULong();
11408 		goto try_again;
11409 	}
11410 
11411 	return flowhash;
11412 }
11413 
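/*
 * ifnet_set_netsignature: store (or, when len is 0, clear) the per
 * address family network signature for the interface.  The AF_INET and
 * AF_INET6 blobs live in the inet/inet6 extra data and are protected by
 * the corresponding data locks.
 */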
11414 int
11415 ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
11416     uint16_t flags, uint8_t *data)
11417 {
11418 #pragma unused(flags)
11419 	int error = 0;
11420 
11421 	switch (family) {
11422 	case AF_INET:
11423 		if_inetdata_lock_exclusive(ifp);
11424 		if (IN_IFEXTRA(ifp) != NULL) {
11425 			if (len == 0) {
11426 				/* Allow clearing the signature */
11427 				IN_IFEXTRA(ifp)->netsig_len = 0;
11428 				bzero(IN_IFEXTRA(ifp)->netsig,
11429 				    sizeof(IN_IFEXTRA(ifp)->netsig));
11430 				if_inetdata_lock_done(ifp);
11431 				break;
11432 			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
11433 				error = EINVAL;
11434 				if_inetdata_lock_done(ifp);
11435 				break;
11436 			}
11437 			IN_IFEXTRA(ifp)->netsig_len = len;
11438 			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
11439 		} else {
11440 			error = ENOMEM;
11441 		}
11442 		if_inetdata_lock_done(ifp);
11443 		break;
11444 
11445 	case AF_INET6:
11446 		if_inet6data_lock_exclusive(ifp);
11447 		if (IN6_IFEXTRA(ifp) != NULL) {
11448 			if (len == 0) {
11449 				/* Allow clearing the signature */
11450 				IN6_IFEXTRA(ifp)->netsig_len = 0;
11451 				bzero(IN6_IFEXTRA(ifp)->netsig,
11452 				    sizeof(IN6_IFEXTRA(ifp)->netsig));
11453 				if_inet6data_lock_done(ifp);
11454 				break;
11455 			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
11456 				error = EINVAL;
11457 				if_inet6data_lock_done(ifp);
11458 				break;
11459 			}
11460 			IN6_IFEXTRA(ifp)->netsig_len = len;
11461 			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
11462 		} else {
11463 			error = ENOMEM;
11464 		}
11465 		if_inet6data_lock_done(ifp);
11466 		break;
11467 
11468 	default:
11469 		error = EINVAL;
11470 		break;
11471 	}
11472 
11473 	return error;
11474 }
11475 
11476 int
11477 ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
11478     uint16_t *flags, uint8_t *data)
11479 {
11480 	int error = 0;
11481 
11482 	if (ifp == NULL || len == NULL || data == NULL) {
11483 		return EINVAL;
11484 	}
11485 
11486 	switch (family) {
11487 	case AF_INET:
11488 		if_inetdata_lock_shared(ifp);
11489 		if (IN_IFEXTRA(ifp) != NULL) {
11490 			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
11491 				error = EINVAL;
11492 				if_inetdata_lock_done(ifp);
11493 				break;
11494 			}
11495 			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
11496 				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
11497 			} else {
11498 				error = ENOENT;
11499 			}
11500 		} else {
11501 			error = ENOMEM;
11502 		}
11503 		if_inetdata_lock_done(ifp);
11504 		break;
11505 
11506 	case AF_INET6:
11507 		if_inet6data_lock_shared(ifp);
11508 		if (IN6_IFEXTRA(ifp) != NULL) {
11509 			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
11510 				error = EINVAL;
11511 				if_inet6data_lock_done(ifp);
11512 				break;
11513 			}
11514 			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
11515 				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
11516 			} else {
11517 				error = ENOENT;
11518 			}
11519 		} else {
11520 			error = ENOMEM;
11521 		}
11522 		if_inet6data_lock_done(ifp);
11523 		break;
11524 
11525 	default:
11526 		error = EINVAL;
11527 		break;
11528 	}
11529 
11530 	if (error == 0 && flags != NULL) {
11531 		*flags = 0;
11532 	}
11533 
11534 	return error;
11535 }
11536 
11537 int
11538 ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11539 {
11540 	int i, error = 0, one_set = 0;
11541 
11542 	if_inet6data_lock_exclusive(ifp);
11543 
11544 	if (IN6_IFEXTRA(ifp) == NULL) {
11545 		error = ENOMEM;
11546 		goto out;
11547 	}
11548 
11549 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11550 		uint32_t prefix_len =
11551 		    prefixes[i].prefix_len;
11552 		struct in6_addr *prefix =
11553 		    &prefixes[i].ipv6_prefix;
11554 
11555 		if (prefix_len == 0) {
11556 			clat_log0((LOG_DEBUG,
11557 			    "NAT64 prefixes purged from Interface %s\n",
11558 			    if_name(ifp)));
11559 			/* Allow clearing the signature */
11560 			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
11561 			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
11562 			    sizeof(struct in6_addr));
11563 
11564 			continue;
11565 		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
11566 		    prefix_len != NAT64_PREFIX_LEN_40 &&
11567 		    prefix_len != NAT64_PREFIX_LEN_48 &&
11568 		    prefix_len != NAT64_PREFIX_LEN_56 &&
11569 		    prefix_len != NAT64_PREFIX_LEN_64 &&
11570 		    prefix_len != NAT64_PREFIX_LEN_96) {
11571 			clat_log0((LOG_DEBUG,
11572 			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
11573 			error = EINVAL;
11574 			goto out;
11575 		}
11576 
11577 		if (IN6_IS_SCOPE_EMBED(prefix)) {
11578 			clat_log0((LOG_DEBUG,
11579 			    "NAT64 prefix has interface/link local scope.\n"));
11580 			error = EINVAL;
11581 			goto out;
11582 		}
11583 
11584 		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
11585 		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
11586 		    sizeof(struct in6_addr));
11587 		clat_log0((LOG_DEBUG,
11588 		    "NAT64 prefix set to %s with prefixlen: %d\n",
11589 		    ip6_sprintf(prefix), prefix_len));
11590 		one_set = 1;
11591 	}
11592 
11593 out:
11594 	if_inet6data_lock_done(ifp);
11595 
11596 	if (error == 0 && one_set != 0) {
11597 		necp_update_all_clients();
11598 	}
11599 
11600 	return error;
11601 }
11602 
11603 int
11604 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11605 {
11606 	int i, found_one = 0, error = 0;
11607 
11608 	if (ifp == NULL) {
11609 		return EINVAL;
11610 	}
11611 
11612 	if_inet6data_lock_shared(ifp);
11613 
11614 	if (IN6_IFEXTRA(ifp) == NULL) {
11615 		error = ENOMEM;
11616 		goto out;
11617 	}
11618 
11619 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11620 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11621 			found_one = 1;
11622 		}
11623 	}
11624 
11625 	if (found_one == 0) {
11626 		error = ENOENT;
11627 		goto out;
11628 	}
11629 
11630 	if (prefixes) {
11631 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11632 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11633 	}
11634 
11635 out:
11636 	if_inet6data_lock_done(ifp);
11637 
11638 	return error;
11639 }
11640 
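/*
 * Debug-only output path hook: when HWCKSUM_DBG_FINALIZE_FORCED is set
 * (and the packet is not marked for TSO), force software finalization of
 * the pending checksums and account for it in the hwcksum_dbg_* counters.
 */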
11641 __attribute__((noinline))
11642 static void
11643 dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
11644     protocol_family_t pf)
11645 {
11646 #pragma unused(ifp)
11647 	uint32_t did_sw;
11648 
11649 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
11650 	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
11651 		return;
11652 	}
11653 
11654 	switch (pf) {
11655 	case PF_INET:
11656 		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
11657 		if (did_sw & CSUM_DELAY_IP) {
11658 			hwcksum_dbg_finalized_hdr++;
11659 		}
11660 		if (did_sw & CSUM_DELAY_DATA) {
11661 			hwcksum_dbg_finalized_data++;
11662 		}
11663 		break;
11664 	case PF_INET6:
11665 		/*
11666 		 * Checksum offload should not have been enabled when
11667 		 * extension headers exist; that also means that we
11668 		 * cannot force-finalize packets with extension headers.
11669 		 * Indicate to the callee should it skip such case by
11670 		 * Indicate to the callee that it should skip such cases
11671 		 * by setting optlen to -1.
11672 		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
11673 		    m->m_pkthdr.csum_flags);
11674 		if (did_sw & CSUM_DELAY_IPV6_DATA) {
11675 			hwcksum_dbg_finalized_data++;
11676 		}
11677 		break;
11678 	default:
11679 		return;
11680 	}
11681 }
11682 
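/*
 * Debug-only input path hook: optionally force partial checksum offload
 * on received packets (HWCKSUM_DBG_PARTIAL_FORCED), verify partial
 * checksums reported by the driver, and optionally re-adjust the sum to
 * emulate hardware that starts at a different receive offset
 * (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ).
 */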
11683 static void
11684 dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
11685     protocol_family_t pf)
11686 {
11687 	uint16_t sum = 0;
11688 	uint32_t hlen;
11689 
11690 	if (frame_header == NULL ||
11691 	    frame_header < (char *)mbuf_datastart(m) ||
11692 	    frame_header > (char *)m->m_data) {
11693 		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
11694 		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
11695 		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
11696 		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
11697 		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
11698 		    (uint64_t)VM_KERNEL_ADDRPERM(m));
11699 		return;
11700 	}
11701 	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);
11702 
11703 	switch (pf) {
11704 	case PF_INET:
11705 	case PF_INET6:
11706 		break;
11707 	default:
11708 		return;
11709 	}
11710 
11711 	/*
11712 	 * Force partial checksum offload; useful to simulate cases
11713 	 * where the hardware does not support partial checksum offload,
11714 	 * in order to validate correctness throughout the layers above.
11715 	 */
11716 	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
11717 		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;
11718 
11719 		if (foff > (uint32_t)m->m_pkthdr.len) {
11720 			return;
11721 		}
11722 
11723 		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
11724 
11725 		/* Compute 16-bit 1's complement sum from forced offset */
11726 		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));
11727 
11728 		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
11729 		m->m_pkthdr.csum_rx_val = sum;
11730 		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);
11731 
11732 		hwcksum_dbg_partial_forced++;
11733 		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
11734 	}
11735 
11736 	/*
11737 	 * Partial checksum offload verification (and adjustment);
11738 	 * useful to validate and test cases where the hardware
11739 	 * supports partial checksum offload.
11740 	 */
11741 	if ((m->m_pkthdr.csum_flags &
11742 	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
11743 	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
11744 		uint32_t rxoff;
11745 
11746 		/* Start offset must begin after frame header */
11747 		rxoff = m->m_pkthdr.csum_rx_start;
11748 		if (hlen > rxoff) {
11749 			hwcksum_dbg_bad_rxoff++;
11750 			if (dlil_verbose) {
11751 				DLIL_PRINTF("%s: partial cksum start offset %d "
11752 				    "is less than frame header length %d for "
11753 				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
11754 				    (uint64_t)VM_KERNEL_ADDRPERM(m));
11755 			}
11756 			return;
11757 		}
11758 		rxoff -= hlen;
11759 
11760 		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
11761 			/*
11762 			 * Compute the expected 16-bit 1's complement sum;
11763 			 * skip this if we've already computed it above
11764 			 * when partial checksum offload is forced.
11765 			 */
11766 			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));
11767 
11768 			/* Hardware or driver is buggy */
11769 			if (sum != m->m_pkthdr.csum_rx_val) {
11770 				hwcksum_dbg_bad_cksum++;
11771 				if (dlil_verbose) {
11772 					DLIL_PRINTF("%s: bad partial cksum value "
11773 					    "0x%x (expected 0x%x) for mbuf "
11774 					    "0x%llx [rx_start %d]\n",
11775 					    if_name(ifp),
11776 					    m->m_pkthdr.csum_rx_val, sum,
11777 					    (uint64_t)VM_KERNEL_ADDRPERM(m),
11778 					    m->m_pkthdr.csum_rx_start);
11779 				}
11780 				return;
11781 			}
11782 		}
11783 		hwcksum_dbg_verified++;
11784 
11785 		/*
11786 		 * This code allows us to emulate hardware implementations
11787 		 * that perform the 16-bit 1's complement sum beginning at
11788 		 * various start offset values.
11789 		 */
11790 		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
11791 			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;
11792 
11793 			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
11794 				return;
11795 			}
11796 
11797 			sum = m_adj_sum16(m, rxoff, aoff,
11798 			    m_pktlen(m) - aoff, sum);
11799 
11800 			m->m_pkthdr.csum_rx_val = sum;
11801 			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);
11802 
11803 			hwcksum_dbg_adjusted++;
11804 		}
11805 	}
11806 }
11807 
11808 #if DEBUG || DEVELOPMENT
11809 /* Blob for sum16 verification */
11810 static uint8_t sumdata[] = {
11811 	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
11812 	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
11813 	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
11814 	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
11815 	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
11816 	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
11817 	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
11818 	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
11819 	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
11820 	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
11821 	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
11822 	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
11823 	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
11824 	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
11825 	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
11826 	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
11827 	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
11828 	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
11829 	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
11830 	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
11831 	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
11832 	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
11833 	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
11834 	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
11835 	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
11836 	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
11837 	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
11838 	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
11839 	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
11840 	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
11841 	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
11842 	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
11843 	0xc8, 0x28, 0x02, 0x00, 0x00
11844 };
11845 
11846 /* Precomputed 16-bit 1's complement sums for various spans of the above data */
11847 static struct {
11848 	boolean_t       init;
11849 	uint16_t        len;
11850 	uint16_t        sumr;   /* reference */
11851 	uint16_t        sumrp;  /* reference, precomputed */
11852 } sumtbl[] = {
11853 	{ FALSE, 0, 0, 0x0000 },
11854 	{ FALSE, 1, 0, 0x001f },
11855 	{ FALSE, 2, 0, 0x8b1f },
11856 	{ FALSE, 3, 0, 0x8b27 },
11857 	{ FALSE, 7, 0, 0x790e },
11858 	{ FALSE, 11, 0, 0xcb6d },
11859 	{ FALSE, 20, 0, 0x20dd },
11860 	{ FALSE, 27, 0, 0xbabd },
11861 	{ FALSE, 32, 0, 0xf3e8 },
11862 	{ FALSE, 37, 0, 0x197d },
11863 	{ FALSE, 43, 0, 0x9eae },
11864 	{ FALSE, 64, 0, 0x4678 },
11865 	{ FALSE, 127, 0, 0x9399 },
11866 	{ FALSE, 256, 0, 0xd147 },
11867 	{ FALSE, 325, 0, 0x0358 },
11868 };
11869 #define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11870 
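/*
 * Self-test for the 16-bit 1's complement sum routines: cross-check
 * m_sum16() (and b_sum16() when INET) against in_cksum_mbuf_ref() and the
 * precomputed values in sumtbl[] over a range of lengths, alignments and
 * offsets; panics on any mismatch.
 */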
11871 static void
11872 dlil_verify_sum16(void)
11873 {
11874 	struct mbuf *m;
11875 	uint8_t *buf;
11876 	int n;
11877 
11878 	/* Make sure test data plus extra room for alignment fits in cluster */
11879 	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
11880 
11881 	kprintf("DLIL: running SUM16 self-tests ... ");
11882 
11883 	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
11884 	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
11885 
11886 	buf = mtod(m, uint8_t *);               /* base address */
11887 
11888 	for (n = 0; n < SUMTBL_MAX; n++) {
11889 		uint16_t len = sumtbl[n].len;
11890 		int i;
11891 
11892 		/* Verify for all possible alignments */
11893 		for (i = 0; i < (int)sizeof(uint64_t); i++) {
11894 			uint16_t sum, sumr;
11895 			uint8_t *c;
11896 
11897 			/* Copy over test data to mbuf */
11898 			VERIFY(len <= sizeof(sumdata));
11899 			c = buf + i;
11900 			bcopy(sumdata, c, len);
11901 
11902 			/* Zero-offset test (align by data pointer) */
11903 			m->m_data = (uintptr_t)c;
11904 			m->m_len = len;
11905 			sum = m_sum16(m, 0, len);
11906 
11907 			if (!sumtbl[n].init) {
11908 				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
11909 				sumtbl[n].sumr = sumr;
11910 				sumtbl[n].init = TRUE;
11911 			} else {
11912 				sumr = sumtbl[n].sumr;
11913 			}
11914 
11915 			/* Something is horribly broken; stop now */
11916 			if (sumr != sumtbl[n].sumrp) {
11917 				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
11918 				    "for len=%d align=%d sum=0x%04x "
11919 				    "[expected=0x%04x]\n", __func__,
11920 				    len, i, sumr, sumtbl[n].sumrp);
11921 				/* NOTREACHED */
11922 			} else if (sum != sumr) {
11923 				panic_plain("\n%s: broken m_sum16() for len=%d "
11924 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
11925 				    __func__, len, i, sum, sumr);
11926 				/* NOTREACHED */
11927 			}
11928 
11929 			/* Alignment test by offset (fixed data pointer) */
11930 			m->m_data = (uintptr_t)buf;
11931 			m->m_len = i + len;
11932 			sum = m_sum16(m, i, len);
11933 
11934 			/* Something is horribly broken; stop now */
11935 			if (sum != sumr) {
11936 				panic_plain("\n%s: broken m_sum16() for len=%d "
11937 				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
11938 				    __func__, len, i, sum, sumr);
11939 				/* NOTREACHED */
11940 			}
11941 #if INET
11942 			/* Simple sum16 contiguous buffer test by alignment */
11943 			sum = b_sum16(c, len);
11944 
11945 			/* Something is horribly broken; stop now */
11946 			if (sum != sumr) {
11947 				panic_plain("\n%s: broken b_sum16() for len=%d "
11948 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
11949 				    __func__, len, i, sum, sumr);
11950 				/* NOTREACHED */
11951 			}
11952 #endif /* INET */
11953 		}
11954 	}
11955 	m_freem(m);
11956 
11957 	kprintf("PASSED\n");
11958 }
11959 #endif /* DEBUG || DEVELOPMENT */
11960 
11961 #define CASE_STRINGIFY(x) case x: return #x
11962 
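/* Return the symbolic name of a KEV_DL_* event code, or "" if unknown */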
11963 __private_extern__ const char *
11964 dlil_kev_dl_code_str(u_int32_t event_code)
11965 {
11966 	switch (event_code) {
11967 		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
11968 		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
11969 		CASE_STRINGIFY(KEV_DL_SIFMTU);
11970 		CASE_STRINGIFY(KEV_DL_SIFPHYS);
11971 		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
11972 		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
11973 		CASE_STRINGIFY(KEV_DL_ADDMULTI);
11974 		CASE_STRINGIFY(KEV_DL_DELMULTI);
11975 		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
11976 		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
11977 		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
11978 		CASE_STRINGIFY(KEV_DL_LINK_OFF);
11979 		CASE_STRINGIFY(KEV_DL_LINK_ON);
11980 		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
11981 		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
11982 		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
11983 		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
11984 		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
11985 		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
11986 		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
11987 		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
11988 		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
11989 		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
11990 		CASE_STRINGIFY(KEV_DL_ISSUES);
11991 		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
11992 	default:
11993 		break;
11994 	}
11995 	return "";
11996 }
11997 
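/*
 * Data-threshold thread call: notify the network statistics layer that
 * the interface crossed its data threshold, holding an I/O reference on
 * the ifnet for the duration of the callout.
 */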
11998 static void
11999 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12000 {
12001 #pragma unused(arg1)
12002 	struct ifnet *ifp = arg0;
12003 
12004 	if (ifnet_is_attached(ifp, 1)) {
12005 		nstat_ifnet_threshold_reached(ifp->if_index);
12006 		ifnet_decr_iorefcnt(ifp);
12007 	}
12008 }
12009 
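/*
 * Called from the data path to check whether the interface byte counters
 * have crossed if_data_threshold and, if so, to schedule the (possibly
 * rate-limited) data-threshold thread call.
 */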
12010 void
12011 ifnet_notify_data_threshold(struct ifnet *ifp)
12012 {
12013 	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
12014 	uint64_t oldbytes = ifp->if_dt_bytes;
12015 
12016 	ASSERT(ifp->if_dt_tcall != NULL);
12017 
12018 	/*
12019 	 * If we went over the threshold, notify NetworkStatistics.
12020 	 * We rate-limit it based on the threshold interval value.
12021 	 */
12022 	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
12023 	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
12024 	    !thread_call_isactive(ifp->if_dt_tcall)) {
12025 		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
12026 		uint64_t now = mach_absolute_time(), deadline = now;
12027 		uint64_t ival;
12028 
12029 		if (tival != 0) {
12030 			nanoseconds_to_absolutetime(tival, &ival);
12031 			clock_deadline_for_periodic_event(ival, now, &deadline);
12032 			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
12033 			    deadline);
12034 		} else {
12035 			(void) thread_call_enter(ifp->if_dt_tcall);
12036 		}
12037 	}
12038 }
12039 
12040 
12041 void
12042 ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
12043     struct ifnet *ifp)
12044 {
12045 	tcp_update_stats_per_flow(ifs, ifp);
12046 }
12047 
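/*
 * Atomic bit set/clear helpers used below for if_eflags and if_xflags;
 * _set_flags() returns the previous value of the flags word.
 */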
12048 static inline u_int32_t
12049 _set_flags(u_int32_t *flags_p, u_int32_t set_flags)
12050 {
12051 	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
12052 }
12053 
12054 static inline void
12055 _clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
12056 {
12057 	OSBitAndAtomic(~clear_flags, flags_p);
12058 }
12059 
12060 __private_extern__ u_int32_t
12061 if_set_eflags(ifnet_t interface, u_int32_t set_flags)
12062 {
12063 	return _set_flags(&interface->if_eflags, set_flags);
12064 }
12065 
12066 __private_extern__ void
12067 if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
12068 {
12069 	_clear_flags(&interface->if_eflags, clear_flags);
12070 }
12071 
12072 __private_extern__ u_int32_t
12073 if_set_xflags(ifnet_t interface, u_int32_t set_flags)
12074 {
12075 	return _set_flags(&interface->if_xflags, set_flags);
12076 }
12077 
12078 __private_extern__ void
12079 if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
12080 {
12081 	_clear_flags(&interface->if_xflags, clear_flags);
12082 }
12083 
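/*
 * Traffic rule generation count helpers: bumping the genid lets callers
 * of ifnet_sync_traffic_rule_genid() detect that the interface's traffic
 * rule set has changed since they last looked.
 */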
12084 __private_extern__ void
12085 ifnet_update_traffic_rule_genid(ifnet_t ifp)
12086 {
12087 	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
12088 }
12089 
12090 __private_extern__ boolean_t
12091 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12092 {
12093 	if (*genid != ifp->if_traffic_rule_genid) {
12094 		*genid = ifp->if_traffic_rule_genid;
12095 		return TRUE;
12096 	}
12097 	return FALSE;
12098 }
12099 __private_extern__ void
12100 ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
12101 {
12102 	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
12103 	ifnet_update_traffic_rule_genid(ifp);
12104 }
12105 
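/*
 * Dump a buffer as hex via os_log(), MAX_DUMP_BUF bytes per line,
 * grouped into 16-bit words.
 */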
12106 static void
12107 log_hexdump(void *data, size_t len)
12108 {
12109 	size_t i, j, k;
12110 	unsigned char *ptr = (unsigned char *)data;
12111 #define MAX_DUMP_BUF 32
12112 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12113 
12114 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12115 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12116 			unsigned char msnbl = ptr[j] >> 4;
12117 			unsigned char lsnbl = ptr[j] & 0x0f;
12118 
12119 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12120 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12121 
12122 			if ((j % 2) == 1) {
12123 				buf[k++] = ' ';
12124 			}
12125 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12126 				buf[k++] = ' ';
12127 			}
12128 		}
12129 		buf[k] = 0;
12130 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12131 	}
12132 }
12133 
12134 #if SKYWALK
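/*
 * Returns false if a non-OS interface filter is attached -- anywhere in
 * the system when ifp is NULL, or on the given interface otherwise.
 */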
12135 static bool
12136 net_check_compatible_if_filter(struct ifnet *ifp)
12137 {
12138 	if (ifp == NULL) {
12139 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12140 			return false;
12141 		}
12142 	} else {
12143 		if (ifp->if_flt_non_os_count > 0) {
12144 			return false;
12145 		}
12146 	}
12147 	return true;
12148 }
12149 #endif /* SKYWALK */
12150 
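/*
 * Advance the output cursor after each scnprintf() and bail out of the
 * dump once the remaining buffer space is exhausted.
 */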
12151 #define DUMP_BUF_CHK() {        \
12152 	clen -= k;              \
12153 	if (clen < 1)           \
12154 	        goto done;      \
12155 	c += k;                 \
12156 }
12157 
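/*
 * Debug helper: walk the ifindex table and report the interfaces with
 * the largest send queue (ifcq) and DLIL input queue lengths into 'str';
 * returns the number of bytes written.
 */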
12158 int dlil_dump_top_if_qlen(char *, int);
12159 int
12160 dlil_dump_top_if_qlen(char *str, int str_len)
12161 {
12162 	char *c = str;
12163 	int k, clen = str_len;
12164 	struct ifnet *top_ifcq_ifp = NULL;
12165 	uint32_t top_ifcq_len = 0;
12166 	struct ifnet *top_inq_ifp = NULL;
12167 	uint32_t top_inq_len = 0;
12168 
12169 	for (int ifidx = 1; ifidx < if_index; ifidx++) {
12170 		struct ifnet *ifp = ifindex2ifnet[ifidx];
12171 		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
12172 
12173 		if (ifp == NULL) {
12174 			continue;
12175 		}
12176 		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
12177 			top_ifcq_len = ifp->if_snd->ifcq_len;
12178 			top_ifcq_ifp = ifp;
12179 		}
12180 		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
12181 			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
12182 			top_inq_ifp = ifp;
12183 		}
12184 	}
12185 
12186 	if (top_ifcq_ifp != NULL) {
12187 		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
12188 		    top_ifcq_len, top_ifcq_ifp->if_xname);
12189 		DUMP_BUF_CHK();
12190 	}
12191 	if (top_inq_ifp != NULL) {
12192 		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
12193 		    top_inq_len, top_inq_ifp->if_xname);
12194 		DUMP_BUF_CHK();
12195 	}
12196 done:
12197 	return str_len - clen;
12198 }
12199